Merge pull request #141 from pesco/iterative

Iterative (chunked) input processing
This commit is contained in:
Meredith L. Patterson 2015-09-23 10:52:22 +02:00
commit cb93c3b4ec
11 changed files with 668 additions and 80 deletions

View file

@ -346,7 +346,10 @@ void h_lalr_free(HParser *parser)
HParserBackendVTable h__lalr_backend_vtable = {
.compile = h_lalr_compile,
.parse = h_lr_parse,
.free = h_lalr_free
.free = h_lalr_free,
.parse_start = h_lr_parse_start,
.parse_chunk = h_lr_parse_chunk,
.parse_finish = h_lr_parse_finish
};
@ -355,8 +358,6 @@ HParserBackendVTable h__lalr_backend_vtable = {
// dummy!
int test_lalr(void)
{
HAllocator *mm__ = &system_allocator;
/*
E -> E '-' T
| T
@ -371,44 +372,24 @@ int test_lalr(void)
h_bind_indirect(E, E_);
HParser *p = E;
printf("\n==== G R A M M A R ====\n");
HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p));
if (g == NULL) {
fprintf(stderr, "h_cfgrammar failed\n");
HCFGrammar *g = h_pprint_lr_info(stdout, p);
if(!g)
return 1;
}
h_pprint_grammar(stdout, g, 0);
printf("\n==== D F A ====\n");
HLRDFA *dfa = h_lr0_dfa(g);
if (dfa) {
h_pprint_lrdfa(stdout, g, dfa, 0);
} else {
fprintf(stderr, "h_lalr_dfa failed\n");
}
printf("\n==== L R ( 0 ) T A B L E ====\n");
HLRTable *table0 = h_lr0_table(g, dfa);
if (table0) {
h_pprint_lrtable(stdout, g, table0, 0);
} else {
fprintf(stderr, "h_lr0_table failed\n");
}
h_lrtable_free(table0);
printf("\n==== L A L R T A B L E ====\n");
fprintf(stdout, "\n==== L A L R T A B L E ====\n");
if (h_compile(p, PB_LALR, NULL)) {
fprintf(stderr, "does not compile\n");
fprintf(stdout, "does not compile\n");
return 2;
}
h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0);
printf("\n==== P A R S E R E S U L T ====\n");
fprintf(stdout, "\n==== P A R S E R E S U L T ====\n");
HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13);
if (res) {
h_pprint(stdout, res->ast, 0, 2);
} else {
printf("no parse\n");
fprintf(stdout, "no parse\n");
}
return 0;
}

View file

@ -12,6 +12,7 @@ static const size_t DEFAULT_KMAX = 1;
* maps lookahead strings to productions (HCFSequence).
*/
typedef struct HLLkTable_ {
size_t kmax;
HHashTable *rows;
HCFChoice *start; // start symbol
HArena *arena;
@ -188,6 +189,7 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HStringMap *row,
*/
static int fill_table(size_t kmax, HCFGrammar *g, HLLkTable *table)
{
table->kmax = kmax;
table->start = g->start;
// iterate over g->nts
@ -259,56 +261,172 @@ void h_llk_free(HParser *parser)
/* LL(k) driver */
HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream)
typedef struct {
HArena *arena; // will hold the results
HArena *tarena; // tmp, deleted after parse
HSlist *stack;
HCountedArray *seq; // accumulates current parse result
uint8_t *buf; // for lookahead across chunk boundaries
// allocated to size 2*kmax
// new chunk starts at index kmax
// ( 0 ... kmax ... 2*kmax-1 )
// \_old_/\______new_______/
HInputStream win; // win.length is set to 0 when not in use
} HLLkState;
// in order to construct the parse tree, we delimit the symbol stack into
// frames corresponding to production right-hand sides. since only left-most
// derivations are produced this linearization is unique.
// the 'mark' allocated below simply reserves a memory address to use as the
// frame delimiter.
// nonterminals, instead of being popped and forgotten, are put back onto the
// stack below the mark to tell us which validations and semantic actions to
// execute on their corresponding result.
// also on the stack below the mark, we store the previously accumulated
// value for the surrounding production.
static void const * const MARK = &MARK; // stack frame delimiter
static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser)
{
const HLLkTable *table = parser->backend_data;
assert(table != NULL);
HArena *arena = h_new_arena(mm__, 0); // will hold the results
HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse
HSlist *stack = h_slist_new(tarena);
HCountedArray *seq = h_carray_new(arena); // accumulates current parse result
HLLkState *s = h_new(HLLkState, 1);
s->arena = h_new_arena(mm__, 0);
s->tarena = h_new_arena(mm__, 0);
s->stack = h_slist_new(s->tarena);
s->seq = h_carray_new(s->arena);
s->buf = h_arena_malloc(s->tarena, 2 * table->kmax);
// in order to construct the parse tree, we delimit the symbol stack into
// frames corresponding to production right-hand sides. since only left-most
// derivations are produced this linearization is unique.
// the 'mark' allocated below simply reserves a memory address to use as the
// frame delimiter.
// nonterminals, instead of being popped and forgotten, are put back onto the
// stack below the mark to tell us which validations and semantic actions to
// execute on their corresponding result.
// also on the stack below the mark, we store the previously accumulated
// value for the surrounding production.
void *mark = h_arena_malloc(tarena, 1);
s->win.input = s->buf;
s->win.length = 0; // unused
// initialize with the start symbol on the stack.
h_slist_push(stack, table->start);
h_slist_push(s->stack, table->start);
return s;
}
// helper: add new input to the lookahead window
//
// Copies up to kmax unread bytes from 'stream' into the upper half of s->buf
// (starting at offset kmax) and extends the window length by the number of
// bytes copied.
//
// Preconditions (asserted): the stream is byte-aligned, the window is backed
// by s->buf, currently holds exactly kmax bytes, and its read index lies
// below kmax.
static void append_win(size_t kmax, HLLkState *s, HInputStream *stream)
{
  assert(stream->bit_offset == 0);
  assert(s->win.input == s->buf);
  assert(s->win.length == kmax);
  assert(s->win.index < kmax);

  size_t n = stream->length - stream->index;  // bytes to copy
  if(n > kmax)
    n = kmax;

  // An empty chunk may carry a NULL input pointer (the end-of-input chunk is
  // built that way); calling memcpy with a null source is undefined even for
  // a zero count, so skip the copy entirely when there is nothing to add.
  if(n == 0)
    return;

  memcpy(s->buf + kmax, stream->input + stream->index, n);
  s->win.length += n;
}
// helper: save old input to the lookahead window
//
// Keeps the still-unread tail of the input in s->buf so that it ends exactly
// at offset kmax, then re-points the window (s->win) at it. Used by the chunk
// driver when a table lookup reports NEED_INPUT, so the leftover bytes can be
// combined with the next chunk.
//
// kmax   - lookahead depth; s->buf is laid out around this offset
// s      - parse state owning the buffer and the window
// stream - the chunk whose unread remainder (fewer than kmax bytes) is saved
static void save_win(size_t kmax, HLLkState *s, HInputStream *stream)
{
assert(stream->bit_offset == 0);
// number of unread bytes left in the chunk
size_t len = stream->length - stream->index;
assert(len < kmax);
if(len == 0) {
// stream empty? nothing to do.
return;
} else if(s->win.length > 0) {
// window active? should contain all of stream.
assert(s->win.length == kmax + len);
assert(s->win.index <= kmax);
// shift contents down:
//
//   (0                 kmax            )
//         ...   \_old_/\_new_/   ...
//
//   (0                 kmax            )
//    ... \_old_/\_new_/       ...
//
// NOTE(review): this store to s->win.pos is overwritten by the struct
// assignment 's->win = *stream' further down, so it has no lasting effect.
// Confirm which position the window is meant to report.
s->win.pos += len; // position of the window shifts up
// from here on 'len' is the number of unread bytes inside the window
len = s->win.length - s->win.index;
assert(len <= kmax);
// regions may overlap, hence memmove; the unread bytes end at offset kmax
memmove(s->buf + kmax - len, s->buf + s->win.index, len);
} else {
// window not active? save stream to window.
// buffer starts kmax bytes below chunk boundary
// NOTE(review): as above, this value is replaced by 's->win = *stream'.
s->win.pos = stream->pos - kmax;
memcpy(s->buf + kmax - len, stream->input + stream->index, len);
}
// metadata
// start from a copy of the chunk's stream state, then redirect the window to
// the buffer: unread data occupies [kmax - len, kmax)
s->win = *stream;
s->win.input = s->buf;
s->win.index = kmax - len;
s->win.length = kmax;
}
// returns partial result or NULL (no parse)
static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser,
HInputStream* chunk)
{
HParsedToken *tok = NULL; // will hold result token
HCFChoice *x = NULL; // current symbol (from top of stack)
HInputStream *stream;
assert(chunk->index == 0);
assert(chunk->bit_offset == 0);
const HLLkTable *table = parser->backend_data;
assert(table != NULL);
HArena *arena = s->arena;
HArena *tarena = s->tarena;
HSlist *stack = s->stack;
HCountedArray *seq = s->seq;
size_t kmax = table->kmax;
if(!seq)
return NULL; // parse already failed
if(s->win.length > 0) {
append_win(kmax, s, chunk);
stream = &s->win;
} else {
stream = chunk;
}
// when we empty the stack, the parse is complete.
while(!h_slist_empty(stack)) {
tok = NULL;
// pop top of stack for inspection
HCFChoice *x = h_slist_pop(stack);
x = h_slist_pop(stack);
assert(x != NULL);
if(x != mark && x->type == HCF_CHOICE) {
if(x != MARK && x->type == HCF_CHOICE) {
// x is a nonterminal; apply the appropriate production and continue
// push stack frame
h_slist_push(stack, seq); // save current partial value
h_slist_push(stack, x); // save the nonterminal
h_slist_push(stack, mark); // frame delimiter
// open a fresh result sequence
seq = h_carray_new(arena);
// look up applicable production in parse table
const HCFSequence *p = h_llk_lookup(table, x, stream);
if(p == NULL)
goto no_parse;
if(p == NEED_INPUT) {
save_win(kmax, s, chunk);
goto need_input;
}
// an infinite loop case that shouldn't happen
assert(!p->items[0] || p->items[0] != x);
// push stack frame
h_slist_push(stack, seq); // save current partial value
h_slist_push(stack, x); // save the nonterminal
h_slist_push(stack, (void *)MARK); // frame delimiter
// open a fresh result sequence
seq = h_carray_new(arena);
// push production's rhs onto the stack (in reverse order)
HCFChoice **s;
for(s = p->items; *s; s++);
@ -319,11 +437,10 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream*
}
// the top of stack is such that there will be a result...
HParsedToken *tok; // will hold result token
tok = h_arena_malloc(arena, sizeof(HParsedToken));
tok->index = stream->index;
tok->index = stream->pos + stream->index;
tok->bit_offset = stream->bit_offset;
if(x == mark) {
if(x == MARK) {
// hit stack frame boundary...
// wrap the accumulated parse result, this sequence is finished
tok->token_type = TT_SEQUENCE;
@ -340,17 +457,25 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream*
// consume the input token
uint8_t input = h_read_bits(stream, 8, false);
// when old chunk consumed from window, switch to new chunk
if(s->win.length > 0 && s->win.index >= kmax) {
s->win.length = 0; // disable the window
stream = chunk;
}
switch(x->type) {
case HCF_END:
if(!stream->overrun)
goto no_parse;
if(!stream->last_chunk)
goto need_input;
h_arena_free(arena, tok);
tok = NULL;
break;
case HCF_CHAR:
if(stream->overrun)
goto no_parse;
goto need_input;
if(input != x->chr)
goto no_parse;
tok->token_type = TT_UINT;
@ -359,7 +484,7 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream*
case HCF_CHARSET:
if(stream->overrun)
goto no_parse;
goto need_input;
if(!charset_isset(x->charset, input))
goto no_parse;
tok->token_type = TT_UINT;
@ -388,24 +513,82 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream*
h_carray_append(seq, tok);
}
// success
// since we started with a single nonterminal on the stack, seq should
// contain exactly the parse result.
assert(seq->used == 1);
h_delete_arena(tarena);
return make_result(arena, seq->elements[0]);
return seq;
no_parse:
h_delete_arena(tarena);
h_delete_arena(arena);
s->arena = NULL;
return NULL;
need_input:
if(stream->last_chunk)
goto no_parse;
if(tok)
h_arena_free(arena, tok); // no result, yet
h_slist_push(stack, x); // try this symbol again next time
return seq;
}
// Concludes an LL(k) parse.
//
// If the state still carries a result sequence, it must hold exactly one
// element, which becomes the parse result (allocated in s->arena). If the
// sequence is NULL, the result is NULL. In both cases the temporary arena and
// the state object itself are released before returning.
static HParseResult *llk_parse_finish_(HAllocator *mm__, HLLkState *s)
{
  HParseResult *result;

  if(s->seq == NULL) {
    result = NULL;
  } else {
    assert(s->seq->used == 1);
    result = make_result(s->arena, s->seq->elements[0]);
  }

  h_delete_arena(s->tarena);
  h_free(s);

  return result;
}
// One-shot LL(k) parse of a complete input: runs the start / chunk / finish
// steps back to back on a single stream, which must be flagged as the last
// chunk. On success the result's bit_length is taken from the stream's final
// index and bit offset. Returns NULL if there is no parse.
HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream)
{
  HLLkState *state = llk_parse_start_(mm__, parser);

  assert(stream->last_chunk);
  state->seq = llk_parse_chunk_(state, parser, stream);

  HParseResult *result = llk_parse_finish_(mm__, state);
  if(result == NULL)
    return NULL;

  result->bit_length = stream->index * 8 + stream->bit_offset;
  return result;
}
// Backend hook: creates the LL(k) parse state for a suspended parser and
// stores it as the parser's backend state.
void h_llk_parse_start(HSuspendedParser *s)
{
  HLLkState *state = llk_parse_start_(s->mm__, s->parser);
  s->backend_state = state;
}
// Backend hook: feeds one chunk of input to the LL(k) driver.
//
// Returns true when the parser is done, i.e. either the driver returned no
// sequence (parse failed) or the symbol stack has been emptied.
bool h_llk_parse_chunk(HSuspendedParser *s, HInputStream *input)
{
  HLLkState *llk = s->backend_state;

  llk->seq = llk_parse_chunk_(llk, s->parser, input);
  if(llk->seq == NULL)
    return true;  // no parse

  return h_slist_empty(llk->stack);
}
// Backend hook: extracts the final result from the LL(k) state held by the
// suspended parser. The state is released by llk_parse_finish_.
HParseResult *h_llk_parse_finish(HSuspendedParser *s)
{
  HLLkState *state = s->backend_state;
  return llk_parse_finish_(s->mm__, state);
}
HParserBackendVTable h__llk_backend_vtable = {
.compile = h_llk_compile,
.parse = h_llk_parse,
.free = h_llk_free
.free = h_llk_free,
.parse_start = h_llk_parse_start,
.parse_chunk = h_llk_parse_chunk,
.parse_finish = h_llk_parse_finish
};

View file

@ -199,15 +199,14 @@ bool h_lrtable_row_empty(const HLRTable *table, size_t i)
/* LR driver */
HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table,
const HInputStream *stream)
static
HLREngine *h_lrengine_new_(HArena *arena, HArena *tarena, const HLRTable *table)
{
HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine));
engine->table = table;
engine->state = 0;
engine->stack = h_slist_new(tarena);
engine->input = *stream;
engine->merged[0] = NULL;
engine->merged[1] = NULL;
engine->arena = arena;
@ -216,6 +215,14 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table,
return engine;
}
// Creates an LR engine via h_lrengine_new_ and seeds it with a copy of the
// given input stream.
HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table,
                          const HInputStream *stream)
{
  HLREngine *lr = h_lrengine_new_(arena, tarena, table);

  lr->input = *stream;
  return lr;
}
static const HLRAction *
terminal_lookup(const HLREngine *engine, const HInputStream *stream)
{
@ -351,7 +358,9 @@ HParseResult *h_lrengine_result(HLREngine *engine)
// on top of the stack is the start symbol's semantic value
assert(!h_slist_empty(engine->stack));
HParsedToken *tok = engine->stack->head->elem;
return make_result(engine->arena, tok);
HParseResult *res = make_result(engine->arena, tok);
res->bit_length = (engine->input.pos + engine->input.index) * 8;
return res;
} else {
return NULL;
}
@ -377,7 +386,53 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream*
return result;
}
// Backend hook: prepares an LR engine for chunk-wise parsing.
//
// Two arenas are created from the suspended parser's allocator: one for the
// parse results and one for temporary data (the engine itself is allocated
// from the latter). The engine is stored as the backend state; its input
// stream is filled in later, per chunk.
void h_lr_parse_start(HSuspendedParser *s)
{
  HLRTable *table = s->parser->backend_data;
  assert(table != NULL);

  HArena *results = h_new_arena(s->mm__, 0);  // will hold the results
  HArena *scratch = h_new_arena(s->mm__, 0);  // tmp, deleted after parse

  s->backend_state = h_lrengine_new_(results, scratch, table);
}
// Backend hook: runs the LR engine over one chunk of input.
//
// The chunk's stream state is copied into the engine, the engine is stepped
// until it either stops running or runs out of input, and the resulting
// stream state is copied back to the caller.
//
// Returns true if the engine is no longer running (parser done), false if it
// stopped only because more input is needed.
bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream)
{
  HLREngine *engine = s->backend_state;
  bool running = true;

  engine->input = *stream;

  do {
    // check input against table to determine which action to take
    const HLRAction *next = h_lrengine_action(engine);
    if(next == NEED_INPUT) {
      // XXX assume lookahead 1
      assert(engine->input.length - engine->input.index == 0);
      break;
    }

    // execute action
    running = h_lrengine_step(engine, next);

    // ran past the end of a chunk that is not the final one: wait for more
    const bool starved = engine->input.overrun && !engine->input.last_chunk;
    if(starved)
      break;
  } while(running);

  *stream = engine->input;
  return !running;  // done if engine no longer running
}
// Backend hook: extracts the LR engine's result and tears the engine down.
//
// The result arena is deleted only when there is no result to return; the
// temporary arena is always deleted.
HParseResult *h_lr_parse_finish(HSuspendedParser *s)
{
  HLREngine *engine = s->backend_state;
  HParseResult *result = h_lrengine_result(engine);

  if(result == NULL) {
    // nothing to hand back, so the result arena is not needed either
    h_delete_arena(engine->arena);
  }
  h_delete_arena(engine->tarena);

  return result;
}
/* Pretty-printers */
@ -536,3 +591,35 @@ void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table,
fputc('\n', f);
#endif
}
// Pretty-prints LR construction details for parser 'p' to stream 'f': the
// grammar, the LR(0) DFA and the LR(0) table, each under its own banner.
//
// Returns the grammar built for 'p', or NULL if the grammar could not be
// constructed. Failure of the DFA or table step is reported on 'f' but does
// not change the return value.
HCFGrammar *h_pprint_lr_info(FILE *f, HParser *p)
{
  HAllocator *mm__ = &system_allocator;

  fprintf(f, "\n==== G R A M M A R ====\n");
  HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p));
  if (g == NULL) {
    fprintf(f, "h_cfgrammar failed\n");
    return NULL;
  }
  h_pprint_grammar(f, g, 0);

  fprintf(f, "\n==== D F A ====\n");
  HLRDFA *dfa = h_lr0_dfa(g);
  if (dfa) {
    h_pprint_lrdfa(f, g, dfa, 0);
  } else {
    fprintf(f, "h_lalr_dfa failed\n");
  }

  fprintf(f, "\n==== L R ( 0 ) T A B L E ====\n");
  // Without a DFA there is nothing to build a table from; do not hand a NULL
  // DFA to the table constructor, just report the table step as failed.
  HLRTable *table0 = dfa ? h_lr0_table(g, dfa) : NULL;
  if (table0) {
    h_pprint_lrtable(f, g, table0, 0);
    // only free a table that was actually built
    h_lrtable_free(table0);
  } else {
    fprintf(f, "h_lr0_table failed\n");
  }

  return g;
}

View file

@ -134,6 +134,9 @@ const HLRAction *h_lrengine_action(const HLREngine *engine);
bool h_lrengine_step(HLREngine *engine, const HLRAction *action);
HParseResult *h_lrengine_result(HLREngine *engine);
HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream);
void h_lr_parse_start(HSuspendedParser *s);
bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream);
HParseResult *h_lr_parse_finish(HSuspendedParser *s);
HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream);
void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item);
@ -143,5 +146,6 @@ void h_pprint_lrdfa(FILE *f, const HCFGrammar *g,
const HLRDFA *dfa, unsigned int indent);
void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table,
unsigned int indent);
HCFGrammar *h_pprint_lr_info(FILE *f, HParser *p);
#endif

View file

@ -349,6 +349,7 @@ void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool en
return m->epsilon_branch;
}
// A NULL result means no parse. NEED_INPUT means lookahead is too short.
void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead)
{
while(m) {
@ -362,9 +363,13 @@ void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead)
// reading bits from it does not consume them from the real input.
uint8_t c = h_read_bits(&lookahead, 8, false);
if (lookahead.overrun) { // end of input
// XXX assumption of byte-wise grammar and input
return m->end_branch;
if (lookahead.overrun) { // end of chunk
if (lookahead.last_chunk) { // end of input
// XXX assumption of byte-wise grammar and input
return m->end_branch;
} else {
return NEED_INPUT;
}
}
// no match yet, descend

View file

@ -56,6 +56,9 @@ bool h_stringmap_empty(const HStringMap *m);
static inline HStringMap *h_stringmap_get_char(const HStringMap *m, const uint8_t c)
{ return h_hashtable_get(m->char_branches, (void *)char_key(c)); }
// dummy return value used by h_stringmap_get_lookahead when out of input
#define NEED_INPUT ((void *)-1)
/* Convert 'parser' into CFG representation by desugaring and compiling the set
* of nonterminals.

View file

@ -43,6 +43,7 @@ typedef struct {
#define DEFAULT_ENDIANNESS (BIT_BIG_ENDIAN | BYTE_BIG_ENDIAN)
HParseResult* h_parse(const HParser* parser, const uint8_t* input, size_t length) {
return h_parse__m(&system_allocator, parser, input, length);
@ -50,12 +51,14 @@ HParseResult* h_parse(const HParser* parser, const uint8_t* input, size_t length
HParseResult* h_parse__m(HAllocator* mm__, const HParser* parser, const uint8_t* input, size_t length) {
// Set up a parse state...
HInputStream input_stream = {
.pos = 0,
.index = 0,
.bit_offset = 0,
.overrun = 0,
.endianness = BIT_BIG_ENDIAN | BYTE_BIG_ENDIAN,
.endianness = DEFAULT_ENDIANNESS,
.length = length,
.input = input
.input = input,
.last_chunk = true
};
return backends[parser->backend]->parse(mm__, parser, &input_stream);
@ -96,3 +99,92 @@ int h_compile__m(HAllocator* mm__, HParser* parser, HParserBackend backend, cons
parser->backend = backend;
return ret;
}
// Starts a chunk-wise parse using the system allocator; see
// h_parse_start__m for the allocator-taking variant this forwards to.
HSuspendedParser* h_parse_start(const HParser* parser) {
return h_parse_start__m(&system_allocator, parser);
}
// Starts a chunk-wise parse with an explicit allocator.
//
// Returns NULL if the parser's backend provides no parse_start hook or if the
// suspended-parser object cannot be allocated. Otherwise returns a freshly
// initialized suspended parser on which the backend's parse_start has run.
HSuspendedParser* h_parse_start__m(HAllocator* mm__, const HParser* parser) {
  // chunk-wise parsing is optional for a backend
  if(backends[parser->backend]->parse_start == NULL)
    return NULL;

  HSuspendedParser *susp = h_new(HSuspendedParser, 1);
  if(susp == NULL)
    return NULL;

  // not done, nothing consumed yet, no backend state
  susp->backend_state = NULL;
  susp->done = false;
  susp->pos = 0;
  susp->bit_offset = 0;
  susp->endianness = DEFAULT_ENDIANNESS;
  susp->parser = parser;
  susp->mm__ = mm__;

  // backend-specific initialization; should allocate susp->backend_state
  backends[parser->backend]->parse_start(susp);

  return susp;
}
// Feeds one chunk of input to a suspended parser.
//
// Does nothing and returns true if the parser has already reported done.
// Otherwise wraps the chunk in an input stream positioned at the parser's
// current stream position, hands it to the backend, and records the stream
// state the backend left behind. Returns true once the parser is done.
bool h_parse_chunk(HSuspendedParser* s, const uint8_t* input, size_t length) {
  assert(backends[s->parser->backend]->parse_chunk != NULL);

  // no-op if parser is already done
  if(s->done)
    return true;

  // a non-final chunk starting at the position reached so far
  HInputStream chunk = {
    .input = input,
    .length = length,
    .pos = s->pos,
    .index = 0,
    .bit_offset = 0,
    .endianness = s->endianness,
    .overrun = 0,
    .last_chunk = false
  };

  const bool finished = backends[s->parser->backend]->parse_chunk(s, &chunk);

  // carry the stream state over to the next chunk
  s->done = finished;
  s->endianness = chunk.endianness;
  s->pos += chunk.index;
  s->bit_offset = chunk.bit_offset;

  return finished;
}
// Finishes a chunk-wise parse and returns its result (NULL if no parse).
//
// If the backend has not yet reported done, an empty chunk flagged as the
// last one is fed to it to signal end of input. The backend's parse_finish
// then yields the result, whose bit_length is set from the suspended parser's
// recorded position. The suspended parser object is freed before returning.
HParseResult* h_parse_finish(HSuspendedParser* s) {
  assert(backends[s->parser->backend]->parse_chunk != NULL);
  assert(backends[s->parser->backend]->parse_finish != NULL);

  HAllocator *mm__ = s->mm__;

  // signal end of input if parser is not already done
  if(!s->done) {
    HInputStream eof = {
      .input = NULL,
      .length = 0,
      .pos = s->pos,
      .index = 0,
      .bit_offset = 0,
      .endianness = s->endianness,
      .overrun = 0,
      .last_chunk = true
    };

    s->done = backends[s->parser->backend]->parse_chunk(s, &eof);
    assert(s->done);
  }

  // extract result
  HParseResult *result = backends[s->parser->backend]->parse_finish(s);
  if(result != NULL)
    result->bit_length = s->pos * 8 + s->bit_offset;

  // NB: backend should have freed backend_state
  h_free(s);

  return result;
}

View file

@ -140,6 +140,8 @@ typedef struct HParser_ {
HCFChoice *desugared; /* if the parser can be desugared, its desugared form */
} HParser;
typedef struct HSuspendedParser_ HSuspendedParser;
/**
* Type of an action to apply to an AST, used in the action() parser.
* It can be any (user-defined) function that takes a HParseResult*
@ -265,6 +267,27 @@ typedef struct HBenchmarkResults_ {
*/
HAMMER_FN_DECL(HParseResult*, h_parse, const HParser* parser, const uint8_t* input, size_t length);
/**
* Initialize a parser for iteratively consuming an input stream in chunks.
* This is only supported by some backends.
*
* Result is NULL if not supported by the backend.
*/
HAMMER_FN_DECL(HSuspendedParser*, h_parse_start, const HParser* parser);
/**
* Run a suspended parser (as returned by h_parse_start) on a chunk of input.
*
* Returns true if the parser is done (needs no more input).
*/
bool h_parse_chunk(HSuspendedParser* s, const uint8_t* input, size_t length);
/**
* Finish an iterative parse. Signals the end of input to the backend and
* returns the parse result.
*/
HParseResult* h_parse_finish(HSuspendedParser* s);
/**
* Given a string, returns a parser that parses that string value.
*

View file

@ -72,13 +72,15 @@ typedef struct HCFStack_ HCFStack;
typedef struct HInputStream_ {
// This should be considered to be a really big value type.
const uint8_t *input;
size_t pos; // position of this chunk in a multi-chunk stream
size_t index;
size_t length;
char bit_offset;
char margin; // The number of bits on the end that is being read
// towards that should be ignored.
char endianness;
char overrun;
bool overrun;
bool last_chunk;
} HInputStream;
typedef struct HSlistNode_ {
@ -210,10 +212,32 @@ struct HParseState_ {
HSlist *symbol_table; // its contents are HHashTables
};
// State of a chunk-wise (suspended) parse, carried between the calls to
// h_parse_start, h_parse_chunk and h_parse_finish.
struct HSuspendedParser_ {
HAllocator *mm__; // allocator this object was created with; also passed on to backends
const HParser *parser; // the parser being run
void *backend_state; // opaque per-backend state, set up by the backend's parse_start
bool done; // last value reported by the backend's parse_chunk
// input stream state
size_t pos; // running sum of the stream indices reached in the chunks fed so far
uint8_t bit_offset; // bit offset left in the stream after the most recent chunk
uint8_t endianness; // endianness handed to, and read back from, each chunk's stream
};
// Table of entry points a parser backend provides. compile/parse/free cover
// one-shot parsing; parse_start/parse_chunk/parse_finish implement chunk-wise
// parsing. h_parse_start__m returns NULL when a backend's parse_start is
// NULL, so the chunk-wise hooks are optional.
typedef struct HParserBackendVTable_ {
// one-shot interface
int (*compile)(HAllocator *mm__, HParser* parser, const void* params);
HParseResult* (*parse)(HAllocator *mm__, const HParser* parser, HInputStream* stream);
void (*free)(HParser* parser);
// chunk-wise interface
void (*parse_start)(HSuspendedParser *s);
// parse_start should allocate s->backend_state.
bool (*parse_chunk)(HSuspendedParser *s, HInputStream *input);
// if parser is done, return true. otherwise:
// parse_chunk MUST consume all input, integrating it into s->backend_state.
// parse_chunk will not be called again after it reports done.
HParseResult *(*parse_finish)(HSuspendedParser *s);
// parse_finish must free s->backend_state.
// parse_finish will not be called before parse_chunk reports done.
} HParserBackendVTable;

View file

@ -443,6 +443,115 @@ static void test_rightrec(gconstpointer backend) {
g_check_parse_match(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "aaa", 3, "(u0x61 (u0x61 (u0x61)))");
}
// Chunk-wise parsing test: each check feeds the input to the given backend as
// two separate chunks and compares the outcome with the expected result (or
// expects failure).
static void test_iterative(gconstpointer backend) {
HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend);
HParser *p;
// a single token split across the chunk boundary
p = h_token((uint8_t*)"foobar", 6);
g_check_parse_chunks_match(p, be, "foo",3, "bar",3, "<66.6f.6f.62.61.72>");
// trailing input after the token does not prevent a match
g_check_parse_chunks_match(p, be, "foo",3, "barbaz",6, "<66.6f.6f.62.61.72>");
// mismatches in the first chunk, at the boundary, and at the very end
g_check_parse_chunks_failed(p, be, "fou",3, "bar",3);
g_check_parse_chunks_failed(p, be, "foo",3, "par",3);
g_check_parse_chunks_failed(p, be, "foo",3, "baz",3);
// a sequence whose middle element straddles the chunk boundary
p = h_sequence(h_ch('f'), h_token((uint8_t*)"ooba", 4), h_ch('r'), NULL);
g_check_parse_chunks_match(p, be, "foo",3, "bar",3, "(u0x66 <6f.6f.62.61> u0x72)");
g_check_parse_chunks_match(p, be, "foo",3, "barbaz",6, "(u0x66 <6f.6f.62.61> u0x72)");
g_check_parse_chunks_failed(p, be, "fou",3, "bar",3);
g_check_parse_chunks_failed(p, be, "foo",3, "par",3);
g_check_parse_chunks_failed(p, be, "foo",3, "baz",3);
// a choice between two tokens, each split across the boundary
p = h_choice(h_token((uint8_t*)"foobar", 6),
h_token((uint8_t*)"phupar", 6), NULL);
g_check_parse_chunks_match(p, be, "foo",3, "bar",3, "<66.6f.6f.62.61.72>");
g_check_parse_chunks_match(p, be, "foo",3, "barbaz",6, "<66.6f.6f.62.61.72>");
g_check_parse_chunks_match(p, be, "phu",3, "par",3, "<70.68.75.70.61.72>");
g_check_parse_chunks_failed(p, be, "fou",3, "bar",3);
g_check_parse_chunks_failed(p, be, "foo",3, "baz",3);
// empty chunks: all input in one chunk, and incomplete input
g_check_parse_chunks_match(p, be, "foobar",6, "",0, "<66.6f.6f.62.61.72>");
g_check_parse_chunks_match(p, be, "",0, "foobar",6, "<66.6f.6f.62.61.72>");
g_check_parse_chunks_failed(p, be, "foo",3, "",0);
g_check_parse_chunks_failed(p, be, "",0, "foo",3);
// a choice nested in a sequence; the alternative is picked in the second chunk
p = h_sequence(h_ch('f'), h_choice(h_token((uint8_t*)"oo", 2),
h_token((uint8_t*)"uu", 2), NULL), NULL);
g_check_parse_chunks_match(p, be, "f",1, "oo",2, "(u0x66 <6f.6f>)");
g_check_parse_chunks_match(p, be, "f",1, "uu",2, "(u0x66 <75.75>)");
g_check_parse_chunks_failed(p, be, "g",1, "oo",2);
g_check_parse_chunks_failed(p, be, "f",1, "ou",2);
g_check_parse_chunks_failed(p, be, "f",1, "uo",2);
}
// Chunk-wise parsing test for a grammar whose two alternatives share their
// first byte, with the chunk boundary placed between the two bytes that
// distinguish them.
static void test_iterative_lookahead(gconstpointer backend) {
HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend);
HParser *p;
// needs 2 lookahead
p = h_sequence(h_ch('f'), h_choice(h_token((uint8_t*)"oo", 2),
h_token((uint8_t*)"ou", 2), NULL), NULL);
// compiled explicitly so the params argument can be passed
// (presumably 2 selects the lookahead depth; confirm against the backends)
if(h_compile(p, be, (void *)2) != 0) {
g_test_message("Compile failed");
g_test_fail();
return;
}
// partial chunk consumed
// the underscore variants of the check macros do not recompile the parser
g_check_parse_chunks_match_(p, "fo",2, "o",1, "(u0x66 <6f.6f>)");
g_check_parse_chunks_match_(p, "fo",2, "u",1, "(u0x66 <6f.75>)");
g_check_parse_chunks_failed_(p, "go",2, "o",1);
g_check_parse_chunks_failed_(p, "fa",2, "u",1);
g_check_parse_chunks_failed_(p, "fo",2, "b",1);
}
// Checks the bit_length reported for a chunk-wise parse: the 6-byte token
// "foobar" is fed as three chunks with trailing bytes after the token, and
// the result must report 48 bits.
static void test_iterative_result_length(gconstpointer backend) {
  const HParserBackend which = (HParserBackend)GPOINTER_TO_INT(backend);
  HParser *tok = h_token((uint8_t*)"foobar", 6);

  if(h_compile(tok, which, NULL)) {
    g_test_message("Compile failed");
    g_test_fail();
    return;
  }

  HSuspendedParser *susp = h_parse_start(tok);
  if(susp == NULL) {
    g_test_message("Chunked parsing not available");
    g_test_fail();
    return;
  }

  // "foobarbaz" in three pieces; the token ends inside the last one
  static const struct {
    const char *bytes;
    size_t len;
  } pieces[] = { { "foo", 3 }, { "ba", 2 }, { "rbaz", 4 } };

  for(size_t i = 0; i < sizeof pieces / sizeof pieces[0]; i++)
    h_parse_chunk(susp, (const uint8_t*)pieces[i].bytes, pieces[i].len);

  HParseResult *r = h_parse_finish(susp);
  if(r == NULL) {
    g_test_message("Parse failed");
    g_test_fail();
    return;
  }

  g_check_cmp_int64(r->bit_length, ==, 48);
}
// Checks the bit_length reported for a one-shot parse: the 3-byte token "foo"
// is parsed from the 6-byte input "foobar", and the result must report
// 24 bits.
static void test_result_length(gconstpointer backend) {
  const HParserBackend which = (HParserBackend)GPOINTER_TO_INT(backend);
  HParser *tok = h_token((uint8_t*)"foo", 3);

  if(h_compile(tok, which, NULL)) {
    g_test_message("Compile failed");
    g_test_fail();
    return;
  }

  HParseResult *r = h_parse(tok, (uint8_t*)"foobar", 6);
  if(r == NULL) {
    g_test_message("Parse failed");
    g_test_fail();
    return;
  }

  g_check_cmp_int64(r->bit_length, ==, 24);
}
static void test_ambiguous(gconstpointer backend) {
HParser *d_ = h_ch('d');
HParser *p_ = h_ch('+');
@ -653,6 +762,7 @@ void register_parser_tests(void) {
g_test_add_data_func("/core/parser/packrat/putget", GINT_TO_POINTER(PB_PACKRAT), test_put_get);
g_test_add_data_func("/core/parser/packrat/permutation", GINT_TO_POINTER(PB_PACKRAT), test_permutation);
g_test_add_data_func("/core/parser/packrat/bind", GINT_TO_POINTER(PB_PACKRAT), test_bind);
g_test_add_data_func("/core/parser/packrat/result_length", GINT_TO_POINTER(PB_PACKRAT), test_result_length);
g_test_add_data_func("/core/parser/llk/token", GINT_TO_POINTER(PB_LLk), test_token);
g_test_add_data_func("/core/parser/llk/ch", GINT_TO_POINTER(PB_LLk), test_ch);
@ -691,6 +801,10 @@ void register_parser_tests(void) {
g_test_add_data_func("/core/parser/llk/ignore", GINT_TO_POINTER(PB_LLk), test_ignore);
//g_test_add_data_func("/core/parser/llk/leftrec", GINT_TO_POINTER(PB_LLk), test_leftrec);
g_test_add_data_func("/core/parser/llk/rightrec", GINT_TO_POINTER(PB_LLk), test_rightrec);
g_test_add_data_func("/core/parser/llk/result_length", GINT_TO_POINTER(PB_LLk), test_result_length);
g_test_add_data_func("/core/parser/llk/iterative", GINT_TO_POINTER(PB_LLk), test_iterative);
g_test_add_data_func("/core/parser/llk/iterative/lookahead", GINT_TO_POINTER(PB_LLk), test_iterative_lookahead);
g_test_add_data_func("/core/parser/llk/iterative/result_length", GINT_TO_POINTER(PB_LLk), test_iterative_result_length);
g_test_add_data_func("/core/parser/regex/token", GINT_TO_POINTER(PB_REGULAR), test_token);
g_test_add_data_func("/core/parser/regex/ch", GINT_TO_POINTER(PB_REGULAR), test_ch);
@ -703,8 +817,8 @@ void register_parser_tests(void) {
g_test_add_data_func("/core/parser/regex/uint32", GINT_TO_POINTER(PB_REGULAR), test_uint32);
g_test_add_data_func("/core/parser/regex/uint16", GINT_TO_POINTER(PB_REGULAR), test_uint16);
g_test_add_data_func("/core/parser/regex/uint8", GINT_TO_POINTER(PB_REGULAR), test_uint8);
g_test_add_data_func("/core/parser/regex/int_range", GINT_TO_POINTER(PB_REGULAR), test_int_range);
#if 0
g_test_add_data_func("/core/parser/regex/int_range", GINT_TO_POINTER(PB_REGULAR), test_int_range);
g_test_add_data_func("/core/parser/regex/float64", GINT_TO_POINTER(PB_REGULAR), test_float64);
g_test_add_data_func("/core/parser/regex/float32", GINT_TO_POINTER(PB_REGULAR), test_float32);
#endif
@ -728,6 +842,7 @@ void register_parser_tests(void) {
g_test_add_data_func("/core/parser/regex/epsilon_p", GINT_TO_POINTER(PB_REGULAR), test_epsilon_p);
g_test_add_data_func("/core/parser/regex/attr_bool", GINT_TO_POINTER(PB_REGULAR), test_attr_bool);
g_test_add_data_func("/core/parser/regex/ignore", GINT_TO_POINTER(PB_REGULAR), test_ignore);
g_test_add_data_func("/core/parser/regex/result_length", GINT_TO_POINTER(PB_REGULAR), test_result_length);
g_test_add_data_func("/core/parser/lalr/token", GINT_TO_POINTER(PB_LALR), test_token);
g_test_add_data_func("/core/parser/lalr/ch", GINT_TO_POINTER(PB_LALR), test_ch);
@ -767,6 +882,10 @@ void register_parser_tests(void) {
g_test_add_data_func("/core/parser/lalr/leftrec", GINT_TO_POINTER(PB_LALR), test_leftrec);
g_test_add_data_func("/core/parser/lalr/leftrec-ne", GINT_TO_POINTER(PB_LALR), test_leftrec_ne);
g_test_add_data_func("/core/parser/lalr/rightrec", GINT_TO_POINTER(PB_LALR), test_rightrec);
g_test_add_data_func("/core/parser/lalr/result_length", GINT_TO_POINTER(PB_LALR), test_result_length);
g_test_add_data_func("/core/parser/lalr/iterative", GINT_TO_POINTER(PB_LALR), test_iterative);
g_test_add_data_func("/core/parser/lalr/iterative/lookahead", GINT_TO_POINTER(PB_LALR), test_iterative_lookahead);
g_test_add_data_func("/core/parser/lalr/iterative/result_length", GINT_TO_POINTER(PB_LALR), test_iterative_result_length);
g_test_add_data_func("/core/parser/glr/token", GINT_TO_POINTER(PB_GLR), test_token);
g_test_add_data_func("/core/parser/glr/ch", GINT_TO_POINTER(PB_GLR), test_ch);
@ -807,4 +926,5 @@ void register_parser_tests(void) {
g_test_add_data_func("/core/parser/glr/leftrec-ne", GINT_TO_POINTER(PB_GLR), test_leftrec_ne);
g_test_add_data_func("/core/parser/glr/rightrec", GINT_TO_POINTER(PB_GLR), test_rightrec);
g_test_add_data_func("/core/parser/glr/ambiguous", GINT_TO_POINTER(PB_GLR), test_ambiguous);
g_test_add_data_func("/core/parser/glr/result_length", GINT_TO_POINTER(PB_GLR), test_result_length);
}

View file

@ -90,7 +90,8 @@
#define g_check_parse_failed(parser, backend, input, inp_len) do { \
int skip = h_compile((HParser *)(parser), (HParserBackend)backend, NULL); \
if(skip != 0) { \
g_test_message("Backend not applicable, skipping test"); \
g_test_message("Compile failed"); \
g_test_fail(); \
break; \
} \
const HParseResult *result = h_parse(parser, (const uint8_t*)input, inp_len); \
@ -103,7 +104,8 @@
#define g_check_parse_ok(parser, backend, input, inp_len) do { \
int skip = h_compile((HParser *)(parser), (HParserBackend) backend, NULL); \
if(skip) { \
g_test_message("Backend not applicable, skipping test"); \
g_test_message("Compile failed"); \
g_test_fail(); \
break; \
} \
HParseResult *res = h_parse(parser, (const uint8_t*)input, inp_len); \
@ -124,7 +126,8 @@
#define g_check_parse_match(parser, backend, input, inp_len, result) do { \
int skip = h_compile((HParser *)(parser), (HParserBackend) backend, NULL); \
if(skip) { \
g_test_message("Backend not applicable, skipping test"); \
g_test_message("Compile failed"); \
g_test_fail(); \
break; \
} \
HParseResult *res = h_parse(parser, (const uint8_t*)input, inp_len); \
@ -145,6 +148,69 @@
} \
} while(0)
/* Compiles 'parser' for 'backend' (NULL params), then checks that feeding the
 * two chunks one after the other yields no parse. Marks the test failed if
 * compilation fails. The actual chunk-wise check is done by
 * g_check_parse_chunks_failed_. */
#define g_check_parse_chunks_failed(parser, backend, chunk1, c1_len, chunk2, c2_len) do { \
int skip = h_compile((HParser *)(parser), (HParserBackend)backend, NULL); \
if(skip) { \
g_test_message("Compile failed"); \
g_test_fail(); \
break; \
} \
g_check_parse_chunks_failed_(parser, chunk1, c1_len, chunk2, c2_len); \
} while(0)
/* Checks that an already-compiled 'parser' rejects the input formed by
 * chunk1 followed by chunk2 when it is fed chunk by chunk. Marks the test
 * failed if chunk-wise parsing cannot be started or if a result is returned.
 * The return values of h_parse_chunk are ignored; only the final result
 * counts. */
#define g_check_parse_chunks_failed_(parser, chunk1, c1_len, chunk2, c2_len) do { \
HSuspendedParser *s = h_parse_start(parser); \
if(!s) { \
g_test_message("Chunk-wise parsing not available"); \
g_test_fail(); \
break; \
} \
h_parse_chunk(s, (const uint8_t*)chunk1, c1_len); \
h_parse_chunk(s, (const uint8_t*)chunk2, c2_len); \
const HParseResult *res = h_parse_finish(s); \
if (NULL != res) { \
g_test_message("Check failed: shouldn't have succeeded, but did"); \
g_test_fail(); \
} \
} while(0)
/* Compiles 'parser' for 'backend' (NULL params), then checks that feeding the
 * two chunks one after the other produces a parse whose printed form equals
 * 'result'. Marks the test failed if compilation fails. The actual chunk-wise
 * check is done by g_check_parse_chunks_match_. */
#define g_check_parse_chunks_match(parser, backend, chunk1, c1_len, chunk2, c2_len, result) do { \
int skip = h_compile((HParser *)(parser), (HParserBackend) backend, NULL); \
if(skip) { \
g_test_message("Compile failed"); \
g_test_fail(); \
break; \
} \
g_check_parse_chunks_match_(parser, chunk1, c1_len, chunk2, c2_len, result); \
} while(0)
/* Checks that an already-compiled 'parser' accepts the input formed by chunk1
 * followed by chunk2 when it is fed chunk by chunk, and that the resulting
 * AST, rendered with h_write_result_unamb, equals the string 'result'.
 * Marks the test failed if chunk-wise parsing cannot be started or if no
 * result is returned. On success it also logs arena usage statistics, frees
 * the rendered string and deletes the result's arena. */
#define g_check_parse_chunks_match_(parser, chunk1, c1_len, chunk2, c2_len, result) do { \
HSuspendedParser *s = h_parse_start(parser); \
if(!s) { \
g_test_message("Chunk-wise parsing not available"); \
g_test_fail(); \
break; \
} \
h_parse_chunk(s, (const uint8_t*)chunk1, c1_len); \
h_parse_chunk(s, (const uint8_t*)chunk2, c2_len); \
HParseResult *res = h_parse_finish(s); \
if (!res) { \
g_test_message("Parse failed on line %d", __LINE__); \
g_test_fail(); \
} else { \
char* cres = h_write_result_unamb(res->ast); \
g_check_string(cres, ==, result); \
(&system_allocator)->free(&system_allocator, cres); \
HArenaStats stats; \
h_allocator_stats(res->arena, &stats); \
g_test_message("Parse used %zd bytes, wasted %zd bytes. " \
"Inefficiency: %5f%%", \
stats.used, stats.wasted, \
stats.wasted * 100. / (stats.used+stats.wasted)); \
h_delete_arena(res->arena); \
} \
} while(0)
#define g_check_hashtable_present(table, key) do { \
if(!h_hashtable_present(table, key)) { \
g_test_message("Check failed: key should have been in table, but wasn't"); \