From 7a681faeeb8c9a321c0bf5f34a41b60cfbc8ed63 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 23 May 2013 11:57:46 +0200 Subject: [PATCH 01/95] remove a dead line --- src/cfgrammar.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index a5a9b1a..b056261 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -463,8 +463,6 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) const HCFStringMap *first_tail = h_first_seq(k, g, tail); - //h_stringmap_update(ret, first_tail); - // extend the elems of first_k(tail) up to length k from follow(A) stringset_extend(g, ret, k, first_tail, h_follow_, &a); } From 9112452709023e91485a4e6f570ae13817d31fa8 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 23 May 2013 14:19:37 +0200 Subject: [PATCH 02/95] fix incorrect usage of h_stringset_put_char in stringset_extend --- src/cfgrammar.c | 9 +++++++-- src/cfgrammar.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index b056261..d45da2e 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -235,11 +235,16 @@ void h_stringmap_put_epsilon(HCFStringMap *m, void *v) m->epsilon_branch = v; } +void h_stringmap_put_after(HCFStringMap *m, uint8_t c, HCFStringMap *ends) +{ + h_hashtable_put(m->char_branches, (void *)char_key(c), ends); +} + void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v) { HCFStringMap *node = h_stringmap_new(m->arena); h_stringmap_put_epsilon(node, v); - h_hashtable_put(m->char_branches, (void *)char_key(c), node); + h_stringmap_put_after(m, c, node); } // helper for h_stringmap_update @@ -505,7 +510,7 @@ static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, // t { a b | a <- as_, b <- f_l(tail), l=k-|a|-1 } // so we can use recursion over k HCFStringMap *ret_ = h_stringmap_new(g->arena); - h_stringmap_put_char(ret, c, ret_); + h_stringmap_put_after(ret, c, ret_); stringset_extend(g, ret_, k-1, as_, f, tail); } 
diff --git a/src/cfgrammar.h b/src/cfgrammar.h index cec5d6e..eb53b01 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -42,6 +42,7 @@ typedef struct HCFStringMap_ { HCFStringMap *h_stringmap_new(HArena *a); void h_stringmap_put_end(HCFStringMap *m, void *v); void h_stringmap_put_epsilon(HCFStringMap *m, void *v); +void h_stringmap_put_after(HCFStringMap *m, uint8_t c, HCFStringMap *ends); void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v); void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n); void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end); From d789d90017ce925fc460e7ccaf4ae26256f780c6 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 23 May 2013 14:42:16 +0200 Subject: [PATCH 03/95] fix comma-separation some more in h_pprint_stringset --- src/cfgrammar.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index d45da2e..972cc4d 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -719,7 +719,9 @@ void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, in #define BUFSIZE 512 -void pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, const HCFStringMap *set) +static bool +pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, + const HCFStringMap *set) { assert(n < BUFSIZE-4); @@ -764,9 +766,11 @@ void pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, cons n_ += sprintf(prefix+n_, "\\x%.2X", c); } - pprint_stringset_elems(file, first, prefix, n_, ends); + first = pprint_stringset_elems(file, first, prefix, n_, ends); } } + + return first; } void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent) From 3ff32e86e4aebeaaf72b45d59a4d6ca13500d226 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Thu, 23 May 2013 14:42:43 +0200 Subject: [PATCH 04/95] make dummy example require LL(2) --- src/backends/llk.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 6e8a983..3337ebc 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -357,9 +357,11 @@ int test_llk(void) Y -> y -- for k=3 use "yy" */ - HParser *c = h_many(h_ch('x')); - HParser *q = h_sequence(c, h_ch('y'), NULL); - HParser *p = h_choice(q, h_end_p(), NULL); + HParser *X = h_optional(h_ch('x')); + HParser *Y = h_sequence(h_ch('y'), NULL); + HParser *A = h_sequence(X, Y, h_ch('a'), NULL); + HParser *B = h_sequence(Y, h_ch('b'), NULL); + HParser *p = h_choice(A, B, NULL); HCFGrammar *g = h_cfgrammar(&system_allocator, p); @@ -372,13 +374,16 @@ int test_llk(void) printf("derive epsilon: "); h_pprint_symbolset(stdout, g, g->geneps, 0); printf("first(A) = "); - h_pprint_stringset(stdout, g, h_first(2, g, g->start), 0); - printf("follow(C) = "); - h_pprint_stringset(stdout, g, h_follow(2, g, h_desugar(&system_allocator, c)), 0); + h_pprint_stringset(stdout, g, h_first(3, g, g->start), 0); + //printf("follow(C) = "); + //h_pprint_stringset(stdout, g, h_follow(3, g, h_desugar(&system_allocator, c)), 0); - h_compile(p, PB_LLk, NULL); + if(h_compile(p, PB_LLk, NULL)) { + fprintf(stderr, "does not compile\n"); + return 2; + } - HParseResult *res = h_parse(p, (uint8_t *)"xxy", 3); + HParseResult *res = h_parse(p, (uint8_t *)"xya", 3); if(res) h_pprint(stdout, res->ast, 0, 2); else From 5e3c681dbc0e9e114a0edf5f37a0048ff39f74d6 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 23 May 2013 21:01:37 +0200 Subject: [PATCH 05/95] generalize most of llk.c to arbitrary k (ex. 
h_predict) - still bugged --- src/backends/llk.c | 200 ++++++++++++++++++++++++++++++++------------- src/cfgrammar.c | 28 ++++++- src/cfgrammar.h | 4 + 3 files changed, 172 insertions(+), 60 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 3337ebc..bef5aa6 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -3,17 +3,16 @@ #include "../cfgrammar.h" #include "../parsers/parser_internal.h" -// XXX despite the names, this is all LL(1) right now. TODO - /* Generating the LL(k) parse table */ -/* Maps each nonterminal (HCFChoice) of the grammar to another hash table that - * maps lookahead tokens (HCFToken) to productions (HCFSequence). +/* Maps each nonterminal (HCFChoice) of the grammar to a HCFStringMap that + * maps lookahead strings to productions (HCFSequence). */ typedef struct HLLkTable_ { HHashTable *rows; HCFChoice *start; // start symbol + size_t k; // lookahead depth XXX needed? HArena *arena; HAllocator *mm__; } HLLkTable; @@ -28,20 +27,34 @@ static const HCFToken end_token = 0x200; const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, HInputStream lookahead) { - // note the lookahead stream is passed by value, i.e. a copy. - // reading bits from it does not consume them from the real input. - HCFToken tok; - uint8_t c = h_read_bits(&lookahead, 8, false); - if(lookahead.overrun) - tok = end_token; - else - tok = char_token(c); - - const HHashTable *row = h_hashtable_get(table->rows, x); + const HCFStringMap *row = h_hashtable_get(table->rows, x); assert(row != NULL); // the table should have one row for each nonterminal - const HCFSequence *production = h_hashtable_get(row, (void *)tok); - return production; + assert(!row->epsilon_branch); // would match without looking at the input + // XXX cases where this could be useful? + + const HCFStringMap *m = row; + while(m) { + if(m->epsilon_branch) { // input matched + // assert: another lookahead would not bring a more specific match. 
+ // this is for the table generator to ensure. + return m->epsilon_branch; + } + + // note the lookahead stream is passed by value, i.e. a copy. + // reading bits from it does not consume them from the real input. + uint8_t c = h_read_bits(&lookahead, 8, false); + + if(lookahead.overrun) { // end of input + // XXX assumption of byte-wise grammar and input + return m->end_branch; + } + + // no match yet, descend + m = h_stringmap_get_char(m, c); + } + + return NULL; } /* Allocate a new parse table. */ @@ -72,58 +85,126 @@ void h_llktable_free(HLLkTable *table) h_free(table); } -/* Compute the predict set of production "A -> rhs". */ -HHashSet *h_predict(HCFGrammar *g, const HCFChoice *A, const HCFSequence *rhs) +/* Compute the predict_k set of production "A -> rhs". + * Always returns a newly-allocated HCFStringMap. + */ +HCFStringMap *h_predict(size_t k, HCFGrammar *g, + const HCFChoice *A, const HCFSequence *rhs) { + assert(k==1); // XXX + HCFStringMap *ret = h_stringmap_new(g->arena); + // predict(A -> rhs) = first(rhs) u follow(A) if "" can be derived from rhs // predict(A -> rhs) = first(rhs) otherwise - const HCFStringMap *first_rhs = h_first_seq(1, g, rhs->items); - const HCFStringMap *follow_A = h_follow(1, g, A); - HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); - h_hashset_put_all(ret, first_rhs->char_branches); - if(first_rhs->end_branch) - h_hashset_put(ret, (void *)end_token); + h_stringmap_update(ret, h_first_seq(k, g, rhs->items)); + if(h_derives_epsilon_seq(g, rhs->items)) + h_stringmap_update(ret, h_follow(k, g, A)); - if(h_derives_epsilon_seq(g, rhs->items)) { - h_hashset_put_all(ret, follow_A->char_branches); - if(follow_A->end_branch) - h_hashset_put(ret, (void *)end_token); - } + // make sure there are only strings of length _exactly_ k + ret->epsilon_branch = NULL; return ret; } -/* Generate entries for the production "A -> rhs" in the given table row. 
*/ -static -int fill_table_row(HCFGrammar *g, HHashTable *row, - const HCFChoice *A, HCFSequence *rhs) +void *const CONFLICT = (void *)(uintptr_t)(-1); + +static HHashSet *cte_workset; // emulating a closure +static void *combine_table_entry(void *dst, const void *src) { - // iterate over predict(A -> rhs) - HHashSet *pred = h_predict(g, A, rhs); + if(dst == CONFLICT) { // previous conflict + h_hashset_put(cte_workset, src); + } else if(dst != src) { // new conflict + h_hashset_put(cte_workset, dst); + h_hashset_put(cte_workset, src); + dst = CONFLICT; + } + return dst; +} - size_t i; - HHashTableEntry *hte; - for(i=0; i < pred->capacity; i++) { - for(hte = &pred->contents[i]; hte; hte = hte->next) { - if(hte->key == NULL) - continue; - HCFToken x = (uintptr_t)hte->key; - - if(h_hashtable_present(row, (void *)x)) - return -1; // table would be ambiguous - - h_hashtable_put(row, (void *)x, rhs); - } +// add the mappings of src to dst, calling combine if there is a collision +// note: might reuse parts of src in building up dst! 
+static void stringmap_merge(void *(*combine)(void *, const void *), + HCFStringMap *dst, HCFStringMap *src) +{ + if(src->epsilon_branch) { + if(dst->epsilon_branch) + dst->epsilon_branch = combine(dst->epsilon_branch, src->epsilon_branch); + else + dst->epsilon_branch = src->epsilon_branch; } + if(src->end_branch) { + if(dst->end_branch) + dst->end_branch = combine(dst->end_branch, src->end_branch); + else + dst->end_branch = src->end_branch; + } + + // iterate over src->char_branches + const HHashTable *ht = src->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + HCharKey c = (HCharKey)hte->key; + HCFStringMap *src_ = hte->value; + + if(src_) { + HCFStringMap *dst_ = h_hashtable_get(dst->char_branches, (void *)c); + if(dst_) + stringmap_merge(combine, dst_, src_); + else + dst_ = src_; + } + } + } +} + +/* Generate entries for the production "A -> rhs" in the given table row. */ +static int fill_production_entries(size_t k, HCFGrammar *g, HCFStringMap *row, + const HCFChoice *A, HCFSequence *rhs) +{ + + for(size_t i=1; i<=k; i++) { + HCFStringMap *pred = h_predict(i, g, A, rhs); + h_stringmap_replace(pred, NULL, rhs); // make all values in pred map to rhs + + // clear previous conflict markers + h_stringmap_replace(row, CONFLICT, NULL); + + // merge predict set into the row, accumulating conflicts in workset + cte_workset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + // will be deleted after compile + stringmap_merge(combine_table_entry, row, pred); + + // if the workset is empty, row is free of conflicts and we are done. + if(h_hashset_empty(cte_workset)) + return 0; + } + + // if we reach here, conflicts remain at maximum lookahead + return -1; +} + +/* Generate entries for the production "A" in the given table row. 
*/ +static int fill_table_row(size_t k, HCFGrammar *g, HCFStringMap *row, + const HCFChoice *A) +{ + // iterate over A's productions + for(HCFSequence **s = A->seq; *s; s++) { + // record this production in row as appropriate + if(fill_production_entries(k, g, row, A, *s) < 0) + return -1; + } return 0; } /* Generate the LL(k) parse table from the given grammar. * Returns -1 on error, 0 on success. */ -static int fill_table(HCFGrammar *g, HLLkTable *table) +static int fill_table(size_t k, HCFGrammar *g, HLLkTable *table) { table->start = g->start; @@ -138,18 +219,14 @@ static int fill_table(HCFGrammar *g, HLLkTable *table) assert(a->type == HCF_CHOICE); // create table row for this nonterminal - HHashTable *row = h_hashtable_new(table->arena, h_eq_ptr, h_hash_ptr); + HCFStringMap *row = h_stringmap_new(table->arena); h_hashtable_put(table->rows, a, row); - // iterate over a's productions - HCFSequence **s; - for(s = a->seq; *s; s++) { - // record this production in row as appropriate - // this can signal an ambiguity conflict. + if(fill_table_row(k, g, row, a) < 0) { + // unresolvable conflicts in row // NB we don't worry about deallocating anything, h_llk_compile will // delete the whole arena for us. - if(fill_table_row(g, row, a, *s) < 0) - return -1; + return -1; } } } @@ -157,8 +234,13 @@ static int fill_table(HCFGrammar *g, HLLkTable *table) return 0; } +static const size_t K_DEFAULT = 1; + int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params) { + size_t k = params? (uintptr_t)params : K_DEFAULT; + assert(k>0); + // Convert parser to a CFG. This can fail as indicated by a NULL return. HCFGrammar *grammar = h_cfgrammar(mm__, parser); if(grammar == NULL) @@ -170,7 +252,7 @@ int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params) // generate table and store in parser->backend_data. 
 HLLkTable *table = h_llktable_new(mm__); - if(fill_table(grammar, table) < 0) { + if(fill_table(k, grammar, table) < 0) { // the table was ambiguous h_cfgrammar_free(grammar); h_llktable_free(table); @@ -358,7 +440,7 @@ int test_llk(void) */ HParser *X = h_optional(h_ch('x')); - HParser *Y = h_sequence(h_ch('y'), NULL); + HParser *Y = h_epsilon_p(); //h_sequence(h_ch('y'), NULL); HParser *A = h_sequence(X, Y, h_ch('a'), NULL); HParser *B = h_sequence(Y, h_ch('b'), NULL); HParser *p = h_choice(A, B, NULL); diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 972cc4d..b694197 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -265,12 +265,38 @@ void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n) h_hashtable_merge(combine_stringmap, m->char_branches, n->char_branches); } +/* Replace all occurances of old in m with new. + * If old is NULL, replace all values in m with new. + * If new is NULL, remove the respective values. + */ +void h_stringmap_replace(HCFStringMap *m, void *old, void *new) +{ + if(!old || m->epsilon_branch == old) + m->epsilon_branch = new; + + if(!old || m->end_branch == old) + m->end_branch = new; + + // iterate over m->char_branches + const HHashTable *ht = m->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + HCFStringMap *m_ = hte->value; + if(m_) + h_stringmap_replace(m_, old, new); + } + } +} + void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end) { for(size_t i=0; i<n; i++) { if(i==n-1 && end && m->end_branch) return m->end_branch; - m = h_hashtable_get(m->char_branches, (void *)char_key(str[i])); + m = h_stringmap_get_char(m, str[i]); if(!m) return NULL; } diff --git a/src/cfgrammar.h b/src/cfgrammar.h index eb53b01..8dc4449 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -45,10 +45,14 @@ void h_stringmap_put_epsilon(HCFStringMap *m, void *v); void h_stringmap_put_after(HCFStringMap *m, uint8_t c, HCFStringMap 
*ends); void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v); void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n); +void h_stringmap_replace(HCFStringMap *m, void *old, void *new); void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end); bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end); bool h_stringmap_present_epsilon(const HCFStringMap *m); +static inline void *h_stringmap_get_char(const HCFStringMap *m, const uint8_t c) + { return h_hashtable_get(m->char_branches, (void *)char_key(c)); } + /* Convert 'parser' into CFG representation by desugaring and compiling the set * of nonterminals. From e5ee61029c4425e1873d355bea235c8b7c54972e Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 23 May 2013 22:19:13 +0200 Subject: [PATCH 06/95] herp-a-derp, that wasn't actually done - still bugged, though --- src/backends/llk.c | 122 ++++++++++++++++++++++++++------------------- src/cfgrammar.c | 2 +- 2 files changed, 71 insertions(+), 53 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index bef5aa6..acd6c5f 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -12,7 +12,6 @@ typedef struct HLLkTable_ { HHashTable *rows; HCFChoice *start; // start symbol - size_t k; // lookahead depth XXX needed? 
HArena *arena; HAllocator *mm__; } HLLkTable; @@ -109,34 +108,40 @@ HCFStringMap *h_predict(size_t k, HCFGrammar *g, void *const CONFLICT = (void *)(uintptr_t)(-1); -static HHashSet *cte_workset; // emulating a closure -static void *combine_table_entry(void *dst, const void *src) +// helper for stringmap_merge +static void *combine_entries(HHashSet *workset, void *dst, const void *src) { + assert(dst != NULL); + assert(src != NULL); + if(dst == CONFLICT) { // previous conflict - h_hashset_put(cte_workset, src); + h_hashset_put(workset, src); } else if(dst != src) { // new conflict - h_hashset_put(cte_workset, dst); - h_hashset_put(cte_workset, src); + h_hashset_put(workset, dst); + h_hashset_put(workset, src); dst = CONFLICT; } + return dst; } -// add the mappings of src to dst, calling combine if there is a collision -// note: might reuse parts of src in building up dst! -static void stringmap_merge(void *(*combine)(void *, const void *), - HCFStringMap *dst, HCFStringMap *src) +// add the mappings of src to dst, marking conflicts and adding the conflicting +// values to workset. +// note: reuses parts of src to build dst! 
+static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap *src) { if(src->epsilon_branch) { if(dst->epsilon_branch) - dst->epsilon_branch = combine(dst->epsilon_branch, src->epsilon_branch); + dst->epsilon_branch = + combine_entries(workset, dst->epsilon_branch, src->epsilon_branch); else dst->epsilon_branch = src->epsilon_branch; } if(src->end_branch) { if(dst->end_branch) - dst->end_branch = combine(dst->end_branch, src->end_branch); + dst->end_branch = + combine_entries(workset, dst->end_branch, src->end_branch); else dst->end_branch = src->end_branch; } @@ -154,7 +159,7 @@ static void stringmap_merge(void *(*combine)(void *, const void *), if(src_) { HCFStringMap *dst_ = h_hashtable_get(dst->char_branches, (void *)c); if(dst_) - stringmap_merge(combine, dst_, src_); + stringmap_merge(workset, dst_, src_); else dst_ = src_; } @@ -162,49 +167,62 @@ static void stringmap_merge(void *(*combine)(void *, const void *), } } -/* Generate entries for the production "A -> rhs" in the given table row. */ -static int fill_production_entries(size_t k, HCFGrammar *g, HCFStringMap *row, - const HCFChoice *A, HCFSequence *rhs) -{ - - for(size_t i=1; i<=k; i++) { - HCFStringMap *pred = h_predict(i, g, A, rhs); - h_stringmap_replace(pred, NULL, rhs); // make all values in pred map to rhs - - // clear previous conflict markers - h_stringmap_replace(row, CONFLICT, NULL); - - // merge predict set into the row, accumulating conflicts in workset - cte_workset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); - // will be deleted after compile - stringmap_merge(combine_table_entry, row, pred); - - // if the workset is empty, row is free of conflicts and we are done. - if(h_hashset_empty(cte_workset)) - return 0; - } - - // if we reach here, conflicts remain at maximum lookahead - return -1; -} - /* Generate entries for the production "A" in the given table row. 
*/ -static int fill_table_row(size_t k, HCFGrammar *g, HCFStringMap *row, +static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, const HCFChoice *A) { - // iterate over A's productions - for(HCFSequence **s = A->seq; *s; s++) { - // record this production in row as appropriate - if(fill_production_entries(k, g, row, A, *s) < 0) - return -1; + HHashSet *workset; // to be deleted after compile + // ~> alloc in g->arena + + // initialize working set to the productions of A + workset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + for(HCFSequence **s = A->seq; *s; s++) + h_hashset_put(workset, *s); + + // run until workset exhausted or kmax hit + size_t k; + for(k=1; k<=kmax; k++) { + // iterate over productions in workset... + const HHashTable *ht = workset; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + HCFSequence *rhs = (void *)hte->key; + assert(rhs != NULL); + assert(rhs != CONFLICT); // just to be sure there's no mixup + + // remove this production from workset + h_hashset_del(workset, rhs); + + // calculate predict set; let values map to rhs + HCFStringMap *pred = h_predict(k, g, A, rhs); + h_stringmap_replace(pred, NULL, rhs); + + // merge predict set into the row; accumulates conflicts in workset + stringmap_merge(workset, row, pred); + } + } + + // if the workset is empty, row is without conflict; we're done + if(h_hashset_empty(workset)) + break; + + // clear conflict markers for next iteration + h_stringmap_replace(row, CONFLICT, NULL); } - return 0; + + if(k>kmax) // conflicts remain + return -1; + else + return 0; } /* Generate the LL(k) parse table from the given grammar. * Returns -1 on error, 0 on success. 
*/ -static int fill_table(size_t k, HCFGrammar *g, HLLkTable *table) +static int fill_table(size_t kmax, HCFGrammar *g, HLLkTable *table) { table->start = g->start; @@ -222,7 +240,7 @@ static int fill_table(size_t k, HCFGrammar *g, HLLkTable *table) HCFStringMap *row = h_stringmap_new(table->arena); h_hashtable_put(table->rows, a, row); - if(fill_table_row(k, g, row, a) < 0) { + if(fill_table_row(kmax, g, row, a) < 0) { // unresolvable conflicts in row // NB we don't worry about deallocating anything, h_llk_compile will // delete the whole arena for us. @@ -234,12 +252,12 @@ static int fill_table(size_t k, HCFGrammar *g, HLLkTable *table) return 0; } -static const size_t K_DEFAULT = 1; +static const size_t DEFAULT_KMAX = 1; int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params) { - size_t k = params? (uintptr_t)params : K_DEFAULT; - assert(k>0); + size_t kmax = params? (uintptr_t)params : DEFAULT_KMAX; + assert(kmax>0); // Convert parser to a CFG. This can fail as indicated by a NULL return. HCFGrammar *grammar = h_cfgrammar(mm__, parser); @@ -252,7 +270,7 @@ int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params) // generate table and store in parser->backend_data. HLLkTable *table = h_llktable_new(mm__); - if(fill_table(k, grammar, table) < 0) { + if(fill_table(kmax, grammar, table) < 0) { // the table was ambiguous h_cfgrammar_free(grammar); h_llktable_free(table); diff --git a/src/cfgrammar.c b/src/cfgrammar.c index b694197..911c0eb 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -248,7 +248,7 @@ void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v) } // helper for h_stringmap_update -void *combine_stringmap(void *v1, void *v2) +static void *combine_stringmap(void *v1, void *v2) { h_stringmap_update((HCFStringMap *)v1, (HCFStringMap *)v2); return v1; From faebe355a82733f4cb4383ec492ec2155ab7f0a5 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Thu, 23 May 2013 22:53:45 +0200 Subject: [PATCH 07/95] fix h_stringmap_replace for the all-values case --- src/cfgrammar.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 911c0eb..9abfd10 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -271,11 +271,13 @@ void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n) */ void h_stringmap_replace(HCFStringMap *m, void *old, void *new) { - if(!old || m->epsilon_branch == old) - m->epsilon_branch = new; - - if(!old || m->end_branch == old) - m->end_branch = new; + if(!old) { + if(m->epsilon_branch) m->epsilon_branch = new; + if(m->end_branch) m->end_branch = new; + } else { + if(m->epsilon_branch == old) m->epsilon_branch = new; + if(m->end_branch == old) m->end_branch = new; + } // iterate over m->char_branches const HHashTable *ht = m->char_branches; From 4c5ca5ceab9efbe76c470661982971b83e14b0fb Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 23 May 2013 22:54:49 +0200 Subject: [PATCH 08/95] clean up a bit --- src/backends/llk.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index acd6c5f..ad09aa5 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -171,8 +171,7 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, const HCFChoice *A) { - HHashSet *workset; // to be deleted after compile - // ~> alloc in g->arena + HHashSet *workset; // initialize working set to the productions of A workset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); @@ -182,7 +181,10 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, // run until workset exhausted or kmax hit size_t k; for(k=1; k<=kmax; k++) { - // iterate over productions in workset... 
+ // allocate a fresh workset for the next round + HHashSet *nextset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + + // iterate over the productions in workset... const HHashTable *ht = workset; for(size_t i=0; i < ht->capacity; i++) { for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { @@ -193,18 +195,20 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, assert(rhs != NULL); assert(rhs != CONFLICT); // just to be sure there's no mixup - // remove this production from workset - h_hashset_del(workset, rhs); - // calculate predict set; let values map to rhs HCFStringMap *pred = h_predict(k, g, A, rhs); h_stringmap_replace(pred, NULL, rhs); - // merge predict set into the row; accumulates conflicts in workset - stringmap_merge(workset, row, pred); + // merge predict set into the row + // accumulates conflicts in new workset + stringmap_merge(nextset, row, pred); } } + // switch to the updated workset + h_hashtable_free(workset); + workset = nextset; + // if the workset is empty, row is without conflict; we're done if(h_hashset_empty(workset)) break; @@ -213,10 +217,8 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, h_stringmap_replace(row, CONFLICT, NULL); } - if(k>kmax) // conflicts remain - return -1; - else - return 0; + h_hashset_free(workset); + return (k>kmax)? -1 : 0; } /* Generate the LL(k) parse table from the given grammar. @@ -483,7 +485,7 @@ int test_llk(void) return 2; } - HParseResult *res = h_parse(p, (uint8_t *)"xya", 3); + HParseResult *res = h_parse(p, (uint8_t *)"xa", 2); if(res) h_pprint(stdout, res->ast, 0, 2); else From f6983a50419140950185178a5d1b183bf3df475f Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Thu, 23 May 2013 23:25:32 +0200 Subject: [PATCH 09/95] debug table generation --- src/backends/llk.c | 33 +++++++++++++++++++++++---------- src/cfgrammar.c | 4 ++-- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index ad09aa5..69fb7ef 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -3,6 +3,8 @@ #include "../cfgrammar.h" #include "../parsers/parser_internal.h" +static const size_t DEFAULT_KMAX = 1; + /* Generating the LL(k) parse table */ @@ -17,11 +19,6 @@ typedef struct HLLkTable_ { } HLLkTable; -// XXX adaptation to LL(1), to be removed -typedef HCharKey HCFToken; -static const HCFToken end_token = 0x200; -#define char_token char_key - /* Interface to look up an entry in the parse table. */ const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, HInputStream lookahead) @@ -116,7 +113,7 @@ static void *combine_entries(HHashSet *workset, void *dst, const void *src) if(dst == CONFLICT) { // previous conflict h_hashset_put(workset, src); - } else if(dst != src) { // new conflict + } else if(dst == src) { // new conflict h_hashset_put(workset, dst); h_hashset_put(workset, src); dst = CONFLICT; @@ -161,12 +158,15 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * if(dst_) stringmap_merge(workset, dst_, src_); else - dst_ = src_; + h_hashtable_put(dst->char_branches, (void *)c, src_); } } } } +void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq); +void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x); + /* Generate entries for the production "A" in the given table row. 
*/ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, const HCFChoice *A) @@ -202,8 +202,23 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, // merge predict set into the row // accumulates conflicts in new workset stringmap_merge(nextset, row, pred); + + // XXX debug + if(A == g->start) { + printf("predict("); + pprint_sequence(stdout, g, rhs); + printf(" ) = "); + h_pprint_stringset(stdout, g, pred, 0); + } } } + // XXX debug + if(A == g->start) { + printf("row("); + pprint_symbol(stdout, g, A); + printf(") = "); + h_pprint_stringset(stdout, g, row, 0); + } // switch to the updated workset h_hashtable_free(workset); @@ -254,8 +269,6 @@ static int fill_table(size_t kmax, HCFGrammar *g, HLLkTable *table) return 0; } -static const size_t DEFAULT_KMAX = 1; - int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params) { size_t kmax = params? (uintptr_t)params : DEFAULT_KMAX; @@ -460,7 +473,7 @@ int test_llk(void) */ HParser *X = h_optional(h_ch('x')); - HParser *Y = h_epsilon_p(); //h_sequence(h_ch('y'), NULL); + HParser *Y = h_sequence(h_ch('y'), NULL); HParser *A = h_sequence(X, Y, h_ch('a'), NULL); HParser *B = h_sequence(Y, h_ch('b'), NULL); HParser *p = h_choice(A, B, NULL); diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 9abfd10..1721122 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -624,7 +624,7 @@ static HCFChoice **pprint_string(FILE *f, HCFChoice **x) return x; } -static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) +void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) { switch(x->type) { case HCF_CHAR: @@ -643,7 +643,7 @@ static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) } } -static void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq) +void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq) { HCFChoice **x = seq->items; From 748845ca0c6862d456d1e1ddd33d2c9e15d02338 Mon Sep 17 
00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 23 May 2013 23:35:10 +0200 Subject: [PATCH 10/95] add API for pretty-printing for grammar symbols and symbol sequences --- src/backends/llk.c | 9 +++------ src/cfgrammar.c | 23 ++++++++++++++--------- src/cfgrammar.h | 2 ++ 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 69fb7ef..d414f8b 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -164,9 +164,6 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * } } -void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq); -void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x); - /* Generate entries for the production "A" in the given table row. */ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, const HCFChoice *A) @@ -206,8 +203,8 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, // XXX debug if(A == g->start) { printf("predict("); - pprint_sequence(stdout, g, rhs); - printf(" ) = "); + h_pprint_sequence(stdout, g, rhs); + printf(") = "); h_pprint_stringset(stdout, g, pred, 0); } } @@ -215,7 +212,7 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, // XXX debug if(A == g->start) { printf("row("); - pprint_symbol(stdout, g, A); + h_pprint_symbol(stdout, g, A); printf(") = "); h_pprint_stringset(stdout, g, row, 0); } diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 1721122..2eb53d9 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -624,7 +624,7 @@ static HCFChoice **pprint_string(FILE *f, HCFChoice **x) return x; } -void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) +void h_pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) { switch(x->type) { case HCF_CHAR: @@ -643,32 +643,37 @@ void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) } } -void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq) 
+void h_pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq) { HCFChoice **x = seq->items; if(*x == NULL) { // the empty sequence - fputs(" \"\"", f); + fputs("\"\"", f); } else { while(*x) { - fputc(' ', f); // separator + if(x != seq->items) fputc(' ', f); // internal separator if((*x)->type == HCF_CHAR) { // condense character strings x = pprint_string(f, x); } else { - pprint_symbol(f, g, *x); + h_pprint_symbol(f, g, *x); x++; } } } +} +// adds some separators expected below +static void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq) +{ + fputc(' ', f); + h_pprint_sequence(f, g, seq); fputc('\n', f); } -static -void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt, - int indent, int len) +static void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt, + int indent, int len) { int i; int column = indent + len; @@ -738,7 +743,7 @@ void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, in a = hte->key; // production's left-hand symbol - pprint_symbol(file, g, a); + h_pprint_symbol(file, g, a); } } diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 8dc4449..5ac70b5 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -83,5 +83,7 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); /* Pretty-printers for grammars and associated data. */ void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent); +void h_pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq); +void h_pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x); void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent); void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent); From 428636f3d0c9cefc80b2d359d10dc9474868692f Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 24 May 2013 01:31:56 +0200 Subject: [PATCH 11/95] h_predict for k>1, more debugging --- src/backends/llk.c | 56 +++++++++++++--------------------------------- src/cfgrammar.c | 37 ++++++++++++++++++++++++++++-- src/cfgrammar.h | 8 ++++++- 3 files changed, 58 insertions(+), 43 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index d414f8b..3f3008e 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -81,28 +81,6 @@ void h_llktable_free(HLLkTable *table) h_free(table); } -/* Compute the predict_k set of production "A -> rhs". - * Always returns a newly-allocated HCFStringMap. - */ -HCFStringMap *h_predict(size_t k, HCFGrammar *g, - const HCFChoice *A, const HCFSequence *rhs) -{ - assert(k==1); // XXX - HCFStringMap *ret = h_stringmap_new(g->arena); - - // predict(A -> rhs) = first(rhs) u follow(A) if "" can be derived from rhs - // predict(A -> rhs) = first(rhs) otherwise - - h_stringmap_update(ret, h_first_seq(k, g, rhs->items)); - if(h_derives_epsilon_seq(g, rhs->items)) - h_stringmap_update(ret, h_follow(k, g, A)); - - // make sure there are only strings of length _exactly_ k - ret->epsilon_branch = NULL; - - return ret; -} - void *const CONFLICT = (void *)(uintptr_t)(-1); // helper for stringmap_merge @@ -113,7 +91,7 @@ static void *combine_entries(HHashSet *workset, void *dst, const void *src) if(dst == CONFLICT) { // previous conflict h_hashset_put(workset, src); - } else if(dst == src) { // new conflict + } else if(dst != src) { // new conflict h_hashset_put(workset, dst); h_hashset_put(workset, src); dst = CONFLICT; @@ -178,6 +156,8 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, // run until workset exhausted or kmax hit size_t k; for(k=1; k<=kmax; k++) { + printf("k=%lu\n", k); // XXX debug + // allocate a fresh workset for the next round HHashSet *nextset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); @@ -196,26 +176,22 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, 
HCFStringMap *row, HCFStringMap *pred = h_predict(k, g, A, rhs); h_stringmap_replace(pred, NULL, rhs); + // XXX debug + printf("predict("); + h_pprint_sequence(stdout, g, rhs); + printf(") = "); + h_pprint_stringset(stdout, pred, 0); + // merge predict set into the row // accumulates conflicts in new workset stringmap_merge(nextset, row, pred); - - // XXX debug - if(A == g->start) { - printf("predict("); - h_pprint_sequence(stdout, g, rhs); - printf(") = "); - h_pprint_stringset(stdout, g, pred, 0); - } } } // XXX debug - if(A == g->start) { - printf("row("); - h_pprint_symbol(stdout, g, A); - printf(") = "); - h_pprint_stringset(stdout, g, row, 0); - } + printf("row("); + h_pprint_symbol(stdout, g, A); + printf(") = "); + h_pprint_stringset(stdout, row, 0); // switch to the updated workset h_hashtable_free(workset); @@ -486,11 +462,11 @@ int test_llk(void) printf("derive epsilon: "); h_pprint_symbolset(stdout, g, g->geneps, 0); printf("first(A) = "); - h_pprint_stringset(stdout, g, h_first(3, g, g->start), 0); + h_pprint_stringset(stdout, h_first(3, g, g->start), 0); //printf("follow(C) = "); - //h_pprint_stringset(stdout, g, h_follow(3, g, h_desugar(&system_allocator, c)), 0); + //h_pprint_stringset(stdout, h_follow(3, g, h_desugar(&system_allocator, c)), 0); - if(h_compile(p, PB_LLk, NULL)) { + if(h_compile(p, PB_LLk, (void *)2)) { fprintf(stderr, "does not compile\n"); return 2; } diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 2eb53d9..d774dd0 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -437,7 +437,23 @@ static bool any_string_shorter(size_t k, const HCFStringMap *m) return false; } -const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); +// helper for h_predict +static void remove_all_shorter(size_t k, HCFStringMap *m) +{ + if(k==0) return; + m->epsilon_branch = NULL; + if(k==1) return; + + // iterate over m->char_branches + const HHashTable *ht = m->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + 
for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + remove_all_shorter(k-1, hte->value); // recursion into subtree + } + } +} // h_follow adapted to the signature of StringSetFun static inline const HCFStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) @@ -507,6 +523,23 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) return ret; } +HCFStringMap *h_predict(size_t k, HCFGrammar *g, + const HCFChoice *A, const HCFSequence *rhs) +{ + HCFStringMap *ret = h_stringmap_new(g->arena); + + // predict_k(A -> rhs) = + // { ab | a <- first_k(rhs), b <- follow_k(A), |ab|=k } + + const HCFStringMap *first_rhs = h_first_seq(k, g, rhs->items); + stringset_extend(g, ret, k, first_rhs, h_follow_, (HCFChoice **)&A); + + // make sure there are only strings of length _exactly_ k + remove_all_shorter(k, ret); + + return ret; +} + // add the set { a b | a <- as, b <- f_l(S), l=k-|a| } to ret static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, size_t k, const HCFStringMap *as, @@ -806,7 +839,7 @@ pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, return first; } -void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent) +void h_pprint_stringset(FILE *file, const HCFStringMap *set, int indent) { int j; for(j=0; j rhs". + * Always returns a newly-allocated HCFStringMap. + */ +HCFStringMap *h_predict(size_t k, HCFGrammar *g, + const HCFChoice *A, const HCFSequence *rhs); + /* Pretty-printers for grammars and associated data. 
*/ void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent); void h_pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq); void h_pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x); void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent); -void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent); +void h_pprint_stringset(FILE *file, const HCFStringMap *set, int indent); From 8da48913b1c7bcc40e6085394dc39d1efdcc5eef Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 24 May 2013 01:49:39 +0200 Subject: [PATCH 12/95] don't extend table entries that are already unambiguous --- src/backends/llk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backends/llk.c b/src/backends/llk.c index 3f3008e..176541a 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -111,6 +111,12 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * combine_entries(workset, dst->epsilon_branch, src->epsilon_branch); else dst->epsilon_branch = src->epsilon_branch; + } else { + // if there is a non-conflicting value on the left (dst) side, it means + // that prediction is already unambiguous. we can drop the right (src) + // side we were going to extend with. + if(dst->epsilon_branch && dst->epsilon_branch != CONFLICT) + return; } if(src->end_branch) { From 81e0ffed1d7ec2c6ae083f5c484b3d27a532d8cd Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 24 May 2013 04:31:33 +0200 Subject: [PATCH 13/95] remove an erroneous assert that snuck into reshape_optional --- src/parsers/optional.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/parsers/optional.c b/src/parsers/optional.c index 9ba2f19..c60600d 100644 --- a/src/parsers/optional.c +++ b/src/parsers/optional.c @@ -25,7 +25,6 @@ static bool opt_isValidCF(void *env) { static const HParsedToken* reshape_optional(const HParseResult *p) { assert(p->ast); assert(p->ast->token_type == TT_SEQUENCE); - assert(p->ast->seq->used > 0); HParsedToken *res = p->ast->seq->elements[0]; if(res) From c6d3bc7fd55ee377f223df7dbff50af2bdea492a Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 24 May 2013 04:32:14 +0200 Subject: [PATCH 14/95] let h_stringmap_get_char return the proper type --- src/cfgrammar.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 58de76e..d2270ff 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -50,7 +50,7 @@ void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end); bool h_stringmap_present_epsilon(const HCFStringMap *m); -static inline void *h_stringmap_get_char(const HCFStringMap *m, const uint8_t c) +static inline HCFStringMap *h_stringmap_get_char(const HCFStringMap *m, const uint8_t c) { return h_hashtable_get(m->char_branches, (void *)char_key(c)); } From bd9f9293c133074bad6e8eb8ce7e72b5c7e198e4 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 24 May 2013 04:33:27 +0200 Subject: [PATCH 15/95] why does this (test_llk) segfault!? driver goes into a loop. 
--- src/backends/llk.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 176541a..27258e6 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -198,9 +198,14 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, h_pprint_symbol(stdout, g, A); printf(") = "); h_pprint_stringset(stdout, row, 0); + if(h_stringmap_get(row, (uint8_t *)"a", 1, false)) { + printf(" a -> "); + h_pprint_sequence(stdout, g, h_stringmap_get(row, (uint8_t *)"a", 1, false)); + printf("\n"); + } // switch to the updated workset - h_hashtable_free(workset); + h_hashset_free(workset); workset = nextset; // if the workset is empty, row is without conflict; we're done @@ -452,9 +457,9 @@ int test_llk(void) */ HParser *X = h_optional(h_ch('x')); - HParser *Y = h_sequence(h_ch('y'), NULL); - HParser *A = h_sequence(X, Y, h_ch('a'), NULL); - HParser *B = h_sequence(Y, h_ch('b'), NULL); + //HParser *Y = h_epsilon_p(); //h_sequence(h_ch('y'), NULL); + HParser *A = h_sequence(X, h_ch('a'), NULL); + HParser *B = h_sequence(h_ch('b'), NULL); HParser *p = h_choice(A, B, NULL); HCFGrammar *g = h_cfgrammar(&system_allocator, p); @@ -477,7 +482,7 @@ int test_llk(void) return 2; } - HParseResult *res = h_parse(p, (uint8_t *)"xa", 2); + HParseResult *res = h_parse(p, (uint8_t *)"ab", 2); if(res) h_pprint(stdout, res->ast, 0, 2); else From e4984fe60cfed6aa57f3853ebb5dce0a7131895d Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 24 May 2013 14:08:13 +0200 Subject: [PATCH 16/95] add an assertion that catches the infinite loop --- src/backends/llk.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backends/llk.c b/src/backends/llk.c index 27258e6..11ff9ad 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -343,6 +343,9 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* if(p == NULL) goto no_parse; + // an infinite loop case that shouldn't happen + assert(!p->items[0] || p->items[0] != x); + // push production's rhs onto the stack (in reverse order) HCFChoice **s; for(s = p->items; *s; s++); From 3047fd223b9350f6d8cb77a2491eb20c10f1c5cf Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 24 May 2013 22:29:33 +0200 Subject: [PATCH 17/95] let h_hashtable_merge's combine function decide what to do on NULL dst values --- src/datastructures.c | 12 ++++-------- src/internal.h | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/datastructures.c b/src/datastructures.c index 0781040..0581591 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -200,7 +200,7 @@ void h_hashtable_update(HHashTable *dst, const HHashTable *src) { } } -void h_hashtable_merge(void *(*combine)(void *v1, void *v2), +void h_hashtable_merge(void *(*combine)(void *v1, const void *v2), HHashTable *dst, const HHashTable *src) { size_t i; HHashTableEntry *hte; @@ -208,13 +208,9 @@ void h_hashtable_merge(void *(*combine)(void *v1, void *v2), for(hte = &src->contents[i]; hte; hte = hte->next) { if(hte->key == NULL) continue; - void *oldvalue = h_hashtable_get(dst, hte->key); - void *newvalue; - if(oldvalue) - newvalue = combine(oldvalue, hte->value); - else - newvalue = hte->value; - h_hashtable_put(dst, hte->key, newvalue); + void *dstvalue = h_hashtable_get(dst, hte->key); + void *srcvalue = hte->value; + h_hashtable_put(dst, hte->key, combine(dstvalue, srcvalue)); } } } diff --git a/src/internal.h b/src/internal.h index 
889e5c3..7655afa 100644 --- a/src/internal.h +++ b/src/internal.h @@ -255,7 +255,7 @@ HHashTable* h_hashtable_new(HArena *arena, HEqualFunc equalFunc, HHashFunc hashF void* h_hashtable_get(const HHashTable* ht, const void* key); void h_hashtable_put(HHashTable* ht, const void* key, void* value); void h_hashtable_update(HHashTable* dst, const HHashTable *src); -void h_hashtable_merge(void *(*combine)(void *v1, void *v2), +void h_hashtable_merge(void *(*combine)(void *v1, const void *v2), HHashTable *dst, const HHashTable *src); int h_hashtable_present(const HHashTable* ht, const void* key); void h_hashtable_del(HHashTable* ht, const void* key); From de4b21757e47e1fd6524da4ed5f8996bbbec233e Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 24 May 2013 22:38:14 +0200 Subject: [PATCH 18/95] make sure h_string_update copies its subtrees --- src/cfgrammar.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index d774dd0..e9c9d5d 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -248,12 +248,18 @@ void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v) } // helper for h_stringmap_update -static void *combine_stringmap(void *v1, void *v2) +static void *combine_stringmap(void *v1, const void *v2) { - h_stringmap_update((HCFStringMap *)v1, (HCFStringMap *)v2); - return v1; + HCFStringMap *m1 = v1; + const HCFStringMap *m2 = v2; + if(!m1) + m1 = h_stringmap_new(m2->arena); + h_stringmap_update(m1, m2); + + return m1; } +/* Note: Does *not* reuse submaps from n in building m. */ void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n) { if(n->epsilon_branch) From cbd50ec4c3e77143b5acea198ac4987d87f2f63d Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 24 May 2013 22:39:32 +0200 Subject: [PATCH 19/95] temp commit some debugging stuff for posterity --- src/backends/llk.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/backends/llk.c b/src/backends/llk.c index 11ff9ad..090e650 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -484,6 +484,21 @@ int test_llk(void) fprintf(stderr, "does not compile\n"); return 2; } + HLLkTable *table = p->backend_data; + printf("table(C,a) = "); + HCFStringMap *row = h_hashtable_get(table->rows, X->desugared); + assert(row); + HCFSequence *rhs = h_stringmap_get(row, (uint8_t*)"a", 1, false); + assert(rhs); + h_pprint_sequence(stdout, g, rhs); + printf(" (row %p, rhs %p)\n", row, rhs); + printf("table(D,a) = "); + row = h_hashtable_get(table->rows, rhs->items[0]); + assert(row); + rhs = h_stringmap_get(row, (uint8_t*)"a", 1, false); + assert(rhs); + h_pprint_sequence(stdout, g, rhs); + printf(" (row %p, rhs %p)\n", row, rhs); HParseResult *res = h_parse(p, (uint8_t *)"ab", 2); if(res) From f55ec2f1ed117a557f69f893c4a9407ca6f84cff Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 24 May 2013 22:40:00 +0200 Subject: [PATCH 20/95] comment wording --- src/backends/llk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 090e650..df781c3 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -148,7 +148,7 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * } } -/* Generate entries for the production "A" in the given table row. */ +/* Generate entries for the productions of A in the given table row. */ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, const HCFChoice *A) { From 48afbcb0783c1b791051acb561aadebd80d26f44 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 24 May 2013 22:48:15 +0200 Subject: [PATCH 21/95] justify a const cast --- src/cfgrammar.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index e9c9d5d..0ce3cbf 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -462,7 +462,8 @@ static void remove_all_shorter(size_t k, HCFStringMap *m) } // h_follow adapted to the signature of StringSetFun -static inline const HCFStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) +static inline +const HCFStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) { return h_follow(k, g, *s); } @@ -538,6 +539,10 @@ HCFStringMap *h_predict(size_t k, HCFGrammar *g, // { ab | a <- first_k(rhs), b <- follow_k(A), |ab|=k } const HCFStringMap *first_rhs = h_first_seq(k, g, rhs->items); + + // casting the const off of A below. note: stringset_extend does + // not touch this argument, only passes it through to h_follow + // in this case, which accepts it, once again, as const. stringset_extend(g, ret, k, first_rhs, h_follow_, (HCFChoice **)&A); // make sure there are only strings of length _exactly_ k From 7ce2194ff1e37633d96dee0cbd6579d54eb8bec3 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 24 May 2013 23:00:00 +0200 Subject: [PATCH 22/95] upgrade dummy example to require LL(3) --- src/backends/llk.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index df781c3..7d3d0a4 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -460,9 +460,9 @@ int test_llk(void) */ HParser *X = h_optional(h_ch('x')); - //HParser *Y = h_epsilon_p(); //h_sequence(h_ch('y'), NULL); - HParser *A = h_sequence(X, h_ch('a'), NULL); - HParser *B = h_sequence(h_ch('b'), NULL); + HParser *Y = h_sequence(h_ch('y'), h_ch('y'), NULL); + HParser *A = h_sequence(X, Y, h_ch('a'), NULL); + HParser *B = h_sequence(Y, h_ch('b'), NULL); HParser *p = h_choice(A, B, NULL); HCFGrammar *g = h_cfgrammar(&system_allocator, p); @@ -480,27 +480,12 @@ int test_llk(void) //printf("follow(C) = "); //h_pprint_stringset(stdout, h_follow(3, g, h_desugar(&system_allocator, c)), 0); - if(h_compile(p, PB_LLk, (void *)2)) { + if(h_compile(p, PB_LLk, (void *)3)) { fprintf(stderr, "does not compile\n"); return 2; } - HLLkTable *table = p->backend_data; - printf("table(C,a) = "); - HCFStringMap *row = h_hashtable_get(table->rows, X->desugared); - assert(row); - HCFSequence *rhs = h_stringmap_get(row, (uint8_t*)"a", 1, false); - assert(rhs); - h_pprint_sequence(stdout, g, rhs); - printf(" (row %p, rhs %p)\n", row, rhs); - printf("table(D,a) = "); - row = h_hashtable_get(table->rows, rhs->items[0]); - assert(row); - rhs = h_stringmap_get(row, (uint8_t*)"a", 1, false); - assert(rhs); - h_pprint_sequence(stdout, g, rhs); - printf(" (row %p, rhs %p)\n", row, rhs); - HParseResult *res = h_parse(p, (uint8_t *)"ab", 2); + HParseResult *res = h_parse(p, (uint8_t *)"xyya", 4); if(res) h_pprint(stdout, res->ast, 0, 2); else From 0dde8ea4eeec2f14d4e599bcf7e26821937f03f4 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 24 May 2013 23:04:20 +0200 Subject: [PATCH 23/95] remove debug output from fill_table_row --- src/backends/llk.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 7d3d0a4..59ec790 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -162,8 +162,6 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, // run until workset exhausted or kmax hit size_t k; for(k=1; k<=kmax; k++) { - printf("k=%lu\n", k); // XXX debug - // allocate a fresh workset for the next round HHashSet *nextset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); @@ -182,27 +180,11 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, HCFStringMap *pred = h_predict(k, g, A, rhs); h_stringmap_replace(pred, NULL, rhs); - // XXX debug - printf("predict("); - h_pprint_sequence(stdout, g, rhs); - printf(") = "); - h_pprint_stringset(stdout, pred, 0); - // merge predict set into the row // accumulates conflicts in new workset stringmap_merge(nextset, row, pred); } } - // XXX debug - printf("row("); - h_pprint_symbol(stdout, g, A); - printf(") = "); - h_pprint_stringset(stdout, row, 0); - if(h_stringmap_get(row, (uint8_t *)"a", 1, false)) { - printf(" a -> "); - h_pprint_sequence(stdout, g, h_stringmap_get(row, (uint8_t *)"a", 1, false)); - printf("\n"); - } // switch to the updated workset h_hashset_free(workset); From 8618f9cb62702edc60d2f6de6c47606009027ec8 Mon Sep 17 00:00:00 2001 From: "Meredith L. 
Patterson" Date: Fri, 24 May 2013 20:10:21 -0700 Subject: [PATCH 24/95] remove useless desugar_unimplemented; revert incorrect changes to test_llk() --- src/backends/llk.c | 18 +++++++++--------- src/parsers/unimplemented.c | 7 +------ 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index a799cbe..aeafd6a 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -108,7 +108,7 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * if(src->epsilon_branch) { if(dst->epsilon_branch) dst->epsilon_branch = - combine_entries(workset, dst->epsilon_branch, src->epsilon_branch); + combine_entries(workset, dst->epsilon_branch, src->epsilon_branch); else dst->epsilon_branch = src->epsilon_branch; } else { @@ -122,7 +122,7 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * if(src->end_branch) { if(dst->end_branch) dst->end_branch = - combine_entries(workset, dst->end_branch, src->end_branch); + combine_entries(workset, dst->end_branch, src->end_branch); else dst->end_branch = src->end_branch; } @@ -412,10 +412,10 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* h_delete_arena(tarena); return make_result(arena, seq->elements[0]); - no_parse: - h_delete_arena(tarena); - h_delete_arena(arena); - return NULL; + no_parse: + h_delete_arena(tarena); + h_delete_arena(arena); + return NULL; } @@ -458,9 +458,9 @@ int test_llk(void) printf("derive epsilon: "); h_pprint_symbolset(stdout, g, g->geneps, 0); printf("first(A) = "); - h_pprint_stringset(stdout, g, h_first(2, g, g->start), 0); - printf("follow(C) = "); - h_pprint_stringset(stdout, g, h_follow(2, g, h_desugar(&system_allocator, NULL, c)), 0); + h_pprint_stringset(stdout, h_first(3, g, g->start), 0); + // printf("follow(C) = "); + // h_pprint_stringset(stdout, h_follow(3, g, h_desugar(&system_allocator, NULL, c)), 0); if(h_compile(p, PB_LLk, (void *)3)) { fprintf(stderr, "does 
not compile\n"); diff --git a/src/parsers/unimplemented.c b/src/parsers/unimplemented.c index 18255ac..e3f3039 100644 --- a/src/parsers/unimplemented.c +++ b/src/parsers/unimplemented.c @@ -12,16 +12,11 @@ static HParseResult* parse_unimplemented(void* env, HParseState *state) { return &result; } -static HCFChoice* desugar_unimplemented(HAllocator *mm__, HCFStack *stk__, void *env) { - assert_message(0, "'h_unimplemented' is not context-free, can't be desugared"); - return NULL; -} - static const HParserVtable unimplemented_vt = { .parse = parse_unimplemented, .isValidRegular = h_false, .isValidCF = h_false, - .desugar = desugar_unimplemented, + .desugar = NULL, .compile_to_rvm = h_not_regular, }; From e0207b8d5fab6ec3cbb73b92eb14221db66f0a06 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Sat, 25 May 2013 14:49:59 +0200 Subject: [PATCH 25/95] properly initialize NULL fields in h_cfgrammar_new and h_stringmap_new --- src/cfgrammar.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index a691230..3d5e558 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -18,6 +18,7 @@ HCFGrammar *h_cfgrammar_new(HAllocator *mm__) g->mm__ = mm__; g->arena = h_new_arena(mm__, 0); // default blocksize g->nts = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + g->start = NULL; g->geneps = NULL; g->first = NULL; g->follow = NULL; @@ -222,6 +223,8 @@ static void collect_geneps(HCFGrammar *g) HCFStringMap *h_stringmap_new(HArena *a) { HCFStringMap *m = h_arena_malloc(a, sizeof(HCFStringMap)); + m->epsilon_branch = NULL; + m->end_branch = NULL; m->char_branches = h_hashtable_new(a, h_eq_ptr, h_hash_ptr); m->arena = a; return m; From dba9d41edaf0b0a99f530007e2cd9532b09648f7 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Tue, 4 Jun 2013 21:47:09 +0200 Subject: [PATCH 26/95] rename HCFStringMap to HStringMap --- src/backends/llk.c | 18 +++++----- src/cfgrammar.c | 88 +++++++++++++++++++++++----------------------- src/cfgrammar.h | 42 +++++++++++----------- 3 files changed, 74 insertions(+), 74 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index aeafd6a..79ab8f4 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -8,7 +8,7 @@ static const size_t DEFAULT_KMAX = 1; /* Generating the LL(k) parse table */ -/* Maps each nonterminal (HCFChoice) of the grammar to a HCFStringMap that +/* Maps each nonterminal (HCFChoice) of the grammar to a HStringMap that * maps lookahead strings to productions (HCFSequence). */ typedef struct HLLkTable_ { @@ -23,13 +23,13 @@ typedef struct HLLkTable_ { const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, HInputStream lookahead) { - const HCFStringMap *row = h_hashtable_get(table->rows, x); + const HStringMap *row = h_hashtable_get(table->rows, x); assert(row != NULL); // the table should have one row for each nonterminal assert(!row->epsilon_branch); // would match without looking at the input // XXX cases where this could be useful? - const HCFStringMap *m = row; + const HStringMap *m = row; while(m) { if(m->epsilon_branch) { // input matched // assert: another lookahead would not bring a more specific match. @@ -103,7 +103,7 @@ static void *combine_entries(HHashSet *workset, void *dst, const void *src) // add the mappings of src to dst, marking conflicts and adding the conflicting // values to workset. // note: reuses parts of src to build dst! 
-static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap *src) +static void stringmap_merge(HHashSet *workset, HStringMap *dst, HStringMap *src) { if(src->epsilon_branch) { if(dst->epsilon_branch) @@ -135,10 +135,10 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * continue; HCharKey c = (HCharKey)hte->key; - HCFStringMap *src_ = hte->value; + HStringMap *src_ = hte->value; if(src_) { - HCFStringMap *dst_ = h_hashtable_get(dst->char_branches, (void *)c); + HStringMap *dst_ = h_hashtable_get(dst->char_branches, (void *)c); if(dst_) stringmap_merge(workset, dst_, src_); else @@ -149,7 +149,7 @@ static void stringmap_merge(HHashSet *workset, HCFStringMap *dst, HCFStringMap * } /* Generate entries for the productions of A in the given table row. */ -static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, +static int fill_table_row(size_t kmax, HCFGrammar *g, HStringMap *row, const HCFChoice *A) { HHashSet *workset; @@ -177,7 +177,7 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HCFStringMap *row, assert(rhs != CONFLICT); // just to be sure there's no mixup // calculate predict set; let values map to rhs - HCFStringMap *pred = h_predict(k, g, A, rhs); + HStringMap *pred = h_predict(k, g, A, rhs); h_stringmap_replace(pred, NULL, rhs); // merge predict set into the row @@ -220,7 +220,7 @@ static int fill_table(size_t kmax, HCFGrammar *g, HLLkTable *table) assert(a->type == HCF_CHOICE); // create table row for this nonterminal - HCFStringMap *row = h_stringmap_new(table->arena); + HStringMap *row = h_stringmap_new(table->arena); h_hashtable_put(table->rows, a, row); if(fill_table_row(kmax, g, row, a) < 0) { diff --git a/src/cfgrammar.c b/src/cfgrammar.c index a691230..32cc9d6 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -23,7 +23,7 @@ HCFGrammar *h_cfgrammar_new(HAllocator *mm__) g->follow = NULL; g->kmax = 0; // will be increased as needed by ensure_k - HCFStringMap *eps = 
h_stringmap_new(g->arena); + HStringMap *eps = h_stringmap_new(g->arena); h_stringmap_put_epsilon(eps, INSET); g->singleton_epsilon = eps; @@ -219,32 +219,32 @@ static void collect_geneps(HCFGrammar *g) } -HCFStringMap *h_stringmap_new(HArena *a) +HStringMap *h_stringmap_new(HArena *a) { - HCFStringMap *m = h_arena_malloc(a, sizeof(HCFStringMap)); + HStringMap *m = h_arena_malloc(a, sizeof(HStringMap)); m->char_branches = h_hashtable_new(a, h_eq_ptr, h_hash_ptr); m->arena = a; return m; } -void h_stringmap_put_end(HCFStringMap *m, void *v) +void h_stringmap_put_end(HStringMap *m, void *v) { m->end_branch = v; } -void h_stringmap_put_epsilon(HCFStringMap *m, void *v) +void h_stringmap_put_epsilon(HStringMap *m, void *v) { m->epsilon_branch = v; } -void h_stringmap_put_after(HCFStringMap *m, uint8_t c, HCFStringMap *ends) +void h_stringmap_put_after(HStringMap *m, uint8_t c, HStringMap *ends) { h_hashtable_put(m->char_branches, (void *)char_key(c), ends); } -void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v) +void h_stringmap_put_char(HStringMap *m, uint8_t c, void *v) { - HCFStringMap *node = h_stringmap_new(m->arena); + HStringMap *node = h_stringmap_new(m->arena); h_stringmap_put_epsilon(node, v); h_stringmap_put_after(m, c, node); } @@ -252,8 +252,8 @@ void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v) // helper for h_stringmap_update static void *combine_stringmap(void *v1, const void *v2) { - HCFStringMap *m1 = v1; - const HCFStringMap *m2 = v2; + HStringMap *m1 = v1; + const HStringMap *m2 = v2; if(!m1) m1 = h_stringmap_new(m2->arena); h_stringmap_update(m1, m2); @@ -262,7 +262,7 @@ static void *combine_stringmap(void *v1, const void *v2) } /* Note: Does *not* reuse submaps from n in building m. 
*/ -void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n) +void h_stringmap_update(HStringMap *m, const HStringMap *n) { if(n->epsilon_branch) m->epsilon_branch = n->epsilon_branch; @@ -277,7 +277,7 @@ void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n) * If old is NULL, replace all values in m with new. * If new is NULL, remove the respective values. */ -void h_stringmap_replace(HCFStringMap *m, void *old, void *new) +void h_stringmap_replace(HStringMap *m, void *old, void *new) { if(!old) { if(m->epsilon_branch) m->epsilon_branch = new; @@ -294,14 +294,14 @@ void h_stringmap_replace(HCFStringMap *m, void *old, void *new) if(hte->key == NULL) continue; - HCFStringMap *m_ = hte->value; + HStringMap *m_ = hte->value; if(m_) h_stringmap_replace(m_, old, new); } } } -void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end) +void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool end) { for(size_t i=0; iend_branch) @@ -313,20 +313,20 @@ void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool return m->epsilon_branch; } -bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end) +bool h_stringmap_present(const HStringMap *m, const uint8_t *str, size_t n, bool end) { return (h_stringmap_get(m, str, n, end) != NULL); } -bool h_stringmap_present_epsilon(const HCFStringMap *m) +bool h_stringmap_present_epsilon(const HStringMap *m) { return (m->epsilon_branch != NULL); } -const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) +const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) { - HCFStringMap *ret; + HStringMap *ret; HCFSequence **p; uint8_t c; @@ -372,18 +372,18 @@ const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) } // helpers for h_first_seq, definitions below -static bool is_singleton_epsilon(const HCFStringMap *m); -static bool any_string_shorter(size_t k, const HCFStringMap *m); +static 
bool is_singleton_epsilon(const HStringMap *m); +static bool any_string_shorter(size_t k, const HStringMap *m); // pointer to functions like h_first_seq -typedef const HCFStringMap *(*StringSetFun)(size_t, HCFGrammar *, HCFChoice **); +typedef const HStringMap *(*StringSetFun)(size_t, HCFGrammar *, HCFChoice **); // helper for h_first_seq and h_follow -static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, - size_t k, const HCFStringMap *as, +static void stringset_extend(HCFGrammar *g, HStringMap *ret, + size_t k, const HStringMap *as, StringSetFun f, HCFChoice **tail); -const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) +const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) { // shortcut: the first set of the empty sequence, for any k, is {""} if(*s == NULL) @@ -394,7 +394,7 @@ const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) HCFChoice *x = s[0]; HCFChoice **tail = s+1; - const HCFStringMap *first_x = h_first(k, g, x); + const HStringMap *first_x = h_first(k, g, x); // shortcut: if first_k(X) = {""}, just return first_k(tail) if(is_singleton_epsilon(first_x)) @@ -405,7 +405,7 @@ const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) return first_x; // create a new result set and build up the set described above - HCFStringMap *ret = h_stringmap_new(g->arena); + HStringMap *ret = h_stringmap_new(g->arena); // extend the elements of first_k(X) up to length k from tail stringset_extend(g, ret, k, first_x, h_first_seq, tail); @@ -413,14 +413,14 @@ const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) return ret; } -static bool is_singleton_epsilon(const HCFStringMap *m) +static bool is_singleton_epsilon(const HStringMap *m) { return ( m->epsilon_branch && !m->end_branch && h_hashtable_empty(m->char_branches) ); } -static bool any_string_shorter(size_t k, const HCFStringMap *m) +static bool any_string_shorter(size_t k, const HStringMap *m) { if(k==0) return false; @@ 
-434,7 +434,7 @@ static bool any_string_shorter(size_t k, const HCFStringMap *m) for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { if(hte->key == NULL) continue; - HCFStringMap *m_ = hte->value; + HStringMap *m_ = hte->value; // check subtree for strings shorter than k-1 if(any_string_shorter(k-1, m_)) @@ -446,7 +446,7 @@ static bool any_string_shorter(size_t k, const HCFStringMap *m) } // helper for h_predict -static void remove_all_shorter(size_t k, HCFStringMap *m) +static void remove_all_shorter(size_t k, HStringMap *m) { if(k==0) return; m->epsilon_branch = NULL; @@ -465,12 +465,12 @@ static void remove_all_shorter(size_t k, HCFStringMap *m) // h_follow adapted to the signature of StringSetFun static inline -const HCFStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) +const HStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) { return h_follow(k, g, *s); } -const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) +const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) { // consider all occurances of X in g // the follow set of X is the union of: @@ -481,7 +481,7 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) // first_k(tail follow_k(A)) = // { a b | a <- first_k(tail), b <- follow_l(A), l=k-|a| } - HCFStringMap *ret; + HStringMap *ret; // shortcut: follow_0(X) is always {""} if(k==0) @@ -519,7 +519,7 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) if(*s == x) { // occurance found HCFChoice **tail = s+1; - const HCFStringMap *first_tail = h_first_seq(k, g, tail); + const HStringMap *first_tail = h_first_seq(k, g, tail); // extend the elems of first_k(tail) up to length k from follow(A) stringset_extend(g, ret, k, first_tail, h_follow_, &a); @@ -532,15 +532,15 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) return ret; } -HCFStringMap *h_predict(size_t k, HCFGrammar *g, +HStringMap *h_predict(size_t k, 
HCFGrammar *g, const HCFChoice *A, const HCFSequence *rhs) { - HCFStringMap *ret = h_stringmap_new(g->arena); + HStringMap *ret = h_stringmap_new(g->arena); // predict_k(A -> rhs) = // { ab | a <- first_k(rhs), b <- follow_k(A), |ab|=k } - const HCFStringMap *first_rhs = h_first_seq(k, g, rhs->items); + const HStringMap *first_rhs = h_first_seq(k, g, rhs->items); // casting the const off of A below. note: stringset_extend does // not touch this argument, only passes it through to h_follow @@ -554,8 +554,8 @@ HCFStringMap *h_predict(size_t k, HCFGrammar *g, } // add the set { a b | a <- as, b <- f_l(S), l=k-|a| } to ret -static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, - size_t k, const HCFStringMap *as, +static void stringset_extend(HCFGrammar *g, HStringMap *ret, + size_t k, const HStringMap *as, StringSetFun f, HCFChoice **tail) { if(as->epsilon_branch) { @@ -578,12 +578,12 @@ static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, uint8_t c = key_char((HCharKey)hte->key); // follow the branch to find the set { a' | t a' <- as } - HCFStringMap *as_ = (HCFStringMap *)hte->value; + HStringMap *as_ = (HStringMap *)hte->value; // now the elements of ret that begin with t are given by // t { a b | a <- as_, b <- f_l(tail), l=k-|a|-1 } // so we can use recursion over k - HCFStringMap *ret_ = h_stringmap_new(g->arena); + HStringMap *ret_ = h_stringmap_new(g->arena); h_stringmap_put_after(ret, c, ret_); stringset_extend(g, ret_, k-1, as_, f, tail); @@ -800,7 +800,7 @@ void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, in static bool pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, - const HCFStringMap *set) + const HStringMap *set) { assert(n < BUFSIZE-4); @@ -827,7 +827,7 @@ pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, if(hte->key == NULL) continue; uint8_t c = key_char((HCharKey)hte->key); - HCFStringMap *ends = hte->value; + HStringMap *ends = hte->value; size_t n_ = n; 
switch(c) { @@ -852,7 +852,7 @@ pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, return first; } -void h_pprint_stringset(FILE *file, const HCFStringMap *set, int indent) +void h_pprint_stringset(FILE *file, const HStringMap *set, int indent) { int j; for(j=0; jchar_branches, (void *)char_key(c)); } @@ -72,18 +72,18 @@ bool h_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol); bool h_derives_epsilon_seq(HCFGrammar *g, HCFChoice **s); /* Compute first_k set of symbol x. Memoized. */ -const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x); +const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x); /* Compute first_k set of sentential form s. s NULL-terminated. */ -const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s); +const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s); /* Compute follow_k set of symbol x. Memoized. */ -const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); +const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); /* Compute the predict_k set of production "A -> rhs". - * Always returns a newly-allocated HCFStringMap. + * Always returns a newly-allocated HStringMap. */ -HCFStringMap *h_predict(size_t k, HCFGrammar *g, +HStringMap *h_predict(size_t k, HCFGrammar *g, const HCFChoice *A, const HCFSequence *rhs); @@ -92,4 +92,4 @@ void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent); void h_pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq); void h_pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x); void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent); -void h_pprint_stringset(FILE *file, const HCFStringMap *set, int indent); +void h_pprint_stringset(FILE *file, const HStringMap *set, int indent); From 3be83c7fa2a9232fad3b1ea738a001df15a6d5b5 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Tue, 4 Jun 2013 21:59:39 +0200 Subject: [PATCH 27/95] set index and offset of result tokens in llk driver --- src/backends/llk.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 79ab8f4..4f73c46 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -339,10 +339,12 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* // the top of stack is such that there will be a result... HParsedToken *tok; // will hold result token + tok = h_arena_malloc(arena, sizeof(HParsedToken)); + tok->index = stream->index; + tok->bit_offset = stream->bit_offset; if(x == mark) { // hit stack frame boundary... // wrap the accumulated parse result, this sequence is finished - tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_SEQUENCE; tok->seq = seq; @@ -361,13 +363,13 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* case HCF_END: if(!stream->overrun) goto no_parse; + h_arena_free(arena, tok); tok = NULL; break; case HCF_CHAR: if(input != x->chr) goto no_parse; - tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_UINT; tok->uint = x->chr; break; @@ -377,7 +379,6 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* goto no_parse; if(!charset_isset(x->charset, input)) goto no_parse; - tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_UINT; tok->uint = input; break; @@ -390,8 +391,6 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* // 'tok' has been parsed; process it - // XXX set tok->index and tok->bit_offset (don't take directly from stream, cuz peek!) - // perform token reshape if indicated if(x->reshape) tok = (HParsedToken *)x->reshape(make_result(arena, tok)); From 976205f9da21acab80c7bc8ce2e0163d97084f11 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Tue, 4 Jun 2013 22:14:06 +0200 Subject: [PATCH 28/95] hook up LALR backend stub --- src/Makefile | 1 + src/backends/lalr.c | 86 +++++++++++++++++++++++++++++++++++++++++++++ src/hammer.c | 1 + src/hammer.h | 8 ++--- src/internal.h | 1 + 5 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 src/backends/lalr.c diff --git a/src/Makefile b/src/Makefile index 7fac881..1a2bff3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -28,6 +28,7 @@ PARSERS := \ BACKENDS := \ packrat \ llk \ + lalr \ regex HAMMER_PARTS := \ diff --git a/src/backends/lalr.c b/src/backends/lalr.c new file mode 100644 index 0000000..22cd389 --- /dev/null +++ b/src/backends/lalr.c @@ -0,0 +1,86 @@ +#include +#include "../internal.h" +#include "../cfgrammar.h" +#include "../parsers/parser_internal.h" + + + +void h_lalr_free(HParser *parser) +{ + // XXX free data structures + parser->backend_data = NULL; + parser->backend = PB_PACKRAT; +} + + +/* LALR table generation */ + +int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) +{ + return -1; +} + + +/* LR driver */ + +HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) +{ + return NULL; +} + + + + +HParserBackendVTable h__lalr_backend_vtable = { + .compile = h_lalr_compile, + .parse = h_lr_parse, + .free = h_lalr_free +}; + + + + +// dummy! 
+int test_lalr(void) +{ + /* for k=2: + + S -> A | B + A -> X Y a + B -> Y b + X -> x | '' + Y -> y -- for k=3 use "yy" + */ + + // XXX make LALR example + HParser *X = h_optional(h_ch('x')); + HParser *Y = h_sequence(h_ch('y'), h_ch('y'), NULL); + HParser *A = h_sequence(X, Y, h_ch('a'), NULL); + HParser *B = h_sequence(Y, h_ch('b'), NULL); + HParser *p = h_choice(A, B, NULL); + + HCFGrammar *g = h_cfgrammar(&system_allocator, p); + + if(g == NULL) { + fprintf(stderr, "h_cfgrammar failed\n"); + return 1; + } + + h_pprint_grammar(stdout, g, 0); + // print states of the LR(0) automaton + // print LALR(1) table + + if(h_compile(p, PB_LALR, NULL)) { + fprintf(stderr, "does not compile\n"); + return 2; + } + + + HParseResult *res = h_parse(p, (uint8_t *)"xyya", 4); + if(res) + h_pprint(stdout, res->ast, 0, 2); + else + printf("no parse\n"); + + return 0; +} diff --git a/src/hammer.c b/src/hammer.c index 5f94142..7d5b4e9 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -30,6 +30,7 @@ static HParserBackendVTable *backends[PB_MAX + 1] = { &h__packrat_backend_vtable, &h__regex_backend_vtable, &h__llk_backend_vtable, + &h__lalr_backend_vtable, }; diff --git a/src/hammer.h b/src/hammer.h index 455684c..a5ebcff 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -34,11 +34,11 @@ typedef struct HParseState_ HParseState; typedef enum HParserBackend_ { PB_MIN = 0, PB_PACKRAT = PB_MIN, // PB_MIN is always the default. 
- PB_REGULAR, // - PB_LLk, // - PB_LALR, // Not Implemented + PB_REGULAR, + PB_LLk, + PB_LALR, PB_GLR, // Not Implemented - PB_MAX = PB_LLk + PB_MAX = PB_LALR } HParserBackend; typedef enum HTokenType_ { diff --git a/src/internal.h b/src/internal.h index 926bf02..01861f5 100644 --- a/src/internal.h +++ b/src/internal.h @@ -219,6 +219,7 @@ struct HBitWriter_ { // Backends {{{ extern HParserBackendVTable h__packrat_backend_vtable; extern HParserBackendVTable h__llk_backend_vtable; +extern HParserBackendVTable h__lalr_backend_vtable; // }}} // TODO(thequux): Set symbol visibility for these functions so that they aren't exported. From 636f741d88ac179e74d172e81c1376507a382218 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 5 Jun 2013 15:12:48 +0200 Subject: [PATCH 29/95] add h_hashset_equal (set comparison) --- src/datastructures.c | 52 ++++++++++++++++++++++++++++++++++++++++++++ src/internal.h | 1 + 2 files changed, 53 insertions(+) diff --git a/src/datastructures.c b/src/datastructures.c index a12707e..bd9b4eb 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -232,6 +232,7 @@ int h_hashtable_present(const HHashTable* ht, const void* key) { } return false; } + void h_hashtable_del(HHashTable* ht, const void* key) { HHashValue hashval = ht->hashFunc(key); #ifdef CONSISTENCY_CHECK @@ -257,6 +258,7 @@ void h_hashtable_del(HHashTable* ht, const void* key) { } } } + void h_hashtable_free(HHashTable* ht) { for (size_t i = 0; i < ht->capacity; i++) { HHashTableEntry *hten, *hte = &ht->contents[i]; @@ -272,6 +274,56 @@ void h_hashtable_free(HHashTable* ht) { h_arena_free(ht->arena, ht->contents); } +// helper for hte_equal +static bool hte_same_length(HHashTableEntry *xs, HHashTableEntry *ys) { + for(; xs && ys; xs=xs->next, ys=ys->next) { + // skip NULL keys (= element not present) + if(xs->key == NULL) xs=xs->next; + if(ys->key == NULL) ys=ys->next; + } + return (xs == ys); // both NULL +} + +// helper for hte_equal: are all elements of xs present 
in ys? +static bool hte_subset(HEqualFunc eq, HHashTableEntry *xs, HHashTableEntry *ys) +{ + for(; xs; xs=xs->next) { + if(xs->key == NULL) continue; // element not present + + HHashTableEntry *hte; + for(hte=ys; hte; hte=hte->next) { + if(hte->key == xs->key) break; // assume an element is equal to itself + if(hte->hashval != xs->hashval) continue; // shortcut + if(eq(hte->key, xs->key)) break; + } + if(hte == NULL) return false; // element not found + } + return true; // all found +} + +// compare two lists of HHashTableEntries +static inline bool hte_equal(HEqualFunc eq, HHashTableEntry *xs, HHashTableEntry *ys) { + return (hte_same_length(xs, ys) && hte_subset(eq, xs, ys)); +} + +/* Set equality of HHashSets. + * Obviously, 'a' and 'b' must use the same equality function. + * Not strictly necessary, but we also assume the same hash function. + */ +bool h_hashset_equal(const HHashSet *a, const HHashSet *b) { + if(a->capacity == b->capacity) { + // iterate over the buckets in parallel + for(size_t i=0; i < a->capacity; i++) { + if(!hte_equal(a->equalFunc, &a->contents[i], &b->contents[i])) + return false; + } + } else { + assert_message(0, "h_hashset_equal called on sets of different capacity"); + // TODO implement general case + } + return true; +} + bool h_eq_ptr(const void *p, const void *q) { return (p==q); } diff --git a/src/internal.h b/src/internal.h index 01861f5..1183682 100644 --- a/src/internal.h +++ b/src/internal.h @@ -272,6 +272,7 @@ typedef HHashTable HHashSet; #define h_hashset_empty(ht) h_hashtable_empty(ht) #define h_hashset_del(ht,el) h_hashtable_del(ht,el) #define h_hashset_free(ht) h_hashtable_free(ht) +bool h_hashset_equal(const HHashSet *a, const HHashSet *b); bool h_eq_ptr(const void *p, const void *q); HHashValue h_hash_ptr(const void *p); From b959bcb5c79a2a82f2b950048bbbd9ab745c63e5 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Thu, 6 Jun 2013 13:01:39 +0200 Subject: [PATCH 30/95] fix an array index --- src/test_suite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test_suite.h b/src/test_suite.h index 168ab64..fc008e7 100644 --- a/src/test_suite.h +++ b/src/test_suite.h @@ -153,7 +153,7 @@ } while(0) #define g_check_stringmap_absent(table, key) do { \ - bool end = (key[strlen(key)-2] == '$'); \ + bool end = (key[strlen(key)-1] == '$'); \ if(h_stringmap_present(table, (uint8_t *)key, strlen(key), end)) { \ g_test_message("Check failed: \"%s\" shouldn't have been in map, but was", key); \ g_test_fail(); \ From e7a388d1c7adf034feddc5ba10d6d8b0c70e5f60 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 6 Jun 2013 13:01:54 +0200 Subject: [PATCH 31/95] move djbhash into general availability as h_djbhash --- src/backends/packrat.c | 10 +--------- src/datastructures.c | 11 ++++++++++- src/internal.h | 1 + 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/backends/packrat.c b/src/backends/packrat.c index c5c9565..8aa1f8e 100644 --- a/src/backends/packrat.c +++ b/src/backends/packrat.c @@ -3,14 +3,6 @@ #include "../internal.h" #include "../parsers/parser_internal.h" -static uint32_t djbhash(const uint8_t *buf, size_t len) { - uint32_t hash = 5381; - while (len--) { - hash = hash * 33 + *buf++; - } - return hash; -} - // short-hand for constructing HCachedResult's static HCachedResult *cached_result(const HParseState *state, HParseResult *result) { HCachedResult *ret = a_new(HCachedResult, 1); @@ -214,7 +206,7 @@ void h_packrat_free(HParser *parser) { } static uint32_t cache_key_hash(const void* key) { - return djbhash(key, sizeof(HParserCacheKey)); + return h_djbhash(key, sizeof(HParserCacheKey)); } static bool cache_key_equal(const void* key1, const void* key2) { return memcmp(key1, key2, sizeof(HParserCacheKey)) == 0; diff --git a/src/datastructures.c b/src/datastructures.c index bd9b4eb..730c6b9 100644 --- 
a/src/datastructures.c +++ b/src/datastructures.c @@ -329,6 +329,15 @@ bool h_eq_ptr(const void *p, const void *q) { } HHashValue h_hash_ptr(const void *p) { - // XXX just djbhash it + // XXX just djbhash it? it does make the benchmark ~7% slower. + //return h_djbhash((const uint8_t *)&p, sizeof(void *)); return (uintptr_t)p >> 4; } + +uint32_t h_djbhash(const uint8_t *buf, size_t len) { + uint32_t hash = 5381; + while (len--) { + hash = hash * 33 + *buf++; + } + return hash; +} diff --git a/src/internal.h b/src/internal.h index 1183682..2f3018d 100644 --- a/src/internal.h +++ b/src/internal.h @@ -276,6 +276,7 @@ bool h_hashset_equal(const HHashSet *a, const HHashSet *b); bool h_eq_ptr(const void *p, const void *q); HHashValue h_hash_ptr(const void *p); +uint32_t h_djbhash(const uint8_t *buf, size_t len); typedef struct HCFSequence_ HCFSequence; From 167e187151cae6c819914e3d6514f2c5288d2579 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 6 Jun 2013 13:05:48 +0200 Subject: [PATCH 32/95] scratch commit of LALR preparations (breaks compile) --- src/backends/lalr.c | 126 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 119 insertions(+), 7 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 22cd389..b7b127d 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -4,6 +4,125 @@ #include "../parsers/parser_internal.h" +// PLAN: +// data structures: +// - LR table is an array of hashtables that map grammar symbols (HCFChoice) +// to LRActions. 
+ +// build LR(0) DFA +// extend with lookahead information by either: +// - reworking algorithm to propagate lookahead ("simple LALR generation") +// - follow sets of enhanced grammar ("conversion to SLR") + + +/* Constructing the characteristic automaton (handle recognizer) */ + +// - DFA is a hashset containing states (mapped to numbers) +// - states are hashsets containing LRItems +// - LRItems contain an optional lookahead set (HStringMap) +// - states (hashsets) get hash and comparison functions that ignore the lookahead + +typedef struct HLRDFA_ { + HHashSet *states; + HSlist *transitions; +} HLRDFA; + +typedef struct HLRTransition_ { + HLRState *from; + HCFChoice *symbol; + HLRState *to; +} HLRTransition; + +typedef struct HLRItem_ { + HCFChoice *lhs; + HCFChoice **rhs; + size_t len; // number of elements in rhs + size_t mark; + HStringMap *lookahead; // optional +} HLRItem; + +// compare LALR items - ignores lookahead +static bool eq_lalr_item(const void *p, const void *q) +{ + const HLRItem *a=p, *b=q; + + if(a->lhs != b->lhs) return false; + if(a->mark != b->mark) return false; + if(a->len != b->len) return false; + + for(size_t i=0; ilen; i++) + if(a->rhs[i] != b->rhs[i]) return false; + + return true; +} + +// compare LALR item sets (DFA states) +static inline bool eq_lalr_itemset(const void *p, const void *q) +{ + return h_hashset_equal(p, q); +} + +// hash LALR items +static inline HHashValue hash_lalr_item(const HLRItem *x) +{ + return (h_hash_ptr(x->lhs) + + h_djbhash((uint8_t *)x->rhs, x->len*sizeof(HCFChoice *)) + + x->mark); // XXX is it okay to just add mark? 
+} + +// hash LALR item sets (DFA states) - hash the elements and sum +static HHashValue hash_lalr_itemset(const void *p) +{ + HHashValue hash = 0; + + const HHashTable *ht = p; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + hash += hash_lalr_item(hte->key); + } + } + + return hash; +} + +static HHashSet *closure(const HHashSet *items); + +HLRDFA *h_lalr_dfa(HCFGrammar *g) +{ + HHashSet *states = h_hashset_new(g->arena, eq_lalr_itemset, hash_lalr_itemset); + + // make initial state (kernel) + + // while work to do (on some state) + // compute closure + // determine edge symbols + // for each edge symbol: + // advance respective items -> destination state (kernel) + // if destination is a new state: + // add it to state set + // add transition to it + // add it to the work list +} + + + +/* LALR table generation */ + +int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) +{ + // generate grammar + // construct dfa / determine lookahead + // extract table + // create an array of hashtables, one per state + // for each transition a--S-->b: + // add "shift, goto b" to table entry (a,S) + // for each state: + // add reduce entries for its accepting items + return -1; +} void h_lalr_free(HParser *parser) { @@ -13,13 +132,6 @@ void h_lalr_free(HParser *parser) } -/* LALR table generation */ - -int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) -{ - return -1; -} - /* LR driver */ From 3ad4c5107099ec6c49ee861d16865fc2ce21de3e Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 7 Jun 2013 13:46:16 +0200 Subject: [PATCH 33/95] fix hte_same_length() --- src/datastructures.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/datastructures.c b/src/datastructures.c index 730c6b9..55b8345 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -276,10 +276,12 @@ void h_hashtable_free(HHashTable* ht) { // helper for hte_equal static bool hte_same_length(HHashTableEntry *xs, HHashTableEntry *ys) { - for(; xs && ys; xs=xs->next, ys=ys->next) { + while(xs && ys) { + xs=xs->next; + ys=ys->next; // skip NULL keys (= element not present) - if(xs->key == NULL) xs=xs->next; - if(ys->key == NULL) ys=ys->next; + while(xs && xs->key == NULL) xs=xs->next; + while(ys && ys->key == NULL) ys=ys->next; } return (xs == ys); // both NULL } From 43fc07e67bf5fe485b726a3768c856930595f06b Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 7 Jun 2013 13:47:00 +0200 Subject: [PATCH 34/95] add construction of LR(0) automaton (untested) --- src/backends/lalr.c | 203 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 190 insertions(+), 13 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index b7b127d..509f76d 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -17,30 +17,48 @@ /* Constructing the characteristic automaton (handle recognizer) */ -// - DFA is a hashset containing states (mapped to numbers) // - states are hashsets containing LRItems // - LRItems contain an optional lookahead set (HStringMap) // - states (hashsets) get hash and comparison functions that ignore the lookahead +typedef HHashSet HLRState; + typedef struct HLRDFA_ { - HHashSet *states; + size_t nstates; + const HLRState **states; // array of size nstates HSlist *transitions; } HLRDFA; typedef struct HLRTransition_ { - HLRState *from; - HCFChoice *symbol; - HLRState *to; + size_t from, to; // indices into 'states' array + const HCFChoice *symbol; } HLRTransition; typedef struct HLRItem_ { 
HCFChoice *lhs; - HCFChoice **rhs; + HCFChoice **rhs; // NULL-terminated size_t len; // number of elements in rhs size_t mark; HStringMap *lookahead; // optional } HLRItem; +HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark) +{ + HLRItem *ret = h_arena_malloc(a, sizeof(HLRItem)); + + size_t len = 0; + for(HCFChoice **p=rhs; *p; p++) len++; + assert(mark <= len); + + ret->lhs = lhs; + ret->rhs = rhs; + ret->len = len; + ret->mark = mark; + ret->lookahead = NULL; + + return ret; +} + // compare LALR items - ignores lookahead static bool eq_lalr_item(const void *p, const void *q) { @@ -63,8 +81,9 @@ static inline bool eq_lalr_itemset(const void *p, const void *q) } // hash LALR items -static inline HHashValue hash_lalr_item(const HLRItem *x) +static inline HHashValue hash_lalr_item(const void *p) { + const HLRItem *x = p; return (h_hash_ptr(x->lhs) + h_djbhash((uint8_t *)x->rhs, x->len*sizeof(HCFChoice *)) + x->mark); // XXX is it okay to just add mark? @@ -88,13 +107,90 @@ static HHashValue hash_lalr_itemset(const void *p) return hash; } -static HHashSet *closure(const HHashSet *items); +static inline HLRState *h_lrstate_new(HArena *arena) +{ + return h_hashset_new(arena, eq_lalr_item, hash_lalr_item); +} + +static HLRItem *advance_mark(HArena *arena, const HLRItem *item) +{ + assert(item->rhs[item->mark] != NULL); + HLRItem *ret = h_arena_malloc(arena, sizeof(HLRItem)); + *ret = *item; + ret->mark++; + return ret; +} + +static HHashSet *closure(HCFGrammar *g, const HHashSet *items) +{ + HArena *arena = g->arena; + HHashSet *ret = h_lrstate_new(arena); + HSlist *work = h_slist_new(arena); + + // iterate over items - initialize work list with them + const HHashTable *ht = items; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + const HLRItem *item = hte->key; + h_hashset_put(ret, item); + h_slist_push(work, (void *)item); + } + } + + 
while(!h_slist_empty(work)) { + const HLRItem *item = h_slist_pop(work); + HCFChoice *sym = item->rhs[item->mark]; // symbol after mark + + // if there is a non-terminal after the mark, follow it + // XXX: do we have to count HCF_CHARSET as nonterminal? + if(sym != NULL && sym->type == HCF_CHOICE) { + // add items corresponding to the productions of sym + for(HCFSequence **p=sym->seq; *p; p++) { + HLRItem *it = h_lritem_new(arena, sym, (*p)->items, 0); + if(!h_hashset_present(ret, it)) { + h_hashset_put(ret, it); + h_slist_push(work, it); + } + } + + // if sym derives epsilon, also advance over it + if(h_derives_epsilon(g, sym)) { + HLRItem *it = advance_mark(arena, item); + h_hashset_put(ret, it); + h_slist_push(work, it); + } + } + } + + return ret; +} HLRDFA *h_lalr_dfa(HCFGrammar *g) { - HHashSet *states = h_hashset_new(g->arena, eq_lalr_itemset, hash_lalr_itemset); + HArena *arena = g->arena; + + HHashSet *states = h_hashset_new(arena, eq_lalr_itemset, hash_lalr_itemset); + // maps itemsets to assigned array indices + HSlist *transitions = h_slist_new(arena); + + // list of states that need to be processed + // to save lookups, we push two elements per state, the itemset and its + // assigned index. + HSlist *work = h_slist_new(arena); + + // XXX augment grammar?! 
// make initial state (kernel) + HLRState *start = h_lrstate_new(arena); + assert(g->start->type == HCF_CHOICE); + for(HCFSequence **p=g->start->seq; *p; p++) + h_hashset_put(start, h_lritem_new(arena, g->start, (*p)->items, 0)); + h_hashtable_put(states, start, 0); + h_slist_push(work, start); + h_slist_push(work, 0); // while work to do (on some state) // compute closure @@ -105,6 +201,85 @@ HLRDFA *h_lalr_dfa(HCFGrammar *g) // add it to state set // add transition to it // add it to the work list + + while(!h_slist_empty(work)) { + size_t state_idx = (uintptr_t)h_slist_pop(work); + HLRState *state = h_slist_pop(work); + + // maps edge symbols to neighbor states (item sets) of s + HHashTable *neighbors = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); + + // iterate over closure and generate neighboring sets + const HHashTable *ht = closure(g, state); + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + const HLRItem *item = hte->key; + HCFChoice *sym = item->rhs[item->mark]; // symbol after mark + + if(sym != NULL) { // mark was not at the end + // find or create prospective neighbor set + HLRState *neighbor = h_hashtable_get(neighbors, sym); + if(neighbor == NULL) { + neighbor = h_lrstate_new(arena); + h_hashtable_put(neighbors, sym, neighbor); + } + + // ...and add the advanced item to it + h_hashset_put(neighbor, advance_mark(arena, item)); + } + } + } + + // merge neighbor sets into the set of existing states + ht = neighbors; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + const HCFChoice *symbol = hte->key; + HLRState *neighbor = hte->value; + + // look up existing state, allocate new if not found + size_t neighbor_idx; + if(!h_hashset_present(states, neighbor)) { + neighbor_idx = states->used; + h_hashtable_put(states, neighbor, (void 
*)(uintptr_t)neighbor_idx); + h_slist_push(work, neighbor); + h_slist_push(work, (void *)(uintptr_t)neighbor_idx); + } + + // add transition "state --symbol--> neighbor" + HLRTransition *t = h_arena_malloc(arena, sizeof(HLRTransition)); + t->from = state_idx; + t->to = neighbor_idx; + t->symbol = symbol; + h_slist_push(transitions, t); + } + } + } // end while(work) + + // fill DFA struct + HLRDFA *dfa = h_arena_malloc(arena, sizeof(HLRDFA)); + dfa->nstates = states->used; + dfa->states = h_arena_malloc(arena, dfa->nstates*sizeof(HLRState *)); + for(size_t i=0; i < states->capacity; i++) { + for(HHashTableEntry *hte = &states->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + const HLRState *state = hte->key; + size_t idx = (uintptr_t)hte->value; + + dfa->states[idx] = state; + } + } + dfa->transitions = transitions; + + return dfa; } @@ -172,20 +347,22 @@ int test_lalr(void) HParser *p = h_choice(A, B, NULL); HCFGrammar *g = h_cfgrammar(&system_allocator, p); - if(g == NULL) { fprintf(stderr, "h_cfgrammar failed\n"); return 1; } - h_pprint_grammar(stdout, g, 0); - // print states of the LR(0) automaton - // print LALR(1) table + + HLRDFA *dfa = h_lalr_dfa(g); + if(dfa) { + // print states of the LR(0) automaton + } if(h_compile(p, PB_LALR, NULL)) { fprintf(stderr, "does not compile\n"); return 2; } + // print LALR(1) table HParseResult *res = h_parse(p, (uint8_t *)"xyya", 4); From 373a7aef9447044ba2c7339774e5cfaa21eb3cff Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 7 Jun 2013 16:29:14 +0200 Subject: [PATCH 35/95] make h_pprint_char from cfgrammar public so LR pretty printers can use it --- src/cfgrammar.c | 8 ++++---- src/cfgrammar.h | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 0c51589..bc7b358 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -595,7 +595,7 @@ static void stringset_extend(HCFGrammar *g, HStringMap *ret, } -static void pprint_char(FILE *f, char c) +void h_pprint_char(FILE *f, char c) { switch(c) { case '"': fputs("\\\"", f); break; @@ -619,7 +619,7 @@ static void pprint_charset_char(FILE *f, char c) case '"': fputc(c, f); break; case '-': fputs("\\-", f); break; case ']': fputs("\\-", f); break; - default: pprint_char(f, c); + default: h_pprint_char(f, c); } } @@ -667,7 +667,7 @@ static HCFChoice **pprint_string(FILE *f, HCFChoice **x) for(; *x; x++) { if((*x)->type != HCF_CHAR) break; - pprint_char(f, (*x)->chr); + h_pprint_char(f, (*x)->chr); } fputc('"', f); return x; @@ -678,7 +678,7 @@ void h_pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) switch(x->type) { case HCF_CHAR: fputc('"', f); - pprint_char(f, x->chr); + h_pprint_char(f, x->chr); fputc('"', f); break; case HCF_END: diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 6aba29c..a5de4d6 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -93,3 +93,4 @@ void h_pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq); void h_pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x); void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent); void h_pprint_stringset(FILE *file, const HStringMap *set, int indent); +void h_pprint_char(FILE *file, char c); From 67e5b2fee0e4ceff7fab9375065e737eda12dfe0 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 7 Jun 2013 16:30:16 +0200 Subject: [PATCH 36/95] add DFA pretty-printer --- src/backends/lalr.c | 99 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 509f76d..15236f5 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -317,6 +317,93 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* +/* Pretty-printers */ + +void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item) +{ + h_pprint_symbol(f, g, item->lhs); + fputs(" ->", f); + + HCFChoice **x = item->rhs; + HCFChoice **mark = item->rhs + item->mark; + if(*x == NULL) { + fputs("\"\"", f); + } else { + while(*x) { + if(x == mark) + fputc('.', f); + else + fputc(' ', f); + + if((*x)->type == HCF_CHAR) { + // condense character strings + fputc('"', f); + h_pprint_char(f, (*x)->chr); + for(x++; *x; x++) { + if(x == mark) + break; + if((*x)->type != HCF_CHAR) + break; + h_pprint_char(f, (*x)->chr); + } + fputc('"', f); + } else { + h_pprint_symbol(f, g, *x); + x++; + } + } + if(x == mark) + fputs(".", f); + } +} + +void h_pprint_lrstate(FILE *f, const HCFGrammar *g, + const HLRState *state, unsigned int indent) +{ + bool first = true; + const HHashTable *ht = state; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + const HLRItem *item = hte->key; + + if(!first) + for(unsigned int i=0; isymbol); + fprintf(f, "->%lu", t->to); +} + +void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, + const HLRDFA *dfa, unsigned int indent) +{ + for(size_t i=0; instates; i++) { + unsigned int indent2 = indent + fprintf(f, "%4lu: ", i); + h_pprint_lrstate(f, g, dfa->states[i], indent2); + for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { + const HLRTransition *t = x->elem; + if(t->from == i) { + for(unsigned int i=0; iast, 0, 2); From 
732545274afce6cdd1daf212a207a25b71deae21 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 7 Jun 2013 16:55:36 +0200 Subject: [PATCH 37/95] look up neighbor set index properly (was left uninitialized) --- src/backends/lalr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 15236f5..7a07c3d 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -250,6 +250,8 @@ HLRDFA *h_lalr_dfa(HCFGrammar *g) h_hashtable_put(states, neighbor, (void *)(uintptr_t)neighbor_idx); h_slist_push(work, neighbor); h_slist_push(work, (void *)(uintptr_t)neighbor_idx); + } else { + neighbor_idx = (uintptr_t)h_hashtable_get(states, neighbor); } // add transition "state --symbol--> neighbor" From f041775bb965613a2241facdf3d35f6ed3463c02 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 7 Jun 2013 22:38:26 +0200 Subject: [PATCH 38/95] declare h_cfgrammar_new in cfgrammar.h --- src/cfgrammar.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cfgrammar.h b/src/cfgrammar.h index a5de4d6..62b3320 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -60,6 +60,8 @@ static inline HStringMap *h_stringmap_get_char(const HStringMap *m, const uint8_ */ HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser); +HCFGrammar *h_cfgrammar_new(HAllocator *mm__); + /* Frees the given grammar and associated data. * Does *not* free parsers' CFG forms as created by h_desugar. */ From 3bb26162c35db804f49a148011f06c828419b028 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 7 Jun 2013 22:39:09 +0200 Subject: [PATCH 39/95] end-of-day scratch commit (LALR table generation) --- src/backends/lalr.c | 134 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 114 insertions(+), 20 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 7a07c3d..b3d9b5b 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -4,17 +4,6 @@ #include "../parsers/parser_internal.h" -// PLAN: -// data structures: -// - LR table is an array of hashtables that map grammar symbols (HCFChoice) -// to LRActions. - -// build LR(0) DFA -// extend with lookahead information by either: -// - reworking algorithm to propagate lookahead ("simple LALR generation") -// - follow sets of enhanced grammar ("conversion to SLR") - - /* Constructing the characteristic automaton (handle recognizer) */ // - states are hashsets containing LRItems @@ -288,17 +277,122 @@ HLRDFA *h_lalr_dfa(HCFGrammar *g) /* LALR table generation */ +typedef struct HLRAction_ { + enum {HLR_SHIFT, HLR_REDUCE} type; + union { + size_t nextstate; // used with shift + struct { + HCFChoice *lhs; + HCFChoice **rhs; + } production; // used with reduce + }; +} HLRAction; + +typedef struct HLRTable_ { + size_t nrows; + HHashTable **rows; // map symbols to HLRActions + HCFChoice *start; // start symbol + HArena *arena; + HAllocator *mm__; +} HLRTable; + +HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) +{ + HArena *arena = h_new_arena(mm__, 0); // default blocksize + assert(arena != NULL); + + HLRTable *ret = h_new(HLRTable, 1); + ret->nrows = nrows; + ret->rows = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); + ret->arena = arena; + ret->mm__ = mm__; + + for(size_t i=0; irows[i] = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); + + return ret; +} + +static HCFGrammar *transform_grammar(const HCFGrammar *g, const HLRTable *table, + const HLRDFA *dfa, HHashTable **syms) +{ + HCFGrammar *gt = h_cfgrammar_new(g->mm__); + HArena *arena = gt->arena; + + // 
old grammar symbol -> + //HHashTable *map = h_hashtable_new( + + for(size_t i=0; instates; i++) { + const HLRState *state = dfa->states[i]; + + syms[i] = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); + + + } + + // iterate over g->nts + const HHashTable *ht = g->nts; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + const HCFChoice *A = hte->key; + + // iterate over the productions of A + for(HCFSequence **p=A->seq; *p; p++) { + // find all transitions marked by A + // yields xAy -> rhs' + // trace rhs starting in state x and following the transitions + } + } + } + + return gt; +} + int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) { - // generate grammar - // construct dfa / determine lookahead - // extract table - // create an array of hashtables, one per state - // for each transition a--S-->b: - // add "shift, goto b" to table entry (a,S) - // for each state: - // add reduce entries for its accepting items - return -1; + // generate CFG from parser + // construct LR(0) DFA + // build parse table, shift-entries only + // for each transition a--S-->b, add "shift, goto b" to table entry (a,S) + // determine lookahead "by conversion to SLR" + // transform grammar to encode transitions in symbols + // -> lookahead for an item is the transformed left-hand side's follow set + // finish table; for each state: + // add reduce entries for its accepting items + // in case of conflict, add lookahead info + + HCFGrammar *g = h_cfgrammar(mm__, parser); + if(g == NULL) // backend not suitable (language not context-free) + return -1; + + HLRDFA *dfa = h_lalr_dfa(g); + if(dfa == NULL) // this should actually not happen + return -1; + + // create table with shift actions + HLRTable *table = h_lrtable_new(mm__, dfa->nstates); + for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { + HLRTransition *t = x->elem; + HLRAction *action = 
h_arena_malloc(table->arena, sizeof(HLRAction)); + action->type = HLR_SHIFT; + action->nextstate = t->to; + h_hashtable_put(table->rows[t->from], t->symbol, action); + } + + // mapping (state,item)-pairs to the symbols of the new grammar + HHashTable **syms = h_arena_malloc(g->arena, dfa->nstates * sizeof(HHashTable *)); + // XXX use a different arena for this (and other things) + + HCFGrammar *gt = transform_grammar(g, table, dfa, syms); + if(gt == NULL) // this should actually not happen + return -1; + + // XXX fill in reduce actions + + return 0; } void h_lalr_free(HParser *parser) From dabe4b07a94835041aee26f98b5c7e594767f1a2 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Mon, 10 Jun 2013 00:46:03 +0200 Subject: [PATCH 40/95] wip end-of-day commit - almost there (untested) --- src/backends/lalr.c | 328 +++++++++++++++++++++++++++++++++----------- 1 file changed, 246 insertions(+), 82 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index b3d9b5b..541a69a 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -4,7 +4,8 @@ #include "../parsers/parser_internal.h" -/* Constructing the characteristic automaton (handle recognizer) */ + +/* Data structures */ // - states are hashsets containing LRItems // - LRItems contain an optional lookahead set (HStringMap) @@ -19,8 +20,9 @@ typedef struct HLRDFA_ { } HLRDFA; typedef struct HLRTransition_ { - size_t from, to; // indices into 'states' array + size_t from; // index into 'states' array const HCFChoice *symbol; + size_t to; // index into 'states' array } HLRTransition; typedef struct HLRItem_ { @@ -31,6 +33,32 @@ typedef struct HLRItem_ { HStringMap *lookahead; // optional } HLRItem; +typedef struct HLRAction_ { + enum {HLR_SHIFT, HLR_REDUCE} type; + union { + size_t nextstate; // used with SHIFT + struct { + HCFChoice *lhs; // symbol carrying semantic actions etc. 
+ size_t length; // # of symbols in rhs + // NB: the rhs symbols are not needed for the parse + } production; // used with REDUCE + }; +} HLRAction; + +typedef struct HLRTable_ { + size_t nrows; + HHashTable **rows; // map symbols to HLRActions + HLRAction **forall; // shortcut to set an action for an entire row + HCFChoice *start; // start symbol + HSlist *inadeq; // indices of any inadequate states + HArena *arena; + HAllocator *mm__; +} HLRTable; + + + +/* Constructing the characteristic automaton (handle recognizer) */ + HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark) { HLRItem *ret = h_arena_malloc(a, sizeof(HLRItem)); @@ -157,7 +185,7 @@ static HHashSet *closure(HCFGrammar *g, const HHashSet *items) return ret; } -HLRDFA *h_lalr_dfa(HCFGrammar *g) +HLRDFA *h_lr0_dfa(HCFGrammar *g) { HArena *arena = g->arena; @@ -275,26 +303,24 @@ HLRDFA *h_lalr_dfa(HCFGrammar *g) -/* LALR table generation */ +/* LR(0) table generation */ -typedef struct HLRAction_ { - enum {HLR_SHIFT, HLR_REDUCE} type; - union { - size_t nextstate; // used with shift - struct { - HCFChoice *lhs; - HCFChoice **rhs; - } production; // used with reduce - }; -} HLRAction; +// XXX replace other hashtable iterations with this +// XXX move to internal.h or something +#define H_FOREACH_(HT) do { \ + const HHashTable *ht = HT; \ + for(size_t i=0; i < ht->capacity; i++) { \ + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { \ + if(hte->key == NULL) continue; -typedef struct HLRTable_ { - size_t nrows; - HHashTable **rows; // map symbols to HLRActions - HCFChoice *start; // start symbol - HArena *arena; - HAllocator *mm__; -} HLRTable; +#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_(HT) \ + const KEYVAR = hte->key; \ + VALVAR = hte->value; + +#define H_END_FOREACH \ + } \ + } \ + } while(0); HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) { @@ -313,91 +339,229 @@ HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) return ret; } -static 
HCFGrammar *transform_grammar(const HCFGrammar *g, const HLRTable *table, - const HLRDFA *dfa, HHashTable **syms) +void h_lrtable_free(HLRTable *table) { - HCFGrammar *gt = h_cfgrammar_new(g->mm__); - HArena *arena = gt->arena; + HAllocator *mm__ = table->mm__; + h_delete_arena(table->arena); + h_free(table); +} - // old grammar symbol -> - //HHashTable *map = h_hashtable_new( +static HLRAction *shift_action(HArena *arena, size_t nextstate) +{ + HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); + action->type = HLR_SHIFT; + action->nextstate = nextstate; + return action; +} +static HLRAction *reduce_action(HArena *arena, HCFChoice *lhs, size_t rhslen) +{ + HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); + action->type = HLR_REDUCE; + action->production.lhs = lhs; + action->production.length = rhslen; + return action; +} + +HLRTable *h_lr0_table(HCFGrammar *g) +{ + HAllocator *mm__ = g->mm__; + + // construct LR(0) DFA + HLRDFA *dfa = h_lr0_dfa(g); + if(!dfa) return NULL; + + HLRTable *table = h_lrtable_new(mm__, dfa->nstates); + HArena *arena = table->arena; + + // add shift entries + for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { + // for each transition x-A->y, add "shift, goto y" to table entry (x,A) + HLRTransition *t = x->elem; + + HLRAction *action = shift_action(arena, t->to); + h_hashtable_put(table->rows[t->from], t->symbol, action); + } + + // add reduce entries, record inadequate states for(size_t i=0; instates; i++) { - const HLRState *state = dfa->states[i]; - - syms[i] = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); - - - } - - // iterate over g->nts - const HHashTable *ht = g->nts; - for(size_t i=0; i < ht->capacity; i++) { - for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { - if(hte->key == NULL) - continue; - - const HCFChoice *A = hte->key; - - // iterate over the productions of A - for(HCFSequence **p=A->seq; *p; p++) { - // find all transitions marked by A - // yields xAy -> rhs' - // 
trace rhs starting in state x and following the transitions + // find reducible items in state + H_FOREACH(dfa->states[i], HLRItem *item, void *v_) + if(item->mark == item->len) { // mark at the end + // XXX store more informative stuff in the inadeq records? + if(table->forall[i]) { + // reduce/reduce conflict with a previous item + h_slist_push(table->inadeq, (void *)(uintptr_t)i); + } else if(!h_hashtable_empty(table->rows[i])) { + // shift/reduce conflict with one of the row's entries + h_slist_push(table->inadeq, (void *)(uintptr_t)i); + } else { + // set reduce action for the entire row + table->forall[i] = reduce_action(arena, item->lhs, item->len); + } } - } + H_END_FOREACH } - return gt; + return table; +} + + + +/* LALR-via-SLR grammar transformation */ + +static inline size_t seqsize(void *p_) +{ + size_t n=0; + for(void **p=p_; *p; p++) n++; + return n+1; +} + +static size_t follow_transition(const HLRTable *table, size_t x, HCFChoice *A) +{ + HLRAction *action = h_hashtable_get(table->rows[x], A); + assert(action != NULL); + assert(action->type == HLR_SHIFT); + return action->nextstate; +} + +static HCFChoice *transform_symbol(const HLRTable *table, HHashTable *map, + size_t x, HCFChoice *B, size_t z); + +static HCFChoice *transform_productions(const HLRTable *table, HHashTable *map, + size_t x, HCFChoice *xAy) +{ + HArena *arena = map->arena; + + HCFSequence **seq = h_arena_malloc(arena, seqsize(xAy->seq) + * sizeof(HCFSequence *)); + HCFSequence **p, **q; + for(p=xAy->seq, q=seq; *p; p++, q++) { + // trace rhs starting in state x and following the transitions + // xAy -> xBz ... 
+ + HCFChoice **B = (*p)->items; + HCFChoice **xBz = h_arena_malloc(arena, seqsize(B) * sizeof(HCFChoice *)); + for(; *B; B++, xBz++) { + size_t z = follow_transition(table, x, *B); + *xBz = transform_symbol(table, map, x, *B, z); + x=z; + } + *xBz = NULL; + + *q = h_arena_malloc(arena, sizeof(HCFSequence)); + (*q)->items = xBz; + } + *q = NULL; + xAy->seq = seq; + + return xAy; // pass-through +} + +static inline HLRTransition *transition(HArena *arena, + size_t x, const HCFChoice *A, size_t y) +{ + HLRTransition *t = h_arena_malloc(arena, sizeof(HLRTransition)); + t->from = x; + t->symbol = A; + t->to = y; + return t; +} + +static HCFChoice *transform_symbol(const HLRTable *table, HHashTable *map, + size_t x, HCFChoice *B, size_t z) +{ + HArena *arena = map->arena; + + // look up the transition in map, create symbol if not found + HLRTransition *x_B_z = transition(arena, x, B, z); + HCFChoice *xBz = h_hashtable_get(map, x_B_z); + if(!xBz) { + HCFChoice *xBz = h_arena_malloc(arena, sizeof(HCFChoice)); + *xBz = *B; + h_hashtable_put(map, x_B_z, xBz); + } + + return transform_productions(table, map, x, xBz); +} + +static bool eq_transition(const void *p, const void *q) +{ + const HLRTransition *a=p, *b=q; + return (a->from == b->from && a->to == b->to && a->symbol == b->symbol); +} + +static HHashValue hash_transition(const void *p) +{ + const HLRTransition *t = p; + return (h_hash_ptr(t->symbol) + t->from + t->to); // XXX ? +} + +static HHashTable *enhance_grammar(const HCFGrammar *g, const HLRTable *tbl) +{ + HArena *arena = g->arena; // XXX ? 
+ HHashTable *map = h_hashtable_new(arena, eq_transition, hash_transition); + + // copy the start symbol over + HCFChoice *start = h_arena_malloc(arena, sizeof(HCFChoice)); + *start = *(g->start); + h_hashtable_put(map, g->start, start); + + transform_productions(tbl, map, 0, start); + + return map; +} + + + +/* LALR table generation */ + +bool is_inadequate(HLRTable *table, size_t state) +{ + // XXX +} + +bool has_conflicts(HLRTable *table) +{ + return !h_slist_empty(table->inadeq); } int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) { // generate CFG from parser - // construct LR(0) DFA - // build parse table, shift-entries only - // for each transition a--S-->b, add "shift, goto b" to table entry (a,S) - // determine lookahead "by conversion to SLR" - // transform grammar to encode transitions in symbols - // -> lookahead for an item is the transformed left-hand side's follow set - // finish table; for each state: - // add reduce entries for its accepting items - // in case of conflict, add lookahead info + // build LR(0) table + // if necessary, resolve conflicts "by conversion to SLR" HCFGrammar *g = h_cfgrammar(mm__, parser); if(g == NULL) // backend not suitable (language not context-free) return -1; - HLRDFA *dfa = h_lalr_dfa(g); - if(dfa == NULL) // this should actually not happen + HLRTable *table = h_lr0_table(g); + if(table == NULL) // this should normally not happen return -1; - // create table with shift actions - HLRTable *table = h_lrtable_new(mm__, dfa->nstates); - for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { - HLRTransition *t = x->elem; - HLRAction *action = h_arena_malloc(table->arena, sizeof(HLRAction)); - action->type = HLR_SHIFT; - action->nextstate = t->to; - h_hashtable_put(table->rows[t->from], t->symbol, action); + if(has_conflicts(table)) { + HHashTable *map = enhance_grammar(g, table); + if(map == NULL) // this should normally not happen + return -1; + + // XXX resolve conflicts + // iterate over 
dfa's transitions where 'from' state is inadequate + // look up enhanced symbol corr. to the transition + // for each terminal in follow set of enh. symbol: + // put reduce action into table cell (state, terminal) + // conflict if already occupied } - // mapping (state,item)-pairs to the symbols of the new grammar - HHashTable **syms = h_arena_malloc(g->arena, dfa->nstates * sizeof(HHashTable *)); - // XXX use a different arena for this (and other things) - - HCFGrammar *gt = transform_grammar(g, table, dfa, syms); - if(gt == NULL) // this should actually not happen - return -1; - - // XXX fill in reduce actions - - return 0; + h_cfgrammar_free(g); + parser->backend_data = table; + return has_conflicts(table)? -1 : 0; } void h_lalr_free(HParser *parser) { - // XXX free data structures + HLRTable *table = parser->backend_data; + h_lrtable_free(table); parser->backend_data = NULL; parser->backend = PB_PACKRAT; } @@ -538,7 +702,7 @@ int test_lalr(void) h_pprint_grammar(stdout, g, 0); printf("\n==== D F A ====\n"); - HLRDFA *dfa = h_lalr_dfa(g); + HLRDFA *dfa = h_lr0_dfa(g); if(dfa) h_pprint_lrdfa(stdout, g, dfa, 0); else From 4cd51b8953507fabb0c6cc73544a783e736d0288 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Mon, 10 Jun 2013 22:05:12 +0200 Subject: [PATCH 41/95] make it compile and fix some basic segfaults --- src/backends/lalr.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 541a69a..5b340e9 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -308,14 +308,16 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) // XXX replace other hashtable iterations with this // XXX move to internal.h or something #define H_FOREACH_(HT) do { \ - const HHashTable *ht = HT; \ - for(size_t i=0; i < ht->capacity; i++) { \ - for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { \ - if(hte->key == NULL) continue; + const HHashTable *ht__ = HT; \ + for(size_t i__=0; i__ < ht__->capacity; i__++) { \ + for(HHashTableEntry *hte__ = &ht__->contents[i]; hte__; hte__ = hte__->next) { \ + if(hte__->key == NULL) continue; -#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_(HT) \ - const KEYVAR = hte->key; \ - VALVAR = hte->value; +#define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ + const KEYVAR = hte__->key; \ + +#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT) \ + VALVAR = hte__->value; #define H_END_FOREACH \ } \ @@ -330,6 +332,8 @@ HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) HLRTable *ret = h_new(HLRTable, 1); ret->nrows = nrows; ret->rows = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); + ret->forall = h_arena_malloc(arena, nrows * sizeof(HLRAction *)); + ret->inadeq = h_slist_new(arena); ret->arena = arena; ret->mm__ = mm__; @@ -386,7 +390,7 @@ HLRTable *h_lr0_table(HCFGrammar *g) // add reduce entries, record inadequate states for(size_t i=0; instates; i++) { // find reducible items in state - H_FOREACH(dfa->states[i], HLRItem *item, void *v_) + H_FOREACH_KEY(dfa->states[i], HLRItem *item) if(item->mark == item->len) { // mark at the end // XXX store more informative stuff in the inadeq records? 
if(table->forall[i]) { @@ -519,6 +523,7 @@ static HHashTable *enhance_grammar(const HCFGrammar *g, const HLRTable *tbl) bool is_inadequate(HLRTable *table, size_t state) { // XXX + return false; } bool has_conflicts(HLRTable *table) From 04487ff80fce6d5e6932a4cf2516e0ea408928b7 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Mon, 10 Jun 2013 23:45:25 +0200 Subject: [PATCH 42/95] add table pretty-printer --- src/backends/lalr.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 5b340e9..ccffaf3 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -316,7 +316,7 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) #define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ const KEYVAR = hte__->key; \ -#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT) \ +#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ VALVAR = hte__->value; #define H_END_FOREACH \ @@ -667,6 +667,40 @@ void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, } } +void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action) +{ + if(action->type == HLR_SHIFT) { + fprintf(f, "s%lu", action->nextstate); + } else { + fputc('r', f); + // XXX reference the production somehow + } +} + +void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, + unsigned int indent) +{ + for(size_t i=0; inrows; i++) { + for(unsigned int j=0; jforall[i] && h_hashtable_empty(table->rows[i])) { + fputs(" - ", f); + pprint_lraction(f, g, table->forall[i]); + fputs(" -", f); + } + H_FOREACH(table->rows[i], HCFChoice *symbol, HLRAction *action) + fputc(' ', f); // separator + h_pprint_symbol(f, g, symbol); + fputc(':', f); + if(table->forall[i]) { + fputc(action->type == HLR_SHIFT? 's' : 'r', f); + fputc('/', f); + fputc(table->forall[i]->type == HLR_SHIFT? 
's' : 'r', f); + } + H_END_FOREACH + fputc('\n', f); + } +} @@ -713,12 +747,19 @@ int test_lalr(void) else fprintf(stderr, "h_lalr_dfa failed\n"); + printf("\n==== L R ( 0 ) T A B L E ====\n"); + HLRTable *table0 = h_lr0_table(g); + if(table0) + h_pprint_lrtable(stdout, g, table0, 0); + else + fprintf(stderr, "h_lr0_table failed\n"); + printf("\n==== L A L R T A B L E ====\n"); if(h_compile(p, PB_LALR, NULL)) { fprintf(stderr, "does not compile\n"); return 2; } - // print LALR(1) table + h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); HParseResult *res = h_parse(p, (uint8_t *)"xyya", 4); From bbdced376ef5f577a63a0e2aaa946bb0d1b5efee Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Tue, 11 Jun 2013 00:27:34 +0200 Subject: [PATCH 43/95] derps etc --- src/backends/lalr.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index ccffaf3..2927b2a 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -307,14 +307,16 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) // XXX replace other hashtable iterations with this // XXX move to internal.h or something -#define H_FOREACH_(HT) do { \ +#define H_FOREACH_(HT) { \ const HHashTable *ht__ = HT; \ for(size_t i__=0; i__ < ht__->capacity; i__++) { \ - for(HHashTableEntry *hte__ = &ht__->contents[i]; hte__; hte__ = hte__->next) { \ + for(HHashTableEntry *hte__ = &ht__->contents[i__]; \ + hte__; \ + hte__ = hte__->next) { \ if(hte__->key == NULL) continue; #define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ - const KEYVAR = hte__->key; \ + const KEYVAR = hte__->key; #define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ VALVAR = hte__->value; @@ -322,7 +324,7 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) #define H_END_FOREACH \ } \ } \ - } while(0); + } HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) { @@ -337,8 +339,10 @@ HLRTable *h_lrtable_new(HAllocator *mm__, 
size_t nrows) ret->arena = arena; ret->mm__ = mm__; - for(size_t i=0; irows[i] = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); + ret->forall[i] = NULL; + } return ret; } @@ -687,17 +691,20 @@ void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, fputs(" - ", f); pprint_lraction(f, g, table->forall[i]); fputs(" -", f); + } else { + H_FOREACH(table->rows[i], HCFChoice *symbol, HLRAction *action) + fputc(' ', f); // separator + h_pprint_symbol(f, g, symbol); + fputc(':', f); + if(table->forall[i]) { + fputc(action->type == HLR_SHIFT? 's' : 'r', f); + fputc('/', f); + fputc(table->forall[i]->type == HLR_SHIFT? 's' : 'r', f); + } else { + pprint_lraction(f, g, action); + } + H_END_FOREACH } - H_FOREACH(table->rows[i], HCFChoice *symbol, HLRAction *action) - fputc(' ', f); // separator - h_pprint_symbol(f, g, symbol); - fputc(':', f); - if(table->forall[i]) { - fputc(action->type == HLR_SHIFT? 's' : 'r', f); - fputc('/', f); - fputc(table->forall[i]->type == HLR_SHIFT? 's' : 'r', f); - } - H_END_FOREACH fputc('\n', f); } } @@ -753,6 +760,7 @@ int test_lalr(void) h_pprint_lrtable(stdout, g, table0, 0); else fprintf(stderr, "h_lr0_table failed\n"); + h_lrtable_free(table0); printf("\n==== L A L R T A B L E ====\n"); if(h_compile(p, PB_LALR, NULL)) { From 0a7548bb2fb39f5cd130cebce41d66facb46bcc3 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Wed, 12 Jun 2013 15:07:19 +0200 Subject: [PATCH 44/95] lr driver, first stab (still bugged) --- src/backends/lalr.c | 282 ++++++++++++++++++++++++++++++++------------ 1 file changed, 207 insertions(+), 75 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 2927b2a..5b8ea65 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -7,11 +7,7 @@ /* Data structures */ -// - states are hashsets containing LRItems -// - LRItems contain an optional lookahead set (HStringMap) -// - states (hashsets) get hash and comparison functions that ignore the lookahead - -typedef HHashSet HLRState; +typedef HHashSet HLRState; // states are sets of LRItems typedef struct HLRDFA_ { size_t nstates; @@ -30,7 +26,6 @@ typedef struct HLRItem_ { HCFChoice **rhs; // NULL-terminated size_t len; // number of elements in rhs size_t mark; - HStringMap *lookahead; // optional } HLRItem; typedef struct HLRAction_ { @@ -56,27 +51,28 @@ typedef struct HLRTable_ { } HLRTable; - -/* Constructing the characteristic automaton (handle recognizer) */ - -HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark) +// compare symbols - terminals by value, others by pointer +static bool eq_symbol(const void *p, const void *q) { - HLRItem *ret = h_arena_malloc(a, sizeof(HLRItem)); - - size_t len = 0; - for(HCFChoice **p=rhs; *p; p++) len++; - assert(mark <= len); - - ret->lhs = lhs; - ret->rhs = rhs; - ret->len = len; - ret->mark = mark; - ret->lookahead = NULL; - - return ret; + const HCFChoice *x=p, *y=q; + return (x==y + || (x->type==HCF_END && y->type==HCF_END) + || (x->type==HCF_CHAR && y->type==HCF_CHAR && x->chr==y->chr)); } -// compare LALR items - ignores lookahead +// hash symbols - terminals by value, others by pointer +static HHashValue hash_symbol(const void *p) +{ + const HCFChoice *x=p; + if(x->type == HCF_END) + return 0; + else if(x->type == HCF_CHAR) + return x->chr * 33; + else + return h_hash_ptr(p); +} + +// compare LALR items 
by value static bool eq_lalr_item(const void *p, const void *q) { const HLRItem *a=p, *b=q; @@ -124,11 +120,80 @@ static HHashValue hash_lalr_itemset(const void *p) return hash; } +HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark) +{ + HLRItem *ret = h_arena_malloc(a, sizeof(HLRItem)); + + size_t len = 0; + for(HCFChoice **p=rhs; *p; p++) len++; + assert(mark <= len); + + ret->lhs = lhs; + ret->rhs = rhs; + ret->len = len; + ret->mark = mark; + + return ret; +} + static inline HLRState *h_lrstate_new(HArena *arena) { return h_hashset_new(arena, eq_lalr_item, hash_lalr_item); } +HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) +{ + HArena *arena = h_new_arena(mm__, 0); // default blocksize + assert(arena != NULL); + + HLRTable *ret = h_new(HLRTable, 1); + ret->nrows = nrows; + ret->rows = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); + ret->forall = h_arena_malloc(arena, nrows * sizeof(HLRAction *)); + ret->inadeq = h_slist_new(arena); + ret->arena = arena; + ret->mm__ = mm__; + + for(size_t i=0; irows[i] = h_hashtable_new(arena, eq_symbol, hash_symbol); + ret->forall[i] = NULL; + } + + return ret; +} + +void h_lrtable_free(HLRTable *table) +{ + HAllocator *mm__ = table->mm__; + h_delete_arena(table->arena); + h_free(table); +} + +// XXX replace other hashtable iterations with this +// XXX move to internal.h or something +#define H_FOREACH_(HT) { \ + const HHashTable *ht__ = HT; \ + for(size_t i__=0; i__ < ht__->capacity; i__++) { \ + for(HHashTableEntry *hte__ = &ht__->contents[i__]; \ + hte__; \ + hte__ = hte__->next) { \ + if(hte__->key == NULL) continue; + +#define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ + const KEYVAR = hte__->key; + +#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ + VALVAR = hte__->value; + +#define H_END_FOREACH \ + } \ + } \ + } + + + +/* Constructing the characteristic automaton (handle recognizer) */ + static HLRItem *advance_mark(HArena *arena, const HLRItem *item) { 
assert(item->rhs[item->mark] != NULL); @@ -224,7 +289,7 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) HLRState *state = h_slist_pop(work); // maps edge symbols to neighbor states (item sets) of s - HHashTable *neighbors = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); + HHashTable *neighbors = h_hashtable_new(arena, eq_symbol, hash_symbol); // iterate over closure and generate neighboring sets const HHashTable *ht = closure(g, state); @@ -305,55 +370,6 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) /* LR(0) table generation */ -// XXX replace other hashtable iterations with this -// XXX move to internal.h or something -#define H_FOREACH_(HT) { \ - const HHashTable *ht__ = HT; \ - for(size_t i__=0; i__ < ht__->capacity; i__++) { \ - for(HHashTableEntry *hte__ = &ht__->contents[i__]; \ - hte__; \ - hte__ = hte__->next) { \ - if(hte__->key == NULL) continue; - -#define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ - const KEYVAR = hte__->key; - -#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ - VALVAR = hte__->value; - -#define H_END_FOREACH \ - } \ - } \ - } - -HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) -{ - HArena *arena = h_new_arena(mm__, 0); // default blocksize - assert(arena != NULL); - - HLRTable *ret = h_new(HLRTable, 1); - ret->nrows = nrows; - ret->rows = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); - ret->forall = h_arena_malloc(arena, nrows * sizeof(HLRAction *)); - ret->inadeq = h_slist_new(arena); - ret->arena = arena; - ret->mm__ = mm__; - - for(size_t i=0; irows[i] = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); - ret->forall[i] = NULL; - } - - return ret; -} - -void h_lrtable_free(HLRTable *table) -{ - HAllocator *mm__ = table->mm__; - h_delete_arena(table->arena); - h_free(table); -} - static HLRAction *shift_action(HArena *arena, size_t nextstate) { HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); @@ -579,9 +595,125 @@ void h_lalr_free(HParser *parser) /* LR driver */ +const HLRAction * +h_lr_lookup(const HLRTable *table, 
size_t state, const HCFChoice *symbol) +{ + assert(state < table->nrows); + if(table->forall[state]) { + assert(h_hashtable_empty(table->rows[state])); // that would be a conflict + return table->forall[state]; + } else { + return h_hashtable_get(table->rows[state], symbol); + } +} + +// XXX also, what about charsets!? + HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) { - return NULL; + HLRTable *table = parser->backend_data; + if(!table) + return NULL; + + HArena *arena = h_new_arena(mm__, 0); // will hold the results + HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse + HSlist *left = h_slist_new(tarena); // left stack; reductions happen here + HSlist *right = h_slist_new(tarena); // right stack; input appears here + + // stack layout: + // on the left stack, we put pairs: (saved state, semantic value) + // on the right stack, we put pairs: (symbol, semantic value) + + // run while the recognizer finds handles in the input + size_t state = 0; + while(1) { + // make sure there is input on the right stack + if(h_slist_empty(right)) { + HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); + HParsedToken *v; + + uint8_t c = h_read_bits(stream, 8, false); + + if(stream->overrun) { // end of input + x->type = HCF_END; + v = NULL; + } else { + x->type = HCF_CHAR; + x->chr = c; + v = h_arena_malloc(arena, sizeof(HParsedToken)); + v->token_type = TT_UINT; + v->uint = c; + } + + h_slist_push(right, v); + h_slist_push(right, x); + } + + // peek at input symbol on the right side + HCFChoice *symbol = right->head->elem; + + // table lookup + const HLRAction *action = h_lr_lookup(table, state, symbol); + if(action == NULL) + break; // no handle recognizable in input, terminate parsing + + if(action->type == HLR_SHIFT) { + h_slist_push(left, (void *)(uintptr_t)state); + h_slist_pop(right); // symbol (discard) + h_slist_push(left, h_slist_pop(right)); // semantic value + state = action->nextstate; + } else { + 
assert(action->type == HLR_REDUCE); + size_t len = action->production.length; + HCFChoice *symbol = action->production.lhs; + + // semantic value of the reduction result + HParsedToken *value = h_arena_malloc(arena, sizeof(HParsedToken)); + value->token_type = TT_SEQUENCE; + value->seq = h_carray_new_sized(arena, len); + + // pull values off the left stack, rewinding state accordingly + HParsedToken *v; + for(size_t i=0; iseq->elements[len-1-i] = v; + value->seq->used++; + } + // result position equals position of left-most symbol + value->index = v->index; + value->bit_offset = v->bit_offset; + + // perform token reshape if indicated + if(symbol->reshape) + value = (HParsedToken *)symbol->reshape(make_result(arena, value)); + + // call validation and semantic action, if present + if(symbol->pred && !symbol->pred(make_result(tarena, value))) + break; // validation failed -> no parse + if(symbol->action) + value = (HParsedToken *)symbol->action(make_result(arena, value)); + + // push result (value, symbol) onto the right stack + h_slist_push(right, value); + h_slist_push(right, symbol); + } + } + + h_delete_arena(tarena); + + // parsing was successful iff the start symbol is on top of the right stack + if(h_slist_pop(right) == table->start) { + // next on the right stack is the start symbol's semantic value + HParsedToken *result = h_slist_pop(right); + assert(result != NULL); + return make_result(arena, result); + } else { + h_delete_arena(arena); + return NULL; + } } From 805dfeb363ef22ba5e4c64fa19b37bb300cba5aa Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Wed, 12 Jun 2013 15:33:53 +0200 Subject: [PATCH 45/95] duh, forgot to record the start symbol in table --- src/backends/lalr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 5b8ea65..a11baf2 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -398,6 +398,9 @@ HLRTable *h_lr0_table(HCFGrammar *g) HLRTable *table = h_lrtable_new(mm__, dfa->nstates); HArena *arena = table->arena; + // remember start symbol + table->start = g->start; + // add shift entries for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { // for each transition x-A->y, add "shift, goto y" to table entry (x,A) From ce387d81b685844ac471cc23e9c3dfb94fd4c34e Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 12 Jun 2013 15:48:52 +0200 Subject: [PATCH 46/95] save production rhs in reduce actions for pretty-printing if not compiled NDEBUG --- src/backends/lalr.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index a11baf2..e974da6 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -35,7 +35,9 @@ typedef struct HLRAction_ { struct { HCFChoice *lhs; // symbol carrying semantic actions etc. 
size_t length; // # of symbols in rhs - // NB: the rhs symbols are not needed for the parse +#ifndef NDEBUG + HCFChoice **rhs; // NB: the rhs symbols are not needed for the parse +#endif } production; // used with REDUCE }; } HLRAction; @@ -378,12 +380,15 @@ static HLRAction *shift_action(HArena *arena, size_t nextstate) return action; } -static HLRAction *reduce_action(HArena *arena, HCFChoice *lhs, size_t rhslen) +static HLRAction *reduce_action(HArena *arena, const HLRItem *item) { HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); action->type = HLR_REDUCE; - action->production.lhs = lhs; - action->production.length = rhslen; + action->production.lhs = item->lhs; + action->production.length = item->len; +#ifndef NDEBUG + action->production.rhs = item->rhs; +#endif return action; } @@ -424,7 +429,7 @@ HLRTable *h_lr0_table(HCFGrammar *g) h_slist_push(table->inadeq, (void *)(uintptr_t)i); } else { // set reduce action for the entire row - table->forall[i] = reduce_action(arena, item->lhs, item->len); + table->forall[i] = reduce_action(arena, item); } } H_END_FOREACH @@ -811,8 +816,15 @@ void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action) if(action->type == HLR_SHIFT) { fprintf(f, "s%lu", action->nextstate); } else { - fputc('r', f); - // XXX reference the production somehow +#ifdef NDEBUG + // if we can't print the production, at least print its length + fprintf(f, "r[%lu]", action->production.length); +#else + fputs("r(", f); + HCFSequence seq = {action->production.rhs}; + h_pprint_sequence(f, g, &seq); + fputc(')', f); +#endif } } From 41d4be4b84b8abd42bc968d2f8bb01d758cf1eac Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Wed, 12 Jun 2013 15:54:46 +0200 Subject: [PATCH 47/95] avoid a segfault on getting the token position --- src/backends/lalr.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index e974da6..8178f2d 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -690,9 +690,13 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* value->seq->elements[len-1-i] = v; value->seq->used++; } - // result position equals position of left-most symbol - value->index = v->index; - value->bit_offset = v->bit_offset; + if(v) { + // result position equals position of left-most symbol + value->index = v->index; + value->bit_offset = v->bit_offset; + } else { + // XXX how to get the position in this case? + } // perform token reshape if indicated if(symbol->reshape) From 4b5a93666723380e26e58ea23fad5c244b6ee66a Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 12 Jun 2013 16:38:50 +0200 Subject: [PATCH 48/95] handle charsets --- src/backends/lalr.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 8178f2d..66fe42c 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -229,15 +229,33 @@ static HHashSet *closure(HCFGrammar *g, const HHashSet *items) HCFChoice *sym = item->rhs[item->mark]; // symbol after mark // if there is a non-terminal after the mark, follow it - // XXX: do we have to count HCF_CHARSET as nonterminal? 
- if(sym != NULL && sym->type == HCF_CHOICE) { + // NB: unlike LLk, we do consider HCF_CHARSET a non-terminal here + if(sym != NULL && (sym->type==HCF_CHOICE || sym->type==HCF_CHARSET)) { // add items corresponding to the productions of sym - for(HCFSequence **p=sym->seq; *p; p++) { - HLRItem *it = h_lritem_new(arena, sym, (*p)->items, 0); - if(!h_hashset_present(ret, it)) { - h_hashset_put(ret, it); - h_slist_push(work, it); + if(sym->type == HCF_CHOICE) { + for(HCFSequence **p=sym->seq; *p; p++) { + HLRItem *it = h_lritem_new(arena, sym, (*p)->items, 0); + if(!h_hashset_present(ret, it)) { + h_hashset_put(ret, it); + h_slist_push(work, it); + } } + } else { // HCF_CHARSET + for(unsigned int i=0; i<256; i++) { + if(charset_isset(sym->charset, i)) { + HCFChoice **rhs = h_arena_malloc(arena, 2 * sizeof(HCFChoice *)); + rhs[0] = h_arena_malloc(arena, sizeof(HCFChoice)); + rhs[0]->type = HCF_CHAR; + rhs[0]->chr = i; + rhs[1] = NULL; + HLRItem *it = h_lritem_new(arena, sym, rhs, 0); + h_hashset_put(ret, it); + // single-character item needs no further work + } + } + // if sym is a non-terminal, we need a reshape on it + // this seems as good a place as any to set it + sym->reshape = h_act_first; } // if sym derives epsilon, also advance over it @@ -615,8 +633,6 @@ h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol) } } -// XXX also, what about charsets!? 
- HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) { HLRTable *table = parser->backend_data; @@ -884,7 +900,7 @@ int test_lalr(void) */ // XXX make LALR example - HParser *X = h_optional(h_ch('x')); + HParser *X = h_optional(h_in((uint8_t *)"rst", 3)); HParser *Y = h_sequence(h_ch('y'), h_ch('y'), NULL); HParser *A = h_sequence(X, Y, h_ch('a'), NULL); HParser *B = h_sequence(Y, h_ch('b'), NULL); @@ -921,7 +937,7 @@ int test_lalr(void) h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); - HParseResult *res = h_parse(p, (uint8_t *)"xyya", 4); + HParseResult *res = h_parse(p, (uint8_t *)"syya", 4); if(res) h_pprint(stdout, res->ast, 0, 2); else From ddfd3796a724b8819744876ea0603e68c7e2d340 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 12 Jun 2013 16:48:59 +0200 Subject: [PATCH 49/95] why not include the lhs in pretty-printed reduce entries? --- src/backends/lalr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 66fe42c..aad47d4 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -836,15 +836,17 @@ void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action) if(action->type == HLR_SHIFT) { fprintf(f, "s%lu", action->nextstate); } else { + fputs("r(", f); + h_pprint_symbol(f, g, action->production.lhs); + fputs(" -> ", f); #ifdef NDEBUG // if we can't print the production, at least print its length - fprintf(f, "r[%lu]", action->production.length); + fprintf(f, "[%lu]", action->production.length); #else - fputs("r(", f); HCFSequence seq = {action->production.rhs}; h_pprint_sequence(f, g, &seq); - fputc(')', f); #endif + fputc(')', f); } } From fd297b636c05c6e46882485131d8f519c6fcb954 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Thu, 13 Jun 2013 14:45:26 +0200 Subject: [PATCH 50/95] add LALR conflict resolution (untested) --- src/backends/lalr.c | 170 ++++++++++++++++++++++++++++++++++---------- src/cfgrammar.c | 5 ++ src/cfgrammar.h | 1 + 3 files changed, 138 insertions(+), 38 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index aad47d4..7e83c53 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -52,6 +52,13 @@ typedef struct HLRTable_ { HAllocator *mm__; } HLRTable; +typedef struct HLREnhGrammar_ { + HCFGrammar *grammar; // enhanced grammar + HHashTable *tmap; // maps transitions to enhanced-grammar symbols + HHashTable *smap; // maps enhanced-grammar symbols to transitions + HArena *arena; +} HLREnhGrammar; + // compare symbols - terminals by value, others by pointer static bool eq_symbol(const void *p, const void *q) @@ -410,14 +417,10 @@ static HLRAction *reduce_action(HArena *arena, const HLRItem *item) return action; } -HLRTable *h_lr0_table(HCFGrammar *g) +HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) { HAllocator *mm__ = g->mm__; - // construct LR(0) DFA - HLRDFA *dfa = h_lr0_dfa(g); - if(!dfa) return NULL; - HLRTable *table = h_lrtable_new(mm__, dfa->nstates); HArena *arena = table->arena; @@ -475,13 +478,13 @@ static size_t follow_transition(const HLRTable *table, size_t x, HCFChoice *A) return action->nextstate; } -static HCFChoice *transform_symbol(const HLRTable *table, HHashTable *map, +static HCFChoice *transform_symbol(const HLRTable *table, HLREnhGrammar *eg, size_t x, HCFChoice *B, size_t z); -static HCFChoice *transform_productions(const HLRTable *table, HHashTable *map, +static HCFChoice *transform_productions(const HLRTable *table, HLREnhGrammar *eg, size_t x, HCFChoice *xAy) { - HArena *arena = map->arena; + HArena *arena = eg->arena; HCFSequence **seq = h_arena_malloc(arena, seqsize(xAy->seq) * sizeof(HCFSequence *)); @@ -494,7 +497,7 @@ static HCFChoice *transform_productions(const HLRTable *table, 
HHashTable *map, HCFChoice **xBz = h_arena_malloc(arena, seqsize(B) * sizeof(HCFChoice *)); for(; *B; B++, xBz++) { size_t z = follow_transition(table, x, *B); - *xBz = transform_symbol(table, map, x, *B, z); + *xBz = transform_symbol(table, eg, x, *B, z); x=z; } *xBz = NULL; @@ -518,21 +521,22 @@ static inline HLRTransition *transition(HArena *arena, return t; } -static HCFChoice *transform_symbol(const HLRTable *table, HHashTable *map, +static HCFChoice *transform_symbol(const HLRTable *table, HLREnhGrammar *eg, size_t x, HCFChoice *B, size_t z) { - HArena *arena = map->arena; + HArena *arena = eg->arena; // look up the transition in map, create symbol if not found HLRTransition *x_B_z = transition(arena, x, B, z); - HCFChoice *xBz = h_hashtable_get(map, x_B_z); + HCFChoice *xBz = h_hashtable_get(eg->tmap, x_B_z); if(!xBz) { HCFChoice *xBz = h_arena_malloc(arena, sizeof(HCFChoice)); *xBz = *B; - h_hashtable_put(map, x_B_z, xBz); + h_hashtable_put(eg->tmap, x_B_z, xBz); + h_hashtable_put(eg->smap, xBz, x_B_z); } - return transform_productions(table, map, x, xBz); + return transform_productions(table, eg, x, xBz); } static bool eq_transition(const void *p, const void *q) @@ -547,39 +551,71 @@ static HHashValue hash_transition(const void *p) return (h_hash_ptr(t->symbol) + t->from + t->to); // XXX ? } -static HHashTable *enhance_grammar(const HCFGrammar *g, const HLRTable *tbl) +static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRTable *tbl) { + HAllocator *mm__ = g->mm__; HArena *arena = g->arena; // XXX ? 
- HHashTable *map = h_hashtable_new(arena, eq_transition, hash_transition); + + HLREnhGrammar *eg = h_arena_malloc(arena, sizeof(HLREnhGrammar)); + eg->tmap = h_hashtable_new(arena, eq_transition, hash_transition); + eg->smap = h_hashtable_new(arena, eq_transition, hash_transition); + eg->arena = arena; // copy the start symbol over HCFChoice *start = h_arena_malloc(arena, sizeof(HCFChoice)); *start = *(g->start); - h_hashtable_put(map, g->start, start); - transform_productions(tbl, map, 0, start); + transform_productions(tbl, eg, 0, start); - return map; + eg->grammar = h_cfgrammar_(mm__, start); + return eg; } /* LALR table generation */ -bool is_inadequate(HLRTable *table, size_t state) -{ - // XXX - return false; -} - -bool has_conflicts(HLRTable *table) +static inline bool has_conflicts(HLRTable *table) { return !h_slist_empty(table->inadeq); } +// place a new entry in tbl; records conflicts in tbl->inadeq +// returns 0 on success, -1 on conflict +// ignores forall entries +int h_lrtable_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action) +{ + HLRAction *prev = h_hashtable_get(tbl->rows[state], x); + if(prev && prev != action) { + // conflict + h_slist_push(tbl->inadeq, (void *)(uintptr_t)state); + return -1; + } else { + h_hashtable_put(tbl->rows[state], x, action); + return 0; + } +} + +// check whether a sequence of enhanced-grammar symbols (p) matches the given +// (original-grammar) production rhs and terminates in the given end state. 
+bool match_production(HLREnhGrammar *eg, HCFChoice **p, + HCFChoice **rhs, size_t endstate) +{ + HLRTransition *t; + for(; *p && *rhs; p++, rhs++) { + t = h_hashtable_get(eg->smap, *p); + assert(t != NULL); + if(!eq_symbol(t->symbol, *rhs)) + return false; + } + return (*p == *rhs // both NULL + && t->to == endstate); +} + int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) { // generate CFG from parser + // construct LR(0) DFA // build LR(0) table // if necessary, resolve conflicts "by conversion to SLR" @@ -587,21 +623,79 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) if(g == NULL) // backend not suitable (language not context-free) return -1; - HLRTable *table = h_lr0_table(g); - if(table == NULL) // this should normally not happen + HLRDFA *dfa = h_lr0_dfa(g); + if(dfa == NULL) { // this should normally not happen + h_cfgrammar_free(g); return -1; + } + + HLRTable *table = h_lr0_table(g, dfa); + if(table == NULL) { // this should normally not happen + h_cfgrammar_free(g); + return -1; + } if(has_conflicts(table)) { - HHashTable *map = enhance_grammar(g, table); - if(map == NULL) // this should normally not happen - return -1; + HArena *arena = table->arena; - // XXX resolve conflicts - // iterate over dfa's transitions where 'from' state is inadequate - // look up enhanced symbol corr. to the transition - // for each terminal in follow set of enh. 
symbol: - // put reduce action into table cell (state, terminal) - // conflict if already occupied + HLREnhGrammar *eg = enhance_grammar(g, table); + if(eg == NULL) { // this should normally not happen + h_cfgrammar_free(g); + h_lrtable_free(table); + return -1; + } + + // go through the inadequate states; replace inadeq with a new list + HSlist *inadeq = table->inadeq; + table->inadeq = h_slist_new(arena); + + for(HSlistNode *x=inadeq->head; x; x=x->next) { + size_t state = (uintptr_t)x->elem; + + // clear old forall entry, it's being replaced by more fine-grained ones + table->forall[state] = NULL; + + // go through each reducible item of state + H_FOREACH_KEY(dfa->states[state], HLRItem *item) + if(item->mark < item->len) + continue; + + // action to place in the table cells indicated by lookahead + HLRAction *action = reduce_action(arena, item); + + // find all LR(0)-enhanced productions matching item + H_FOREACH(eg->smap, HCFChoice *lhs, HLRTransition *t) + if(t->symbol != item->lhs) + continue; + for(HCFSequence **p=lhs->seq; *p; p++) { + HCFChoice **rhs = (*p)->items; + if(!match_production(eg, rhs, item->rhs, state)) + continue; + + // the left-hand symbol's follow set is this production's + // contribution to the lookahead + const HStringMap *fs = h_follow(1, eg->grammar, lhs); + assert(fs != NULL); + + // for each lookahead symbol, put action into table cell + if(fs->end_branch) { + HCFChoice *terminal = h_arena_malloc(arena, sizeof(HCFChoice)); + terminal->type = HCF_END; + h_lrtable_put(table, state, terminal, action); + } + H_FOREACH(fs->char_branches, void *key, HStringMap *m) + if(!m->epsilon_branch) + continue; + + HCFChoice *terminal = h_arena_malloc(arena, sizeof(HCFChoice)); + terminal->type = HCF_CHAR; + terminal->chr = key_char((HCharKey)key); + + h_lrtable_put(table, state, terminal, action); + H_END_FOREACH // lookahead character + } H_END_FOREACH // enhanced production + H_END_FOREACH // reducible item + } } h_cfgrammar_free(g); @@ -924,7 
+1018,7 @@ int test_lalr(void) fprintf(stderr, "h_lalr_dfa failed\n"); printf("\n==== L R ( 0 ) T A B L E ====\n"); - HLRTable *table0 = h_lr0_table(g); + HLRTable *table0 = h_lr0_table(g, dfa); if(table0) h_pprint_lrtable(stdout, g, table0, 0); else diff --git a/src/cfgrammar.c b/src/cfgrammar.c index bc7b358..a874236 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -51,6 +51,11 @@ HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser) if(desugared == NULL) return NULL; // -> backend not suitable for this parser + return h_cfgrammar_(mm__, desugared); +} + +HCFGrammar *h_cfgrammar_(HAllocator* mm__, HCFChoice *desugared) +{ HCFGrammar *g = h_cfgrammar_new(mm__); // recursively traverse the desugared form and collect all HCFChoices that diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 62b3320..c70c68a 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -59,6 +59,7 @@ static inline HStringMap *h_stringmap_get_char(const HStringMap *m, const uint8_ * A NULL return means we are unable to represent the parser as a CFG. */ HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser); +HCFGrammar *h_cfgrammar_(HAllocator* mm__, HCFChoice *start); HCFGrammar *h_cfgrammar_new(HAllocator *mm__); From 92f16a4d14be273721fff1c46a55864aba2116f4 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 14 Jun 2013 12:24:18 +0200 Subject: [PATCH 51/95] proper LALR example with conflict in LR(0) --- src/backends/lalr.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 7e83c53..9c9b864 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -667,6 +667,8 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) H_FOREACH(eg->smap, HCFChoice *lhs, HLRTransition *t) if(t->symbol != item->lhs) continue; + assert(lhs->type == HCF_CHOICE); // XXX could be CHARSET + for(HCFSequence **p=lhs->seq; *p; p++) { HCFChoice **rhs = (*p)->items; if(!match_production(eg, rhs, item->rhs, state)) @@ -986,21 +988,21 @@ HParserBackendVTable h__lalr_backend_vtable = { // dummy! int test_lalr(void) { - /* for k=2: - - S -> A | B - A -> X Y a - B -> Y b - X -> x | '' - Y -> y -- for k=3 use "yy" + /* + S -> E + E -> E '-' T + | T + T -> '(' E ')' + | N + N -> '0' -- also try [0-9] for the charset paths */ - // XXX make LALR example - HParser *X = h_optional(h_in((uint8_t *)"rst", 3)); - HParser *Y = h_sequence(h_ch('y'), h_ch('y'), NULL); - HParser *A = h_sequence(X, Y, h_ch('a'), NULL); - HParser *B = h_sequence(Y, h_ch('b'), NULL); - HParser *p = h_choice(A, B, NULL); + HParser *N = h_sequence(h_ch('n'), NULL); + HParser *E = h_indirect(); + HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), N, NULL); + HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL); + h_bind_indirect(E, E_); + HParser *p = h_sequence(E, NULL); printf("\n==== G R A M M A R ====\n"); HCFGrammar *g = h_cfgrammar(&system_allocator, p); From 7b04ab8d9de37b7ced1b5038565821082fa42285 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 14 Jun 2013 12:27:35 +0200 Subject: [PATCH 52/95] pre-allocate h_desugar's result to squelch recursive calls --- src/backends/contextfree.h | 7 ++++++- src/desugar.c | 4 +++- src/parsers/indirect.c | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/backends/contextfree.h b/src/backends/contextfree.h index 9c2ec45..3f01d70 100644 --- a/src/backends/contextfree.h +++ b/src/backends/contextfree.h @@ -11,6 +11,7 @@ struct HCFStack_ { int count; int cap; HCFChoice *last_completed; // Last completed choice. + HCFChoice *prealloc; // If not NULL, will serve as outermost choice. }; #ifndef UNUSED @@ -25,11 +26,13 @@ static HCFStack* h_cfstack_new(HAllocator *mm__) { stack->count = 0; stack->cap = 4; stack->stack = h_new(HCFChoice*, stack->cap); + stack->prealloc = NULL; return stack; } static void h_cfstack_free(HAllocator *mm__, HCFStack *stk__) UNUSED; static void h_cfstack_free(HAllocator *mm__, HCFStack *stk__) { + h_free(stk__->prealloc); h_free(stk__->stack); h_free(stk__); } @@ -56,7 +59,9 @@ static inline void h_cfstack_add_to_seq(HAllocator *mm__, HCFStack *stk__, HCFCh } static inline HCFChoice* h_cfstack_new_choice_raw(HAllocator *mm__, HCFStack *stk__) { - HCFChoice *ret = h_new(HCFChoice, 1); + HCFChoice *ret = stk__->prealloc? 
stk__->prealloc : h_new(HCFChoice, 1); + stk__->prealloc = NULL; + ret->reshape = NULL; ret->action = NULL; ret->pred = NULL; diff --git a/src/desugar.c b/src/desugar.c index ce87ca3..46176ea 100644 --- a/src/desugar.c +++ b/src/desugar.c @@ -8,9 +8,11 @@ HCFChoice *h_desugar(HAllocator *mm__, HCFStack *stk__, const HParser *parser) { if (nstk__ == NULL) { nstk__ = h_cfstack_new(mm__); } + if(nstk__->prealloc == NULL) + nstk__->prealloc = h_new(HCFChoice, 1); // we're going to do something naughty and cast away the const to memoize + ((HParser *)parser)->desugared = nstk__->prealloc; parser->vtable->desugar(mm__, nstk__, parser->env); - ((HParser *)parser)->desugared = nstk__->last_completed; if (stk__ == NULL) h_cfstack_free(mm__, nstk__); } else if (stk__ != NULL) { diff --git a/src/parsers/indirect.c b/src/parsers/indirect.c index 746f1a9..2217a20 100644 --- a/src/parsers/indirect.c +++ b/src/parsers/indirect.c @@ -10,7 +10,7 @@ static bool indirect_isValidCF(void *env) { } static void desugar_indirect(HAllocator *mm__, HCFStack *stk__, void *env) { - HCFS_DESUGAR( (HParser*)env ); + HCFS_DESUGAR( (HParser *)env ); } static const HParserVtable indirect_vt = { From dfe4c3ad4730f85f5645bb723dd02fd461e2d2de Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 14 Jun 2013 12:42:55 +0200 Subject: [PATCH 53/95] add right-recursion test for packrat and llk --- src/t_parser.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/t_parser.c b/src/t_parser.c index 8aab7bb..fa19151 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -407,7 +407,7 @@ static void test_not(gconstpointer backend) { } /* static void test_leftrec(gconstpointer backend) { - const HParser *a_ = h_ch('a'); + HParser *a_ = h_ch('a'); HParser *lr_ = h_indirect(); h_bind_indirect(lr_, h_choice(h_sequence(lr_, a_, NULL), a_, NULL)); @@ -417,6 +417,17 @@ static void test_leftrec(gconstpointer backend) { g_check_parse_ok(lr_, (HParserBackend)GPOINTER_TO_INT(backend), "aaa", 3, "((u0x61 u0x61) u0x61)"); } */ +static void test_rightrec(gconstpointer backend) { + HParser *a_ = h_ch('a'); + + HParser *rr_ = h_indirect(); + h_bind_indirect(rr_, h_choice(h_sequence(a_, rr_, NULL), h_epsilon_p(), NULL)); + + g_check_parse_ok(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "a", 1, "(u0x61)"); + g_check_parse_ok(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "aa", 2, "(u0x61 (u0x61))"); + g_check_parse_ok(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "aaa", 3, "(u0x61 (u0x61 (u0x61)))"); +} + void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/token", GINT_TO_POINTER(PB_PACKRAT), test_token); g_test_add_data_func("/core/parser/packrat/ch", GINT_TO_POINTER(PB_PACKRAT), test_ch); @@ -460,6 +471,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/not", GINT_TO_POINTER(PB_PACKRAT), test_not); g_test_add_data_func("/core/parser/packrat/ignore", GINT_TO_POINTER(PB_PACKRAT), test_ignore); // g_test_add_data_func("/core/parser/packrat/leftrec", GINT_TO_POINTER(PB_PACKRAT), test_leftrec); + g_test_add_data_func("/core/parser/packrat/rightrec", GINT_TO_POINTER(PB_PACKRAT), test_rightrec); g_test_add_data_func("/core/parser/llk/token", GINT_TO_POINTER(PB_LLk), 
test_token); g_test_add_data_func("/core/parser/llk/ch", GINT_TO_POINTER(PB_LLk), test_ch); @@ -496,6 +508,8 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/llk/epsilon_p", GINT_TO_POINTER(PB_LLk), test_epsilon_p); g_test_add_data_func("/core/parser/llk/attr_bool", GINT_TO_POINTER(PB_LLk), test_attr_bool); g_test_add_data_func("/core/parser/llk/ignore", GINT_TO_POINTER(PB_LLk), test_ignore); + //g_test_add_data_func("/core/parser/llk/leftrec", GINT_TO_POINTER(PB_LLk), test_leftrec); + g_test_add_data_func("/core/parser/llk/rightrec", GINT_TO_POINTER(PB_LLk), test_rightrec); g_test_add_data_func("/core/parser/regex/token", GINT_TO_POINTER(PB_REGULAR), test_token); g_test_add_data_func("/core/parser/regex/ch", GINT_TO_POINTER(PB_REGULAR), test_ch); From f65b0ae82a0f192ceb58c9f03df1ef830b09e9e0 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 14 Jun 2013 12:50:39 +0200 Subject: [PATCH 54/95] commentation --- src/backends/contextfree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backends/contextfree.h b/src/backends/contextfree.h index 3f01d70..b387e55 100644 --- a/src/backends/contextfree.h +++ b/src/backends/contextfree.h @@ -11,7 +11,8 @@ struct HCFStack_ { int count; int cap; HCFChoice *last_completed; // Last completed choice. - HCFChoice *prealloc; // If not NULL, will serve as outermost choice. + // XXX is last_completed still needed? + HCFChoice *prealloc; // If not NULL, will be used for the outermost choice. }; #ifndef UNUSED From bfc2433320a2af65e2e4bc39cc680f950c35a685 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 14 Jun 2013 17:07:56 +0200 Subject: [PATCH 55/95] don't call compare function on NULL hashtable keys --- src/datastructures.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/datastructures.c b/src/datastructures.c index 55b8345..075b966 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -147,6 +147,8 @@ void* h_hashtable_get(const HHashTable* ht, const void* key) { for (hte = &ht->contents[hashval & (ht->capacity - 1)]; hte != NULL; hte = hte->next) { + if (hte->key == NULL) + continue; if (hte->hashval != hashval) continue; if (ht->equalFunc(key, hte->key)) From 06acbe2fb5415396e5494bbcb6998aa87a3b431c Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 14 Jun 2013 17:17:23 +0200 Subject: [PATCH 56/95] LALR example parses! --- src/backends/lalr.c | 187 +++++++++++++++++++++++++------------------- 1 file changed, 107 insertions(+), 80 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 9c9b864..e3a866b 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -56,6 +56,7 @@ typedef struct HLREnhGrammar_ { HCFGrammar *grammar; // enhanced grammar HHashTable *tmap; // maps transitions to enhanced-grammar symbols HHashTable *smap; // maps enhanced-grammar symbols to transitions + HHashTable *corr; // maps symbols to sets of corresponding e. symbols HArena *arena; } HLREnhGrammar; @@ -441,6 +442,7 @@ HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) // find reducible items in state H_FOREACH_KEY(dfa->states[i], HLRItem *item) if(item->mark == item->len) { // mark at the end + // check for conflicts // XXX store more informative stuff in the inadeq records? 
if(table->forall[i]) { // reduce/reduce conflict with a previous item @@ -448,10 +450,10 @@ HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) } else if(!h_hashtable_empty(table->rows[i])) { // shift/reduce conflict with one of the row's entries h_slist_push(table->inadeq, (void *)(uintptr_t)i); - } else { - // set reduce action for the entire row - table->forall[i] = reduce_action(arena, item); } + + // set reduce action for the entire row + table->forall[i] = reduce_action(arena, item); } H_END_FOREACH } @@ -478,39 +480,6 @@ static size_t follow_transition(const HLRTable *table, size_t x, HCFChoice *A) return action->nextstate; } -static HCFChoice *transform_symbol(const HLRTable *table, HLREnhGrammar *eg, - size_t x, HCFChoice *B, size_t z); - -static HCFChoice *transform_productions(const HLRTable *table, HLREnhGrammar *eg, - size_t x, HCFChoice *xAy) -{ - HArena *arena = eg->arena; - - HCFSequence **seq = h_arena_malloc(arena, seqsize(xAy->seq) - * sizeof(HCFSequence *)); - HCFSequence **p, **q; - for(p=xAy->seq, q=seq; *p; p++, q++) { - // trace rhs starting in state x and following the transitions - // xAy -> xBz ... 
- - HCFChoice **B = (*p)->items; - HCFChoice **xBz = h_arena_malloc(arena, seqsize(B) * sizeof(HCFChoice *)); - for(; *B; B++, xBz++) { - size_t z = follow_transition(table, x, *B); - *xBz = transform_symbol(table, eg, x, *B, z); - x=z; - } - *xBz = NULL; - - *q = h_arena_malloc(arena, sizeof(HCFSequence)); - (*q)->items = xBz; - } - *q = NULL; - xAy->seq = seq; - - return xAy; // pass-through -} - static inline HLRTransition *transition(HArena *arena, size_t x, const HCFChoice *A, size_t y) { @@ -521,51 +490,101 @@ static inline HLRTransition *transition(HArena *arena, return t; } -static HCFChoice *transform_symbol(const HLRTable *table, HLREnhGrammar *eg, - size_t x, HCFChoice *B, size_t z) +// no-op on terminal symbols +static void transform_productions(const HLRTable *table, HLREnhGrammar *eg, + size_t x, HCFChoice *xAy) { + if(xAy->type != HCF_CHOICE) + return; + // XXX CHARSET? + HArena *arena = eg->arena; - // look up the transition in map, create symbol if not found - HLRTransition *x_B_z = transition(arena, x, B, z); - HCFChoice *xBz = h_hashtable_get(eg->tmap, x_B_z); - if(!xBz) { - HCFChoice *xBz = h_arena_malloc(arena, sizeof(HCFChoice)); - *xBz = *B; - h_hashtable_put(eg->tmap, x_B_z, xBz); - h_hashtable_put(eg->smap, xBz, x_B_z); - } + HCFSequence **seq = h_arena_malloc(arena, seqsize(xAy->seq) + * sizeof(HCFSequence *)); + HCFSequence **p, **q; + for(p=xAy->seq, q=seq; *p; p++, q++) { + // trace rhs starting in state x and following the transitions + // xAy -> ... iBj ... 
- return transform_productions(table, eg, x, xBz); + size_t i = x; + HCFChoice **B = (*p)->items; + HCFChoice **items = h_arena_malloc(arena, seqsize(B) * sizeof(HCFChoice *)); + HCFChoice **iBj = items; + for(; *B; B++, iBj++) { + size_t j = follow_transition(table, i, *B); + HLRTransition *i_B_j = transition(arena, i, *B, j); + *iBj = h_hashtable_get(eg->tmap, i_B_j); + assert(*iBj != NULL); + i = j; + } + *iBj = NULL; + + *q = h_arena_malloc(arena, sizeof(HCFSequence)); + (*q)->items = items; + } + *q = NULL; + xAy->seq = seq; } static bool eq_transition(const void *p, const void *q) { const HLRTransition *a=p, *b=q; - return (a->from == b->from && a->to == b->to && a->symbol == b->symbol); + return (a->from == b->from && a->to == b->to && eq_symbol(a->symbol, b->symbol)); } static HHashValue hash_transition(const void *p) { const HLRTransition *t = p; - return (h_hash_ptr(t->symbol) + t->from + t->to); // XXX ? + return (hash_symbol(t->symbol) + t->from + t->to); // XXX ? } -static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRTable *tbl) +HCFChoice *new_enhanced_symbol(HLREnhGrammar *eg, const HCFChoice *sym) +{ + HArena *arena = eg->arena; + HCFChoice *esym = h_arena_malloc(arena, sizeof(HCFChoice)); + *esym = *sym; + + HHashSet *cs = h_hashtable_get(eg->corr, sym); + if(!cs) { + cs = h_hashset_new(arena, eq_symbol, hash_symbol); + h_hashtable_put(eg->corr, sym, cs); + } + h_hashset_put(cs, esym); + + return esym; +} +static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRDFA *dfa, + const HLRTable *table) { HAllocator *mm__ = g->mm__; HArena *arena = g->arena; // XXX ? 
HLREnhGrammar *eg = h_arena_malloc(arena, sizeof(HLREnhGrammar)); eg->tmap = h_hashtable_new(arena, eq_transition, hash_transition); - eg->smap = h_hashtable_new(arena, eq_transition, hash_transition); + eg->smap = h_hashtable_new(arena, eq_symbol, hash_symbol); + eg->corr = h_hashtable_new(arena, eq_symbol, hash_symbol); eg->arena = arena; - // copy the start symbol over - HCFChoice *start = h_arena_malloc(arena, sizeof(HCFChoice)); - *start = *(g->start); + // establish mapping between transitions and symbols + for(HSlistNode *x=dfa->transitions->head; x; x=x->next) { + HLRTransition *t = x->elem; - transform_productions(tbl, eg, 0, start); + assert(!h_hashtable_present(eg->tmap, t)); + + HCFChoice *sym = new_enhanced_symbol(eg, t->symbol); + h_hashtable_put(eg->tmap, t, sym); + h_hashtable_put(eg->smap, sym, t); + } + + // transform the productions + H_FOREACH(eg->tmap, HLRTransition *t, HCFChoice *sym) + transform_productions(table, eg, t->from, sym); + H_END_FOREACH + + // add the start symbol + HCFChoice *start = new_enhanced_symbol(eg, g->start); + transform_productions(table, eg, 0, start); eg->grammar = h_cfgrammar_(mm__, start); return eg; @@ -638,7 +657,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) if(has_conflicts(table)) { HArena *arena = table->arena; - HLREnhGrammar *eg = enhance_grammar(g, table); + HLREnhGrammar *eg = enhance_grammar(g, dfa, table); if(eg == NULL) { // this should normally not happen h_cfgrammar_free(g); h_lrtable_free(table); @@ -664,10 +683,10 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) HLRAction *action = reduce_action(arena, item); // find all LR(0)-enhanced productions matching item - H_FOREACH(eg->smap, HCFChoice *lhs, HLRTransition *t) - if(t->symbol != item->lhs) - continue; - assert(lhs->type == HCF_CHOICE); // XXX could be CHARSET + HHashSet *lhss = h_hashtable_get(eg->corr, item->lhs); + assert(lhss != NULL); + H_FOREACH_KEY(lhss, HCFChoice *lhs) + 
assert(lhs->type == HCF_CHOICE); // XXX could be CHARSET? for(HCFSequence **p=lhs->seq; *p; p++) { HCFChoice **rhs = (*p)->items; @@ -952,26 +971,35 @@ void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, for(size_t i=0; inrows; i++) { for(unsigned int j=0; jforall[i] && h_hashtable_empty(table->rows[i])) { + if(table->forall[i]) { fputs(" - ", f); pprint_lraction(f, g, table->forall[i]); fputs(" -", f); - } else { - H_FOREACH(table->rows[i], HCFChoice *symbol, HLRAction *action) - fputc(' ', f); // separator - h_pprint_symbol(f, g, symbol); - fputc(':', f); - if(table->forall[i]) { - fputc(action->type == HLR_SHIFT? 's' : 'r', f); - fputc('/', f); - fputc(table->forall[i]->type == HLR_SHIFT? 's' : 'r', f); - } else { - pprint_lraction(f, g, action); - } - H_END_FOREACH + if(!h_hashtable_empty(table->rows[i])) + fputs(" !!", f); } + H_FOREACH(table->rows[i], HCFChoice *symbol, HLRAction *action) + fputc(' ', f); // separator + h_pprint_symbol(f, g, symbol); + fputc(':', f); + if(table->forall[i]) { + fputc(action->type == HLR_SHIFT? 's' : 'r', f); + fputc('/', f); + fputc(table->forall[i]->type == HLR_SHIFT? 
's' : 'r', f); + } else { + pprint_lraction(f, g, action); + } + H_END_FOREACH fputc('\n', f); } + +#if 0 + fputs("inadeq=", f); + for(HSlistNode *x=table->inadeq->head; x; x=x->next) { + fprintf(f, "%lu ", (uintptr_t)x->elem); + } + fputc('\n', f); +#endif } @@ -993,13 +1021,12 @@ int test_lalr(void) E -> E '-' T | T T -> '(' E ')' - | N - N -> '0' -- also try [0-9] for the charset paths + | 'n' -- also try [0-9] for the charset paths */ - HParser *N = h_sequence(h_ch('n'), NULL); + HParser *n = h_ch('n'); HParser *E = h_indirect(); - HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), N, NULL); + HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), n, NULL); HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL); h_bind_indirect(E, E_); HParser *p = h_sequence(E, NULL); @@ -1035,7 +1062,7 @@ int test_lalr(void) h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); - HParseResult *res = h_parse(p, (uint8_t *)"syya", 4); + HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 11); if(res) h_pprint(stdout, res->ast, 0, 2); else From 1918c97e1c464045eb3dc491d5632bceb2482ecf Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 14 Jun 2013 17:36:30 +0200 Subject: [PATCH 57/95] fix a premature arena delete --- src/backends/lalr.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index e3a866b..c0be736 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -845,18 +845,22 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* } } - h_delete_arena(tarena); + // parsing was successful iff the start symbol is on top of the right stack + HParseResult *result = NULL; if(h_slist_pop(right) == table->start) { // next on the right stack is the start symbol's semantic value - HParsedToken *result = h_slist_pop(right); - assert(result != NULL); - return make_result(arena, result); + HParsedToken *tok = h_slist_pop(right); + assert(tok != NULL); + result = make_result(arena, tok); } else { h_delete_arena(arena); - return NULL; + result = NULL; } + + h_delete_arena(tarena); + return result; } @@ -922,7 +926,7 @@ void h_pprint_lrstate(FILE *f, const HCFGrammar *g, } } -void pprint_transition(FILE *f, const HCFGrammar *g, const HLRTransition *t) +static void pprint_transition(FILE *f, const HCFGrammar *g, const HLRTransition *t) { fputs("-", f); h_pprint_symbol(f, g, t->symbol); @@ -1062,7 +1066,7 @@ int test_lalr(void) h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); - HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 11); + HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13); if(res) h_pprint(stdout, res->ast, 0, 2); else From d51e13173acdf276ca0597b0f2a8b43be77ffed5 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 14 Jun 2013 17:36:46 +0200 Subject: [PATCH 58/95] add LALR test suite --- src/t_parser.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/src/t_parser.c b/src/t_parser.c index fa19151..a0e4040 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -405,7 +405,7 @@ static void test_not(gconstpointer backend) { g_check_parse_ok(not_2, (HParserBackend)GPOINTER_TO_INT(backend), "a+b", 3, "(u0x61 (u0x2b) u0x62)"); g_check_parse_ok(not_2, (HParserBackend)GPOINTER_TO_INT(backend), "a++b", 4, "(u0x61 <2b.2b> u0x62)"); } -/* + static void test_leftrec(gconstpointer backend) { HParser *a_ = h_ch('a'); @@ -416,7 +416,7 @@ static void test_leftrec(gconstpointer backend) { g_check_parse_ok(lr_, (HParserBackend)GPOINTER_TO_INT(backend), "aa", 2, "(u0x61 u0x61)"); g_check_parse_ok(lr_, (HParserBackend)GPOINTER_TO_INT(backend), "aaa", 3, "((u0x61 u0x61) u0x61)"); } -*/ + static void test_rightrec(gconstpointer backend) { HParser *a_ = h_ch('a'); @@ -547,4 +547,42 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/regex/epsilon_p", GINT_TO_POINTER(PB_REGULAR), test_epsilon_p); g_test_add_data_func("/core/parser/regex/attr_bool", GINT_TO_POINTER(PB_REGULAR), test_attr_bool); g_test_add_data_func("/core/parser/regex/ignore", GINT_TO_POINTER(PB_REGULAR), test_ignore); + + g_test_add_data_func("/core/parser/lalr/token", GINT_TO_POINTER(PB_LALR), test_token); + g_test_add_data_func("/core/parser/lalr/ch", GINT_TO_POINTER(PB_LALR), test_ch); + g_test_add_data_func("/core/parser/lalr/ch_range", GINT_TO_POINTER(PB_LALR), test_ch_range); + g_test_add_data_func("/core/parser/lalr/int64", GINT_TO_POINTER(PB_LALR), test_int64); + g_test_add_data_func("/core/parser/lalr/int32", GINT_TO_POINTER(PB_LALR), test_int32); + g_test_add_data_func("/core/parser/lalr/int16", GINT_TO_POINTER(PB_LALR), test_int16); + g_test_add_data_func("/core/parser/lalr/int8", GINT_TO_POINTER(PB_LALR), test_int8); + 
g_test_add_data_func("/core/parser/lalr/uint64", GINT_TO_POINTER(PB_LALR), test_uint64); + g_test_add_data_func("/core/parser/lalr/uint32", GINT_TO_POINTER(PB_LALR), test_uint32); + g_test_add_data_func("/core/parser/lalr/uint16", GINT_TO_POINTER(PB_LALR), test_uint16); + g_test_add_data_func("/core/parser/lalr/uint8", GINT_TO_POINTER(PB_LALR), test_uint8); + g_test_add_data_func("/core/parser/lalr/int_range", GINT_TO_POINTER(PB_LALR), test_int_range); +#if 0 + g_test_add_data_func("/core/parser/lalr/float64", GINT_TO_POINTER(PB_LALR), test_float64); + g_test_add_data_func("/core/parser/lalr/float32", GINT_TO_POINTER(PB_LALR), test_float32); +#endif + g_test_add_data_func("/core/parser/lalr/whitespace", GINT_TO_POINTER(PB_LALR), test_whitespace); + g_test_add_data_func("/core/parser/lalr/left", GINT_TO_POINTER(PB_LALR), test_left); + g_test_add_data_func("/core/parser/lalr/right", GINT_TO_POINTER(PB_LALR), test_right); + g_test_add_data_func("/core/parser/lalr/middle", GINT_TO_POINTER(PB_LALR), test_middle); + g_test_add_data_func("/core/parser/lalr/action", GINT_TO_POINTER(PB_LALR), test_action); + g_test_add_data_func("/core/parser/lalr/in", GINT_TO_POINTER(PB_LALR), test_in); + g_test_add_data_func("/core/parser/lalr/not_in", GINT_TO_POINTER(PB_LALR), test_not_in); + g_test_add_data_func("/core/parser/lalr/end_p", GINT_TO_POINTER(PB_LALR), test_end_p); + g_test_add_data_func("/core/parser/lalr/nothing_p", GINT_TO_POINTER(PB_LALR), test_nothing_p); + g_test_add_data_func("/core/parser/lalr/sequence", GINT_TO_POINTER(PB_LALR), test_sequence); + g_test_add_data_func("/core/parser/lalr/choice", GINT_TO_POINTER(PB_LALR), test_choice); + g_test_add_data_func("/core/parser/lalr/many", GINT_TO_POINTER(PB_LALR), test_many); + g_test_add_data_func("/core/parser/lalr/many1", GINT_TO_POINTER(PB_LALR), test_many1); + g_test_add_data_func("/core/parser/lalr/optional", GINT_TO_POINTER(PB_LALR), test_optional); + g_test_add_data_func("/core/parser/lalr/sepBy", 
GINT_TO_POINTER(PB_LALR), test_sepBy); + g_test_add_data_func("/core/parser/lalr/sepBy1", GINT_TO_POINTER(PB_LALR), test_sepBy1); + g_test_add_data_func("/core/parser/lalr/epsilon_p", GINT_TO_POINTER(PB_LALR), test_epsilon_p); + g_test_add_data_func("/core/parser/lalr/attr_bool", GINT_TO_POINTER(PB_LALR), test_attr_bool); + g_test_add_data_func("/core/parser/lalr/ignore", GINT_TO_POINTER(PB_LALR), test_ignore); + g_test_add_data_func("/core/parser/lalr/leftrec", GINT_TO_POINTER(PB_LALR), test_leftrec); + g_test_add_data_func("/core/parser/lalr/rightrec", GINT_TO_POINTER(PB_LALR), test_rightrec); } From a7fe3824cda87f6f74b565abb53c9e1ad27889ef Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 14 Jun 2013 19:07:26 +0200 Subject: [PATCH 59/95] minor bugfixies --- src/backends/lalr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index c0be736..fbdb554 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -215,6 +215,7 @@ static HLRItem *advance_mark(HArena *arena, const HLRItem *item) static HHashSet *closure(HCFGrammar *g, const HHashSet *items) { + HAllocator *mm__ = g->mm__; HArena *arena = g->arena; HHashSet *ret = h_lrstate_new(arena); HSlist *work = h_slist_new(arena); @@ -251,8 +252,9 @@ static HHashSet *closure(HCFGrammar *g, const HHashSet *items) } else { // HCF_CHARSET for(unsigned int i=0; i<256; i++) { if(charset_isset(sym->charset, i)) { - HCFChoice **rhs = h_arena_malloc(arena, 2 * sizeof(HCFChoice *)); - rhs[0] = h_arena_malloc(arena, sizeof(HCFChoice)); + // XXX allocatethese single-character symbols statically somewhere + HCFChoice **rhs = h_new(HCFChoice *, 2); + rhs[0] = h_new(HCFChoice, 1); rhs[0]->type = HCF_CHAR; rhs[0]->chr = i; rhs[1] = NULL; @@ -558,7 +560,7 @@ static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRDFA *dfa, const HLRTable *table) { HAllocator *mm__ = g->mm__; - HArena *arena = g->arena; // XXX ? 
+ HArena *arena = g->arena; HLREnhGrammar *eg = h_arena_malloc(arena, sizeof(HLREnhGrammar)); eg->tmap = h_hashtable_new(arena, eq_transition, hash_transition); @@ -851,8 +853,8 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HParseResult *result = NULL; if(h_slist_pop(right) == table->start) { // next on the right stack is the start symbol's semantic value + assert(!h_slist_empty(right)); HParsedToken *tok = h_slist_pop(right); - assert(tok != NULL); result = make_result(arena, tok); } else { h_delete_arena(arena); From e56f05225522e517c2319054b0f11f4619e96fcf Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Sat, 15 Jun 2013 16:04:29 +0200 Subject: [PATCH 60/95] remove the extra +1 from h_arena_malloc --- src/allocator.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/allocator.c b/src/allocator.c index e345c87..80fa921 100644 --- a/src/allocator.c +++ b/src/allocator.c @@ -65,10 +65,10 @@ void* h_arena_malloc(HArena *arena, size_t size) { if (size <= arena->head->free) { // fast path.. void* ret = arena->head->rest + arena->head->used; - arena->used += size + 1; + arena->used += size; arena->wasted -= size; - arena->head->used += size + 1; - arena->head->free -= size + 1; + arena->head->used += size; + arena->head->free -= size; return ret; } else if (size > arena->block_size) { // We need a new, dedicated block for it, because it won't fit in a standard sized one. From 9a9631493129f08e97726608d8034c07312d72f4 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Sat, 15 Jun 2013 19:06:10 +0200 Subject: [PATCH 61/95] refactor hashtable iterations to use H_FOREACH --- src/backends/lalr.c | 180 +++++++++++++++++--------------------------- 1 file changed, 68 insertions(+), 112 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index fbdb554..54f6dc8 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -61,6 +61,27 @@ typedef struct HLREnhGrammar_ { } HLREnhGrammar; +// XXX move to internal.h or something +// XXX replace other hashtable iterations with this +#define H_FOREACH_(HT) { \ + const HHashTable *ht__ = HT; \ + for(size_t i__=0; i__ < ht__->capacity; i__++) { \ + for(HHashTableEntry *hte__ = &ht__->contents[i__]; \ + hte__; \ + hte__ = hte__->next) { \ + if(hte__->key == NULL) continue; + +#define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ + const KEYVAR = hte__->key; + +#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ + VALVAR = hte__->value; + +#define H_END_FOREACH \ + } \ + } \ + } + // compare symbols - terminals by value, others by pointer static bool eq_symbol(const void *p, const void *q) { @@ -117,15 +138,9 @@ static HHashValue hash_lalr_itemset(const void *p) { HHashValue hash = 0; - const HHashTable *ht = p; - for(size_t i=0; i < ht->capacity; i++) { - for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { - if(hte->key == NULL) - continue; - - hash += hash_lalr_item(hte->key); - } - } + H_FOREACH_KEY((const HHashSet *)p, HLRItem *item) + hash += hash_lalr_item(item); + H_END_FOREACH return hash; } @@ -179,27 +194,6 @@ void h_lrtable_free(HLRTable *table) h_free(table); } -// XXX replace other hashtable iterations with this -// XXX move to internal.h or something -#define H_FOREACH_(HT) { \ - const HHashTable *ht__ = HT; \ - for(size_t i__=0; i__ < ht__->capacity; i__++) { \ - for(HHashTableEntry *hte__ = &ht__->contents[i__]; \ - hte__; \ - hte__ = hte__->next) { \ - if(hte__->key == NULL) continue; - -#define H_FOREACH_KEY(HT, KEYVAR) 
H_FOREACH_(HT) \ - const KEYVAR = hte__->key; - -#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ - VALVAR = hte__->value; - -#define H_END_FOREACH \ - } \ - } \ - } - /* Constructing the characteristic automaton (handle recognizer) */ @@ -220,18 +214,11 @@ static HHashSet *closure(HCFGrammar *g, const HHashSet *items) HHashSet *ret = h_lrstate_new(arena); HSlist *work = h_slist_new(arena); - // iterate over items - initialize work list with them - const HHashTable *ht = items; - for(size_t i=0; i < ht->capacity; i++) { - for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { - if(hte->key == NULL) - continue; - - const HLRItem *item = hte->key; + // initialize work list with items + H_FOREACH_KEY(items, HLRItem *item) h_hashset_put(ret, item); h_slist_push(work, (void *)item); - } - } + H_END_FOREACH while(!h_slist_empty(work)) { const HLRItem *item = h_slist_pop(work); @@ -322,75 +309,52 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) HHashTable *neighbors = h_hashtable_new(arena, eq_symbol, hash_symbol); // iterate over closure and generate neighboring sets - const HHashTable *ht = closure(g, state); - for(size_t i=0; i < ht->capacity; i++) { - for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { - if(hte->key == NULL) - continue; + H_FOREACH_KEY(closure(g, state), HLRItem *item) + HCFChoice *sym = item->rhs[item->mark]; // symbol after mark - const HLRItem *item = hte->key; - HCFChoice *sym = item->rhs[item->mark]; // symbol after mark - - if(sym != NULL) { // mark was not at the end - // find or create prospective neighbor set - HLRState *neighbor = h_hashtable_get(neighbors, sym); - if(neighbor == NULL) { - neighbor = h_lrstate_new(arena); - h_hashtable_put(neighbors, sym, neighbor); - } - - // ...and add the advanced item to it - h_hashset_put(neighbor, advance_mark(arena, item)); + if(sym != NULL) { // mark was not at the end + // find or create prospective neighbor set + HLRState *neighbor = h_hashtable_get(neighbors, 
sym); + if(neighbor == NULL) { + neighbor = h_lrstate_new(arena); + h_hashtable_put(neighbors, sym, neighbor); } + + // ...and add the advanced item to it + h_hashset_put(neighbor, advance_mark(arena, item)); } - } + H_END_FOREACH // merge neighbor sets into the set of existing states - ht = neighbors; - for(size_t i=0; i < ht->capacity; i++) { - for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { - if(hte->key == NULL) - continue; - - const HCFChoice *symbol = hte->key; - HLRState *neighbor = hte->value; - - // look up existing state, allocate new if not found - size_t neighbor_idx; - if(!h_hashset_present(states, neighbor)) { - neighbor_idx = states->used; - h_hashtable_put(states, neighbor, (void *)(uintptr_t)neighbor_idx); - h_slist_push(work, neighbor); - h_slist_push(work, (void *)(uintptr_t)neighbor_idx); - } else { - neighbor_idx = (uintptr_t)h_hashtable_get(states, neighbor); - } - - // add transition "state --symbol--> neighbor" - HLRTransition *t = h_arena_malloc(arena, sizeof(HLRTransition)); - t->from = state_idx; - t->to = neighbor_idx; - t->symbol = symbol; - h_slist_push(transitions, t); + H_FOREACH(neighbors, HCFChoice *symbol, HLRState *neighbor) + // look up existing state, allocate new if not found + size_t neighbor_idx; + if(!h_hashset_present(states, neighbor)) { + neighbor_idx = states->used; + h_hashtable_put(states, neighbor, (void *)(uintptr_t)neighbor_idx); + h_slist_push(work, neighbor); + h_slist_push(work, (void *)(uintptr_t)neighbor_idx); + } else { + neighbor_idx = (uintptr_t)h_hashtable_get(states, neighbor); } - } + + // add transition "state --symbol--> neighbor" + HLRTransition *t = h_arena_malloc(arena, sizeof(HLRTransition)); + t->from = state_idx; + t->to = neighbor_idx; + t->symbol = symbol; + h_slist_push(transitions, t); + H_END_FOREACH } // end while(work) // fill DFA struct HLRDFA *dfa = h_arena_malloc(arena, sizeof(HLRDFA)); dfa->nstates = states->used; dfa->states = h_arena_malloc(arena, 
dfa->nstates*sizeof(HLRState *)); - for(size_t i=0; i < states->capacity; i++) { - for(HHashTableEntry *hte = &states->contents[i]; hte; hte = hte->next) { - if(hte->key == NULL) - continue; - - const HLRState *state = hte->key; - size_t idx = (uintptr_t)hte->value; - - dfa->states[idx] = state; - } - } + H_FOREACH(states, HLRState *state, void *v) + size_t idx = (uintptr_t)v; + dfa->states[idx] = state; + H_END_FOREACH dfa->transitions = transitions; return dfa; @@ -911,21 +875,13 @@ void h_pprint_lrstate(FILE *f, const HCFGrammar *g, const HLRState *state, unsigned int indent) { bool first = true; - const HHashTable *ht = state; - for(size_t i=0; i < ht->capacity; i++) { - for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { - if(hte->key == NULL) - continue; - - const HLRItem *item = hte->key; - - if(!first) - for(unsigned int i=0; i Date: Sat, 15 Jun 2013 20:30:57 +0200 Subject: [PATCH 62/95] some refactoring trying to get h_many to work --- src/backends/lalr.c | 46 ++++++++++++++++++++++----------------------- src/cfgrammar.c | 6 ++++++ src/cfgrammar.h | 1 + 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 54f6dc8..0dd1d07 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -207,16 +207,14 @@ static HLRItem *advance_mark(HArena *arena, const HLRItem *item) return ret; } -static HHashSet *closure(HCFGrammar *g, const HHashSet *items) +static void expand_to_closure(HCFGrammar *g, HHashSet *items) { HAllocator *mm__ = g->mm__; HArena *arena = g->arena; - HHashSet *ret = h_lrstate_new(arena); HSlist *work = h_slist_new(arena); // initialize work list with items H_FOREACH_KEY(items, HLRItem *item) - h_hashset_put(ret, item); h_slist_push(work, (void *)item); H_END_FOREACH @@ -231,22 +229,22 @@ static HHashSet *closure(HCFGrammar *g, const HHashSet *items) if(sym->type == HCF_CHOICE) { for(HCFSequence **p=sym->seq; *p; p++) { HLRItem *it = h_lritem_new(arena, sym, (*p)->items, 
0); - if(!h_hashset_present(ret, it)) { - h_hashset_put(ret, it); + if(!h_hashset_present(items, it)) { + h_hashset_put(items, it); h_slist_push(work, it); } } } else { // HCF_CHARSET for(unsigned int i=0; i<256; i++) { if(charset_isset(sym->charset, i)) { - // XXX allocatethese single-character symbols statically somewhere + // XXX allocate these single-character symbols statically somewhere HCFChoice **rhs = h_new(HCFChoice *, 2); rhs[0] = h_new(HCFChoice, 1); rhs[0]->type = HCF_CHAR; rhs[0]->chr = i; rhs[1] = NULL; HLRItem *it = h_lritem_new(arena, sym, rhs, 0); - h_hashset_put(ret, it); + h_hashset_put(items, it); // single-character item needs no further work } } @@ -254,17 +252,8 @@ static HHashSet *closure(HCFGrammar *g, const HHashSet *items) // this seems as good a place as any to set it sym->reshape = h_act_first; } - - // if sym derives epsilon, also advance over it - if(h_derives_epsilon(g, sym)) { - HLRItem *it = advance_mark(arena, item); - h_hashset_put(ret, it); - h_slist_push(work, it); - } } } - - return ret; } HLRDFA *h_lr0_dfa(HCFGrammar *g) @@ -287,15 +276,16 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) assert(g->start->type == HCF_CHOICE); for(HCFSequence **p=g->start->seq; *p; p++) h_hashset_put(start, h_lritem_new(arena, g->start, (*p)->items, 0)); + expand_to_closure(g, start); h_hashtable_put(states, start, 0); h_slist_push(work, start); h_slist_push(work, 0); // while work to do (on some state) - // compute closure // determine edge symbols // for each edge symbol: // advance respective items -> destination state (kernel) + // compute closure // if destination is a new state: // add it to state set // add transition to it @@ -308,8 +298,8 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) // maps edge symbols to neighbor states (item sets) of s HHashTable *neighbors = h_hashtable_new(arena, eq_symbol, hash_symbol); - // iterate over closure and generate neighboring sets - H_FOREACH_KEY(closure(g, state), HLRItem *item) + // iterate over state (closure) and 
generate neighboring sets + H_FOREACH_KEY(state, HLRItem *item) HCFChoice *sym = item->rhs[item->mark]; // symbol after mark if(sym != NULL) { // mark was not at the end @@ -325,8 +315,10 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) } H_END_FOREACH - // merge neighbor sets into the set of existing states + // merge expanded neighbor sets into the set of existing states H_FOREACH(neighbors, HCFChoice *symbol, HLRState *neighbor) + expand_to_closure(g, neighbor); + // look up existing state, allocate new if not found size_t neighbor_idx; if(!h_hashset_present(states, neighbor)) { @@ -528,8 +520,9 @@ static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRDFA *dfa, HLREnhGrammar *eg = h_arena_malloc(arena, sizeof(HLREnhGrammar)); eg->tmap = h_hashtable_new(arena, eq_transition, hash_transition); - eg->smap = h_hashtable_new(arena, eq_symbol, hash_symbol); + eg->smap = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); eg->corr = h_hashtable_new(arena, eq_symbol, hash_symbol); + // XXX must use h_eq/hash_ptr for symbols! 
so enhanced CHARs are different eg->arena = arena; // establish mapping between transitions and symbols @@ -663,6 +656,8 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) // contribution to the lookahead const HStringMap *fs = h_follow(1, eg->grammar, lhs); assert(fs != NULL); + assert(fs->epsilon_branch == NULL); + assert(!h_stringmap_empty(fs)); // for each lookahead symbol, put action into table cell if(fs->end_branch) { @@ -734,6 +729,7 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* while(1) { // make sure there is input on the right stack if(h_slist_empty(right)) { + // XXX use statically-allocated terminal symbols HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); HParsedToken *v; @@ -841,7 +837,7 @@ void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item) HCFChoice **x = item->rhs; HCFChoice **mark = item->rhs + item->mark; if(*x == NULL) { - fputs("\"\"", f); + fputc('.', f); } else { while(*x) { if(x == mark) @@ -986,12 +982,14 @@ int test_lalr(void) | 'n' -- also try [0-9] for the charset paths */ +#if 0 HParser *n = h_ch('n'); HParser *E = h_indirect(); HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), n, NULL); HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL); h_bind_indirect(E, E_); - HParser *p = h_sequence(E, NULL); +#endif + HParser *p = h_choice(h_many(h_ch('x')), h_ch('n'), NULL); //h_sequence(E, NULL); printf("\n==== G R A M M A R ====\n"); HCFGrammar *g = h_cfgrammar(&system_allocator, p); @@ -1024,7 +1022,7 @@ int test_lalr(void) h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); - HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13); + HParseResult *res = h_parse(p, (uint8_t *)"xxn-(n-((n)))-n", 13); if(res) h_pprint(stdout, res->ast, 0, 2); else diff --git a/src/cfgrammar.c b/src/cfgrammar.c index a874236..199ef5f 100644 --- a/src/cfgrammar.c +++ 
b/src/cfgrammar.c @@ -331,6 +331,12 @@ bool h_stringmap_present_epsilon(const HStringMap *m) return (m->epsilon_branch != NULL); } +bool h_stringmap_empty(const HStringMap *m) +{ + return (m->epsilon_branch == NULL + && m->end_branch == NULL + && h_hashtable_empty(m->char_branches)); +} const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) { diff --git a/src/cfgrammar.h b/src/cfgrammar.h index c70c68a..57f6f68 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -49,6 +49,7 @@ void h_stringmap_replace(HStringMap *m, void *old, void *new); void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool end); bool h_stringmap_present(const HStringMap *m, const uint8_t *str, size_t n, bool end); bool h_stringmap_present_epsilon(const HStringMap *m); +bool h_stringmap_empty(const HStringMap *m); static inline HStringMap *h_stringmap_get_char(const HStringMap *m, const uint8_t c) { return h_hashtable_get(m->char_branches, (void *)char_key(c)); } From f0cd2de55c4ddfaf1db47aa838c8b1562a3b6b77 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Sat, 15 Jun 2013 20:54:53 +0200 Subject: [PATCH 63/95] fix match_production for the empty-sequence case --- src/backends/lalr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 0dd1d07..773b457 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -579,15 +579,16 @@ int h_lrtable_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action) bool match_production(HLREnhGrammar *eg, HCFChoice **p, HCFChoice **rhs, size_t endstate) { - HLRTransition *t; + size_t state = endstate; // initialized to end in case of empty rhs for(; *p && *rhs; p++, rhs++) { - t = h_hashtable_get(eg->smap, *p); + HLRTransition *t = h_hashtable_get(eg->smap, *p); assert(t != NULL); if(!eq_symbol(t->symbol, *rhs)) return false; + state = t->to; } return (*p == *rhs // both NULL - && t->to == endstate); + && state == endstate); } int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) @@ -1022,7 +1023,7 @@ int test_lalr(void) h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); - HParseResult *res = h_parse(p, (uint8_t *)"xxn-(n-((n)))-n", 13); + HParseResult *res = h_parse(p, (uint8_t *)"xxn-(n-((n)))-n", 2); if(res) h_pprint(stdout, res->ast, 0, 2); else From 8d21c782e742971db9395af516edb48ce8988675 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Sat, 15 Jun 2013 21:24:27 +0200 Subject: [PATCH 64/95] fix eq/hash_lalr_item to compare symbols by value --- src/backends/lalr.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 773b457..e041489 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -108,12 +108,12 @@ static bool eq_lalr_item(const void *p, const void *q) { const HLRItem *a=p, *b=q; - if(a->lhs != b->lhs) return false; + if(!eq_symbol(a->lhs, b->lhs)) return false; if(a->mark != b->mark) return false; if(a->len != b->len) return false; for(size_t i=0; ilen; i++) - if(a->rhs[i] != b->rhs[i]) return false; + if(!eq_symbol(a->rhs[i], b->rhs[i])) return false; return true; } @@ -128,9 +128,14 @@ static inline bool eq_lalr_itemset(const void *p, const void *q) static inline HHashValue hash_lalr_item(const void *p) { const HLRItem *x = p; - return (h_hash_ptr(x->lhs) - + h_djbhash((uint8_t *)x->rhs, x->len*sizeof(HCFChoice *)) - + x->mark); // XXX is it okay to just add mark? 
+ HHashValue hash = 0; + + hash += hash_symbol(x->lhs); + for(HCFChoice **p=x->rhs; *p; p++) + hash += hash_symbol(*p); + hash += x->mark; + + return hash; } // hash LALR item sets (DFA states) - hash the elements and sum @@ -215,7 +220,7 @@ static void expand_to_closure(HCFGrammar *g, HHashSet *items) // initialize work list with items H_FOREACH_KEY(items, HLRItem *item) - h_slist_push(work, (void *)item); + h_slist_push(work, (void *)item); H_END_FOREACH while(!h_slist_empty(work)) { @@ -990,7 +995,7 @@ int test_lalr(void) HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL); h_bind_indirect(E, E_); #endif - HParser *p = h_choice(h_many(h_ch('x')), h_ch('n'), NULL); //h_sequence(E, NULL); + HParser *p = h_whitespace(h_ch('n')); //h_sequence(E, NULL); printf("\n==== G R A M M A R ====\n"); HCFGrammar *g = h_cfgrammar(&system_allocator, p); @@ -1023,7 +1028,7 @@ int test_lalr(void) h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); - HParseResult *res = h_parse(p, (uint8_t *)"xxn-(n-((n)))-n", 2); + HParseResult *res = h_parse(p, (uint8_t *)" n-(n-((n)))-n", 13); if(res) h_pprint(stdout, res->ast, 0, 2); else From 24c15f34cc263765b77820d9bb3662ed39f70a93 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Sun, 16 Jun 2013 22:23:35 +0200 Subject: [PATCH 65/95] augment grammar to ensure start symbol never occurs on rhs --- src/backends/lalr.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index e041489..bcb1a02 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -2,6 +2,7 @@ #include "../internal.h" #include "../cfgrammar.h" #include "../parsers/parser_internal.h" +#include "contextfree.h" @@ -274,8 +275,6 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) // assigned index. HSlist *work = h_slist_new(arena); - // XXX augment grammar?! 
- // make initial state (kernel) HLRState *start = h_lrstate_new(arena); assert(g->start->type == HCF_CHOICE); @@ -596,14 +595,33 @@ bool match_production(HLREnhGrammar *eg, HCFChoice **p, && state == endstate); } +// desugar parser with a fresh start symbol +// this guarantees that the start symbol will not occur in any productions +static HCFChoice *augment(HAllocator *mm__, HParser *parser) +{ + HCFChoice *augmented = h_new(HCFChoice, 1); + + HCFStack *stk__ = h_cfstack_new(mm__); + stk__->prealloc = augmented; + HCFS_BEGIN_CHOICE() { + HCFS_BEGIN_SEQ() { + HCFS_DESUGAR(parser); + } HCFS_END_SEQ(); + HCFS_THIS_CHOICE->reshape = h_act_first; + } HCFS_END_CHOICE(); + h_cfstack_free(mm__, stk__); + + return augmented; +} + int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) { - // generate CFG from parser + // generate (augmented) CFG from parser // construct LR(0) DFA // build LR(0) table // if necessary, resolve conflicts "by conversion to SLR" - HCFGrammar *g = h_cfgrammar(mm__, parser); + HCFGrammar *g = h_cfgrammar_(mm__, augment(mm__, parser)); if(g == NULL) // backend not suitable (language not context-free) return -1; @@ -981,21 +999,18 @@ HParserBackendVTable h__lalr_backend_vtable = { int test_lalr(void) { /* - S -> E E -> E '-' T | T T -> '(' E ')' | 'n' -- also try [0-9] for the charset paths */ -#if 0 HParser *n = h_ch('n'); HParser *E = h_indirect(); HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), n, NULL); HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL); h_bind_indirect(E, E_); -#endif - HParser *p = h_whitespace(h_ch('n')); //h_sequence(E, NULL); + HParser *p = E; printf("\n==== G R A M M A R ====\n"); HCFGrammar *g = h_cfgrammar(&system_allocator, p); @@ -1028,7 +1043,7 @@ int test_lalr(void) h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); - HParseResult *res = h_parse(p, (uint8_t *)" n-(n-((n)))-n", 13); + HParseResult *res = 
h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13); if(res) h_pprint(stdout, res->ast, 0, 2); else From 67b6e9666c376ac1bcda6efac1ba21941533c018 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Sun, 16 Jun 2013 22:29:53 +0200 Subject: [PATCH 66/95] fix a potentially uninitialized variable --- src/backends/lalr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index bcb1a02..7a1c04a 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -798,7 +798,7 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* value->seq = h_carray_new_sized(arena, len); // pull values off the left stack, rewinding state accordingly - HParsedToken *v; + HParsedToken *v = NULL; for(size_t i=0; i Date: Mon, 17 Jun 2013 19:11:18 +0200 Subject: [PATCH 67/95] factor out a struct HLREngine --- src/backends/lalr.c | 241 ++++++++++++++++++++++++++------------------ 1 file changed, 141 insertions(+), 100 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 7a1c04a..79e03c7 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -61,6 +61,16 @@ typedef struct HLREnhGrammar_ { HArena *arena; } HLREnhGrammar; +typedef struct HLREngine_ { + const HLRTable *table; + HSlist *left; // left stack; reductions happen here + HSlist *right; // right stack; input appears here + size_t state; + bool running; + HArena *arena; // will hold the results + HArena *tarena; // tmp, deleted after parse +} HLREngine; + // XXX move to internal.h or something // XXX replace other hashtable iterations with this @@ -733,6 +743,132 @@ h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol) } } +HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table) +{ + HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine)); + + engine->table = table; + engine->left = h_slist_new(tarena); + engine->right = h_slist_new(tarena); + engine->state = 0; + engine->running = 1; + 
engine->arena = arena; + engine->tarena = tarena; + + return engine; +} + +void h_lrengine_step(HLREngine *engine, HInputStream *stream) +{ + // short-hand names + HSlist *left = engine->left; + HSlist *right = engine->right; + HArena *arena = engine->arena; + HArena *tarena = engine->tarena; + + // stack layout: + // on the left stack, we put pairs: (saved state, semantic value) + // on the right stack, we put pairs: (symbol, semantic value) + + // make sure there is input on the right stack + if(h_slist_empty(right)) { + // XXX use statically-allocated terminal symbols + HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); + HParsedToken *v; + + uint8_t c = h_read_bits(stream, 8, false); + + if(stream->overrun) { // end of input + x->type = HCF_END; + v = NULL; + } else { + x->type = HCF_CHAR; + x->chr = c; + v = h_arena_malloc(arena, sizeof(HParsedToken)); + v->token_type = TT_UINT; + v->uint = c; + } + + h_slist_push(right, v); + h_slist_push(right, x); + } + + // peek at input symbol on the right side + HCFChoice *symbol = right->head->elem; + + // table lookup + const HLRAction *action = h_lr_lookup(engine->table, engine->state, symbol); + if(action == NULL) { + // no handle recognizable in input, terminate + engine->running = false; + return; + } + + if(action->type == HLR_SHIFT) { + h_slist_push(left, (void *)(uintptr_t)engine->state); + h_slist_pop(right); // symbol (discard) + h_slist_push(left, h_slist_pop(right)); // semantic value + engine->state = action->nextstate; + } else { + assert(action->type == HLR_REDUCE); + size_t len = action->production.length; + HCFChoice *symbol = action->production.lhs; + + // semantic value of the reduction result + HParsedToken *value = h_arena_malloc(arena, sizeof(HParsedToken)); + value->token_type = TT_SEQUENCE; + value->seq = h_carray_new_sized(arena, len); + + // pull values off the left stack, rewinding state accordingly + HParsedToken *v = NULL; + for(size_t i=0; istate = (uintptr_t)h_slist_pop(left); + + 
// collect values in result sequence + value->seq->elements[len-1-i] = v; + value->seq->used++; + } + if(v) { + // result position equals position of left-most symbol + value->index = v->index; + value->bit_offset = v->bit_offset; + } else { + // XXX how to get the position in this case? + } + + // perform token reshape if indicated + if(symbol->reshape) + value = (HParsedToken *)symbol->reshape(make_result(arena, value)); + + // call validation and semantic action, if present + if(symbol->pred && !symbol->pred(make_result(tarena, value))) { + // validation failed -> no parse; terminate + engine->running = false; + return; + } + if(symbol->action) + value = (HParsedToken *)symbol->action(make_result(arena, value)); + + // push result (value, symbol) onto the right stack + h_slist_push(right, value); + h_slist_push(right, symbol); + } +} + +HParseResult *h_lrengine_result(HLREngine *engine) +{ + // parsing was successful iff the start symbol is on top of the right stack + if(h_slist_pop(engine->right) == engine->table->start) { + // next on the right stack is the start symbol's semantic value + assert(!h_slist_empty(engine->right)); + HParsedToken *tok = h_slist_pop(engine->right); + return make_result(engine->arena, tok); + } else { + return NULL; + } +} + HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) { HLRTable *table = parser->backend_data; @@ -741,110 +877,15 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HArena *arena = h_new_arena(mm__, 0); // will hold the results HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse - HSlist *left = h_slist_new(tarena); // left stack; reductions happen here - HSlist *right = h_slist_new(tarena); // right stack; input appears here - - // stack layout: - // on the left stack, we put pairs: (saved state, semantic value) - // on the right stack, we put pairs: (symbol, semantic value) + HLREngine *engine = h_lrengine_new(arena, tarena, 
table); // run while the recognizer finds handles in the input - size_t state = 0; - while(1) { - // make sure there is input on the right stack - if(h_slist_empty(right)) { - // XXX use statically-allocated terminal symbols - HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); - HParsedToken *v; + while(engine->running) + h_lrengine_step(engine, stream); - uint8_t c = h_read_bits(stream, 8, false); - - if(stream->overrun) { // end of input - x->type = HCF_END; - v = NULL; - } else { - x->type = HCF_CHAR; - x->chr = c; - v = h_arena_malloc(arena, sizeof(HParsedToken)); - v->token_type = TT_UINT; - v->uint = c; - } - - h_slist_push(right, v); - h_slist_push(right, x); - } - - // peek at input symbol on the right side - HCFChoice *symbol = right->head->elem; - - // table lookup - const HLRAction *action = h_lr_lookup(table, state, symbol); - if(action == NULL) - break; // no handle recognizable in input, terminate parsing - - if(action->type == HLR_SHIFT) { - h_slist_push(left, (void *)(uintptr_t)state); - h_slist_pop(right); // symbol (discard) - h_slist_push(left, h_slist_pop(right)); // semantic value - state = action->nextstate; - } else { - assert(action->type == HLR_REDUCE); - size_t len = action->production.length; - HCFChoice *symbol = action->production.lhs; - - // semantic value of the reduction result - HParsedToken *value = h_arena_malloc(arena, sizeof(HParsedToken)); - value->token_type = TT_SEQUENCE; - value->seq = h_carray_new_sized(arena, len); - - // pull values off the left stack, rewinding state accordingly - HParsedToken *v = NULL; - for(size_t i=0; iseq->elements[len-1-i] = v; - value->seq->used++; - } - if(v) { - // result position equals position of left-most symbol - value->index = v->index; - value->bit_offset = v->bit_offset; - } else { - // XXX how to get the position in this case? 
- } - - // perform token reshape if indicated - if(symbol->reshape) - value = (HParsedToken *)symbol->reshape(make_result(arena, value)); - - // call validation and semantic action, if present - if(symbol->pred && !symbol->pred(make_result(tarena, value))) - break; // validation failed -> no parse - if(symbol->action) - value = (HParsedToken *)symbol->action(make_result(arena, value)); - - // push result (value, symbol) onto the right stack - h_slist_push(right, value); - h_slist_push(right, symbol); - } - } - - - - // parsing was successful iff the start symbol is on top of the right stack - HParseResult *result = NULL; - if(h_slist_pop(right) == table->start) { - // next on the right stack is the start symbol's semantic value - assert(!h_slist_empty(right)); - HParsedToken *tok = h_slist_pop(right); - result = make_result(arena, tok); - } else { + HParseResult *result = h_lrengine_result(engine); + if(!result) h_delete_arena(arena); - result = NULL; - } - h_delete_arena(tarena); return result; } From 129d50c0ef1d7867ea8450c2df589bdf823bee6f Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Mon, 17 Jun 2013 20:08:25 +0200 Subject: [PATCH 68/95] split LR code into lr.c, lr0.c, and lalr.c --- src/Makefile | 2 + src/backends/lalr.c | 760 +------------------------------------------- src/backends/lr.c | 458 ++++++++++++++++++++++++++ src/backends/lr.h | 131 ++++++++ src/backends/lr0.c | 205 ++++++++++++ 5 files changed, 806 insertions(+), 750 deletions(-) create mode 100644 src/backends/lr.c create mode 100644 src/backends/lr.h create mode 100644 src/backends/lr0.c diff --git a/src/Makefile b/src/Makefile index 1a2bff3..380436a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -43,6 +43,8 @@ HAMMER_PARTS := \ benchmark.o \ cfgrammar.o \ glue.o \ + backends/lr.o \ + backends/lr0.o \ $(PARSERS:%=parsers/%.o) \ $(BACKENDS:%=backends/%.o) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 79e03c7..fa67e5a 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -1,437 +1,6 @@ #include -#include "../internal.h" -#include "../cfgrammar.h" -#include "../parsers/parser_internal.h" #include "contextfree.h" - - - -/* Data structures */ - -typedef HHashSet HLRState; // states are sets of LRItems - -typedef struct HLRDFA_ { - size_t nstates; - const HLRState **states; // array of size nstates - HSlist *transitions; -} HLRDFA; - -typedef struct HLRTransition_ { - size_t from; // index into 'states' array - const HCFChoice *symbol; - size_t to; // index into 'states' array -} HLRTransition; - -typedef struct HLRItem_ { - HCFChoice *lhs; - HCFChoice **rhs; // NULL-terminated - size_t len; // number of elements in rhs - size_t mark; -} HLRItem; - -typedef struct HLRAction_ { - enum {HLR_SHIFT, HLR_REDUCE} type; - union { - size_t nextstate; // used with SHIFT - struct { - HCFChoice *lhs; // symbol carrying semantic actions etc. 
- size_t length; // # of symbols in rhs -#ifndef NDEBUG - HCFChoice **rhs; // NB: the rhs symbols are not needed for the parse -#endif - } production; // used with REDUCE - }; -} HLRAction; - -typedef struct HLRTable_ { - size_t nrows; - HHashTable **rows; // map symbols to HLRActions - HLRAction **forall; // shortcut to set an action for an entire row - HCFChoice *start; // start symbol - HSlist *inadeq; // indices of any inadequate states - HArena *arena; - HAllocator *mm__; -} HLRTable; - -typedef struct HLREnhGrammar_ { - HCFGrammar *grammar; // enhanced grammar - HHashTable *tmap; // maps transitions to enhanced-grammar symbols - HHashTable *smap; // maps enhanced-grammar symbols to transitions - HHashTable *corr; // maps symbols to sets of corresponding e. symbols - HArena *arena; -} HLREnhGrammar; - -typedef struct HLREngine_ { - const HLRTable *table; - HSlist *left; // left stack; reductions happen here - HSlist *right; // right stack; input appears here - size_t state; - bool running; - HArena *arena; // will hold the results - HArena *tarena; // tmp, deleted after parse -} HLREngine; - - -// XXX move to internal.h or something -// XXX replace other hashtable iterations with this -#define H_FOREACH_(HT) { \ - const HHashTable *ht__ = HT; \ - for(size_t i__=0; i__ < ht__->capacity; i__++) { \ - for(HHashTableEntry *hte__ = &ht__->contents[i__]; \ - hte__; \ - hte__ = hte__->next) { \ - if(hte__->key == NULL) continue; - -#define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ - const KEYVAR = hte__->key; - -#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ - VALVAR = hte__->value; - -#define H_END_FOREACH \ - } \ - } \ - } - -// compare symbols - terminals by value, others by pointer -static bool eq_symbol(const void *p, const void *q) -{ - const HCFChoice *x=p, *y=q; - return (x==y - || (x->type==HCF_END && y->type==HCF_END) - || (x->type==HCF_CHAR && y->type==HCF_CHAR && x->chr==y->chr)); -} - -// hash symbols - terminals by value, others by 
pointer -static HHashValue hash_symbol(const void *p) -{ - const HCFChoice *x=p; - if(x->type == HCF_END) - return 0; - else if(x->type == HCF_CHAR) - return x->chr * 33; - else - return h_hash_ptr(p); -} - -// compare LALR items by value -static bool eq_lalr_item(const void *p, const void *q) -{ - const HLRItem *a=p, *b=q; - - if(!eq_symbol(a->lhs, b->lhs)) return false; - if(a->mark != b->mark) return false; - if(a->len != b->len) return false; - - for(size_t i=0; ilen; i++) - if(!eq_symbol(a->rhs[i], b->rhs[i])) return false; - - return true; -} - -// compare LALR item sets (DFA states) -static inline bool eq_lalr_itemset(const void *p, const void *q) -{ - return h_hashset_equal(p, q); -} - -// hash LALR items -static inline HHashValue hash_lalr_item(const void *p) -{ - const HLRItem *x = p; - HHashValue hash = 0; - - hash += hash_symbol(x->lhs); - for(HCFChoice **p=x->rhs; *p; p++) - hash += hash_symbol(*p); - hash += x->mark; - - return hash; -} - -// hash LALR item sets (DFA states) - hash the elements and sum -static HHashValue hash_lalr_itemset(const void *p) -{ - HHashValue hash = 0; - - H_FOREACH_KEY((const HHashSet *)p, HLRItem *item) - hash += hash_lalr_item(item); - H_END_FOREACH - - return hash; -} - -HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark) -{ - HLRItem *ret = h_arena_malloc(a, sizeof(HLRItem)); - - size_t len = 0; - for(HCFChoice **p=rhs; *p; p++) len++; - assert(mark <= len); - - ret->lhs = lhs; - ret->rhs = rhs; - ret->len = len; - ret->mark = mark; - - return ret; -} - -static inline HLRState *h_lrstate_new(HArena *arena) -{ - return h_hashset_new(arena, eq_lalr_item, hash_lalr_item); -} - -HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) -{ - HArena *arena = h_new_arena(mm__, 0); // default blocksize - assert(arena != NULL); - - HLRTable *ret = h_new(HLRTable, 1); - ret->nrows = nrows; - ret->rows = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); - ret->forall = h_arena_malloc(arena, nrows * 
sizeof(HLRAction *)); - ret->inadeq = h_slist_new(arena); - ret->arena = arena; - ret->mm__ = mm__; - - for(size_t i=0; irows[i] = h_hashtable_new(arena, eq_symbol, hash_symbol); - ret->forall[i] = NULL; - } - - return ret; -} - -void h_lrtable_free(HLRTable *table) -{ - HAllocator *mm__ = table->mm__; - h_delete_arena(table->arena); - h_free(table); -} - - - -/* Constructing the characteristic automaton (handle recognizer) */ - -static HLRItem *advance_mark(HArena *arena, const HLRItem *item) -{ - assert(item->rhs[item->mark] != NULL); - HLRItem *ret = h_arena_malloc(arena, sizeof(HLRItem)); - *ret = *item; - ret->mark++; - return ret; -} - -static void expand_to_closure(HCFGrammar *g, HHashSet *items) -{ - HAllocator *mm__ = g->mm__; - HArena *arena = g->arena; - HSlist *work = h_slist_new(arena); - - // initialize work list with items - H_FOREACH_KEY(items, HLRItem *item) - h_slist_push(work, (void *)item); - H_END_FOREACH - - while(!h_slist_empty(work)) { - const HLRItem *item = h_slist_pop(work); - HCFChoice *sym = item->rhs[item->mark]; // symbol after mark - - // if there is a non-terminal after the mark, follow it - // NB: unlike LLk, we do consider HCF_CHARSET a non-terminal here - if(sym != NULL && (sym->type==HCF_CHOICE || sym->type==HCF_CHARSET)) { - // add items corresponding to the productions of sym - if(sym->type == HCF_CHOICE) { - for(HCFSequence **p=sym->seq; *p; p++) { - HLRItem *it = h_lritem_new(arena, sym, (*p)->items, 0); - if(!h_hashset_present(items, it)) { - h_hashset_put(items, it); - h_slist_push(work, it); - } - } - } else { // HCF_CHARSET - for(unsigned int i=0; i<256; i++) { - if(charset_isset(sym->charset, i)) { - // XXX allocate these single-character symbols statically somewhere - HCFChoice **rhs = h_new(HCFChoice *, 2); - rhs[0] = h_new(HCFChoice, 1); - rhs[0]->type = HCF_CHAR; - rhs[0]->chr = i; - rhs[1] = NULL; - HLRItem *it = h_lritem_new(arena, sym, rhs, 0); - h_hashset_put(items, it); - // single-character item needs no 
further work - } - } - // if sym is a non-terminal, we need a reshape on it - // this seems as good a place as any to set it - sym->reshape = h_act_first; - } - } - } -} - -HLRDFA *h_lr0_dfa(HCFGrammar *g) -{ - HArena *arena = g->arena; - - HHashSet *states = h_hashset_new(arena, eq_lalr_itemset, hash_lalr_itemset); - // maps itemsets to assigned array indices - HSlist *transitions = h_slist_new(arena); - - // list of states that need to be processed - // to save lookups, we push two elements per state, the itemset and its - // assigned index. - HSlist *work = h_slist_new(arena); - - // make initial state (kernel) - HLRState *start = h_lrstate_new(arena); - assert(g->start->type == HCF_CHOICE); - for(HCFSequence **p=g->start->seq; *p; p++) - h_hashset_put(start, h_lritem_new(arena, g->start, (*p)->items, 0)); - expand_to_closure(g, start); - h_hashtable_put(states, start, 0); - h_slist_push(work, start); - h_slist_push(work, 0); - - // while work to do (on some state) - // determine edge symbols - // for each edge symbol: - // advance respective items -> destination state (kernel) - // compute closure - // if destination is a new state: - // add it to state set - // add transition to it - // add it to the work list - - while(!h_slist_empty(work)) { - size_t state_idx = (uintptr_t)h_slist_pop(work); - HLRState *state = h_slist_pop(work); - - // maps edge symbols to neighbor states (item sets) of s - HHashTable *neighbors = h_hashtable_new(arena, eq_symbol, hash_symbol); - - // iterate over state (closure) and generate neighboring sets - H_FOREACH_KEY(state, HLRItem *item) - HCFChoice *sym = item->rhs[item->mark]; // symbol after mark - - if(sym != NULL) { // mark was not at the end - // find or create prospective neighbor set - HLRState *neighbor = h_hashtable_get(neighbors, sym); - if(neighbor == NULL) { - neighbor = h_lrstate_new(arena); - h_hashtable_put(neighbors, sym, neighbor); - } - - // ...and add the advanced item to it - h_hashset_put(neighbor, 
advance_mark(arena, item)); - } - H_END_FOREACH - - // merge expanded neighbor sets into the set of existing states - H_FOREACH(neighbors, HCFChoice *symbol, HLRState *neighbor) - expand_to_closure(g, neighbor); - - // look up existing state, allocate new if not found - size_t neighbor_idx; - if(!h_hashset_present(states, neighbor)) { - neighbor_idx = states->used; - h_hashtable_put(states, neighbor, (void *)(uintptr_t)neighbor_idx); - h_slist_push(work, neighbor); - h_slist_push(work, (void *)(uintptr_t)neighbor_idx); - } else { - neighbor_idx = (uintptr_t)h_hashtable_get(states, neighbor); - } - - // add transition "state --symbol--> neighbor" - HLRTransition *t = h_arena_malloc(arena, sizeof(HLRTransition)); - t->from = state_idx; - t->to = neighbor_idx; - t->symbol = symbol; - h_slist_push(transitions, t); - H_END_FOREACH - } // end while(work) - - // fill DFA struct - HLRDFA *dfa = h_arena_malloc(arena, sizeof(HLRDFA)); - dfa->nstates = states->used; - dfa->states = h_arena_malloc(arena, dfa->nstates*sizeof(HLRState *)); - H_FOREACH(states, HLRState *state, void *v) - size_t idx = (uintptr_t)v; - dfa->states[idx] = state; - H_END_FOREACH - dfa->transitions = transitions; - - return dfa; -} - - - -/* LR(0) table generation */ - -static HLRAction *shift_action(HArena *arena, size_t nextstate) -{ - HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); - action->type = HLR_SHIFT; - action->nextstate = nextstate; - return action; -} - -static HLRAction *reduce_action(HArena *arena, const HLRItem *item) -{ - HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); - action->type = HLR_REDUCE; - action->production.lhs = item->lhs; - action->production.length = item->len; -#ifndef NDEBUG - action->production.rhs = item->rhs; -#endif - return action; -} - -HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) -{ - HAllocator *mm__ = g->mm__; - - HLRTable *table = h_lrtable_new(mm__, dfa->nstates); - HArena *arena = table->arena; - - // remember start 
symbol - table->start = g->start; - - // add shift entries - for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { - // for each transition x-A->y, add "shift, goto y" to table entry (x,A) - HLRTransition *t = x->elem; - - HLRAction *action = shift_action(arena, t->to); - h_hashtable_put(table->rows[t->from], t->symbol, action); - } - - // add reduce entries, record inadequate states - for(size_t i=0; instates; i++) { - // find reducible items in state - H_FOREACH_KEY(dfa->states[i], HLRItem *item) - if(item->mark == item->len) { // mark at the end - // check for conflicts - // XXX store more informative stuff in the inadeq records? - if(table->forall[i]) { - // reduce/reduce conflict with a previous item - h_slist_push(table->inadeq, (void *)(uintptr_t)i); - } else if(!h_hashtable_empty(table->rows[i])) { - // shift/reduce conflict with one of the row's entries - h_slist_push(table->inadeq, (void *)(uintptr_t)i); - } - - // set reduce action for the entire row - table->forall[i] = reduce_action(arena, item); - } - H_END_FOREACH - } - - return table; -} +#include "lr.h" @@ -499,19 +68,7 @@ static void transform_productions(const HLRTable *table, HLREnhGrammar *eg, xAy->seq = seq; } -static bool eq_transition(const void *p, const void *q) -{ - const HLRTransition *a=p, *b=q; - return (a->from == b->from && a->to == b->to && eq_symbol(a->symbol, b->symbol)); -} - -static HHashValue hash_transition(const void *p) -{ - const HLRTransition *t = p; - return (hash_symbol(t->symbol) + t->from + t->to); // XXX ? 
-} - -HCFChoice *new_enhanced_symbol(HLREnhGrammar *eg, const HCFChoice *sym) +static HCFChoice *new_enhanced_symbol(HLREnhGrammar *eg, const HCFChoice *sym) { HArena *arena = eg->arena; HCFChoice *esym = h_arena_malloc(arena, sizeof(HCFChoice)); @@ -519,13 +76,14 @@ HCFChoice *new_enhanced_symbol(HLREnhGrammar *eg, const HCFChoice *sym) HHashSet *cs = h_hashtable_get(eg->corr, sym); if(!cs) { - cs = h_hashset_new(arena, eq_symbol, hash_symbol); + cs = h_hashset_new(arena, h_eq_symbol, h_hash_symbol); h_hashtable_put(eg->corr, sym, cs); } h_hashset_put(cs, esym); return esym; } + static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRDFA *dfa, const HLRTable *table) { @@ -533,9 +91,9 @@ static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRDFA *dfa, HArena *arena = g->arena; HLREnhGrammar *eg = h_arena_malloc(arena, sizeof(HLREnhGrammar)); - eg->tmap = h_hashtable_new(arena, eq_transition, hash_transition); + eg->tmap = h_hashtable_new(arena, h_eq_transition, h_hash_transition); eg->smap = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); - eg->corr = h_hashtable_new(arena, eq_symbol, hash_symbol); + eg->corr = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol); // XXX must use h_eq/hash_ptr for symbols! so enhanced CHARs are different eg->arena = arena; @@ -590,14 +148,14 @@ int h_lrtable_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action) // check whether a sequence of enhanced-grammar symbols (p) matches the given // (original-grammar) production rhs and terminates in the given end state. 
-bool match_production(HLREnhGrammar *eg, HCFChoice **p, - HCFChoice **rhs, size_t endstate) +static bool match_production(HLREnhGrammar *eg, HCFChoice **p, + HCFChoice **rhs, size_t endstate) { size_t state = endstate; // initialized to end in case of empty rhs for(; *p && *rhs; p++, rhs++) { HLRTransition *t = h_hashtable_get(eg->smap, *p); assert(t != NULL); - if(!eq_symbol(t->symbol, *rhs)) + if(!h_eq_symbol(t->symbol, *rhs)) return false; state = t->to; } @@ -673,7 +231,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) continue; // action to place in the table cells indicated by lookahead - HLRAction *action = reduce_action(arena, item); + HLRAction *action = h_reduce_action(arena, item); // find all LR(0)-enhanced productions matching item HHashSet *lhss = h_hashtable_get(eg->corr, item->lhs); @@ -729,304 +287,6 @@ void h_lalr_free(HParser *parser) -/* LR driver */ - -const HLRAction * -h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol) -{ - assert(state < table->nrows); - if(table->forall[state]) { - assert(h_hashtable_empty(table->rows[state])); // that would be a conflict - return table->forall[state]; - } else { - return h_hashtable_get(table->rows[state], symbol); - } -} - -HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table) -{ - HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine)); - - engine->table = table; - engine->left = h_slist_new(tarena); - engine->right = h_slist_new(tarena); - engine->state = 0; - engine->running = 1; - engine->arena = arena; - engine->tarena = tarena; - - return engine; -} - -void h_lrengine_step(HLREngine *engine, HInputStream *stream) -{ - // short-hand names - HSlist *left = engine->left; - HSlist *right = engine->right; - HArena *arena = engine->arena; - HArena *tarena = engine->tarena; - - // stack layout: - // on the left stack, we put pairs: (saved state, semantic value) - // on the right stack, we put pairs: (symbol, semantic 
value) - - // make sure there is input on the right stack - if(h_slist_empty(right)) { - // XXX use statically-allocated terminal symbols - HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); - HParsedToken *v; - - uint8_t c = h_read_bits(stream, 8, false); - - if(stream->overrun) { // end of input - x->type = HCF_END; - v = NULL; - } else { - x->type = HCF_CHAR; - x->chr = c; - v = h_arena_malloc(arena, sizeof(HParsedToken)); - v->token_type = TT_UINT; - v->uint = c; - } - - h_slist_push(right, v); - h_slist_push(right, x); - } - - // peek at input symbol on the right side - HCFChoice *symbol = right->head->elem; - - // table lookup - const HLRAction *action = h_lr_lookup(engine->table, engine->state, symbol); - if(action == NULL) { - // no handle recognizable in input, terminate - engine->running = false; - return; - } - - if(action->type == HLR_SHIFT) { - h_slist_push(left, (void *)(uintptr_t)engine->state); - h_slist_pop(right); // symbol (discard) - h_slist_push(left, h_slist_pop(right)); // semantic value - engine->state = action->nextstate; - } else { - assert(action->type == HLR_REDUCE); - size_t len = action->production.length; - HCFChoice *symbol = action->production.lhs; - - // semantic value of the reduction result - HParsedToken *value = h_arena_malloc(arena, sizeof(HParsedToken)); - value->token_type = TT_SEQUENCE; - value->seq = h_carray_new_sized(arena, len); - - // pull values off the left stack, rewinding state accordingly - HParsedToken *v = NULL; - for(size_t i=0; istate = (uintptr_t)h_slist_pop(left); - - // collect values in result sequence - value->seq->elements[len-1-i] = v; - value->seq->used++; - } - if(v) { - // result position equals position of left-most symbol - value->index = v->index; - value->bit_offset = v->bit_offset; - } else { - // XXX how to get the position in this case? 
- } - - // perform token reshape if indicated - if(symbol->reshape) - value = (HParsedToken *)symbol->reshape(make_result(arena, value)); - - // call validation and semantic action, if present - if(symbol->pred && !symbol->pred(make_result(tarena, value))) { - // validation failed -> no parse; terminate - engine->running = false; - return; - } - if(symbol->action) - value = (HParsedToken *)symbol->action(make_result(arena, value)); - - // push result (value, symbol) onto the right stack - h_slist_push(right, value); - h_slist_push(right, symbol); - } -} - -HParseResult *h_lrengine_result(HLREngine *engine) -{ - // parsing was successful iff the start symbol is on top of the right stack - if(h_slist_pop(engine->right) == engine->table->start) { - // next on the right stack is the start symbol's semantic value - assert(!h_slist_empty(engine->right)); - HParsedToken *tok = h_slist_pop(engine->right); - return make_result(engine->arena, tok); - } else { - return NULL; - } -} - -HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) -{ - HLRTable *table = parser->backend_data; - if(!table) - return NULL; - - HArena *arena = h_new_arena(mm__, 0); // will hold the results - HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse - HLREngine *engine = h_lrengine_new(arena, tarena, table); - - // run while the recognizer finds handles in the input - while(engine->running) - h_lrengine_step(engine, stream); - - HParseResult *result = h_lrengine_result(engine); - if(!result) - h_delete_arena(arena); - h_delete_arena(tarena); - return result; -} - - - -/* Pretty-printers */ - -void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item) -{ - h_pprint_symbol(f, g, item->lhs); - fputs(" ->", f); - - HCFChoice **x = item->rhs; - HCFChoice **mark = item->rhs + item->mark; - if(*x == NULL) { - fputc('.', f); - } else { - while(*x) { - if(x == mark) - fputc('.', f); - else - fputc(' ', f); - - if((*x)->type == HCF_CHAR) { - // 
condense character strings - fputc('"', f); - h_pprint_char(f, (*x)->chr); - for(x++; *x; x++) { - if(x == mark) - break; - if((*x)->type != HCF_CHAR) - break; - h_pprint_char(f, (*x)->chr); - } - fputc('"', f); - } else { - h_pprint_symbol(f, g, *x); - x++; - } - } - if(x == mark) - fputs(".", f); - } -} - -void h_pprint_lrstate(FILE *f, const HCFGrammar *g, - const HLRState *state, unsigned int indent) -{ - bool first = true; - H_FOREACH_KEY(state, HLRItem *item) - if(!first) - for(unsigned int i=0; isymbol); - fprintf(f, "->%lu", t->to); -} - -void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, - const HLRDFA *dfa, unsigned int indent) -{ - for(size_t i=0; instates; i++) { - unsigned int indent2 = indent + fprintf(f, "%4lu: ", i); - h_pprint_lrstate(f, g, dfa->states[i], indent2); - for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { - const HLRTransition *t = x->elem; - if(t->from == i) { - for(unsigned int i=0; itype == HLR_SHIFT) { - fprintf(f, "s%lu", action->nextstate); - } else { - fputs("r(", f); - h_pprint_symbol(f, g, action->production.lhs); - fputs(" -> ", f); -#ifdef NDEBUG - // if we can't print the production, at least print its length - fprintf(f, "[%lu]", action->production.length); -#else - HCFSequence seq = {action->production.rhs}; - h_pprint_sequence(f, g, &seq); -#endif - fputc(')', f); - } -} - -void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, - unsigned int indent) -{ - for(size_t i=0; inrows; i++) { - for(unsigned int j=0; jforall[i]) { - fputs(" - ", f); - pprint_lraction(f, g, table->forall[i]); - fputs(" -", f); - if(!h_hashtable_empty(table->rows[i])) - fputs(" !!", f); - } - H_FOREACH(table->rows[i], HCFChoice *symbol, HLRAction *action) - fputc(' ', f); // separator - h_pprint_symbol(f, g, symbol); - fputc(':', f); - if(table->forall[i]) { - fputc(action->type == HLR_SHIFT? 's' : 'r', f); - fputc('/', f); - fputc(table->forall[i]->type == HLR_SHIFT? 
's' : 'r', f); - } else { - pprint_lraction(f, g, action); - } - H_END_FOREACH - fputc('\n', f); - } - -#if 0 - fputs("inadeq=", f); - for(HSlistNode *x=table->inadeq->head; x; x=x->next) { - fprintf(f, "%lu ", (uintptr_t)x->elem); - } - fputc('\n', f); -#endif -} - - - HParserBackendVTable h__lalr_backend_vtable = { .compile = h_lalr_compile, .parse = h_lr_parse, diff --git a/src/backends/lr.c b/src/backends/lr.c new file mode 100644 index 0000000..c3062d5 --- /dev/null +++ b/src/backends/lr.c @@ -0,0 +1,458 @@ +#include +#include "../parsers/parser_internal.h" +#include "lr.h" + + + +/* Comparison and hashing functions */ + +// compare symbols - terminals by value, others by pointer +bool h_eq_symbol(const void *p, const void *q) +{ + const HCFChoice *x=p, *y=q; + return (x==y + || (x->type==HCF_END && y->type==HCF_END) + || (x->type==HCF_CHAR && y->type==HCF_CHAR && x->chr==y->chr)); +} + +// hash symbols - terminals by value, others by pointer +HHashValue h_hash_symbol(const void *p) +{ + const HCFChoice *x=p; + if(x->type == HCF_END) + return 0; + else if(x->type == HCF_CHAR) + return x->chr * 33; + else + return h_hash_ptr(p); +} + +// compare LR items by value +static bool eq_lr_item(const void *p, const void *q) +{ + const HLRItem *a=p, *b=q; + + if(!h_eq_symbol(a->lhs, b->lhs)) return false; + if(a->mark != b->mark) return false; + if(a->len != b->len) return false; + + for(size_t i=0; ilen; i++) + if(!h_eq_symbol(a->rhs[i], b->rhs[i])) return false; + + return true; +} + +// hash LALR items +static inline HHashValue hash_lr_item(const void *p) +{ + const HLRItem *x = p; + HHashValue hash = 0; + + hash += h_hash_symbol(x->lhs); + for(HCFChoice **p=x->rhs; *p; p++) + hash += h_hash_symbol(*p); + hash += x->mark; + + return hash; +} + +// compare item sets (DFA states) +bool h_eq_lr_itemset(const void *p, const void *q) +{ + return h_hashset_equal(p, q); +} + +// hash LR item sets (DFA states) - hash the elements and sum +HHashValue h_hash_lr_itemset(const 
void *p) +{ + HHashValue hash = 0; + + H_FOREACH_KEY((const HHashSet *)p, HLRItem *item) + hash += hash_lr_item(item); + H_END_FOREACH + + return hash; +} + +bool h_eq_transition(const void *p, const void *q) +{ + const HLRTransition *a=p, *b=q; + return (a->from == b->from && a->to == b->to && h_eq_symbol(a->symbol, b->symbol)); +} + +HHashValue h_hash_transition(const void *p) +{ + const HLRTransition *t = p; + return (h_hash_symbol(t->symbol) + t->from + t->to); // XXX ? +} + + + +/* Constructors */ + +HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark) +{ + HLRItem *ret = h_arena_malloc(a, sizeof(HLRItem)); + + size_t len = 0; + for(HCFChoice **p=rhs; *p; p++) len++; + assert(mark <= len); + + ret->lhs = lhs; + ret->rhs = rhs; + ret->len = len; + ret->mark = mark; + + return ret; +} + +HLRState *h_lrstate_new(HArena *arena) +{ + return h_hashset_new(arena, eq_lr_item, hash_lr_item); +} + +HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) +{ + HArena *arena = h_new_arena(mm__, 0); // default blocksize + assert(arena != NULL); + + HLRTable *ret = h_new(HLRTable, 1); + ret->nrows = nrows; + ret->rows = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); + ret->forall = h_arena_malloc(arena, nrows * sizeof(HLRAction *)); + ret->inadeq = h_slist_new(arena); + ret->arena = arena; + ret->mm__ = mm__; + + for(size_t i=0; irows[i] = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol); + ret->forall[i] = NULL; + } + + return ret; +} + +void h_lrtable_free(HLRTable *table) +{ + HAllocator *mm__ = table->mm__; + h_delete_arena(table->arena); + h_free(table); +} + +HLRAction *h_shift_action(HArena *arena, size_t nextstate) +{ + HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); + action->type = HLR_SHIFT; + action->nextstate = nextstate; + return action; +} + +HLRAction *h_reduce_action(HArena *arena, const HLRItem *item) +{ + HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); + action->type = HLR_REDUCE; + 
action->production.lhs = item->lhs; + action->production.length = item->len; +#ifndef NDEBUG + action->production.rhs = item->rhs; +#endif + return action; +} + + + +/* LR driver */ + +const HLRAction * +h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol) +{ + assert(state < table->nrows); + if(table->forall[state]) { + assert(h_hashtable_empty(table->rows[state])); // that would be a conflict + return table->forall[state]; + } else { + return h_hashtable_get(table->rows[state], symbol); + } +} + +HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table) +{ + HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine)); + + engine->table = table; + engine->left = h_slist_new(tarena); + engine->right = h_slist_new(tarena); + engine->state = 0; + engine->running = 1; + engine->arena = arena; + engine->tarena = tarena; + + return engine; +} + +void h_lrengine_step(HLREngine *engine, HInputStream *stream) +{ + // short-hand names + HSlist *left = engine->left; + HSlist *right = engine->right; + HArena *arena = engine->arena; + HArena *tarena = engine->tarena; + + // stack layout: + // on the left stack, we put pairs: (saved state, semantic value) + // on the right stack, we put pairs: (symbol, semantic value) + + // make sure there is input on the right stack + if(h_slist_empty(right)) { + // XXX use statically-allocated terminal symbols + HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); + HParsedToken *v; + + uint8_t c = h_read_bits(stream, 8, false); + + if(stream->overrun) { // end of input + x->type = HCF_END; + v = NULL; + } else { + x->type = HCF_CHAR; + x->chr = c; + v = h_arena_malloc(arena, sizeof(HParsedToken)); + v->token_type = TT_UINT; + v->uint = c; + } + + h_slist_push(right, v); + h_slist_push(right, x); + } + + // peek at input symbol on the right side + HCFChoice *symbol = right->head->elem; + + // table lookup + const HLRAction *action = h_lr_lookup(engine->table, engine->state, symbol); + 
if(action == NULL) { + // no handle recognizable in input, terminate + engine->running = false; + return; + } + + if(action->type == HLR_SHIFT) { + h_slist_push(left, (void *)(uintptr_t)engine->state); + h_slist_pop(right); // symbol (discard) + h_slist_push(left, h_slist_pop(right)); // semantic value + engine->state = action->nextstate; + } else { + assert(action->type == HLR_REDUCE); + size_t len = action->production.length; + HCFChoice *symbol = action->production.lhs; + + // semantic value of the reduction result + HParsedToken *value = h_arena_malloc(arena, sizeof(HParsedToken)); + value->token_type = TT_SEQUENCE; + value->seq = h_carray_new_sized(arena, len); + + // pull values off the left stack, rewinding state accordingly + HParsedToken *v = NULL; + for(size_t i=0; istate = (uintptr_t)h_slist_pop(left); + + // collect values in result sequence + value->seq->elements[len-1-i] = v; + value->seq->used++; + } + if(v) { + // result position equals position of left-most symbol + value->index = v->index; + value->bit_offset = v->bit_offset; + } else { + // XXX how to get the position in this case? 
+ } + + // perform token reshape if indicated + if(symbol->reshape) + value = (HParsedToken *)symbol->reshape(make_result(arena, value)); + + // call validation and semantic action, if present + if(symbol->pred && !symbol->pred(make_result(tarena, value))) { + // validation failed -> no parse; terminate + engine->running = false; + return; + } + if(symbol->action) + value = (HParsedToken *)symbol->action(make_result(arena, value)); + + // push result (value, symbol) onto the right stack + h_slist_push(right, value); + h_slist_push(right, symbol); + } +} + +HParseResult *h_lrengine_result(HLREngine *engine) +{ + // parsing was successful iff the start symbol is on top of the right stack + if(h_slist_pop(engine->right) == engine->table->start) { + // next on the right stack is the start symbol's semantic value + assert(!h_slist_empty(engine->right)); + HParsedToken *tok = h_slist_pop(engine->right); + return make_result(engine->arena, tok); + } else { + return NULL; + } +} + +HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) +{ + HLRTable *table = parser->backend_data; + if(!table) + return NULL; + + HArena *arena = h_new_arena(mm__, 0); // will hold the results + HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse + HLREngine *engine = h_lrengine_new(arena, tarena, table); + + // run while the recognizer finds handles in the input + while(engine->running) + h_lrengine_step(engine, stream); + + HParseResult *result = h_lrengine_result(engine); + if(!result) + h_delete_arena(arena); + h_delete_arena(tarena); + return result; +} + + + +/* Pretty-printers */ + +void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item) +{ + h_pprint_symbol(f, g, item->lhs); + fputs(" ->", f); + + HCFChoice **x = item->rhs; + HCFChoice **mark = item->rhs + item->mark; + if(*x == NULL) { + fputc('.', f); + } else { + while(*x) { + if(x == mark) + fputc('.', f); + else + fputc(' ', f); + + if((*x)->type == HCF_CHAR) { + // 
condense character strings + fputc('"', f); + h_pprint_char(f, (*x)->chr); + for(x++; *x; x++) { + if(x == mark) + break; + if((*x)->type != HCF_CHAR) + break; + h_pprint_char(f, (*x)->chr); + } + fputc('"', f); + } else { + h_pprint_symbol(f, g, *x); + x++; + } + } + if(x == mark) + fputs(".", f); + } +} + +void h_pprint_lrstate(FILE *f, const HCFGrammar *g, + const HLRState *state, unsigned int indent) +{ + bool first = true; + H_FOREACH_KEY(state, HLRItem *item) + if(!first) + for(unsigned int i=0; isymbol); + fprintf(f, "->%lu", t->to); +} + +void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, + const HLRDFA *dfa, unsigned int indent) +{ + for(size_t i=0; instates; i++) { + unsigned int indent2 = indent + fprintf(f, "%4lu: ", i); + h_pprint_lrstate(f, g, dfa->states[i], indent2); + for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { + const HLRTransition *t = x->elem; + if(t->from == i) { + for(unsigned int i=0; itype == HLR_SHIFT) { + fprintf(f, "s%lu", action->nextstate); + } else { + fputs("r(", f); + h_pprint_symbol(f, g, action->production.lhs); + fputs(" -> ", f); +#ifdef NDEBUG + // if we can't print the production, at least print its length + fprintf(f, "[%lu]", action->production.length); +#else + HCFSequence seq = {action->production.rhs}; + h_pprint_sequence(f, g, &seq); +#endif + fputc(')', f); + } +} + +void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, + unsigned int indent) +{ + for(size_t i=0; inrows; i++) { + for(unsigned int j=0; jforall[i]) { + fputs(" - ", f); + pprint_lraction(f, g, table->forall[i]); + fputs(" -", f); + if(!h_hashtable_empty(table->rows[i])) + fputs(" !!", f); + } + H_FOREACH(table->rows[i], HCFChoice *symbol, HLRAction *action) + fputc(' ', f); // separator + h_pprint_symbol(f, g, symbol); + fputc(':', f); + if(table->forall[i]) { + fputc(action->type == HLR_SHIFT? 's' : 'r', f); + fputc('/', f); + fputc(table->forall[i]->type == HLR_SHIFT? 
's' : 'r', f); + } else { + pprint_lraction(f, g, action); + } + H_END_FOREACH + fputc('\n', f); + } + +#if 0 + fputs("inadeq=", f); + for(HSlistNode *x=table->inadeq->head; x; x=x->next) { + fprintf(f, "%lu ", (uintptr_t)x->elem); + } + fputc('\n', f); +#endif +} diff --git a/src/backends/lr.h b/src/backends/lr.h new file mode 100644 index 0000000..afd4042 --- /dev/null +++ b/src/backends/lr.h @@ -0,0 +1,131 @@ +#ifndef HAMMER_BACKENDS_LR__H +#define HAMMER_BACKENDS_LR__H + +#include "../hammer.h" +#include "../cfgrammar.h" +#include "../internal.h" + + +typedef HHashSet HLRState; // states are sets of LRItems + +typedef struct HLRDFA_ { + size_t nstates; + const HLRState **states; // array of size nstates + HSlist *transitions; +} HLRDFA; + +typedef struct HLRTransition_ { + size_t from; // index into 'states' array + const HCFChoice *symbol; + size_t to; // index into 'states' array +} HLRTransition; + +typedef struct HLRItem_ { + HCFChoice *lhs; + HCFChoice **rhs; // NULL-terminated + size_t len; // number of elements in rhs + size_t mark; +} HLRItem; + +typedef struct HLRAction_ { + enum {HLR_SHIFT, HLR_REDUCE} type; + union { + size_t nextstate; // used with SHIFT + struct { + HCFChoice *lhs; // symbol carrying semantic actions etc. 
+ size_t length; // # of symbols in rhs +#ifndef NDEBUG + HCFChoice **rhs; // NB: the rhs symbols are not needed for the parse +#endif + } production; // used with REDUCE + }; +} HLRAction; + +typedef struct HLRTable_ { + size_t nrows; + HHashTable **rows; // map symbols to HLRActions + HLRAction **forall; // shortcut to set an action for an entire row + HCFChoice *start; // start symbol + HSlist *inadeq; // indices of any inadequate states + HArena *arena; + HAllocator *mm__; +} HLRTable; + +typedef struct HLREnhGrammar_ { + HCFGrammar *grammar; // enhanced grammar + HHashTable *tmap; // maps transitions to enhanced-grammar symbols + HHashTable *smap; // maps enhanced-grammar symbols to transitions + HHashTable *corr; // maps symbols to sets of corresponding e. symbols + HArena *arena; +} HLREnhGrammar; + +typedef struct HLREngine_ { + const HLRTable *table; + HSlist *left; // left stack; reductions happen here + HSlist *right; // right stack; input appears here + size_t state; + bool running; + HArena *arena; // will hold the results + HArena *tarena; // tmp, deleted after parse +} HLREngine; + + +// XXX move to internal.h or something +// XXX replace other hashtable iterations with this +#define H_FOREACH_(HT) { \ + const HHashTable *ht__ = HT; \ + for(size_t i__=0; i__ < ht__->capacity; i__++) { \ + for(HHashTableEntry *hte__ = &ht__->contents[i__]; \ + hte__; \ + hte__ = hte__->next) { \ + if(hte__->key == NULL) continue; + +#define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ + const KEYVAR = hte__->key; + +#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ + VALVAR = hte__->value; + +#define H_END_FOREACH \ + } \ + } \ + } + + + +HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark); +HLRState *h_lrstate_new(HArena *arena); +HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows); +void h_lrtable_free(HLRTable *table); +HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table); +HLRAction 
*h_reduce_action(HArena *arena, const HLRItem *item); +HLRAction *h_shift_action(HArena *arena, size_t nextstate); + +bool h_eq_symbol(const void *p, const void *q); +bool h_eq_lr_itemset(const void *p, const void *q); +bool h_eq_transition(const void *p, const void *q); +HHashValue h_hash_symbol(const void *p); +HHashValue h_hash_lr_itemset(const void *p); +HHashValue h_hash_transition(const void *p); + +HLRDFA *h_lr0_dfa(HCFGrammar *g); +HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa); +int h_lrtable_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action); + +int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params); +void h_lalr_free(HParser *parser); + +const HLRAction *h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol); +void h_lrengine_step(HLREngine *engine, HInputStream *stream); +HParseResult *h_lrengine_result(HLREngine *engine); +HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); + +void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item); +void h_pprint_lrstate(FILE *f, const HCFGrammar *g, + const HLRState *state, unsigned int indent); +void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, + const HLRDFA *dfa, unsigned int indent); +void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, + unsigned int indent); + +#endif diff --git a/src/backends/lr0.c b/src/backends/lr0.c new file mode 100644 index 0000000..aab2ad1 --- /dev/null +++ b/src/backends/lr0.c @@ -0,0 +1,205 @@ +#include +#include "lr.h" + + + +/* Constructing the characteristic automaton (handle recognizer) */ + +static HLRItem *advance_mark(HArena *arena, const HLRItem *item) +{ + assert(item->rhs[item->mark] != NULL); + HLRItem *ret = h_arena_malloc(arena, sizeof(HLRItem)); + *ret = *item; + ret->mark++; + return ret; +} + +static void expand_to_closure(HCFGrammar *g, HHashSet *items) +{ + HAllocator *mm__ = g->mm__; + HArena *arena = g->arena; + HSlist *work = 
h_slist_new(arena); + + // initialize work list with items + H_FOREACH_KEY(items, HLRItem *item) + h_slist_push(work, (void *)item); + H_END_FOREACH + + while(!h_slist_empty(work)) { + const HLRItem *item = h_slist_pop(work); + HCFChoice *sym = item->rhs[item->mark]; // symbol after mark + + // if there is a non-terminal after the mark, follow it + // NB: unlike LLk, we do consider HCF_CHARSET a non-terminal here + if(sym != NULL && (sym->type==HCF_CHOICE || sym->type==HCF_CHARSET)) { + // add items corresponding to the productions of sym + if(sym->type == HCF_CHOICE) { + for(HCFSequence **p=sym->seq; *p; p++) { + HLRItem *it = h_lritem_new(arena, sym, (*p)->items, 0); + if(!h_hashset_present(items, it)) { + h_hashset_put(items, it); + h_slist_push(work, it); + } + } + } else { // HCF_CHARSET + for(unsigned int i=0; i<256; i++) { + if(charset_isset(sym->charset, i)) { + // XXX allocate these single-character symbols statically somewhere + HCFChoice **rhs = h_new(HCFChoice *, 2); + rhs[0] = h_new(HCFChoice, 1); + rhs[0]->type = HCF_CHAR; + rhs[0]->chr = i; + rhs[1] = NULL; + HLRItem *it = h_lritem_new(arena, sym, rhs, 0); + h_hashset_put(items, it); + // single-character item needs no further work + } + } + // if sym is a non-terminal, we need a reshape on it + // this seems as good a place as any to set it + sym->reshape = h_act_first; + } + } + } +} + +HLRDFA *h_lr0_dfa(HCFGrammar *g) +{ + HArena *arena = g->arena; + + HHashSet *states = h_hashset_new(arena, h_eq_lr_itemset, h_hash_lr_itemset); + // maps itemsets to assigned array indices + HSlist *transitions = h_slist_new(arena); + + // list of states that need to be processed + // to save lookups, we push two elements per state, the itemset and its + // assigned index. 
+ HSlist *work = h_slist_new(arena); + + // make initial state (kernel) + HLRState *start = h_lrstate_new(arena); + assert(g->start->type == HCF_CHOICE); + for(HCFSequence **p=g->start->seq; *p; p++) + h_hashset_put(start, h_lritem_new(arena, g->start, (*p)->items, 0)); + expand_to_closure(g, start); + h_hashtable_put(states, start, 0); + h_slist_push(work, start); + h_slist_push(work, 0); + + // while work to do (on some state) + // determine edge symbols + // for each edge symbol: + // advance respective items -> destination state (kernel) + // compute closure + // if destination is a new state: + // add it to state set + // add transition to it + // add it to the work list + + while(!h_slist_empty(work)) { + size_t state_idx = (uintptr_t)h_slist_pop(work); + HLRState *state = h_slist_pop(work); + + // maps edge symbols to neighbor states (item sets) of s + HHashTable *neighbors = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol); + + // iterate over state (closure) and generate neighboring sets + H_FOREACH_KEY(state, HLRItem *item) + HCFChoice *sym = item->rhs[item->mark]; // symbol after mark + + if(sym != NULL) { // mark was not at the end + // find or create prospective neighbor set + HLRState *neighbor = h_hashtable_get(neighbors, sym); + if(neighbor == NULL) { + neighbor = h_lrstate_new(arena); + h_hashtable_put(neighbors, sym, neighbor); + } + + // ...and add the advanced item to it + h_hashset_put(neighbor, advance_mark(arena, item)); + } + H_END_FOREACH + + // merge expanded neighbor sets into the set of existing states + H_FOREACH(neighbors, HCFChoice *symbol, HLRState *neighbor) + expand_to_closure(g, neighbor); + + // look up existing state, allocate new if not found + size_t neighbor_idx; + if(!h_hashset_present(states, neighbor)) { + neighbor_idx = states->used; + h_hashtable_put(states, neighbor, (void *)(uintptr_t)neighbor_idx); + h_slist_push(work, neighbor); + h_slist_push(work, (void *)(uintptr_t)neighbor_idx); + } else { + neighbor_idx = 
(uintptr_t)h_hashtable_get(states, neighbor); + } + + // add transition "state --symbol--> neighbor" + HLRTransition *t = h_arena_malloc(arena, sizeof(HLRTransition)); + t->from = state_idx; + t->to = neighbor_idx; + t->symbol = symbol; + h_slist_push(transitions, t); + H_END_FOREACH + } // end while(work) + + // fill DFA struct + HLRDFA *dfa = h_arena_malloc(arena, sizeof(HLRDFA)); + dfa->nstates = states->used; + dfa->states = h_arena_malloc(arena, dfa->nstates*sizeof(HLRState *)); + H_FOREACH(states, HLRState *state, void *v) + size_t idx = (uintptr_t)v; + dfa->states[idx] = state; + H_END_FOREACH + dfa->transitions = transitions; + + return dfa; +} + + + +/* LR(0) table generation */ + +HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) +{ + HAllocator *mm__ = g->mm__; + + HLRTable *table = h_lrtable_new(mm__, dfa->nstates); + HArena *arena = table->arena; + + // remember start symbol + table->start = g->start; + + // add shift entries + for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { + // for each transition x-A->y, add "shift, goto y" to table entry (x,A) + HLRTransition *t = x->elem; + + HLRAction *action = h_shift_action(arena, t->to); + h_hashtable_put(table->rows[t->from], t->symbol, action); + } + + // add reduce entries, record inadequate states + for(size_t i=0; instates; i++) { + // find reducible items in state + H_FOREACH_KEY(dfa->states[i], HLRItem *item) + if(item->mark == item->len) { // mark at the end + // check for conflicts + // XXX store more informative stuff in the inadeq records? 
+ if(table->forall[i]) { + // reduce/reduce conflict with a previous item + h_slist_push(table->inadeq, (void *)(uintptr_t)i); + } else if(!h_hashtable_empty(table->rows[i])) { + // shift/reduce conflict with one of the row's entries + h_slist_push(table->inadeq, (void *)(uintptr_t)i); + } + + // set reduce action for the entire row + table->forall[i] = h_reduce_action(arena, item); + } + H_END_FOREACH + } + + return table; +} From 409d33c91642abe327823a0792cda2fa65d978a9 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 19 Jun 2013 14:09:39 +0200 Subject: [PATCH 69/95] split out h_lrengine_action() --- src/backends/lr.c | 22 ++++++++++++++-------- src/backends/lr.h | 8 +++++++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/backends/lr.c b/src/backends/lr.c index c3062d5..0ad4569 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -190,18 +190,12 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table) return engine; } -void h_lrengine_step(HLREngine *engine, HInputStream *stream) +const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream) { - // short-hand names - HSlist *left = engine->left; HSlist *right = engine->right; HArena *arena = engine->arena; HArena *tarena = engine->tarena; - // stack layout: - // on the left stack, we put pairs: (saved state, semantic value) - // on the right stack, we put pairs: (symbol, semantic value) - // make sure there is input on the right stack if(h_slist_empty(right)) { // XXX use statically-allocated terminal symbols @@ -230,6 +224,18 @@ void h_lrengine_step(HLREngine *engine, HInputStream *stream) // table lookup const HLRAction *action = h_lr_lookup(engine->table, engine->state, symbol); + + return action; +} + +void h_lrengine_step(HLREngine *engine, const HLRAction *action) +{ + // short-hand names + HSlist *left = engine->left; + HSlist *right = engine->right; + HArena *arena = engine->arena; + HArena *tarena = engine->tarena; + if(action 
== NULL) { // no handle recognizable in input, terminate engine->running = false; @@ -313,7 +319,7 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* // run while the recognizer finds handles in the input while(engine->running) - h_lrengine_step(engine, stream); + h_lrengine_step(engine, h_lrengine_action(engine, stream)); HParseResult *result = h_lrengine_result(engine); if(!result) diff --git a/src/backends/lr.h b/src/backends/lr.h index afd4042..9312237 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -61,8 +61,13 @@ typedef struct HLREnhGrammar_ { typedef struct HLREngine_ { const HLRTable *table; + + // stack layout: + // on the left stack, we put pairs: (saved state, semantic value) + // on the right stack, we put pairs: (symbol, semantic value) HSlist *left; // left stack; reductions happen here HSlist *right; // right stack; input appears here + size_t state; bool running; HArena *arena; // will hold the results @@ -116,7 +121,8 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params); void h_lalr_free(HParser *parser); const HLRAction *h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol); -void h_lrengine_step(HLREngine *engine, HInputStream *stream); +const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream); +void h_lrengine_step(HLREngine *engine, const HLRAction *action); HParseResult *h_lrengine_result(HLREngine *engine); HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); From 168760b10a6234259cd80b1c455fbf95d76d10ba Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Wed, 19 Jun 2013 14:16:34 +0200 Subject: [PATCH 70/95] return running state from h_lrengine_step --- src/backends/lr.c | 25 ++++++++++--------------- src/backends/lr.h | 3 +-- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/backends/lr.c b/src/backends/lr.c index 0ad4569..2d329b1 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -183,7 +183,6 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table) engine->left = h_slist_new(tarena); engine->right = h_slist_new(tarena); engine->state = 0; - engine->running = 1; engine->arena = arena; engine->tarena = tarena; @@ -228,7 +227,8 @@ const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream) return action; } -void h_lrengine_step(HLREngine *engine, const HLRAction *action) +// run LR parser for one round; returns false when finished +bool h_lrengine_step(HLREngine *engine, const HLRAction *action) { // short-hand names HSlist *left = engine->left; @@ -236,11 +236,8 @@ void h_lrengine_step(HLREngine *engine, const HLRAction *action) HArena *arena = engine->arena; HArena *tarena = engine->tarena; - if(action == NULL) { - // no handle recognizable in input, terminate - engine->running = false; - return; - } + if(action == NULL) + return false; // no handle recognizable in input, terminate if(action->type == HLR_SHIFT) { h_slist_push(left, (void *)(uintptr_t)engine->state); @@ -280,11 +277,8 @@ void h_lrengine_step(HLREngine *engine, const HLRAction *action) value = (HParsedToken *)symbol->reshape(make_result(arena, value)); // call validation and semantic action, if present - if(symbol->pred && !symbol->pred(make_result(tarena, value))) { - // validation failed -> no parse; terminate - engine->running = false; - return; - } + if(symbol->pred && !symbol->pred(make_result(tarena, value))) + return false; // validation failed -> no parse; terminate if(symbol->action) value = (HParsedToken *)symbol->action(make_result(arena, value)); @@ 
-292,6 +286,8 @@ void h_lrengine_step(HLREngine *engine, const HLRAction *action) h_slist_push(right, value); h_slist_push(right, symbol); } + + return true; } HParseResult *h_lrengine_result(HLREngine *engine) @@ -317,9 +313,8 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse HLREngine *engine = h_lrengine_new(arena, tarena, table); - // run while the recognizer finds handles in the input - while(engine->running) - h_lrengine_step(engine, h_lrengine_action(engine, stream)); + // iterate engine to completion + while(h_lrengine_step(engine, h_lrengine_action(engine, stream))); HParseResult *result = h_lrengine_result(engine); if(!result) diff --git a/src/backends/lr.h b/src/backends/lr.h index 9312237..b95c133 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -69,7 +69,6 @@ typedef struct HLREngine_ { HSlist *right; // right stack; input appears here size_t state; - bool running; HArena *arena; // will hold the results HArena *tarena; // tmp, deleted after parse } HLREngine; @@ -122,7 +121,7 @@ void h_lalr_free(HParser *parser); const HLRAction *h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol); const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream); -void h_lrengine_step(HLREngine *engine, const HLRAction *action); +bool h_lrengine_step(HLREngine *engine, const HLRAction *action); HParseResult *h_lrengine_result(HLREngine *engine); HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); From 54ba62bfb7e77db43df7f50dbdfc9640bc10ebb2 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Wed, 19 Jun 2013 17:01:13 +0200 Subject: [PATCH 71/95] record conflicts in a special HLRAction type --- src/backends/lalr.c | 13 ++++++++++--- src/backends/lr.c | 21 +++++++++++++++++++++ src/backends/lr.h | 13 ++++++++++--- src/backends/lr0.c | 24 +++++++++++++++--------- 4 files changed, 56 insertions(+), 15 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index fa67e5a..698b106 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -138,7 +138,8 @@ int h_lrtable_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action) HLRAction *prev = h_hashtable_get(tbl->rows[state], x); if(prev && prev != action) { // conflict - h_slist_push(tbl->inadeq, (void *)(uintptr_t)state); + action = h_lr_conflict(tbl->arena, prev, action); + h_hashtable_put(tbl->rows[state], x, action); return -1; } else { h_hashtable_put(tbl->rows[state], x, action); @@ -221,6 +222,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) for(HSlistNode *x=inadeq->head; x; x=x->next) { size_t state = (uintptr_t)x->elem; + bool inadeq = false; // clear old forall entry, it's being replaced by more fine-grained ones table->forall[state] = NULL; @@ -255,7 +257,8 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) if(fs->end_branch) { HCFChoice *terminal = h_arena_malloc(arena, sizeof(HCFChoice)); terminal->type = HCF_END; - h_lrtable_put(table, state, terminal, action); + if(h_lrtable_put(table, state, terminal, action) < 0) + inadeq = true; } H_FOREACH(fs->char_branches, void *key, HStringMap *m) if(!m->epsilon_branch) @@ -265,10 +268,14 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) terminal->type = HCF_CHAR; terminal->chr = key_char((HCharKey)key); - h_lrtable_put(table, state, terminal, action); + if(h_lrtable_put(table, state, terminal, action) < 0) + inadeq = true; H_END_FOREACH // lookahead character } H_END_FOREACH // enhanced production H_END_FOREACH // reducible item 
+ + if(inadeq) + h_slist_push(table->inadeq, (void *)(uintptr_t)state); } } diff --git a/src/backends/lr.c b/src/backends/lr.c index 2d329b1..bf06645 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -159,6 +159,25 @@ HLRAction *h_reduce_action(HArena *arena, const HLRItem *item) return action; } +// adds 'new' to the branches of 'action' +// returns a 'action' if it is already of type HLR_CONFLICT +// allocates a new HLRAction otherwise +HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new) +{ + if(action->type != HLR_CONFLICT) { + HLRAction *old = action; + action = h_arena_malloc(arena, sizeof(HLRAction)); + action->type = HLR_CONFLICT; + action->branches = h_slist_new(arena); + h_slist_push(action->branches, old); + } + + assert(action->type == HLR_CONFLICT); + h_slist_push(action->branches, new); + + return action; +} + /* LR driver */ @@ -239,6 +258,8 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) if(action == NULL) return false; // no handle recognizable in input, terminate + assert(action->type == HLR_SHIFT || action->type == HLR_REDUCE); + if(action->type == HLR_SHIFT) { h_slist_push(left, (void *)(uintptr_t)engine->state); h_slist_pop(right); // symbol (discard) diff --git a/src/backends/lr.h b/src/backends/lr.h index b95c133..13e10d4 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -28,16 +28,22 @@ typedef struct HLRItem_ { } HLRItem; typedef struct HLRAction_ { - enum {HLR_SHIFT, HLR_REDUCE} type; + enum {HLR_SHIFT, HLR_REDUCE, HLR_CONFLICT} type; union { - size_t nextstate; // used with SHIFT + // used with HLR_SHIFT + size_t nextstate; + + // used with HLR_REDUCE struct { HCFChoice *lhs; // symbol carrying semantic actions etc. 
size_t length; // # of symbols in rhs #ifndef NDEBUG HCFChoice **rhs; // NB: the rhs symbols are not needed for the parse #endif - } production; // used with REDUCE + } production; + + // used with HLR_CONFLICT + HSlist *branches; // list of possible HLRActions }; } HLRAction; @@ -104,6 +110,7 @@ void h_lrtable_free(HLRTable *table); HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table); HLRAction *h_reduce_action(HArena *arena, const HLRItem *item); HLRAction *h_shift_action(HArena *arena, size_t nextstate); +HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new); bool h_eq_symbol(const void *p, const void *q); bool h_eq_lr_itemset(const void *p, const void *q); diff --git a/src/backends/lr0.c b/src/backends/lr0.c index aab2ad1..67cf2aa 100644 --- a/src/backends/lr0.c +++ b/src/backends/lr0.c @@ -182,23 +182,29 @@ HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) // add reduce entries, record inadequate states for(size_t i=0; instates; i++) { + bool inadeq = false; + // find reducible items in state H_FOREACH_KEY(dfa->states[i], HLRItem *item) if(item->mark == item->len) { // mark at the end - // check for conflicts - // XXX store more informative stuff in the inadeq records? 
+ HLRAction *reduce = h_reduce_action(arena, item); + + // check for reduce/reduce conflict on forall if(table->forall[i]) { - // reduce/reduce conflict with a previous item - h_slist_push(table->inadeq, (void *)(uintptr_t)i); - } else if(!h_hashtable_empty(table->rows[i])) { - // shift/reduce conflict with one of the row's entries - h_slist_push(table->inadeq, (void *)(uintptr_t)i); + reduce = h_lr_conflict(arena, table->forall[i], reduce); + inadeq = true; } + table->forall[i] = reduce; - // set reduce action for the entire row - table->forall[i] = h_reduce_action(arena, item); + // check for shift/reduce conflict with other entries + // NOTE: these are not recorded as HLR_CONFLICTs at this point + if(!h_hashtable_empty(table->rows[i])) + inadeq = true; } H_END_FOREACH + + if(inadeq) + h_slist_push(table->inadeq, (void *)(uintptr_t)i); } return table; From 55c9a3d9c51fb752e89c85f49987a124c35acc88 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 19 Jun 2013 17:20:53 +0200 Subject: [PATCH 72/95] add stub GLR backend with h_glr_parse() a copy of h_lr_parse() --- src/Makefile | 3 +- src/backends/glr.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++ src/backends/lr.h | 1 + src/hammer.c | 1 + src/hammer.h | 4 +- src/internal.h | 1 + src/t_parser.c | 38 +++++++++++++++++++ 7 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 src/backends/glr.c diff --git a/src/Makefile b/src/Makefile index 380436a..9ce6d9f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -27,9 +27,10 @@ PARSERS := \ BACKENDS := \ packrat \ + regex \ llk \ lalr \ - regex + glr HAMMER_PARTS := \ bitreader.o \ diff --git a/src/backends/glr.c b/src/backends/glr.c new file mode 100644 index 0000000..c57ffd6 --- /dev/null +++ b/src/backends/glr.c @@ -0,0 +1,93 @@ +#include "lr.h" + + + +/* GLR driver */ + +HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) +{ + HLRTable *table = parser->backend_data; + if(!table) + return NULL; + + HArena 
*arena = h_new_arena(mm__, 0); // will hold the results + HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse + HLREngine *engine = h_lrengine_new(arena, tarena, table); + + // iterate engine to completion + while(h_lrengine_step(engine, h_lrengine_action(engine, stream))); + + HParseResult *result = h_lrengine_result(engine); + if(!result) + h_delete_arena(arena); + h_delete_arena(tarena); + return result; +} + + + +HParserBackendVTable h__glr_backend_vtable = { + .compile = h_lalr_compile, + .parse = h_glr_parse, + .free = h_lalr_free +}; + + + + +// dummy! +int test_glr(void) +{ + /* + E -> E '-' T + | T + T -> '(' E ')' + | 'n' -- also try [0-9] for the charset paths + */ + + HParser *n = h_ch('n'); + HParser *E = h_indirect(); + HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), n, NULL); + HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL); + h_bind_indirect(E, E_); + HParser *p = E; + + printf("\n==== G R A M M A R ====\n"); + HCFGrammar *g = h_cfgrammar(&system_allocator, p); + if(g == NULL) { + fprintf(stderr, "h_cfgrammar failed\n"); + return 1; + } + h_pprint_grammar(stdout, g, 0); + + printf("\n==== D F A ====\n"); + HLRDFA *dfa = h_lr0_dfa(g); + if(dfa) + h_pprint_lrdfa(stdout, g, dfa, 0); + else + fprintf(stderr, "h_lalr_dfa failed\n"); + + printf("\n==== L R ( 0 ) T A B L E ====\n"); + HLRTable *table0 = h_lr0_table(g, dfa); + if(table0) + h_pprint_lrtable(stdout, g, table0, 0); + else + fprintf(stderr, "h_lr0_table failed\n"); + h_lrtable_free(table0); + + printf("\n==== L A L R T A B L E ====\n"); + if(h_compile(p, PB_GLR, NULL)) { + fprintf(stderr, "does not compile\n"); + return 2; + } + h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); + + printf("\n==== P A R S E R E S U L T ====\n"); + HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13); + if(res) + h_pprint(stdout, res->ast, 0, 2); + else + printf("no parse\n"); + + return 0; +} diff --git a/src/backends/lr.h 
b/src/backends/lr.h index 13e10d4..f766d5b 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -131,6 +131,7 @@ const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream); bool h_lrengine_step(HLREngine *engine, const HLRAction *action); HParseResult *h_lrengine_result(HLREngine *engine); HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); +HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item); void h_pprint_lrstate(FILE *f, const HCFGrammar *g, diff --git a/src/hammer.c b/src/hammer.c index 7d5b4e9..7fc80db 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -31,6 +31,7 @@ static HParserBackendVTable *backends[PB_MAX + 1] = { &h__regex_backend_vtable, &h__llk_backend_vtable, &h__lalr_backend_vtable, + &h__glr_backend_vtable, }; diff --git a/src/hammer.h b/src/hammer.h index a5ebcff..67fb8e4 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -37,8 +37,8 @@ typedef enum HParserBackend_ { PB_REGULAR, PB_LLk, PB_LALR, - PB_GLR, // Not Implemented - PB_MAX = PB_LALR + PB_GLR, + PB_MAX = PB_GLR } HParserBackend; typedef enum HTokenType_ { diff --git a/src/internal.h b/src/internal.h index 2f3018d..d8b221a 100644 --- a/src/internal.h +++ b/src/internal.h @@ -220,6 +220,7 @@ struct HBitWriter_ { extern HParserBackendVTable h__packrat_backend_vtable; extern HParserBackendVTable h__llk_backend_vtable; extern HParserBackendVTable h__lalr_backend_vtable; +extern HParserBackendVTable h__glr_backend_vtable; // }}} // TODO(thequux): Set symbol visibility for these functions so that they aren't exported. 
diff --git a/src/t_parser.c b/src/t_parser.c index a0e4040..57486cd 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -585,4 +585,42 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/lalr/ignore", GINT_TO_POINTER(PB_LALR), test_ignore); g_test_add_data_func("/core/parser/lalr/leftrec", GINT_TO_POINTER(PB_LALR), test_leftrec); g_test_add_data_func("/core/parser/lalr/rightrec", GINT_TO_POINTER(PB_LALR), test_rightrec); + + g_test_add_data_func("/core/parser/glr/token", GINT_TO_POINTER(PB_GLR), test_token); + g_test_add_data_func("/core/parser/glr/ch", GINT_TO_POINTER(PB_GLR), test_ch); + g_test_add_data_func("/core/parser/glr/ch_range", GINT_TO_POINTER(PB_GLR), test_ch_range); + g_test_add_data_func("/core/parser/glr/int64", GINT_TO_POINTER(PB_GLR), test_int64); + g_test_add_data_func("/core/parser/glr/int32", GINT_TO_POINTER(PB_GLR), test_int32); + g_test_add_data_func("/core/parser/glr/int16", GINT_TO_POINTER(PB_GLR), test_int16); + g_test_add_data_func("/core/parser/glr/int8", GINT_TO_POINTER(PB_GLR), test_int8); + g_test_add_data_func("/core/parser/glr/uint64", GINT_TO_POINTER(PB_GLR), test_uint64); + g_test_add_data_func("/core/parser/glr/uint32", GINT_TO_POINTER(PB_GLR), test_uint32); + g_test_add_data_func("/core/parser/glr/uint16", GINT_TO_POINTER(PB_GLR), test_uint16); + g_test_add_data_func("/core/parser/glr/uint8", GINT_TO_POINTER(PB_GLR), test_uint8); + g_test_add_data_func("/core/parser/glr/int_range", GINT_TO_POINTER(PB_GLR), test_int_range); +#if 0 + g_test_add_data_func("/core/parser/glr/float64", GINT_TO_POINTER(PB_GLR), test_float64); + g_test_add_data_func("/core/parser/glr/float32", GINT_TO_POINTER(PB_GLR), test_float32); +#endif + g_test_add_data_func("/core/parser/glr/whitespace", GINT_TO_POINTER(PB_GLR), test_whitespace); + g_test_add_data_func("/core/parser/glr/left", GINT_TO_POINTER(PB_GLR), test_left); + g_test_add_data_func("/core/parser/glr/right", GINT_TO_POINTER(PB_GLR), test_right); + 
g_test_add_data_func("/core/parser/glr/middle", GINT_TO_POINTER(PB_GLR), test_middle); + g_test_add_data_func("/core/parser/glr/action", GINT_TO_POINTER(PB_GLR), test_action); + g_test_add_data_func("/core/parser/glr/in", GINT_TO_POINTER(PB_GLR), test_in); + g_test_add_data_func("/core/parser/glr/not_in", GINT_TO_POINTER(PB_GLR), test_not_in); + g_test_add_data_func("/core/parser/glr/end_p", GINT_TO_POINTER(PB_GLR), test_end_p); + g_test_add_data_func("/core/parser/glr/nothing_p", GINT_TO_POINTER(PB_GLR), test_nothing_p); + g_test_add_data_func("/core/parser/glr/sequence", GINT_TO_POINTER(PB_GLR), test_sequence); + g_test_add_data_func("/core/parser/glr/choice", GINT_TO_POINTER(PB_GLR), test_choice); + g_test_add_data_func("/core/parser/glr/many", GINT_TO_POINTER(PB_GLR), test_many); + g_test_add_data_func("/core/parser/glr/many1", GINT_TO_POINTER(PB_GLR), test_many1); + g_test_add_data_func("/core/parser/glr/optional", GINT_TO_POINTER(PB_GLR), test_optional); + g_test_add_data_func("/core/parser/glr/sepBy", GINT_TO_POINTER(PB_GLR), test_sepBy); + g_test_add_data_func("/core/parser/glr/sepBy1", GINT_TO_POINTER(PB_GLR), test_sepBy1); + g_test_add_data_func("/core/parser/glr/epsilon_p", GINT_TO_POINTER(PB_GLR), test_epsilon_p); + g_test_add_data_func("/core/parser/glr/attr_bool", GINT_TO_POINTER(PB_GLR), test_attr_bool); + g_test_add_data_func("/core/parser/glr/ignore", GINT_TO_POINTER(PB_GLR), test_ignore); + g_test_add_data_func("/core/parser/glr/leftrec", GINT_TO_POINTER(PB_GLR), test_leftrec); + g_test_add_data_func("/core/parser/glr/rightrec", GINT_TO_POINTER(PB_GLR), test_rightrec); } From ecfc0a8e62a11df893282fc8ff01ece3436551a8 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Wed, 19 Jun 2013 18:21:53 +0200 Subject: [PATCH 73/95] fix conflict pretty-printing --- src/backends/lr.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/backends/lr.c b/src/backends/lr.c index bf06645..392d3f6 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -425,9 +425,11 @@ void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action) { - if(action->type == HLR_SHIFT) { + switch(action->type) { + case HLR_SHIFT: fprintf(f, "s%lu", action->nextstate); - } else { + break; + case HLR_REDUCE: fputs("r(", f); h_pprint_symbol(f, g, action->production.lhs); fputs(" -> ", f); @@ -439,6 +441,18 @@ void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action) h_pprint_sequence(f, g, &seq); #endif fputc(')', f); + break; + case HLR_CONFLICT: + fputc('!', f); + for(HSlistNode *x=action->branches->head; x; x=x->next) { + HLRAction *branch = x->elem; + assert(branch->type != HLR_CONFLICT); // no nesting + pprint_lraction(f, g, branch); + if(x->next) fputc('/', f); // separator + } + break; + default: + assert_message(0, "not reached"); } } @@ -459,13 +473,7 @@ void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, fputc(' ', f); // separator h_pprint_symbol(f, g, symbol); fputc(':', f); - if(table->forall[i]) { - fputc(action->type == HLR_SHIFT? 's' : 'r', f); - fputc('/', f); - fputc(table->forall[i]->type == HLR_SHIFT? 's' : 'r', f); - } else { - pprint_lraction(f, g, action); - } + pprint_lraction(f, g, action); H_END_FOREACH fputc('\n', f); } From 572f1c8f9e14e521317c54045ee22639d65b5ffe Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Wed, 19 Jun 2013 18:22:19 +0200 Subject: [PATCH 74/95] expand stub GLR backend --- src/backends/glr.c | 60 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index c57ffd6..429d06b 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -1,6 +1,25 @@ #include "lr.h" +/* GLR compilation (LALR w/o failing on conflict) */ + +int h_glr_compile(HAllocator* mm__, HParser* parser, const void* params) +{ + int result = h_lalr_compile(mm__, parser, params); + + if(result == -1 && parser->backend_data) { + // table is there, just has conflicts? nevermind, that's okay. + result = 0; + } + + return result; +} + +void h_glr_free(HParser *parser) +{ + h_lalr_free(parser); +} + /* GLR driver */ @@ -12,12 +31,28 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HArena *arena = h_new_arena(mm__, 0); // will hold the results HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse - HLREngine *engine = h_lrengine_new(arena, tarena, table); - // iterate engine to completion - while(h_lrengine_step(engine, h_lrengine_action(engine, stream))); + HSlist *engines = h_slist_new(tarena); + h_slist_push(engines, h_lrengine_new(arena, tarena, table)); + + HParseResult *result = NULL; + while(result == NULL && !h_slist_empty(engines)) { + for(HSlistNode **x = &engines->head; *x; ) { + HLREngine *engine = (*x)->elem; + + const HLRAction *action = h_lrengine_action(engine, stream); + // XXX handle conflicts -> fork engine + bool running = h_lrengine_step(engine, action); + + if(running) { + x = &(*x)->next; // go to next + } else { + *x = (*x)->next; // remove from list + result = h_lrengine_result(engine); + } + } + } - HParseResult *result = h_lrengine_result(engine); if(!result) h_delete_arena(arena); h_delete_arena(tarena); @@ -27,9 +62,9 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* 
HParserBackendVTable h__glr_backend_vtable = { - .compile = h_lalr_compile, + .compile = h_glr_compile, .parse = h_glr_parse, - .free = h_lalr_free + .free = h_glr_free }; @@ -39,16 +74,13 @@ HParserBackendVTable h__glr_backend_vtable = { int test_glr(void) { /* - E -> E '-' T - | T - T -> '(' E ')' - | 'n' -- also try [0-9] for the charset paths + E -> E '+' E + | 'd' */ - HParser *n = h_ch('n'); + HParser *d = h_ch('d'); HParser *E = h_indirect(); - HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), n, NULL); - HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL); + HParser *E_ = h_choice(h_sequence(E, h_ch('+'), E, NULL), d, NULL); h_bind_indirect(E, E_); HParser *p = E; @@ -83,7 +115,7 @@ int test_glr(void) h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); printf("\n==== P A R S E R E S U L T ====\n"); - HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13); + HParseResult *res = h_parse(p, (uint8_t *)"d+d+d", 5); if(res) h_pprint(stdout, res->ast, 0, 2); else From 4f36fcd2c105008014364131c48f7001597bdcf7 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Wed, 19 Jun 2013 18:51:16 +0200 Subject: [PATCH 75/95] avoid duplicate conflict branches --- src/backends/lr.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/backends/lr.c b/src/backends/lr.c index 392d3f6..4ab53bc 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -170,11 +170,19 @@ HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new) action->type = HLR_CONFLICT; action->branches = h_slist_new(arena); h_slist_push(action->branches, old); + h_slist_push(action->branches, new); + } else { + // check if 'new' is already among branches + HSlistNode *x; + for(x=action->branches->head; x; x=x->next) { + if(x->elem == new) + break; + } + // add 'new' if it is not already in list + if(x == NULL) + h_slist_push(action->branches, new); } - assert(action->type == HLR_CONFLICT); - h_slist_push(action->branches, new); - return action; } From b1e8e297740256e44233837c6aff309369608c23 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 20 Jun 2013 11:05:57 +0200 Subject: [PATCH 76/95] add engine forking --- src/backends/glr.c | 55 ++++++++++++++++++++++++++++++++++++++++++-- src/backends/lr.c | 12 +++++----- src/backends/lr.h | 2 +- src/datastructures.c | 10 ++++++++ src/internal.h | 1 + 5 files changed, 71 insertions(+), 9 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index 429d06b..2b4b974 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -1,3 +1,4 @@ +#include #include "lr.h" @@ -23,6 +24,26 @@ void h_glr_free(HParser *parser) /* GLR driver */ +HLREngine *fork_engine(const HLREngine *engine) +{ + HLREngine *eng2 = h_arena_malloc(engine->tarena, sizeof(HLREngine)); + eng2->table = engine->table; + eng2->state = engine->state; + + // shallow-copy the stacks + // this works because h_slist_push and h_slist_pop never modify + // the underlying structure of HSlistNodes, only the head pointer. + // in fact, this gives us prefix sharing for free. 
+ eng2->left = h_arena_malloc(engine->tarena, sizeof(HSlist)); + eng2->right = h_arena_malloc(engine->tarena, sizeof(HSlist)); + *eng2->left = *engine->left; + *eng2->right = *engine->right; + + eng2->arena = engine->arena; + eng2->tarena = engine->tarena; + return eng2; +} + HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) { HLRTable *table = parser->backend_data; @@ -41,14 +62,44 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HLREngine *engine = (*x)->elem; const HLRAction *action = h_lrengine_action(engine, stream); - // XXX handle conflicts -> fork engine + + // fork engine on conflicts + if(action && action->type == HLR_CONFLICT) { + const HSlist *branches = action->branches; + + // there should be at least two conflicting actions + assert(branches->head); + assert(branches->head->next); + + // save first action for use with old engine below + action = branches->head->elem; + + // fork a new engine for all the other actions + for(HSlistNode *x=branches->head->next; x; x=x->next) { + HLRAction *act = x->elem; + HLREngine *eng = fork_engine(engine); + + // perform one step; add engine to list if it wants to keep running + bool run = h_lrengine_step(eng, act); + if(run) { + h_slist_push(engines, eng); + } else { + HParseResult *res = h_lrengine_result(eng); + if(res) + result = res; + } + } + } + bool running = h_lrengine_step(engine, action); if(running) { x = &(*x)->next; // go to next } else { *x = (*x)->next; // remove from list - result = h_lrengine_result(engine); + HParseResult *res = h_lrengine_result(engine); + if(res) + result = res; } } } diff --git a/src/backends/lr.c b/src/backends/lr.c index 4ab53bc..1ea6a39 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -270,8 +270,8 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) if(action->type == HLR_SHIFT) { h_slist_push(left, (void *)(uintptr_t)engine->state); - h_slist_pop(right); // symbol (discard) - 
h_slist_push(left, h_slist_pop(right)); // semantic value + h_slist_drop(right); // symbol (discard) + h_slist_push(left, h_slist_drop(right)); // semantic value engine->state = action->nextstate; } else { assert(action->type == HLR_REDUCE); @@ -286,8 +286,8 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) // pull values off the left stack, rewinding state accordingly HParsedToken *v = NULL; for(size_t i=0; istate = (uintptr_t)h_slist_pop(left); + v = h_slist_drop(left); + engine->state = (uintptr_t)h_slist_drop(left); // collect values in result sequence value->seq->elements[len-1-i] = v; @@ -322,10 +322,10 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) HParseResult *h_lrengine_result(HLREngine *engine) { // parsing was successful iff the start symbol is on top of the right stack - if(h_slist_pop(engine->right) == engine->table->start) { + if(h_slist_drop(engine->right) == engine->table->start) { // next on the right stack is the start symbol's semantic value assert(!h_slist_empty(engine->right)); - HParsedToken *tok = h_slist_pop(engine->right); + HParsedToken *tok = h_slist_drop(engine->right); return make_result(engine->arena, tok); } else { return NULL; diff --git a/src/backends/lr.h b/src/backends/lr.h index f766d5b..edf0871 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -67,6 +67,7 @@ typedef struct HLREnhGrammar_ { typedef struct HLREngine_ { const HLRTable *table; + size_t state; // stack layout: // on the left stack, we put pairs: (saved state, semantic value) @@ -74,7 +75,6 @@ typedef struct HLREngine_ { HSlist *left; // left stack; reductions happen here HSlist *right; // right stack; input appears here - size_t state; HArena *arena; // will hold the results HArena *tarena; // tmp, deleted after parse } HLREngine; diff --git a/src/datastructures.c b/src/datastructures.c index 075b966..94bc901 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -62,6 +62,16 @@ HSlist* h_slist_copy(HSlist 
*slist) { return ret; } +// like h_slist_pop, but does not deallocate the head node +void* h_slist_drop(HSlist *slist) { + HSlistNode *head = slist->head; + if (!head) + return NULL; + void* ret = head->elem; + slist->head = head->next; + return ret; +} + void* h_slist_pop(HSlist *slist) { HSlistNode *head = slist->head; if (!head) diff --git a/src/internal.h b/src/internal.h index d8b221a..a897e9f 100644 --- a/src/internal.h +++ b/src/internal.h @@ -248,6 +248,7 @@ void h_carray_append(HCountedArray *array, void* item); HSlist* h_slist_new(HArena *arena); HSlist* h_slist_copy(HSlist *slist); void* h_slist_pop(HSlist *slist); +void* h_slist_drop(HSlist *slist); void h_slist_push(HSlist *slist, void* item); bool h_slist_find(HSlist *slist, const void* item); HSlist* h_slist_remove_all(HSlist *slist, const void* item); From f5d4ea90da9f4829714b5460bd0028e5851929e0 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 20 Jun 2013 11:30:30 +0200 Subject: [PATCH 77/95] add ambiguous test case for GLR --- src/t_parser.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/t_parser.c b/src/t_parser.c index 57486cd..7522e92 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -428,6 +428,17 @@ static void test_rightrec(gconstpointer backend) { g_check_parse_ok(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "aaa", 3, "(u0x61 (u0x61 (u0x61)))"); } +static void test_ambiguous(gconstpointer backend) { + HParser *d_ = h_ch('d'); + HParser *E_ = h_indirect(); + h_bind_indirect(E_, h_choice(h_sequence(E_, h_ch('+'), E_, NULL), d_, NULL)); + + g_check_parse_ok(E_, (HParserBackend)GPOINTER_TO_INT(backend), "d", 1, "u0x64"); + g_check_parse_ok(E_, (HParserBackend)GPOINTER_TO_INT(backend), "d+d", 3, "(u0x64 u0x2b u0x64)"); + g_check_parse_ok(E_, (HParserBackend)GPOINTER_TO_INT(backend), "d+d+d", 5, "(u0x64 u0x2b (u0x64 u0x2b u0x64))"); + g_check_parse_failed(E_, (HParserBackend)GPOINTER_TO_INT(backend), "d+", 2); +} + void register_parser_tests(void) { 
g_test_add_data_func("/core/parser/packrat/token", GINT_TO_POINTER(PB_PACKRAT), test_token); g_test_add_data_func("/core/parser/packrat/ch", GINT_TO_POINTER(PB_PACKRAT), test_ch); @@ -623,4 +634,5 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/ignore", GINT_TO_POINTER(PB_GLR), test_ignore); g_test_add_data_func("/core/parser/glr/leftrec", GINT_TO_POINTER(PB_GLR), test_leftrec); g_test_add_data_func("/core/parser/glr/rightrec", GINT_TO_POINTER(PB_GLR), test_rightrec); + g_test_add_data_func("/core/parser/glr/ambiguous", GINT_TO_POINTER(PB_GLR), test_ambiguous); } From bbbaf1634cead71bd3997e062213bb63bd3ce544 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 21 Jun 2013 15:00:04 +0200 Subject: [PATCH 78/95] add an assertion checking that HLR_REDUCE is always followed by HLR_SHIFT --- src/backends/lr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backends/lr.c b/src/backends/lr.c index 1ea6a39..2f7d5e4 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -314,6 +314,12 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) // push result (value, symbol) onto the right stack h_slist_push(right, value); h_slist_push(right, symbol); + + // this is LR, building a right-most derivation bottom-up, so no reduce can + // follow a reduce. we can also assume no conflict follows for GLR if we + // use LALR tables, because only terminal symbols (lookahead) get reduces. + const HLRAction *next = h_lr_lookup(engine->table, engine->state, symbol); + assert(next == NULL || next->type == HLR_SHIFT); } return true; From 7cd143c0c42779778eec755d4386ea752fb5f1f4 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 21 Jun 2013 15:06:31 +0200 Subject: [PATCH 79/95] piggy-back the next shift directly onto the reduce --- src/backends/lr.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/backends/lr.c b/src/backends/lr.c index 2f7d5e4..03264b5 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -268,12 +268,7 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) assert(action->type == HLR_SHIFT || action->type == HLR_REDUCE); - if(action->type == HLR_SHIFT) { - h_slist_push(left, (void *)(uintptr_t)engine->state); - h_slist_drop(right); // symbol (discard) - h_slist_push(left, h_slist_drop(right)); // semantic value - engine->state = action->nextstate; - } else { + if(action->type == HLR_REDUCE) { assert(action->type == HLR_REDUCE); size_t len = action->production.length; HCFChoice *symbol = action->production.lhs; @@ -318,8 +313,18 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) // this is LR, building a right-most derivation bottom-up, so no reduce can // follow a reduce. we can also assume no conflict follows for GLR if we // use LALR tables, because only terminal symbols (lookahead) get reduces. - const HLRAction *next = h_lr_lookup(engine->table, engine->state, symbol); - assert(next == NULL || next->type == HLR_SHIFT); + action = h_lr_lookup(engine->table, engine->state, symbol); + if(action == NULL) + return false; // no handle after reduce; terminate + assert(action->type == HLR_SHIFT); + } + + // this could be the original action, or a shift piggy-backed onto reduce + if(action->type == HLR_SHIFT) { + h_slist_push(left, (void *)(uintptr_t)engine->state); + h_slist_drop(right); // symbol (discard) + h_slist_push(left, h_slist_drop(right)); // semantic value + engine->state = action->nextstate; } return true; From bf3e3c162e2bd199effd17412813d8271dba40ee Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 21 Jun 2013 15:31:37 +0200 Subject: [PATCH 80/95] don't even touch the right stack on piggy-back shifts --- src/backends/lr.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/backends/lr.c b/src/backends/lr.c index 03264b5..33d597b 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -269,7 +269,6 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) assert(action->type == HLR_SHIFT || action->type == HLR_REDUCE); if(action->type == HLR_REDUCE) { - assert(action->type == HLR_REDUCE); size_t len = action->production.length; HCFChoice *symbol = action->production.lhs; @@ -306,21 +305,24 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) if(symbol->action) value = (HParsedToken *)symbol->action(make_result(arena, value)); - // push result (value, symbol) onto the right stack - h_slist_push(right, value); - h_slist_push(right, symbol); - // this is LR, building a right-most derivation bottom-up, so no reduce can // follow a reduce. we can also assume no conflict follows for GLR if we // use LALR tables, because only terminal symbols (lookahead) get reduces. 
- action = h_lr_lookup(engine->table, engine->state, symbol); - if(action == NULL) - return false; // no handle after reduce; terminate - assert(action->type == HLR_SHIFT); - } + const HLRAction *next = h_lr_lookup(engine->table, engine->state, symbol); + if(next) { + assert(next->type == HLR_SHIFT); - // this could be the original action, or a shift piggy-backed onto reduce - if(action->type == HLR_SHIFT) { + // piggy-back the shift onto here, never touching the right stack + h_slist_push(left, (void *)(uintptr_t)engine->state); + h_slist_push(left, value); + engine->state = next->nextstate; + } else { + // fallback + h_slist_push(right, value); + h_slist_push(right, symbol); + } + } else { + assert(action->type == HLR_SHIFT); h_slist_push(left, (void *)(uintptr_t)engine->state); h_slist_drop(right); // symbol (discard) h_slist_push(left, h_slist_drop(right)); // semantic value From ec88580b22866a65cf874c8b313ec2336eb0a061 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 21 Jun 2013 18:46:16 +0200 Subject: [PATCH 81/95] make h_lrengine_step a void function again --- src/backends/glr.c | 51 +++++++++++++++++++++++++++++----------------- src/backends/lr.c | 46 +++++++++++++++++++++++------------------ src/backends/lr.h | 3 ++- src/backends/lr0.c | 5 +++++ 4 files changed, 65 insertions(+), 40 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index 2b4b974..d460e8a 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -61,6 +61,20 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* for(HSlistNode **x = &engines->head; *x; ) { HLREngine *engine = (*x)->elem; + // check for terminated engines + if(engine->run) { + x = &(*x)->next; // advance x with no change + } else { + *x = (*x)->next; // advance x, removing the current element + + // check for parse success + HParseResult *res = h_lrengine_result(engine); + if(res) + result = res; + + continue; + } + const HLRAction *action = h_lrengine_action(engine, 
stream); // fork engine on conflicts @@ -79,28 +93,13 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HLRAction *act = x->elem; HLREngine *eng = fork_engine(engine); - // perform one step; add engine to list if it wants to keep running - bool run = h_lrengine_step(eng, act); - if(run) { - h_slist_push(engines, eng); - } else { - HParseResult *res = h_lrengine_result(eng); - if(res) - result = res; - } + // perform one step and add to list + h_lrengine_step(eng, act); + h_slist_push(engines, eng); } } - bool running = h_lrengine_step(engine, action); - - if(running) { - x = &(*x)->next; // go to next - } else { - *x = (*x)->next; // remove from list - HParseResult *res = h_lrengine_result(engine); - if(res) - result = res; - } + h_lrengine_step(engine, action); } } @@ -120,6 +119,20 @@ HParserBackendVTable h__glr_backend_vtable = { +// XXX TODO +// - eliminate right stack by always doing a shift after reduce +// (shift should always follow reduce because rightmost) +// - split tables into +// - one mapping input bytes to actions (shift or reduce or conflict) +// - one mapping reduced-to lhs nonterminals to shift states +// - can there still be conflicts here? +// - use HStringMap to represent lookahead sets and the "piggyback" table +// - implement engine merging +// - triggered when two enter the same state +// - old stacks (/engines?) saved +// - new common suffix stack created +// - when rewinding (during reduce), watch for empty stack -> demerge + // dummy! 
int test_glr(void) diff --git a/src/backends/lr.c b/src/backends/lr.c index 33d597b..2603ff2 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -207,9 +207,10 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table) HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine)); engine->table = table; + engine->state = 0; + engine->run = true; engine->left = h_slist_new(tarena); engine->right = h_slist_new(tarena); - engine->state = 0; engine->arena = arena; engine->tarena = tarena; @@ -255,7 +256,7 @@ const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream) } // run LR parser for one round; returns false when finished -bool h_lrengine_step(HLREngine *engine, const HLRAction *action) +static bool h_lrengine_step_(HLREngine *engine, const HLRAction *action) { // short-hand names HSlist *left = engine->left; @@ -308,19 +309,18 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) // this is LR, building a right-most derivation bottom-up, so no reduce can // follow a reduce. we can also assume no conflict follows for GLR if we // use LALR tables, because only terminal symbols (lookahead) get reduces. 
- const HLRAction *next = h_lr_lookup(engine->table, engine->state, symbol); - if(next) { - assert(next->type == HLR_SHIFT); + const HLRAction *shift = h_lr_lookup(engine->table, engine->state, symbol); + if(shift == NULL) + return false; // parse error + assert(shift->type == HLR_SHIFT); - // piggy-back the shift onto here, never touching the right stack - h_slist_push(left, (void *)(uintptr_t)engine->state); - h_slist_push(left, value); - engine->state = next->nextstate; - } else { - // fallback - h_slist_push(right, value); - h_slist_push(right, symbol); - } + // piggy-back the shift right here, never touching the input + h_slist_push(left, (void *)(uintptr_t)engine->state); + h_slist_push(left, value); + engine->state = shift->nextstate; + + if(symbol == engine->table->start) + return false; // reduced to start symbol; accept! } else { assert(action->type == HLR_SHIFT); h_slist_push(left, (void *)(uintptr_t)engine->state); @@ -332,13 +332,18 @@ bool h_lrengine_step(HLREngine *engine, const HLRAction *action) return true; } +// run LR parser for one round; sets engine->run +void h_lrengine_step(HLREngine *engine, const HLRAction *action) +{ + engine->run = h_lrengine_step_(engine, action); +} + HParseResult *h_lrengine_result(HLREngine *engine) { - // parsing was successful iff the start symbol is on top of the right stack - if(h_slist_drop(engine->right) == engine->table->start) { - // next on the right stack is the start symbol's semantic value - assert(!h_slist_empty(engine->right)); - HParsedToken *tok = h_slist_drop(engine->right); + // parsing was successful iff after a shift the engine is back in state 0 + if(engine->state == 0 && !h_slist_empty(engine->left)) { + // on top of the stack is the start symbol's semantic value + HParsedToken *tok = engine->left->head->elem; return make_result(engine->arena, tok); } else { return NULL; @@ -356,7 +361,8 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HLREngine *engine = 
h_lrengine_new(arena, tarena, table); // iterate engine to completion - while(h_lrengine_step(engine, h_lrengine_action(engine, stream))); + while(engine->run) + h_lrengine_step(engine, h_lrengine_action(engine, stream)); HParseResult *result = h_lrengine_result(engine); if(!result) diff --git a/src/backends/lr.h b/src/backends/lr.h index edf0871..5e2f032 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -68,6 +68,7 @@ typedef struct HLREnhGrammar_ { typedef struct HLREngine_ { const HLRTable *table; size_t state; + bool run; // stack layout: // on the left stack, we put pairs: (saved state, semantic value) @@ -128,7 +129,7 @@ void h_lalr_free(HParser *parser); const HLRAction *h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol); const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream); -bool h_lrengine_step(HLREngine *engine, const HLRAction *action); +void h_lrengine_step(HLREngine *engine, const HLRAction *action); HParseResult *h_lrengine_result(HLREngine *engine); HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); diff --git a/src/backends/lr0.c b/src/backends/lr0.c index 67cf2aa..9f350b6 100644 --- a/src/backends/lr0.c +++ b/src/backends/lr0.c @@ -171,6 +171,11 @@ HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) // remember start symbol table->start = g->start; + // add dummy shift entry for the start symbol so h_lrengine_step can always + // find a shift. + // NB: nextstate=0 is used for the "victory condition" by h_lrengine_result. + h_hashtable_put(table->rows[0], g->start, h_shift_action(arena, 0)); + // add shift entries for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { // for each transition x-A->y, add "shift, goto y" to table entry (x,A) From 1e59e461fa5a544d12dd5350ddc7098454bb5a91 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 21 Jun 2013 19:58:58 +0200 Subject: [PATCH 82/95] make ambiguous test ignore order of precedence --- src/t_parser.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/t_parser.c b/src/t_parser.c index 7522e92..59adf36 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -430,13 +430,15 @@ static void test_rightrec(gconstpointer backend) { static void test_ambiguous(gconstpointer backend) { HParser *d_ = h_ch('d'); + HParser *p_ = h_ch('+'); HParser *E_ = h_indirect(); - h_bind_indirect(E_, h_choice(h_sequence(E_, h_ch('+'), E_, NULL), d_, NULL)); + h_bind_indirect(E_, h_choice(h_sequence(E_, p_, E_, NULL), d_, NULL)); + HParser *expr_ = h_action(E_, h_act_flatten); - g_check_parse_ok(E_, (HParserBackend)GPOINTER_TO_INT(backend), "d", 1, "u0x64"); - g_check_parse_ok(E_, (HParserBackend)GPOINTER_TO_INT(backend), "d+d", 3, "(u0x64 u0x2b u0x64)"); - g_check_parse_ok(E_, (HParserBackend)GPOINTER_TO_INT(backend), "d+d+d", 5, "(u0x64 u0x2b (u0x64 u0x2b u0x64))"); - g_check_parse_failed(E_, (HParserBackend)GPOINTER_TO_INT(backend), "d+", 2); + g_check_parse_ok(expr_, (HParserBackend)GPOINTER_TO_INT(backend), "d", 1, "(u0x64)"); + g_check_parse_ok(expr_, (HParserBackend)GPOINTER_TO_INT(backend), "d+d", 3, "(u0x64 u0x2b u0x64)"); + g_check_parse_ok(expr_, (HParserBackend)GPOINTER_TO_INT(backend), "d+d+d", 5, "(u0x64 u0x2b u0x64 u0x2b u0x64)"); + g_check_parse_failed(expr_, (HParserBackend)GPOINTER_TO_INT(backend), "d+", 2); } void register_parser_tests(void) { From c32cf709b2e51924f8f1f91693febc1856fedc43 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 21 Jun 2013 20:11:19 +0200 Subject: [PATCH 83/95] eliminate the right stack; work with the HInputStream directly --- src/backends/glr.c | 15 ++++---- src/backends/lr.c | 87 +++++++++++++++++++++++----------------------- src/backends/lr.h | 12 +++---- 3 files changed, 54 insertions(+), 60 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index d460e8a..7a5f8f5 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -29,15 +29,14 @@ HLREngine *fork_engine(const HLREngine *engine) HLREngine *eng2 = h_arena_malloc(engine->tarena, sizeof(HLREngine)); eng2->table = engine->table; eng2->state = engine->state; + eng2->input = engine->input; - // shallow-copy the stacks + // shallow-copy the stack // this works because h_slist_push and h_slist_pop never modify // the underlying structure of HSlistNodes, only the head pointer. // in fact, this gives us prefix sharing for free. - eng2->left = h_arena_malloc(engine->tarena, sizeof(HSlist)); - eng2->right = h_arena_malloc(engine->tarena, sizeof(HSlist)); - *eng2->left = *engine->left; - *eng2->right = *engine->right; + eng2->stack = h_arena_malloc(engine->tarena, sizeof(HSlist)); + *eng2->stack = *engine->stack; eng2->arena = engine->arena; eng2->tarena = engine->tarena; @@ -54,7 +53,7 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse HSlist *engines = h_slist_new(tarena); - h_slist_push(engines, h_lrengine_new(arena, tarena, table)); + h_slist_push(engines, h_lrengine_new(arena, tarena, table, stream)); HParseResult *result = NULL; while(result == NULL && !h_slist_empty(engines)) { @@ -75,7 +74,7 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* continue; } - const HLRAction *action = h_lrengine_action(engine, stream); + const HLRAction *action = h_lrengine_action(engine); // fork engine on conflicts if(action && action->type == HLR_CONFLICT) { @@ -120,8 
+119,6 @@ HParserBackendVTable h__glr_backend_vtable = { // XXX TODO -// - eliminate right stack by always doing a shift after reduce -// (shift should always follow reduce because rightmost) // - split tables into // - one mapping input bytes to actions (shift or reduce or conflict) // - one mapping reduced-to lhs nonterminals to shift states diff --git a/src/backends/lr.c b/src/backends/lr.c index 2603ff2..f33aab8 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -202,65 +202,64 @@ h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol) } } -HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table) +HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, + const HInputStream *stream) { HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine)); engine->table = table; engine->state = 0; engine->run = true; - engine->left = h_slist_new(tarena); - engine->right = h_slist_new(tarena); + engine->stack = h_slist_new(tarena); + engine->input = *stream; engine->arena = arena; engine->tarena = tarena; return engine; } -const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream) +const HLRAction *h_lrengine_action(const HLREngine *engine) { - HSlist *right = engine->right; - HArena *arena = engine->arena; HArena *tarena = engine->tarena; - // make sure there is input on the right stack - if(h_slist_empty(right)) { - // XXX use statically-allocated terminal symbols - HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); - HParsedToken *v; + // XXX use statically-allocated terminal symbols + HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); - uint8_t c = h_read_bits(stream, 8, false); + HInputStream lookahead = engine->input; + uint8_t c = h_read_bits(&lookahead, 8, false); - if(stream->overrun) { // end of input - x->type = HCF_END; - v = NULL; - } else { - x->type = HCF_CHAR; - x->chr = c; - v = h_arena_malloc(arena, sizeof(HParsedToken)); - v->token_type = TT_UINT; - 
v->uint = c; - } - - h_slist_push(right, v); - h_slist_push(right, x); + if(lookahead.overrun) { // end of input + x->type = HCF_END; + } else { + x->type = HCF_CHAR; + x->chr = c; } - // peek at input symbol on the right side - HCFChoice *symbol = right->head->elem; + return h_lr_lookup(engine->table, engine->state, x); +} - // table lookup - const HLRAction *action = h_lr_lookup(engine->table, engine->state, symbol); +static HParsedToken *consume_input(HLREngine *engine) +{ + HParsedToken *v; - return action; + uint8_t c = h_read_bits(&engine->input, 8, false); + + if(engine->input.overrun) { // end of input + v = NULL; + } else { + v = h_arena_malloc(engine->arena, sizeof(HParsedToken)); + v->token_type = TT_UINT; + v->uint = c; + } + + return v; } // run LR parser for one round; returns false when finished static bool h_lrengine_step_(HLREngine *engine, const HLRAction *action) { // short-hand names - HSlist *left = engine->left; - HSlist *right = engine->right; + HSlist *stack = engine->stack; HArena *arena = engine->arena; HArena *tarena = engine->tarena; @@ -278,11 +277,11 @@ static bool h_lrengine_step_(HLREngine *engine, const HLRAction *action) value->token_type = TT_SEQUENCE; value->seq = h_carray_new_sized(arena, len); - // pull values off the left stack, rewinding state accordingly + // pull values off the stack, rewinding state accordingly HParsedToken *v = NULL; for(size_t i=0; istate = (uintptr_t)h_slist_drop(left); + v = h_slist_drop(stack); + engine->state = (uintptr_t)h_slist_drop(stack); // collect values in result sequence value->seq->elements[len-1-i] = v; @@ -315,17 +314,17 @@ static bool h_lrengine_step_(HLREngine *engine, const HLRAction *action) assert(shift->type == HLR_SHIFT); // piggy-back the shift right here, never touching the input - h_slist_push(left, (void *)(uintptr_t)engine->state); - h_slist_push(left, value); + h_slist_push(stack, (void *)(uintptr_t)engine->state); + h_slist_push(stack, value); engine->state = 
shift->nextstate; if(symbol == engine->table->start) return false; // reduced to start symbol; accept! } else { assert(action->type == HLR_SHIFT); - h_slist_push(left, (void *)(uintptr_t)engine->state); - h_slist_drop(right); // symbol (discard) - h_slist_push(left, h_slist_drop(right)); // semantic value + HParsedToken *value = consume_input(engine); + h_slist_push(stack, (void *)(uintptr_t)engine->state); + h_slist_push(stack, value); engine->state = action->nextstate; } @@ -341,9 +340,9 @@ void h_lrengine_step(HLREngine *engine, const HLRAction *action) HParseResult *h_lrengine_result(HLREngine *engine) { // parsing was successful iff after a shift the engine is back in state 0 - if(engine->state == 0 && !h_slist_empty(engine->left)) { + if(engine->state == 0 && !h_slist_empty(engine->stack)) { // on top of the stack is the start symbol's semantic value - HParsedToken *tok = engine->left->head->elem; + HParsedToken *tok = engine->stack->head->elem; return make_result(engine->arena, tok); } else { return NULL; @@ -358,11 +357,11 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HArena *arena = h_new_arena(mm__, 0); // will hold the results HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse - HLREngine *engine = h_lrengine_new(arena, tarena, table); + HLREngine *engine = h_lrengine_new(arena, tarena, table, stream); // iterate engine to completion while(engine->run) - h_lrengine_step(engine, h_lrengine_action(engine, stream)); + h_lrengine_step(engine, h_lrengine_action(engine)); HParseResult *result = h_lrengine_result(engine); if(!result) diff --git a/src/backends/lr.h b/src/backends/lr.h index 5e2f032..f76bd33 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -70,11 +70,8 @@ typedef struct HLREngine_ { size_t state; bool run; - // stack layout: - // on the left stack, we put pairs: (saved state, semantic value) - // on the right stack, we put pairs: (symbol, semantic value) - HSlist *left; // left stack; 
reductions happen here - HSlist *right; // right stack; input appears here + HSlist *stack; // holds pairs: (saved state, semantic value) + HInputStream input; HArena *arena; // will hold the results HArena *tarena; // tmp, deleted after parse @@ -108,7 +105,8 @@ HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark); HLRState *h_lrstate_new(HArena *arena); HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows); void h_lrtable_free(HLRTable *table); -HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table); +HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, + const HInputStream *stream); HLRAction *h_reduce_action(HArena *arena, const HLRItem *item); HLRAction *h_shift_action(HArena *arena, size_t nextstate); HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new); @@ -128,7 +126,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params); void h_lalr_free(HParser *parser); const HLRAction *h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol); -const HLRAction *h_lrengine_action(HLREngine *engine, HInputStream *stream); +const HLRAction *h_lrengine_action(const HLREngine *engine); void h_lrengine_step(HLREngine *engine, const HLRAction *action); HParseResult *h_lrengine_result(HLREngine *engine); HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); From 534a29b7ba6c30619954de2c20b4c83202aa2878 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 21 Jun 2013 20:21:40 +0200 Subject: [PATCH 84/95] split h_lr_lookup into two (yet identical) functions --- src/backends/lr.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/src/backends/lr.c b/src/backends/lr.c index f33aab8..cdd2a35 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -190,18 +190,6 @@ HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new) /* LR driver */ -const HLRAction * -h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol) -{ - assert(state < table->nrows); - if(table->forall[state]) { - assert(h_hashtable_empty(table->rows[state])); // that would be a conflict - return table->forall[state]; - } else { - return h_hashtable_get(table->rows[state], symbol); - } -} - HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, const HInputStream *stream) { @@ -218,6 +206,36 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, return engine; } +static const HLRAction * +terminal_lookup(const HLREngine *engine, const HCFChoice *symbol) +{ + const HLRTable *table = engine->table; + size_t state = engine->state; + + assert(state < table->nrows); + if(table->forall[state]) { + assert(h_hashtable_empty(table->rows[state])); // that would be a conflict + return table->forall[state]; + } else { + return h_hashtable_get(table->rows[state], symbol); + } +} + +static const HLRAction * +nonterminal_lookup(const HLREngine *engine, const HCFChoice *symbol) +{ + const HLRTable *table = engine->table; + size_t state = engine->state; + + assert(state < table->nrows); + if(table->forall[state]) { + assert(h_hashtable_empty(table->rows[state])); // that would be a conflict + return table->forall[state]; + } else { + return h_hashtable_get(table->rows[state], symbol); + } +} + const HLRAction *h_lrengine_action(const HLREngine *engine) { HArena *tarena = engine->tarena; @@ -235,7 +253,7 
@@ const HLRAction *h_lrengine_action(const HLREngine *engine) x->chr = c; } - return h_lr_lookup(engine->table, engine->state, x); + return terminal_lookup(engine, x); } static HParsedToken *consume_input(HLREngine *engine) @@ -308,7 +326,7 @@ static bool h_lrengine_step_(HLREngine *engine, const HLRAction *action) // this is LR, building a right-most derivation bottom-up, so no reduce can // follow a reduce. we can also assume no conflict follows for GLR if we // use LALR tables, because only terminal symbols (lookahead) get reduces. - const HLRAction *shift = h_lr_lookup(engine->table, engine->state, symbol); + const HLRAction *shift = nonterminal_lookup(engine, symbol); if(shift == NULL) return false; // parse error assert(shift->type == HLR_SHIFT); From d67e12a825697290d3d41f59b9aba2c2fc0d3112 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 21 Jun 2013 20:52:27 +0200 Subject: [PATCH 85/95] better factor out lr table writes --- src/backends/lalr.c | 8 ++++---- src/backends/lr.h | 2 -- src/backends/lr0.c | 13 ++++++++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 698b106..39c4afd 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -130,10 +130,10 @@ static inline bool has_conflicts(HLRTable *table) return !h_slist_empty(table->inadeq); } -// place a new entry in tbl; records conflicts in tbl->inadeq +// place a new terminal entry in tbl; records conflicts in tbl->inadeq // returns 0 on success, -1 on conflict // ignores forall entries -int h_lrtable_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action) +static int terminal_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action) { HLRAction *prev = h_hashtable_get(tbl->rows[state], x); if(prev && prev != action) { @@ -257,7 +257,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) if(fs->end_branch) { HCFChoice *terminal = h_arena_malloc(arena, sizeof(HCFChoice)); 
terminal->type = HCF_END; - if(h_lrtable_put(table, state, terminal, action) < 0) + if(terminal_put(table, state, terminal, action) < 0) inadeq = true; } H_FOREACH(fs->char_branches, void *key, HStringMap *m) @@ -268,7 +268,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) terminal->type = HCF_CHAR; terminal->chr = key_char((HCharKey)key); - if(h_lrtable_put(table, state, terminal, action) < 0) + if(terminal_put(table, state, terminal, action) < 0) inadeq = true; H_END_FOREACH // lookahead character } H_END_FOREACH // enhanced production diff --git a/src/backends/lr.h b/src/backends/lr.h index f76bd33..ee0c1f3 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -120,12 +120,10 @@ HHashValue h_hash_transition(const void *p); HLRDFA *h_lr0_dfa(HCFGrammar *g); HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa); -int h_lrtable_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action); int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params); void h_lalr_free(HParser *parser); -const HLRAction *h_lr_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol); const HLRAction *h_lrengine_action(const HLREngine *engine); void h_lrengine_step(HLREngine *engine, const HLRAction *action); HParseResult *h_lrengine_result(HLREngine *engine); diff --git a/src/backends/lr0.c b/src/backends/lr0.c index 9f350b6..1bd63e5 100644 --- a/src/backends/lr0.c +++ b/src/backends/lr0.c @@ -161,6 +161,14 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) /* LR(0) table generation */ +static inline +void put_shift(HLRTable *table, size_t state, const HCFChoice *symbol, + size_t nextstate) +{ + HLRAction *action = h_shift_action(table->arena, nextstate); + h_hashtable_put(table->rows[state], symbol, action); +} + HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) { HAllocator *mm__ = g->mm__; @@ -174,15 +182,14 @@ HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) // add dummy shift entry for the start symbol so 
h_lrengine_step can always // find a shift. // NB: nextstate=0 is used for the "victory condition" by h_lrengine_result. - h_hashtable_put(table->rows[0], g->start, h_shift_action(arena, 0)); + put_shift(table, 0, g->start, 0); // add shift entries for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { // for each transition x-A->y, add "shift, goto y" to table entry (x,A) HLRTransition *t = x->elem; - HLRAction *action = h_shift_action(arena, t->to); - h_hashtable_put(table->rows[t->from], t->symbol, action); + put_shift(table, t->from, t->symbol, t->to); } // add reduce entries, record inadequate states From 853e1fba4607aa5cba8fe503401312330b54d08d Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 21 Jun 2013 22:55:04 +0200 Subject: [PATCH 86/95] split LR table representation by key type (terminals/nonterminals) --- src/backends/glr.c | 5 --- src/backends/lalr.c | 88 +++++++++++++++++++++++++++++---------------- src/backends/lr.c | 52 +++++++++++++++++++-------- src/backends/lr.h | 6 ++-- src/backends/lr0.c | 16 +++++++-- src/cfgrammar.c | 54 ++++++++++++++++++++-------- src/cfgrammar.h | 3 ++ 7 files changed, 156 insertions(+), 68 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index 7a5f8f5..864ecaa 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -119,11 +119,6 @@ HParserBackendVTable h__glr_backend_vtable = { // XXX TODO -// - split tables into -// - one mapping input bytes to actions (shift or reduce or conflict) -// - one mapping reduced-to lhs nonterminals to shift states -// - can there still be conflicts here? -// - use HStringMap to represent lookahead sets and the "piggyback" table // - implement engine merging // - triggered when two enter the same state // - old stacks (/engines?) 
saved diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 39c4afd..242988e 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -13,9 +13,23 @@ static inline size_t seqsize(void *p_) return n+1; } +static HLRAction * +lrtable_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol) +{ + switch(symbol->type) { + case HCF_END: + return table->tmap[state]->end_branch; + case HCF_CHAR: + return h_stringmap_get(table->tmap[state], &symbol->chr, 1, false); + default: + // nonterminal case + return h_hashtable_get(table->ntmap[state], symbol); + } +} + static size_t follow_transition(const HLRTable *table, size_t x, HCFChoice *A) { - HLRAction *action = h_hashtable_get(table->rows[x], A); + HLRAction *action = lrtable_lookup(table, x, A); assert(action != NULL); assert(action->type == HLR_SHIFT); return action->nextstate; @@ -130,21 +144,48 @@ static inline bool has_conflicts(HLRTable *table) return !h_slist_empty(table->inadeq); } -// place a new terminal entry in tbl; records conflicts in tbl->inadeq +// for each lookahead symbol (fs), put action into tmap // returns 0 on success, -1 on conflict // ignores forall entries -static int terminal_put(HLRTable *tbl, size_t state, HCFChoice *x, HLRAction *action) +static int terminals_put(HStringMap *tmap, const HStringMap *fs, HLRAction *action) { - HLRAction *prev = h_hashtable_get(tbl->rows[state], x); - if(prev && prev != action) { - // conflict - action = h_lr_conflict(tbl->arena, prev, action); - h_hashtable_put(tbl->rows[state], x, action); - return -1; - } else { - h_hashtable_put(tbl->rows[state], x, action); - return 0; + int ret = 0; + + if(fs->epsilon_branch) { + HLRAction *prev = tmap->epsilon_branch; + if(prev && prev != action) { + // conflict + tmap->epsilon_branch = h_lr_conflict(tmap->arena, prev, action); + ret = -1; + } else { + tmap->epsilon_branch = action; + } } + + if(fs->end_branch) { + HLRAction *prev = tmap->end_branch; + if(prev && prev != action) { + // conflict + 
tmap->end_branch = h_lr_conflict(tmap->arena, prev, action); + ret = -1; + } else { + tmap->end_branch = action; + } + } + + H_FOREACH(fs->char_branches, void *key, HStringMap *fs_) + HStringMap *tmap_ = h_hashtable_get(tmap->char_branches, key); + + if(!tmap_) { + tmap_ = h_stringmap_new(tmap->arena); + h_hashtable_put(tmap->char_branches, key, tmap_); + } + + if(terminals_put(tmap_, fs_, action) < 0) + ret = -1; + H_END_FOREACH + + return ret; } // check whether a sequence of enhanced-grammar symbols (p) matches the given @@ -254,23 +295,8 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) assert(!h_stringmap_empty(fs)); // for each lookahead symbol, put action into table cell - if(fs->end_branch) { - HCFChoice *terminal = h_arena_malloc(arena, sizeof(HCFChoice)); - terminal->type = HCF_END; - if(terminal_put(table, state, terminal, action) < 0) - inadeq = true; - } - H_FOREACH(fs->char_branches, void *key, HStringMap *m) - if(!m->epsilon_branch) - continue; - - HCFChoice *terminal = h_arena_malloc(arena, sizeof(HCFChoice)); - terminal->type = HCF_CHAR; - terminal->chr = key_char((HCharKey)key); - - if(terminal_put(table, state, terminal, action) < 0) - inadeq = true; - H_END_FOREACH // lookahead character + if(terminals_put(table->tmap[state], fs, action) < 0) + inadeq = true; } H_END_FOREACH // enhanced production H_END_FOREACH // reducible item @@ -306,6 +332,8 @@ HParserBackendVTable h__lalr_backend_vtable = { // dummy! 
int test_lalr(void) { + HAllocator *mm__ = &system_allocator; + /* E -> E '-' T | T @@ -321,7 +349,7 @@ int test_lalr(void) HParser *p = E; printf("\n==== G R A M M A R ====\n"); - HCFGrammar *g = h_cfgrammar(&system_allocator, p); + HCFGrammar *g = h_cfgrammar_(mm__, augment(mm__, p)); if(g == NULL) { fprintf(stderr, "h_cfgrammar failed\n"); return 1; diff --git a/src/backends/lr.c b/src/backends/lr.c index cdd2a35..66a76b7 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -1,4 +1,5 @@ #include +#include #include "../parsers/parser_internal.h" #include "lr.h" @@ -118,14 +119,16 @@ HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) HLRTable *ret = h_new(HLRTable, 1); ret->nrows = nrows; - ret->rows = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); + ret->ntmap = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); + ret->tmap = h_arena_malloc(arena, nrows * sizeof(HStringMap *)); ret->forall = h_arena_malloc(arena, nrows * sizeof(HLRAction *)); ret->inadeq = h_slist_new(arena); ret->arena = arena; ret->mm__ = mm__; for(size_t i=0; irows[i] = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol); + ret->ntmap[i] = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol); + ret->tmap[i] = h_stringmap_new(arena); ret->forall[i] = NULL; } @@ -186,6 +189,12 @@ HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new) return action; } +bool h_lrtable_row_empty(const HLRTable *table, size_t i) +{ + return (h_hashtable_empty(table->ntmap[i]) + && h_stringmap_empty(table->tmap[i])); +} + /* LR driver */ @@ -214,10 +223,14 @@ terminal_lookup(const HLREngine *engine, const HCFChoice *symbol) assert(state < table->nrows); if(table->forall[state]) { - assert(h_hashtable_empty(table->rows[state])); // that would be a conflict + assert(h_lrtable_row_empty(table, state)); // that would be a conflict return table->forall[state]; } else { - return h_hashtable_get(table->rows[state], symbol); + // XXX use the lookahead stream directly here (cf. 
llk) + if(symbol->type == HCF_END) + return table->tmap[state]->end_branch; + else + return h_stringmap_get(table->tmap[state], &symbol->chr, 1, false); } } @@ -228,12 +241,9 @@ nonterminal_lookup(const HLREngine *engine, const HCFChoice *symbol) size_t state = engine->state; assert(state < table->nrows); - if(table->forall[state]) { - assert(h_hashtable_empty(table->rows[state])); // that would be a conflict - return table->forall[state]; - } else { - return h_hashtable_get(table->rows[state], symbol); - } + assert(!table->forall[state]); // contains only reduce entries + // we are only looking for shifts + return h_hashtable_get(table->ntmap[state], symbol); } const HLRAction *h_lrengine_action(const HLREngine *engine) @@ -500,6 +510,19 @@ void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action) } } +static void valprint_lraction(FILE *file, void *env, void *val) +{ + const HLRAction *action = val; + const HCFGrammar *grammar = env; + pprint_lraction(file, grammar, action); +} + +static void pprint_lrtable_terminals(FILE *file, const HCFGrammar *g, + const HStringMap *map) +{ + h_pprint_stringmap(file, ' ', valprint_lraction, (void *)g, map); +} + void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, unsigned int indent) { @@ -507,18 +530,19 @@ void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, for(unsigned int j=0; jforall[i]) { - fputs(" - ", f); + fputc(' ', f); pprint_lraction(f, g, table->forall[i]); - fputs(" -", f); - if(!h_hashtable_empty(table->rows[i])) + if(!h_lrtable_row_empty(table, i)) fputs(" !!", f); } - H_FOREACH(table->rows[i], HCFChoice *symbol, HLRAction *action) + H_FOREACH(table->ntmap[i], HCFChoice *symbol, HLRAction *action) fputc(' ', f); // separator h_pprint_symbol(f, g, symbol); fputc(':', f); pprint_lraction(f, g, action); H_END_FOREACH + fputc(' ', f); // separator + pprint_lrtable_terminals(f, g, table->tmap[i]); fputc('\n', f); } diff --git a/src/backends/lr.h 
b/src/backends/lr.h index ee0c1f3..ca8418e 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -48,8 +48,9 @@ typedef struct HLRAction_ { } HLRAction; typedef struct HLRTable_ { - size_t nrows; - HHashTable **rows; // map symbols to HLRActions + size_t nrows; // dimension of the pointer arrays below + HHashTable **ntmap; // map nonterminal symbols to HLRActions, per row + HStringMap **tmap; // map lookahead strings to HLRActions, per row HLRAction **forall; // shortcut to set an action for an entire row HCFChoice *start; // start symbol HSlist *inadeq; // indices of any inadequate states @@ -110,6 +111,7 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, HLRAction *h_reduce_action(HArena *arena, const HLRItem *item); HLRAction *h_shift_action(HArena *arena, size_t nextstate); HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new); +bool h_lrtable_row_empty(const HLRTable *table, size_t i); bool h_eq_symbol(const void *p, const void *q); bool h_eq_lr_itemset(const void *p, const void *q); diff --git a/src/backends/lr0.c b/src/backends/lr0.c index 1bd63e5..5add53a 100644 --- a/src/backends/lr0.c +++ b/src/backends/lr0.c @@ -166,7 +166,18 @@ void put_shift(HLRTable *table, size_t state, const HCFChoice *symbol, size_t nextstate) { HLRAction *action = h_shift_action(table->arena, nextstate); - h_hashtable_put(table->rows[state], symbol, action); + + switch(symbol->type) { + case HCF_END: + h_stringmap_put_end(table->tmap[state], action); + break; + case HCF_CHAR: + h_stringmap_put_char(table->tmap[state], symbol->chr, action); + break; + default: + // nonterminal case + h_hashtable_put(table->ntmap[state], symbol, action); + } } HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) @@ -210,7 +221,8 @@ HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) // check for shift/reduce conflict with other entries // NOTE: these are not recorded as HLR_CONFLICTs at this point - 
if(!h_hashtable_empty(table->rows[i])) + + if(!h_lrtable_row_empty(table, i)) inadeq = true; } H_END_FOREACH diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 199ef5f..b01c44c 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -813,27 +813,43 @@ void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, in #define BUFSIZE 512 static bool -pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, - const HStringMap *set) +pprint_stringmap_elems(FILE *file, bool first, char *prefix, size_t n, char sep, + void (*valprint)(FILE *f, void *env, void *val), void *env, + const HStringMap *map) { assert(n < BUFSIZE-4); - if(set->epsilon_branch) { - if(!first) fputc(',', file); first=false; - if(n==0) - fputs("''", file); - else + if(map->epsilon_branch) { + if(!first) fputc(sep, file); first=false; + if(n==0) { + fputs("\"\"", file); + } else { + fputs("\"", file); fwrite(prefix, 1, n, file); + fputs("\"", file); + } + + if(valprint) { + fputc(':', file); + valprint(file, env, map->epsilon_branch); + } } - if(set->end_branch) { - if(!first) fputc(',', file); first=false; + if(map->end_branch) { + if(!first) fputs(",\"", file); first=false; + if(n>0) fputs("\"\"", file); fwrite(prefix, 1, n, file); - fputc('$', file); + if(n>0) fputs("\"\"", file); + fputs("$", file); + + if(valprint) { + fputc(':', file); + valprint(file, env, map->end_branch); + } } - // iterate over set->char_branches - HHashTable *ht = set->char_branches; + // iterate over map->char_branches + HHashTable *ht = map->char_branches; size_t i; HHashTableEntry *hte; for(i=0; i < ht->capacity; i++) { @@ -859,20 +875,28 @@ pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, n_ += sprintf(prefix+n_, "\\x%.2X", c); } - first = pprint_stringset_elems(file, first, prefix, n_, ends); + first = pprint_stringmap_elems(file, first, prefix, n_, + sep, valprint, env, ends); } } return first; } +void h_pprint_stringmap(FILE *file, char sep, + void (*valprint)(FILE 
*f, void *env, void *val), void *env, + const HStringMap *map) +{ + char buf[BUFSIZE]; + pprint_stringmap_elems(file, true, buf, 0, sep, valprint, env, map); +} + void h_pprint_stringset(FILE *file, const HStringMap *set, int indent) { int j; for(j=0; j Date: Fri, 21 Jun 2013 23:22:07 +0200 Subject: [PATCH 87/95] use lookahead stream directly for LR (terminal) lookup --- src/backends/llk.c | 27 +++------------------------ src/backends/lr.c | 25 +++---------------------- src/cfgrammar.c | 25 +++++++++++++++++++++++++ src/cfgrammar.h | 1 + 4 files changed, 32 insertions(+), 46 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 4f73c46..c0cf6af 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -21,7 +21,7 @@ typedef struct HLLkTable_ { /* Interface to look up an entry in the parse table. */ const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, - HInputStream lookahead) + const HInputStream *stream) { const HStringMap *row = h_hashtable_get(table->rows, x); assert(row != NULL); // the table should have one row for each nonterminal @@ -29,28 +29,7 @@ const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, assert(!row->epsilon_branch); // would match without looking at the input // XXX cases where this could be useful? - const HStringMap *m = row; - while(m) { - if(m->epsilon_branch) { // input matched - // assert: another lookahead would not bring a more specific match. - // this is for the table generator to ensure. - return m->epsilon_branch; - } - - // note the lookahead stream is passed by value, i.e. a copy. - // reading bits from it does not consume them from the real input. 
- uint8_t c = h_read_bits(&lookahead, 8, false); - - if(lookahead.overrun) { // end of input - // XXX assumption of byte-wise grammar and input - return m->end_branch; - } - - // no match yet, descend - m = h_stringmap_get_char(m, c); - } - - return NULL; + return h_stringmap_get_lookahead(row, *stream); } /* Allocate a new parse table. */ @@ -321,7 +300,7 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* seq = h_carray_new(arena); // look up applicable production in parse table - const HCFSequence *p = h_llk_lookup(table, x, *stream); + const HCFSequence *p = h_llk_lookup(table, x, stream); if(p == NULL) goto no_parse; diff --git a/src/backends/lr.c b/src/backends/lr.c index 66a76b7..ca45582 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -216,7 +216,7 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, } static const HLRAction * -terminal_lookup(const HLREngine *engine, const HCFChoice *symbol) +terminal_lookup(const HLREngine *engine, const HInputStream *stream) { const HLRTable *table = engine->table; size_t state = engine->state; @@ -226,11 +226,7 @@ terminal_lookup(const HLREngine *engine, const HCFChoice *symbol) assert(h_lrtable_row_empty(table, state)); // that would be a conflict return table->forall[state]; } else { - // XXX use the lookahead stream directly here (cf. 
llk) - if(symbol->type == HCF_END) - return table->tmap[state]->end_branch; - else - return h_stringmap_get(table->tmap[state], &symbol->chr, 1, false); + return h_stringmap_get_lookahead(table->tmap[state], *stream); } } @@ -248,22 +244,7 @@ nonterminal_lookup(const HLREngine *engine, const HCFChoice *symbol) const HLRAction *h_lrengine_action(const HLREngine *engine) { - HArena *tarena = engine->tarena; - - // XXX use statically-allocated terminal symbols - HCFChoice *x = h_arena_malloc(tarena, sizeof(HCFChoice)); - - HInputStream lookahead = engine->input; - uint8_t c = h_read_bits(&lookahead, 8, false); - - if(lookahead.overrun) { // end of input - x->type = HCF_END; - } else { - x->type = HCF_CHAR; - x->chr = c; - } - - return terminal_lookup(engine, x); + return terminal_lookup(engine, &engine->input); } static HParsedToken *consume_input(HLREngine *engine) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index b01c44c..196d9d3 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -321,6 +321,31 @@ void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool en return m->epsilon_branch; } +void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead) +{ + while(m) { + if(m->epsilon_branch) { // input matched + // assert: another lookahead would not bring a more specific match. + // this is for the table generator to ensure. (LLk) + return m->epsilon_branch; + } + + // note the lookahead stream is passed by value, i.e. a copy. + // reading bits from it does not consume them from the real input. 
+ uint8_t c = h_read_bits(&lookahead, 8, false); + + if(lookahead.overrun) { // end of input + // XXX assumption of byte-wise grammar and input + return m->end_branch; + } + + // no match yet, descend + m = h_stringmap_get_char(m, c); + } + + return NULL; +} + bool h_stringmap_present(const HStringMap *m, const uint8_t *str, size_t n, bool end) { return (h_stringmap_get(m, str, n, end) != NULL); diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 1f52bdd..193f8ca 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -47,6 +47,7 @@ void h_stringmap_put_char(HStringMap *m, uint8_t c, void *v); void h_stringmap_update(HStringMap *m, const HStringMap *n); void h_stringmap_replace(HStringMap *m, void *old, void *new); void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool end); +void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead); bool h_stringmap_present(const HStringMap *m, const uint8_t *str, size_t n, bool end); bool h_stringmap_present_epsilon(const HStringMap *m); bool h_stringmap_empty(const HStringMap *m); From 34c6d868b9823f9c55ec9398bf33dd734dfc6bda Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 21 Jun 2013 23:47:22 +0200 Subject: [PATCH 88/95] commentation --- src/backends/glr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index 864ecaa..f7b6f8b 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -32,7 +32,7 @@ HLREngine *fork_engine(const HLREngine *engine) eng2->input = engine->input; // shallow-copy the stack - // this works because h_slist_push and h_slist_pop never modify + // this works because h_slist_push and h_slist_drop never modify // the underlying structure of HSlistNodes, only the head pointer. // in fact, this gives us prefix sharing for free. eng2->stack = h_arena_malloc(engine->tarena, sizeof(HSlist)); From 8bc3b93e959619708d07f468e3a7dd5037bf28c5 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Fri, 21 Jun 2013 23:53:47 +0200 Subject: [PATCH 89/95] correctly use augmented grammar for test_glr() output --- src/backends/glr.c | 4 +++- src/backends/lalr.c | 6 +++--- src/backends/lr.h | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index f7b6f8b..82ad5c0 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -129,6 +129,8 @@ HParserBackendVTable h__glr_backend_vtable = { // dummy! int test_glr(void) { + HAllocator *mm__ = &system_allocator; + /* E -> E '+' E | 'd' @@ -141,7 +143,7 @@ int test_glr(void) HParser *p = E; printf("\n==== G R A M M A R ====\n"); - HCFGrammar *g = h_cfgrammar(&system_allocator, p); + HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p)); if(g == NULL) { fprintf(stderr, "h_cfgrammar failed\n"); return 1; diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 242988e..93becf3 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -207,7 +207,7 @@ static bool match_production(HLREnhGrammar *eg, HCFChoice **p, // desugar parser with a fresh start symbol // this guarantees that the start symbol will not occur in any productions -static HCFChoice *augment(HAllocator *mm__, HParser *parser) +HCFChoice *h_desugar_augmented(HAllocator *mm__, HParser *parser) { HCFChoice *augmented = h_new(HCFChoice, 1); @@ -231,7 +231,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) // build LR(0) table // if necessary, resolve conflicts "by conversion to SLR" - HCFGrammar *g = h_cfgrammar_(mm__, augment(mm__, parser)); + HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, parser)); if(g == NULL) // backend not suitable (language not context-free) return -1; @@ -349,7 +349,7 @@ int test_lalr(void) HParser *p = E; printf("\n==== G R A M M A R ====\n"); - HCFGrammar *g = h_cfgrammar_(mm__, augment(mm__, p)); + HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p)); if(g == NULL) { fprintf(stderr, "h_cfgrammar 
failed\n"); return 1; diff --git a/src/backends/lr.h b/src/backends/lr.h index ca8418e..1158542 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -123,6 +123,7 @@ HHashValue h_hash_transition(const void *p); HLRDFA *h_lr0_dfa(HCFGrammar *g); HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa); +HCFChoice *h_desugar_augmented(HAllocator *mm__, HParser *parser); int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params); void h_lalr_free(HParser *parser); From bf9c9b5f7a7ffa5f81028c3c685afb7ba958a2b9 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Sat, 22 Jun 2013 12:51:25 +0200 Subject: [PATCH 90/95] refactoring in preparation for engine merging --- src/backends/glr.c | 77 +++++++++++++++++++++++++++++++--------------- src/backends/lr.c | 1 + src/backends/lr.h | 6 ++-- 3 files changed, 57 insertions(+), 27 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index 82ad5c0..2978e62 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -43,6 +43,44 @@ HLREngine *fork_engine(const HLREngine *engine) return eng2; } +static void stow_engine(HSlist *engines, HLREngine *engine) +{ + // XXX switch to one engine per state, and do the merge here + h_slist_push(engines, engine); +} + +static const HLRAction *handle_conflict(HSlist *engines, const HLREngine *engine, + const HSlist *branches) +{ + // there should be at least two conflicting actions + assert(branches->head); + assert(branches->head->next); // this is just a consistency check + + // fork a new engine for all but the first action + for(HSlistNode *x=branches->head->next; x; x=x->next) { + HLRAction *act = x->elem; + HLREngine *eng = fork_engine(engine); + + // perform one step and add to list + h_lrengine_step(eng, act); + stow_engine(engines, eng); + } + + // return first action for use with original engine + return branches->head->elem; +} + +static HLREngine *handle_demerge(HSlist *engines, HLREngine *engine, + const HLRAction *reduce) +{ + return engine; // 
XXX + + for(size_t i=0; iproduction.length; i++) { + // XXX if stack hits bottom, demerge + } + // XXX call step and stow on the newly-created engines +} + HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) { HLRTable *table = parser->backend_data; @@ -60,12 +98,11 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* for(HSlistNode **x = &engines->head; *x; ) { HLREngine *engine = (*x)->elem; - // check for terminated engines - if(engine->run) { - x = &(*x)->next; // advance x with no change - } else { - *x = (*x)->next; // advance x, removing the current element + // remove engine from list; it may come back in below + *x = (*x)->next; // advance x, removing the current element + // drop those engines that have terminated + if(!engine->run) { // check for parse success HParseResult *res = h_lrengine_result(engine); if(res) @@ -76,29 +113,19 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* const HLRAction *action = h_lrengine_action(engine); - // fork engine on conflicts - if(action && action->type == HLR_CONFLICT) { - const HSlist *branches = action->branches; - - // there should be at least two conflicting actions - assert(branches->head); - assert(branches->head->next); - - // save first action for use with old engine below - action = branches->head->elem; - - // fork a new engine for all the other actions - for(HSlistNode *x=branches->head->next; x; x=x->next) { - HLRAction *act = x->elem; - HLREngine *eng = fork_engine(engine); - - // perform one step and add to list - h_lrengine_step(eng, act); - h_slist_push(engines, eng); - } + // handle forks and demerges (~> spawn engines) + if(action) { + if(action->type == HLR_CONFLICT) { + // fork engine on conflicts + action = handle_conflict(engines, engine, action->branches); + } else if(action->type == HLR_REDUCE) { + // demerge as needed to ensure that stacks are deep enough + engine = handle_demerge(engines, 
engine, action); + } } h_lrengine_step(engine, action); + stow_engine(engines, engine); } } diff --git a/src/backends/lr.c b/src/backends/lr.c index ca45582..bb20f71 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -209,6 +209,7 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, engine->run = true; engine->stack = h_slist_new(tarena); engine->input = *stream; + engine->merged = NULL; engine->arena = arena; engine->tarena = tarena; diff --git a/src/backends/lr.h b/src/backends/lr.h index 1158542..5febc24 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -74,8 +74,10 @@ typedef struct HLREngine_ { HSlist *stack; // holds pairs: (saved state, semantic value) HInputStream input; - HArena *arena; // will hold the results - HArena *tarena; // tmp, deleted after parse + HSlist *merged; // saved ancestor engines that merged to form this one + + HArena *arena; // will hold the results + HArena *tarena; // tmp, deleted after parse } HLREngine; From 23afea4b4e6fa00262c0b0cfbd958fa3e9e44093 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Sat, 22 Jun 2013 14:21:19 +0200 Subject: [PATCH 91/95] add demerge code path (untested) --- src/backends/glr.c | 55 +++++++++++++++++++++++++++++++++++++++------- src/backends/lr.c | 1 + src/backends/lr.h | 3 ++- 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index 2978e62..411171b 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -70,15 +70,54 @@ static const HLRAction *handle_conflict(HSlist *engines, const HLREngine *engine return branches->head->elem; } -static HLREngine *handle_demerge(HSlist *engines, HLREngine *engine, - const HLRAction *reduce) +static HSlist *demerge_stack(HSlistNode *bottom, HSlistNode *mp, HSlist *stack) { - return engine; // XXX + HArena *arena = stack->arena; - for(size_t i=0; iproduction.length; i++) { - // XXX if stack hits bottom, demerge + HSlist *ret = h_slist_new(arena); + + // copy the stack from the top + HSlistNode **y = &ret->head; + for(HSlistNode *x=stack->head; x && x!=mp; x=x->next) { + HSlistNode *node = h_arena_malloc(arena, sizeof(HSlistNode)); + node->elem = x->elem; + node->next = NULL; + *y = node; + y = &node->next; } - // XXX call step and stow on the newly-created engines + *y = bottom; // attach the ancestor stack + + return ret; +} + +static void demerge(HSlist *engines, HLREngine *engine, + const HLRAction *action, size_t depth) +{ + // no-op on engines that are not merged + if(!engine->merged) + return; + + HSlistNode *p = engine->stack->head; + for(size_t i=0; imp) { + HLREngine *eng = engine->merged; + eng->stack = demerge_stack(eng->stack->head, engine->mp, engine->stack); + demerge(engines, eng, action, depth-i); + + // call step and stow on restored ancestor + h_lrengine_step(eng, action); + stow_engine(engines, eng); + break; + } + p = p->next; + } +} + +static inline void +handle_demerge(HSlist *engines, HLREngine *engine, const HLRAction *reduce) +{ + demerge(engines, engine, reduce, reduce->production.length); } 
HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) @@ -119,8 +158,8 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* // fork engine on conflicts action = handle_conflict(engines, engine, action->branches); } else if(action->type == HLR_REDUCE) { - // demerge as needed to ensure that stacks are deep enough - engine = handle_demerge(engines, engine, action); + // demerge/respawn as needed + handle_demerge(engines, engine, action); } } diff --git a/src/backends/lr.c b/src/backends/lr.c index bb20f71..c481d29 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -210,6 +210,7 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, engine->stack = h_slist_new(tarena); engine->input = *stream; engine->merged = NULL; + engine->mp = NULL; engine->arena = arena; engine->tarena = tarena; diff --git a/src/backends/lr.h b/src/backends/lr.h index 5febc24..ab48633 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -74,7 +74,8 @@ typedef struct HLREngine_ { HSlist *stack; // holds pairs: (saved state, semantic value) HInputStream input; - HSlist *merged; // saved ancestor engines that merged to form this one + struct HLREngine_ *merged; // ancestor merged into this engine at mp + HSlistNode *mp; // mergepoint: stack->head at time of merge HArena *arena; // will hold the results HArena *tarena; // tmp, deleted after parse From 67681a119afcb485249946b3ac802761fc81e026 Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Sat, 22 Jun 2013 17:40:47 +0200 Subject: [PATCH 92/95] finish engine merging --- src/backends/glr.c | 240 +++++++++++++++++++++++++++------------------ src/backends/lr.c | 33 +++---- src/backends/lr.h | 8 +- src/backends/lr0.c | 6 +- 4 files changed, 168 insertions(+), 119 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index 411171b..34a5633 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -1,6 +1,9 @@ #include #include "lr.h" +static bool glr_step(HParseResult **result, HLREngine **engines, + HLREngine *engine, const HLRAction *action); + /* GLR compilation (LALR w/o failing on conflict) */ @@ -22,7 +25,86 @@ void h_glr_free(HParser *parser) } -/* GLR driver */ +/* Merging engines (when they converge on the same state) */ + +static HLREngine *lrengine_merge(HLREngine *old, HLREngine *new) +{ + HArena *arena = old->arena; + + HLREngine *ret = h_arena_malloc(arena, sizeof(HLREngine)); + + assert(old->state == new->state); + assert(old->input.input == new->input.input); + + *ret = *old; + ret->stack = h_slist_new(arena); + ret->merged[0] = old; + ret->merged[1] = new; + + return ret; +} + +static HSlist *demerge_stack(HSlistNode *bottom, HSlist *stack) +{ + HArena *arena = stack->arena; + + HSlist *ret = h_slist_new(arena); + + // copy the stack from the top + HSlistNode **y = &ret->head; + for(HSlistNode *x=stack->head; x; x=x->next) { + HSlistNode *node = h_arena_malloc(arena, sizeof(HSlistNode)); + node->elem = x->elem; + node->next = NULL; + *y = node; + y = &node->next; + } + *y = bottom; // attach the ancestor stack + + return ret; +} + +static inline HLREngine *respawn(HLREngine *eng, HSlist *stack) +{ + // NB: this can be a destructive update because an engine is not used for + // anything after it is merged. 
+ eng->stack = demerge_stack(eng->stack->head, stack); + return eng; +} + +static HLREngine * +demerge(HParseResult **result, HLREngine **engines, + HLREngine *engine, const HLRAction *action, size_t depth) +{ + // no-op on engines that are not merged + if(!engine->merged[0]) + return engine; + + HSlistNode *p = engine->stack->head; + for(size_t i=0; imerged[0], engine->stack); + HLREngine *b = respawn(engine->merged[1], engine->stack); + + // continue demerge until final depth reached + a = demerge(result, engines, a, action, depth-i); + b = demerge(result, engines, b, action, depth-i); + + // step and stow one ancestor... + glr_step(result, engines, a, action); + + // ...and return the other + return b; + } + p = p->next; + } + + return engine; // there is enough stack before the merge point +} + + +/* Forking engines (on conflicts */ HLREngine *fork_engine(const HLREngine *engine) { @@ -43,14 +125,9 @@ HLREngine *fork_engine(const HLREngine *engine) return eng2; } -static void stow_engine(HSlist *engines, HLREngine *engine) -{ - // XXX switch to one engine per state, and do the merge here - h_slist_push(engines, engine); -} - -static const HLRAction *handle_conflict(HSlist *engines, const HLREngine *engine, - const HSlist *branches) +static const HLRAction * +handle_conflict(HParseResult **result, HLREngine **engines, + const HLREngine *engine, const HSlist *branches) { // there should be at least two conflicting actions assert(branches->head); @@ -61,63 +138,46 @@ static const HLRAction *handle_conflict(HSlist *engines, const HLREngine *engine HLRAction *act = x->elem; HLREngine *eng = fork_engine(engine); - // perform one step and add to list - h_lrengine_step(eng, act); - stow_engine(engines, eng); + // perform one step and add to engines + glr_step(result, engines, eng, act); } // return first action for use with original engine return branches->head->elem; } -static HSlist *demerge_stack(HSlistNode *bottom, HSlistNode *mp, HSlist *stack) + +/* GLR driver */ 
+ +static bool glr_step(HParseResult **result, HLREngine **engines, + HLREngine *engine, const HLRAction *action) { - HArena *arena = stack->arena; - - HSlist *ret = h_slist_new(arena); - - // copy the stack from the top - HSlistNode **y = &ret->head; - for(HSlistNode *x=stack->head; x && x!=mp; x=x->next) { - HSlistNode *node = h_arena_malloc(arena, sizeof(HSlistNode)); - node->elem = x->elem; - node->next = NULL; - *y = node; - y = &node->next; - } - *y = bottom; // attach the ancestor stack - - return ret; -} - -static void demerge(HSlist *engines, HLREngine *engine, - const HLRAction *action, size_t depth) -{ - // no-op on engines that are not merged - if(!engine->merged) - return; - - HSlistNode *p = engine->stack->head; - for(size_t i=0; imp) { - HLREngine *eng = engine->merged; - eng->stack = demerge_stack(eng->stack->head, engine->mp, engine->stack); - demerge(engines, eng, action, depth-i); - - // call step and stow on restored ancestor - h_lrengine_step(eng, action); - stow_engine(engines, eng); - break; + // handle forks and demerges (~> spawn engines) + if(action) { + if(action->type == HLR_CONFLICT) { + // fork engine on conflicts + action = handle_conflict(result, engines, engine, action->branches); + } else if(action->type == HLR_REDUCE) { + // demerge/respawn as needed + size_t depth = action->production.length; + engine = demerge(result, engines, engine, action, depth); } - p = p->next; } -} -static inline void -handle_demerge(HSlist *engines, HLREngine *engine, const HLRAction *reduce) -{ - demerge(engines, engine, reduce, reduce->production.length); + bool run = h_lrengine_step(engine, action); + + if(run) { + // store engine in the array, merge if necessary + if(engines[engine->state] == NULL) + engines[engine->state] = engine; + else + engines[engine->state] = lrengine_merge(engines[engine->state], engine); + } else if(engine->state == HLR_SUCCESS) { + // save the result + *result = h_lrengine_result(engine); + } + + return run; } HParseResult 
*h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) @@ -129,43 +189,42 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HArena *arena = h_new_arena(mm__, 0); // will hold the results HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse - HSlist *engines = h_slist_new(tarena); - h_slist_push(engines, h_lrengine_new(arena, tarena, table, stream)); + // allocate engine arrays (can hold one engine per state) + // these are swapped each iteration + HLREngine **engines = h_arena_malloc(tarena, table->nrows * sizeof(HLREngine *)); + HLREngine **engback = h_arena_malloc(tarena, table->nrows * sizeof(HLREngine *)); + + assert(table->nrows > 0); + for(size_t i=0; inrows; i++) { + engines[i] = NULL; + engback[i] = NULL; + } + + // create initial engine + engines[0] = h_lrengine_new(arena, tarena, table, stream); + assert(engines[0]->state == 0); HParseResult *result = NULL; - while(result == NULL && !h_slist_empty(engines)) { - for(HSlistNode **x = &engines->head; *x; ) { - HLREngine *engine = (*x)->elem; - - // remove engine from list; it may come back in below - *x = (*x)->next; // advance x, removing the current element - - // drop those engines that have terminated - if(!engine->run) { - // check for parse success - HParseResult *res = h_lrengine_result(engine); - if(res) - result = res; + size_t engines_left = 1; + while(engines_left && result == NULL) { + engines_left = 0; + for(size_t i=0; inrows; i++) { + HLREngine *engine = engines[i]; + if(engine == NULL) continue; - } + engines[i] = NULL; // cleared for next iteration - const HLRAction *action = h_lrengine_action(engine); - - // handle forks and demerges (~> spawn engines) - if(action) { - if(action->type == HLR_CONFLICT) { - // fork engine on conflicts - action = handle_conflict(engines, engine, action->branches); - } else if(action->type == HLR_REDUCE) { - // demerge/respawn as needed - handle_demerge(engines, engine, action); - } - } - - 
h_lrengine_step(engine, action); - stow_engine(engines, engine); + // step all engines + bool run = glr_step(&result, engback, engine, h_lrengine_action(engine)); + if(run) + engines_left++; } + + // swap the arrays + HLREngine **tmp = engines; + engines = engback; + engback = tmp; } if(!result) @@ -184,13 +243,6 @@ HParserBackendVTable h__glr_backend_vtable = { -// XXX TODO -// - implement engine merging -// - triggered when two enter the same state -// - old stacks (/engines?) saved -// - new common suffix stack created -// - when rewinding (during reduce), watch for empty stack -> demerge - // dummy! int test_glr(void) diff --git a/src/backends/lr.c b/src/backends/lr.c index c481d29..4c89d19 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -206,11 +206,10 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, engine->table = table; engine->state = 0; - engine->run = true; engine->stack = h_slist_new(tarena); engine->input = *stream; - engine->merged = NULL; - engine->mp = NULL; + engine->merged[0] = NULL; + engine->merged[1] = NULL; engine->arena = arena; engine->tarena = tarena; @@ -267,7 +266,7 @@ static HParsedToken *consume_input(HLREngine *engine) } // run LR parser for one round; returns false when finished -static bool h_lrengine_step_(HLREngine *engine, const HLRAction *action) +bool h_lrengine_step(HLREngine *engine, const HLRAction *action) { // short-hand names HSlist *stack = engine->stack; @@ -329,8 +328,11 @@ static bool h_lrengine_step_(HLREngine *engine, const HLRAction *action) h_slist_push(stack, value); engine->state = shift->nextstate; - if(symbol == engine->table->start) - return false; // reduced to start symbol; accept! 
+ // check for success + if(engine->state == HLR_SUCCESS) { + assert(symbol == engine->table->start); + return false; + } } else { assert(action->type == HLR_SHIFT); HParsedToken *value = consume_input(engine); @@ -342,17 +344,12 @@ static bool h_lrengine_step_(HLREngine *engine, const HLRAction *action) return true; } -// run LR parser for one round; sets engine->run -void h_lrengine_step(HLREngine *engine, const HLRAction *action) -{ - engine->run = h_lrengine_step_(engine, action); -} - HParseResult *h_lrengine_result(HLREngine *engine) { - // parsing was successful iff after a shift the engine is back in state 0 - if(engine->state == 0 && !h_slist_empty(engine->stack)) { + // parsing was successful iff the engine reaches the end state + if(engine->state == HLR_SUCCESS) { // on top of the stack is the start symbol's semantic value + assert(!h_slist_empty(engine->stack)); HParsedToken *tok = engine->stack->head->elem; return make_result(engine->arena, tok); } else { @@ -371,8 +368,7 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HLREngine *engine = h_lrengine_new(arena, tarena, table, stream); // iterate engine to completion - while(engine->run) - h_lrengine_step(engine, h_lrengine_action(engine)); + while(h_lrengine_step(engine, h_lrengine_action(engine))); HParseResult *result = h_lrengine_result(engine); if(!result) @@ -464,7 +460,10 @@ void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action) { switch(action->type) { case HLR_SHIFT: - fprintf(f, "s%lu", action->nextstate); + if(action->nextstate == HLR_SUCCESS) + fputs("s~", f); + else + fprintf(f, "s%lu", action->nextstate); break; case HLR_REDUCE: fputs("r(", f); diff --git a/src/backends/lr.h b/src/backends/lr.h index ab48633..8f1eadd 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -69,18 +69,18 @@ typedef struct HLREnhGrammar_ { typedef struct HLREngine_ { const HLRTable *table; size_t state; - bool run; HSlist *stack; // holds pairs: (saved 
state, semantic value) HInputStream input; - struct HLREngine_ *merged; // ancestor merged into this engine at mp - HSlistNode *mp; // mergepoint: stack->head at time of merge + struct HLREngine_ *merged[2]; // ancestors merged into this engine HArena *arena; // will hold the results HArena *tarena; // tmp, deleted after parse } HLREngine; +#define HLR_SUCCESS ((size_t)~0) // parser end state + // XXX move to internal.h or something // XXX replace other hashtable iterations with this @@ -131,7 +131,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params); void h_lalr_free(HParser *parser); const HLRAction *h_lrengine_action(const HLREngine *engine); -void h_lrengine_step(HLREngine *engine, const HLRAction *action); +bool h_lrengine_step(HLREngine *engine, const HLRAction *action); HParseResult *h_lrengine_result(HLREngine *engine); HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); diff --git a/src/backends/lr0.c b/src/backends/lr0.c index 5add53a..1c86484 100644 --- a/src/backends/lr0.c +++ b/src/backends/lr0.c @@ -190,10 +190,8 @@ HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) // remember start symbol table->start = g->start; - // add dummy shift entry for the start symbol so h_lrengine_step can always - // find a shift. - // NB: nextstate=0 is used for the "victory condition" by h_lrengine_result. - put_shift(table, 0, g->start, 0); + // shift to the accepting end state for the start symbol + put_shift(table, 0, g->start, HLR_SUCCESS); // add shift entries for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { From 66809ceedad6e4ca4dfd445f166545713a444d0b Mon Sep 17 00:00:00 2001 From: "Sven M. 
Hallberg" Date: Sat, 22 Jun 2013 18:16:41 +0200 Subject: [PATCH 93/95] go back to storing engines in lists --- src/backends/glr.c | 63 +++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/src/backends/glr.c b/src/backends/glr.c index 34a5633..353d0e6 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -1,7 +1,7 @@ #include #include "lr.h" -static bool glr_step(HParseResult **result, HLREngine **engines, +static bool glr_step(HParseResult **result, HSlist *engines, HLREngine *engine, const HLRAction *action); @@ -73,7 +73,7 @@ static inline HLREngine *respawn(HLREngine *eng, HSlist *stack) } static HLREngine * -demerge(HParseResult **result, HLREngine **engines, +demerge(HParseResult **result, HSlist *engines, HLREngine *engine, const HLRAction *action, size_t depth) { // no-op on engines that are not merged @@ -126,7 +126,7 @@ HLREngine *fork_engine(const HLREngine *engine) } static const HLRAction * -handle_conflict(HParseResult **result, HLREngine **engines, +handle_conflict(HParseResult **result, HSlist *engines, const HLREngine *engine, const HSlist *branches) { // there should be at least two conflicting actions @@ -149,7 +149,7 @@ handle_conflict(HParseResult **result, HLREngine **engines, /* GLR driver */ -static bool glr_step(HParseResult **result, HLREngine **engines, +static bool glr_step(HParseResult **result, HSlist *engines, HLREngine *engine, const HLRAction *action) { // handle forks and demerges (~> spawn engines) @@ -167,11 +167,17 @@ static bool glr_step(HParseResult **result, HLREngine **engines, bool run = h_lrengine_step(engine, action); if(run) { - // store engine in the array, merge if necessary - if(engines[engine->state] == NULL) - engines[engine->state] = engine; - else - engines[engine->state] = lrengine_merge(engines[engine->state], engine); + // store engine in the list, merge if necessary + HSlistNode *x; + for(x=engines->head; x; x=x->next) { + HLREngine *eng = x->elem; 
+ if(eng->state == engine->state) { + x->elem = lrengine_merge(eng, engine); + break; + } + } + if(!x) // no merge happened + h_slist_push(engines, engine); } else if(engine->state == HLR_SUCCESS) { // save the result *result = h_lrengine_result(engine); @@ -189,40 +195,27 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HArena *arena = h_new_arena(mm__, 0); // will hold the results HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse - // allocate engine arrays (can hold one engine per state) + // allocate engine lists (will hold one engine per state) // these are swapped each iteration - HLREngine **engines = h_arena_malloc(tarena, table->nrows * sizeof(HLREngine *)); - HLREngine **engback = h_arena_malloc(tarena, table->nrows * sizeof(HLREngine *)); - - assert(table->nrows > 0); - for(size_t i=0; inrows; i++) { - engines[i] = NULL; - engback[i] = NULL; - } + HSlist *engines = h_slist_new(tarena); + HSlist *engback = h_slist_new(tarena); // create initial engine - engines[0] = h_lrengine_new(arena, tarena, table, stream); - assert(engines[0]->state == 0); + h_slist_push(engines, h_lrengine_new(arena, tarena, table, stream)); HParseResult *result = NULL; - size_t engines_left = 1; - while(engines_left && result == NULL) { - engines_left = 0; + while(result == NULL && !h_slist_empty(engines)) { + assert(h_slist_empty(engback)); - for(size_t i=0; inrows; i++) { - HLREngine *engine = engines[i]; - if(engine == NULL) - continue; - engines[i] = NULL; // cleared for next iteration - - // step all engines - bool run = glr_step(&result, engback, engine, h_lrengine_action(engine)); - if(run) - engines_left++; + // step all engines + while(!h_slist_empty(engines)) { + HLREngine *engine = h_slist_pop(engines); + const HLRAction *action = h_lrengine_action(engine); + glr_step(&result, engback, engine, action); } - // swap the arrays - HLREngine **tmp = engines; + // swap the lists + HSlist *tmp = engines; engines = engback; 
engback = tmp; } From 0d7e69767d746b7239434e5b78f8de9be95cb177 Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Sat, 22 Jun 2013 11:00:49 -0700 Subject: [PATCH 94/95] update README: all the backends are done --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 91ee36c..492950d 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,8 @@ Features * Parsing backends: * Packrat parsing * LL(k) - * GLR (not yet implemented) - * LALR(8) (not yet implemented) + * GLR + * LALR * Regular expressions * Language bindings: * C++ (not yet implemented) From aed1de5ce5df812311a9df5d1b9d84cbf1dfa3eb Mon Sep 17 00:00:00 2001 From: Dan Hirsch Date: Mon, 24 Jun 2013 21:46:23 +0200 Subject: [PATCH 95/95] Applied a bugfix from my local branch --- src/parsers/many.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parsers/many.c b/src/parsers/many.c index a095940..1e3b022 100644 --- a/src/parsers/many.c +++ b/src/parsers/many.c @@ -130,7 +130,7 @@ static bool many_ctrvm(HRVMProg *prog, void *env) { if (repeat->min_p) { h_rvm_insert_insn(prog, RVM_PUSH, 0); assert(repeat->count < 2); // TODO: The other cases should be supported later. - uint16_t end_fork; + uint16_t end_fork = 0xFFFF; // Shut up GCC if (repeat->count == 0) end_fork = h_rvm_insert_insn(prog, RVM_FORK, 0xFFFF); uint16_t goto_mid = h_rvm_insert_insn(prog, RVM_GOTO, 0xFFFF); @@ -145,7 +145,8 @@ static bool many_ctrvm(HRVMProg *prog, void *env) { if (!h_compile_regex(prog, repeat->p)) return false; h_rvm_insert_insn(prog, RVM_FORK, nxt); - h_rvm_patch_arg(prog, end_fork, h_rvm_get_ip(prog)); + if (repeat->count == 0) + h_rvm_patch_arg(prog, end_fork, h_rvm_get_ip(prog)); h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_make_sequence, NULL)); return true;