395 lines
11 KiB
C
395 lines
11 KiB
C
#include <assert.h>
|
|
#include "contextfree.h"
|
|
#include "lr.h"
|
|
|
|
|
|
|
|
/* LALR-via-SLR grammar transformation */
|
|
|
|
/*
 * Number of slots occupied by a NULL-terminated array of pointers, counting
 * the terminating NULL itself (so an empty sequence yields 1).
 *
 * p_ points at the first element of the array.
 */
static inline size_t seqsize(void *p_)
{
  void **cursor = p_;
  size_t slots = 1;     // the terminator always takes one slot

  while (*cursor != NULL) {
    slots++;
    cursor++;
  }
  return slots;
}
|
|
|
|
static HLRAction *
|
|
lrtable_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol)
|
|
{
|
|
switch(symbol->type) {
|
|
case HCF_END:
|
|
return table->tmap[state]->end_branch;
|
|
case HCF_CHAR:
|
|
return h_stringmap_get(table->tmap[state], &symbol->chr, 1, false);
|
|
default:
|
|
// nonterminal case
|
|
return h_hashtable_get(table->ntmap[state], symbol);
|
|
}
|
|
}
|
|
|
|
/*
 * Return the state reached from state `x` on symbol `A`.
 *
 * The table entry for (x, A) must exist and must be a shift action; both
 * conditions are enforced with assertions.
 */
static size_t follow_transition(const HLRTable *table, size_t x, HCFChoice *A)
{
  HLRAction *shift = lrtable_lookup(table, x, A);

  assert(shift != NULL);
  assert(shift->type == HLR_SHIFT);

  return shift->nextstate;
}
|
|
|
|
/*
 * Allocate an HLRTransition in `arena` describing the edge x --A--> y.
 */
static inline HLRTransition *transition(HArena *arena,
                                        size_t x, const HCFChoice *A, size_t y)
{
  HLRTransition *edge = h_arena_malloc(arena, sizeof *edge);

  edge->to = y;
  edge->symbol = A;
  edge->from = x;

  return edge;
}
|
|
|
|
// no-op on terminal symbols
|
|
static void transform_productions(const HLRTable *table, HLREnhGrammar *eg,
|
|
size_t x, HCFChoice *xAy)
|
|
{
|
|
if (xAy->type != HCF_CHOICE) {
|
|
return;
|
|
}
|
|
// NB: nothing to do on quasi-terminal CHARSET which carries no list of rhs's
|
|
|
|
HArena *arena = eg->arena;
|
|
|
|
HCFSequence **seq = h_arena_malloc(arena, seqsize(xAy->seq)
|
|
* sizeof(HCFSequence *));
|
|
HCFSequence **p, **q;
|
|
for(p=xAy->seq, q=seq; *p; p++, q++) {
|
|
// trace rhs starting in state x and following the transitions
|
|
// xAy -> ... iBj ...
|
|
|
|
size_t i = x;
|
|
HCFChoice **B = (*p)->items;
|
|
HCFChoice **items = h_arena_malloc(arena, seqsize(B) * sizeof(HCFChoice *));
|
|
HCFChoice **iBj = items;
|
|
for(; *B; B++, iBj++) {
|
|
size_t j = follow_transition(table, i, *B);
|
|
HLRTransition *i_B_j = transition(arena, i, *B, j);
|
|
*iBj = h_hashtable_get(eg->tmap, i_B_j);
|
|
assert(*iBj != NULL);
|
|
i = j;
|
|
}
|
|
*iBj = NULL;
|
|
|
|
*q = h_arena_malloc(arena, sizeof(HCFSequence));
|
|
(*q)->items = items;
|
|
}
|
|
*q = NULL;
|
|
xAy->seq = seq;
|
|
}
|
|
|
|
/*
 * Create a fresh copy of `sym` in eg->arena and record it in eg->corr as one
 * of the enhanced symbols corresponding to `sym`.
 *
 * The set of enhanced symbols for `sym` is created on first use.
 * Returns the new copy.
 */
static HCFChoice *new_enhanced_symbol(HLREnhGrammar *eg, const HCFChoice *sym)
{
  HArena *arena = eg->arena;

  HCFChoice *copy = h_arena_malloc(arena, sizeof *copy);
  *copy = *sym;

  HHashSet *siblings = h_hashtable_get(eg->corr, sym);
  if (siblings == NULL) {
    // first enhanced version of this symbol: start a new set for it
    siblings = h_hashset_new(arena, h_eq_symbol, h_hash_symbol);
    h_hashtable_put(eg->corr, sym, siblings);
  }
  h_hashset_put(siblings, copy);

  return copy;
}
|
|
|
|
static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRDFA *dfa,
|
|
const HLRTable *table)
|
|
{
|
|
HAllocator *mm__ = g->mm__;
|
|
HArena *arena = g->arena;
|
|
|
|
HLREnhGrammar *eg = h_arena_malloc(arena, sizeof(HLREnhGrammar));
|
|
eg->tmap = h_hashtable_new(arena, h_eq_transition, h_hash_transition);
|
|
eg->smap = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr);
|
|
eg->corr = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol);
|
|
// XXX must use h_eq/hash_ptr for symbols! so enhanced CHARs are different
|
|
eg->arena = arena;
|
|
|
|
// establish mapping between transitions and symbols
|
|
for(HSlistNode *x=dfa->transitions->head; x; x=x->next) {
|
|
HLRTransition *t = x->elem;
|
|
|
|
assert(!h_hashtable_present(eg->tmap, t));
|
|
|
|
HCFChoice *sym = new_enhanced_symbol(eg, t->symbol);
|
|
h_hashtable_put(eg->tmap, t, sym);
|
|
h_hashtable_put(eg->smap, sym, t);
|
|
}
|
|
|
|
// transform the productions
|
|
H_FOREACH(eg->tmap, HLRTransition *t, HCFChoice *sym)
|
|
transform_productions(table, eg, t->from, sym);
|
|
H_END_FOREACH
|
|
|
|
// add the start symbol
|
|
HCFChoice *start = new_enhanced_symbol(eg, g->start);
|
|
transform_productions(table, eg, 0, start);
|
|
|
|
eg->grammar = h_cfgrammar_(mm__, start);
|
|
return eg;
|
|
}
|
|
|
|
|
|
|
|
/* LALR table generation */
|
|
|
|
/*
 * A table has conflicts iff its list of inadequate states is non-empty.
 */
static inline bool has_conflicts(HLRTable *table)
{
  if (h_slist_empty(table->inadeq)) {
    return false;
  }
  return true;
}
|
|
|
|
// for each lookahead symbol (fs), put action into tmap
|
|
// returns 0 on success, -1 on conflict
|
|
// ignores forall entries
|
|
static int terminals_put(HStringMap *tmap, const HStringMap *fs, HLRAction *action)
|
|
{
|
|
int ret = 0;
|
|
|
|
if (fs->epsilon_branch) {
|
|
HLRAction *prev = tmap->epsilon_branch;
|
|
if (prev && prev != action) {
|
|
// conflict
|
|
tmap->epsilon_branch = h_lr_conflict(tmap->arena, prev, action);
|
|
ret = -1;
|
|
} else {
|
|
tmap->epsilon_branch = action;
|
|
}
|
|
}
|
|
|
|
if (fs->end_branch) {
|
|
HLRAction *prev = tmap->end_branch;
|
|
if (prev && prev != action) {
|
|
// conflict
|
|
tmap->end_branch = h_lr_conflict(tmap->arena, prev, action);
|
|
ret = -1;
|
|
} else {
|
|
tmap->end_branch = action;
|
|
}
|
|
}
|
|
|
|
H_FOREACH(fs->char_branches, void *key, HStringMap *fs_)
|
|
HStringMap *tmap_ = h_hashtable_get(tmap->char_branches, key);
|
|
|
|
if (!tmap_) {
|
|
tmap_ = h_stringmap_new(tmap->arena);
|
|
h_hashtable_put(tmap->char_branches, key, tmap_);
|
|
}
|
|
|
|
if (terminals_put(tmap_, fs_, action) < 0) {
|
|
ret = -1;
|
|
}
|
|
H_END_FOREACH
|
|
|
|
return ret;
|
|
}
|
|
|
|
// check whether a sequence of enhanced-grammar symbols (p) matches the given
|
|
// (original-grammar) production rhs and terminates in the given end state.
|
|
static bool match_production(HLREnhGrammar *eg, HCFChoice **p,
|
|
HCFChoice **rhs, size_t endstate)
|
|
{
|
|
size_t state = endstate; // initialized to end in case of empty rhs
|
|
for(; *p && *rhs; p++, rhs++) {
|
|
HLRTransition *t = h_hashtable_get(eg->smap, *p);
|
|
assert(t != NULL);
|
|
if (!h_eq_symbol(t->symbol, *rhs)) {
|
|
return false;
|
|
}
|
|
state = t->to;
|
|
}
|
|
return (*p == *rhs // both NULL
|
|
&& state == endstate);
|
|
}
|
|
|
|
// desugar parser with a fresh start symbol
|
|
// this guarantees that the start symbol will not occur in any productions
|
|
HCFChoice *h_desugar_augmented(HAllocator *mm__, HParser *parser)
|
|
{
|
|
HCFChoice *augmented = h_new(HCFChoice, 1);
|
|
|
|
HCFStack *stk__ = h_cfstack_new(mm__);
|
|
stk__->prealloc = augmented;
|
|
HCFS_BEGIN_CHOICE() {
|
|
HCFS_BEGIN_SEQ() {
|
|
HCFS_DESUGAR(parser);
|
|
} HCFS_END_SEQ();
|
|
HCFS_THIS_CHOICE->reshape = h_act_first;
|
|
} HCFS_END_CHOICE();
|
|
h_cfstack_free(mm__, stk__);
|
|
|
|
return augmented;
|
|
}
|
|
|
|
/*
 * Compile `parser` for the LALR backend.
 *
 * Steps:
 *   - generate (augmented) CFG from parser
 *   - construct LR(0) DFA
 *   - build LR(0) table
 *   - if necessary, resolve conflicts "by conversion to SLR": build the
 *     enhanced grammar and, for every inadequate state, replace its forall
 *     entry by reduce actions placed under the follow sets of the matching
 *     enhanced left-hand symbols
 *
 * On return the table is stored in parser->backend_data, including when
 * conflicts remain.  `params` is unused.
 *
 * Returns 0 on success and -1 if the parser is not valid for a context-free
 * backend, if any construction step fails, or if conflicts remain.
 */
int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params)
{
  if (!parser->vtable->isValidCF(parser->env)) {
    return -1;
  }
  HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, parser));
  if(g == NULL)     // backend not suitable (language not context-free)
    return -1;

  HLRDFA *dfa = h_lr0_dfa(g);
  if (dfa == NULL) {     // this should normally not happen
    h_cfgrammar_free(g);
    return -1;
  }

  HLRTable *table = h_lr0_table(g, dfa);
  if (table == NULL) {   // this should normally not happen
    h_cfgrammar_free(g);
    return -1;
  }

  if(has_conflicts(table)) {
    HArena *arena = table->arena;

    HLREnhGrammar *eg = enhance_grammar(g, dfa, table);
    // this should normally not happen; a missing eg->grammar is treated the
    // same way because the follow sets below are computed from it
    if(eg == NULL || eg->grammar == NULL) {
      h_cfgrammar_free(g);
      h_lrtable_free(table);
      return -1;
    }

    // go through the inadequate states; replace inadeq with a new list
    HSlist *old_inadeq = table->inadeq;
    table->inadeq = h_slist_new(arena);

    for(HSlistNode *x=old_inadeq->head; x; x=x->next) {
      size_t state = (uintptr_t)x->elem;
      bool still_inadeq = false;  // set if any cell of this row conflicts

      // clear old forall entry, it's being replaced by more fine-grained ones
      table->forall[state] = NULL;

      // go through each reducible item of state
      H_FOREACH_KEY(dfa->states[state], HLRItem *item)
        if(item->mark < item->len)
          continue;

        // action to place in the table cells indicated by lookahead
        HLRAction *action = h_reduce_action(arena, item);

        // find all LR(0)-enhanced productions matching item
        HHashSet *lhss = h_hashtable_get(eg->corr, item->lhs);
        assert(lhss != NULL);
        H_FOREACH_KEY(lhss, HCFChoice *lhs)
          assert(lhs->type == HCF_CHOICE || lhs->type == HCF_CHARSET);

          bool match = false;
          if(lhs->type == HCF_CHOICE) {
            for(HCFSequence **p=lhs->seq; *p; p++) {
              HCFChoice **rhs = (*p)->items;
              if(match_production(eg, rhs, item->rhs, state)) {
                match = true;
                break;
              }
            }
          } else {      // HCF_CHARSET
            // a charset item reduces exactly one character
            assert(item->rhs[0] != NULL);
            assert(item->rhs[1] == NULL);
            assert(item->rhs[0]->type == HCF_CHAR);
            HLRTransition *t = h_hashtable_get(eg->smap, lhs);
            assert(t != NULL);
            match = (t->to == state
                     && charset_isset(lhs->charset, item->rhs[0]->chr));
          }

          if(match) {
            // the left-hand symbol's follow set is this production's
            // contribution to the lookahead
            const HStringMap *fs = h_follow(1, eg->grammar, lhs);
            assert(fs != NULL);
            assert(fs->epsilon_branch == NULL);
            assert(!h_stringmap_empty(fs));

            // for each lookahead symbol, put action into table cell
            if(terminals_put(table->tmap[state], fs, action) < 0)
              still_inadeq = true;
          }
        H_END_FOREACH // enhanced production
      H_END_FOREACH  // reducible item

      if(still_inadeq) {
        h_slist_push(table->inadeq, (void *)(uintptr_t)state);
      }
    }

    // The enhanced grammar was only needed for its follow sets.  It was
    // created by h_cfgrammar_() just like g, so it has to be released the
    // same way; nothing else holds on to it once this block ends.
    h_cfgrammar_free(eg->grammar);
  }

  h_cfgrammar_free(g);
  parser->backend_data = table;
  return has_conflicts(table)? -1 : 0;
}
|
|
|
|
/*
 * Release the LR table held in parser->backend_data, clear that pointer and
 * set parser->backend to PB_PACKRAT.
 */
void h_lalr_free(HParser *parser)
{
  h_lrtable_free(parser->backend_data);

  parser->backend_data = NULL;
  parser->backend = PB_PACKRAT;
}
|
|
|
|
|
|
|
|
HParserBackendVTable h__lalr_backend_vtable = {
|
|
.compile = h_lalr_compile,
|
|
.parse = h_lr_parse,
|
|
.free = h_lalr_free,
|
|
.parse_start = h_lr_parse_start,
|
|
.parse_chunk = h_lr_parse_chunk,
|
|
.parse_finish = h_lr_parse_finish
|
|
};
|
|
|
|
|
|
|
|
|
|
// dummy!
|
|
int test_lalr(void)
|
|
{
|
|
/*
|
|
E -> E '-' T
|
|
| T
|
|
T -> '(' E ')'
|
|
| 'n' -- also try [0-9] for the charset paths
|
|
*/
|
|
|
|
HParser *n = h_ch('n');
|
|
HParser *E = h_indirect();
|
|
HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), n, NULL);
|
|
HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL);
|
|
h_bind_indirect(E, E_);
|
|
HParser *p = E;
|
|
|
|
HCFGrammar *g = h_pprint_lr_info(stdout, p);
|
|
if(!g)
|
|
return 1;
|
|
|
|
fprintf(stdout, "\n==== L A L R T A B L E ====\n");
|
|
if (h_compile(p, PB_LALR, NULL)) {
|
|
fprintf(stdout, "does not compile\n");
|
|
return 2;
|
|
}
|
|
h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0);
|
|
|
|
fprintf(stdout, "\n==== P A R S E R E S U L T ====\n");
|
|
HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13);
|
|
if (res) {
|
|
h_pprint(stdout, res->ast, 0, 2);
|
|
} else {
|
|
fprintf(stdout, "no parse\n");
|
|
}
|
|
|
|
return 0;
|
|
}
|