hammer/src/backends/lalr.c
2015-09-16 23:42:18 +02:00

392 lines
11 KiB
C

#include <assert.h>
#include "contextfree.h"
#include "lr.h"
/* LALR-via-SLR grammar transformation */
/*
 * Number of slots occupied by a NULL-terminated array of pointers,
 * *including* the slot that holds the terminating NULL.
 *
 * p_ is treated as a `void **`; it must point at such an array.
 */
static inline size_t seqsize(void *p_)
{
  void **cursor = p_;
  size_t slots = 1;   // the terminator itself always takes one slot

  while (*cursor != NULL) {
    slots++;
    cursor++;
  }
  return slots;
}
/*
 * Fetch the table entry for `symbol` in row `state`.
 *
 *  - HCF_END:  the end branch of the row's terminal map
 *  - HCF_CHAR: the terminal map entry keyed by the single byte symbol->chr
 *  - anything else is treated as a nonterminal and looked up in ntmap
 *
 * Returns whatever the underlying map lookup yields.
 */
static HLRAction *
lrtable_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol)
{
  if (symbol->type == HCF_END) {
    return table->tmap[state]->end_branch;
  }
  if (symbol->type == HCF_CHAR) {
    return h_stringmap_get(table->tmap[state], &symbol->chr, 1, false);
  }
  // nonterminal case
  return h_hashtable_get(table->ntmap[state], symbol);
}
/*
 * State reached from state `x` by shifting symbol `A`.
 *
 * The table must hold a shift action for (x, A); both conditions are
 * enforced with assert().
 */
static size_t follow_transition(const HLRTable *table, size_t x, HCFChoice *A)
{
  HLRAction *shift = lrtable_lookup(table, x, A);

  assert(shift != NULL);
  assert(shift->type == HLR_SHIFT);

  return shift->nextstate;
}
/*
 * Allocate a transition record (x --A--> y) from `arena` and return it.
 */
static inline HLRTransition *transition(HArena *arena,
                                        size_t x, const HCFChoice *A, size_t y)
{
  HLRTransition *edge = h_arena_malloc(arena, sizeof *edge);

  edge->to = y;
  edge->symbol = A;
  edge->from = x;

  return edge;
}
/*
 * Replace the productions of enhanced symbol `xAy` by their enhanced
 * counterparts: every right-hand side is traced through the LR table
 * starting in state `x`, and each symbol B met on a transition i -> j is
 * substituted by the enhanced symbol registered for (i, B, j) in eg->tmap.
 *
 * Only HCF_CHOICE symbols are touched; this is a no-op on terminal symbols.
 * NB: there is also nothing to do for the quasi-terminal CHARSET, which
 * carries no list of right-hand sides.
 */
static void transform_productions(const HLRTable *table, HLREnhGrammar *eg,
                                  size_t x, HCFChoice *xAy)
{
  if (xAy->type != HCF_CHOICE) {
    return;
  }

  HArena *arena = eg->arena;
  HCFSequence **old_prods = xAy->seq;
  HCFSequence **new_prods =
    h_arena_malloc(arena, seqsize(old_prods) * sizeof(HCFSequence *));

  size_t n = 0;
  for (; old_prods[n] != NULL; n++) {
    HCFChoice **rhs = old_prods[n]->items;
    HCFChoice **enh_rhs =
      h_arena_malloc(arena, seqsize(rhs) * sizeof(HCFChoice *));

    // walk the rhs, starting in state x and following the transitions
    size_t from = x;
    size_t k = 0;
    for (; rhs[k] != NULL; k++) {
      size_t to = follow_transition(table, from, rhs[k]);
      HLRTransition *key = transition(arena, from, rhs[k], to);

      // every transition taken must already have an enhanced symbol
      enh_rhs[k] = h_hashtable_get(eg->tmap, key);
      assert(enh_rhs[k] != NULL);

      from = to;
    }
    enh_rhs[k] = NULL;

    new_prods[n] = h_arena_malloc(arena, sizeof(HCFSequence));
    new_prods[n]->items = enh_rhs;
  }
  new_prods[n] = NULL;

  xAy->seq = new_prods;
}
/*
 * Create an enhanced symbol: a shallow copy of `sym` allocated from the
 * enhanced grammar's arena.
 *
 * The copy is also recorded in eg->corr, which maps each original symbol
 * to the set of enhanced symbols derived from it; the set is created on
 * first use.
 */
static HCFChoice *new_enhanced_symbol(HLREnhGrammar *eg, const HCFChoice *sym)
{
  HArena *arena = eg->arena;

  HCFChoice *copy = h_arena_malloc(arena, sizeof *copy);
  *copy = *sym;

  HHashSet *variants = h_hashtable_get(eg->corr, sym);
  if (variants == NULL) {
    variants = h_hashset_new(arena, h_eq_symbol, h_hash_symbol);
    h_hashtable_put(eg->corr, sym, variants);
  }
  h_hashset_put(variants, copy);

  return copy;
}
static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRDFA *dfa,
const HLRTable *table)
{
HAllocator *mm__ = g->mm__;
HArena *arena = g->arena;
HLREnhGrammar *eg = h_arena_malloc(arena, sizeof(HLREnhGrammar));
eg->tmap = h_hashtable_new(arena, h_eq_transition, h_hash_transition);
eg->smap = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr);
eg->corr = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol);
// XXX must use h_eq/hash_ptr for symbols! so enhanced CHARs are different
eg->arena = arena;
// establish mapping between transitions and symbols
for(HSlistNode *x=dfa->transitions->head; x; x=x->next) {
HLRTransition *t = x->elem;
assert(!h_hashtable_present(eg->tmap, t));
HCFChoice *sym = new_enhanced_symbol(eg, t->symbol);
h_hashtable_put(eg->tmap, t, sym);
h_hashtable_put(eg->smap, sym, t);
}
// transform the productions
H_FOREACH(eg->tmap, HLRTransition *t, HCFChoice *sym)
transform_productions(table, eg, t->from, sym);
H_END_FOREACH
// add the start symbol
HCFChoice *start = new_enhanced_symbol(eg, g->start);
transform_productions(table, eg, 0, start);
eg->grammar = h_cfgrammar_(mm__, start);
return eg;
}
/* LALR table generation */
/*
 * True iff the table's list of inadequate states is non-empty.
 */
static inline bool has_conflicts(HLRTable *table)
{
  if (h_slist_empty(table->inadeq)) {
    return false;
  }
  return true;
}
/*
 * For each lookahead symbol in `fs`, put `action` into the matching cell
 * of `tmap`, descending recursively through the character branches.
 *
 * A cell that already holds a different action is replaced by the result
 * of h_lr_conflict(previous, action).
 *
 * Returns 0 on success, -1 if any cell ended up in conflict.
 * Ignores forall entries.
 */
static int terminals_put(HStringMap *tmap, const HStringMap *fs, HLRAction *action)
{
  int status = 0;

  if (fs->epsilon_branch) {
    HLRAction *old = tmap->epsilon_branch;
    if (!old || old == action) {
      tmap->epsilon_branch = action;
    } else {
      // conflict
      tmap->epsilon_branch = h_lr_conflict(tmap->arena, old, action);
      status = -1;
    }
  }

  if (fs->end_branch) {
    HLRAction *old = tmap->end_branch;
    if (!old || old == action) {
      tmap->end_branch = action;
    } else {
      // conflict
      tmap->end_branch = h_lr_conflict(tmap->arena, old, action);
      status = -1;
    }
  }

  H_FOREACH(fs->char_branches, void *key, HStringMap *sub_fs)
    // mirror the branch in tmap, creating it on demand
    HStringMap *sub_tmap = h_hashtable_get(tmap->char_branches, key);
    if (sub_tmap == NULL) {
      sub_tmap = h_stringmap_new(tmap->arena);
      h_hashtable_put(tmap->char_branches, key, sub_tmap);
    }

    if (terminals_put(sub_tmap, sub_fs, action) < 0) {
      status = -1;
    }
  H_END_FOREACH

  return status;
}
/*
 * Check whether a sequence of enhanced-grammar symbols (p) matches the given
 * (original-grammar) production rhs and terminates in the given end state.
 *
 * Both p and rhs are NULL-terminated.  Every enhanced symbol in p must have
 * an entry in eg->smap (asserted).
 */
static bool match_production(HLREnhGrammar *eg, HCFChoice **p,
                             HCFChoice **rhs, size_t endstate)
{
  // starts out as endstate so that an empty rhs trivially "ends" there
  size_t reached = endstate;

  while (*p != NULL && *rhs != NULL) {
    HLRTransition *edge = h_hashtable_get(eg->smap, *p);
    assert(edge != NULL);

    if (!h_eq_symbol(edge->symbol, *rhs)) {
      return false;
    }
    reached = edge->to;

    p++;
    rhs++;
  }

  // at least one sequence is exhausted; they match only if both are
  if (*p != *rhs) {
    return false;
  }
  return reached == endstate;
}
// desugar parser with a fresh start symbol
// this guarantees that the start symbol will not occur in any productions
//
// The fresh symbol is a choice with a single alternative: a sequence holding
// just the desugared `parser`. Its reshape action is set to h_act_first.
//
// Returns the newly allocated symbol (allocated via h_new, i.e. through mm__).
// NOTE(review): who eventually frees `augmented` is not visible here - confirm.
HCFChoice *h_desugar_augmented(HAllocator *mm__, HParser *parser)
{
HCFChoice *augmented = h_new(HCFChoice, 1);
// NOTE: the HCFS_* macros below are assumed to pick up the local names
// `stk__` and `mm__` implicitly - do not rename them (confirm in the macro
// definitions).
HCFStack *stk__ = h_cfstack_new(mm__);
// presumably makes the outermost choice built below land in `augmented`
// instead of a freshly allocated node - confirm against HCFS_BEGIN_CHOICE
stk__->prealloc = augmented;
HCFS_BEGIN_CHOICE() {
HCFS_BEGIN_SEQ() {
HCFS_DESUGAR(parser);
} HCFS_END_SEQ();
HCFS_THIS_CHOICE->reshape = h_act_first;
} HCFS_END_CHOICE();
// the stack is only scaffolding for the construction above
h_cfstack_free(mm__, stk__);
return augmented;
}
/*
 * Compile `parser` into an LALR table.
 *
 * Steps:
 *   1. generate the (augmented) CFG from the parser
 *   2. construct the LR(0) DFA
 *   3. build the LR(0) table
 *   4. if necessary, resolve conflicts "by conversion to SLR": build the
 *      enhanced grammar and use the follow sets of its symbols as lookahead
 *      for the reductions in each inadequate state
 *
 * mm__   - allocator used for the grammar objects
 * parser - parser to compile; on reaching the end of this function its
 *          backend_data is set to the generated table
 * params - unused
 *
 * Returns 0 on success and -1 on failure.  Note that when conflicts remain
 * after step 4 the table is still stored in parser->backend_data and -1 is
 * returned.
 */
int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params)
{
  if (!parser->vtable->isValidCF(parser->env)) {
    return -1;
  }

  HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, parser));
  if (g == NULL) {      // backend not suitable (language not context-free)
    return -1;
  }

  HLRDFA *dfa = h_lr0_dfa(g);
  if (dfa == NULL) {    // this should normally not happen
    h_cfgrammar_free(g);
    return -1;
  }

  HLRTable *table = h_lr0_table(g, dfa);
  if (table == NULL) {  // this should normally not happen
    h_cfgrammar_free(g);
    return -1;
  }

  if (has_conflicts(table)) {
    HArena *arena = table->arena;

    HLREnhGrammar *eg = enhance_grammar(g, dfa, table);
    if (eg == NULL) {   // this should normally not happen
      h_cfgrammar_free(g);
      h_lrtable_free(table);
      return -1;
    }

    // go through the inadequate states; replace inadeq with a new list
    HSlist *old_inadeq = table->inadeq;
    table->inadeq = h_slist_new(arena);

    for (HSlistNode *x = old_inadeq->head; x; x = x->next) {
      size_t state = (uintptr_t)x->elem;
      bool still_inadeq = false;

      // clear old forall entry, it's being replaced by more fine-grained ones
      table->forall[state] = NULL;

      // go through each reducible item of state
      H_FOREACH_KEY(dfa->states[state], HLRItem *item)
        if (item->mark < item->len)
          continue;

        // action to place in the table cells indicated by lookahead
        HLRAction *action = h_reduce_action(arena, item);

        // find all LR(0)-enhanced productions matching item
        HHashSet *lhss = h_hashtable_get(eg->corr, item->lhs);
        assert(lhss != NULL);
        H_FOREACH_KEY(lhss, HCFChoice *lhs)
          assert(lhs->type == HCF_CHOICE || lhs->type == HCF_CHARSET);

          bool match = false;
          if (lhs->type == HCF_CHOICE) {
            for (HCFSequence **p = lhs->seq; *p; p++) {
              HCFChoice **rhs = (*p)->items;
              if (match_production(eg, rhs, item->rhs, state)) {
                match = true;
                break;
              }
            }
          } else {      // HCF_CHARSET
            // a charset "production" has exactly one CHAR on its rhs
            assert(item->rhs[0] != NULL);
            assert(item->rhs[1] == NULL);
            assert(item->rhs[0]->type == HCF_CHAR);
            HLRTransition *t = h_hashtable_get(eg->smap, lhs);
            assert(t != NULL);
            match = (t->to == state
                     && charset_isset(lhs->charset, item->rhs[0]->chr));
          }

          if (match) {
            // the left-hand symbol's follow set is this production's
            // contribution to the lookahead
            const HStringMap *fs = h_follow(1, eg->grammar, lhs);
            assert(fs != NULL);
            assert(fs->epsilon_branch == NULL);
            assert(!h_stringmap_empty(fs));

            // for each lookahead symbol, put action into table cell
            if (terminals_put(table->tmap[state], fs, action) < 0)
              still_inadeq = true;
          }
        H_END_FOREACH   // enhanced production
      H_END_FOREACH     // reducible item

      if (still_inadeq) {
        h_slist_push(table->inadeq, (void *)(uintptr_t)state);
      }
    }

    // The enhanced grammar object was created by h_cfgrammar_() in
    // enhance_grammar() and is only needed for the follow-set queries above;
    // release it so it does not leak.  terminals_put() allocates its map
    // nodes from the table's own maps' arenas and stores only `action`.
    // NOTE(review): assumes the branch keys copied from the follow sets are
    // plain values, not pointers into the enhanced grammar - confirm.
    h_cfgrammar_free(eg->grammar);
  }

  h_cfgrammar_free(g);
  parser->backend_data = table;
  return has_conflicts(table)? -1 : 0;
}
/*
 * Release the LR table attached to `parser` and reset the parser:
 * backend_data becomes NULL and the backend is set back to PB_PACKRAT.
 */
void h_lalr_free(HParser *parser)
{
  h_lrtable_free(parser->backend_data);

  parser->backend = PB_PACKRAT;
  parser->backend_data = NULL;
}
/* Entry points of the LALR backend. */
HParserBackendVTable h__lalr_backend_vtable = {
  /* table construction */
  .compile = h_lalr_compile,
  /* parsing is delegated to h_lr_parse */
  .parse   = h_lr_parse,
  /* teardown */
  .free    = h_lalr_free,
};
// dummy!
//
// Ad-hoc smoke test: builds a small expression grammar, prints its LR info
// and LALR table to stdout, then parses one sample input.
//
// Returns 0 when it runs to the end (whether or not the sample parses),
// 1 if h_pprint_lr_info yields no grammar, 2 if LALR compilation fails.
int test_lalr(void)
{
  /*
     E -> E '-' T
        | T
     T -> '(' E ')'
        | 'n'               -- also try [0-9] for the charset paths
  */
  HParser *num    = h_ch('n');
  HParser *expr   = h_indirect();
  HParser *parens = h_sequence(h_ch('('), expr, h_ch(')'), NULL);
  HParser *term   = h_choice(parens, num, NULL);
  HParser *minus  = h_sequence(expr, h_ch('-'), term, NULL);
  HParser *expr_  = h_choice(minus, term, NULL);
  h_bind_indirect(expr, expr_);

  HCFGrammar *g = h_pprint_lr_info(stdout, expr);
  if (g == NULL) {
    return 1;
  }

  fprintf(stdout, "\n==== L A L R T A B L E ====\n");
  if (h_compile(expr, PB_LALR, NULL) != 0) {
    fprintf(stdout, "does not compile\n");
    return 2;
  }
  h_pprint_lrtable(stdout, g, (HLRTable *)expr->backend_data, 0);

  fprintf(stdout, "\n==== P A R S E R E S U L T ====\n");
  HParseResult *res = h_parse(expr, (uint8_t *)"n-(n-((n)))-n", 13);
  if (!res) {
    fprintf(stdout, "no parse\n");
  } else {
    h_pprint(stdout, res->ast, 0, 2);
  }

  return 0;
}