Merge remote-tracking branch 'tq/master' into LL such that it compiles
Conflicts: src/Makefile src/backends/packrat.c src/compile.c src/hammer.h src/internal.h src/parsers/action.c src/parsers/and.c src/parsers/attr_bool.c src/parsers/bits.c src/parsers/butnot.c src/parsers/ch.c src/parsers/charset.c src/parsers/choice.c src/parsers/difference.c src/parsers/end.c src/parsers/epsilon.c src/parsers/ignore.c src/parsers/ignoreseq.c src/parsers/indirect.c src/parsers/int_range.c src/parsers/many.c src/parsers/not.c src/parsers/nothing.c src/parsers/optional.c src/parsers/sequence.c src/parsers/token.c src/parsers/unimplemented.c src/parsers/whitespace.c src/parsers/xor.c
This commit is contained in:
commit
c64a4e435e
46 changed files with 1289 additions and 263 deletions
|
|
@ -3,22 +3,23 @@
|
|||
#include "../cfgrammar.h"
|
||||
#include "../parsers/parser_internal.h"
|
||||
|
||||
// XXX despite the names, this is all LL(1) right now. TODO
|
||||
|
||||
|
||||
/* Generating the LL parse table */
|
||||
/* Generating the LL(k) parse table */
|
||||
|
||||
/* Maps each nonterminal (HCFChoice) of the grammar to another hash table that
|
||||
* maps lookahead tokens (HCFToken) to productions (HCFSequence).
|
||||
*/
|
||||
typedef struct HLLTable_ {
|
||||
typedef struct HLLkTable_ {
|
||||
HHashTable *rows;
|
||||
HCFChoice *start; // start symbol
|
||||
HArena *arena;
|
||||
HAllocator *mm__;
|
||||
} HLLTable;
|
||||
} HLLkTable;
|
||||
|
||||
/* Interface to look up an entry in the parse table. */
|
||||
const HCFSequence *h_ll_lookup(const HLLTable *table, const HCFChoice *x, HCFToken tok)
|
||||
const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, HCFToken tok)
|
||||
{
|
||||
const HHashTable *row = h_hashtable_get(table->rows, x);
|
||||
assert(row != NULL); // the table should have one row for each nonterminal
|
||||
|
|
@ -28,7 +29,7 @@ const HCFSequence *h_ll_lookup(const HLLTable *table, const HCFChoice *x, HCFTok
|
|||
}
|
||||
|
||||
/* Allocate a new parse table. */
|
||||
HLLTable *h_lltable_new(HAllocator *mm__)
|
||||
HLLkTable *h_llktable_new(HAllocator *mm__)
|
||||
{
|
||||
// NB the parse table gets an arena separate from the grammar so we can free
|
||||
// the latter after table generation.
|
||||
|
|
@ -37,7 +38,7 @@ HLLTable *h_lltable_new(HAllocator *mm__)
|
|||
HHashTable *rows = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr);
|
||||
assert(rows != NULL);
|
||||
|
||||
HLLTable *table = h_new(HLLTable, 1);
|
||||
HLLkTable *table = h_new(HLLkTable, 1);
|
||||
assert(table != NULL);
|
||||
table->mm__ = mm__;
|
||||
table->arena = arena;
|
||||
|
|
@ -46,7 +47,7 @@ HLLTable *h_lltable_new(HAllocator *mm__)
|
|||
return table;
|
||||
}
|
||||
|
||||
void h_lltable_free(HLLTable *table)
|
||||
void h_llktable_free(HLLkTable *table)
|
||||
{
|
||||
HAllocator *mm__ = table->mm__;
|
||||
h_delete_arena(table->arena);
|
||||
|
|
@ -95,10 +96,10 @@ int fill_table_row(HCFGrammar *g, HHashTable *row,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Generate the LL parse table from the given grammar.
|
||||
/* Generate the LL(k) parse table from the given grammar.
|
||||
* Returns -1 on error, 0 on success.
|
||||
*/
|
||||
static int fill_table(HCFGrammar *g, HLLTable *table)
|
||||
static int fill_table(HCFGrammar *g, HLLkTable *table)
|
||||
{
|
||||
table->start = g->start;
|
||||
|
||||
|
|
@ -120,7 +121,7 @@ static int fill_table(HCFGrammar *g, HLLTable *table)
|
|||
for(s = a->seq; *s; s++) {
|
||||
// record this production in row as appropriate
|
||||
// this can signal an ambiguity conflict.
|
||||
// NB we don't worry about deallocating anything, h_ll_compile will
|
||||
// NB we don't worry about deallocating anything, h_llk_compile will
|
||||
// delete the whole arena for us.
|
||||
if(fill_table_row(g, row, a, *s) < 0)
|
||||
return -1;
|
||||
|
|
@ -131,7 +132,7 @@ static int fill_table(HCFGrammar *g, HLLTable *table)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int h_ll_compile(HAllocator* mm__, HParser* parser, const void* params)
|
||||
int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params)
|
||||
{
|
||||
// Convert parser to a CFG. This can fail as indicated by a NULL return.
|
||||
HCFGrammar *grammar = h_cfgrammar(mm__, parser);
|
||||
|
|
@ -143,11 +144,11 @@ int h_ll_compile(HAllocator* mm__, HParser* parser, const void* params)
|
|||
// TODO: avoid conflicts by splitting occurrences?
|
||||
|
||||
// generate table and store in parser->data.
|
||||
HLLTable *table = h_lltable_new(mm__);
|
||||
HLLkTable *table = h_llktable_new(mm__);
|
||||
if(fill_table(grammar, table) < 0) {
|
||||
// the table was ambiguous
|
||||
h_cfgrammar_free(grammar);
|
||||
h_lltable_free(table);
|
||||
h_llktable_free(table);
|
||||
return -1;
|
||||
}
|
||||
parser->data = table;
|
||||
|
|
@ -161,13 +162,14 @@ int h_ll_compile(HAllocator* mm__, HParser* parser, const void* params)
|
|||
|
||||
|
||||
|
||||
/* LL driver */
|
||||
/* LL(k) driver */
|
||||
|
||||
HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* state)
|
||||
HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream)
|
||||
{
|
||||
const HLLTable *table = parser->data;
|
||||
HArena *arena = state->arena;
|
||||
HSlist *stack = h_slist_new(arena);
|
||||
const HLLkTable *table = parser->data;
|
||||
HArena *arena = h_new_arena(mm__, 0); // will hold the results
|
||||
HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse
|
||||
HSlist *stack = h_slist_new(tarena);
|
||||
HCountedArray *seq = h_carray_new(arena); // accumulates current parse result
|
||||
|
||||
// in order to construct the parse tree, we delimit the symbol stack into
|
||||
|
|
@ -177,7 +179,7 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s
|
|||
// frame delimiter.
|
||||
// also on the stack below the mark, we store the previously accumulated
|
||||
// value for the surrounding production.
|
||||
void *mark = h_arena_malloc(arena, 1);
|
||||
void *mark = h_arena_malloc(tarena, 1);
|
||||
|
||||
// initialize with the start symbol on the stack.
|
||||
h_slist_push(stack, table->start);
|
||||
|
|
@ -188,8 +190,8 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s
|
|||
while(!h_slist_empty(stack)) {
|
||||
// fill up lookahead buffer as required
|
||||
if(lookahead == 0) {
|
||||
uint8_t c = h_read_bits(&state->input_stream, 8, false);
|
||||
if(state->input_stream.overrun)
|
||||
uint8_t c = h_read_bits(stream, 8, false);
|
||||
if(stream->overrun)
|
||||
lookahead = end_token;
|
||||
else
|
||||
lookahead = char_token(c);
|
||||
|
|
@ -203,16 +205,16 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s
|
|||
// hit stack frame boundary
|
||||
|
||||
// wrap the accumulated parse result, this sequence is finished
|
||||
HParsedToken *tok = a_new(HParsedToken, 1);
|
||||
HParsedToken *tok = h_arena_malloc(arena, sizeof(HParsedToken));
|
||||
tok->token_type = TT_SEQUENCE;
|
||||
tok->seq = seq;
|
||||
// XXX tok->index and tok->bit_offset (don't take directly from stream, cuz peek!)
|
||||
|
||||
// call validation and semantic action, if present
|
||||
if(x->pred && !x->pred(make_result(state, tok)))
|
||||
return NULL; // validation failed -> no parse
|
||||
if(x->pred && !x->pred(make_result(tarena, tok)))
|
||||
goto no_parse; // validation failed -> no parse
|
||||
if(x->action)
|
||||
tok = (HParsedToken *)x->action(make_result(state, tok));
|
||||
tok = (HParsedToken *)x->action(make_result(arena, tok));
|
||||
|
||||
// result becomes next left-most element of higher-level sequence
|
||||
seq = h_slist_pop(stack);
|
||||
|
|
@ -230,7 +232,7 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s
|
|||
seq = h_carray_new(arena);
|
||||
|
||||
// look up applicable production in parse table
|
||||
const HCFSequence *p = h_ll_lookup(table, x, lookahead);
|
||||
const HCFSequence *p = h_llk_lookup(table, x, lookahead);
|
||||
|
||||
// push production's rhs onto the stack (in reverse order)
|
||||
HCFChoice **s;
|
||||
|
|
@ -250,40 +252,40 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s
|
|||
switch(x->type) {
|
||||
case HCF_END:
|
||||
if(input != end_token)
|
||||
return NULL;
|
||||
goto no_parse;
|
||||
tok = NULL;
|
||||
break;
|
||||
|
||||
case HCF_CHAR:
|
||||
if(input != char_token(x->chr))
|
||||
return NULL;
|
||||
tok = a_new(HParsedToken, 1);
|
||||
goto no_parse;
|
||||
tok = h_arena_malloc(arena, sizeof(HParsedToken));
|
||||
tok->token_type = TT_UINT;
|
||||
tok->uint = x->chr;
|
||||
break;
|
||||
|
||||
case HCF_CHARSET:
|
||||
if(input == end_token)
|
||||
return NULL;
|
||||
goto no_parse;
|
||||
if(!charset_isset(x->charset, token_char(input)))
|
||||
return NULL;
|
||||
tok = a_new(HParsedToken, 1);
|
||||
goto no_parse;
|
||||
tok = h_arena_malloc(arena, sizeof(HParsedToken));
|
||||
tok->token_type = TT_UINT;
|
||||
tok->uint = token_char(input);
|
||||
break;
|
||||
|
||||
default: // should not be reached
|
||||
assert_message(0, "unknown HCFChoice type");
|
||||
return NULL;
|
||||
goto no_parse;
|
||||
}
|
||||
|
||||
// XXX tok->index and tok->bit_offset (don't take directly from stream, cuz peek!)
|
||||
|
||||
// call validation and semantic action, if present
|
||||
if(x->pred && !x->pred(make_result(state, tok)))
|
||||
return NULL; // validation failed -> no parse
|
||||
if(x->pred && !x->pred(make_result(tarena, tok)))
|
||||
goto no_parse; // validation failed -> no parse
|
||||
if(x->action)
|
||||
tok = (HParsedToken *)x->action(make_result(state, tok));
|
||||
tok = (HParsedToken *)x->action(make_result(arena, tok));
|
||||
|
||||
// append to result sequence
|
||||
h_carray_append(seq, tok);
|
||||
|
|
@ -293,25 +295,31 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s
|
|||
// since we started with a single nonterminal on the stack, seq should
|
||||
// contain exactly the parse result.
|
||||
assert(seq->used == 1);
|
||||
return make_result(state, seq->elements[0]);
|
||||
h_delete_arena(tarena);
|
||||
return make_result(arena, seq->elements[0]);
|
||||
|
||||
no_parse:
|
||||
h_delete_arena(tarena);
|
||||
h_delete_arena(arena);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
HParserBackendVTable h__ll_backend_vtable = {
|
||||
.compile = h_ll_compile,
|
||||
.parse = h_ll_parse
|
||||
HParserBackendVTable h__llk_backend_vtable = {
|
||||
.compile = h_llk_compile,
|
||||
.parse = h_llk_parse
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
// dummy!
|
||||
int test_ll(void)
|
||||
int test_llk(void)
|
||||
{
|
||||
const HParser *c = h_many(h_ch('x'));
|
||||
const HParser *q = h_sequence(c, h_ch('y'), NULL);
|
||||
const HParser *p = h_choice(q, h_end_p(), NULL);
|
||||
HParser *c = h_many(h_ch('x'));
|
||||
HParser *q = h_sequence(c, h_ch('y'), NULL);
|
||||
HParser *p = h_choice(q, h_end_p(), NULL);
|
||||
|
||||
HCFGrammar *g = h_cfgrammar(&system_allocator, p);
|
||||
|
||||
|
|
@ -1,7 +1,16 @@
|
|||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include "../internal.h"
|
||||
#include "../parsers/parser_internal.h"
|
||||
|
||||
/* DJB-style multiplicative hash ("hash * 33 + byte", seeded with 5381) over
 * the `len` bytes starting at `buf`.
 *
 * Returns the seed (5381) unchanged for an empty buffer; `buf` is not
 * dereferenced when `len` is 0.  Arithmetic is unsigned 32-bit and wraps.
 */
static uint32_t djbhash(const uint8_t *buf, size_t len) {
  uint32_t hash = 5381;
  while (len--) {
    hash = hash * 33 + *buf++;
  }
  return hash;
}
|
||||
|
||||
// short-hand for constructing HCachedResult's
|
||||
static HCachedResult *cached_result(const HParseState *state, HParseResult *result) {
|
||||
HCachedResult *ret = a_new(HCachedResult, 1);
|
||||
|
|
@ -191,12 +200,37 @@ HParseResult* h_do_parse(const HParser* parser, HParseState *state) {
|
|||
}
|
||||
|
||||
int h_packrat_compile(HAllocator* mm__, HParser* parser, const void* params) {
|
||||
parser->backend = PB_PACKRAT;
|
||||
return 0; // No compilation necessary, and everything should work
|
||||
// out of the box.
|
||||
}
|
||||
|
||||
HParseResult *h_packrat_parse(HAllocator* mm__, const HParser* parser, HParseState* parse_state) {
|
||||
return h_do_parse(parser, parse_state);
|
||||
static uint32_t cache_key_hash(const void* key) {
|
||||
return djbhash(key, sizeof(HParserCacheKey));
|
||||
}
|
||||
static bool cache_key_equal(const void* key1, const void* key2) {
|
||||
return memcmp(key1, key2, sizeof(HParserCacheKey)) == 0;
|
||||
}
|
||||
|
||||
HParseResult *h_packrat_parse(HAllocator* mm__, const HParser* parser, HInputStream *input_stream) {
|
||||
HArena * arena = h_new_arena(mm__, 0);
|
||||
HParseState *parse_state = a_new_(arena, HParseState, 1);
|
||||
parse_state->cache = h_hashtable_new(arena, cache_key_equal, // key_equal_func
|
||||
cache_key_hash); // hash_func
|
||||
parse_state->input_stream = *input_stream;
|
||||
parse_state->lr_stack = h_slist_new(arena);
|
||||
parse_state->recursion_heads = h_hashtable_new(arena, cache_key_equal,
|
||||
cache_key_hash);
|
||||
parse_state->arena = arena;
|
||||
HParseResult *res = h_do_parse(parser, parse_state);
|
||||
h_slist_free(parse_state->lr_stack);
|
||||
h_hashtable_free(parse_state->recursion_heads);
|
||||
// tear down the parse state
|
||||
h_hashtable_free(parse_state->cache);
|
||||
if (!res)
|
||||
h_delete_arena(parse_state->arena);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
HParserBackendVTable h__packrat_backend_vtable = {
|
||||
|
|
|
|||
366
src/backends/regex.c
Normal file
366
src/backends/regex.c
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "../internal.h"
|
||||
#include "../parsers/parser_internal.h"
|
||||
#include "regex.h"
|
||||
|
||||
#undef a_new
|
||||
#define a_new(typ, count) a_new_(arena, typ, count)
|
||||
// Stack VM
|
||||
/* Opcodes of the stack VM.  These are the values stored in
 * HRVMTrace.opcode and dispatched on when a trace is replayed.
 */
typedef enum HSVMOp_ {
  SVM_PUSH,    // Push a mark. There is no VM insn to push an object.
  SVM_NOP,     // Used to start the chain, and possibly elsewhere. Does nothing.
  SVM_ACTION,  // Same meaning as RVM_ACTION
  SVM_CAPTURE, // Same meaning as RVM_CAPTURE
  SVM_ACCEPT,  // End of trace: the value left on the stack is the result.
} HSVMOp;
|
||||
|
||||
/* One recorded stack-VM operation.  A thread's trace is a singly linked list
 * of these.
 */
typedef struct HRVMTrace_ {
  struct HRVMTrace_ *next; // When parsing, these are
                           // reverse-threaded. There is a postproc
                           // step that inverts all the pointers.
  size_t input_pos;        // input offset at which the op was recorded
  uint16_t arg;            // operand; for SVM_ACTION, an index into the
                           // program's action table
  uint8_t opcode;          // an HSVMOp value
} HRVMTrace;

/* A runnable regex-VM thread: the trace accumulated so far plus the address
 * of the next instruction to execute.
 */
typedef struct HRVMThread_ {
  HRVMTrace *trace; // most recently recorded trace node (list head)
  uint16_t ip;      // instruction address within the program
} HRVMThread;
|
||||
|
||||
HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len);
|
||||
|
||||
/* Reverse a trace list in place and return its new head.
 *
 * Traces are built newest-first while matching; replaying them needs
 * oldest-first order.  Every node, including the final one, is relinked, so
 * the returned list contains all the nodes of the input.  A NULL or
 * single-element list is returned unchanged.
 */
HRVMTrace *invert_trace(HRVMTrace *trace) {
  HRVMTrace *reversed = NULL;
  while (trace) {
    HRVMTrace *next = trace->next;
    trace->next = reversed;
    reversed = trace;
    trace = next;
  }
  return reversed;
}
|
||||
|
||||
/* Run the regex VM program `prog` over input[0..len).
 *
 * All threads are advanced in lock step, one input offset at a time.  While
 * running, each thread records the stack-VM operations it passes (PUSH,
 * ACTION, CAPTURE, ACCEPT) in a reverse-linked trace.  When a thread reaches
 * RVM_ACCEPT its trace is inverted and replayed by run_trace() to build the
 * result.
 *
 * Returns the HParseResult* produced by run_trace() (as void*), or NULL if
 * no thread reaches RVM_ACCEPT.  The arena holding traces and bookkeeping is
 * deleted before returning.
 */
void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) {
  HArena *arena = h_new_arena(mm__, 0);

  if (prog->length == 0) {
    // An empty program has no ACCEPT to reach (and no slot for the
    // initial thread).
    h_delete_arena(arena);
    return NULL;
  }

  // Both arrays are indexed by instruction address.  heads_p holds the
  // threads parked at the previous offset (to be resumed now); heads_n
  // collects the threads parked for the next offset.
  HRVMTrace **heads_p = a_new(HRVMTrace*, prog->length),
    **heads_n = a_new(HRVMTrace*, prog->length);

  HRVMTrace *ret_trace = NULL;

  uint8_t *insn_seen = a_new(uint8_t, prog->length); // 0 -> not seen, 1->processed, 2->queued
  HRVMThread *ip_queue = a_new(HRVMThread, prog->length);
  size_t ipq_top;

#define THREAD ip_queue[ipq_top-1]
#define PUSH_SVM(op_, arg_) do {                \
    HRVMTrace *nt = a_new(HRVMTrace, 1);        \
    nt->arg = (arg_);                           \
    nt->opcode = (op_);                         \
    nt->next = THREAD.trace;                    \
    nt->input_pos = off;                        \
    THREAD.trace = nt;                          \
  } while(0)

  // Initial thread: parked at instruction 0 with a trace that consists of
  // a single NOP.  Initialise explicitly rather than relying on the
  // allocator handing back zeroed memory.
  memset(heads_p, 0, prog->length * sizeof(*heads_p));
  memset(heads_n, 0, prog->length * sizeof(*heads_n));
  HRVMTrace *start = a_new(HRVMTrace, 1);
  start->next = NULL;
  start->input_pos = 0;
  start->arg = 0;
  start->opcode = SVM_NOP;
  heads_n[0] = start;

  size_t off = 0;
  int live_threads = 1;
  for (off = 0; off <= len; off++) {
    // At end of input the "current byte" reads as 0.
    uint8_t ch = ((off == len) ? 0 : input[off]);
    size_t ip_s;
    /* scope */ {
      // What was parked for "next" is now "previous"; start a fresh,
      // empty "next" set.
      HRVMTrace **heads_t;
      heads_t = heads_n;
      heads_n = heads_p;
      heads_p = heads_t;
      memset(heads_n, 0, prog->length * sizeof(*heads_n));
    }
    memset(insn_seen, 0, prog->length); // no insns seen yet
    if (!live_threads)
      goto match_fail;
    live_threads = 0;
    for (ip_s = 0; ip_s < prog->length; ip_s++) {
      // TODO: Write this as a threaded VM
      if (!heads_p[ip_s])
        continue;
      // Resume the parked thread: both its address and the trace it had
      // accumulated when it was parked.
      ipq_top = 1;
      THREAD.ip = ip_s;
      THREAD.trace = heads_p[ip_s];

      uint8_t hi, lo;
      uint16_t arg;
      while(ipq_top > 0) {
        if (THREAD.ip >= prog->length || insn_seen[THREAD.ip] == 1) {
          // Ran off the end of the program, or another thread already
          // executed this instruction at this offset: kill the thread.
          // (Popping is required; otherwise this loop never ends.)
          ipq_top--;
          continue;
        }
        insn_seen[THREAD.ip] = 1;
        arg = prog->insns[THREAD.ip].arg;
        switch(prog->insns[THREAD.ip].op) {
        case RVM_ACCEPT:
          PUSH_SVM(SVM_ACCEPT, 0);
          ret_trace = THREAD.trace;
          goto run_trace;
        case RVM_MATCH:
          // Doesn't actually validate the "must be followed by MATCH
          // or STEP. It should. Preproc perhaps?
          hi = (arg >> 8) & 0xff;
          lo = arg & 0xff;
          THREAD.ip++;
          if (ch < lo || ch > hi)
            ipq_top--; // terminate thread
          goto next_insn;
        case RVM_GOTO:
          THREAD.ip = arg;
          goto next_insn;
        case RVM_FORK:
          THREAD.ip++;
          if (!insn_seen[arg]) {
            // Mark the fall-through instruction as queued, then run the
            // fork target first on a new stack entry sharing the trace.
            insn_seen[THREAD.ip] = 2;
            HRVMTrace* tr = THREAD.trace;
            ipq_top++;
            THREAD.ip = arg;
            THREAD.trace = tr;
          }
          goto next_insn;
        case RVM_PUSH:
          PUSH_SVM(SVM_PUSH, 0);
          THREAD.ip++;
          goto next_insn;
        case RVM_ACTION:
          PUSH_SVM(SVM_ACTION, arg);
          THREAD.ip++;
          goto next_insn;
        case RVM_CAPTURE:
          PUSH_SVM(SVM_CAPTURE, 0);
          THREAD.ip++;
          goto next_insn;
        case RVM_EOF:
          THREAD.ip++;
          if (off != len)
            ipq_top--; // Terminate thread
          goto next_insn;
        case RVM_STEP:
          // Save the thread for the next input offset.  It must be parked
          // on the instruction *after* STEP; parking it on STEP itself
          // would make it execute STEP again forever.
          THREAD.ip++;
          if (THREAD.ip < prog->length) {
            live_threads++;
            heads_n[THREAD.ip] = THREAD.trace;
          }
          ipq_top--;
          goto next_insn;
        default:
          // Unknown opcode: drop the thread.
          ipq_top--;
          goto next_insn;
        }
      next_insn:
        ;

      }
    }
  }
  // No accept was reached.
 match_fail:
  h_delete_arena(arena);
  return NULL;

 run_trace:
  // Invert the direction of the trace linked list.
  ret_trace = invert_trace(ret_trace);
  HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len);
  // ret is in its own arena
  h_delete_arena(arena);
  return ret;
}
#undef PUSH_SVM
#undef THREAD
|
||||
|
||||
|
||||
|
||||
|
||||
/* Make sure the SVM stack can take `addl` more items.
 *
 * On return ctx->stack_capacity is strictly greater than
 * ctx->stack_count + addl.  The capacity is doubled as many times as needed,
 * so a large `addl` is honoured too.  The stack is reallocated through `mm__`.
 * Allocation failure trips an assert; the old stack pointer is only replaced
 * once the new one is known to be valid.
 */
void svm_stack_ensure_cap(HAllocator *mm__, HSVMContext *ctx, size_t addl) {
  size_t needed = ctx->stack_count + addl;
  if (needed < ctx->stack_capacity)
    return;

  size_t new_capacity = ctx->stack_capacity ? ctx->stack_capacity : 1;
  while (new_capacity <= needed)
    new_capacity *= 2;

  HParsedToken **new_stack = mm__->realloc(mm__, ctx->stack, sizeof(*ctx->stack) * new_capacity);
  assert(new_stack != NULL);
  ctx->stack = new_stack;
  ctx->stack_capacity = new_capacity;
}
|
||||
|
||||
/* Replay an (already inverted, oldest-first) trace on the stack VM and build
 * the parse result.
 *
 * orig_prog  supplies the action table only.
 * trace      list of operations to replay, ending in SVM_ACCEPT.
 * input      the input buffer that captures point into (not copied).
 *
 * Returns a result allocated in a fresh arena (res->arena), or NULL if an
 * action reports failure or the trace ends without SVM_ACCEPT; in those
 * cases the arena is deleted.  The temporary value stack is released on
 * every path.
 */
HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len) {
  // orig_prog is only used for the action table
  HSVMContext ctx;
  HArena *arena = h_new_arena(mm__, 0);
  ctx.stack_count = 0;
  ctx.stack_capacity = 16;
  ctx.stack = h_new(HParsedToken*, ctx.stack_capacity);

  HParsedToken *tmp_res;
  HRVMTrace *cur;
  for (cur = trace; cur; cur = cur->next) {
    switch (cur->opcode) {
    case SVM_PUSH:
      svm_stack_ensure_cap(mm__, &ctx, 1);
      tmp_res = a_new(HParsedToken, 1);
      tmp_res->token_type = TT_MARK;
      tmp_res->index = cur->input_pos;
      tmp_res->bit_offset = 0;
      ctx.stack[ctx.stack_count++] = tmp_res;
      break;
    case SVM_NOP:
      break;
    case SVM_ACTION:
      // Action should modify stack appropriately
      assert(cur->arg < orig_prog->action_count);
      if (!orig_prog->actions[cur->arg].action(arena, &ctx, orig_prog->actions[cur->arg].env)) {
        // action failed: abandon this parse
        goto fail;
      }
      break;
    case SVM_CAPTURE:
      // Top of stack must be a mark
      // This replaces said mark in-place with a TT_BYTES.
      // The top of the stack is stack[stack_count-1]; stack[stack_count]
      // is the first unused slot.
      assert(ctx.stack_count > 0);
      tmp_res = ctx.stack[ctx.stack_count - 1];
      assert(tmp_res->token_type == TT_MARK);

      tmp_res->token_type = TT_BYTES;
      // TODO: Will need to copy if bit_offset is nonzero
      assert(tmp_res->bit_offset == 0);

      tmp_res->bytes.token = input + tmp_res->index;
      // Captures run from the mark up to, but not including, the current
      // position.
      tmp_res->bytes.len = cur->input_pos - tmp_res->index;
      break;
    case SVM_ACCEPT: {
      assert(ctx.stack_count == 1);
      HParseResult *res = a_new(HParseResult, 1);
      res->ast = ctx.stack[0];
      res->bit_length = cur->input_pos * 8;
      res->arena = arena;
      // The tokens live in the arena; only the pointer array is ours.
      h_free(ctx.stack);
      return res;
    }
    }
  }

 fail:
  h_free(ctx.stack);
  h_delete_arena(arena);
  return NULL;
}
|
||||
|
||||
/* Return the action-table index for the pair (action_func, env), adding a
 * new entry to prog->actions if no identical pair is registered yet.
 *
 * The table grows geometrically: it is reallocated whenever
 * action_count + 1 is a power of two (counts 0, 1, 3, 7, ...), to twice that
 * size.  Indices are 16-bit, so the table must not outgrow that range.
 * Allocation failure trips an assert without losing the existing table.
 */
uint16_t h_rvm_create_action(HRVMProg *prog, HSVMActionFunc action_func, void* env) {
  for (size_t i = 0; i < prog->action_count; i++) {
    if (prog->actions[i].action == action_func && prog->actions[i].env == env)
      return (uint16_t)i;
  }
  // The new index has to be representable in the 16-bit return value.
  assert(prog->action_count <= 0xFFFF);
  // Ensure that there's room in the action array...
  if (!(prog->action_count & (prog->action_count + 1))) {
    // needs to be scaled up.
    size_t array_size = (prog->action_count + 1) * 2; // action_count+1 is a
                                                      // power of two
    HSVMAction *grown = prog->allocator->realloc(prog->allocator, prog->actions, array_size * sizeof(*prog->actions));
    assert(grown != NULL);
    prog->actions = grown;
  }

  HSVMAction *action = &prog->actions[prog->action_count];
  action->action = action_func;
  action->env = env;
  return (uint16_t)prog->action_count++;
}
|
||||
|
||||
/* Append the instruction (op, arg) to prog->insns and return its address.
 *
 * The instruction array grows geometrically: it is reallocated whenever
 * length + 1 is a power of two (lengths 0, 1, 3, 7, ...), to twice that size.
 * Addresses are 16-bit, so the program must not outgrow that range.
 * Allocation failure trips an assert without losing the existing program.
 */
uint16_t h_rvm_insert_insn(HRVMProg *prog, HRVMOp op, uint16_t arg) {
  // The new address has to be representable in the 16-bit return value.
  assert(prog->length <= 0xFFFF);
  // Ensure that there's room in the insn array...
  if (!(prog->length & (prog->length + 1))) {
    // needs to be scaled up.
    size_t array_size = (prog->length + 1) * 2; // length+1 is a
                                                // power of two
    HRVMInsn *grown = prog->allocator->realloc(prog->allocator, prog->insns, array_size * sizeof(*prog->insns));
    assert(grown != NULL);
    prog->insns = grown;
  }

  prog->insns[prog->length].op = op;
  prog->insns[prog->length].arg = arg;
  return (uint16_t)prog->length++;
}
|
||||
|
||||
/* Return the address the next inserted instruction will get, i.e. the
 * current program length.
 */
uint16_t h_rvm_get_ip(HRVMProg *prog) {
  return prog->length;
}
|
||||
|
||||
/* Overwrite the argument of the already-emitted instruction at address `ip`
 * with `new_val`.  `ip` must refer to an existing instruction (asserted).
 */
void h_rvm_patch_arg(HRVMProg *prog, uint16_t ip, uint16_t new_val) {
  assert(prog->length > ip);
  prog->insns[ip].arg = new_val;
}
|
||||
|
||||
/* Count the items sitting above the topmost TT_MARK on the SVM stack.
 *
 * Returns 0 when the mark is the top item, and ctx->stack_count when the
 * stack holds no mark at all (including the empty stack).  Every slot down
 * to and including the bottom one is inspected.
 */
size_t h_svm_count_to_mark(HSVMContext *ctx) {
  for (size_t ctm = 0; ctm < ctx->stack_count; ctm++) {
    if (ctx->stack[ctx->stack_count - 1 - ctm]->token_type == TT_MARK)
      return ctm;
  }
  return ctx->stack_count;
}
|
||||
|
||||
// TODO: Implement the primitive actions
|
||||
/* SVM action: collapse everything above the topmost mark into one sequence.
 *
 * The mark token is converted in place into a TT_SEQUENCE whose elements are
 * the items that were above it, in stack order (bottom first).  Those items
 * are then popped, leaving the sequence token on top.  A mark must be
 * present on the stack (asserted).  `env` is unused.  Always returns true.
 */
bool h_svm_action_make_sequence(HArena *arena, HSVMContext *ctx, void* env) {
  size_t n_items = h_svm_count_to_mark(ctx);
  assert (n_items < ctx->stack_count);
  HParsedToken *res = ctx->stack[ctx->stack_count - 1 - n_items];
  assert (res->token_type == TT_MARK);
  res->token_type = TT_SEQUENCE;

  HCountedArray *ret_carray = h_carray_new_sized(arena, n_items);
  res->seq = ret_carray;
  // res index and bit offset are the same as the mark.
  for (size_t i = 0; i < n_items; i++) {
    ret_carray->elements[i] = ctx->stack[ctx->stack_count - n_items + i];
  }
  // The elements were stored directly, so record how many there are.
  ret_carray->used = n_items;
  ctx->stack_count -= n_items;
  return true;
}
|
||||
|
||||
/* SVM action: pop items off the stack up to and including the topmost mark.
 *
 * Returns true once a mark has been popped.  Returns false if the stack
 * empties without a mark being found; in that case the stack is left empty.
 * `arena` and `env` are unused.
 */
bool h_svm_action_clear_to_mark(HArena *arena, HSVMContext *ctx, void* env) {
  while (ctx->stack_count > 0) {
    if (ctx->stack[--ctx->stack_count]->token_type == TT_MARK)
      return true;
  }
  return false; // no mark found.
}
|
||||
|
||||
// Glue regex backend to rest of system
|
||||
|
||||
/* Emit regex-VM code for `parser` into `prog` by delegating to the parser's
 * compile_to_rvm vtable method.  Returns that method's result.
 */
bool h_compile_regex(HRVMProg *prog, const HParser *parser) {
  return parser->vtable->compile_to_rvm(prog, parser->env);
}
|
||||
|
||||
/* Backend "free" hook: release the compiled regex program attached to
 * `parser` (instruction table, action table and the program itself, all via
 * the allocator recorded in the program), then clear backend_data and reset
 * the parser's backend to PB_PACKRAT.  A parser with no program attached is
 * only reset.
 */
static void h_regex_free(HParser *parser) {
  HRVMProg *prog = (HRVMProg*)parser->backend_data;
  if (prog) {
    HAllocator *mm__ = prog->allocator;
    h_free(prog->insns);
    h_free(prog->actions);
    h_free(prog);
  }
  parser->backend_data = NULL;
  parser->backend = PB_PACKRAT;
}
|
||||
|
||||
/* Backend "compile" hook: compile `parser` to a regex-VM program and store
 * it in parser->backend_data.  `params` is unused.
 *
 * Returns 0 on success,
 *         1 if the parser's isValidRegular method rejects it,
 *         2 if code generation fails (the partial program is freed).
 */
static int h_regex_compile(HAllocator *mm__, HParser* parser, const void* params) {
  if (!parser->vtable->isValidRegular(parser->env))
    return 1;
  HRVMProg *prog = h_new(HRVMProg, 1);
  assert(prog != NULL);
  // Start from an empty program.  The insert/create helpers read length and
  // action_count and realloc insns/actions, and the failure path below
  // frees both tables, so none of these may be left uninitialised.
  prog->allocator = mm__;
  prog->length = 0;
  prog->action_count = 0;
  prog->insns = NULL;
  prog->actions = NULL;
  if (!h_compile_regex(prog, parser)) {
    h_free(prog->insns);
    h_free(prog->actions);
    h_free(prog);
    return 2;
  }
  parser->backend_data = prog;
  return 0;
}
|
||||
|
||||
/* Backend "parse" hook: run the compiled program stored in
 * parser->backend_data over the stream's buffer (input_stream->input,
 * input_stream->length) and return the VM's result.
 */
static HParseResult *h_regex_parse(HAllocator* mm__, const HParser* parser, HInputStream *input_stream) {
  return h_rvm_run__m(mm__, (HRVMProg*)parser->backend_data, input_stream->input, input_stream->length);
}
|
||||
|
||||
HParserBackendVTable h__regex_backend_vtable = {
|
||||
.compile = h_regex_compile,
|
||||
.parse = h_regex_parse,
|
||||
.free = h_regex_free
|
||||
};
|
||||
80
src/backends/regex.h
Normal file
80
src/backends/regex.h
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
// Internal defs
|
||||
#ifndef HAMMER_BACKEND_REGEX__H
|
||||
#define HAMMER_BACKEND_REGEX__H
|
||||
|
||||
// each insn is an 8-bit opcode and a 16-bit parameter
|
||||
// [a] are actions; they add an instruction to the stackvm that is being output.
|
||||
// [m] are match ops; they can either succeed or fail, depending on the current character
|
||||
// [c] are control ops. They affect the pc non-linearly.
|
||||
typedef enum HRVMOp_ {
  RVM_ACCEPT,  // [a]
  RVM_GOTO,    // [c] parameter is an offset into the instruction table
  RVM_FORK,    // [c] parameter is an offset into the instruction table
  RVM_PUSH,    // [a] No arguments, just pushes a mark (pointer to some
               //     character in the input string) onto the stack
  RVM_ACTION,  // [a] argument is an action ID
  RVM_CAPTURE, // [a] Capture the last string (up to the current
               //     position, non-inclusive), and push it on the
               //     stack. No arg.
  RVM_EOF,     // [m] Succeeds only if at EOF.
  RVM_MATCH,   // [m] The high byte of the parameter is an upper bound
               //     and the low byte is a lower bound, both
               //     inclusive. An inverted match should be handled
               //     as two ranges.
  RVM_STEP,    // [a] Step to the next byte of input
  RVM_OPCOUNT  // Number of opcodes above; not an instruction itself.
} HRVMOp;
|
||||
|
||||
/* One regex-VM instruction: an 8-bit opcode and a 16-bit parameter. */
typedef struct HRVMInsn_{
  uint8_t op;   // an HRVMOp value
  uint16_t arg; // parameter; meaning depends on op (see HRVMOp)
} HRVMInsn;
|
||||
|
||||
#define TT_MARK TT_RESERVED_1
|
||||
|
||||
typedef struct HSVMContext_ {
|
||||
HParsedToken **stack;
|
||||
size_t stack_count; // number of items on the stack. Thus stack[stack_count] is the first unused item on the stack.
|
||||
size_t stack_capacity;
|
||||
} HSVMContext;
|
||||
|
||||
// These actions all assume that the items on the stack are not
|
||||
// aliased anywhere.
|
||||
typedef bool (*HSVMActionFunc)(HArena *arena, HSVMContext *ctx, void* env);
|
||||
typedef struct HSVMAction_ {
|
||||
HSVMActionFunc action;
|
||||
void* env;
|
||||
} HSVMAction;
|
||||
|
||||
struct HRVMProg_ {
|
||||
HAllocator *allocator;
|
||||
size_t length;
|
||||
size_t action_count;
|
||||
HRVMInsn *insns;
|
||||
HSVMAction *actions;
|
||||
};
|
||||
|
||||
// Returns true IFF the provided parser could be compiled.
|
||||
bool h_compile_regex(HRVMProg *prog, const HParser* parser);
|
||||
|
||||
// These functions are used by the compile_to_rvm method of HParser
|
||||
uint16_t h_rvm_create_action(HRVMProg *prog, HSVMActionFunc action_func, void* env);
|
||||
|
||||
// returns the address of the instruction just created
|
||||
uint16_t h_rvm_insert_insn(HRVMProg *prog, HRVMOp op, uint16_t arg);
|
||||
|
||||
// returns the address of the next insn to be created.
|
||||
uint16_t h_rvm_get_ip(HRVMProg *prog);
|
||||
|
||||
// Used to insert forward references; the idea is to generate a JUMP
|
||||
// or FORK instruction with a target of 0, then update it once the
|
||||
// correct target is known.
|
||||
void h_rvm_patch_arg(HRVMProg *prog, uint16_t ip, uint16_t new_val);
|
||||
|
||||
// Common SVM action funcs...
|
||||
bool h_svm_action_make_sequence(HArena *arena, HSVMContext *ctx, void* env);
|
||||
bool h_svm_action_clear_to_mark(HArena *arena, HSVMContext *ctx, void* env);
|
||||
|
||||
extern HParserBackendVTable h__regex_backend_vtable;
|
||||
|
||||
#endif
|
||||
112
src/backends/regexvm_asm.pl
Normal file
112
src/backends/regexvm_asm.pl
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
# The input file consists of a sequence of blocks, which can be parsed
|
||||
# as SVM test cases, RVM test cases, or C functions. Each block starts
|
||||
# with a header line, then a sequence of options, and finally text in
|
||||
# a format defined by the block type.
|
||||
#
|
||||
# Header lines start with "+TYPE", optionally followed by a name. This
|
||||
# name is semantically meaningful for SVM and RVM blocks; it
|
||||
# determines the name of the test case.
|
||||
|
||||
# A C block's name is not used, and it takes no options. The body
|
||||
# (which continues until the first line that looks like a header), is
|
||||
# just passed straight through into the C source.
|
||||
|
||||
# SVM blocks' names are the GLib test case name. The underlying
|
||||
# function's name is derived by substituting invalid characters with
|
||||
# '_'. Note that this can result in collisions (eg, /foo_bar/baz
|
||||
# collides with /foo/bar_baz). If this happens, it's your own damn
|
||||
# fault; rename the blocks. SVM blocks take three different options:
|
||||
# @input, @output, and @pre. The @input pragma's argument is a
|
||||
# C-quoted string that gets passed into the VM as the input string,
|
||||
# and @output is a C-quoted string that is compared against
|
||||
# h_write_result_unamb. @pre lines are prepended verbatim to the
|
||||
# function body (with the @pre stripped, of course); they can be used
|
||||
# to initialize environment values.
|
||||
#
|
||||
# SVM instructions consist of either two or four fields:
|
||||
#
|
||||
# input_pos opcode [arg env]
|
||||
#
|
||||
# input_pos and opcode correspond to the fields in HRVMTrace. arg and
|
||||
# env are used to populate an HSVMAction; arg is the function, and env
|
||||
# is the object whose address should be used as the env.
|
||||
|
||||
# RVM blocks are very similar to SVM blocks; the name and options are
|
||||
# handled exactly the same way. The assembly text is handled slightly
|
||||
# differently; the format is:
|
||||
#
|
||||
# [label:] opcode [arg ...]
|
||||
#
|
||||
# For FORK and GOTO, the arg should be a label that is defined
|
||||
# elsewhere.
|
||||
#
|
||||
# For ACTION, the arguments are handled the same way as with SVM.
|
||||
#
|
||||
# MATCH takes two arguments, each of which can be any C integer
|
||||
# constant (not including character constants), which form the lower
|
||||
# and upper bounds of the matched character, respectively.
|
||||
#
|
||||
# No other RVM instructions take an argument.
|
||||
|
||||
# At the beginning of any line, comments preceded by '#' are allowed;
|
||||
# they are replaced by C++ comments and inserted in the nearest valid
|
||||
# location in the output.
|
||||
|
||||
# Type of the block currently being read; "TOP" until the first header
# line has been seen.
my $mode = "TOP";

# common regexes:
# A C identifier.
my $re_ident = qr/[A-Za-z_][A-Za-z0-9_]*/;
# A double-quoted C string literal with simple, \xHH and \OOO escapes.
my $re_cstr = qr/"(?:[^\\"]|\\["'abefnrtv0\\]|\\x[0-9a-fA-F]{2}|\\[0-7]{3})*"/;
|
||||
|
||||
|
||||
# Handlers for SVM blocks.  Each handler receives the per-block
# environment hash ref as its first argument.
my %svm = (
    # Record the block's name.
    name => sub {
        my ($env, $name) = @_;
        $env->{name} = $name;
    },
    # Handle one "@name value" option line.
    pragma => sub {
        my ($env, $name, $val) = @_;
        if ($name eq "input") {
            chomp($env->{input} = $val);
        } elsif ($name eq "output") {
            chomp($env->{output} = $val);
        } elsif ($name eq "pre") {
            # @{...} is required: "@$env->{pre}" would dereference $env
            # itself as an array instead of the list stored under {pre}.
            push(@{$env->{pre}}, $val);
        } else {
            warn "Invalid SVM pragma";
        }
    },
    # Handle one instruction line: input_pos opcode [arg env]
    body => sub {
        my ($env, $line) = @_;
        if (my ($ipos, $op, $arg, $argenv) =
                $line =~ /^\s*(\d+)\s+(PUSH|NOP|ACTION|CAPTURE|ACCEPT)(?:\s+($re_ident)\s+($re_ident))?/) {
            if ($op eq "PUSH") {
                # TODO: implement all the opcodes
            }
        }
    }
    );
|
||||
|
||||
|
||||
# Main loop: read the input line by line, tracking which kind of block
# we are in.
while (<>) {
    # A header line starts with a literal '+' followed by the block type.
    # The '+' must be escaped; unescaped it is a quantifier.
    if (/^\+(C|RVM|SVM)/) {
        $mode = $1;
    }

    if ($mode eq "TOP") {
        if (/^#(.*)/) {
            # The capture excludes the line's newline, so put one back;
            # otherwise whatever is printed next lands inside the comment.
            print "// $1\n";
            next;
        }
    } elsif ($mode eq "SVM") {
    } elsif ($mode eq "RVM") {
    } elsif ($mode eq "C") {
    }

}
|
||||
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue