Regex VM finished but untested.
This commit is contained in:
parent
a8ab63e096
commit
c19d7bb66e
8 changed files with 283 additions and 32 deletions
32
docs/milestone2.dot
Normal file
32
docs/milestone2.dot
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
digraph {
|
||||
graph [rankdir=LR];
|
||||
subgraph complete {
|
||||
node [color="gray",fontcolor="gray"];
|
||||
regex_gen;
|
||||
glue;
|
||||
}
|
||||
/* The end result of the milestone, along with the subtasks listed */
|
||||
milestone2 [color="green",style="filled"];
|
||||
llk -> milestone2;
|
||||
lr -> milestone2;
|
||||
lalr8_gen -> lr;
|
||||
glr_gen -> lr;
|
||||
lr_driver -> lr;
|
||||
regex -> milestone2;
|
||||
glue -> milestone2; // Meredith knows what glue referred to here.
|
||||
tests -> milestone2;
|
||||
|
||||
regex_gen -> regex;
|
||||
regex_driver -> regex;
|
||||
llk_driver -> llk;
|
||||
llk_gen -> llk;
|
||||
|
||||
|
||||
/*
|
||||
*
|
||||
*/
|
||||
desugaring -> llk_gen;
|
||||
desugaring -> lalr8_gen;
|
||||
desugaring -> glr_gen;
|
||||
|
||||
}
|
||||
|
|
@ -26,7 +26,8 @@ PARSERS := \
|
|||
indirect
|
||||
|
||||
BACKENDS := \
|
||||
packrat
|
||||
packrat \
|
||||
regex
|
||||
|
||||
HAMMER_PARTS := \
|
||||
bitreader.o \
|
||||
|
|
|
|||
|
|
@ -1,8 +1,11 @@
|
|||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "../internal.h"
|
||||
#include "../parsers/parser_internal.h"
|
||||
#include "regex.h"
|
||||
|
||||
#undef a_new
|
||||
#define a_new(typ, count) a_new_(arena, typ, count);
|
||||
#define a_new(typ, count) a_new_(arena, typ, count)
|
||||
// Stack VM
|
||||
typedef enum HSVMOp_ {
|
||||
SVM_PUSH, // Push a mark. There is no VM insn to push an object.
|
||||
|
|
@ -16,6 +19,7 @@ typedef struct HRVMTrace_ {
|
|||
struct HRVMTrace_ *next; // When parsing, these are
|
||||
// reverse-threaded. There is a postproc
|
||||
// step that inverts all the pointers.
|
||||
size_t input_pos;
|
||||
uint16_t arg;
|
||||
uint8_t opcode;
|
||||
} HRVMTrace;
|
||||
|
|
@ -25,13 +29,27 @@ typedef struct HRVMThread_ {
|
|||
uint16_t ip;
|
||||
} HRVMThread;
|
||||
|
||||
// TODO(thequux): This function could really use a refactoring, at the
|
||||
// very least, to split the two VMs.
|
||||
void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t len) {
|
||||
HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len);
|
||||
|
||||
// Reverses a singly linked trace list in place and returns its new head.
//
// Trace nodes are threaded newest-first while the VM runs (each new node's
// `next` points at the previous one); this flips every `next` pointer so the
// list can be walked in execution order.
//
// Returns NULL when `trace` is NULL. A single-node list is returned unchanged.
HRVMTrace *invert_trace(HRVMTrace *trace) {
  HRVMTrace *last = NULL;

  // Walk until the list is exhausted (not merely until the final node is
  // reached), so the final node is relinked to its predecessor as well.
  while (trace) {
    HRVMTrace *next = trace->next;
    trace->next = last;
    last = trace;
    trace = next;
  }

  // `last` is the former tail, which is now the head of the reversed list.
  return last;
}
|
||||
|
||||
void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) {
|
||||
HArena *arena = h_new_arena(mm__, 0);
|
||||
HRVMTrace **heads_p = a_new(HRVMTrace*, prog->length),
|
||||
**heads_n = a_new(HRVMTrace*, prog->length),
|
||||
**heads_t;
|
||||
**heads_n = a_new(HRVMTrace*, prog->length);
|
||||
|
||||
HRVMTrace *ret_trace;
|
||||
|
||||
|
|
@ -39,12 +57,16 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
|
|||
HRVMThread *ip_queue = a_new(HRVMThread, prog->length);
|
||||
size_t ipq_top;
|
||||
|
||||
|
||||
|
||||
|
||||
#define THREAD ip_queue[ipq_top-1]
|
||||
#define PUSH_SVM(op_, arg_) do { \
|
||||
HRVMTrace *nt = a_new(HRVMTrace, 1); \
|
||||
nt->arg = (arg_); \
|
||||
nt->opcode = (op_); \
|
||||
nt->next = THREAD.trace; \
|
||||
nt->input_pos = off; \
|
||||
THREAD.trace = nt; \
|
||||
} while(0)
|
||||
|
||||
|
|
@ -55,7 +77,8 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
|
|||
int live_threads = 1;
|
||||
for (off = 0; off <= len; off++) {
|
||||
uint8_t ch = ((off == len) ? 0 : input[off]);
|
||||
size_t ip_s, ip;
|
||||
size_t ip_s; // BUG: there was an unused variable ip. Not sure if
|
||||
// I intended to use it somewhere.
|
||||
/* scope */ {
|
||||
HRVMTrace **heads_t;
|
||||
heads_t = heads_n;
|
||||
|
|
@ -77,9 +100,9 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
|
|||
uint8_t hi, lo;
|
||||
uint16_t arg;
|
||||
while(ipq_top > 0) {
|
||||
if (insns_seen[THREAD.ip] == 1)
|
||||
if (insn_seen[THREAD.ip] == 1)
|
||||
continue;
|
||||
insns_seen[THREAD.ip] = 1;
|
||||
insn_seen[THREAD.ip] = 1;
|
||||
arg = prog->insns[THREAD.ip].arg;
|
||||
switch(prog->insns[THREAD.ip].op) {
|
||||
case RVM_ACCEPT:
|
||||
|
|
@ -100,8 +123,8 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
|
|||
goto next_insn;
|
||||
case RVM_FORK:
|
||||
THREAD.ip++;
|
||||
if (!insns_seen[arg]) {
|
||||
insns_seen[THREAD.ip] = 2;
|
||||
if (!insn_seen[arg]) {
|
||||
insn_seen[THREAD.ip] = 2;
|
||||
HRVMTrace* tr = THREAD.trace;
|
||||
ipq_top++;
|
||||
THREAD.ip = arg;
|
||||
|
|
@ -109,7 +132,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
|
|||
}
|
||||
goto next_insn;
|
||||
case RVM_PUSH:
|
||||
PUSH_SVM(SVM_PUSH, off);
|
||||
PUSH_SVM(SVM_PUSH, 0);
|
||||
THREAD.ip++;
|
||||
goto next_insn;
|
||||
case RVM_ACTION:
|
||||
|
|
@ -133,6 +156,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
|
|||
goto next_insn;
|
||||
}
|
||||
next_insn:
|
||||
;
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -147,27 +171,78 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
|
|||
|
||||
|
||||
ret_trace = invert_trace(ret_trace);
|
||||
HParseResult *ret = run_trace(mm__, ret_trace, input, length);
|
||||
HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len);
|
||||
// ret is in its own arena
|
||||
h_delete_arena(arena);
|
||||
return ret;
|
||||
}
|
||||
#undef PUSH_SVM
|
||||
#undef THREAD
|
||||
|
||||
HRVMTrace *invert_trace(HRVMTrace *trace) {
|
||||
HRVMTrace *next, *last = NULL;
|
||||
if (!trace)
|
||||
|
||||
|
||||
|
||||
// Grows ctx->stack, if needed, so that `addl` more entries can be pushed.
//
// mm__: allocator whose realloc callback is used to resize the stack.
// ctx:  stack VM context; `stack` and `stack_capacity` are updated on growth.
// addl: number of additional entries the caller is about to push.
//
// The capacity is doubled until `stack_count + addl` is strictly below it,
// which is the same (conservative) threshold the single-doubling version used.
void svm_stack_ensure_cap(HAllocator *mm__, HSVMContext *ctx, size_t addl) {
  size_t needed = ctx->stack_count + addl;
  size_t new_capacity = ctx->stack_capacity;
  HParsedToken **new_stack;

  if (needed < new_capacity)
    return;

  // Doubling zero would never make progress.
  if (new_capacity == 0)
    new_capacity = 1;
  // A single doubling is not enough when addl exceeds the current capacity.
  while (needed >= new_capacity)
    new_capacity *= 2;

  new_stack = mm__->realloc(mm__, ctx->stack, sizeof(*ctx->stack) * new_capacity);
  // TODO: report realloc failure to the caller instead of asserting; this
  // function has no way to signal an error through its void return.
  assert(new_stack != NULL);
  ctx->stack = new_stack;
  ctx->stack_capacity = new_capacity;
}
|
||||
|
||||
// Replays an (already inverted) trace on the stack VM and builds the parse
// result.
//
// mm__:      allocator used for the result arena and the VM stack.
// orig_prog: only used for its action table (`actions`).
// trace:     head of the trace list, walked via `next`.
// input:     the input buffer; TT_BYTES tokens point directly into it.
// len:       length of `input` (currently unused here).
//
// Returns a result allocated in a fresh arena (stored in `res->arena`) when an
// SVM_ACCEPT entry is reached. Returns NULL, after deleting that arena, when an
// action reports failure or the trace ends without SVM_ACCEPT.
//
// NOTE(review): ctx.stack is allocated with h_new and is not released on any
// path through this function.
HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len) {
  HSVMContext ctx;
  HArena *arena = h_new_arena(mm__, 0);
  HParsedToken *tmp_res;
  HRVMTrace *cur;

  (void)len;

  ctx.stack_count = 0;
  ctx.stack_capacity = 16;
  ctx.stack = h_new(HParsedToken*, ctx.stack_capacity);

  for (cur = trace; cur; cur = cur->next) {
    switch (cur->opcode) {
    case SVM_PUSH:
      // Push a mark that remembers the input position it was created at.
      svm_stack_ensure_cap(mm__, &ctx, 1);
      tmp_res = a_new(HParsedToken, 1);
      tmp_res->token_type = TT_MARK;
      tmp_res->index = cur->input_pos;
      tmp_res->bit_offset = 0;
      ctx.stack[ctx.stack_count++] = tmp_res;
      break;
    case SVM_NOP:
      break;
    case SVM_ACTION:
      // Action should modify stack appropriately
      if (!orig_prog->actions[cur->arg].fn(arena, &ctx, orig_prog->actions[cur->arg].env)) {
        // The action failed: abandon the replay and report a failed parse.
        h_delete_arena(arena);
        return NULL;
      }
      break;
    case SVM_CAPTURE:
      // Top of stack must be a mark
      // This replaces said mark in-place with a TT_BYTES.
      // The top element lives at stack_count - 1; stack_count itself indexes
      // the first free slot.
      assert(ctx.stack_count > 0);
      tmp_res = ctx.stack[ctx.stack_count - 1];
      assert(tmp_res->token_type == TT_MARK);

      tmp_res->token_type = TT_BYTES;
      // TODO: Will need to copy if bit_offset is nonzero
      assert(tmp_res->bit_offset == 0);

      tmp_res->bytes.token = input + tmp_res->index;
      tmp_res->bytes.len = cur->input_pos - tmp_res->index + 1; // inclusive
      break;
    case SVM_ACCEPT: {
      HParseResult *res;
      assert(ctx.stack_count == 1);
      res = a_new(HParseResult, 1);
      res->ast = ctx.stack[0];
      res->bit_length = cur->input_pos * 8;
      res->arena = arena;
      return res;
    }
    }
  }

  // The trace ended without an SVM_ACCEPT entry.
  h_delete_arena(arena);
  return NULL;
}
|
||||
|
||||
HParseResult *run_trace(HAllocator mm__, HRVMTrace *trace, uint8_t *input, int len) {
|
||||
|
||||
}
|
||||
// TODO: Implement the primitive actions
|
||||
|
|
|
|||
|
|
@ -27,12 +27,29 @@ typedef struct HRVMInsn_{
|
|||
uint16_t arg;
|
||||
} HRVMInsn;
|
||||
|
||||
// Token type given to the marks pushed by SVM_PUSH; it aliases the reserved
// internal token type. Declared `static` so that each file including this
// header gets a private copy instead of a duplicate external definition.
static const HTokenType TT_MARK = TT_RESERVED_1;
|
||||
|
||||
// State of the stack VM: a growable array of token pointers.
typedef struct HSVMContext_ {
|
||||
// Backing array of stack slots.
HParsedToken **stack;
|
||||
// Number of slots currently in use; also the index of the next free slot.
size_t stack_count;
|
||||
// Number of slots allocated in `stack`.
size_t stack_capacity;
|
||||
} HSVMContext;
|
||||
|
||||
// These actions all assume that the items on the stack are not
|
||||
// aliased anywhere.
|
||||
// One entry of a program's action table: a callback plus its environment.
typedef struct HSVMAction_ {
|
||||
// Called with the result arena, the VM context and `env`; a false return
// is treated by the caller as the action having failed.
bool (*fn)(HArena *arena, HSVMContext *ctx, void* env);
|
||||
// Opaque pointer passed back to `fn` as its last argument.
void* env;
|
||||
} HSVMAction;
|
||||
|
||||
typedef struct HRVMProg_ {
|
||||
size_t length;
|
||||
size_t action_count;
|
||||
HAction *actions;
|
||||
HRVMInsn *insns;
|
||||
};
|
||||
HSVMAction *actions;
|
||||
} HRVMProg;
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
|
|
|||
112
src/backends/regexvm_asm.pl
Normal file
112
src/backends/regexvm_asm.pl
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
# The input file consists of a sequence of blocks, which can be parsed
|
||||
# as SVM test cases, RVM test cases, or C functions. Each block starts
|
||||
# with a header line, then a sequence of options, and finally text in
|
||||
# a format defined by the block type.
|
||||
#
|
||||
# Header lines start with "+TYPE", optionally followed by a name. This
|
||||
# name is semantically meaningful for SVM and RVM blocks; it
|
||||
# determines the name of the test case.
|
||||
|
||||
# A C block's name is not used, and it takes no options. The body
|
||||
# (which continues until the first line that looks like a header), is
|
||||
# just passed straight through into the C source.
|
||||
|
||||
# SVM blocks' names are the GLib test case name. The underlying
|
||||
# function's name is derived by substituting invalid characters with
|
||||
# '_'. Note that this can result in collisions (eg, /foo_bar/baz
|
||||
# collides with /foo/bar_baz). If this happens, it's your own damn
|
||||
# fault; rename the blocks. SVM blocks take three different options:
|
||||
# @input, @output, and @pre. The @input pragma's argument is a
|
||||
# C-quoted string that gets passed into the VM as the input string,
|
||||
# and @output is a C-quoted string that is compared against
|
||||
# h_write_result_unamb. @pre lines are prepended verbatim to the
|
||||
# function body (with the @pre stripped, of course); they can be used
|
||||
# to initialize environment values.
|
||||
#
|
||||
# SVM instructions consist of either two or four fields:
|
||||
#
|
||||
# input_pos opcode [arg env]
|
||||
#
|
||||
# input_pos and opcode correspond to the fields in HRVMTrace. arg and
|
||||
# env are used to populate an HSVMAction; arg is the function, and env
|
||||
# is the object whose address should be used as the env.
|
||||
|
||||
# RVM blocks are very similar to SVM blocks; the name and options are
|
||||
# handled exactly the same way. The assembly text is handled slightly
|
||||
# differently; the format is:
|
||||
#
|
||||
# [label:] opcode [arg ...]
|
||||
#
|
||||
# For FORK and GOTO, the arg should be a label that is defined
|
||||
# elsewhere.
|
||||
#
|
||||
# For ACTION, the arguments are handled the same way as with SVM.
|
||||
#
|
||||
# MATCH takes two arguments, each of which can be any C integer
|
||||
# constant (not including character constants), which form the lower
|
||||
# and upper bounds of the matched character, respectively.
|
||||
#
|
||||
# No other RVM instructions take an argument.
|
||||
|
||||
# At the beginning of any line, comments preceded by '#' are allowed;
|
||||
# they are replaced by C++ comments and inserted in the nearest valid
|
||||
# location in the output.
|
||||
|
||||
# Parser state: "TOP" until the first block header is seen, then the type of
# the current block ("C", "RVM" or "SVM").
my $mode = "TOP";
|
||||
|
||||
# common regexes:
|
||||
my $re_ident = qr/[A-Za-z_][A-Za-z0-9_]*/;
|
||||
my $re_cstr = qr/"(?:[^\\"]|\\["'abefnrtv0\\]|\\x[0-9a-fA-F]{2}|\\[0-7]{3})*"/;
|
||||
|
||||
|
||||
# Handlers for SVM blocks. Each handler receives the per-block environment
# hash ref as its first argument.
my %svm = (
    # Records the block (test case) name.
    name => sub {
        my ($env, $name) = @_;
        $env->{name} = $name;
    },
    # Handles one "@name value" option line.
    pragma => sub {
        my ($env, $name, $val) = @_;
        if ($name eq "input") {
            chomp($env->{input} = $val);
        } elsif ($name eq "output") {
            chomp($env->{output} = $val);
        } elsif ($name eq "pre") {
            # The braces are required: @$env->{pre} would dereference $env
            # itself as an array rather than the array stored under {pre}.
            push(@{$env->{pre}}, $val);
        } else {
            warn "Invalid SVM pragma";
        }
    },
    # Parses one instruction line: input_pos opcode [arg env].
    body => sub {
        my ($env, $line) = @_;
        my ($ipos, $op, $arg, $argenv);
        if ($line =~ /^\s*(\d+)\s+(PUSH|NOP|ACTION|CAPTURE|ACCEPT)(?:\s+($re_ident)\s+($re_ident))?/) {
            if ($2 eq "PUSH") {
                # TODO: implement all the opcodes
            }
        }
    }
);
|
||||
|
||||
|
||||
# Main loop: read the input line by line, tracking which block type we are in.
while (<>) {
    # A header line starts with a literal '+', which must be escaped in the
    # pattern; an unescaped '+' would act as a quantifier on the '^' anchor.
    if (/^\+(C|RVM|SVM)/) {
        $mode = $1;
    }

    if ($mode eq "TOP") {
        if (/^#(.*)/) {
            # (.*) stops before the line's newline, so emit one explicitly;
            # otherwise the following output would land inside the // comment.
            print "// $1\n";
            next;
        }
    } elsif ($mode eq "SVM") {
    } elsif ($mode eq "RVM") {
    } elsif ($mode eq "C") {
    }

}
|
||||
|
||||
|
||||
|
|
@ -84,4 +84,12 @@ void h_parse_result_free(HParseResult *result) {
|
|||
h_delete_arena(result->arena);
|
||||
}
|
||||
|
||||
// Always returns false; `env` is accepted but ignored.
bool h_false(void* env) {
|
||||
(void)env;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Always returns true; `env` is accepted but ignored.
bool h_true(void* env) {
|
||||
(void)env;
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ typedef enum HTokenType_ {
|
|||
TT_SINT,
|
||||
TT_UINT,
|
||||
TT_SEQUENCE,
|
||||
TT_RESERVED_1, // reserved for internal use
|
||||
TT_USER = 64,
|
||||
TT_ERR,
|
||||
TT_MAX
|
||||
|
|
@ -78,7 +79,9 @@ typedef struct HParsedToken_ {
|
|||
} HParsedToken;
|
||||
|
||||
/**
|
||||
* The result of a successful parse.
|
||||
* The result of a successful parse. Note that this may reference the
|
||||
* input string.
|
||||
*
|
||||
* If a parse fails, the parse result will be NULL.
|
||||
* If a parse is successful but there's nothing there (i.e., if end_p
|
||||
* succeeds) then there's a parse result but its ast is NULL.
|
||||
|
|
|
|||
|
|
@ -223,6 +223,9 @@ int h_hashtable_present(HHashTable* ht, void* key);
|
|||
void h_hashtable_del(HHashTable* ht, void* key);
|
||||
void h_hashtable_free(HHashTable* ht);
|
||||
|
||||
bool h_false(void*);
|
||||
bool h_true(void*);
|
||||
|
||||
#if 0
|
||||
#include <stdlib.h>
|
||||
#define h_arena_malloc(a, s) malloc(s)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue