Regex VM finished but untested.

This commit is contained in:
Dan Hirsch 2013-03-09 21:42:49 -08:00
parent a8ab63e096
commit c19d7bb66e
8 changed files with 283 additions and 32 deletions

112
src/backends/regexvm_asm.pl Normal file
View file

@ -0,0 +1,112 @@
#!/usr/bin/perl -w
use strict;
# The input file consists of a sequence of blocks, which can be parsed
# as SVM test cases, RVM test cases, or C functions. Each block starts
# with a header line, then a sequence of options, and finally text in
# a format defined by the block type.
#
# Header lines start with "+TYPE", optionally followed by a name. This
# name is semantically meaningful for SVM and RVM blocks; it
# determines the name of the test case.
# A C block's name is not used, and it takes no options. The body
# (which continues until the first line that looks like a header) is
# passed straight through into the C source.
# SVM blocks' names are the GLib test case name. The underlying
# function's name is derived by substituting invalid characters with
# '_'. Note that this can result in collisions (eg, /foo_bar/baz
# collides with /foo/bar_baz). If this happens, it's your own damn
# fault; rename the blocks. SVM blocks take three different options:
# @input, @output, and @pre. The @input pragma's argument is a
# C-quoted string that gets passed into the VM as the input string,
# and @output is a C-quoted string that is compared against
# h_write_result_unamb. @pre lines are prepended verbatim to the
# function body (with the @pre stripped, of course); they can be used
# to initialize environment values.
#
# SVM instructions consist of either two or four fields:
#
# input_pos opcode [arg env]
#
# input_pos and opcode correspond to the fields in HRVMTrace. arg and
# env are used to populate an HSVMAction; arg is the function, and env
# is the object whose address should be used as the env.
# RVM blocks are very similar to SVM blocks; the name and options are
# handled exactly the same way. The assembly text is handled slightly
# differently; the format is:
#
# [label:] opcode [arg ...]
#
# For FORK and GOTO, the arg should be a label that is defined
# elsewhere.
#
# For ACTION, the arguments are handled the same way as with SVM.
#
# MATCH takes two arguments, each of which can be any C integer
# constant (not including character constants), which form the lower
# and upper bounds of the matched character, respectively.
#
# No other RVM instructions take an argument.
# At the beginning of any line, comments preceded by '#' are allowed;
# they are replaced by C++ comments and inserted in the nearest valid
# location in the output.
# Current parser state: "TOP" until a block header line is seen, after
# which it holds the type of the block being read ("C", "RVM" or "SVM").
# This must be an assignment; a numeric comparison here would leave
# $mode undefined.
my $mode = "TOP";
# common regexes:
# A C-style identifier: a letter or underscore followed by letters,
# digits or underscores.
my $re_ident = qr/[A-Za-z_][A-Za-z0-9_]*/;
# A double-quoted C string literal. Accepts ordinary characters, the
# single-character backslash escapes listed in the class, \xHH with
# exactly two hex digits, and \OOO with exactly three octal digits.
my $re_cstr = qr/"(?:[^\\"]|\\["'abefnrtv0\\]|\\x[0-9a-fA-F]{2}|\\[0-7]{3})*"/;
# Handlers for SVM blocks. Each handler receives the per-block
# environment hash reference ($env) as its first argument and records
# what it parsed there.
my %svm = (
    # name($env, $name): remember the block's name.
    name => sub {
        my ($env, $name) = @_;
        $env->{name} = $name;
    },
    # pragma($env, $name, $val): handle one option line.
    #   input  - stored in $env->{input} with the trailing newline removed
    #   output - stored in $env->{output} with the trailing newline removed
    #   pre    - appended, unmodified, to the list in $env->{pre}
    # Any other pragma name produces a warning and is otherwise ignored.
    pragma => sub {
        my ($env, $name, $val) = @_;
        if ($name eq "input") {
            chomp($env->{input} = $val);
        } elsif ($name eq "output") {
            chomp($env->{output} = $val);
        } elsif ($name eq "pre") {
            # The braces are required: @$env->{pre} would dereference
            # $env itself as an array. This form dereferences the array
            # stored under the "pre" key, autovivifying it on first use.
            push(@{ $env->{pre} }, $val);
        } else {
            warn "Invalid SVM pragma";
        }
    },
    # body($env, $line): parse one SVM instruction line of the form
    #   input_pos opcode [arg env]
    # Code generation for the opcodes is not implemented yet.
    body => sub {
        my ($env, $line) = @_;
        my ($ipos, $op, $arg, $argenv);
        # Bind the captures to named variables right away instead of
        # relying on $1..$4 surviving until they are used. $arg and
        # $argenv are undef when the optional pair is absent.
        if (($ipos, $op, $arg, $argenv) = $line =~ /^\s*(\d+)\s+(PUSH|NOP|ACTION|CAPTURE|ACCEPT)(?:\s+($re_ident)\s+($re_ident))?/) {
            if ($op eq "PUSH") {
                # TODO: implement all the opcodes
            }
        }
    }
);
# Main loop: read the input line by line, tracking which kind of block
# the current line belongs to and dispatching on that mode.
while (<>) {
    # A header line starts with a literal '+' followed by the block
    # type. The '+' has to be escaped; unescaped it is a regex
    # quantifier and the header would never be recognised.
    if (/^\+(C|RVM|SVM)/) {
        $mode = $1;
    }
    if ($mode eq "TOP") {
        # Before the first block, '#' comment lines become C++ comments.
        if (/^#(.*)/) {
            # (.*) stops before the line's newline, so put one back;
            # otherwise the "//" comment would swallow whatever is
            # printed next.
            print "// $1\n";
            next;
        }
    } elsif ($mode eq "SVM") {
        # TODO: SVM block handling is not implemented yet.
    } elsif ($mode eq "RVM") {
        # TODO: RVM block handling is not implemented yet.
    } elsif ($mode eq "C") {
        # TODO: C block pass-through is not implemented yet.
    }
}