Got a lot of regex test cases working

This commit is contained in:
Dan Hirsch 2013-05-23 23:26:22 +02:00
parent f37a13ef41
commit 0600440b7c
11 changed files with 148 additions and 14 deletions

View file

@ -11,7 +11,10 @@ CONFIG_VARS= INCLUDE_TESTS
.DEFAULT_GOAL := all .DEFAULT_GOAL := all
%: nojni: all
nojni: SUBDIRS:=$(filter-out jni,$(SUBDIRS))
all clean:
+for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done +for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done
test: src/test_suite test: src/test_suite

View file

@ -6,7 +6,7 @@ endif
include $(TOPLEVEL)/config.mk include $(TOPLEVEL)/config.mk
TEST_CFLAGS = $(shell pkg-config --cflags glib-2.0) -DINCLUDE_TESTS TEST_CFLAGS = $(shell pkg-config --cflags glib-2.0) -DINCLUDE_TESTS
TEST_LDFLAGS = $(shell pkg-config --libs glib-2.0) -lrt TEST_LDFLAGS = $(shell pkg-config --libs glib-2.0) -lrt -ldl
CFLAGS := -std=gnu99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-attributes -g CFLAGS := -std=gnu99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-attributes -g
LDFLAGS := LDFLAGS :=

View file

@ -83,3 +83,5 @@ test: test_suite
test_suite: $(TESTS) libhammer.a test_suite: $(TESTS) libhammer.a
$(call hush, "Linking $@") $(CC) -o $@ $^ $(LDFLAGS) $(TEST_LDFLAGS) $(call hush, "Linking $@") $(CC) -o $@ $^ $(LDFLAGS) $(TEST_LDFLAGS)
backends/regex.o: backends/regex_debug.c

View file

@ -1,3 +1,4 @@
#define _GNU_SOURCE
#include <string.h> #include <string.h>
#include <assert.h> #include <assert.h>
#include "../internal.h" #include "../internal.h"
@ -13,6 +14,7 @@ typedef enum HSVMOp_ {
SVM_ACTION, // Same meaning as RVM_ACTION SVM_ACTION, // Same meaning as RVM_ACTION
SVM_CAPTURE, // Same meaning as RVM_CAPTURE SVM_CAPTURE, // Same meaning as RVM_CAPTURE
SVM_ACCEPT, SVM_ACCEPT,
SVM_OPCOUNT
} HSVMOp; } HSVMOp;
typedef struct HRVMTrace_ { typedef struct HRVMTrace_ {
@ -42,8 +44,8 @@ HRVMTrace *invert_trace(HRVMTrace *trace) {
trace->next = last; trace->next = last;
last = trace; last = trace;
trace = next; trace = next;
} while (trace->next); } while (trace);
return trace; return last;
} }
void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) { void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) {
@ -151,7 +153,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_
case RVM_STEP: case RVM_STEP:
// save thread // save thread
live_threads++; live_threads++;
heads_n[THREAD.ip++] = THREAD.trace; heads_n[++THREAD.ip] = THREAD.trace;
ipq_top--; ipq_top--;
goto next_insn; goto next_insn;
} }
@ -221,15 +223,15 @@ HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace,
case SVM_CAPTURE: case SVM_CAPTURE:
// Top of stack must be a mark // Top of stack must be a mark
// This replaces said mark in-place with a TT_BYTES. // This replaces said mark in-place with a TT_BYTES.
assert(ctx.stack[ctx.stack_count]->token_type == TT_MARK); assert(ctx.stack[ctx.stack_count-1]->token_type == TT_MARK);
tmp_res = ctx.stack[ctx.stack_count]; tmp_res = ctx.stack[ctx.stack_count-1];
tmp_res->token_type = TT_BYTES; tmp_res->token_type = TT_BYTES;
// TODO: Will need to copy if bit_offset is nonzero // TODO: Will need to copy if bit_offset is nonzero
assert(tmp_res->bit_offset == 0); assert(tmp_res->bit_offset == 0);
tmp_res->bytes.token = input + tmp_res->index; tmp_res->bytes.token = input + tmp_res->index;
tmp_res->bytes.len = cur->input_pos - tmp_res->index + 1; // inclusive tmp_res->bytes.len = cur->input_pos - tmp_res->index;
break; break;
case SVM_ACCEPT: case SVM_ACCEPT:
assert(ctx.stack_count == 1); assert(ctx.stack_count == 1);
@ -351,6 +353,7 @@ static int h_regex_compile(HAllocator *mm__, HParser* parser, const void* params
h_free(prog); h_free(prog);
return 2; return 2;
} }
h_rvm_insert_insn(prog, RVM_ACCEPT, 0);
parser->backend_data = prog; parser->backend_data = prog;
return 0; return 0;
} }
@ -364,3 +367,7 @@ HParserBackendVTable h__regex_backend_vtable = {
.parse = h_regex_parse, .parse = h_regex_parse,
.free = h_regex_free .free = h_regex_free
}; };
#ifndef NDEBUG
#include "regex_debug.c"
#endif

View file

@ -0,0 +1,83 @@
// Intended to be included from regex.c (debug builds only; see the #ifndef NDEBUG include there)
#define _GNU_SOURCE
#include <stdio.h>
#include <malloc.h>
// This is some spectacularly non-portable code... but whee!
#include <dlfcn.h>
/*
 * Build a printable label for a code address, for use in debug dumps.
 *
 * The address is looked up with dladdr():
 *   - exact match on a symbol's start address  -> "name"
 *   - address inside a known symbol            -> "name+0xOFFSET"
 *   - no symbol information available          -> the address formatted with %p
 *
 * Returns a heap-allocated string that the caller must free(), or NULL if
 * the string could not be allocated.
 */
char* getsym(void* addr) {
  Dl_info dli;
  char* retstr = NULL;
  int written;

  if (dladdr(addr, &dli) != 0 && dli.dli_sname != NULL) {
    if (dli.dli_saddr == addr)
      return strdup(dli.dli_sname);
    // Subtract through char* (arithmetic on void* is a GNU extension) and
    // convert explicitly so the argument really is what %lx expects.
    written = asprintf(&retstr, "%s+0x%lx", dli.dli_sname,
                       (unsigned long)((char*)addr - (char*)dli.dli_saddr));
  } else {
    written = asprintf(&retstr, "%p", addr);
  }

  // When asprintf fails the contents of retstr are unspecified; never hand
  // that pointer to a caller who is going to free() it.
  return (written < 0) ? NULL : retstr;
}
// Printable mnemonics for the regex-VM opcodes. dump_rvm_prog() indexes this
// table directly with an instruction's op field, and the array is sized by
// RVM_OPCOUNT.
// NOTE(review): the entry order presumably has to mirror the declaration order
// of the RVM opcode enum (not visible here) -- confirm whenever an opcode is
// added or reordered.
const char* rvm_op_names[RVM_OPCOUNT] = {
"ACCEPT",
"GOTO",
"FORK",
"PUSH",
"ACTION",
"CAPTURE",
"EOF",
"MATCH",
"STEP"
};
// Printable mnemonics for the stack-VM opcodes; the array is sized by
// SVM_OPCOUNT. Nothing in the visible part of this file reads the table yet.
// NOTE(review): presumably meant to be indexed by an HSVMOp value, so the
// entry order must follow that enum's declaration order -- confirm.
const char* svm_op_names[SVM_OPCOUNT] = {
"PUSH",
"NOP",
"ACTION",
"CAPTURE",
"ACCEPT"
};
/*
 * Print a disassembly listing of a compiled regex-VM program to stdout.
 *
 * One line is written per instruction: its index, the opcode mnemonic and an
 * opcode-specific rendering of the argument:
 *   GOTO / FORK  the argument as an unsigned 16-bit value
 *   ACTION       the symbol name of the action callback (via getsym())
 *   MATCH        the accepted byte range "lo - hi", or NONE if it is empty
 *   others       no argument
 *
 * prog: the program to dump; prog->insns must hold prog->length instructions.
 */
void dump_rvm_prog(HRVMProg *prog) {
  char* symref;
  for (unsigned int i = 0; i < prog->length; i++) {
    HRVMInsn *insn = &prog->insns[i];
    // Do not index past the name table if the opcode is out of range.
    const char *opname = ((unsigned int)insn->op < (unsigned int)RVM_OPCOUNT)
      ? rvm_op_names[insn->op]
      : "???";
    printf("%4u %-10s", i, opname);
    switch (insn->op) {
    case RVM_GOTO:
    case RVM_FORK:
      // Printed unsigned so that large values do not show up as negative.
      printf("%hu\n", insn->arg);
      break;
    case RVM_ACTION:
      symref = getsym(prog->actions[insn->arg].action);
      // TODO: somehow format the argument to action
      printf("%s\n", symref != NULL ? symref : "(unknown)");
      free(symref);
      break;
    case RVM_MATCH: {
      // The low byte of the argument is the lower bound of the accepted
      // range, the high byte the upper bound (a single byte c is encoded as
      // c | c << 8, "any byte" as 0xFF00).
      uint8_t low = insn->arg & 0xff;
      uint8_t high = (insn->arg >> 8) & 0xff;
      if (low > high) {
        // Lower bound above upper bound: the range is empty (e.g. 0x00FF).
        printf("NONE\n");
      } else {
        // Show the character as well when it is printable ASCII (0x20..0x7e).
        if (low >= 0x20 && low <= 0x7e)
          printf("%02hhx ('%c')", low, low);
        else
          printf("%02hhx", low);
        if (high >= 0x20 && high <= 0x7e)
          printf(" - %02hhx ('%c')\n", high, high);
        else
          printf(" - %02hhx\n", high);
      }
      break;
    }
    default:
      printf("\n");
    }
  }
}

View file

@ -97,6 +97,8 @@ static bool h_svm_action_bits(HArena *arena, HSVMContext *ctx, void* env) {
uint64_t res = 0; uint64_t res = 0;
for (size_t i = 0; i < top->bytes.len; i++) for (size_t i = 0; i < top->bytes.len; i++)
res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses. res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses.
uint64_t msb = (env_->signedp ? 1LL:0) << (top->bytes.len * 8 - 1);
res = (res ^ msb) - msb;
top->uint = res; // possibly cast to signed through union top->uint = res; // possibly cast to signed through union
top->token_type = (env_->signedp ? TT_SINT : TT_UINT); top->token_type = (env_->signedp ? TT_SINT : TT_UINT);
return true; return true;
@ -105,7 +107,7 @@ static bool h_svm_action_bits(HArena *arena, HSVMContext *ctx, void* env) {
static bool bits_ctrvm(HRVMProg *prog, void* env) { static bool bits_ctrvm(HRVMProg *prog, void* env) {
struct bits_env *env_ = (struct bits_env*)env; struct bits_env *env_ = (struct bits_env*)env;
h_rvm_insert_insn(prog, RVM_PUSH, 0); h_rvm_insert_insn(prog, RVM_PUSH, 0);
for (size_t i=0; (i < env_->length)/8; ++i) { // FUTURE: when we can handle non-byte-aligned, the env_->length/8 part will be different for (size_t i=0; i < (env_->length/8); ++i) { // FUTURE: when we can handle non-byte-aligned, the env_->length/8 part will be different
h_rvm_insert_insn(prog, RVM_MATCH, 0xFF00); h_rvm_insert_insn(prog, RVM_MATCH, 0xFF00);
h_rvm_insert_insn(prog, RVM_STEP, 0); h_rvm_insert_insn(prog, RVM_STEP, 0);
} }

View file

@ -1,3 +1,4 @@
#include <assert.h>
#include "parser_internal.h" #include "parser_internal.h"
static HParseResult* parse_ch(void* env, HParseState *state) { static HParseResult* parse_ch(void* env, HParseState *state) {
@ -20,11 +21,26 @@ static HCFChoice* desugar_ch(HAllocator *mm__, void *env) {
return ret; return ret;
} }
// Stack-VM action for single-character matches: rewrites, in place, the
// TT_BYTES token on top of the stack as a TT_UINT whose value is the captured
// bytes folded together, first byte most significant.
// arena and env are not used. Always returns true.
static bool h_svm_action_ch(HArena *arena, HSVMContext *ctx, void* env) {
  // BUG: relies on undefined behaviour: int64_t is a signed uint64_t; not necessarily true on 32-bit
  // NOTE(review): this action only ever produces TT_UINT, so confirm whether
  // the note above still applies here.
  HParsedToken *tok = ctx->stack[ctx->stack_count-1];
  assert(tok->token_type == TT_BYTES);

  // Accumulate the complete value before storing anything back into the token.
  uint64_t acc = 0;
  size_t idx = 0;
  while (idx < tok->bytes.len) {
    acc = (acc << 8) | tok->bytes.token[idx]; // TODO: Handle other endiannesses.
    ++idx;
  }

  tok->uint = acc;
  tok->token_type = TT_UINT;
  return true;
}
static bool ch_ctrvm(HRVMProg *prog, void* env) { static bool ch_ctrvm(HRVMProg *prog, void* env) {
uint8_t c = (uint8_t)(unsigned long)(env); uint8_t c = (uint8_t)(unsigned long)(env);
// TODO: Does this capture anything? // TODO: Does this capture anything?
h_rvm_insert_insn(prog, RVM_MATCH, c & c << 8); h_rvm_insert_insn(prog, RVM_PUSH, 0);
h_rvm_insert_insn(prog, RVM_MATCH, c | c << 8);
h_rvm_insert_insn(prog, RVM_STEP, 0); h_rvm_insert_insn(prog, RVM_STEP, 0);
h_rvm_insert_insn(prog, RVM_CAPTURE, 0);
h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ch, env));
return true; return true;
} }

View file

@ -1,3 +1,4 @@
#include <assert.h>
#include <string.h> #include <string.h>
#include "../internal.h" #include "../internal.h"
#include "parser_internal.h" #include "parser_internal.h"
@ -22,23 +23,42 @@ static HCFChoice* desugar_charset(HAllocator *mm__, void *env) {
return ret; return ret;
} }
// Stack-VM action for character-set matches: rewrites, in place, the TT_BYTES
// token on top of the stack as a TT_UINT whose value is the captured bytes
// folded together, first byte most significant.
// arena and env are not used. Always returns true.
static bool h_svm_action_ch(HArena *arena, HSVMContext *ctx, void* env) {
  // BUG: relies on undefined behaviour: int64_t is a signed uint64_t; not necessarily true on 32-bit
  // NOTE(review): this action only ever produces TT_UINT, so confirm whether
  // the note above still applies here.
  HParsedToken *tok = ctx->stack[ctx->stack_count-1];
  assert(tok->token_type == TT_BYTES);

  // Accumulate the complete value before storing anything back into the token.
  uint64_t acc = 0;
  size_t idx = 0;
  while (idx < tok->bytes.len) {
    acc = (acc << 8) | tok->bytes.token[idx]; // TODO: Handle other endiannesses.
    ++idx;
  }

  tok->uint = acc;
  tok->token_type = TT_UINT;
  return true;
}
// FUTURE: this is horribly inefficient // FUTURE: this is horribly inefficient
static bool cs_ctrvm(HRVMProg *prog, void *env) { static bool cs_ctrvm(HRVMProg *prog, void *env) {
HCharset cs = (HCharset)env; HCharset cs = (HCharset)env;
h_rvm_insert_insn(prog, RVM_PUSH, 0);
uint16_t start = h_rvm_get_ip(prog); uint16_t start = h_rvm_get_ip(prog);
for (size_t i=0; i<256; ++i) { for (size_t i=0; i<256; ++i) {
// TODO: merge ranges.
if (charset_isset(cs, i)) { if (charset_isset(cs, i)) {
uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0);
h_rvm_insert_insn(prog, RVM_MATCH, i & i << 8); h_rvm_insert_insn(prog, RVM_MATCH, i | i << 8);
h_rvm_insert_insn(prog, RVM_GOTO, 0); h_rvm_insert_insn(prog, RVM_GOTO, 0);
h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog));
} }
} }
h_rvm_insert_insn(prog, RVM_MATCH, 0x00FF);
uint16_t jump = h_rvm_insert_insn(prog, RVM_STEP, 0); uint16_t jump = h_rvm_insert_insn(prog, RVM_STEP, 0);
for (size_t i=start; i<jump; ++i) { for (size_t i=start; i<jump; ++i) {
if (RVM_GOTO == prog->insns[i].op) if (RVM_GOTO == prog->insns[i].op)
h_rvm_patch_arg(prog, i, jump); h_rvm_patch_arg(prog, i, jump);
} }
h_rvm_insert_insn(prog, RVM_CAPTURE, 0);
h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ch, env));
return true; return true;
} }

View file

@ -69,7 +69,7 @@ static bool token_ctrvm(HRVMProg *prog, void *env) {
HToken *t = (HToken*)env; HToken *t = (HToken*)env;
h_rvm_insert_insn(prog, RVM_PUSH, 0); h_rvm_insert_insn(prog, RVM_PUSH, 0);
for (int i=0; i<t->len; ++i) { for (int i=0; i<t->len; ++i) {
h_rvm_insert_insn(prog, RVM_MATCH, t->str[i] & t->str[i] << 8); h_rvm_insert_insn(prog, RVM_MATCH, t->str[i] | t->str[i] << 8);
h_rvm_insert_insn(prog, RVM_STEP, 0); h_rvm_insert_insn(prog, RVM_STEP, 0);
} }
h_rvm_insert_insn(prog, RVM_CAPTURE, 0); h_rvm_insert_insn(prog, RVM_CAPTURE, 0);

View file

@ -73,7 +73,7 @@ static bool ws_ctrvm(HRVMProg *prog, void *env) {
h_rvm_insert_insn(prog, RVM_GOTO, start); h_rvm_insert_insn(prog, RVM_GOTO, start);
h_rvm_patch_arg(prog, next, h_rvm_get_ip(prog)); h_rvm_patch_arg(prog, next, h_rvm_get_ip(prog));
} }
return h_compile_regex(prog, p->env); return h_compile_regex(prog, p);
} }
static const HParserVtable whitespace_vt = { static const HParserVtable whitespace_vt = {

View file

@ -21,7 +21,8 @@ static void* system_realloc(HAllocator *allocator, void* ptr, size_t size) {
} }
static void system_free(HAllocator *allocator, void* ptr) { static void system_free(HAllocator *allocator, void* ptr) {
free(ptr - sizeof(size_t)); if (ptr != NULL)
free(ptr - sizeof(size_t));
} }
HAllocator system_allocator = { HAllocator system_allocator = {