Sped up charset parsing; fixed choice operator

This commit is contained in:
Dan Hirsch 2012-05-04 21:23:56 +01:00
parent a076c4d12c
commit 2af69dd8f9
5 changed files with 81 additions and 39 deletions

View file

@ -11,6 +11,9 @@ SUBDIRS = src \
%:
+for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done
test: src/test_suite
$<
define SUBDIR_TEMPLATE
$(1)/%:
$$(MAKE) -C $(1) $$*

14
NOTES
View file

@ -4,3 +4,17 @@ NOTES
Regarding parse_result_t:
If a parse fails, the parse_result_t will be NULL.
If a parse is successful but there's nothing there (i.e., if end_p succeeds), then there's a parse_result_t but its ast is NULL.
Regarding input location:
If parse is successful, input is left at beginning of next thing to be read.
If parse fails, location is UNPREDICTABLE.
If CONSISTENCY_CHECK is defined, enable a bunch of additional internal
consistency checks.
TODO: Add consistency check to the bitreader
We should support the use of parse-table-based parse methods; add a
parse_compile method that must be called before the newly-created
parser is used.

View file

@ -8,10 +8,14 @@
#define MSB(range) (1:range)
#define LDB(range,i) (((i)>>LSB(range))&((1<<(MSB(range)-LSB(range)+1))-1))
long long read_bits(input_stream_t* state, int count, char signed_p) {
// BUG: Does not
long long out = 0;
int offset = 0;
long long msb = (!!signed_p) << (count - 1); // 0 if unsigned, else 1 << (nbits - 1)
// BUG: does not stop early in case of
if ((state->bit_offset & 0x7) == 0 && (count & 0x7) == 0) {
// fast path
if (state->endianness & BYTE_BIG_ENDIAN) {

View file

@ -19,14 +19,14 @@
#include "internal.h"
#include <assert.h>
#include <string.h>
/* TODO(thequux): rewrite to follow new parse_state_t layout
parse_state_t* from(parse_state_t *ps, const size_t index) {
parse_state_t p = { ps->input, ps->index + index, ps->length - index, ps->cache };
parse_state_t *ret = g_new(parse_state_t, 1);
*ret = p;
*ret = *ps;
ret->input_stream.index += index;
return ret;
}
*/
const uint8_t* substring(const parse_state_t *ps, const size_t start, const size_t end) {
if (end > start && (ps->input_stream.index + end) < ps->input_stream.length) {
gpointer ret = g_malloc(end - start);
@ -48,8 +48,7 @@ const gchar* to_string(parse_state_t *ps) {
return g_strescape((const gchar*)(ps->input_stream.input), NULL);
}
guint djbhash(const
uint8_t *buf, size_t len) {
guint djbhash(const uint8_t *buf, size_t len) {
guint hash = 5381;
while (len--) {
hash = hash * 33 + *buf++;
@ -75,6 +74,12 @@ parse_result_t* do_parse(const parser_t* parser, parse_state_t *state) {
res = parser->fn(parser->env, state);
// update the cache
g_hash_table_replace(state->cache, &key, res);
#ifdef CONSISTENCY_CHECK
if (!res) {
state->input_stream = INVALID;
state->input_stream.input = key.input_pos.input;
}
#endif
return res;
}
}
@ -135,47 +140,41 @@ typedef struct {
uint8_t upper;
} range_t;
static parse_result_t* parse_range(void* env, parse_state_t *state) {
range_t *range = (range_t*)env;
uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
if (range->lower <= r && range->upper >= r) {
parsed_token_t *tok = g_new(parsed_token_t, 1);
tok->token_type = TT_UINT; tok->uint = r;
return make_result(tok);
} else {
return NULL;
}
}
const parser_t* range(const uint8_t lower, const uint8_t upper) {
range_t *r = g_new(range_t, 1);
r->lower = lower; r->upper = upper;
parser_t *ret = g_new(parser_t, 1);
ret->fn = parse_range; ret->env = (void*)r;
return (const parser_t*)ret;
}
const parser_t* whitespace(const parser_t* p) { return NULL; }
//const parser_t* action(const parser_t* p, /* fptr to action on AST */) { return NULL; }
const parser_t* left_factor_action(const parser_t* p) { return NULL; }
static parse_result_t* parse_negate(void *env, parse_state_t *state) {
parser_t *p = (parser_t*)env;
parse_result_t *result = do_parse(p, state);
if (NULL == result) {
uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
static parse_result_t* parse_charset(void *env, parse_state_t *state) {
uint8_t in = read_bits(&state->input_stream, 8, false);
charset cs = (charset)env;
if (charset_isset(cs, in)) {
parsed_token_t *tok = g_new(parsed_token_t, 1);
tok->token_type = TT_UINT; tok->uint = r;
tok->token_type = TT_UINT; tok->uint = in;
return make_result(tok);
} else {
} else
return NULL;
}
}
const parser_t* negate(const parser_t* p) {
assert(parse_ch == p->fn || parse_range == p->fn);
const parser_t* range(const uint8_t lower, const uint8_t upper) {
parser_t *ret = g_new(parser_t, 1);
ret->fn = parse_negate; ret->env = (void*)p;
charset cs = new_charset();
for (int i = 0; i < 256; i++)
charset_set(cs, i, (lower <= i) && (i <= upper));
ret->fn = parse_charset; ret->env = (void*)cs;
return (const parser_t*)ret;
}
const parser_t* notin(const uint8_t *options, int count) {
parser_t *ret = g_new(parser_t, 1);
charset cs = new_charset();
for (int i = 0; i < 256; i++)
charset_set(cs, i, 1);
for (int i = 0; i < count; i++)
charset_set(cs, i, 0);
ret->fn = parse_charset; ret->env = (void*)cs;
return (const parser_t*)ret;
}
@ -232,7 +231,10 @@ const parser_t* sequence(const parser_t* p_array[]) {
static parse_result_t* parse_choice(void *env, parse_state_t *state) {
sequence_t *s = (sequence_t*)env;
input_stream_t backup = state->input_stream;
for (size_t i=0; i<s->len; ++i) {
if (i != 0)
state->input_stream = backup;
parse_result_t *tmp = do_parse(s->p_array[i], state);
if (NULL != tmp)
return tmp;

View file

@ -1,5 +1,6 @@
#ifndef HAMMER_INTERNAL__H
#define HAMMER_INTERNAL__H
#include <glib.h>
#include "hammer.h"
#define false 0
@ -10,6 +11,24 @@ typedef struct parser_cache_key {
const parser_t *parser;
} parser_cache_key_t;
typedef unsigned int *charset;
static inline charset new_charset() {
charset cs = g_new0(unsigned int, 256 / sizeof(unsigned int));
return cs;
}
static inline int charset_isset(charset cs, uint8_t pos) {
return !!(cs[pos / sizeof(*cs)] & (1 << (pos % sizeof(*cs))));
}
static inline void charset_set(charset cs, uint8_t pos, int val) {
cs[pos / sizeof(*cs)] =
val
? cs[pos / sizeof(*cs)] | (1 << (pos % sizeof(*cs)))
: cs[pos / sizeof(*cs)] & ~(1 << (pos % sizeof(*cs)));
}
// TODO(thequux): Set symbol visibility for these functions so that they aren't exported.
long long read_bits(input_stream_t* state, int count, char signed_p);