Sped up charset parsing; fixed choice operator
This commit is contained in:
parent
a076c4d12c
commit
2af69dd8f9
5 changed files with 81 additions and 39 deletions
3
Makefile
3
Makefile
|
|
@ -11,6 +11,9 @@ SUBDIRS = src \
|
||||||
%:
|
%:
|
||||||
+for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done
|
+for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done
|
||||||
|
|
||||||
|
test: src/test_suite
|
||||||
|
$<
|
||||||
|
|
||||||
define SUBDIR_TEMPLATE
|
define SUBDIR_TEMPLATE
|
||||||
$(1)/%:
|
$(1)/%:
|
||||||
$$(MAKE) -C $(1) $$*
|
$$(MAKE) -C $(1) $$*
|
||||||
|
|
|
||||||
14
NOTES
14
NOTES
|
|
@ -4,3 +4,17 @@ NOTES
|
||||||
Regarding parse_result_t:
|
Regarding parse_result_t:
|
||||||
If a parse fails, the parse_result_t will be NULL.
|
If a parse fails, the parse_result_t will be NULL.
|
||||||
If a parse is successful but there's nothing there (i.e., if end_p succeeds), then there's a parse_result_t but its ast is NULL.
|
If a parse is successful but there's nothing there (i.e., if end_p succeeds), then there's a parse_result_t but its ast is NULL.
|
||||||
|
|
||||||
|
Regarding input location:
|
||||||
|
If parse is successful, input is left at beginning of next thing to be read.
|
||||||
|
If parse fails, location is UNPREDICTABLE.
|
||||||
|
|
||||||
|
|
||||||
|
If CONSISTENCY_CHECK is defined, enable a bunch of additional internal
|
||||||
|
consistency checks.
|
||||||
|
|
||||||
|
TODO: Add consistency check to the bitreader
|
||||||
|
|
||||||
|
We should support the use of parse-table-based parse methods; add a
|
||||||
|
parse_compile method that must be called before the newly-created
|
||||||
|
parser is used.
|
||||||
|
|
|
||||||
|
|
@ -8,10 +8,14 @@
|
||||||
#define MSB(range) (1:range)
|
#define MSB(range) (1:range)
|
||||||
#define LDB(range,i) (((i)>>LSB(range))&((1<<(MSB(range)-LSB(range)+1))-1))
|
#define LDB(range,i) (((i)>>LSB(range))&((1<<(MSB(range)-LSB(range)+1))-1))
|
||||||
|
|
||||||
|
|
||||||
long long read_bits(input_stream_t* state, int count, char signed_p) {
|
long long read_bits(input_stream_t* state, int count, char signed_p) {
|
||||||
|
// BUG: Does not
|
||||||
long long out = 0;
|
long long out = 0;
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
long long msb = (!!signed_p) << (count - 1); // 0 if unsigned, else 1 << (nbits - 1)
|
long long msb = (!!signed_p) << (count - 1); // 0 if unsigned, else 1 << (nbits - 1)
|
||||||
|
// BUG: does not stop early in case of
|
||||||
|
|
||||||
if ((state->bit_offset & 0x7) == 0 && (count & 0x7) == 0) {
|
if ((state->bit_offset & 0x7) == 0 && (count & 0x7) == 0) {
|
||||||
// fast path
|
// fast path
|
||||||
if (state->endianness & BYTE_BIG_ENDIAN) {
|
if (state->endianness & BYTE_BIG_ENDIAN) {
|
||||||
|
|
|
||||||
74
src/hammer.c
74
src/hammer.c
|
|
@ -19,14 +19,14 @@
|
||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
/* TODO(thequux): rewrite to follow new parse_state_t layout
|
|
||||||
parse_state_t* from(parse_state_t *ps, const size_t index) {
|
parse_state_t* from(parse_state_t *ps, const size_t index) {
|
||||||
parse_state_t p = { ps->input, ps->index + index, ps->length - index, ps->cache };
|
|
||||||
parse_state_t *ret = g_new(parse_state_t, 1);
|
parse_state_t *ret = g_new(parse_state_t, 1);
|
||||||
*ret = p;
|
*ret = *ps;
|
||||||
|
ret->input_stream.index += index;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
const uint8_t* substring(const parse_state_t *ps, const size_t start, const size_t end) {
|
const uint8_t* substring(const parse_state_t *ps, const size_t start, const size_t end) {
|
||||||
if (end > start && (ps->input_stream.index + end) < ps->input_stream.length) {
|
if (end > start && (ps->input_stream.index + end) < ps->input_stream.length) {
|
||||||
gpointer ret = g_malloc(end - start);
|
gpointer ret = g_malloc(end - start);
|
||||||
|
|
@ -48,8 +48,7 @@ const gchar* to_string(parse_state_t *ps) {
|
||||||
return g_strescape((const gchar*)(ps->input_stream.input), NULL);
|
return g_strescape((const gchar*)(ps->input_stream.input), NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
guint djbhash(const
|
guint djbhash(const uint8_t *buf, size_t len) {
|
||||||
uint8_t *buf, size_t len) {
|
|
||||||
guint hash = 5381;
|
guint hash = 5381;
|
||||||
while (len--) {
|
while (len--) {
|
||||||
hash = hash * 33 + *buf++;
|
hash = hash * 33 + *buf++;
|
||||||
|
|
@ -75,6 +74,12 @@ parse_result_t* do_parse(const parser_t* parser, parse_state_t *state) {
|
||||||
res = parser->fn(parser->env, state);
|
res = parser->fn(parser->env, state);
|
||||||
// update the cache
|
// update the cache
|
||||||
g_hash_table_replace(state->cache, &key, res);
|
g_hash_table_replace(state->cache, &key, res);
|
||||||
|
#ifdef CONSISTENCY_CHECK
|
||||||
|
if (!res) {
|
||||||
|
state->input_stream = INVALID;
|
||||||
|
state->input_stream.input = key.input_pos.input;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -135,47 +140,41 @@ typedef struct {
|
||||||
uint8_t upper;
|
uint8_t upper;
|
||||||
} range_t;
|
} range_t;
|
||||||
|
|
||||||
static parse_result_t* parse_range(void* env, parse_state_t *state) {
|
|
||||||
range_t *range = (range_t*)env;
|
|
||||||
uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
|
|
||||||
if (range->lower <= r && range->upper >= r) {
|
|
||||||
parsed_token_t *tok = g_new(parsed_token_t, 1);
|
|
||||||
tok->token_type = TT_UINT; tok->uint = r;
|
|
||||||
return make_result(tok);
|
|
||||||
} else {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const parser_t* range(const uint8_t lower, const uint8_t upper) {
|
|
||||||
range_t *r = g_new(range_t, 1);
|
|
||||||
r->lower = lower; r->upper = upper;
|
|
||||||
parser_t *ret = g_new(parser_t, 1);
|
|
||||||
ret->fn = parse_range; ret->env = (void*)r;
|
|
||||||
return (const parser_t*)ret;
|
|
||||||
}
|
|
||||||
const parser_t* whitespace(const parser_t* p) { return NULL; }
|
const parser_t* whitespace(const parser_t* p) { return NULL; }
|
||||||
//const parser_t* action(const parser_t* p, /* fptr to action on AST */) { return NULL; }
|
//const parser_t* action(const parser_t* p, /* fptr to action on AST */) { return NULL; }
|
||||||
|
|
||||||
const parser_t* left_factor_action(const parser_t* p) { return NULL; }
|
const parser_t* left_factor_action(const parser_t* p) { return NULL; }
|
||||||
|
|
||||||
static parse_result_t* parse_negate(void *env, parse_state_t *state) {
|
static parse_result_t* parse_charset(void *env, parse_state_t *state) {
|
||||||
parser_t *p = (parser_t*)env;
|
uint8_t in = read_bits(&state->input_stream, 8, false);
|
||||||
parse_result_t *result = do_parse(p, state);
|
charset cs = (charset)env;
|
||||||
if (NULL == result) {
|
|
||||||
uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
|
if (charset_isset(cs, in)) {
|
||||||
parsed_token_t *tok = g_new(parsed_token_t, 1);
|
parsed_token_t *tok = g_new(parsed_token_t, 1);
|
||||||
tok->token_type = TT_UINT; tok->uint = r;
|
tok->token_type = TT_UINT; tok->uint = in;
|
||||||
return make_result(tok);
|
return make_result(tok);
|
||||||
} else {
|
} else
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const parser_t* negate(const parser_t* p) {
|
const parser_t* range(const uint8_t lower, const uint8_t upper) {
|
||||||
assert(parse_ch == p->fn || parse_range == p->fn);
|
|
||||||
parser_t *ret = g_new(parser_t, 1);
|
parser_t *ret = g_new(parser_t, 1);
|
||||||
ret->fn = parse_negate; ret->env = (void*)p;
|
charset cs = new_charset();
|
||||||
|
for (int i = 0; i < 256; i++)
|
||||||
|
charset_set(cs, i, (lower <= i) && (i <= upper));
|
||||||
|
ret->fn = parse_charset; ret->env = (void*)cs;
|
||||||
|
return (const parser_t*)ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
const parser_t* notin(const uint8_t *options, int count) {
|
||||||
|
parser_t *ret = g_new(parser_t, 1);
|
||||||
|
charset cs = new_charset();
|
||||||
|
for (int i = 0; i < 256; i++)
|
||||||
|
charset_set(cs, i, 1);
|
||||||
|
for (int i = 0; i < count; i++)
|
||||||
|
charset_set(cs, i, 0);
|
||||||
|
|
||||||
|
ret->fn = parse_charset; ret->env = (void*)cs;
|
||||||
return (const parser_t*)ret;
|
return (const parser_t*)ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -232,7 +231,10 @@ const parser_t* sequence(const parser_t* p_array[]) {
|
||||||
|
|
||||||
static parse_result_t* parse_choice(void *env, parse_state_t *state) {
|
static parse_result_t* parse_choice(void *env, parse_state_t *state) {
|
||||||
sequence_t *s = (sequence_t*)env;
|
sequence_t *s = (sequence_t*)env;
|
||||||
|
input_stream_t backup = state->input_stream;
|
||||||
for (size_t i=0; i<s->len; ++i) {
|
for (size_t i=0; i<s->len; ++i) {
|
||||||
|
if (i != 0)
|
||||||
|
state->input_stream = backup;
|
||||||
parse_result_t *tmp = do_parse(s->p_array[i], state);
|
parse_result_t *tmp = do_parse(s->p_array[i], state);
|
||||||
if (NULL != tmp)
|
if (NULL != tmp)
|
||||||
return tmp;
|
return tmp;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
#ifndef HAMMER_INTERNAL__H
|
#ifndef HAMMER_INTERNAL__H
|
||||||
#define HAMMER_INTERNAL__H
|
#define HAMMER_INTERNAL__H
|
||||||
|
#include <glib.h>
|
||||||
#include "hammer.h"
|
#include "hammer.h"
|
||||||
|
|
||||||
#define false 0
|
#define false 0
|
||||||
|
|
@ -10,6 +11,24 @@ typedef struct parser_cache_key {
|
||||||
const parser_t *parser;
|
const parser_t *parser;
|
||||||
} parser_cache_key_t;
|
} parser_cache_key_t;
|
||||||
|
|
||||||
|
typedef unsigned int *charset;
|
||||||
|
|
||||||
|
static inline charset new_charset() {
|
||||||
|
charset cs = g_new0(unsigned int, 256 / sizeof(unsigned int));
|
||||||
|
return cs;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int charset_isset(charset cs, uint8_t pos) {
|
||||||
|
return !!(cs[pos / sizeof(*cs)] & (1 << (pos % sizeof(*cs))));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void charset_set(charset cs, uint8_t pos, int val) {
|
||||||
|
cs[pos / sizeof(*cs)] =
|
||||||
|
val
|
||||||
|
? cs[pos / sizeof(*cs)] | (1 << (pos % sizeof(*cs)))
|
||||||
|
: cs[pos / sizeof(*cs)] & ~(1 << (pos % sizeof(*cs)));
|
||||||
|
}
|
||||||
|
|
||||||
// TODO(thequux): Set symbol visibility for these functions so that they aren't exported.
|
// TODO(thequux): Set symbol visibility for these functions so that they aren't exported.
|
||||||
|
|
||||||
long long read_bits(input_stream_t* state, int count, char signed_p);
|
long long read_bits(input_stream_t* state, int count, char signed_p);
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue