diff --git a/.gitignore b/.gitignore index db2ee3a..7f4d7d9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ src/test_suite lib/hush examples/dns examples/base64 +examples/base64_sem1 +examples/base64_sem2 TAGS *.swp *.swo diff --git a/examples/Makefile b/examples/Makefile index 786af44..663a214 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -2,7 +2,11 @@ OUTPUTS := dns.o \ dns \ base64.o \ - base64 + base64 \ + base64_sem1.o \ + base64_sem1 \ + base64_sem2.o \ + base64_sem2 TOPLEVEL := ../ @@ -12,19 +16,26 @@ LDFLAGS += $(pkg-config --libs glib-2.0) -all: dns base64 +all: dns base64 base64_sem1 base64_sem2 dns: LDFLAGS:=-L../src -lhammer $(LDFLAGS) -dns: dns.o rr.o dns_common.o glue.o +dns: dns.o rr.o dns_common.o $(call hush, "Linking $@") $(CC) -o $@ $^ $(LDFLAGS) -dns.o: ../src/hammer.h dns_common.h -rr.o: ../src/hammer.h rr.h dns_common.h -dns_common.o: ../src/hammer.h dns_common.h -glue.o: ../src/hammer.h glue.h +dns.o: ../src/hammer.h dns_common.h ../src/glue.h +rr.o: ../src/hammer.h rr.h dns_common.h ../src/glue.h +dns_common.o: ../src/hammer.h dns_common.h ../src/glue.h base64: LDFLAGS:=-L../src -lhammer $(LDFLAGS) base64: base64.o $(call hush, "Linking $@") $(CC) -o $@ $^ $(LDFLAGS) -base64.o: ../src/hammer.h +base64_sem1: LDFLAGS:=-L../src -lhammer $(LDFLAGS) +base64_sem1: base64_sem1.o + $(call hush, "Linking $@") $(CC) -o $@ $^ $(LDFLAGS) + +base64_sem2: LDFLAGS:=-L../src -lhammer $(LDFLAGS) +base64_sem2: base64_sem2.o + $(call hush, "Linking $@") $(CC) -o $@ $^ $(LDFLAGS) + +base64.o base64_sem1.o base64_sem2.o: ../src/hammer.h ../src/glue.h diff --git a/examples/base64.c b/examples/base64.c index 6c4db9e..ee142e3 100644 --- a/examples/base64.c +++ b/examples/base64.c @@ -1,3 +1,13 @@ +// Example parser: Base64, syntax only. +// +// Demonstrates how to construct a Hammer parser that recognizes valid Base64 +// sequences. +// +// Note that no semantic evaluation of the sequence is performed, i.e.
the +// byte sequence being represented is neither returned nor determined. See +// base64_sem1.c and base64_sem2.c for examples of how to attach appropriate +// semantic actions to the grammar. + #include "../src/hammer.h" const HParser* document = NULL; @@ -24,7 +34,7 @@ void init_parser(void) base64_1, NULL)), NULL); - document = base64; + document = h_sequence(h_whitespace(base64), h_whitespace(h_end_p()), NULL); } diff --git a/examples/base64_sem1.c b/examples/base64_sem1.c new file mode 100644 index 0000000..f2a3e82 --- /dev/null +++ b/examples/base64_sem1.c @@ -0,0 +1,172 @@ +// Example parser: Base64, with fine-grained semantic actions +// +// Demonstrates how to attach semantic actions to grammar rules and piece by +// piece transform the parse tree into the desired semantic representation, +// in this case a sequence of 8-bit values. +// +// Note how the grammar is defined by using the macros H_RULE and H_ARULE. +// Those rules using H_ARULE get an attached action which must be declared (as +// a function of type HAction) with a standard name based on the rule name. +// +// This variant of the example uses fine-grained semantic actions that +// transform the parse tree in small steps in a bottom-up fashion. Compare +// base64_sem2.c for an alternative approach using a single top-level action. + +#include "../src/hammer.h" +#include "../src/glue.h" +#include <assert.h> + + +/// +// Semantic actions for the grammar below, each corresponds to an "ARULE". +// They must be named act_<rulename>.
+/// + +const HParsedToken *act_bsfdig(const HParseResult *p) +{ + HParsedToken *res = H_MAKE_UINT(0); + + uint8_t c = H_CAST_UINT(p->ast); + + if(c >= 0x41 && c <= 0x5A) // A-Z + res->uint = c - 0x41; + else if(c >= 0x61 && c <= 0x7A) // a-z + res->uint = c - 0x61 + 26; + else if(c >= 0x30 && c <= 0x39) // 0-9 + res->uint = c - 0x30 + 52; + else if(c == '+') + res->uint = 62; + else if(c == '/') + res->uint = 63; + + return res; +} + +H_ACT_APPLY(act_index0, h_act_index, 0); + +#define act_bsfdig_4bit act_bsfdig +#define act_bsfdig_2bit act_bsfdig + +#define act_equals h_act_ignore +#define act_ws h_act_ignore + +#define act_document act_index0 + +// General-form action: turn a block of n+1 base64 digits into n bytes. +const HParsedToken *act_base64_n(int n, const HParseResult *p) +{ + HParsedToken *res = H_MAKE_SEQN(n); + + HParsedToken **digits = h_seq_elements(p->ast); + + uint32_t x = 0; + int bits = 0; + for(int i=0; i<n+1; i++) { + x <<= 6; x |= digits[i]->uint; + bits += 6; + } + x >>= bits%8; // align, i.e. cut off extra bits + + for(int i=0; i<n; i++) { + HParsedToken *item = H_MAKE_UINT(x & 0xFF); + + res->seq->elements[n-1-i] = item; // output the last byte and + x >>= 8; // discard it + } + res->seq->used = n; + + return res; +} + +H_ACT_APPLY(act_base64_3, act_base64_n, 3); +H_ACT_APPLY(act_base64_2, act_base64_n, 2); +H_ACT_APPLY(act_base64_1, act_base64_n, 1); + +const HParsedToken *act_base64(const HParseResult *p) +{ + assert(p->ast->token_type == TT_SEQUENCE); + assert(p->ast->seq->used == 2); + assert(p->ast->seq->elements[0]->token_type == TT_SEQUENCE); + + HParsedToken *res = H_MAKE_SEQ(); + + // concatenate base64_3 blocks + HCountedArray *seq = H_FIELD_SEQ(0); + for(size_t i=0; i<seq->used; i++) + h_seq_append(res, seq->elements[i]); + + // append one trailing base64_2 or _1 block (absent if the optional did not match) + const HParsedToken *tok = h_seq_index(p->ast, 1); + if(tok->token_type == TT_SEQUENCE) + h_seq_append(res, tok); + + return res; +} + + +/// +// Set up the parser with the grammar to be recognized.
+/// + +const HParser *init_parser(void) +{ + // CORE + H_RULE (digit, h_ch_range(0x30, 0x39)); + H_RULE (alpha, h_choice(h_ch_range(0x41, 0x5a), h_ch_range(0x61, 0x7a), NULL)); + H_RULE (space, h_in((uint8_t *)" \t\n\r\f\v", 6)); + + // AUX. + H_RULE (plus, h_ch('+')); + H_RULE (slash, h_ch('/')); + H_ARULE(equals, h_ch('=')); + + H_ARULE(bsfdig, h_choice(alpha, digit, plus, slash, NULL)); + H_ARULE(bsfdig_4bit, h_in((uint8_t *)"AEIMQUYcgkosw048", 16)); // digits whose 6-bit value is a multiple of 4 + H_ARULE(bsfdig_2bit, h_in((uint8_t *)"AQgw", 4)); // digits whose 6-bit value is a multiple of 16 + H_ARULE(base64_3, h_repeat_n(bsfdig, 4)); + H_ARULE(base64_2, h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, NULL)); + H_ARULE(base64_1, h_sequence(bsfdig, bsfdig_2bit, equals, equals, NULL)); + H_ARULE(base64, h_sequence(h_many(base64_3), + h_optional(h_choice(base64_2, + base64_1, NULL)), + NULL)); + + H_ARULE(ws, h_many(space)); + H_ARULE(document, h_sequence(ws, base64, ws, h_end_p(), NULL)); + + // BUG: sometimes inputs that should parse just don't. + // It *seemed* to happen mostly with things like "bbbbaaaaBA==". + // Using fewer actions seemed to make it less likely. + + return document; +} + + +/// +// Main routine: print input, parse, print result, return success/failure.
+/// + +#include <stdio.h> + +int main(int argc, char **argv) +{ + uint8_t input[102400]; + size_t inputsize; + const HParser *parser; + const HParseResult *result; + + parser = init_parser(); + + inputsize = fread(input, 1, sizeof(input), stdin); + fprintf(stderr, "inputsize=%zu\ninput=", inputsize); + fwrite(input, 1, inputsize, stderr); + result = h_parse(parser, input, inputsize); + + if(result) { + fprintf(stderr, "parsed=%lld bytes\n", result->bit_length/8); + h_pprint(stdout, result->ast, 0, 0); + return 0; + } else { + return 1; + } +} diff --git a/examples/base64_sem2.c b/examples/base64_sem2.c new file mode 100644 index 0000000..32afe5b --- /dev/null +++ b/examples/base64_sem2.c @@ -0,0 +1,176 @@ +// Example parser: Base64, with coarse-grained semantic actions +// +// Demonstrates how to attach semantic actions to a grammar and transform the +// parse tree into the desired semantic representation, in this case a sequence +// of 8-bit values. +// +// Note how the grammar is defined by using the macros H_RULE and H_ARULE. +// Those rules using H_ARULE get an attached action which must be declared (as +// a function of type HAction) with a standard name based on the rule name. +// +// This variant of the example uses coarse-grained semantic actions, +// transforming the entire parse tree in one big step. Compare base64_sem1.c +// for an alternative approach using a fine-grained piece-by-piece +// transformation. + +#include "../src/hammer.h" +#include "../src/glue.h" +#include <assert.h> + + +/// +// Semantic actions for the grammar below, each corresponds to an "ARULE". +// They must be named act_<rulename>.
+/// + +// helper: return the numeric value (0..63) of a parsed base64 digit; 0 for anything else +uint8_t bsfdig_value(const HParsedToken *p) +{ + uint8_t value = 0; + + if(p && p->token_type == TT_UINT) { + uint8_t c = p->uint; + if(c >= 0x41 && c <= 0x5A) // A-Z + value = c - 0x41; + else if(c >= 0x61 && c <= 0x7A) // a-z + value = c - 0x61 + 26; + else if(c >= 0x30 && c <= 0x39) // 0-9 + value = c - 0x30 + 52; + else if(c == '+') + value = 62; + else if(c == '/') + value = 63; + } + + return value; +} + +// helper: append a byte value to a sequence +#define seq_append_byte(res, b) h_seq_snoc(res, H_MAKE_UINT(b)) + +const HParsedToken *act_base64(const HParseResult *p) +{ + assert(p->ast->token_type == TT_SEQUENCE); + assert(p->ast->seq->used == 2); + assert(p->ast->seq->elements[0]->token_type == TT_SEQUENCE); + + // grab b64_3 block sequence + // grab and analyze b64 end block (_2 or _1) + const HParsedToken *b64_3 = p->ast->seq->elements[0]; + const HParsedToken *b64_2 = p->ast->seq->elements[1]; + const HParsedToken *b64_1 = p->ast->seq->elements[1]; + + if(b64_2->token_type != TT_SEQUENCE) + b64_1 = b64_2 = NULL; + else if(b64_2->seq->elements[2]->uint == '=') // third element is '=' only in a base64_1 block + b64_2 = NULL; + else + b64_1 = NULL; + + // allocate result sequence + HParsedToken *res = H_MAKE_SEQ(); + + // concatenate base64_3 blocks + for(size_t i=0; i<b64_3->seq->used; i++) { + assert(b64_3->seq->elements[i]->token_type == TT_SEQUENCE); + HParsedToken **digits = b64_3->seq->elements[i]->seq->elements; + + uint32_t x = bsfdig_value(digits[0]); + x <<= 6; x |= bsfdig_value(digits[1]); + x <<= 6; x |= bsfdig_value(digits[2]); + x <<= 6; x |= bsfdig_value(digits[3]); + seq_append_byte(res, (x >> 16) & 0xFF); + seq_append_byte(res, (x >> 8) & 0xFF); + seq_append_byte(res, x & 0xFF); + } + + // append one trailing base64_2 or _1 block + if(b64_2) { + HParsedToken **digits = b64_2->seq->elements; + uint32_t x = bsfdig_value(digits[0]); + x <<= 6; x |= bsfdig_value(digits[1]); + x <<= 6; x |= bsfdig_value(digits[2]); +
 seq_append_byte(res, (x >> 10) & 0xFF); + seq_append_byte(res, (x >> 2) & 0xFF); + } else if(b64_1) { + HParsedToken **digits = b64_1->seq->elements; + uint32_t x = bsfdig_value(digits[0]); + x <<= 6; x |= bsfdig_value(digits[1]); + seq_append_byte(res, (x >> 4) & 0xFF); + } + + return res; +} + +H_ACT_APPLY(act_index0, h_act_index, 0); + +#define act_ws h_act_ignore +#define act_document act_index0 + + +/// +// Set up the parser with the grammar to be recognized. +/// + +const HParser *init_parser(void) +{ + // CORE + H_RULE (digit, h_ch_range(0x30, 0x39)); + H_RULE (alpha, h_choice(h_ch_range(0x41, 0x5a), h_ch_range(0x61, 0x7a), NULL)); + H_RULE (space, h_in((uint8_t *)" \t\n\r\f\v", 6)); + + // AUX. + H_RULE (plus, h_ch('+')); + H_RULE (slash, h_ch('/')); + H_RULE (equals, h_ch('=')); + + H_RULE (bsfdig, h_choice(alpha, digit, plus, slash, NULL)); + H_RULE (bsfdig_4bit, h_in((uint8_t *)"AEIMQUYcgkosw048", 16)); + H_RULE (bsfdig_2bit, h_in((uint8_t *)"AQgw", 4)); + H_RULE (base64_3, h_repeat_n(bsfdig, 4)); + H_RULE (base64_2, h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, NULL)); + H_RULE (base64_1, h_sequence(bsfdig, bsfdig_2bit, equals, equals, NULL)); + H_ARULE(base64, h_sequence(h_many(base64_3), + h_optional(h_choice(base64_2, + base64_1, NULL)), + NULL)); + + H_ARULE(ws, h_many(space)); + H_ARULE(document, h_sequence(ws, base64, ws, h_end_p(), NULL)); + + // BUG: sometimes inputs that should parse just don't. + // It *seemed* to happen mostly with things like "bbbbaaaaBA==". + // Using fewer actions seemed to make it less likely. + + return document; +} + + +/// +// Main routine: print input, parse, print result, return success/failure.
+/// + +#include <stdio.h> + +int main(int argc, char **argv) +{ + uint8_t input[102400]; + size_t inputsize; + const HParser *parser; + const HParseResult *result; + + parser = init_parser(); + + inputsize = fread(input, 1, sizeof(input), stdin); + fprintf(stderr, "inputsize=%zu\ninput=", inputsize); + fwrite(input, 1, inputsize, stderr); + result = h_parse(parser, input, inputsize); + + if(result) { + fprintf(stderr, "parsed=%lld bytes\n", result->bit_length/8); + h_pprint(stdout, result->ast, 0, 0); + return 0; + } else { + return 1; + } +} diff --git a/examples/dns_common.h b/examples/dns_common.h index 6b04519..c1d8d7e 100644 --- a/examples/dns_common.h +++ b/examples/dns_common.h @@ -2,7 +2,7 @@ #define HAMMER_DNS_COMMON__H #include "../src/hammer.h" -#include "glue.h" +#include "../src/glue.h" const HParser* init_domain(); const HParser* init_character_string(); diff --git a/src/Makefile b/src/Makefile index 47e136d..ead0516 100644 --- a/src/Makefile +++ b/src/Makefile @@ -38,6 +38,7 @@ HAMMER_PARTS := \ system_allocator.o \ benchmark.o \ compile.o \ + glue.o \ $(PARSERS:%=parsers/%.o) \ $(BACKENDS:%=backends/%.o) @@ -67,6 +68,7 @@ libhammer.a: $(HAMMER_PARTS) bitreader.o: test_suite.h hammer.o: hammer.h +glue.o: hammer.h glue.h all: libhammer.a diff --git a/src/backends/packrat.c b/src/backends/packrat.c index d05129d..cc2a9db 100644 --- a/src/backends/packrat.c +++ b/src/backends/packrat.c @@ -77,14 +77,18 @@ HParserCacheValue* recall(HParserCacheKey *k, HParseState *state) { void setupLR(const HParser *p, HParseState *state, HLeftRec *rec_detect) { if (!rec_detect->head) { HRecursionHead *some = a_new(HRecursionHead, 1); - some->head_parser = p; some->involved_set = NULL; some->eval_set = NULL; + some->head_parser = p; + some->involved_set = h_slist_new(state->arena); + some->eval_set = NULL; rec_detect->head = some; } assert(state->lr_stack->head != NULL); - HLeftRec *lr = state->lr_stack->head->elem; - while (lr && lr->rule != p) { + HSlistNode *head = 
state->lr_stack->head; + HLeftRec *lr; + while (head && (lr = head->elem)->rule != p) { lr->head = rec_detect->head; h_slist_push(lr->head->involved_set, (void*)lr->rule); + head = head->next; } } @@ -101,7 +105,7 @@ HParseResult* grow(HParserCacheKey *k, HParseState *state, HRecursionHead *head) HParseResult *old_res = old_cached->right->result; // reset the eval_set of the head of the recursion at each beginning of growth - head->eval_set = head->involved_set; + head->eval_set = h_slist_copy(head->involved_set); HParseResult *tmp_res = perform_lowlevel_parse(state, k->parser); if (tmp_res) { diff --git a/src/datastructures.c b/src/datastructures.c index b1e4f75..3d94804 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -41,6 +41,26 @@ HSlist* h_slist_new(HArena *arena) { return ret; } +HSlist* h_slist_copy(HSlist *slist) { + HSlist *ret = h_slist_new(slist->arena); + HSlistNode *head = slist->head; + HSlistNode *tail = NULL; + if (head != NULL) { + h_slist_push(ret, head->elem); + tail = ret->head; + head = head->next; + } + while (head != NULL) { + // append head item after tail in a new node, so the copy keeps the source order + HSlistNode *node = h_arena_malloc(slist->arena, sizeof(HSlistNode)); + node->elem = head->elem; + node->next = NULL; + tail = tail->next = node; + head = head->next; + } + return ret; +} + void* h_slist_pop(HSlist *slist) { HSlistNode *head = slist->head; if (!head) diff --git a/examples/glue.c b/src/glue.c similarity index 94% rename from examples/glue.c rename to src/glue.c index 7f9c6fa..f1e086a 100644 --- a/examples/glue.c +++ b/src/glue.c @@ -55,6 +55,13 @@ HParsedToken *h_make_seq(HArena *arena) return ret; } +HParsedToken *h_make_seqn(HArena *arena, size_t n) +{ + HParsedToken *ret = h_make_(arena, TT_SEQUENCE); + ret->seq = h_carray_new_sized(arena, n); + return ret; +} + HParsedToken *h_make_bytes(HArena *arena, size_t len) { HParsedToken *ret = h_make_(arena, TT_BYTES); @@ -142,7 +149,7 @@ void h_seq_append(HParsedToken *xs, const HParsedToken *ys) assert(ys 
!= NULL); assert(ys->token_type == TT_SEQUENCE); - for(size_t i; i<ys->seq->used; i++) + for(size_t i=0; i<ys->seq->used; i++) h_carray_append(xs->seq, ys->seq->elements[i]); } diff --git a/examples/glue.h b/src/glue.h similarity index 98% rename from examples/glue.h rename to src/glue.h index ccb488e..3125ae0 100644 --- a/examples/glue.h +++ b/src/glue.h @@ -21,11 +21,11 @@ // See the leading comment blocks on the sections below for more details. // -#ifndef HAMMER_EXAMPLES_GLUE__H -#define HAMMER_EXAMPLES_GLUE__H +#ifndef HAMMER_GLUE__H +#define HAMMER_GLUE__H #include -#include "../src/hammer.h" +#include "hammer.h" // @@ -173,6 +173,7 @@ const HParsedToken *h_act_flatten(const HParseResult *p); HParsedToken *h_make(HArena *arena, HTokenType type, void *value); HParsedToken *h_make_seq(HArena *arena); // Makes empty sequence. +HParsedToken *h_make_seqn(HArena *arena, size_t n); // Makes empty sequence of expected size n. HParsedToken *h_make_bytes(HArena *arena, size_t len); HParsedToken *h_make_sint(HArena *arena, int64_t val); HParsedToken *h_make_uint(HArena *arena, uint64_t val); @@ -180,6 +181,7 @@ HParsedToken *h_make_uint(HArena *arena, uint64_t val); // Standard short-hands to make tokens in an action.
#define H_MAKE(TYP, VAL) h_make(p->arena, TT_ ## TYP, VAL) #define H_MAKE_SEQ() h_make_seq(p->arena) +#define H_MAKE_SEQN(N) h_make_seqn(p->arena, N) // empty sequence, expected size N #define H_MAKE_BYTES(LEN) h_make_bytes(p->arena, LEN) #define H_MAKE_SINT(VAL) h_make_sint(p->arena, VAL) #define H_MAKE_UINT(VAL) h_make_uint(p->arena, VAL) diff --git a/src/internal.h b/src/internal.h index 67ecb22..0dcf857 100644 --- a/src/internal.h +++ b/src/internal.h @@ -209,6 +209,7 @@ HCountedArray *h_carray_new(HArena * arena); void h_carray_append(HCountedArray *array, void* item); HSlist* h_slist_new(HArena *arena); +HSlist* h_slist_copy(HSlist *slist); // new list in the same arena holding the same element pointers void* h_slist_pop(HSlist *slist); void h_slist_push(HSlist *slist, void* item); bool h_slist_find(HSlist *slist, const void* item); diff --git a/src/t_parser.c b/src/t_parser.c index b1f9b63..daca1a3 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -365,6 +365,17 @@ static void test_not(void) { g_check_parse_ok(not_2, "a++b", 4, "(u0x61 <2b.2b> u0x62)"); } +static void test_leftrec(void) { + const HParser *a_ = h_ch('a'); + + HParser *lr_ = h_indirect(); + h_bind_indirect(lr_, h_choice(h_sequence(lr_, a_, NULL), a_, NULL)); // lr_ ::= lr_ 'a' | 'a' (left-recursive) + + g_check_parse_ok(lr_, "a", 1, "u0x61"); + g_check_parse_ok(lr_, "aa", 2, "(u0x61 u0x61)"); + g_check_parse_ok(lr_, "aaa", 3, "((u0x61 u0x61) u0x61)"); // result nests to the left +} + void register_parser_tests(void) { g_test_add_func("/core/parser/token", test_token); g_test_add_func("/core/parser/ch", test_ch); @@ -406,4 +417,5 @@ void register_parser_tests(void) { g_test_add_func("/core/parser/and", test_and); g_test_add_func("/core/parser/not", test_not); g_test_add_func("/core/parser/ignore", test_ignore); + g_test_add_func("/core/parser/leftrec", test_leftrec); }