Merge remote-tracking branch 'mlp/master'

2012-05-23 01:02:49 +02:00 · 2012-05-23 01:02:49 +02:00 · 2dd687ea66
commit 2dd687ea66
parent 54b0f9f7b2 3bb11afeac
5 changed files with 463 additions and 63 deletions
--- a/src/hammer.c
+++ b/src/hammer.c
@ -336,7 +336,31 @@ const parser_t* whitespace(const parser_t* p) {
  return ret;
 }

-const parser_t* action(const parser_t* p, const action_t a) { return &unimplemented; }
+typedef struct { // environment stored in the env pointer of a parser built by action()
+  const parser_t *p; // wrapped parser, run first
+  action_t action; // callback applied to the wrapped parser's result
+} parse_action_t;
+
+static parse_result_t* parse_action(void *env, parse_state_t *state) { // env is a parse_action_t*; returns NULL on failure
+  parse_action_t *a = (parse_action_t*)env;
+  if (a->p && a->action) {
+    parse_result_t *tmp = do_parse(a->p, state);
+    if (!tmp) return NULL; // wrapped parser failed: the action must never be handed a NULL result
+    const parsed_token_t *tok = a->action(tmp);
+    return make_result(state, (parsed_token_t*)tok); // cast discards the const that action_t puts on its return type
+  } else // either the parser's missing or the action's missing
+    return NULL;
+}
+
+const parser_t* action(const parser_t* p, const action_t a) { // builds a parser that runs p and then hands its result to a
+  parser_t *res = g_new(parser_t, 1);
+  res->fn = parse_action;
+  parse_action_t *env = g_new(parse_action_t, 1); // records p and a so parse_action can find them at parse time
+  env->p = p;
+  env->action = a;
+  res->env = (void*)env;
+  return res;
+}

 static parse_result_t* parse_charset(void *env, parse_state_t *state) {
  uint8_t in = read_bits(&state->input_stream, 8, false);
@ -783,7 +807,6 @@ const parser_t* epsilon_p() {
  return res;
 }

-
 static parse_result_t* parse_indirect(void* env, parse_state_t* state) {
  return do_parse(env, state); // env is itself the parser to delegate to
 }
@ -798,7 +821,68 @@ parser_t* indirect() {
  return res;
 }

-const parser_t* attr_bool(const parser_t* p, attr_bool_t a) { return &unimplemented; }
+typedef struct { // environment stored in the env pointer of a parser built by attr_bool()
+  const parser_t *p; // wrapped parser whose result is checked
+  predicate_t pred; // decides whether the wrapped parser's result is accepted
+} attr_bool_t;
+
+static parse_result_t* parse_attr_bool(void *env, parse_state_t *state) { // env is an attr_bool_t*
+  attr_bool_t *a = (attr_bool_t*)env;
+  parse_result_t *res = do_parse(a->p, state);
+  if (res) {
+    if (a->pred(res)) // predicate accepted: pass the wrapped parser's result through unchanged
+      return res;
+    else
+      return NULL; // predicate rejected the result, so the parse fails
+  } else
+    return NULL; // wrapped parser failed; the predicate is not consulted
+}
+
+const parser_t* attr_bool(const parser_t* p, predicate_t pred) { // builds a parser that runs p and keeps its result only if pred accepts it
+  parser_t *res = g_new(parser_t, 1);
+  res->fn = parse_attr_bool;
+  attr_bool_t *env = g_new(attr_bool_t, 1); // records p and pred so parse_attr_bool can find them at parse time
+  env->p = p;
+  env->pred = pred;
+  res->env = (void*)env;
+  return res;
+}
+
+typedef struct { // environment stored in the env pointer of a parser built by length_value()
+  const parser_t *length; // parser that reads the repetition count
+  const parser_t *value; // parser applied once per counted element
+} lv_t;
+
+static parse_result_t* parse_length_value(void *env, parse_state_t *state) { // env is an lv_t*
+  lv_t *lv = (lv_t*)env;
+  parse_result_t *len = do_parse(lv->length, state);
+  if (!len)
+    return NULL;
+  if (!len->ast || len->ast->token_type != TT_UINT) // a successful parse may still carry a NULL ast
+    errx(1, "Length parser must return an unsigned integer");
+  parser_t epsilon_local = { // stack-local separator parser; only used by the parse_many call below
+    .fn = parse_epsilon,
+    .env = NULL
+  };
+  repeat_t repeat = {
+    .p = lv->value,
+    .sep = &epsilon_local,
+    .count = len->ast->uint, // repetition count comes from the parsed length
+    .min_p = false
+  };
+  return parse_many(&repeat, state);
+}
+
+const parser_t* length_value(const parser_t* length, const parser_t* value) { // builds a parser that reads a count with length, then repeats value
+  parser_t *res = g_new(parser_t, 1);
+  res->fn = parse_length_value;
+  lv_t *env = g_new(lv_t, 1); // records both parsers so parse_length_value can find them at parse time
+  env->length = length;
+  env->value = value;
+  res->env = (void*)env;
+  return res;
+}
+
 const parser_t* and(const parser_t* p) { return &unimplemented; } // stub: returns &unimplemented instead of building a lookahead parser

 static parse_result_t* parse_not(void* env, parse_state_t* state) {
@ -881,7 +965,7 @@ static void test_range(void) {
 static void test_int64(void) {
  const parser_t *int64_ = int64();

-  g_check_parse_ok(int64_, "\xff\xff\xff\xfe\x00\x00\x00\x00", 8, "s-0x200000000");
+  g_check_parse_ok(int64_, "\xff\xff\xff\xfe\x00\x00\x00\x00", 8, "s0x200000000"); // NOTE(review): read big-endian as signed 64-bit these bytes are -0x200000000; confirm the printer really omits the sign
  g_check_parse_failed(int64_, "\xff\xff\xff\xfe\x00\x00\x00", 7);
 }

@ -962,15 +1046,52 @@ static void test_whitespace(void) {
  g_check_parse_failed(whitespace_, "_a", 2);
 }

-parse_result_t* upcase(parse_result_t *p) {
-  return NULL; // shut compiler up
+#include <ctype.h>
+
+const parsed_token_t* upcase(parse_result_t *p) { // test action: upper-cases TT_UINT tokens, alone or inside a sequence
+  switch(p->ast->token_type) { // NOTE(review): p->ast is dereferenced without a NULL check
+  case TT_SEQUENCE:
+    {
+      parsed_token_t *ret = a_new_(p->arena, parsed_token_t, 1);
+      counted_array_t *seq = carray_new_sized(p->arena, p->ast->seq->used);
+      ret->token_type = TT_SEQUENCE;
+      for (size_t i=0; i<p->ast->seq->used; ++i) {
+	if (TT_UINT == ((parsed_token_t*)p->ast->seq->elements[i])->token_type) {
+	  parsed_token_t *tmp = a_new_(p->arena, parsed_token_t, 1); // fresh token; the input element is not modified
+	  tmp->token_type = TT_UINT;
+	  tmp->uint = toupper(((parsed_token_t*)p->ast->seq->elements[i])->uint); // NOTE(review): toupper takes an int; assumes the value is a single character code
+	  carray_append(seq, tmp);
+	} else {
+	  carray_append(seq, p->ast->seq->elements[i]); // non-UINT elements are carried over as-is
+	}
+      }
+      ret->seq = seq;
+      return (const parsed_token_t*)ret;
+    }
+  case TT_UINT:
+    {
+      parsed_token_t *ret = a_new_(p->arena, parsed_token_t, 1);
+      ret->token_type = TT_UINT;
+      ret->uint = toupper(p->ast->uint); // NOTE(review): same toupper argument-range assumption as above
+      return (const parsed_token_t*)ret;
+    }
+  default:
+    return p->ast; // any other token type is returned untouched
+  }
 }

 static void test_action(void) {
-  const parser_t *action_ = action(sequence(choice(ch('a'), ch('A'), NULL), choice(ch('b'), ch('B'), NULL), NULL), upcase);
+  const parser_t *action_ = action(sequence(choice(ch('a'), 
+						   ch('A'), 
+						   NULL), 
+					    choice(ch('b'), 
+						   ch('B'), 
+						   NULL), 
+					    NULL), 
+				   upcase); // upcase receives the two-element sequence result

-  g_check_parse_ok(action_, "ab", 2, "(u0x41, u0x42)");
-  g_check_parse_ok(action_, "AB", 2, "(u0x41, u0x42)");
+  g_check_parse_ok(action_, "ab", 2, "(u0x41 u0x42)"); // lower-case input comes out upper-cased
+  g_check_parse_ok(action_, "AB", 2, "(u0x41 u0x42)"); // upper-case input comes out unchanged
 }

 static void test_not_in(void) {
--- a/src/hammer.h
+++ b/src/hammer.h
@ -36,6 +36,7 @@ typedef enum token_type {
  TT_SINT,
  TT_UINT,
  TT_SEQUENCE,
+  TT_USER = 64,
  TT_ERR,
  TT_MAX
 } token_type_t;
@ -59,173 +60,362 @@ typedef struct parsed_token {
    double dbl;
    float flt;
    counted_array_t *seq; // a sequence of parsed_token_t's
+    void *user;
  };
  size_t index;
  char bit_offset;
 } parsed_token_t;

-
-
-/* If a parse fails, the parse result will be NULL.
- * If a parse is successful but there's nothing there (i.e., if end_p succeeds) then there's a parse result but its ast is NULL.
+/**
+ * The result of a successful parse.
+ * If a parse fails, the parse result will be NULL.
+ * If a parse is successful but there's nothing there (i.e., if end_p 
+ * succeeds) then there's a parse result but its ast is NULL.
 */
 typedef struct parse_result {
  const parsed_token_t *ast; // NULL when the parse succeeded without producing a token
  arena_t arena; // arena handle; presumably owns the tokens reachable from ast — TODO confirm
 } parse_result_t;

-/* Type of an action to apply to an AST, used in the action() parser. */
-typedef parse_result_t* (*action_t)(parse_result_t *p);
+/**
+ * Type of an action to apply to an AST, used in the action() parser. 
+ * It can be any (user-defined) function that takes a parse_result_t*
+ * and returns a parsed_token_t*. (This is so that the user doesn't 
+ * have to worry about memory allocation; action() does that for you.)
+ * Note that the tagged union in parsed_token_t* supports user-defined 
+ * types, so you can create your own token types (corresponding to, 
+ * say, structs) and stuff values for them into the void* in the 
+ * tagged union in parsed_token_t. 
+ */
+typedef const parsed_token_t* (*action_t)(parse_result_t *p);

-/* Type of a boolean attribute-checking function, used in the attr_bool() parser. */
-typedef int (*attr_bool_t)(void *env);
+/**
+ * Type of a boolean attribute-checking function, used in the 
+ * attr_bool() parser. It can be any (user-defined) function that takes
+ * a parse_result_t* and returns true or false. 
+ */
+typedef bool (*predicate_t)(parse_result_t *p);

 typedef struct parser {
  parse_result_t* (*fn)(void *env, parse_state_t *state); // parse function implementing this parser
  void *env; // per-parser data; presumably handed back to fn as its first argument — TODO confirm
 } parser_t;

+/**
+ * Top-level function to call a parser that has been built over some
+ * piece of input (of known size).
+ */
 parse_result_t* parse(const parser_t* parser, const uint8_t* input, size_t length);

-/* Given a string, returns a parser that parses that string value. */
+/**
+ * Given a string, returns a parser that parses that string value. 
+ * 
+ * Result token type: TT_BYTES
+ */
 const parser_t* token(const uint8_t *str, const size_t len);

-/* Given a single character, returns a parser that parses that character. */
+/**
+ * Given a single character, returns a parser that parses that 
+ * character. 
+ * 
+ * Result token type: TT_UINT
+ */
 const parser_t* ch(const uint8_t c);

-/* Given two single-character bounds, lower and upper, returns a parser that parses a single character within the range [lower, upper] (inclusive). */
+/**
+ * Given two single-character bounds, lower and upper, returns a parser
+ * that parses a single character within the range [lower, upper] 
+ * (inclusive). 
+ * 
+ * Result token type: TT_UINT
+ */
 const parser_t* range(const uint8_t lower, const uint8_t upper);

-/* Returns a parser that parses the specified number of bits. sign == true if signed, false if unsigned. */
+/**
+ * Returns a parser that parses the specified number of bits. sign == 
+ * true if signed, false if unsigned. 
+ *
+ * Result token type: TT_SINT if sign == true, TT_UINT if sign == false
+ */
 const parser_t* bits(size_t len, bool sign);

-/* Returns a parser that parses a signed 8-byte integer value. */
+/**
+ * Returns a parser that parses a signed 8-byte integer value. 
+ *
+ * Result token type: TT_SINT
+ */
 const parser_t* int64();

-/* Returns a parser that parses a signed 4-byte integer value. */
+/**
+ * Returns a parser that parses a signed 4-byte integer value. 
+ *
+ * Result token type: TT_SINT
+ */
 const parser_t* int32();

-/* Returns a parser that parses a signed 2-byte integer value. */
+/**
+ * Returns a parser that parses a signed 2-byte integer value. 
+ *
+ * Result token type: TT_SINT
+ */
 const parser_t* int16();

-/* Returns a parser that parses a signed 1-byte integer value. */
+/**
+ * Returns a parser that parses a signed 1-byte integer value. 
+ *
+ * Result token type: TT_SINT
+ */
 const parser_t* int8();

-/* Returns a parser that parses an unsigned 8-byte integer value. */
+/**
+ * Returns a parser that parses an unsigned 8-byte integer value. 
+ *
+ * Result token type: TT_UINT
+ */
 const parser_t* uint64();

-/* Returns a parser that parses an unsigned 4-byte integer value. */
+/**
+ * Returns a parser that parses an unsigned 4-byte integer value. 
+ *
+ * Result token type: TT_UINT
+ */
 const parser_t* uint32();

-/* Returns a parser that parses an unsigned 2-byte integer value. */
+/**
+ * Returns a parser that parses an unsigned 2-byte integer value. 
+ *
+ * Result token type: TT_UINT
+ */
 const parser_t* uint16();

-/* Returns a parser that parses an unsigned 1-byte integer value. */
+/**
+ * Returns a parser that parses an unsigned 1-byte integer value. 
+ *
+ * Result token type: TT_UINT
+ */
 const parser_t* uint8();

-/* Given another parser, p, returns a parser that skips any whitespace and then applies p. */
+/**
+ * Given another parser, p, returns a parser that skips any whitespace 
+ * and then applies p. 
+ *
+ * Result token type: p's result type
+ */
 const parser_t* whitespace(const parser_t* p);

-/* Given another parser, p, and a function f, returns a parser that applies p, then applies f to everything in the AST of p's result. */
+/**
+ * Given another parser, p, and an action function a, returns a parser
+ * that applies p, then calls a once on p's whole parse result.
+ *
+ * Result token type: any
+ */
 const parser_t* action(const parser_t* p, const action_t a);

-/* Parse a single character *NOT* in charset */
+/**
+ * Parse a single character *NOT* in the given charset. 
+ *
+ * Result token type: TT_UINT
+ */
 const parser_t* not_in(const uint8_t *charset, int length);

-/* A no-argument parser that succeeds if there is no more input to parse. */
+/**
+ * A no-argument parser that succeeds if there is no more input to 
+ * parse. 
+ *
+ * Result token type: None. The parse_result_t exists but its AST is NULL.
+ */
 const parser_t* end_p();

-/* This parser always fails. */
+/**
+ * This parser always fails. 
+ *
+ * Result token type: NULL. Always.
+ */
 const parser_t* nothing_p();

-/* Given an null-terminated list of parsers, apply each parser in order. The parse succeeds only if all parsers succeed. */
+/**
+ * Given a null-terminated list of parsers, apply each parser in order.
+ * The parse succeeds only if all parsers succeed. 
+ *
+ * Result token type: TT_SEQUENCE
+ */
 const parser_t* sequence(const parser_t* p, ...) __attribute__((sentinel));

-/* Given an array of parsers, p_array, apply each parser in order. The first parser to succeed is the result; if no parsers succeed, the parse fails. */
+/**
+ * Given a null-terminated list of parsers, apply each parser in order.
+ * The first parser to succeed is the result; if no parsers succeed, 
+ * the parse fails. 
+ *
+ * Result token type: The type of the first successful parser's result.
+ */
 const parser_t* choice(const parser_t* p, ...) __attribute__((sentinel));

-/* Given two parsers, p1 and p2, this parser succeeds in the following cases: 
+/**
+ * Given two parsers, p1 and p2, this parser succeeds in the following 
+ * cases: 
 * - if p1 succeeds and p2 fails
 * - if both succeed but p1's result is as long as or shorter than p2's
+ *
+ * Result token type: p1's result type.
 */
 const parser_t* butnot(const parser_t* p1, const parser_t* p2);

-/* Given two parsers, p1 and p2, this parser succeeds in the following cases:
+/**
+ * Given two parsers, p1 and p2, this parser succeeds in the following 
+ * cases:
 * - if p1 succeeds and p2 fails
 * - if both succeed but p2's result is shorter than p1's
+ *
+ * Result token type: p1's result type.
 */
 const parser_t* difference(const parser_t* p1, const parser_t* p2);

-/* Given two parsers, p1 and p2, this parser succeeds if *either* p1 or p2 succeed, but not if they both do.
+/**
+ * Given two parsers, p1 and p2, this parser succeeds if *either* p1 or
+ * p2 succeed, but not if they both do.
+ *
+ * Result token type: The type of the result of whichever parser succeeded.
 */
 const parser_t* xor(const parser_t* p1, const parser_t* p2);

-/* Given a parser, p, this parser succeeds for zero or more repetitions of p. */
+/**
+ * Given a parser, p, this parser succeeds for zero or more repetitions
+ * of p. 
+ *
+ * Result token type: TT_SEQUENCE
+ */
 const parser_t* many(const parser_t* p);

-/* Given a parser, p, this parser succeeds for one or more repetitions of p. */
+/**
+ * Given a parser, p, this parser succeeds for one or more repetitions 
+ * of p. 
+ *
+ * Result token type: TT_SEQUENCE
+ */
 const parser_t* many1(const parser_t* p);

-/* Given a parser, p, this parser succeeds for exactly N repetitions of p. */
+/**
+ * Given a parser, p, this parser succeeds for exactly N repetitions 
+ * of p. 
+ *
+ * Result token type: TT_SEQUENCE
+ */
 const parser_t* repeat_n(const parser_t* p, const size_t n);

-/* Given a parser, p, this parser succeeds with the value p parsed or with an empty result. */
+/**
+ * Given a parser, p, this parser succeeds with the value p parsed or 
+ * with an empty result. 
+ *
+ * Result token type: If p succeeded, the type of its result; if not, TT_NONE.
+ */
 const parser_t* optional(const parser_t* p);

-/* Given a parser, p, this parser succeeds if p succeeds, but doesn't include p's result in the result. */
+/**
+ * Given a parser, p, this parser succeeds if p succeeds, but doesn't 
+ * include p's result in the result. 
+ *
+ * Result token type: None. The parse_result_t exists but its AST is NULL.
+ */
 const parser_t* ignore(const parser_t* p);

-/* Given a parser, p, and a parser for a separator, sep, this parser matches a (possibly empty) list of things that p can parse, separated by sep.
- * For example, if p is repeat1(range('0','9')) and sep is ch(','), sepBy(p, sep) will match a comma-separated list of integers. 
+/**
+ * Given a parser, p, and a parser for a separator, sep, this parser 
+ * matches a (possibly empty) list of things that p can parse, 
+ * separated by sep.
+ * For example, if p is many1(range('0','9')) and sep is ch(','), 
+ * sepBy(p, sep) will match a comma-separated list of integers. 
+ *
+ * Result token type: TT_SEQUENCE
 */
 const parser_t* sepBy(const parser_t* p, const parser_t* sep);

-/* Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element.
+/**
+ * Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element.
 * For example, if p is many1(range('0','9')) and sep is ch(','), sepBy1(p, sep) will match a comma-separated list of integers. 
+ *
+ * Result token type: TT_SEQUENCE
 */
 const parser_t* sepBy1(const parser_t* p, const parser_t* sep);

-/* This parser always returns a zero length match, i.e., empty string. */
+/**
+ * This parser always returns a zero length match, i.e., empty string. 
+ *
+ * Result token type: None. The parse_result_t exists but its AST is NULL.
+ */
 const parser_t* epsilon_p();

-/* This parser attaches an attribute function, which returns true or false, to a parser. The function is evaluated over the parser's result AST. 
- * The parse only succeeds if the attribute function returns true. 
+/**
+ * This parser applies its first argument to read an unsigned integer
+ * value, then applies its second argument that many times. length 
+ * should parse an unsigned integer value; this is checked at runtime.
+ * Specifically, the token_type of the returned token must be TT_UINT.
+ * In future we might relax this to include TT_USER but don't count on it.
+ *
+ * Result token type: TT_SEQUENCE
 */
-const parser_t* attr_bool(const parser_t* p, const attr_bool_t a);
+const parser_t* length_value(const parser_t* length, const parser_t* value);

-/* The 'and' parser is a predicate. It asserts that a conditional syntax is satisfied, but consumes no input. 
+/**
+ * This parser attaches a predicate function, which returns true or 
+ * false, to a parser. The function is evaluated over the parser's 
+ * result. 
+ * The parse only succeeds if the predicate function returns true. 
+ *
+ * Result token type: p's result type if pred succeeded, NULL otherwise.
+ */
+const parser_t* attr_bool(const parser_t* p, predicate_t pred);
+
+/**
+ * The 'and' parser asserts that a conditional syntax is satisfied, 
+ * but doesn't consume that conditional syntax. 
 * This is useful for lookahead. As an example:
 *
- * Suppose you already have a parser, hex_p, that parses numbers in hexadecimal format (including the leading '0x'). Then
+ * Suppose you already have a parser, hex_p, that parses numbers in 
+ * hexadecimal format (including the leading '0x'). Then
 *   sequence(and(token((const uint8_t*)"0x", 2)), hex_p, NULL)
- * checks to see whether there is a leading "0x", *does not* consume the "0x", and then applies hex_p to parse the hex-formatted number.
+ * checks to see whether there is a leading "0x", *does not* consume 
+ * the "0x", and then applies hex_p to parse the hex-formatted number.
 *
- * 'and' succeeds if p succeeds, and fails if p fails. Like 'ignore', 'and' does not attach a result to the AST.
+ * 'and' succeeds if p succeeds, and fails if p fails. 
+ *
+ * Result token type: None. The parse_result_t exists but its AST is NULL.
 */
 const parser_t* and(const parser_t* p);

-/* The 'not' parser is a predicate. It asserts that a conditional syntax is *not* satisfied, and consumes no input.
+/**
+ * The 'not' parser asserts that a conditional syntax is *not* 
+ * satisfied, but doesn't consume that conditional syntax.
 * As a somewhat contrived example:
 * 
 * Since 'choice' applies its arguments in order, the following parser:
 *   sequence(ch('a'), choice(ch('+'), token((const uint8_t*)"++", 2), NULL), ch('b'), NULL)
- * will not parse "a++b", because once choice() has succeeded, it will not backtrack and try other alternatives if a later parser in the sequence
- * fails. 
- * Instead, you can force the use of the second alternative by turning the ch('+') alternative into a sequence with not:
+ * will not parse "a++b", because once choice() has succeeded, it will 
+ * not backtrack and try other alternatives if a later parser in the 
+ * sequence fails. 
+ * Instead, you can force the use of the second alternative by turning 
+ * the ch('+') alternative into a sequence with not:
 *   sequence(ch('a'), choice(sequence(ch('+'), not(ch('+')), NULL), token((const uint8_t*)"++", 2), NULL), ch('b'), NULL)
- * If the input string is "a+b", the first alternative is applied; if the input string is "a++b", the second alternative is applied.
+ * If the input string is "a+b", the first alternative is applied; if 
+ * the input string is "a++b", the second alternative is applied.
+ * 
+ * Result token type: None. The parse_result_t exists but its AST is NULL.
 */
 const parser_t* not(const parser_t* p);

 /**
- * Create a parser that just calls out to another, as yet unknown, parser.
+ * Create a parser that just calls out to another, as yet unknown, 
+ * parser.
 * Note that the inner parser gets bound later, with bind_indirect.
 * This can be used to create recursive parsers.
+ *
+ * Result token type: the type of whatever parser is bound to it with
+ * bind_indirect().
 */
 parser_t *indirect();

 /**
- * Set the inner parser of an indirect. See comments on indirect for details.
+ * Set the inner parser of an indirect. See comments on indirect for 
+ * details.
 */
 void bind_indirect(parser_t* indirect, parser_t* inner);

--- a/src/internal.h
+++ b/src/internal.h
@ -26,7 +26,7 @@
 #else
 #define assert_message(check, message) do {				\
    if (!(check))							\
-      errx(1, "Assertation failed (programmer error): %s", message);	\
+      errx(1, "Assertion failed (programmer error): %s", message);	\
  } while(0)
 #endif
 #define false 0
--- a/src/pprint.c
+++ b/src/pprint.c
@ -94,6 +94,10 @@ static inline void append_buf_c(struct result_buf *buf, char v) {
 static void unamb_sub(const parsed_token_t* tok, struct result_buf *buf) {
  char* tmpbuf;
  int len;
+  if (!tok) {
+    append_buf(buf, "NULL", 4);
+    return;
+  }
  switch (tok->token_type) {
  case TT_NONE:
    append_buf(buf, "null", 4);