hammer/examples/base64_sem2.c

// Example parser: Base64, with fine-grained semantic actions
//
// Demonstrates how to attach semantic actions to a grammar and transform the
// parse tree into the desired semantic representation, in this case a sequence
// of 8-bit values.
//
// Note how the grammar is defined by using the macros H_RULE and H_ARULE.
// Those rules using ARULE get an attached action which must be declared (as
// a function of type HAction) with a standard name based on the rule name.
//
// This variant of the example uses coarse-grained semantic actions,
// transforming the entire parse tree in one big step. Compare base64_sem1.c
// for an alternative approach using a fine-grained piece-by-piece
// transformation.

#include "../src/hammer.h"
#include "../src/glue.h"
#include <assert.h>
#include <inttypes.h>


///
// Semantic actions for the grammar below, each corresponds to an "ARULE".
// They must be named act_<rulename>.
///

// helper: return the numeric value of a parsed base64 digit
uint8_t bsfdig_value(const HParsedToken *p)
{
    uint8_t value = 0;

    if(p && p->token_type == TT_UINT) {
        uint8_t c = p->uint;
        if(c >= 0x41 && c <= 0x5A) // A-Z
            value = c - 0x41;
        else if(c >= 0x61 && c <= 0x7A) // a-z
            value = c - 0x61 + 26;
        else if(c >= 0x30 && c <= 0x39) // 0-9
            value = c - 0x30 + 52;
        else if(c == '+')
            value = 62;
        else if(c == '/')
            value = 63;
    }

    return value;
}

// helper: append a byte value to a sequence
#define seq_append_byte(res, b) h_seq_snoc(res, H_MAKE_UINT(b))

HParsedToken *act_base64(const HParseResult *p, void* user_data)
{
    assert(p->ast->token_type == TT_SEQUENCE);
    assert(p->ast->seq->used == 2);
    assert(p->ast->seq->elements[0]->token_type == TT_SEQUENCE);

    // grab b64_3 block sequence
    // grab and analyze b64 end block (_2 or _1)
    const HParsedToken *b64_3 = p->ast->seq->elements[0];
    const HParsedToken *b64_2 = p->ast->seq->elements[1];
    const HParsedToken *b64_1 = p->ast->seq->elements[1];

    if(b64_2->token_type != TT_SEQUENCE)
        b64_1 = b64_2 = NULL;
    else if(b64_2->seq->elements[2]->uint == '=')
        b64_2 = NULL;
    else
        b64_1 = NULL;

    // allocate result sequence
    HParsedToken *res = H_MAKE_SEQ();

    // concatenate base64_3 blocks
    for(size_t i=0; i<b64_3->seq->used; i++) {
        assert(b64_3->seq->elements[i]->token_type == TT_SEQUENCE);
        HParsedToken **digits = b64_3->seq->elements[i]->seq->elements;

        uint32_t x = bsfdig_value(digits[0]);
        x <<= 6; x |= bsfdig_value(digits[1]);
        x <<= 6; x |= bsfdig_value(digits[2]);
        x <<= 6; x |= bsfdig_value(digits[3]);
        seq_append_byte(res, (x >> 16) & 0xFF);
        seq_append_byte(res, (x >> 8) & 0xFF);
        seq_append_byte(res, x & 0xFF);
    }

    // append one trailing base64_2 or _1 block
    if(b64_2) {
        HParsedToken **digits = b64_2->seq->elements;
        uint32_t x = bsfdig_value(digits[0]);
        x <<= 6; x |= bsfdig_value(digits[1]);
        x <<= 6; x |= bsfdig_value(digits[2]);
        seq_append_byte(res, (x >> 10) & 0xFF);
        seq_append_byte(res, (x >> 2) & 0xFF);
    } else if(b64_1) {
        HParsedToken **digits = b64_1->seq->elements;
        uint32_t x = bsfdig_value(digits[0]);
        x <<= 6; x |= bsfdig_value(digits[1]);
        seq_append_byte(res, (x >> 4) & 0xFF);
    }

    return res;
}

H_ACT_APPLY(act_index0, h_act_index, 0);

#define act_ws           h_act_ignore
#define act_document     act_index0


///
// Set up the parser with the grammar to be recognized.
///

const HParser *init_parser(void)
{
    // CORE
    H_RULE (digit,   h_ch_range(0x30, 0x39));
    H_RULE (alpha,   h_choice(h_ch_range(0x41, 0x5a), h_ch_range(0x61, 0x7a), NULL));
    H_RULE (space,   h_in((uint8_t *)" \t\n\r\f\v", 6));

    // AUX.
    H_RULE (plus,    h_ch('+'));
    H_RULE (slash,   h_ch('/'));
    H_RULE (equals,  h_ch('='));

    H_RULE (bsfdig,       h_choice(alpha, digit, plus, slash, NULL));
    H_RULE (bsfdig_4bit,  h_in((uint8_t *)"AEIMQUYcgkosw048", 16));
    H_RULE (bsfdig_2bit,  h_in((uint8_t *)"AQgw", 4));
    H_RULE (base64_3,     h_repeat_n(bsfdig, 4));
    H_RULE (base64_2,     h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, NULL));
    H_RULE (base64_1,     h_sequence(bsfdig, bsfdig_2bit, equals, equals, NULL));
    H_ARULE(base64,       h_sequence(h_many(base64_3),
                                     h_optional(h_choice(base64_2,
                                                         base64_1, NULL)),
                                     NULL));

    H_ARULE(ws,           h_many(space));
    H_ARULE(document,     h_sequence(ws, base64, ws, h_end_p(), NULL));

    // BUG sometimes inputs that should just don't parse.
    // It *seemed* to happen mostly with things like "bbbbaaaaBA==".
    // Using less actions seemed to make it less likely.

    return document;
}


///
// Main routine: print input, parse, print result, return success/failure.
///

#include <stdio.h>

int main(int argc, char **argv)
{
    uint8_t input[102400];
    size_t inputsize;
    const HParser *parser;
    const HParseResult *result;

    parser = init_parser();

    inputsize = fread(input, 1, sizeof(input), stdin);
    fprintf(stderr, "inputsize=%zu\ninput=", inputsize);
    fwrite(input, 1, inputsize, stderr);
    result = h_parse(parser, input, inputsize);

    if(result) {
        fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length/8);
        h_pprint(stdout, result->ast, 0, 0);
        return 0;
    } else {
        return 1;
    }
}