diff --git a/src/backends/regex_debug.c b/src/backends/regex_debug.c
index 8e54f5c..5207d9e 100644
--- a/src/backends/regex_debug.c
+++ b/src/backends/regex_debug.c
@@ -1,15 +1,17 @@
 // Intended to be included from regex_debug.c
-#define _GNU_SOURCE
-#include <stdio.h>
+#include "../platform.h"
 #include <stdlib.h>
 
+#define USE_DLADDR (0)
 
-
+#if USE_DLADDR
 // This is some spectacularly non-portable code... but whee!
 #include <dlfcn.h>
-char* getsym(void* addr) {
+#endif
+
+char* getsym(HSVMActionFunc addr) {
   char* retstr;
-#if 0
+#if USE_DLADDR
   // This will be fixed later.
   Dl_info dli;
   if (dladdr(addr, &dli) != 0 && dli.dli_sname != NULL) {
@@ -19,7 +21,7 @@ char* getsym(void* addr) {
       return retstr;
   } else
 #endif
-    if (asprintf(&retstr, "%p", addr) > 0)
+    if (h_platform_asprintf(&retstr, "%p", addr) > 0)
       return retstr;
     else
       return NULL;
diff --git a/src/benchmark.c b/src/benchmark.c
index 8105df3..b6a2876 100644
--- a/src/benchmark.c
+++ b/src/benchmark.c
@@ -1,19 +1,10 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
 #include <string.h>
 #include "hammer.h"
 #include "internal.h"
-
-#ifdef __MACH__
-#include <mach/clock.h>
-#include <mach/mach.h>
-#endif
-
-#ifdef __NetBSD__
-#include <sys/resource.h>
-#endif
+#include "platform.h"
 
 static const char* HParserBackendNames[] = {
   "Packrat",
@@ -23,38 +14,6 @@ static const char* HParserBackendNames[] = {
   "GLR"
 };
 
-void h_benchmark_clock_gettime(struct timespec *ts) {
-  if (ts == NULL)
-    return;
-#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
-  /* 
-   * This returns real time, not CPU time. See http://stackoverflow.com/a/6725161
-   * Possible solution: http://stackoverflow.com/a/11659289
-   */
-  clock_serv_t cclock;
-  mach_timespec_t mts;
-  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
-  clock_get_time(cclock, &mts);
-  mach_port_deallocate(mach_task_self(), cclock);
-  ts->tv_sec = mts.tv_sec;
-  ts->tv_nsec = mts.tv_nsec;
-#elif defined(__NetBSD__)
-  // NetBSD doesn't have CLOCK_THREAD_CPUTIME_ID. We'll use getrusage instead
-  struct rusage rusage;
-  getrusage(RUSAGE_SELF, &rusage);
-  ts->tv_nsec = (rusage.ru_utime.tv_usec + rusage.ru_stime.tv_usec) * 1000;
-  // not going to overflow; can be at most 2e9-2
-  ts->tv_sec = rusage.ru_utime.tv_sec + rusage.ru_utime.tv_sec;
-  if (ts->tv_nsec >= 1000000000) {
-    ts->tv_nsec -=   1000000000; // subtract a second
-    ts->tv_sec += 1; // add it back.
-  }
-  assert (ts->tv_nsec <= 1000000000);
-#else
-  clock_gettime(CLOCK_THREAD_CPUTIME_ID, ts);
-#endif
-}
-
 /*
   Usage:
   Create your parser (i.e., const HParser*), and an array of test cases
@@ -107,18 +66,18 @@ HBenchmarkResults *h_benchmark__m(HAllocator* mm__, HParser* parser, HParserTest
       HParseResult *res = h_parse(parser, tc->input, tc->length);
       char* res_unamb;
       if (res != NULL) {
-	res_unamb = h_write_result_unamb(res->ast);
+        res_unamb = h_write_result_unamb(res->ast);
       } else
-	res_unamb = NULL;
+        res_unamb = NULL;
       if ((res_unamb == NULL && tc->output_unambiguous != NULL)
-	  || (res_unamb != NULL && strcmp(res_unamb, tc->output_unambiguous) != 0)) {
-	// test case failed...
-	fprintf(stderr, "Parsing with %s failed\n", HParserBackendNames[backend]);
-	// We want to run all testcases, for purposes of generating a
-	// report. (eg, if users are trying to fix a grammar for a
-	// faster backend)
-	tc_failed++;
-	ret->results[backend].failed_testcases++;
+          || (res_unamb != NULL && strcmp(res_unamb, tc->output_unambiguous) != 0)) {
+        // test case failed...
+        fprintf(stderr, "Parsing with %s failed\n", HParserBackendNames[backend]);
+        // We want to run all testcases, for purposes of generating a
+        // report. (eg, if users are trying to fix a grammar for a
+        // faster backend)
+        tc_failed++;
+        ret->results[backend].failed_testcases++;
       }
       h_parse_result_free(res);
       (&system_allocator)->free(&system_allocator, res_unamb);
@@ -135,20 +94,16 @@ HBenchmarkResults *h_benchmark__m(HAllocator* mm__, HParser* parser, HParserTest
 
     for (tc = testcases; tc->input != NULL; tc++) {
       // The goal is to run each testcase for at least 50ms each
-      // TODO: replace this with a posix timer-based benchmark. (cf. timerfd_create, timer_create, setitimer)
       int count = 1, cur;
-      struct timespec ts_start, ts_end;
       int64_t time_diff;
       do {
-	count *= 2; // Yes, this means that the first run will run the function twice. This is fine, as we want multiple runs anyway.
-  h_benchmark_clock_gettime(&ts_start);
-	for (cur = 0; cur < count; cur++) {
-	  h_parse_result_free(h_parse(parser, tc->input, tc->length));
-	}
-  h_benchmark_clock_gettime(&ts_end);
-
-	// time_diff is in ns
-	time_diff = (ts_end.tv_sec - ts_start.tv_sec) * 1000000000 + (ts_end.tv_nsec - ts_start.tv_nsec);
+        count *= 2; // Yes, this means that the first run will run the function twice. This is fine, as we want multiple runs anyway.
+        struct HStopWatch stopwatch;
+        h_platform_stopwatch_reset(&stopwatch);
+        for (cur = 0; cur < count; cur++) {
+          h_parse_result_free(h_parse(parser, tc->input, tc->length));
+        }
+        time_diff = h_platform_stopwatch_ns(&stopwatch);
       } while (time_diff < 100000000);
       ret->results[backend].cases[cur_case].parse_time = (time_diff / count);
       ret->results[backend].cases[cur_case].length = tc->length;
diff --git a/src/platform.h b/src/platform.h
index 0c05bfe..e6eb7ec 100644
--- a/src/platform.h
+++ b/src/platform.h
@@ -8,6 +8,17 @@
 
 #include "compiler_specifics.h"
 
+#include <stdarg.h>
+#include <stdint.h>
+
+/* String Formatting */
+
+/** see GNU C asprintf: formats into a malloc'ed string stored in *strp, which the caller frees */
+int h_platform_asprintf(char **strp, const char *fmt, ...);
+
+/** see GNU C vasprintf: same as h_platform_asprintf, but takes a va_list */
+int h_platform_vasprintf(char **strp, const char *fmt, va_list arg);
+
 /* Error Reporting */
 
 /* BSD errx function, seen in err.h */
@@ -15,4 +26,39 @@ H_MSVC_DECLSPEC(noreturn) \
 void h_platform_errx(int err, const char* format, ...)	\
   H_GCC_ATTRIBUTE((noreturn, format (printf,2,3)));
 
+/* Time Measurement */
+
+struct HStopWatch; /* forward declaration; defined per platform below */
+
+/* initialize a stopwatch: records the current time as the reference point */
+void h_platform_stopwatch_reset(struct HStopWatch* stopwatch);
+
+/* return difference, in nanoseconds, between last reset point and now */
+int64_t h_platform_stopwatch_ns(struct HStopWatch* stopwatch);
+
+/* Platform dependent definitions for HStopWatch */
+#if defined(_MSC_VER)
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN 1
+#endif
+#include <windows.h>
+#undef WIN32_LEAN_AND_MEAN
+
+struct HStopWatch {
+  LARGE_INTEGER qpf;   /* performance counter frequency, in ticks per second */
+  LARGE_INTEGER start; /* performance counter value at the last reset */
+};
+
+#else
+/* Unix like platforms */
+
+#include <time.h>
+
+struct HStopWatch {
+  struct timespec start; /* clock reading taken at the last reset */
+};
+
+#endif
+
 #endif
diff --git a/src/platform_bsdlike.c b/src/platform_bsdlike.c
index ebb38d9..2ccf874 100644
--- a/src/platform_bsdlike.c
+++ b/src/platform_bsdlike.c
@@ -1,10 +1,83 @@
+#define _GNU_SOURCE // to obtain asprintf/vasprintf
 #include "platform.h"
 
+#include <assert.h>
+#include <stdio.h>
 #include <err.h>
 #include <stdarg.h>
 
+#ifdef __MACH__
+#include <mach/clock.h>
+#include <mach/mach.h>
+#endif
+
+#ifdef __NetBSD__
+#include <sys/resource.h>
+#endif
+
+int h_platform_asprintf(char **strp, const char *fmt, ...)
+{
+  va_list ap;
+  va_start(ap, fmt);
+  int res = h_platform_vasprintf(strp, fmt, ap); // the va_list variant does the formatting
+  va_end(ap);
+  return res;
+}
+
+int h_platform_vasprintf(char **strp, const char *fmt, va_list arg)
+{
+  return vasprintf(strp, fmt, arg); // straight pass-through; _GNU_SOURCE above is what exposes it
+}
+
 void h_platform_errx(int err, const char* format, ...) {
   va_list ap;
   va_start(ap, format);
   verrx(err, format, ap);
 }
+
+// TODO: replace this with a posix timer-based benchmark. (cf. timerfd_create, timer_create, setitimer)
+// Store the current reading of the benchmarking clock in *ts; a NULL ts is ignored.
+static void gettime(struct timespec *ts) {
+  if (ts == NULL)
+    return;
+#ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time
+  /*
+   * This returns real time, not CPU time. See http://stackoverflow.com/a/6725161
+   * Possible solution: http://stackoverflow.com/a/11659289
+   */
+  clock_serv_t cclock;
+  mach_timespec_t mts;
+  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+  clock_get_time(cclock, &mts);
+  mach_port_deallocate(mach_task_self(), cclock);
+  ts->tv_sec = mts.tv_sec;
+  ts->tv_nsec = mts.tv_nsec;
+#elif defined(__NetBSD__)
+  // NetBSD doesn't have CLOCK_THREAD_CPUTIME_ID. We'll use getrusage instead
+  struct rusage rusage;
+  getrusage(RUSAGE_SELF, &rusage);
+  ts->tv_nsec = (rusage.ru_utime.tv_usec + rusage.ru_stime.tv_usec) * 1000;
+  // not going to overflow; can be at most 2e9-2000
+  ts->tv_sec = rusage.ru_utime.tv_sec + rusage.ru_stime.tv_sec; // user + system CPU seconds
+  if (ts->tv_nsec >= 1000000000) {
+    ts->tv_nsec -=   1000000000; // subtract a second
+    ts->tv_sec += 1; // add it back.
+  }
+  assert (ts->tv_nsec < 1000000000); // a normalized timespec keeps tv_nsec below one second
+#else
+  clock_gettime(CLOCK_THREAD_CPUTIME_ID, ts);
+#endif
+}
+
+void h_platform_stopwatch_reset(struct HStopWatch* stopwatch) {
+  gettime(&stopwatch->start); // remember the current clock reading as the reference point
+}
+
+/* return the nanoseconds elapsed on the gettime() clock since the last reset */
+int64_t h_platform_stopwatch_ns(struct HStopWatch* stopwatch) {
+  struct timespec ts_now;
+  gettime(&ts_now);
+  // widen before scaling: where time_t is 32 bits, seconds * 1e9 would overflow it
+  int64_t elapsed_sec = (int64_t)ts_now.tv_sec - (int64_t)stopwatch->start.tv_sec;
+  return elapsed_sec * 1000000000 + (ts_now.tv_nsec - stopwatch->start.tv_nsec);
+}
diff --git a/src/platform_win32.c b/src/platform_win32.c
index 30af168..9824b52 100644
--- a/src/platform_win32.c
+++ b/src/platform_win32.c
@@ -1,10 +1,61 @@
 #include "platform.h"
 
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
 #define WIN32_LEAN_AND_MEAN
-#include "windows.h"
+#include <windows.h>
+
+int h_platform_asprintf(char**strp, const char *fmt, ...)
+{
+  va_list ap;
+  va_start(ap, fmt);
+  int res = h_platform_vasprintf(strp, fmt, ap); /* the va_list variant does the formatting */
+  va_end(ap);
+  return res;
+}
+
+/* vasprintf for MSVC: size the output with _vscprintf, then format into a malloc'ed buffer */
+int h_platform_vasprintf(char**strp, const char *fmt, va_list args)
+{
+  va_list ap;
+  va_copy(ap, args); /* args is traversed twice, so each pass works on its own copy */
+  int non_null_char_count = _vscprintf(fmt, ap);
+  va_end(ap);
+  if (non_null_char_count < 0) {
+    return -1;
+  }
+  size_t buffer_size = 1 + (size_t)non_null_char_count; /* room for the terminating NUL */
+  char* buffer = malloc(buffer_size);
+  if (buffer == NULL) {
+    return -1; /* out of memory: fail instead of formatting into NULL; *strp is untouched */
+  }
+  va_copy(ap, args);
+  int res = vsnprintf_s(buffer, buffer_size, non_null_char_count, fmt, ap);
+  va_end(ap);
+  if (res < 0) {
+    free(buffer);
+    return res;
+  }
+  buffer[non_null_char_count] = 0;
+  *strp = buffer; /* ownership passes to the caller, who releases it with free() */
+  return res;
+}
 
 void h_platform_errx(int err, const char* format, ...) {
   // FIXME(windows) TODO(uucidl): to be implemented
   ExitProcess(err);
 }
 
+void h_platform_stopwatch_reset(struct HStopWatch* stopwatch) {
+  QueryPerformanceFrequency(&stopwatch->qpf); /* ticks per second, needed later to convert to ns */
+  QueryPerformanceCounter(&stopwatch->start); /* counter value at the reset point */
+}
+
+/* return difference between last reset point and now, in ns; split so ticks * 1e9 cannot overflow */
+int64_t h_platform_stopwatch_ns(struct HStopWatch* stopwatch) {
+  LARGE_INTEGER now;
+  QueryPerformanceCounter(&now);
+  int64_t ticks = now.QuadPart - stopwatch->start.QuadPart, freq = stopwatch->qpf.QuadPart;
+  return (ticks / freq) * 1000000000 + (ticks % freq) * 1000000000 / freq;
+}
diff --git a/src/pprint.c b/src/pprint.c
index b2290dd..11ec3d6 100644
--- a/src/pprint.c
+++ b/src/pprint.c
@@ -15,7 +15,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
 
-#define _GNU_SOURCE
+#include "platform.h"
+
 #include <stdio.h>
 #include <string.h>
 #include "hammer.h"
@@ -114,9 +115,24 @@ static inline bool append_buf_c(struct result_buf *buf, char v) {
   }
 }
 
-static void unamb_sub(const HParsedToken* tok, struct result_buf *buf) {
+/** append a printf-style formatted string to the result buffer; false if formatting or appending fails */
+static inline bool append_buf_formatted(struct result_buf *buf, const char* format, ...)
+{
   char* tmpbuf;
   int len;
+  bool result;
+  va_list ap;
+  va_start(ap, format);
+  len = h_platform_vasprintf(&tmpbuf, format, ap);
+  va_end(ap);
+  if (len < 0) // formatting failed: tmpbuf is not valid, so neither append nor free it
+    return false;
+  result = append_buf(buf, tmpbuf, len);
+  free(tmpbuf);
+  return result;
+}
+
+static void unamb_sub(const HParsedToken* tok, struct result_buf *buf) {
   if (!tok) {
     append_buf(buf, "NULL", 4);
     return;
@@ -141,16 +157,12 @@ static void unamb_sub(const HParsedToken* tok, struct result_buf *buf) {
     break;
   case TT_SINT:
     if (tok->sint < 0)
-      len = asprintf(&tmpbuf, "s-%#" PRIx64, -tok->sint);
+      append_buf_formatted(buf, "s-%#" PRIx64, -tok->sint);
     else
-      len = asprintf(&tmpbuf, "s%#" PRIx64, tok->sint);
-    append_buf(buf, tmpbuf, len);
-    free(tmpbuf);
+      append_buf_formatted(buf, "s%#" PRIx64, tok->sint);
     break;
   case TT_UINT:
-    len = asprintf(&tmpbuf, "u%#" PRIx64, tok->uint);
-    append_buf(buf, tmpbuf, len);
-    free(tmpbuf);
+    append_buf_formatted(buf, "u%#" PRIx64, tok->uint);
     break;
   case TT_ERR:
     append_buf(buf, "ERR", 3);
diff --git a/src/system_allocator.c b/src/system_allocator.c
index f9dd291..39a1a7e 100644
--- a/src/system_allocator.c
+++ b/src/system_allocator.c
@@ -2,41 +2,81 @@
 #include <stdlib.h>
 #include "internal.h"
 
-//#define DEBUG__MEMFILL 0xFF
+// NOTE(uucidl): uncomment the #define below to automatically fill newly
+// allocated blocks with this byte:
+// #define DEBUG__MEMFILL 0xFF
+
+#if defined(DEBUG__MEMFILL)
+/**
+ * Blocks allocated by the system_allocator start with this header.
+ * I.e. the user part of the allocation directly follows.
+ */
+typedef struct HDebugBlockHeader_
+{
+  size_t size; /** size of the user allocation */
+} HDebugBlockHeader;
+
+#define BLOCK_HEADER_SIZE (sizeof(HDebugBlockHeader))
+#else
+#define BLOCK_HEADER_SIZE (0)
+#endif
+
+/**
+ * Compute the total size needed for a given allocation size.
+ */
+static inline size_t block_size(size_t alloc_size) {
+  return BLOCK_HEADER_SIZE + alloc_size;
+}
+
+/**
+ * Obtain the block containing the user pointer `uptr`
+ */
+static inline void* block_for_user_ptr(void *uptr) {
+  return ((char*)uptr) - BLOCK_HEADER_SIZE;
+}
+
+/**
+ * Obtain the user area of the allocation from a given block
+ */
+static inline void* user_ptr(void *block) {
+  return ((char*)block) + BLOCK_HEADER_SIZE;
+}
 
 static void* system_alloc(HAllocator *allocator, size_t size) {
-  
-  void* ptr = malloc(size + sizeof(size_t));
-  if (!ptr) {
+  void *block = malloc(block_size(size));
+  if (!block) {
     return NULL;
   }
+  void *uptr = user_ptr(block);
 #ifdef DEBUG__MEMFILL
-  memset(ptr, DEBUG__MEMFILL, size + sizeof(size_t));
+  memset(uptr, DEBUG__MEMFILL, size);
+  ((HDebugBlockHeader*)block)->size = size;
 #endif
-  *(size_t*)ptr = size;
-  return ptr + sizeof(size_t);
+  return uptr;
 }
 
-static void* system_realloc(HAllocator *allocator, void* ptr, size_t size) {
-  if (!ptr) {
+static void* system_realloc(HAllocator *allocator, void* uptr, size_t size) {
+  if (!uptr) {
     return system_alloc(allocator, size);
   }
-  ptr = realloc(ptr - sizeof(size_t), size + sizeof(size_t));
-  if (!ptr) {
+  void* block = realloc(block_for_user_ptr(uptr), block_size(size));
+  if (!block) {
     return NULL;
   }
-  *(size_t*)ptr = size;
+  uptr = user_ptr(block);
+
 #ifdef DEBUG__MEMFILL
-  size_t old_size = *(size_t*)ptr;
+  size_t old_size = ((HDebugBlockHeader*)block)->size;
   if (size > old_size)
-    memset(ptr+sizeof(size_t)+old_size, DEBUG__MEMFILL, size - old_size);
+    memset((char*)uptr+old_size, DEBUG__MEMFILL, size - old_size);
+  ((HDebugBlockHeader*)block)->size = size;
 #endif
-  return ptr + sizeof(size_t);
+  return uptr;
 }
 
-static void system_free(HAllocator *allocator, void* ptr) {
-  if (ptr) {
-    free(ptr - sizeof(size_t));
+static void system_free(HAllocator *allocator, void* uptr) {
+  if (uptr) {
+    free(block_for_user_ptr(uptr));
   }
 }
 
diff --git a/tools/windows/hammer_lib_src_list b/tools/windows/hammer_lib_src_list
index 4c85a43..a8a4dc4 100644
--- a/tools/windows/hammer_lib_src_list
+++ b/tools/windows/hammer_lib_src_list
@@ -1,11 +1,14 @@
 platform_win32.c 
-allocator.c 
+allocator.c
+benchmark.c
 bitreader.c 
 bitwriter.c 
 cfgrammar.c 
 desugar.c 
 glue.c 
 hammer.c 
+pprint.c
+system_allocator.c
 parsers/action.c 
 parsers/and.c 
 parsers/attr_bool.c 
@@ -32,6 +35,7 @@ parsers/xor.c
 parsers/value.c 
 backends/packrat.c
 backends/llk.c
+backends/regex.c
 backends/glr.c
 backends/lalr.c
 backends/lr.c