Nick Hayashi
2 years ago
19 changed files with 1128 additions and 335 deletions
-
48README.md
-
42alloc.cpp
-
5alloc.h
-
71array.hpp
-
15config.h
-
4cpuid.cpp
-
6cpuid.h
-
24file.cpp
-
5file.h
-
89print.cpp
-
7print.h
-
111serialize.cpp
-
17serialize.h
-
9signal-handler.h
-
710sse_mathfun.h
-
204string.h
-
62table.hpp
-
30types.h
-
4util.h
@ -0,0 +1,48 @@ |
|||||
|
|
||||
|
This is a library of C++ code which I use as a standard library wrapper, supplement, and in some cases, replacement. |
||||
|
|
||||
|
If you want to use it, you can add all of the source files to your source tree, configure the `#define`'s in `config.h` to suit your needs, and it should just work. |
||||
|
|
||||
|
The exceptions are the files `config.h` and `types.h` which are required by every other file. |
||||
|
|
||||
|
- Stack, Scratch, and Block-based allocators as well as memory-leak checking mechanism and OS allocator wrappers in `alloc.h/.cpp` |
||||
|
- Heap-friendly String type, including format strings and StringBuffers/Builders, as well as `<string.h>` function replacements as static methods in single-header `string.h` |
||||
|
- Intrusive serialization mechanism in `serialize.h/.cpp` for complex types and primitives (no reflection though) |
||||
|
- A few hash functions, HashTable and CacheTable (hash table that can forget its keys) implementations in `table.hpp` |
||||
|
- A dynamic/growing array implementation in `array.hpp` |
||||
|
- Common file operations, `<stdio>` wrapper in `file.h/.cpp` |
||||
|
|
||||
|
And some more stuff that is TODO: |
||||
|
- `cpuid` x86 instruction wrapper |
||||
|
- `glm` replacement - vector, matrix, and quaternion types and some common operations involving them |
||||
|
|
||||
|
# Licenses & Other Code |
||||
|
|
||||
|
## fast_float |
||||
|
Our serialization code uses the `fast_float` library by Daniel Lemire et al, provided simultaneously under the [Apache License, Version 2.0](https://github.com/fastfloat/fast_float/blob/main/LICENSE-APACHE), the [MIT license](https://github.com/fastfloat/fast_float/blob/main/LICENSE-MIT) and/or the [BOOST license](https://github.com/fastfloat/fast_float/blob/main/LICENSE-BOOST). The `fast_float` library itself uses code originally published under the Apache 2.0 license. |
||||
|
|
||||
|
## sse_mathfun.h |
||||
|
The `sin`, `cos`, `exp`, and `log` replacements used by this library are provided by a single-header library written by Julien Pommier under the zlib license: |
||||
|
|
||||
|
``` |
||||
|
Copyright (C) 2007 Julien Pommier |
||||
|
|
||||
|
This software is provided 'as-is', without any express or implied |
||||
|
warranty. In no event will the authors be held liable for any damages |
||||
|
arising from the use of this software. |
||||
|
|
||||
|
Permission is granted to anyone to use this software for any purpose, |
||||
|
including commercial applications, and to alter it and redistribute it |
||||
|
freely, subject to the following restrictions: |
||||
|
|
||||
|
1. The origin of this software must not be misrepresented; you must not |
||||
|
claim that you wrote the original software. If you use this software |
||||
|
in a product, an acknowledgment in the product documentation would be |
||||
|
appreciated but is not required. |
||||
|
2. Altered source versions must be plainly marked as such, and must not be |
||||
|
misrepresented as being the original software. |
||||
|
3. This notice may not be removed or altered from any source distribution. |
||||
|
|
||||
|
(this is the zlib license) |
||||
|
``` |
||||
|
|
@ -0,0 +1,15 @@ |
|||||
|
|
||||
|
#pragma once |
||||
|
#ifndef ULE_CONFIG_H |
||||
|
#define ULE_CONFIG_H |
||||
|
|
||||
|
// define this macro to include the serialization code `serialize.h/.cpp`, as well as serialization |
||||
|
// for the hashtable(s) and array implementations. |
||||
|
//#define ULE_CONFIG_OPTION_SERIALIZATION |
||||
|
|
||||
|
// all functions in the library will invoke a semicolon-terminated macro as their first line of execution. |
||||
|
// this is for use by an instrusive profiler, though could be used for whatever purpose. |
||||
|
//#define ULE_CONFIG_OPTION_FTAG ZoneScoped |
||||
|
|
||||
|
#endif |
||||
|
|
@ -0,0 +1,710 @@ |
|||||
|
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log |
||||
|
|
||||
|
Inspired by Intel Approximate Math library, and based on the |
||||
|
corresponding algorithms of the cephes math library |
||||
|
|
||||
|
The default is to use the SSE1 version. If you define USE_SSE2 the |
||||
|
the SSE2 intrinsics will be used in place of the MMX intrinsics. Do |
||||
|
not expect any significant performance improvement with SSE2. |
||||
|
*/ |
||||
|
|
||||
|
/* Copyright (C) 2007 Julien Pommier |
||||
|
|
||||
|
This software is provided 'as-is', without any express or implied |
||||
|
warranty. In no event will the authors be held liable for any damages |
||||
|
arising from the use of this software. |
||||
|
|
||||
|
Permission is granted to anyone to use this software for any purpose, |
||||
|
including commercial applications, and to alter it and redistribute it |
||||
|
freely, subject to the following restrictions: |
||||
|
|
||||
|
1. The origin of this software must not be misrepresented; you must not |
||||
|
claim that you wrote the original software. If you use this software |
||||
|
in a product, an acknowledgment in the product documentation would be |
||||
|
appreciated but is not required. |
||||
|
2. Altered source versions must be plainly marked as such, and must not be |
||||
|
misrepresented as being the original software. |
||||
|
3. This notice may not be removed or altered from any source distribution. |
||||
|
|
||||
|
(this is the zlib license) |
||||
|
*/ |
||||
|
|
||||
|
#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

/* Compiler-specific 16-byte alignment attributes: MSVC wants the
   specifier before the declaration, GCC/ICC after it. */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si;   // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that?
   Each macro expands to a 16-byte-aligned array of four identical lanes;
   the functions below load them with *(v4sf*)_ps_Name etc. */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);      /* IEEE-754 binary32 exponent field */
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); /* keeps sign bit + mantissa */

_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);  /* binary32 exponent bias (127) */

/* range-reduction threshold (sqrt(1/2)) and polynomial coefficients for
   log_ps, taken from the cephes logf implementation */
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#ifndef USE_SSE2
/* SSE1 has no integer operations on __m128, so a register is split into /
   rebuilt from two MMX __m64 halves through this union. */
typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}

#endif // USE_SSE2
||||
|
|
||||
|
/* natural logarithm computed for 4 simultaneous float
   return NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* remember which lanes are invalid (<= 0) so they can be forced to NaN
     at the end */
  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); -- extract the raw exponent bits by
     shifting them down past the 23-bit mantissa */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part: clear the exponent field and OR in the
     bit pattern of 0.5 so the mantissa is interpreted in [0.5, 1) */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the really base-2 exponent (bias removed) */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2: branch-free form of
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  /* Horner evaluation of the cephes logf polynomial in x */
  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  /* add back the exponent contribution: e * log(2), split in two parts
     (q1 + q2) for precision */
  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
||||
|
|
||||
|
/* input clamp for exp_ps: exp(x) over/underflows a binary32 outside
   roughly +-88.38 */
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

/* log2(e), and log(2) split into two parts (C1 + C2) so the range
   reduction x - n*log(2) loses less precision */
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

/* degree-5 polynomial coefficients for exp on the reduced argument
   (from the cephes expf implementation) */
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
||||
|
|
||||
|
/* exponential computed for 4 simultaneous float */
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  /* clamp the input so the 2^n scaling below stays within the float
     exponent range */
  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)); fx = x/log(2) + 0.5 so the
     truncation below rounds to nearest */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 (truncation rounds toward zero, not down) */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  /* range reduction: x -= fx*log(2), with log(2) applied in two parts
     (C1 then C2) for precision */
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  /* Horner evaluation of the degree-5 polynomial on the reduced argument */
  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n directly in the float exponent field: add the bias (0x7f)
     to n and shift into bits 30..23 */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
||||
|
|
||||
|
/* -Pi/4 split into three parts (DP1+DP2+DP3) for the "extended precision
   modular arithmetic" range reduction used by sin_ps/cos_ps/sincos_ps */
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
/* minimax polynomial coefficients for the sine and cosine branches
   (from the cephes sinf/cosf implementations) */
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
||||
|
|
||||
|
|
||||
|
/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so |
||||
|
it runs also on old athlons XPs and the pentium III of your grand |
||||
|
mother. |
||||
|
|
||||
|
The code is the exact rewriting of the cephes sinf function. |
||||
|
Precision is excellent as long as x < 8192 (I did not bother to |
||||
|
take into account the special handling they have for greater values |
||||
|
-- it does not return garbage for arguments over 8192, though, but |
||||
|
the extra precision is missing). |
||||
|
|
||||
|
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the |
||||
|
surprising but correct result. |
||||
|
|
||||
|
Performance is also surprisingly good, 1.33 times faster than the |
||||
|
macos vsinf SSE2 function, and 1.5 times faster than the |
||||
|
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not |
||||
|
too bad for an SSE1 function (with no special tuning) ! |
||||
|
However the latter libraries probably have a much better handling of NaN, |
||||
|
Inf, denormalized and other special arguments.. |
||||
|
|
||||
|
On my core 1 duo, the execution of this function takes approximately 95 cycles. |
||||
|
|
||||
|
From what I have observed on the experiments with Intel AMath lib, switching to an |
||||
|
SSE2 version would improve the perf by only 10%. |
||||
|
|
||||
|
Since it is based on SSE intrinsics, it has to be compiled at -O2 to |
||||
|
deliver full speed. |
||||
|
*/ |
||||
|
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag: bit 2 of the octant index, shifted left 29
     so it lands on the float sign bit (bit 31) */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) -- the cosine branch */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) -- the sine branch */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
||||
|
|
||||
|
/* almost the same as sin_ps */ |
||||
|
/* cosine computed for 4 simultaneous float; structure mirrors sin_ps,
   only the quadrant bookkeeping differs (the emm2/mm2 -= 2 below) */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* offset the octant index by -2 so the sign/polynom masks below select
     the cosine quadrants instead of the sine ones */
  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm0:mm1 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  /* offset the octant index by -2 (cosine quadrants, cf. SSE2 path) */
  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */

  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) -- the cosine branch */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) -- the sine branch */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
||||
|
|
||||
|
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine.
   Writes sin(x) to *s and cos(x) to *c. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* keep a copy of the octant index for the cosine sign below */
  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  /* keep a copy of the octant index for the cosine sign below */
  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine from the saved octant index */
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom (0 <= x <= Pi/4) -- the cosine branch */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) -- the sine branch */

  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms, building both the
     sine combination and its complement for the cosine */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
Write
Preview
Loading…
Cancel
Save
Reference in new issue