wip

2 years ago · 530f85cf12
19 changed files with 1128 additions and 335 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,48 @@
+
+This is a library of C++ code which I use as a standard library wrapper, supplement, and in some cases, replacement.
+
+If you want to use it, you can add all of the source files to your source tree, configure the `#define`s in `config.h` to suit your needs, and it should just work.
+
+Note that the files `config.h` and `types.h` are required by every other file, so they must always be included in your source tree.
+
+- Stack, Scratch, and Block-based allocators, as well as a memory-leak checking mechanism and OS allocator wrappers, in `alloc.h/.cpp`
+- Heap-friendly String type, including format strings and StringBuffers/Builders, as well as `<string.h>` function replacements as static methods, in the single-header `string.h`
+- Intrusive serialization mechanism in `serialize.h/.cpp` for complex types and primitives (no reflection though)
+- A few hash functions, HashTable and CacheTable (hash table that can forget its keys) implementations in `table.hpp`
+- A dynamic/growing array implementation in `array.hpp`
+- Common file operations and a `<stdio.h>` wrapper in `file.h/.cpp`
+
+And some more stuff that is TODO:
+- `cpuid` x86 instruction wrapper
+- `glm` replacement - vector, matrix, and quaternion types and some common operations involving them
+
+# Licenses & Other Code
+
+## fast_float
+Our serialization code uses the `fast_float` library by Daniel Lemire et al., provided simultaneously under the [Apache License, Version 2.0](https://github.com/fastfloat/fast_float/blob/main/LICENSE-APACHE), the [MIT license](https://github.com/fastfloat/fast_float/blob/main/LICENSE-MIT) and/or the [BOOST license](https://github.com/fastfloat/fast_float/blob/main/LICENSE-BOOST). The `fast_float` library itself uses code originally published under the Apache 2.0 license.
+
+## sse_mathfun.h
+The `sin`, `cos`, `exp`, and `log` replacements used by this library are provided by a single-header library written by Julien Pommier under the zlib license:
+
+```
+Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+```
+
--- a/alloc.cpp
+++ b/alloc.cpp
@ -7,22 +7,22 @@

 #if false
 static void* leakcheckMalloc(size_t size, const char* file, s32 line) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return malloc(size);
 }

 static void* leakcheckCalloc(size_t maxNumOfElements, size_t elementSize, const char* file, s32 line) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return calloc(maxNumOfElements, elementSize);
 }

 static void* leakcheckRealloc(void* buffer, size_t newSize, const char* file, s32 line) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return realloc(buffer, newSize);
 }

 static void leakcheckFree(void* ptr, const char* file, s32 line) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    free(ptr);
 }

@ -45,7 +45,7 @@ static void dumpLeaks() {

 // system allocators
 void* pMalloc(size_t size) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    void* p = malloc(size);

    if (!p) {
@ -55,12 +55,12 @@ void* pMalloc(size_t size) {
    return p;
 }
 void* pMalloc(size_t size, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return pMalloc(size);
 }

 void* pCalloc(size_t maxNumOfElements, size_t elementSize) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    void* p = calloc(maxNumOfElements, elementSize);

    if (!p) {
@ -70,12 +70,12 @@ void* pCalloc(size_t maxNumOfElements, size_t elementSize) {
    return p;
 }
 void* pCalloc(size_t maxNumOfElements, size_t elementSize, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return pCalloc(maxNumOfElements, elementSize);
 }

 void* pRealloc(void* buffer, size_t newSize) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    void* p = realloc(buffer, newSize);

    if (!p) {
@ -86,25 +86,25 @@ void* pRealloc(void* buffer, size_t newSize) {
    return p;
 }
 void* pRealloc(void* buffer, size_t newSize, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return pRealloc(buffer, newSize);
 }

 void pFree(void* ptr) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    free(ptr);
 }
 void pFree(void* ptr, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    pFree(ptr);
 }

 void pFree(const void* ptr) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    pFree((void*) ptr);
 }
 void pFree(const void* ptr, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    pFree((void*) ptr, allocatorState);
 }

@ -124,7 +124,7 @@ void pFree(const void* ptr, void* allocatorState) {
 static bool DefaultAllocatorInited = false;
 static Allocator DefaultAllocator;
 static void defaultAllocatorInit() {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    DefaultAllocator.state = null;
    DefaultAllocator.mallocate = pMalloc;
    DefaultAllocator.callocate = pCalloc;
@ -134,7 +134,7 @@ static void defaultAllocatorInit() {
 }

 Allocator* Allocator::GetDefault() {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    if (!DefaultAllocatorInited) defaultAllocatorInit();
    return &DefaultAllocator;
 }
@ -142,7 +142,7 @@ Allocator* Allocator::GetDefault() {
 //================================================================================ 
 // alignment should be a power of 2
 static u64 alignForward2(u64 ptr, size_t alignment) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    u64 p, a, modulo;

    p = ptr;
@ -156,14 +156,14 @@ static u64 alignForward2(u64 ptr, size_t alignment) {
    return p;
 }
 static u64 alignForward(u64 ptr, size_t alignment) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return ((ptr + alignment - 1) / alignment) * alignment;
 }

 //================================================================================ 
 // Scratch/Arena
 Arena* Arena::Init(u32 sizeInBytes) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    Arena* arena = (Arena*) pMalloc(sizeof(Arena));
    arena->index = 0;
    arena->buffer = (u8*) pMalloc(sizeof(u8) * sizeInBytes);
@ -171,7 +171,7 @@ Arena* Arena::Init(u32 sizeInBytes) {
    return arena;
 }
 void* Arena::Alloc(u32 sizeInBytes) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    u8* p = this->buffer + this->index;
    u32 offset = (u32) alignForward2((u64) p, 64);

@ -187,7 +187,7 @@ void* Arena::Alloc(u32 sizeInBytes) {
    return null;
 }
 void Arena::Clear() {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    this->index = 0;
 }
 //================================================================================ 
--- a/alloc.h
+++ b/alloc.h
@ -1,7 +1,8 @@

-#ifndef ALLOC_H
-#define ALLOC_H
+#ifndef ULE_ALLOC_H
+#define ULE_ALLOC_H

+#include "config.h"
 #include "types.h"


--- a/array.hpp
+++ b/array.hpp
@ -1,9 +1,10 @@

-#ifndef ARRAY_H
-#define ARRAY_H
+#ifndef ULE_ARRAY_H
+#define ULE_ARRAY_H

 #include <new> // operator new, operator delete

+#include "config.h"
 #include "alloc.h" // allocators...
 #include "serialize.h" // serialization
 #include "string.h" // String::memcpy
@ -32,21 +33,28 @@ struct Array {
    T* data;

    Array<T>(u32 _capacity = 8) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        this->length   = 0;
        this->capacity = _capacity;
        this->data     = (T*) pCalloc(sizeof (T), _capacity);
    }
    void* operator new(size_t size) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        return pMalloc((u32) size);
    }

    void checkIfShouldGrow() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        if (this->isFull()) {
            // optimal number as you approach infinite elements approaches PHI, but 1.5 sometimes works better for finite sizes
-            // more testing is probably needed
+            //
+            // it seems that a commonly chosen growth rate of '2' is perhaps the worst possible choice.
+            // if you grow at a rate of 2x, you end up (likely) never being able to re-use the freed 'hole' in the heap
+            // for a future allocation of the same kind.
+            // useful reading for those interested in their own dynamic array implementations:
+            // (facebook's vector impl, a strictly better std::vector)
+            // https://github.com/facebook/folly/blob/main/folly/docs/FBVector.md
+            //
            this->capacity = (u32) (this->capacity * 1.5);
            this->data = (T*) pRealloc(data, sizeof(T) * this->capacity);
        }
@ -54,7 +62,7 @@ struct Array {

    // for when the order in the array doesn't matter, move the end of the array into the removed slot
    void removeSwapWithEnd(u32 index) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        if (this->isEmpty()) return; // overhead, maybe assert instead?

        u32 end = this->length - 1;
@ -65,7 +73,7 @@ struct Array {
    }

    void removeSwapWithEnd(T* addr) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (u32 i = 0; i < this->length; i++) {
            if ((this->data + i) == addr) {
                removeSwapWithEnd(i);
@ -75,7 +83,7 @@ struct Array {
    }

    void removeAndShrink(u32 index) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (u32 i = index + 1; i < this->length; i++) {
            String::memcpy(this->data[i - 1], this->data[i], sizeof(T));
        }
@ -83,7 +91,7 @@ struct Array {
    }

    void removeAndShrink(T* elementAddr) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        s32 index = -1;
        for (u32 i = 0; i < this->length; i++) {
            if ((this->data + i) == elementAddr) {
@ -103,7 +111,7 @@ struct Array {
    }

    T pop() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        if (this->isEmpty()) {
            die("empty");
        }
@ -114,7 +122,7 @@ struct Array {
    // sometimes, you want to copy some POD data on the stack to the next position in the internal array
    // that's what this does
    u32 pushCopy(T* e) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        this->checkIfShouldGrow();

        String::memcpy((void*) &this->data[this->length++], e, sizeof(T));
@ -126,14 +134,14 @@ struct Array {
    // it is irresponsible to call this and then not store a T in that address. this increments length,
    // reserving the next spot for you.
    T* pushNextAddrPromise() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        this->checkIfShouldGrow();

        return &this->data[this->length++];
    }

    u32 push(T e) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        this->checkIfShouldGrow();

        this->data[this->length++] = e;
@ -142,7 +150,7 @@ struct Array {
    }

    u32 pushMany(T* elements, u32 count) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        // ensure we have capacity. if we have to realloc multiple times that can suck,
        // but should be avoidable in practice by having an appropriately large initial capacity
        while (this->capacity < (this->length + count)) {
@ -159,7 +167,7 @@ struct Array {
    }

    void reverse() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 count = this->length / 2;

        for (u32 i = 0; i < count; i++) {
@ -172,7 +180,7 @@ struct Array {
    }

    T shift() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        if (this->length == 0) {
            return null;
        }
@ -188,7 +196,7 @@ struct Array {
    }

    T unshift(T e) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        this->checkIfShouldGrow();

        for (u32 i = 0; i < this->length; i++) {
@ -202,7 +210,7 @@ struct Array {
    }

    T peek() const {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        if (this->isEmpty()) {
            return null;
        }
@ -211,24 +219,25 @@ struct Array {
    }

    bool isEmpty() const {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        return this->length == 0;
    }

    bool isFull() const {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        return this->length == this->capacity;
    }

    void clear() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        this->length = 0;
    }
 };

-template <typename T>
+#ifdef ULE_CONFIG_OPTION_SERIALIZATION
+extern template <typename T>
 static void serialize(String* str, Array<T> array) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    serialize(str, array.length);
    serialize(str, array.capacity);
    for (u32 i = 0; i < array.length; i++) {
@ -236,9 +245,9 @@ static void serialize(String* str, Array<T> array) {
    }
 }

-template <typename T>
+extern template <typename T>
 static void serialize(String* str, Array<T>* array) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    SERIALIZE_HANDLE_NULL(str, array);
    serialize(str, array->length);
    serialize(str, array->capacity);
@ -247,9 +256,9 @@ static void serialize(String* str, Array<T>* array) {
    }
 }

-template <typename T>
+extern template <typename T>
 static void deserialize(char** buffer, Array<T>* array) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    deserialize(buffer, &array->length);
    deserialize(buffer, &array->capacity);
    for (u32 i = 0; i < array->length; i++) {
@ -257,9 +266,9 @@ static void deserialize(char** buffer, Array<T>* array) {
    }
 }

-template <typename T>
+extern template <typename T>
 static void deserialize(char** buffer, Array<T>** array) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    DESERIALIZE_HANDLE_NULL(buffer, array);
    u32 length, capacity;
    deserialize(buffer, &length);
@ -271,5 +280,7 @@ static void deserialize(char** buffer, Array<T>** array) {
    }
    *array = _array;
 }
+#endif // ULE_CONFIG_OPTION_SERIALIZATION

 #endif
+
--- a/config.h
+++ b/config.h
@ -0,0 +1,15 @@
+
+#pragma once
+#ifndef ULE_CONFIG_H
+#define ULE_CONFIG_H
+
+// define this macro to include the serialization code `serialize.h/.cpp`, as well as serialization
+// for the hashtable(s) and array implementations.
+//#define ULE_CONFIG_OPTION_SERIALIZATION
+
+// all functions in the library will invoke a semicolon-terminated macro as their first line of execution.
+// this is for use by an intrusive profiler, though it could be used for any purpose.
+//#define ULE_CONFIG_OPTION_FTAG ZoneScoped
+
+#endif
+
--- a/cpuid.cpp
+++ b/cpuid.cpp
@ -45,7 +45,7 @@ static const char* szFeatures[] = {
 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2008/hskdteyh(v=vs.90)?redirectedfrom=MSDN
 #include <intrin.h>
 void cpuid() {
-	TYPES_H_FTAG;
+	ULE_TYPES_H_FTAG;
    int nSteppingID = 0;
    int nModel = 0;
    int nFamily = 0;
@ -142,7 +142,7 @@ void cpuid() {

 #else
 void cpuid() {
-	TYPES_H_FTAG;
+	ULE_TYPES_H_FTAG;
 }

 #endif
--- a/cpuid.h
+++ b/cpuid.h
@ -1,6 +1,8 @@

-#ifndef CPUID_H
-#define CPUID_H
+#ifndef ULE_CPUID_H
+#define ULE_CPUID_H
+
+#include "config.h"

 void cpuid();

--- a/file.cpp
+++ b/file.cpp
@ -11,11 +11,11 @@


 FILE* File::Open(const char* path, const char* mode) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return fopen(path, mode);
 }
 FILE* File::Open(const char* path, size_t* outSize, const char* mode) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    FILE* fp = File::Open(path, mode);

    if (fp == null) {
@ -34,7 +34,7 @@ void File::Close(FILE* file) {
 }

 size_t File::Size(const char* path) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    FILE* fp = File::Open(path);
    // get the file's size in bytes
    fseek(fp, 0, SEEK_END);
@ -44,7 +44,7 @@ size_t File::Size(const char* path) {
    return size;
 }
 size_t File::Size(FILE* fp) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    fseek(fp, 0, SEEK_END);
    size_t size = ftell(fp);
    fseek(fp, 0L, SEEK_SET);
@ -52,7 +52,7 @@ size_t File::Size(FILE* fp) {
 }

 u8* File::Read(const char* path) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    FILE* fp = File::Open(path, "rb");

    if (fp == null) {
@ -73,7 +73,7 @@ u8* File::Read(const char* path) {
    return (u8*) buffer;
 }
 u8* File::Read(const char* path, size_t* outSize) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    FILE* fp = File::Open(path, "rb");

    if (fp == null) {
@ -98,7 +98,7 @@ u8* File::Read(const char* path, size_t* outSize) {
    return (u8*) buffer;
 }
 size_t File::Read(FILE* fp, void* destination) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;

    fseek(fp, 0, SEEK_END);
    size_t size = ftell(fp);
@ -108,12 +108,12 @@ size_t File::Read(FILE* fp, void* destination) {
    return size;
 }
 size_t File::Read(FILE* fp, void* destination, size_t size) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    return fread(destination, sizeof (char), size + 1, fp);
 }

 s32 File::Write(const char* path, char* data, u32 count) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    FILE* fp = File::Open(path, "wb");

    if (fp == null) {
@ -134,7 +134,7 @@ s32 File::Write(const char* path, char* data, u32 count) {
 #include <windows.h>
 // writes the filenames into the provided array |outFileNames|, must be allocated ahead of time.
 void File::GetFileNamesInFolder(const char* path, Array<char*>* outFileNames) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    massert(path != null, "provided 'null' for path argument");
    massert(outFileNames != null, "provided 'null' for array argument");
    WIN32_FIND_DATAA findData;
@ -160,7 +160,7 @@ void File::GetFileNamesInFolder(const char* path, Array<char*>* outFileNames) {
 #else
 #include <dirent.h>
 void File::GetFileNamesInFolder(const char* path, Array<char*>* outFileNames) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    massert(path != null, "provided 'null' for path argument");
    massert(outFileNames != null, "provided 'null' for array argument");
    DIR* dir = opendir(path);
@ -189,7 +189,7 @@ void File::GetFileNamesInFolder(const char* path, Array<char*>* outFileNames) {
 #endif

 time_t File::LastModified(const char* path) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    struct stat result;
    if (stat(path, &result) == 0) {
        return result.st_mtime;
--- a/file.h
+++ b/file.h
@ -1,10 +1,11 @@

-#ifndef FILE_H
-#define FILE_H
+#ifndef ULE_FILE_H
+#define ULE_FILE_H

 #include <stdio.h> // FILE
 #include <sys/types.h> // time_t

+#include "config.h"
 #include "array.hpp"


--- a/print.cpp
+++ b/print.cpp
@ -10,12 +10,12 @@


 void vprint(const char* format, va_list args) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    vfprintf(stdout, format, args);
 }

 void vprintln(const char* format, va_list args) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    vprint(format, args);
    print("\n");
 }
@ -25,7 +25,7 @@ void vprintln(const char* format, va_list args) {
 * +we intend to replace printf at some point with this
 */
 void print(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    if (format == null) { print("null"); return; }

    va_list args;
@ -37,7 +37,7 @@ void print(const char* format, ...) {
 }

 void println(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    if (format == null) { print("null\n"); return; }

    va_list args;
@ -58,7 +58,7 @@ void println(const char* format, ...) {
 #include <dbghelp.h>
 // if |string| is non-null, then the stack trace will be concatenated to it instead of being printed to stdout.
 void trace(String* string) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;

    #define BACKTRACE_MAX_FUNCTION_NAME_LENGTH 1024
    HANDLE processHandle = GetCurrentProcess();
@ -105,7 +105,7 @@ void trace(String* string) {
 #include <cxxabi.h> // abi::__cxa_demangle
 // if |string| is non-null, then the stack trace will be concatenated to it instead of being printed to stdout.
 void trace(String* string) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;

    void* stack[BACKTRACE_MAX_FRAMES];
    u32 stackSize = backtrace(stack, BACKTRACE_MAX_FRAMES);
@ -156,7 +156,7 @@ void trace(String* string) {
 #endif

 void _debug(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    if (format == null) {
        print("%sdebug:%s null\n", ANSI_BLUE, ANSI_RESET);
        return;
@ -172,7 +172,7 @@ void _debug(const char* format, ...) {
 }

 void _warn(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    if (format == null) {
        print("%swarning:%s null\n", ANSI_YELLOW, ANSI_RESET);
        return;
@ -198,7 +198,7 @@ void setCustomDieBehavior(void (*dieBehavior)(const char* string)) {
 // if a fatal error should not occur at runtime on a release binary, consider preferring 'massert'
 // it's unclear when you should use asserts vs. die actually. idk man
 void die(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    if (format == null) {
        if (customDie == null) {
            print("%serror:%s (unspecified error)\n", ANSI_RED, ANSI_RESET);
@ -239,42 +239,43 @@ void die(const char* format, ...) {
    }
 }

-void print(bool b)         { TYPES_H_FTAG; print("%s", b ? "true" : "false"); }
-void print(char c)         { TYPES_H_FTAG; print("%c", c); }
-void print(signed int i)   { TYPES_H_FTAG; print("%d", i); }
-void print(unsigned int i) { TYPES_H_FTAG; print("%u", i); }
-void print(float f)        { TYPES_H_FTAG; print("%.14g", f); }
-void print(double d)       { TYPES_H_FTAG; print("%.14g", d); }
-void print(void* p)        { TYPES_H_FTAG; print("%p", p); }
-void print(char* s)        { TYPES_H_FTAG; print("%s", s); }
+void print(bool b)         { ULE_TYPES_H_FTAG; print("%s", b ? "true" : "false"); }
+void print(char c)         { ULE_TYPES_H_FTAG; print("%c", c); }
+void print(signed int i)   { ULE_TYPES_H_FTAG; print("%d", i); }
+void print(unsigned int i) { ULE_TYPES_H_FTAG; print("%u", i); }
+void print(float f)        { ULE_TYPES_H_FTAG; print("%.14g", f); }
+void print(double d)       { ULE_TYPES_H_FTAG; print("%.14g", d); }
+void print(void* p)        { ULE_TYPES_H_FTAG; print("%p", p); }
+void print(char* s)        { ULE_TYPES_H_FTAG; print("%s", s); }

 #ifndef _WIN32
-void print(size_t i) { TYPES_H_FTAG; print("%u", i); }
-void println(size_t i) { TYPES_H_FTAG; print(i); print("\n"); }
+void print(size_t i) { ULE_TYPES_H_FTAG; print("%u", i); }
+void println(size_t i) { ULE_TYPES_H_FTAG; print(i); print("\n"); }
 #endif

-void println(bool b)         { TYPES_H_FTAG; print(b); print("\n"); }
-void println(char c)         { TYPES_H_FTAG; print(c); print("\n"); }
-void println(signed int i)   { TYPES_H_FTAG; print(i); print("\n"); }
-void println(unsigned int i) { TYPES_H_FTAG; print(i); print("\n"); }
-void println(float f)        { TYPES_H_FTAG; print(f); print("\n"); }
-void println(double d)       { TYPES_H_FTAG; print(d); print("\n"); }
-void println(void* p)        { TYPES_H_FTAG; print(p); print("\n"); }
-void println(char* s)        { TYPES_H_FTAG; print(s); print("\n"); }
-void println()               { TYPES_H_FTAG;           print("\n"); }
-
-#ifdef _USING_GLM_TYPES__
-void print(glm::vec<2, float, (glm::qualifier) 3> v)      { TYPES_H_FTAG; print("vec2: %.14g,%.14g", v.x, v.y); }
-void print(glm::vec<3, float, (glm::qualifier) 3> v)      { TYPES_H_FTAG; print("vec3: %.14g,%.14g,%.14g", v.x, v.y, v.z); }
-void print(glm::vec<4, float, (glm::qualifier) 3> v)      { TYPES_H_FTAG; print("vec4: %.14g,%.14g,%.14g,%.14g", v.x, v.y, v.z, v.w); }
-void print(glm::mat<2, 2, float, (glm::qualifier) 3> m)   { TYPES_H_FTAG; print("mat2: "); print(m[0]); print(m[1]); }
-void print(glm::mat<3, 3, float, (glm::qualifier) 3> m)   { TYPES_H_FTAG; print("mat3: "); print(m[0]); print(m[1]); print(m[2]); }
-void print(glm::mat<4, 4, float, (glm::qualifier) 3> m)   { TYPES_H_FTAG; print("mat4: "); print(m[0]); print(m[1]); print(m[2]); print(m[3]); }
-
-void println(glm::vec<2, float, (glm::qualifier) 3> v)    { TYPES_H_FTAG; print(v); print("\n"); }
-void println(glm::vec<3, float, (glm::qualifier) 3> v)    { TYPES_H_FTAG; print(v); print("\n"); }
-void println(glm::vec<4, float, (glm::qualifier) 3> v)    { TYPES_H_FTAG; print(v); print("\n"); }
-void println(glm::mat<2, 2, float, (glm::qualifier) 3> m) { TYPES_H_FTAG; print(m); print("\n"); }
-void println(glm::mat<3, 3, float, (glm::qualifier) 3> m) { TYPES_H_FTAG; print(m); print("\n"); }
-void println(glm::mat<4, 4, float, (glm::qualifier) 3> m) { TYPES_H_FTAG; print(m); print("\n"); }
-#endif
+void println(bool b)         { ULE_TYPES_H_FTAG; print(b); print("\n"); }
+void println(char c)         { ULE_TYPES_H_FTAG; print(c); print("\n"); }
+void println(signed int i)   { ULE_TYPES_H_FTAG; print(i); print("\n"); }
+void println(unsigned int i) { ULE_TYPES_H_FTAG; print(i); print("\n"); }
+void println(float f)        { ULE_TYPES_H_FTAG; print(f); print("\n"); }
+void println(double d)       { ULE_TYPES_H_FTAG; print(d); print("\n"); }
+void println(void* p)        { ULE_TYPES_H_FTAG; print(p); print("\n"); }
+void println(char* s)        { ULE_TYPES_H_FTAG; print(s); print("\n"); }
+void println()               { ULE_TYPES_H_FTAG;           print("\n"); }
+
+#ifdef ULE_CONFIG_OPTION_USE_GLM
+void print(glm::vec<2, float, (glm::qualifier) 3> v)      { ULE_TYPES_H_FTAG; print("vec2: %.14g,%.14g", v.x, v.y); }
+void print(glm::vec<3, float, (glm::qualifier) 3> v)      { ULE_TYPES_H_FTAG; print("vec3: %.14g,%.14g,%.14g", v.x, v.y, v.z); }
+void print(glm::vec<4, float, (glm::qualifier) 3> v)      { ULE_TYPES_H_FTAG; print("vec4: %.14g,%.14g,%.14g,%.14g", v.x, v.y, v.z, v.w); }
+void print(glm::mat<2, 2, float, (glm::qualifier) 3> m)   { ULE_TYPES_H_FTAG; print("mat2: "); print(m[0]); print(m[1]); }
+void print(glm::mat<3, 3, float, (glm::qualifier) 3> m)   { ULE_TYPES_H_FTAG; print("mat3: "); print(m[0]); print(m[1]); print(m[2]); }
+void print(glm::mat<4, 4, float, (glm::qualifier) 3> m)   { ULE_TYPES_H_FTAG; print("mat4: "); print(m[0]); print(m[1]); print(m[2]); print(m[3]); }
+
+void println(glm::vec<2, float, (glm::qualifier) 3> v)    { ULE_TYPES_H_FTAG; print(v); print("\n"); }
+void println(glm::vec<3, float, (glm::qualifier) 3> v)    { ULE_TYPES_H_FTAG; print(v); print("\n"); }
+void println(glm::vec<4, float, (glm::qualifier) 3> v)    { ULE_TYPES_H_FTAG; print(v); print("\n"); }
+void println(glm::mat<2, 2, float, (glm::qualifier) 3> m) { ULE_TYPES_H_FTAG; print(m); print("\n"); }
+void println(glm::mat<3, 3, float, (glm::qualifier) 3> m) { ULE_TYPES_H_FTAG; print(m); print("\n"); }
+void println(glm::mat<4, 4, float, (glm::qualifier) 3> m) { ULE_TYPES_H_FTAG; print(m); print("\n"); }
+#endif // ULE_CONFIG_OPTION_USE_GLM
+
--- a/print.h
+++ b/print.h
@ -1,9 +1,10 @@

-#ifndef PRINT_H
-#define PRINT_H
+#ifndef ULE_PRINT_H
+#define ULE_PRINT_H

 #include <stdarg.h> // va_list

+#include "config.h"
 #include "string.h"
 #include "types.h"

@ -135,7 +136,7 @@ extern void println(void* p);
 extern void println(char* s);
 extern void println();

-#ifdef _USING_GLM_TYPES__
+#ifdef ULE_CONFIG_OPTION_USE_GLM
 extern void print(glm::vec<2, float, (glm::qualifier) 3>);
 extern void print(glm::vec<3, float, (glm::qualifier) 3>);
 extern void print(glm::vec<4, float, (glm::qualifier) 3>);
--- a/serialize.cpp
+++ b/serialize.cpp
@ -1,4 +1,6 @@

+#ifdef ULE_CONFIG_OPTION_SERIALIZATION
+
 #include <fast_float/fast_float.h>

 #include "types.h"
@ -7,52 +9,52 @@
 #include "print.h"


-static inline const char* getFormatStringOut(u8     v) { TYPES_H_FTAG; return "%hu\n"; }
-static inline const char* getFormatStringOut(u16    v) { TYPES_H_FTAG; return "%hu\n"; }
-static inline const char* getFormatStringOut(u32    v) { TYPES_H_FTAG; return "%u\n"; }
-static inline const char* getFormatStringOut(u64    v) { TYPES_H_FTAG; return "%llu\n"; }
+static inline const char* getFormatStringOut(u8     v) { ULE_TYPES_H_FTAG; return "%hu\n"; }
+static inline const char* getFormatStringOut(u16    v) { ULE_TYPES_H_FTAG; return "%hu\n"; }
+static inline const char* getFormatStringOut(u32    v) { ULE_TYPES_H_FTAG; return "%u\n"; }
+static inline const char* getFormatStringOut(u64    v) { ULE_TYPES_H_FTAG; return "%llu\n"; }

-static inline const char* getFormatStringOut(s8     v) { TYPES_H_FTAG; return "%hd\n"; }
-static inline const char* getFormatStringOut(s16    v) { TYPES_H_FTAG; return "%hd\n"; }
-static inline const char* getFormatStringOut(s32    v) { TYPES_H_FTAG; return "%d\n"; }
-static inline const char* getFormatStringOut(s64    v) { TYPES_H_FTAG; return "%lld\n"; }
+static inline const char* getFormatStringOut(s8     v) { ULE_TYPES_H_FTAG; return "%hd\n"; }
+static inline const char* getFormatStringOut(s16    v) { ULE_TYPES_H_FTAG; return "%hd\n"; }
+static inline const char* getFormatStringOut(s32    v) { ULE_TYPES_H_FTAG; return "%d\n"; }
+static inline const char* getFormatStringOut(s64    v) { ULE_TYPES_H_FTAG; return "%lld\n"; }

-static inline const char* getFormatStringOut(float  v) { TYPES_H_FTAG; return "%f\n"; }
-static inline const char* getFormatStringOut(double v) { TYPES_H_FTAG; return "%f\n"; }
+static inline const char* getFormatStringOut(float  v) { ULE_TYPES_H_FTAG; return "%f\n"; }
+static inline const char* getFormatStringOut(double v) { ULE_TYPES_H_FTAG; return "%f\n"; }

 // important constraint - strings need to be wrapped in double-quotes.
 // the sentinel value 'null' without quotations is used to denote null values, which means
 // if strings were not wrapped in double quotes, you would not be able to distinguish null
 // values from the literal string "null".
-static inline const char* getFormatStringOut(char*       v) { TYPES_H_FTAG; return "\"%s\"\n"; }
-static inline const char* getFormatStringOut(const char* v) { TYPES_H_FTAG; return "\"%s\"\n"; }
+static inline const char* getFormatStringOut(char*       v) { ULE_TYPES_H_FTAG; return "\"%s\"\n"; }
+static inline const char* getFormatStringOut(const char* v) { ULE_TYPES_H_FTAG; return "\"%s\"\n"; }

 #ifdef _USING_GLM_TYPES__
-static inline const char* getFormatStringOut(glm::vec<2, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f\n"; }
-static inline const char* getFormatStringOut(glm::vec<3, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f\n"; }
-static inline const char* getFormatStringOut(glm::vec<4, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::vec<2, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f\n"; }
+static inline const char* getFormatStringOut(glm::vec<3, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::vec<4, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f %f\n"; }

-static inline const char* getFormatStringOut(glm::mat<2, 2, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f %f\n"; }
-static inline const char* getFormatStringOut(glm::mat<3, 3, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f %f %f %f %f %f %f\n"; }
-static inline const char* getFormatStringOut(glm::mat<4, 4, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::mat<2, 2, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::mat<3, 3, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f %f %f %f %f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::mat<4, 4, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n"; }

 #endif

 #define SERIALIZE_H_FUNC_BODY str->appendf(getFormatStringOut(v), v);
-void serialize(String* str, u8     v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, u16    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, u32    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, u64    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, s8     v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, s16    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, s32    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, s64    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, float  v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, double v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-
-template<typename T> // do I really need a template for this?
+void serialize(String* str, u8     v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, u16    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, u32    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, u64    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, s8     v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, s16    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, s32    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, s64    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, float  v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, double v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+
+extern template<typename T> // @TODO do not use a template for this.
 static inline void deserializeInteger(char** buffer, T* v) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    char* _buffer = *buffer;
    T value = 0;
    
@ -107,7 +109,7 @@ static const u32 BINARY32_MAX_CHARS = 14;
 static const u32 BINARY64_MAX_CHARS = 24;

 void deserialize(char** buffer, float*  v) { 
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    char* _buffer = *buffer;
    while (String::isAsciiWhitespace(*_buffer)) _buffer++; 
    fast_float::from_chars_result result = fast_float::from_chars(_buffer, _buffer + BINARY32_MAX_CHARS, *v);
@ -115,7 +117,7 @@ void deserialize(char** buffer, float*  v) {
    *buffer = (char*) result.ptr;
 }
 void deserialize(char** buffer, double* v) { 
-    TYPES_H_FTAG;  
+    ULE_TYPES_H_FTAG;  
    char* _buffer = *buffer;
    while (String::isAsciiWhitespace(*_buffer)) _buffer++; 
    fast_float::from_chars_result result = fast_float::from_chars(_buffer, _buffer + BINARY64_MAX_CHARS, *v);
@ -125,19 +127,19 @@ void deserialize(char** buffer, double* v) {

 #ifndef _WIN32
 // win32 doesn't treat size_t as different than a u64, which causes ambiguous function calls
-static inline const char* getFormatStringOut(size_t v) { TYPES_H_FTAG; return "%lu\n"; }
+static inline const char* getFormatStringOut(size_t v) { ULE_TYPES_H_FTAG; return "%lu\n"; }
 void serialize(String* str, size_t v) { SERIALIZE_H_FUNC_BODY }
 void deserialize(char** buffer, size_t* v) { SERIALIZE_H_DESERIALIZE_FUNC_BODY }
 #endif

 // STRING STUFF
 void serialize(String* str, char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    SERIALIZE_HANDLE_NULL(str, v);
    SERIALIZE_H_FUNC_BODY;
 }
 void serialize(String* str, const char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    SERIALIZE_HANDLE_NULL(str, v);
    SERIALIZE_H_FUNC_BODY;
 }
@ -160,7 +162,7 @@ void serialize(String* str, const char* v) {
 static char SERIALIZE_SCRATCH_BUFFER[SERIALIZE_SCRATCH_BUFFER_SIZE];

 static s32 deserializeString(char** buffer, char* v, s32 vSize) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    char* _buffer = *buffer;

    while (String::isAsciiWhitespace(*_buffer)) _buffer++; 
@ -181,7 +183,7 @@ static s32 deserializeString(char** buffer, char* v, s32 vSize) {
    return i;
 }
 static s32 deserializeString(char** buffer, char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    char* _buffer = *buffer;
    while (String::isAsciiWhitespace(*_buffer)) _buffer++;
    massert(_buffer[0] == '"', "expecting to deserialize a string, but found something other than a double quote");
@ -199,15 +201,15 @@ static s32 deserializeString(char** buffer, char* v) {
    return i;
 }
 void deserialize(char** buffer, char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    deserializeString(buffer, v);
 }
 void deserialize(char** buffer, const char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    deserializeString(buffer, (char*) v);
 }
 void deserialize(char** buffer, char** v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    
    DESERIALIZE_HANDLE_NULL(buffer, v);
    
@ -216,7 +218,7 @@ void deserialize(char** buffer, char** v) {
    *v = String::cpy(SERIALIZE_SCRATCH_BUFFER, (u32) i);
 }
 void deserialize(char** buffer, const char** v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    
    DESERIALIZE_HANDLE_NULL(buffer, (char*) v); // error: readonly variable is not assignable
    
@ -231,33 +233,33 @@ void deserialize(char** buffer, const char** v) {
 // have that template parameter == 3, so everything below becomes unresolved symbols if 
 // I don't do the nasty template garbage here
 void serialize(String* str, glm::vec<2, float, (glm::qualifier) (glm::qualifier) 3> v) { 
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    str->appendf(getFormatStringOut(v), v[0], v[1]); 
 }
 void serialize(String* str, glm::vec<3, float, (glm::qualifier) (glm::qualifier) 3> v) { 
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    str->appendf(getFormatStringOut(v), v[0], v[1], v[2]); 
 }
 void serialize(String* str, glm::vec<4, float, (glm::qualifier) 3> v) { 
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    str->appendf(getFormatStringOut(v), v[0], v[1], v[2], v[3]); 
 }

 void serialize(String* str, glm::mat<2, 2, float, (glm::qualifier) 3> v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    str->appendf(getFormatStringOut(v)
        , v[0][0], v[0][1]
        , v[1][0], v[1][1]);
 }
 void serialize(String* str, glm::mat<3, 3, float, (glm::qualifier) 3> v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    str->appendf(getFormatStringOut(v)
        , v[0][0], v[0][1], v[0][2]
        , v[1][0], v[1][1], v[1][2]
        , v[2][0], v[2][1], v[2][2]);
 }
 void serialize(String* str, glm::mat<4, 4, float, (glm::qualifier) 3> v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    str->appendf(getFormatStringOut(v)
        , v[0][0], v[0][1], v[0][2], v[0][3]
        , v[1][0], v[1][1], v[1][2], v[1][3]
@ -266,21 +268,21 @@ void serialize(String* str, glm::mat<4, 4, float, (glm::qualifier) 3> v) {
 }

 void deserialize(char** buffer, glm::vec<2, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    float* _v = (float*) v;
    for (u32 i = 0; i < 2; i++) {
        deserialize(buffer, _v + i);
    }
 }
 void deserialize(char** buffer, glm::vec<3, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    float* _v = (float*) v;
    for (u32 i = 0; i < 3; i++) {
        deserialize(buffer, _v + i);
    }
 }
 void deserialize(char** buffer, glm::vec<4, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    float* _v = (float*) v;
    for (u32 i = 0; i < 4; i++) {
        deserialize(buffer, _v + i);
@ -288,21 +290,21 @@ void deserialize(char** buffer, glm::vec<4, float, (glm::qualifier) 3>* v) {
 }

 void deserialize(char** buffer, glm::mat<2, 2, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    float* m = (float*) v;
    for (u32 i = 0; i < 4; i++) {
        deserialize(buffer, m + i);
    }
 }
 void deserialize(char** buffer, glm::mat<3, 3, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    float* m = (float*) v;
    for (u32 i = 0; i < 9; i++) {
        deserialize(buffer, m + i);
    }
 }
 void deserialize(char** buffer, glm::mat<4, 4, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    float* m = (float*) v;
    for (u32 i = 0; i < 16; i++) {
        deserialize(buffer, m + i);
@ -313,4 +315,5 @@ void deserialize(char** buffer, glm::mat<4, 4, float, (glm::qualifier) 3>* v) {
 #undef SERIALIZE_H_DESERIALIZE_FUNC_BODY

 #endif
+#endif

--- a/serialize.h
+++ b/serialize.h
@ -1,12 +1,15 @@

-#ifndef SERIALIZE_H
-#define SERIALIZE_H
+#ifdef ULE_CONFIG_OPTION_SERIALIZATION
+#ifndef ULE_SERIALIZE_H
+#define ULE_SERIALIZE_H


+#include "config.h"
 #include "print.h"
 #include "types.h"
 #include "string.h"

+
 /*
    NOTES ON SERIALIZATION
    after wrestling with various reflection libraries for a week, I decided to use none of them.
@ -72,13 +75,13 @@ extern void serialize(String* str, T v); \
 extern void deserialize(char** buffer, T* v); \
 extern void deserialize(char** buffer, T** v); \
 static void serializePrint(T* v) { \
-    TYPES_H_FTAG; \
+    ULE_TYPES_H_FTAG; \
    String str = String(""); \
    serialize(&str, v); \
    println(str.c_str()); \
 } \
 static bool serializeEquals(T* t1, T* t2) { \
-    TYPES_H_FTAG; \
+    ULE_TYPES_H_FTAG; \
    String s1 = String128f(""); \
    String s2 = String128f(""); \
    serialize(&s1, t1); \
@ -89,7 +92,7 @@ static bool serializeEquals(T* t1, T* t2) { \
 // if you implement deserialize with a T*.
 #define SERIALIZE_H_HELPER_CLONE_T_POINTER(T) \
 static void serializeClone(T* orig, T* destination) { \
-    TYPES_H_FTAG; \
+    ULE_TYPES_H_FTAG; \
    String str = String128f(""); \
    serialize(&str, orig); \
    char* buffer = str.c_str(); \
@ -99,7 +102,7 @@ static void serializeClone(T* orig, T* destination) { \
 // if you implement deserialize with a T**.
 #define SERIALIZE_H_HELPER_CLONE_T_DOUBLE_POINTER(T) \
 static void serializeClone(T* orig, T** destination) { \
-    TYPES_H_FTAG; \
+    ULE_TYPES_H_FTAG; \
    String str = String128f(""); \
    serialize(&str, orig); \
    char* buffer = str.c_str(); \
@ -206,3 +209,5 @@ extern void deserialize(char** buffer, glm::mat<4, 4, float, (glm::qualifier) 3>

 #endif

+#endif
+
--- a/signal-handler.h
+++ b/signal-handler.h
@ -1,9 +1,10 @@

-#ifndef SIGNAL_HANDLER_H
-#define SIGNAL_HANDLER_H
+#ifndef ULE_SIGNAL_HANDLER_H
+#define ULE_SIGNAL_HANDLER_H

 #include <signal.h> // for signal() and the SIG macros

+#include "config.h"
 #include "types.h"
 #include "print.h"

@ -11,7 +12,7 @@
 // the running process can receive and respond to a variety of platform-dependent 'signals' during runtime from the OS.
 // freebsd has something like 30 signals, windows has a subset, just 6. we'll just deal with 6.
 static inline void defaultHandler(s32 signal) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
    switch (signal) {
        case SIGSEGV:
        case SIGABRT:
@ -37,7 +38,7 @@ static inline void defaultHandler(s32 signal) {
 }

 static void setSignalHandlers(void(*handler)(s32 signal) = defaultHandler) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG;
    if (signal(SIGSEGV, handler) == SIG_ERR) die("failed to set SIGSEGV handler... zzz...\n");
    if (signal(SIGABRT, handler) == SIG_ERR) die("failed to set SIGABRT handler... zzz...\n");
    if (signal(SIGFPE,  handler) == SIG_ERR) die("failed to set SIGFPE handler... zzz...\n");
--- a/sse_mathfun.h
+++ b/sse_mathfun.h
@ -0,0 +1,710 @@
+/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+
+   The default is to use the SSE1 version. If you define USE_SSE2,
+   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
+   not expect any significant performance improvement with SSE2.
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <xmmintrin.h>
+
+/* yes I know, the top of this file is quite ugly
+
+   NOTE(review): this header shows no include guard or #pragma once, and the
+   functions it defines (log_ps, exp_ps, sin_ps, cos_ps, sincos_ps) are not
+   declared static or inline, so it presumably has to be included from
+   exactly one translation unit -- confirm before including it more widely. */
+
+#ifdef _MSC_VER /* visual c++: the alignment specifier goes before the type */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc: the alignment attribute goes after the declarator */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+/* __m128 is ugly to write */
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+
+#ifdef USE_SSE2
+# include <emmintrin.h>
+typedef __m128i v4si; // vector of 4 int (sse2)
+#else
+typedef __m64 v2si;   // vector of 2 int (mmx)
+#endif
+
+/* declare some SSE constants -- why can't I figure a better way to do that?
+
+   _PS_CONST(Name, Val)            defines _ps_Name,   four floats all equal to Val
+   _PI32_CONST(Name, Val)          defines _pi32_Name, four ints all equal to Val
+   _PS_CONST_TYPE(Name, Type, Val) defines _ps_Name,   four Type elements all equal to Val
+
+   Each array is static const and 16-byte aligned via ALIGN16_BEG/ALIGN16_END;
+   the functions below read them by casting the array to a v4sf/v4si/v2si
+   pointer and dereferencing it. */
+#define _PS_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000); // bits 23..30 set: the binary32 exponent field, despite the name
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); // every bit except 23..30 (sign and fraction bits)
+
+_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); // bit 31 only
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); // every bit except bit 31
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f); // 127, the binary32 exponent bias
+
+/* coefficients used by log_ps; cephes_log_q1 + cephes_log_q2 = 0.69314718056,
+   i.e. ln(2) split into a small and a large part */
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4);
+_PS_CONST(cephes_log_q2, 0.693359375);
+
+#ifndef USE_SSE2 /* SSE1+MMX build only: helpers to move data between one __m128 and two __m64 */
+typedef union xmm_mm_union {
+  __m128 xmm;   // the full 128-bit value
+  __m64 mm[2];  // the same bytes viewed as two 64-bit halves
+} xmm_mm_union;
+
+/* COPY_XMM_TO_MM writes the two halves of xmm_ into mm0_ and mm1_ through the union */
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+    xmm_mm_union u; u.xmm = xmm_;                   \
+    mm0_ = u.mm[0];                                 \
+    mm1_ = u.mm[1];                                 \
+}
+
+/* COPY_MM_TO_XMM is the reverse: mm0_ and mm1_ become the two halves of xmm_ */
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+  }
+
+#endif // USE_SSE2
+
+/* natural logarithm computed for 4 simultaneous float
+   return NaN for x <= 0 (this includes x == 0: those lanes are forced to an
+   all-ones bit pattern at the end, they do not become -infinity)
+
+   x: four packed floats; the result holds log(x) for each lane.
+
+   Outline, as implemented below:
+     1. remember which lanes are <= 0, then clamp x up to the smallest
+        normalized float
+     2. split x into an exponent e and a value in [0.5, 1) (frexpf-style)
+     3. if that value is below sqrt(1/2), use 2*x - 1 and e - 1, else x - 1
+     4. evaluate the cephes polynomial and add e * ln(2), with ln(2) applied
+        as the two parts cephes_log_q1 and cephes_log_q2
+
+   NOTE(review): defined in a header without static/inline; presumably the
+   header is meant for a single translation unit -- confirm.
+*/
+v4sf log_ps(v4sf x) {
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); /* all-ones in lanes where x <= 0 */
+
+  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
+
+#ifndef USE_SSE2
+  /* part 1: x = frexpf(x, &e); */
+  COPY_XMM_TO_MM(x, mm0, mm1);
+  mm0 = _mm_srli_pi32(mm0, 23); /* shift the exponent field down to bit 0 */
+  mm1 = _mm_srli_pi32(mm1, 23);
+#else
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); /* shift the exponent field down to bit 0 */
+#endif
+  /* keep only the fractional part: clear the exponent bits, then OR in the
+     bit pattern of 0.5 so the value lies in [0.5, 1) */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
+
+#ifndef USE_SSE2
+  /* now e=mm0:mm1 contain the really base-2 exponent (biased exponent minus 127) */
+  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
+  _mm_empty(); /* bye bye mmx */
+#else
+  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
+  v4sf e = _mm_cvtepi32_ps(emm0);
+#endif
+
+  e = _mm_add_ps(e, one); /* +1 because the fraction was normalized to [0.5, 1) rather than [1, 2) */
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+
+     done without branches: mask is all-ones where x < SQRTHF, so tmp is x in
+     those lanes and 0 elsewhere, and "one & mask" is 1 in those lanes only
+  */
+  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
+  v4sf tmp = _mm_and_ps(x, mask);
+  x = _mm_sub_ps(x, one);
+  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
+  x = _mm_add_ps(x, tmp);
+
+
+  v4sf z = _mm_mul_ps(x,x);
+
+  /* Horner evaluation of the degree-8 polynomial p0..p8 in x, followed by one
+     more multiplication by x and one by z = x*x */
+  v4sf y = *(v4sf*)_ps_cephes_log_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
+  y = _mm_mul_ps(y, x);
+
+  y = _mm_mul_ps(y, z);
+  
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1); /* e times the small part of ln(2) */
+  y = _mm_add_ps(y, tmp);
+
+
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); /* y -= 0.5 * x*x */
+  y = _mm_sub_ps(y, tmp);
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2); /* e times the large part of ln(2) */
+  x = _mm_add_ps(x, y);
+  x = _mm_add_ps(x, tmp);
+  x = _mm_or_ps(x, invalid_mask); // lanes with x <= 0 become all-ones bits, i.e. NaN
+  return x;
+}
+
+_PS_CONST(exp_hi,	88.3762626647949f); // exp_ps clamps its argument to [exp_lo, exp_hi]
+_PS_CONST(exp_lo,	-88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341); // log2(e)
+_PS_CONST(cephes_exp_C1, 0.693359375);         // C1 + C2 = 0.69314718056, i.e. ln(2) in two parts
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+v4sf exp_ps(v4sf x) { /* returns exp(x) for each of the 4 packed floats; x is clamped to [exp_lo, exp_hi] first */
+  v4sf tmp = _mm_setzero_ps(), fx;
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)):
+     fx = x * log2(e) + 0.5, floored below, is n */
+  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
+  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
+
+  /* how to perform a floorf with SSE: just below
+     (truncate to integer, convert back to float, then subtract 1 in the
+     lanes where the truncated value ended up greater than fx) */
+#ifndef USE_SSE2
+  /* step 1 : cast to int */
+  tmp = _mm_movehl_ps(tmp, fx); /* upper two lanes of fx moved to the low half of tmp */
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(tmp);
+  /* step 2 : cast back to float */
+  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+#endif
+  /* if greater, subtract 1 */
+  v4sf mask = _mm_cmpgt_ps(tmp, fx);    
+  mask = _mm_and_ps(mask, one); /* 1.0f in the lanes that need the correction, 0 elsewhere */
+  fx = _mm_sub_ps(tmp, mask);
+
+  /* g = x - n*C1 - n*C2, i.e. x minus n*ln(2) applied in two parts */
+  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
+  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
+  x = _mm_sub_ps(x, tmp);
+  x = _mm_sub_ps(x, z);
+
+  z = _mm_mul_ps(x,x);
+  
+  /* Horner evaluation of p0..p5 in x, then y = y * x*x + x + 1 */
+  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, x);
+  y = _mm_add_ps(y, one);
+
+  /* build 2^n: (n + 127) shifted left by 23 puts n plus the exponent bias
+     into the exponent field of a float whose other bits are zero */
+#ifndef USE_SSE2
+  z = _mm_movehl_ps(z, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(z);
+  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+  mm0 = _mm_slli_pi32(mm0, 23); 
+  mm1 = _mm_slli_pi32(mm1, 23);
+  
+  v4sf pow2n; 
+  COPY_MM_TO_XMM(mm0, mm1, pow2n);
+  _mm_empty(); /* done with the MMX registers */
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  v4sf pow2n = _mm_castsi128_ps(emm0);
+#endif
+  y = _mm_mul_ps(y, pow2n); /* exp(x) = exp(g) * 2^n */
+  return y;
+}
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);               // DP1 + DP2 + DP3 = 0.78539816339..., i.e. Pi/4 in three parts;
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); // stored negated, the trig functions below multiply by y
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);   // and add the products to x
+_PS_CONST(sincof_p0, -1.9515295891E-4); // sincof_*: coefficients of the sine polynomial
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005); // coscof_*: coefficients of the cosine polynomial
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
+   it runs also on old athlons XPs and the pentium III of your grand
+   mother.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Performance is also surprisingly good, 1.33 times faster than the
+   macos vsinf SSE2 function, and 1.5 times faster than the
+   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
+   too bad for an SSE1 function (with no special tuning) !
+   However the latter libraries probably have a much better handling of NaN,
+   Inf, denormalized and other special arguments..
+
+   On my core 1 duo, the execution of this function takes approximately 95 cycles.
+
+   From what I have observed on the experiments with Intel AMath lib, switching to an
+   SSE2 version would improve the perf by only 10%.
+
+   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
+   deliver full speed.
+
+   Outline, as implemented below:
+     - the sign of x is saved and |x| is scaled by 4/Pi
+     - the scaled value is truncated to an integer j, which is rounded up to
+       the next even value with j = (j+1) & ~1
+     - bit 2 of j (value 4) flips the sign of the result, bit 1 (value 2)
+       chooses between the two polynomials
+     - x is reduced by adding y * (-DP1), y * (-DP2), y * (-DP3), y being j as float
+     - both polynomials are evaluated for every lane and blended with a mask
+*/
+v4sf sin_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  /* get the swap sign flag: (j & 4) shifted left by 29 lands on bit 31 */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask 
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+     The mask is all-ones in lanes where (j & 2) == 0.
+  */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  /* get the swap sign flag: (j & 4) shifted left by 29 lands on bit 31 */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  /* get the polynom selection mask (all-ones in lanes where (j & 2) == 0) */
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf swap_sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3;
+     (the constants are stored negated, hence the additions) */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the cosine polynomial (coscof_* coefficients), with z = x*x:
+     y = ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1
+     This result is kept in the lanes where poly_mask is clear. */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the sine polynomial (sincof_* coefficients):
+     y2 = ((p0*z + p1)*z + p2)*z*x + x
+     This result is kept in the lanes where poly_mask is set. */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms:
+     (poly_mask & y2) + (~poly_mask & y) */
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+  return y;
+}
+
+/* almost the same as sin_ps: returns the cosine of each of the 4 packed floats.
+   Differences visible below: the input sign is discarded instead of saved,
+   2 is subtracted from j before the sign flag and the polynomial mask are
+   derived, and the sign flag is taken from the complement (~j & 4). */
+v4sf cos_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+  
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2); /* j - 2; y above still holds the unshifted j */
+  
+  /* get the swap sign flag: (~(j-2) & 4) shifted left by 29 lands on bit 31 */
+  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask (all-ones in lanes where ((j-2) & 2) == 0) */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2); /* j - 2; y above still holds the unshifted j */
+  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+  /* get the swap sign flag in mm0:mm1 and the 
+     polynom selection mask in mm2:mm3 */
+
+  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  v4sf sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3;
+     (the constants are stored negated, hence the additions) */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  
+  /* Evaluate the cosine polynomial (coscof_* coefficients), with z = x*x:
+     y = ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1
+     This result is kept in the lanes where poly_mask is clear. */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the sine polynomial (sincof_* coefficients):
+     y2 = ((p0*z + p1)*z + p2)*z*x + x
+     This result is kept in the lanes where poly_mask is set. */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms:
+     (poly_mask & y2) + (~poly_mask & y) */
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
+   it is almost as fast, and gives you a free cosine with your sine
+
+   x: four packed floats
+   s: output, receives the four sines   (written through the pointer)
+   c: output, receives the four cosines (written through the pointer)
+
+   Both polynomials are evaluated once; the sine takes one of them per lane
+   according to poly_mask and the cosine takes the other one. */
+void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2, emm4;
+#else
+  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+#endif
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm4 = emm2; /* keep a copy of j for the cosine sign computed further down */
+
+  /* get the swap sign flag for the sine: (j & 4) shifted left by 29 lands on bit 31 */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+  /* get the polynom selection mask for the sine (all-ones in lanes where (j & 2) == 0) */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm3 = _mm_movehl_ps(xmm3, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm3);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+  mm4 = mm2; /* keep a copy of j for the cosine sign computed further down */
+  mm5 = mm3;
+
+  /* get the swap sign flag for the sine: (j & 4) shifted left by 29 lands on bit 31 */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  v4sf swap_sign_bit_sin;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+  /* get the polynom selection mask for the sine (all-ones in lanes where (j & 2) == 0) */
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf poly_mask;
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+#endif
+
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3;
+     (the constants are stored negated, hence the additions) */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+#ifdef USE_SSE2
+  /* get the sign flag for the cosine: (~(j-2) & 4) shifted left by 29 */
+  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
+  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+  emm4 = _mm_slli_epi32(emm4, 29);
+  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
+#else
+  /* get the sign flag for the cosine: (~(j-2) & 4) shifted left by 29 */
+  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+  mm4 = _mm_slli_pi32(mm4, 29);
+  mm5 = _mm_slli_pi32(mm5, 29);
+  v4sf sign_bit_cos;
+  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+  _mm_empty(); /* good-bye mmx */
+#endif
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  
+  /* Evaluate the cosine polynomial (coscof_* coefficients), with z = x*x:
+     y = ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1 */
+  v4sf z = _mm_mul_ps(x,x);
+  y = *(v4sf*)_ps_coscof_p0;
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the sine polynomial (sincof_* coefficients):
+     y2 = ((p0*z + p1)*z + p2)*z*x + x */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms:
+     ysin2/ysin1 are the parts picked for the sine; subtracting them from
+     y2/y leaves the complementary parts, which are summed for the cosine */
+  xmm3 = poly_mask;
+  v4sf ysin2 = _mm_and_ps(xmm3, y2);
+  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2);
+  y = _mm_sub_ps(y, ysin1);
+
+  xmm1 = _mm_add_ps(ysin1,ysin2); /* sine before the sign is applied */
+  xmm2 = _mm_add_ps(y,y2);        /* cosine before the sign is applied */
+ 
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
--- a/string.h
+++ b/string.h
@ -1,11 +1,12 @@

-#ifndef STRING_H
-#define STRING_H
+#ifndef ULE_STRING_H
+#define ULE_STRING_H

+#include "config.h"
 #include "types.h"
 #include "alloc.h"

-#include <string.h> // @TODO remove this
+//#include <string.h> // @TODO remove this

 #define STB_SPRINTF_IMPLEMENTATION
 #define STB_SPRINTF_STATIC
@ -21,6 +22,7 @@

 // 'String' is a datatype, but it also is a namespace for a bunch of static 'char*' operations that 
 // you would normally find in the <cstring> or <string.h> header
+// The datatype is a modified version of a string class developed by Omar Cornut: https://github.com/ocornut/str
 class String {
 public:
    // Static empty buffer we can point to for empty strings
@ -49,7 +51,7 @@ public:
    };

    static inline s32 sprintf(char* buffer, const char* format, ...) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        va_list args;
        va_start(args, format);

@ -59,7 +61,7 @@ public:
        return code;
    }
    static inline s32 snprintf(char* buffer, s32 count, const char* format, ...) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        va_list args;
        va_start(args, format);

@ -73,31 +75,31 @@ public:
    }

    static inline bool isDigit(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // isDigit: true iff |c| lies in the character range '0'..'9'
        return (c >= '0') && (c <= '9');
    }
    static inline bool isAlpha(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // isAlpha: true iff |c| lies in 'A'..'Z' or 'a'..'z' (no locale lookup)
        return (c >= 'A' && c <= 'Z')
            || (c >= 'a' && c <= 'z');
    }
    static inline bool isHexDigit(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // isHexDigit: true iff |c| lies in '0'..'9', 'A'..'F' or 'a'..'f'
        return ((c >= '0') && (c <= '9'))
            || ((c >= 'A') && (c <= 'F'))
            || ((c >= 'a') && (c <= 'f'));
    }
    static inline bool isOctalDigit(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // isOctalDigit: true iff |c| lies in the character range '0'..'7'
        return (c >= '0') && (c <= '7');
    }
    static inline bool isBinaryDigit(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // isBinaryDigit: true iff |c| is the character '0' or '1'
        return c == '0' || c == '1';
    }

    static inline char* intToString(u64 integer) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 capacity = 10;
        u32* remainders = (u32*) pMalloc(sizeof (u32) * capacity);

@ -124,7 +126,7 @@ public:
    }

    static inline u64 hexStringToInt(const char* str) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u64 out = 0;

        while (*str != '\0') {
@ -148,7 +150,7 @@ public:
    }

    static inline u32 len(const char* string) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        const char* start = string;
        while (*string++ != '\0') {}
        return (u32) (string - start);
@ -156,7 +158,7 @@ public:

    // returns true if null-terminated strings |s1| and |s2| are equal
    static inline bool eq(const char* s1, const char* s2) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 l1 = String::len(s1);
        u32 l2 = String::len(s2);

@ -173,7 +175,7 @@ public:

    // same as |eq|, but handles |s1| and/or |s2| being null
    static inline bool eqNullCheck(const char* s1, const char* s2) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        if (s1 == null) {
            if (s2 == null) {
                return true;
@ -189,7 +191,7 @@ public:

    // heap allocates a copy of |string| and returns a pointer to it.
    static inline char* cpy(const char* string, u32 length, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        char* buffer = (char*) allocator->mallocate(sizeof (char) * (length + 1), allocator->state);

        u32 i = 0;
@ -202,18 +204,18 @@ public:

    // heap allocates a copy of |string| and returns a pointer to it.
    static inline char* cpy(const char* string, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // cpy: measures |string| with String::len and delegates to the (string, length, allocator) overload
        u32 len = String::len(string);

-        return String::cpy(string, len, allocator = Allocator::GetDefault());
+        // Forward the caller's |allocator| unchanged. The previous call argument
+        // `allocator = Allocator::GetDefault()` was an assignment expression, so a
+        // custom allocator passed by the caller was silently replaced by the default.
+        return String::cpy(string, len, allocator);
    }

    static inline bool memeq(const unsigned char* m1, const unsigned char* m2, size_t length) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // memeq: true iff memcmp reports the first |length| bytes of |m1| and |m2| as equal. NOTE(review): this diff comments out #include <string.h>; confirm memcmp is still declared through another header.
        return memcmp(m1, m2, length) == 0;
    }
    static inline bool memeq(const unsigned char* m1, size_t l1, const unsigned char* m2, size_t l2) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        if (l1 != l2) return false;

        return memeq(m1, m2, l1);
@ -221,13 +223,13 @@ public:

    #ifdef _WIN32
    static inline size_t wcharToChar(wchar_t* wstring, char* buffer, size_t maxBufferLength) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // wcharToChar: forwards to wcstombs(buffer, wstring, maxBufferLength) and returns its result unchanged (only compiled under _WIN32)
        return wcstombs(buffer, wstring, maxBufferLength);
    }
    #endif

    static inline void* memset(void* p, char c, u32 length) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        //__stosb((unsigned char*) p, c, length);
        char* a = (char*) p;
        for (u32 i = 0; i < length; i++) a[i] = c;
@ -236,7 +238,7 @@ public:


    static inline void memcpy(void* dest, void* src, u32 size) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u8* dest_ = (u8*) dest;
        u8* src_ = (u8*) src;

@ -247,7 +249,7 @@ public:

    // replace all instances of |c1| in |string| with |c2|
    static inline void replaceC(char* string, u32 length, char c1, char c2) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (u32 i = 0; i < length; i++) {
            if (string[i] == c1) {
                string[i] = c2;
@ -256,7 +258,7 @@ public:
    }

    static inline const char* firstCharOccurence(const char* string, u32 length, char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (s32 i = 0; i < length; i++) {
            const char* s = string + i;
            if (*s == c) {
@ -267,12 +269,12 @@ public:
    }

    static inline const char* firstCharOccurence(const char* string, char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // firstCharOccurence: convenience overload; passes String::len(string) as the length to the 3-argument overload
        return String::firstCharOccurence(string, String::len(string), c);
    }

    static inline const char* lastCharOccurence(const char* string, u32 length, char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (s32 i = length - 1; i >= 0; i--) { // @NOTE 'i' needs to be a signed int here...
            if (*(string + i) == c) {
                return string + i;
@ -282,19 +284,19 @@ public:
    }

    static inline const char* lastCharOccurence(const char* string, char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // lastCharOccurence: convenience overload; passes String::len(string) as the length to the 3-argument overload
        return String::lastCharOccurence(string, String::len(string), c);
    }

    static inline bool hasSuffix(const char* string, const char* suffix) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // hasSuffix: true iff null-terminated |string| ends with null-terminated |suffix|
-        const char* p = String::lastCharOccurence(string, String::len(string), suffix[0]);
-        if (p) return String::eq(p, suffix);
-        return false;
+        // The previous version anchored on the LAST occurrence of suffix[0], which
+        // rejects suffixes that repeat their first character: "xaa" ends with "aa",
+        // but the last 'a' only starts the one-character tail "a".
+        // Compare the tail of |string| that is exactly as long as |suffix| instead.
+        // Only the difference of the two String::len() results is used, so the
+        // offset is the same whether or not len() counts the terminator.
+        const u32 stringLength = String::len(string);
+        const u32 suffixLength = String::len(suffix);
+        if (suffixLength > stringLength) return false; // suffix longer than string
+        return String::eq(string + (stringLength - suffixLength), suffix);
    }

    static inline u32 countLines(const char* buffer) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 lines = 0;
        char c;

@ -306,7 +308,7 @@ public:
    }

    static inline bool isAscii(const char* buffer, u32 length) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        const unsigned char* ubuffer = (const unsigned char*) buffer;
        for (u32 i = 0; i < length; i++) {
            if (ubuffer[i] & 128) { // binary: 0b 1000 0000
@ -317,7 +319,7 @@ public:
    }

    static inline bool isAsciiWhitespace(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        switch (c) {
            //case '\b':
            //case '\v':
@ -339,7 +341,7 @@ public:
    //static inline char* trimStart(const char* str, u32 count);
    //static inline char* trimEnd(const char* str, u32 count);
    static inline char* trim(const char* str, u32 count, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 length = String::len(str);

        if (length <= count) {
@ -359,7 +361,7 @@ public:
    }

    static inline char* asciiToLower(const char* str, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 length = String::len(str);
        char* buffer = (char*) allocator->mallocate(sizeof (char) * length + 1, allocator->state);
        u32 i = 0;
@ -371,7 +373,7 @@ public:
    }

    static inline char* asciiToUpper(const char* str, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 length = String::len(str);
        char* buffer = (char*) allocator->mallocate(sizeof (char) * length + 1, allocator->state);
        u32 i = 0;
@ -383,7 +385,7 @@ public:
    }

    static inline char* concat(const char* str1, const char* str2, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 l1 = String::len(str1);
        u32 l2 = String::len(str2);
        u32 newLength = l1 + l2;
@ -401,7 +403,7 @@ public:
    }

    static inline u32 write(char* dest, const char* src, u32 length) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        u32 i = 0;
        for (; i < length; i++) {
            dest[i] = src[i];
@ -412,13 +414,13 @@ public:

    // returns the number of characters written.
    static inline u32 write(char* dest, const char* src) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // write: measures |src| with String::len and forwards to the 3-argument write; |dest| capacity is not checked here
        u32 length = String::len(src);
        return String::write(dest, src, length);
    }

    static inline char* read(const char* buffer, u32 length, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        char* tk = (char*) allocator->mallocate(sizeof (char) * length + 1, allocator->state);
        u32 i = 0;
        while (i < length) {
@ -434,38 +436,38 @@ public:
    int                 LocalBufSize : 10;      // Max 1023 bytes
    unsigned int        Owned : 1;              // Set when we have ownership of the pointed data (most common, unless using set_ref() method or StringRef constructor)

-    inline char*        c_str()                                 { TYPES_H_FTAG; return Data; }
-    inline const char*  c_str() const                           { TYPES_H_FTAG; return Data; }
-    inline bool         empty() const                           { TYPES_H_FTAG; return Data[0] == 0; }
-    inline int          length() const                          { TYPES_H_FTAG; return (int)strlen(Data); }    // by design, allow user to write into the buffer at any time
-    inline int          capacity() const                        { TYPES_H_FTAG; return Capacity; }
-    inline bool         owned() const                           { TYPES_H_FTAG; return Owned ? true : false; }
+    inline char*        c_str()                                 { ULE_TYPES_H_FTAG; return Data; } // mutable pointer to the character buffer
+    inline const char*  c_str() const                           { ULE_TYPES_H_FTAG; return Data; } // read-only pointer to the character buffer
+    inline bool         empty() const                           { ULE_TYPES_H_FTAG; return Data[0] == 0; } // true when the first byte is the terminator
+    inline int          length() const                          { ULE_TYPES_H_FTAG; return (int)strlen(Data); }    // recomputed with strlen() on every call: by design, allow user to write into the buffer at any time
+    inline int          capacity() const                        { ULE_TYPES_H_FTAG; return Capacity; } // value of the Capacity field
+    inline bool         owned() const                           { ULE_TYPES_H_FTAG; return Owned ? true : false; } // Owned bit-field converted to bool

-    inline char&        operator[](size_t i)                    { TYPES_H_FTAG; return Data[i]; }
-    inline char         operator[](size_t i) const              { TYPES_H_FTAG; return Data[i]; }
-    inline String&      operator=(const String& rhs)            { TYPES_H_FTAG; set(rhs); return *this; }
-    inline bool         operator==(const String& rhs) const     { TYPES_H_FTAG; return strcmp(c_str(), rhs.c_str()) == 0; }
-    inline String&      operator=(const char* rhs)              { TYPES_H_FTAG; set(rhs); return *this; }
-    inline bool         operator==(const char* rhs) const       { TYPES_H_FTAG; return strcmp(c_str(), rhs) == 0; }
+    inline char&        operator[](size_t i)                    { ULE_TYPES_H_FTAG; return Data[i]; } // unchecked index into Data, writable
+    inline char         operator[](size_t i) const              { ULE_TYPES_H_FTAG; return Data[i]; } // unchecked index into Data, by value
+    inline String&      operator=(const String& rhs)            { ULE_TYPES_H_FTAG; set(rhs); return *this; } // assignment goes through set(rhs)
+    inline bool         operator==(const String& rhs) const     { ULE_TYPES_H_FTAG; return strcmp(c_str(), rhs.c_str()) == 0; } // content equality via strcmp
+    inline String&      operator=(const char* rhs)              { ULE_TYPES_H_FTAG; set(rhs); return *this; } // assignment goes through set(rhs)
+    inline bool         operator==(const char* rhs) const       { ULE_TYPES_H_FTAG; return strcmp(c_str(), rhs) == 0; } // content equality via strcmp; |rhs| is passed to strcmp as-is

    inline String() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; // default ctor: points Data at EmptyBuffer and zeroes Capacity, LocalBufSize and Owned
        Data = EmptyBuffer; // Shared READ-ONLY initial buffer for 0 capacity
        Capacity = 0;
        LocalBufSize = 0;
        Owned = 0;
    }
    inline String(const String& rhs) : String() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; // copy ctor: default-initializes through String(), then set(rhs)
        set(rhs);
    }
    inline String(const char* rhs) : String() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; // C-string ctor: default-initializes through String(), then set(rhs)
        set(rhs);
    }

    inline void set_ref(const char* src) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        if (Owned && !is_using_local_buf())
            STR_MEMFREE(Data);
        Data = src ? (char*)src : EmptyBuffer;
@ -473,7 +475,7 @@ public:
        Owned = 0;
    }
    inline void set(const String& src) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        int buf_len = (int)strlen(src.c_str())+1;
        if ((int)Capacity < buf_len)
            reserve_discard(buf_len);
@ -482,7 +484,7 @@ public:
    }

    inline void set(const char* src) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        // We allow set(NULL) or via = operator to clear the string.
        if (src == NULL)
        {
@ -497,7 +499,7 @@ public:
    }

    inline void set(const char* src, const char* src_end) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        STR_ASSERT(src != NULL && src_end >= src);
        int buf_len = (int)(src_end-src)+1;
        if ((int)Capacity < buf_len)
@ -509,7 +511,7 @@ public:

    // Clear
    inline void clear() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        if (Owned && !is_using_local_buf())
            STR_MEMFREE(Data);
        if (LocalBufSize) {
@ -526,7 +528,7 @@ public:

-    // Reserve memory, preserving the current of the buffer
+    // Reserve memory, preserving the current contents of the buffer
    inline void reserve(int new_capacity) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        if (new_capacity <= Capacity)
            return;

@ -558,7 +560,7 @@ public:

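-    // Reserve memory, discarding the current of the buffer (if we expect to be fully rewritten)
+    // Reserve memory, discarding the current contents of the buffer (if we expect to be fully rewritten)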
    inline void reserve_discard(int new_capacity) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        if (new_capacity <= Capacity)
            return;

@ -578,7 +580,7 @@ public:
    }

    inline void shrink_to_fit() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        if (!Owned || is_using_local_buf()) return;
        int new_capacity = length() + 1;
        if (Capacity <= new_capacity) return;
@ -592,7 +594,7 @@ public:

    // FIXME: merge setfv() and appendfv()?
    inline int setfv(const char* fmt, va_list args) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        // Needed for portability on platforms where va_list are passed by reference and modified by functions
        va_list args2;
        va_copy(args2, args);
@ -612,7 +614,7 @@ public:
    }

    inline int setf(const char* fmt, ...) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        va_list args;
        va_start(args, fmt);
        int len = setfv(fmt, args);
@ -621,7 +623,7 @@ public:
    }

    inline int setfv_nogrow(const char* fmt, va_list args) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        STR_ASSERT(Owned);

        if (Capacity == 0) return 0;
@ -633,7 +635,7 @@ public:
    }

    inline int setf_nogrow(const char* fmt, ...) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        va_list args;
        va_start(args, fmt);
        int len = setfv_nogrow(fmt, args);
@ -642,7 +644,7 @@ public:
    }

    inline int append_from(int idx, char c) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        int add_len = 1;
        if (Capacity < idx + add_len + 1)
            reserve(idx + add_len + 1);
@ -653,7 +655,7 @@ public:
    }

    inline int append_from(int idx, const char* s, const char* s_end) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        if (!s_end) s_end = s + strlen(s);
        int add_len = (int)(s_end - s);
        if (Capacity < idx + add_len + 1) reserve(idx + add_len + 1);
@ -665,7 +667,7 @@ public:

    // FIXME: merge setfv() and appendfv()?
    inline int appendfv_from(int idx, const char* fmt, va_list args) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        // Needed for portability on platforms where va_list are passed by reference and modified by functions
        va_list args2;
        va_copy(args2, args);
@ -684,7 +686,7 @@ public:
    }

    inline int appendf_from(int idx, const char* fmt, ...) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        va_list args;
        va_start(args, fmt);
        int len = appendfv_from(idx, fmt, args);
@ -693,25 +695,25 @@ public:
    }

    inline int append(char c) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; // append: writes |c| at index length() via append_from and returns append_from's result
        int cur_len = length();
        return append_from(cur_len, c);
    }

    inline int append(const char* s, const char* s_end = null) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; // append: appends the range [s, s_end) at index length() via append_from; |s_end| defaults to null and is forwarded as-is
        int cur_len = length();
        return append_from(cur_len, s, s_end);
    }

    inline int appendfv(const char* fmt, va_list args) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // appendfv: formatted append starting at index length(); delegates to appendfv_from and returns its result
        int cur_len = length();
        return appendfv_from(cur_len, fmt, args);
    }

    int appendf(const char* fmt, ...) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        va_list args;
        va_start(args, fmt);
        int len = appendfv(fmt, args);
@ -727,13 +729,13 @@ public:
    }

 protected:
-    inline char*        local_buf()                             { TYPES_H_FTAG; return (char*)this + sizeof(String); }
-    inline const char*  local_buf() const                       { TYPES_H_FTAG; return (char*)this + sizeof(String); }
-    inline bool         is_using_local_buf() const              { TYPES_H_FTAG; return Data == local_buf() && LocalBufSize != 0; }
+    inline char*        local_buf()                             { ULE_TYPES_H_FTAG; return (char*)this + sizeof(String); } // address sizeof(String) bytes past |this|; presumably where derived types place their local_buf member - confirm layout
+    inline const char*  local_buf() const                       { ULE_TYPES_H_FTAG; return (char*)this + sizeof(String); } // const overload of the same address computation
+    inline bool         is_using_local_buf() const              { ULE_TYPES_H_FTAG; return Data == local_buf() && LocalBufSize != 0; } // true only if Data points at that address and a local buffer size was recorded

    // Constructor for StringXXX variants with local buffer
    String(unsigned short local_buf_size) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
        STR_ASSERT(local_buf_size < 1024);
        Data = local_buf();
        Data[0] = '\0';
@ -746,7 +748,7 @@ protected:
 // Literal/reference string
 class StringRef : public String {
 public:
-    StringRef(const char* s) : String() { TYPES_H_FTAG; set_ref(s); }
+    StringRef(const char* s) : String() { ULE_TYPES_H_FTAG; set_ref(s); } // default-initializes, then set_ref(s) points Data at |s| and leaves Owned = 0 (no copy visible here)
 };

 // Types embedding a local buffer
@ -757,12 +759,12 @@ class TYPENAME : public String
    char local_buf[LOCALBUFSIZE];                                                                  \
 public:                                                                                            \
    TYPENAME() : String(LOCALBUFSIZE) {}                                                           \
-    TYPENAME(const String& rhs) : String(LOCALBUFSIZE)   { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME(const char* rhs) : String(LOCALBUFSIZE)     { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME(const TYPENAME& rhs) : String(LOCALBUFSIZE) { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME&   operator=(const char* rhs)               { TYPES_H_FTAG; set(rhs); return *this; } \
-    TYPENAME&   operator=(const String& rhs)             { TYPES_H_FTAG; set(rhs); return *this; } \
-    TYPENAME&   operator=(const TYPENAME& rhs)           { TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME(const String& rhs) : String(LOCALBUFSIZE)   { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME(const char* rhs) : String(LOCALBUFSIZE)     { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME(const TYPENAME& rhs) : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME&   operator=(const char* rhs)               { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME&   operator=(const String& rhs)             { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME&   operator=(const TYPENAME& rhs)           { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
 };

 // Disable PVS-Studio warning V730: Not all members of a class are initialized inside the constructor (local_buf is not initialized and that is fine)
@ -773,7 +775,7 @@ public:
 class TYPENAME_F : public TYPENAME                                                  \
 {                                                                                   \
 public:                                                                             \
-    TYPENAME_F(const char* fmt, ...) : TYPENAME() { TYPES_H_FTAG; va_list args; va_start(args, fmt); setfv(fmt, args); va_end(args); } \
+    TYPENAME_F(const char* fmt, ...) : TYPENAME() { ULE_TYPES_H_FTAG; va_list args; va_start(args, fmt); setfv(fmt, args); va_end(args); } \
 };

 #ifdef __clang__
@ -812,16 +814,16 @@ STR_DEFINETYPE_F(String32, String32f)
 class TYPENAME : public String {                                                    \
    char local_buf[LOCALBUFSIZE];                                                   \
 public:                                                                             \
-    TYPENAME(const char* fmt, ...) : String(LOCALBUFSIZE) { TYPES_H_FTAG; va_list args; va_start(args, fmt); setfv(fmt, args); va_end(args); } \
-    TYPENAME()                     : String(LOCALBUFSIZE) { TYPES_H_FTAG; }                         \
-    TYPENAME(const String& rhs)    : String(LOCALBUFSIZE) { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME(const char* rhs)      : String(LOCALBUFSIZE) { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME(const TYPENAME& rhs)  : String(LOCALBUFSIZE) { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME& operator=(const char* rhs)                  { TYPES_H_FTAG; set(rhs); return *this; } \
-    TYPENAME& operator=(const String& rhs)                { TYPES_H_FTAG; set(rhs); return *this; } \
-    TYPENAME& operator=(const TYPENAME& rhs)              { TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME(const char* fmt, ...) : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; va_list args; va_start(args, fmt); setfv(fmt, args); va_end(args); } \
+    TYPENAME()                     : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; }                         \
+    TYPENAME(const String& rhs)    : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME(const char* rhs)      : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME(const TYPENAME& rhs)  : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME& operator=(const char* rhs)                  { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME& operator=(const String& rhs)                { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME& operator=(const TYPENAME& rhs)              { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
    void reserve(int new_capacity) { \
-        TYPES_H_FTAG; \
+        ULE_TYPES_H_FTAG; \
        if (new_capacity <= Capacity) \
            return; \
        char* new_data; \
@ -840,7 +842,7 @@ public:
        Owned = 1; \
    } \
    void reserve_discard(int new_capacity) { \
-        TYPES_H_FTAG; \
+        ULE_TYPES_H_FTAG; \
        if (new_capacity <= Capacity) \
            return; \
        if (Owned && !is_using_local_buf()) \
@ -921,7 +923,7 @@ bool isUnicodeSpaceSeparator(char c) {
 //};
 ////================================================================================ 
 //StringBuffer::StringBuffer(u32 initialSize) {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    this->length = 0;
 //    this->capacity = initialSize;
 //    this->data = (char*) pMalloc(sizeof(char) * this->capacity);
@ -938,7 +940,7 @@ bool isUnicodeSpaceSeparator(char c) {
 //}
 //
 //void StringBuffer::checkIfShouldGrow() {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    if (this->isFull()) {
 //        // optimal number as you approach infinite elements approaches PHI, but 1.5 sometimes works better for finite sizes
 //        // more testing is probably needed
@ -948,17 +950,17 @@ bool isUnicodeSpaceSeparator(char c) {
 //}
 //
 //bool StringBuffer::isEmpty() const {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    return this->length == 0;
 //}
 //
 //bool StringBuffer::isFull() const {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    return this->length == this->capacity;
 //}
 //
 //char StringBuffer::pop() {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    if (this->isEmpty()) {
 //        die("empty");
 //    }
@ -967,7 +969,7 @@ bool isUnicodeSpaceSeparator(char c) {
 //}
 //
 //u32 StringBuffer::append(char e) {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    this->checkIfShouldGrow();
 //
 //    this->data[this->length++] = e;
--- a/table.hpp
+++ b/table.hpp
@ -1,17 +1,17 @@

-#ifndef TABLE_H
-#define TABLE_H
+#ifndef ULE_TABLE_H
+#define ULE_TABLE_H

 #include <new> // new
 #include <functional> // std::function for traversal
 #include <type_traits> // std::enable_if

+#include "config.h"
 #include "alloc.h"
 #include "string.h"
 #include "types.h"


-
 // what follows is a collection of hash functions taken from: https://www.partow.net/programming/hashfunctions/#:~:text=The%20hash%20functions%20in%20this,containers%20such%20as%20hash%2Dtables.
 //
 // Available Hash Functions
@ -203,7 +203,7 @@ static inline u32 fastModuloReductionDanielLemire(u32 v, u32 c) {
 }

 static inline u32 hash(const char* key, u32 keyLength, u32 capacity) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;

    u32 value = APHash(key, keyLength);

@ -233,18 +233,18 @@ struct Table {
    TableEntry<V>** entries;

    Table<V>(u32 _lanes = 16) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // ctor: records the lane count, starts with length 0, and obtains the |entries| pointer array from pCalloc
        this->lanes = _lanes;
        this->length = 0;
        this->entries = (TableEntry<V>**) pCalloc(sizeof(TableEntry<V>*), this->lanes);
    }
    void* operator new(size_t size) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // class-specific operator new: requests sizeof(Table<V>) bytes from pMalloc; the |size| argument is not consulted
        return (Table<V>*) pMalloc(sizeof(Table<V>));
    }

    V insert(const char* key, u32 keyLength, V value) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        TableEntry<V>* entry = this->lookup(key, keyLength);

        if (!entry) { // no entry with that key exists
@ -270,7 +270,7 @@ struct Table {
    }

    TableEntry<V>* lookup(const char* key, u32 keyLength) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        TableEntry<V>* entry = this->entries[hash(key, keyLength, lanes)];

        for (; entry != null; entry = entry->next) {
@ -283,7 +283,7 @@ struct Table {
    }

    V lookupWithDefault(const char* key, u32 keyLength, V defaultValue) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        auto entry = this->lookup(key, keyLength);

        if (entry == null) return defaultValue;
@ -294,7 +294,7 @@ struct Table {
-    // do not set |freeValues| to true unless the template parameter 'T' is a pointer,
+    // do not set |freeValues| to true unless the template parameter 'V' is a pointer,
    // and the table is responsible for freeing the memory.
    void clear(bool freeValues = false) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (u32 i = 0; i < this->lanes; i++) {
            TableEntry<V>** lane = &this->entries[i];
            TableEntry<V>* entry = *lane;
@ -334,7 +334,7 @@ struct Table {
    }

    void traverse(const std::function <void (TableEntry<V>*)>& entryCallback) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (u32 i = 0; i < this->lanes; i++) {
            TableEntry<V>* entry = this->entries[i];

@ -346,9 +346,10 @@ struct Table {
    }
 };

+#ifdef ULE_CONFIG_OPTION_SERIALIZATION
 template <typename T>
 static void serialize(String* str, Table<T> table) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    serialize(str, table.lanes);
    serialize(str, table.length);
    for (u32 i = 0; i < table.lanes; i++) {
@ -364,7 +365,7 @@ static void serialize(String* str, Table<T> table) {

 template <typename T>
 static void serialize(String* str, Table<T>* table) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    SERIALIZE_HANDLE_NULL(str, table);
    serialize(str, table->lanes);
    serialize(str, table->length);
@ -381,7 +382,7 @@ static void serialize(String* str, Table<T>* table) {

 template <typename T>
 static void deserialize(char** buffer, Table<T>* table) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    deserialize(buffer, &table->lanes);
    u32 length;
    deserialize(buffer, &length);
@ -398,7 +399,7 @@ static void deserialize(char** buffer, Table<T>* table) {

 template <typename T>
 static void deserialize(char** buffer, Table<T>** table) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    DESERIALIZE_HANDLE_NULL(buffer, table);
    u32 lanes;
    deserialize(buffer, &lanes);
@ -416,6 +417,7 @@ static void deserialize(char** buffer, Table<T>** table) {
    _table->length = length;
    *table = _table;
 }
+#endif // ULE_CONFIG_OPTION_SERIALIZATION

 //================================================================================ 
 // Fixed-key size table.
@ -436,7 +438,7 @@ static void deserialize(char** buffer, Table<T>** table) {
 //#include <mmintrin.h>
 template <size_t KEY_SIZE, typename std::enable_if<KEY_SIZE == 64>::type* = nullptr>
 static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;

    // AVX512:
    //__mmask32 result = _mm512_cmpeq_epi16_mask (*((__m512i*)m1), *((__m512i*)m2));
@ -465,7 +467,7 @@ static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
 }
 template <size_t KEY_SIZE, typename std::enable_if<KEY_SIZE == 32>::type* = nullptr>
 static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    //sse4.2:
    //int result = 0;
    //for (u32 i = 0; i < 4; i++) {
@ -490,7 +492,7 @@ static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
 }
 template <size_t KEY_SIZE, typename std::enable_if<KEY_SIZE == 16>::type* = nullptr>
 static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
    // MMX: (this one is barely nanoseconds (~1-10ns) faster than String::memeq)
    //__m64 result = _mm_cmpeq_pi32(*((__m64*)m1), *((__m64*)m2));
    //return ((u64)result) == ~0ULL;
@ -499,7 +501,7 @@ static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
 }
 template <size_t KEY_SIZE, typename std::enable_if<KEY_SIZE != 64 && KEY_SIZE != 32 && KEY_SIZE != 16>::type* = nullptr>
 static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG; // fallback overload, enabled when KEY_SIZE is not 64, 32 or 16: compares KEY_SIZE bytes through String::memeq
    return String::memeq(m1, m2, KEY_SIZE);
 }

@ -518,18 +520,18 @@ struct FixedKeySizeTable {
    FixedKeySizeTableEntry<KEY_SIZE, V>** entries;

    FixedKeySizeTable<KEY_SIZE, V>(u32 _lanes = 16) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // ctor: records the lane count, starts with length 0, and obtains the |entries| pointer array from pCalloc
        this->lanes = _lanes;
        this->length = 0;
        this->entries = (FixedKeySizeTableEntry<KEY_SIZE, V>**) pCalloc(sizeof(FixedKeySizeTableEntry<KEY_SIZE, V>*), this->lanes);
    }
    void* operator new(size_t size) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG; // class-specific operator new: requests sizeof(FixedKeySizeTable<KEY_SIZE, V>) bytes from pMalloc; the |size| argument is not consulted
        return (FixedKeySizeTable<KEY_SIZE, V>*) pMalloc(sizeof(FixedKeySizeTable<KEY_SIZE, V>));
    }

    V insert(const char* key, V value) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        FixedKeySizeTableEntry<KEY_SIZE, V>* entry = this->lookup(key);

        if (!entry) { // no entry with that key exists
@ -554,7 +556,7 @@ struct FixedKeySizeTable {
    }

    FixedKeySizeTableEntry<KEY_SIZE, V>* lookup(const char* key) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        FixedKeySizeTableEntry<KEY_SIZE, V>* entry = this->entries[hash(key, KEY_SIZE, lanes)];

        for (; entry != null; entry = entry->next) {
@ -567,7 +569,7 @@ struct FixedKeySizeTable {
    }

    V lookupWithDefault(const char* key, V defaultValue) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        auto entry = this->lookup(key);

        if (entry == null) return defaultValue;
@ -578,7 +580,7 @@ struct FixedKeySizeTable {
    // do not set |freeValues| to true unless the template parameter 'V' is a pointer,
    // and the table is responsible for freeing the memory.
    void clear(bool freeValues = false) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (u32 i = 0; i < this->lanes; i++) {
            FixedKeySizeTableEntry<KEY_SIZE, V>** lane = &this->entries[i];
            FixedKeySizeTableEntry<KEY_SIZE, V>* entry = *lane;
@ -617,7 +619,7 @@ struct FixedKeySizeTable {
    }

    void traverse(const std::function <void (FixedKeySizeTableEntry<KEY_SIZE, V>*)>& entryCallback) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        for (u32 i = 0; i < this->lanes; i++) {
            FixedKeySizeTableEntry<KEY_SIZE, V>* entry = this->entries[i];

@ -643,14 +645,14 @@ struct CacheTable {
    CacheTableEntry* entries; // n and p are the dimensions of the array. n is first.

    CacheTable(u32 _n = 8, u32 _p = 8) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
        this->n       = _n;
        this->p       = _p;
        this->entries = (CacheTableEntry*) pCalloc(this->n*this->p, sizeof(CacheTableEntry));
    }

    void* insert(const char* key, u32 keyLength, void* value) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;

        CacheTableEntry* row = this->entries + hash(key, keyLength, this->n) * this->n;
        // We're going to insert in 'row'. We need some policy to decide which column to evict.
@ -682,7 +684,7 @@ struct CacheTable {
    }

    CacheTableEntry* lookup(const char* key, u32 keyLength) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;

        CacheTableEntry* row = this->entries + hash(key, keyLength, this->n) * this->n;

@ -698,7 +700,7 @@ struct CacheTable {
    }

    void clear(bool freeValues = false) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;

        for (u32 i = 0; i < this->n; i++) {
            CacheTableEntry* row = this->entries + i * this->n;
--- a/types.h
+++ b/types.h
@ -1,25 +1,17 @@

-#ifndef TYPES_H
-#define TYPES_H
+#ifndef ULE_TYPES_H
+#define ULE_TYPES_H

 #include <stddef.h> // size_t

 #define null 0

-// long term, it would be nice to not have to '#include' tracy here,
-// a client using the library should include it and use a define to instruct
-// the library what to put at the beginning of function calls for profiling needs, 
-// but i've had trouble implementing that.
-#ifndef TYPES_H_FTAG
-#include <Tracy.hpp>
-#define TYPES_H_FTAG ZoneScoped
+#ifndef ULE_TYPES_H_FTAG
+#ifdef ULE_CONFIG_OPTION_FTAG
+#define ULE_TYPES_H_FTAG ULE_CONFIG_OPTION_FTAG
+#else
+#define ULE_TYPES_H_FTAG
 #endif
-
-// bool is included by default for C++11
-#ifndef __cplusplus
-    typedef _Bool bool;
-    #define true 1
-    #define false 0
 #endif

 // The restrict declspec is used on functions that return unaliased pointers. This keyword is used for the C-Runtime Library implementation of malloc since it will never return a pointer value that is already in use in the current program (unless you are doing something illegal, such as using memory after it has been freed).
@ -47,12 +39,12 @@
 typedef uint64_t u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
-typedef uint8_t u8;
+typedef uint8_t  u8;

 typedef int64_t s64;
 typedef int32_t s32;
 typedef int16_t s16;
-typedef int8_t s8;
+typedef int8_t  s8;

 //typedef size_t size_t;

@ -62,9 +54,7 @@ typedef int8_t s8;
 //typedef long double extended;

 // if we're using the glm vector/matrix types, or other types, define them here
-#define _USING_GLM_TYPES__
-
-#ifdef _USING_GLM_TYPES__
+#ifdef ULE_CONFIG_OPTION_USE_GLM
 // force high precision for everything
 #define GLM_PRECISION_HIGHP_FLOAT
 #define GLM_PRECISION_HIGHP_DOUBLE
--- a/util.h
+++ b/util.h
@ -1,6 +1,6 @@

-#ifndef UTIL_H
-#define UTIL_H
+#ifndef ULE_UTIL_H
+#define ULE_UTIL_H


 #define STATIC_ARRAY_LENGTH(a) (sizeof(a)/sizeof(a[0]))