diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1120436
--- /dev/null
+++ b/README.md
@@ -0,0 +1,48 @@
+
+This is a library of C++ code which I use as a standard library wrapper, supplement, and in some cases, replacement.
+
+If you want to use it, you can add all of the source files to your source tree, configure the `#define`s in `config.h` to suit your needs, and it should just work.
+
+Note that the files `config.h` and `types.h` are required by every other file.
+
+- Stack, Scratch, and Block-based allocators, as well as a memory-leak checking mechanism and OS allocator wrappers, in `alloc.h/.cpp`
+- Heap-friendly String type, including format strings and StringBuffers/Builders, as well as `<string.h>` function replacements as static methods in single-header `string.h`
+- Intrusive serialization mechanism in `serialize.h/.cpp` for complex types and primitives (no reflection though)
+- A few hash functions, HashTable and CacheTable (hash table that can forget its keys) implementations in `table.hpp`
+- A dynamic/growing array implementation in `array.hpp`
+- Common file operations, `<stdio.h>` wrapper in `file.h/.cpp`
+
+And some more stuff that is TODO:
+- `cpuid` x86 instruction wrapper
+- `glm` replacement - vector, matrix, and quaternion types and some common operations involving them
+
+# Licenses & Other Code
+
+## fast_float
+Our serialization code uses the `fast_float` library by Daniel Lemire et al, provided simultaneously under the [Apache License, Version 2.0](https://github.com/fastfloat/fast_float/blob/main/LICENSE-APACHE), the [MIT license](https://github.com/fastfloat/fast_float/blob/main/LICENSE-MIT) and/or the [BOOST license](https://github.com/fastfloat/fast_float/blob/main/LICENSE-BOOST). The `fast_float` library itself uses code originally published under the Apache 2.0 license.
+
+## sse_mathfun.h
+The `sin`, `cos`, `exp`, and `log` replacements used by this library are provided by a single-header library written by Julien Pommier under the zlib license:
+
+```
+Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+```
+
diff --git a/alloc.cpp b/alloc.cpp
index e367a81..8db667d 100644
--- a/alloc.cpp
+++ b/alloc.cpp
@@ -7,22 +7,22 @@
 
 #if false
 static void* leakcheckMalloc(size_t size, const char* file, s32 line) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return malloc(size);
 }
 
 static void* leakcheckCalloc(size_t maxNumOfElements, size_t elementSize, const char* file, s32 line) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return calloc(maxNumOfElements, elementSize);
 }
 
 static void* leakcheckRealloc(void* buffer, size_t newSize, const char* file, s32 line) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return realloc(buffer, newSize);
 }
 
 static void leakcheckFree(void* ptr, const char* file, s32 line) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     free(ptr);
 }
 
@@ -45,7 +45,7 @@ static void dumpLeaks() {
 
 // system allocators
 void* pMalloc(size_t size) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     void* p = malloc(size);
 
     if (!p) {
@@ -55,12 +55,12 @@ void* pMalloc(size_t size) {
     return p;
 }
 void* pMalloc(size_t size, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return pMalloc(size);
 }
 
 void* pCalloc(size_t maxNumOfElements, size_t elementSize) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     void* p = calloc(maxNumOfElements, elementSize);
 
     if (!p) {
@@ -70,12 +70,12 @@ void* pCalloc(size_t maxNumOfElements, size_t elementSize) {
     return p;
 }
 void* pCalloc(size_t maxNumOfElements, size_t elementSize, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return pCalloc(maxNumOfElements, elementSize);
 }
 
 void* pRealloc(void* buffer, size_t newSize) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     void* p = realloc(buffer, newSize);
 
     if (!p) {
@@ -86,25 +86,25 @@ void* pRealloc(void* buffer, size_t newSize) {
     return p;
 }
 void* pRealloc(void* buffer, size_t newSize, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return pRealloc(buffer, newSize);
 }
 
 void pFree(void* ptr) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     free(ptr);
 }
 void pFree(void* ptr, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     pFree(ptr);
 }
 
 void pFree(const void* ptr) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     pFree((void*) ptr);
 }
 void pFree(const void* ptr, void* allocatorState) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     pFree((void*) ptr, allocatorState);
 }
 
@@ -124,7 +124,7 @@ void pFree(const void* ptr, void* allocatorState) {
 static bool DefaultAllocatorInited = false;
 static Allocator DefaultAllocator;
 static void defaultAllocatorInit() {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     DefaultAllocator.state = null;
     DefaultAllocator.mallocate = pMalloc;
     DefaultAllocator.callocate = pCalloc;
@@ -134,7 +134,7 @@ static void defaultAllocatorInit() {
 }
 
 Allocator* Allocator::GetDefault() {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     if (!DefaultAllocatorInited) defaultAllocatorInit();
     return &DefaultAllocator;
 }
@@ -142,7 +142,7 @@ Allocator* Allocator::GetDefault() {
 //================================================================================ 
 // alignment should be a power of 2
 static u64 alignForward2(u64 ptr, size_t alignment) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     u64 p, a, modulo;
 
     p = ptr;
@@ -156,14 +156,14 @@ static u64 alignForward2(u64 ptr, size_t alignment) {
     return p;
 }
 static u64 alignForward(u64 ptr, size_t alignment) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return ((ptr + alignment - 1) / alignment) * alignment;
 }
 
 //================================================================================ 
 // Scratch/Arena
 Arena* Arena::Init(u32 sizeInBytes) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     Arena* arena = (Arena*) pMalloc(sizeof(Arena));
     arena->index = 0;
     arena->buffer = (u8*) pMalloc(sizeof(u8) * sizeInBytes);
@@ -171,7 +171,7 @@ Arena* Arena::Init(u32 sizeInBytes) {
     return arena;
 }
 void* Arena::Alloc(u32 sizeInBytes) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     u8* p = this->buffer + this->index;
     u32 offset = (u32) alignForward2((u64) p, 64);
 
@@ -187,7 +187,7 @@ void* Arena::Alloc(u32 sizeInBytes) {
     return null;
 }
 void Arena::Clear() {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     this->index = 0;
 }
 //================================================================================ 
diff --git a/alloc.h b/alloc.h
index 0374251..59d928f 100644
--- a/alloc.h
+++ b/alloc.h
@@ -1,7 +1,8 @@
 
-#ifndef ALLOC_H
-#define ALLOC_H
+#ifndef ULE_ALLOC_H
+#define ULE_ALLOC_H
 
+#include "config.h"
 #include "types.h"
 
 
diff --git a/array.hpp b/array.hpp
index e7090dd..899c97f 100644
--- a/array.hpp
+++ b/array.hpp
@@ -1,9 +1,10 @@
 
-#ifndef ARRAY_H
-#define ARRAY_H
+#ifndef ULE_ARRAY_H
+#define ULE_ARRAY_H
 
 #include <new> // operator new, operator delete
 
+#include "config.h"
 #include "alloc.h" // allocators...
 #include "serialize.h" // serialization
 #include "string.h" // String::memcpy
@@ -32,21 +33,28 @@ struct Array {
     T* data;
 
     Array<T>(u32 _capacity = 8) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->length   = 0;
         this->capacity = _capacity;
         this->data     = (T*) pCalloc(sizeof (T), _capacity);
     }
     void* operator new(size_t size) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return pMalloc((u32) size);
     }
 
     void checkIfShouldGrow() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         if (this->isFull()) {
             // optimal number as you approach infinite elements approaches PHI, but 1.5 sometimes works better for finite sizes
-            // more testing is probably needed
+            //
+            // it seems that a commonly chosen growth rate of '2' may be the worst possible choice.
+            // if you grow at a rate of 2x, you end up (likely) never being able to re-use the freed 'hole' in the heap
+            // for a future allocation of the same kind.
+            // useful reading for those interested in their own dynamic array implementations:
+            // (facebook's vector impl, a strictly better std::vector)
+            // https://github.com/facebook/folly/blob/main/folly/docs/FBVector.md
+            //
             this->capacity = (u32) (this->capacity * 1.5);
             this->data = (T*) pRealloc(data, sizeof(T) * this->capacity);
         }
@@ -54,7 +62,7 @@ struct Array {
 
     // for when the order in the array doesn't matter, move the end of the array into the removed slot
     void removeSwapWithEnd(u32 index) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         if (this->isEmpty()) return; // overhead, maybe assert instead?
 
         u32 end = this->length - 1;
@@ -65,7 +73,7 @@ struct Array {
     }
 
     void removeSwapWithEnd(T* addr) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (u32 i = 0; i < this->length; i++) {
             if ((this->data + i) == addr) {
                 removeSwapWithEnd(i);
@@ -75,7 +83,7 @@ struct Array {
     }
 
     void removeAndShrink(u32 index) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (u32 i = index + 1; i < this->length; i++) {
             String::memcpy(this->data[i - 1], this->data[i], sizeof(T));
         }
@@ -83,7 +91,7 @@ struct Array {
     }
 
     void removeAndShrink(T* elementAddr) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         s32 index = -1;
         for (u32 i = 0; i < this->length; i++) {
             if ((this->data + i) == elementAddr) {
@@ -103,7 +111,7 @@ struct Array {
     }
 
     T pop() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         if (this->isEmpty()) {
             die("empty");
         }
@@ -114,7 +122,7 @@ struct Array {
     // sometimes, you want to copy some POD data on the stack to the next position in the internal array
     // that's what this does
     u32 pushCopy(T* e) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->checkIfShouldGrow();
 
         String::memcpy((void*) &this->data[this->length++], e, sizeof(T));
@@ -126,14 +134,14 @@ struct Array {
     // it is irresponsible to call this and then not store a T in that address. this increments length,
     // reserving the next spot for you.
     T* pushNextAddrPromise() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->checkIfShouldGrow();
 
         return &this->data[this->length++];
     }
 
     u32 push(T e) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->checkIfShouldGrow();
 
         this->data[this->length++] = e;
@@ -142,7 +150,7 @@ struct Array {
     }
 
     u32 pushMany(T* elements, u32 count) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         // ensure we have capacity. if we have to realloc multiple times that can suck,
         // but should be avoidable in practice by having an appropriately large initial capacity
         while (this->capacity < (this->length + count)) {
@@ -159,7 +167,7 @@ struct Array {
     }
 
     void reverse() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 count = this->length / 2;
 
         for (u32 i = 0; i < count; i++) {
@@ -172,7 +180,7 @@ struct Array {
     }
 
     T shift() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         if (this->length == 0) {
             return null;
         }
@@ -188,7 +196,7 @@ struct Array {
     }
 
     T unshift(T e) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->checkIfShouldGrow();
 
         for (u32 i = 0; i < this->length; i++) {
@@ -202,7 +210,7 @@ struct Array {
     }
 
     T peek() const {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         if (this->isEmpty()) {
             return null;
         }
@@ -211,24 +219,25 @@ struct Array {
     }
 
     bool isEmpty() const {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return this->length == 0;
     }
 
     bool isFull() const {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return this->length == this->capacity;
     }
 
     void clear() {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->length = 0;
     }
 };
 
-template <typename T>
+#ifdef ULE_CONFIG_OPTION_SERIALIZATION
+extern template <typename T>
 static void serialize(String* str, Array<T> array) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     serialize(str, array.length);
     serialize(str, array.capacity);
     for (u32 i = 0; i < array.length; i++) {
@@ -236,9 +245,9 @@ static void serialize(String* str, Array<T> array) {
     }
 }
 
-template <typename T>
+extern template <typename T>
 static void serialize(String* str, Array<T>* array) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     SERIALIZE_HANDLE_NULL(str, array);
     serialize(str, array->length);
     serialize(str, array->capacity);
@@ -247,9 +256,9 @@ static void serialize(String* str, Array<T>* array) {
     }
 }
 
-template <typename T>
+extern template <typename T>
 static void deserialize(char** buffer, Array<T>* array) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     deserialize(buffer, &array->length);
     deserialize(buffer, &array->capacity);
     for (u32 i = 0; i < array->length; i++) {
@@ -257,9 +266,9 @@ static void deserialize(char** buffer, Array<T>* array) {
     }
 }
 
-template <typename T>
+extern template <typename T>
 static void deserialize(char** buffer, Array<T>** array) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     DESERIALIZE_HANDLE_NULL(buffer, array);
     u32 length, capacity;
     deserialize(buffer, &length);
@@ -271,5 +280,7 @@ static void deserialize(char** buffer, Array<T>** array) {
     }
     *array = _array;
 }
+#endif // ULE_CONFIG_OPTION_SERIALIZATION
 
 #endif
+
diff --git a/config.h b/config.h
new file mode 100644
index 0000000..aba3317
--- /dev/null
+++ b/config.h
@@ -0,0 +1,15 @@
+
+#pragma once
+#ifndef ULE_CONFIG_H
+#define ULE_CONFIG_H
+
+// define this macro to include the serialization code `serialize.h/.cpp`, as well as serialization
+// for the hashtable(s) and array implementations.
+//#define ULE_CONFIG_OPTION_SERIALIZATION
+
+// all functions in the library will invoke a semicolon-terminated macro as their first line of execution.
+// this is for use by an intrusive profiler, though it could be used for whatever purpose.
+//#define ULE_CONFIG_OPTION_FTAG ZoneScoped
+
+#endif
+
diff --git a/cpuid.cpp b/cpuid.cpp
index 146f5af..6f5ab0f 100644
--- a/cpuid.cpp
+++ b/cpuid.cpp
@@ -45,7 +45,7 @@ static const char* szFeatures[] = {
 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2008/hskdteyh(v=vs.90)?redirectedfrom=MSDN
 #include <intrin.h>
 void cpuid() {
-	TYPES_H_FTAG;
+	ULE_TYPES_H_FTAG;
     int nSteppingID = 0;
     int nModel = 0;
     int nFamily = 0;
@@ -142,7 +142,7 @@ void cpuid() {
 
 #else
 void cpuid() {
-	TYPES_H_FTAG;
+	ULE_TYPES_H_FTAG;
 }
 
 #endif
diff --git a/cpuid.h b/cpuid.h
index 78bb4b9..8db4d02 100644
--- a/cpuid.h
+++ b/cpuid.h
@@ -1,6 +1,8 @@
 
-#ifndef CPUID_H
-#define CPUID_H
+#ifndef ULE_CPUID_H
+#define ULE_CPUID_H
+
+#include "config.h"
 
 void cpuid();
 
diff --git a/file.cpp b/file.cpp
index fa2e91a..5d6ad2f 100644
--- a/file.cpp
+++ b/file.cpp
@@ -11,11 +11,11 @@
 
 
 FILE* File::Open(const char* path, const char* mode) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return fopen(path, mode);
 }
 FILE* File::Open(const char* path, size_t* outSize, const char* mode) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     FILE* fp = File::Open(path, mode);
 
     if (fp == null) {
@@ -34,7 +34,7 @@ void File::Close(FILE* file) {
 }
 
 size_t File::Size(const char* path) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     FILE* fp = File::Open(path);
     // get the file's size in bytes
     fseek(fp, 0, SEEK_END);
@@ -44,7 +44,7 @@ size_t File::Size(const char* path) {
     return size;
 }
 size_t File::Size(FILE* fp) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     fseek(fp, 0, SEEK_END);
     size_t size = ftell(fp);
     fseek(fp, 0L, SEEK_SET);
@@ -52,7 +52,7 @@ size_t File::Size(FILE* fp) {
 }
 
 u8* File::Read(const char* path) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     FILE* fp = File::Open(path, "rb");
 
     if (fp == null) {
@@ -73,7 +73,7 @@ u8* File::Read(const char* path) {
     return (u8*) buffer;
 }
 u8* File::Read(const char* path, size_t* outSize) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     FILE* fp = File::Open(path, "rb");
 
     if (fp == null) {
@@ -98,7 +98,7 @@ u8* File::Read(const char* path, size_t* outSize) {
     return (u8*) buffer;
 }
 size_t File::Read(FILE* fp, void* destination) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
 
     fseek(fp, 0, SEEK_END);
     size_t size = ftell(fp);
@@ -108,12 +108,12 @@ size_t File::Read(FILE* fp, void* destination) {
     return size;
 }
 size_t File::Read(FILE* fp, void* destination, size_t size) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return fread(destination, sizeof (char), size + 1, fp);
 }
 
 s32 File::Write(const char* path, char* data, u32 count) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     FILE* fp = File::Open(path, "wb");
 
     if (fp == null) {
@@ -134,7 +134,7 @@ s32 File::Write(const char* path, char* data, u32 count) {
 #include <windows.h>
 // writes the filenames into the provided array |outFileNames|, must be allocated ahead of time.
 void File::GetFileNamesInFolder(const char* path, Array<char*>* outFileNames) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     massert(path != null, "provided 'null' for path argument");
     massert(outFileNames != null, "provided 'null' for array argument");
     WIN32_FIND_DATAA findData;
@@ -160,7 +160,7 @@ void File::GetFileNamesInFolder(const char* path, Array<char*>* outFileNames) {
 #else
 #include <dirent.h>
 void File::GetFileNamesInFolder(const char* path, Array<char*>* outFileNames) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     massert(path != null, "provided 'null' for path argument");
     massert(outFileNames != null, "provided 'null' for array argument");
     DIR* dir = opendir(path);
@@ -189,7 +189,7 @@ void File::GetFileNamesInFolder(const char* path, Array<char*>* outFileNames) {
 #endif
 
 time_t File::LastModified(const char* path) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     struct stat result;
     if (stat(path, &result) == 0) {
         return result.st_mtime;
diff --git a/file.h b/file.h
index e455e64..c0843c6 100644
--- a/file.h
+++ b/file.h
@@ -1,10 +1,11 @@
 
-#ifndef FILE_H
-#define FILE_H
+#ifndef ULE_FILE_H
+#define ULE_FILE_H
 
 #include <stdio.h> // FILE
 #include <sys/types.h> // time_t
 
+#include "config.h"
 #include "array.hpp"
 
 
diff --git a/print.cpp b/print.cpp
index 823f86a..e9f189f 100644
--- a/print.cpp
+++ b/print.cpp
@@ -10,12 +10,12 @@
 
 
 void vprint(const char* format, va_list args) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     vfprintf(stdout, format, args);
 }
 
 void vprintln(const char* format, va_list args) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     vprint(format, args);
     print("\n");
 }
@@ -25,7 +25,7 @@ void vprintln(const char* format, va_list args) {
  * +we intend to replace printf at some point with this
  */
 void print(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     if (format == null) { print("null"); return; }
 
     va_list args;
@@ -37,7 +37,7 @@ void print(const char* format, ...) {
 }
 
 void println(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     if (format == null) { print("null\n"); return; }
 
     va_list args;
@@ -58,7 +58,7 @@ void println(const char* format, ...) {
 #include <dbghelp.h>
 // if |string| is non-null, then the stack trace will be concatenated to it instead of being printed to stdout.
 void trace(String* string) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
 
     #define BACKTRACE_MAX_FUNCTION_NAME_LENGTH 1024
     HANDLE processHandle = GetCurrentProcess();
@@ -105,7 +105,7 @@ void trace(String* string) {
 #include <cxxabi.h> // abi::__cxa_demangle
 // if |string| is non-null, then the stack trace will be concatenated to it instead of being printed to stdout.
 void trace(String* string) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
 
     void* stack[BACKTRACE_MAX_FRAMES];
     u32 stackSize = backtrace(stack, BACKTRACE_MAX_FRAMES);
@@ -156,7 +156,7 @@ void trace(String* string) {
 #endif
 
 void _debug(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     if (format == null) {
         print("%sdebug:%s null\n", ANSI_BLUE, ANSI_RESET);
         return;
@@ -172,7 +172,7 @@ void _debug(const char* format, ...) {
 }
 
 void _warn(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     if (format == null) {
         print("%swarning:%s null\n", ANSI_YELLOW, ANSI_RESET);
         return;
@@ -198,7 +198,7 @@ void setCustomDieBehavior(void (*dieBehavior)(const char* string)) {
 // if a fatal error should not occur at runtime on a release binary, consider preferring 'massert'
 // it's unclear when you should use asserts vs. die actually. idk man
 void die(const char* format, ...) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     if (format == null) {
         if (customDie == null) {
             print("%serror:%s (unspecified error)\n", ANSI_RED, ANSI_RESET);
@@ -239,42 +239,43 @@ void die(const char* format, ...) {
     }
 }
 
-void print(bool b)         { TYPES_H_FTAG; print("%s", b ? "true" : "false"); }
-void print(char c)         { TYPES_H_FTAG; print("%c", c); }
-void print(signed int i)   { TYPES_H_FTAG; print("%d", i); }
-void print(unsigned int i) { TYPES_H_FTAG; print("%u", i); }
-void print(float f)        { TYPES_H_FTAG; print("%.14g", f); }
-void print(double d)       { TYPES_H_FTAG; print("%.14g", d); }
-void print(void* p)        { TYPES_H_FTAG; print("%p", p); }
-void print(char* s)        { TYPES_H_FTAG; print("%s", s); }
+void print(bool b)         { ULE_TYPES_H_FTAG; print("%s", b ? "true" : "false"); }
+void print(char c)         { ULE_TYPES_H_FTAG; print("%c", c); }
+void print(signed int i)   { ULE_TYPES_H_FTAG; print("%d", i); }
+void print(unsigned int i) { ULE_TYPES_H_FTAG; print("%u", i); }
+void print(float f)        { ULE_TYPES_H_FTAG; print("%.14g", f); }
+void print(double d)       { ULE_TYPES_H_FTAG; print("%.14g", d); }
+void print(void* p)        { ULE_TYPES_H_FTAG; print("%p", p); }
+void print(char* s)        { ULE_TYPES_H_FTAG; print("%s", s); }
 
 #ifndef _WIN32
-void print(size_t i) { TYPES_H_FTAG; print("%u", i); }
-void println(size_t i) { TYPES_H_FTAG; print(i); print("\n"); }
+void print(size_t i) { ULE_TYPES_H_FTAG; print("%u", i); }
+void println(size_t i) { ULE_TYPES_H_FTAG; print(i); print("\n"); }
 #endif
 
-void println(bool b)         { TYPES_H_FTAG; print(b); print("\n"); }
-void println(char c)         { TYPES_H_FTAG; print(c); print("\n"); }
-void println(signed int i)   { TYPES_H_FTAG; print(i); print("\n"); }
-void println(unsigned int i) { TYPES_H_FTAG; print(i); print("\n"); }
-void println(float f)        { TYPES_H_FTAG; print(f); print("\n"); }
-void println(double d)       { TYPES_H_FTAG; print(d); print("\n"); }
-void println(void* p)        { TYPES_H_FTAG; print(p); print("\n"); }
-void println(char* s)        { TYPES_H_FTAG; print(s); print("\n"); }
-void println()               { TYPES_H_FTAG;           print("\n"); }
-
-#ifdef _USING_GLM_TYPES__
-void print(glm::vec<2, float, (glm::qualifier) 3> v)      { TYPES_H_FTAG; print("vec2: %.14g,%.14g", v.x, v.y); }
-void print(glm::vec<3, float, (glm::qualifier) 3> v)      { TYPES_H_FTAG; print("vec3: %.14g,%.14g,%.14g", v.x, v.y, v.z); }
-void print(glm::vec<4, float, (glm::qualifier) 3> v)      { TYPES_H_FTAG; print("vec4: %.14g,%.14g,%.14g,%.14g", v.x, v.y, v.z, v.w); }
-void print(glm::mat<2, 2, float, (glm::qualifier) 3> m)   { TYPES_H_FTAG; print("mat2: "); print(m[0]); print(m[1]); }
-void print(glm::mat<3, 3, float, (glm::qualifier) 3> m)   { TYPES_H_FTAG; print("mat3: "); print(m[0]); print(m[1]); print(m[2]); }
-void print(glm::mat<4, 4, float, (glm::qualifier) 3> m)   { TYPES_H_FTAG; print("mat4: "); print(m[0]); print(m[1]); print(m[2]); print(m[3]); }
-
-void println(glm::vec<2, float, (glm::qualifier) 3> v)    { TYPES_H_FTAG; print(v); print("\n"); }
-void println(glm::vec<3, float, (glm::qualifier) 3> v)    { TYPES_H_FTAG; print(v); print("\n"); }
-void println(glm::vec<4, float, (glm::qualifier) 3> v)    { TYPES_H_FTAG; print(v); print("\n"); }
-void println(glm::mat<2, 2, float, (glm::qualifier) 3> m) { TYPES_H_FTAG; print(m); print("\n"); }
-void println(glm::mat<3, 3, float, (glm::qualifier) 3> m) { TYPES_H_FTAG; print(m); print("\n"); }
-void println(glm::mat<4, 4, float, (glm::qualifier) 3> m) { TYPES_H_FTAG; print(m); print("\n"); }
-#endif
+void println(bool b)         { ULE_TYPES_H_FTAG; print(b); print("\n"); }
+void println(char c)         { ULE_TYPES_H_FTAG; print(c); print("\n"); }
+void println(signed int i)   { ULE_TYPES_H_FTAG; print(i); print("\n"); }
+void println(unsigned int i) { ULE_TYPES_H_FTAG; print(i); print("\n"); }
+void println(float f)        { ULE_TYPES_H_FTAG; print(f); print("\n"); }
+void println(double d)       { ULE_TYPES_H_FTAG; print(d); print("\n"); }
+void println(void* p)        { ULE_TYPES_H_FTAG; print(p); print("\n"); }
+void println(char* s)        { ULE_TYPES_H_FTAG; print(s); print("\n"); }
+void println()               { ULE_TYPES_H_FTAG;           print("\n"); }
+
+#ifdef ULE_CONFIG_OPTION_USE_GLM
+void print(glm::vec<2, float, (glm::qualifier) 3> v)      { ULE_TYPES_H_FTAG; print("vec2: %.14g,%.14g", v.x, v.y); }
+void print(glm::vec<3, float, (glm::qualifier) 3> v)      { ULE_TYPES_H_FTAG; print("vec3: %.14g,%.14g,%.14g", v.x, v.y, v.z); }
+void print(glm::vec<4, float, (glm::qualifier) 3> v)      { ULE_TYPES_H_FTAG; print("vec4: %.14g,%.14g,%.14g,%.14g", v.x, v.y, v.z, v.w); }
+void print(glm::mat<2, 2, float, (glm::qualifier) 3> m)   { ULE_TYPES_H_FTAG; print("mat2: "); print(m[0]); print(m[1]); }
+void print(glm::mat<3, 3, float, (glm::qualifier) 3> m)   { ULE_TYPES_H_FTAG; print("mat3: "); print(m[0]); print(m[1]); print(m[2]); }
+void print(glm::mat<4, 4, float, (glm::qualifier) 3> m)   { ULE_TYPES_H_FTAG; print("mat4: "); print(m[0]); print(m[1]); print(m[2]); print(m[3]); }
+
+void println(glm::vec<2, float, (glm::qualifier) 3> v)    { ULE_TYPES_H_FTAG; print(v); print("\n"); }
+void println(glm::vec<3, float, (glm::qualifier) 3> v)    { ULE_TYPES_H_FTAG; print(v); print("\n"); }
+void println(glm::vec<4, float, (glm::qualifier) 3> v)    { ULE_TYPES_H_FTAG; print(v); print("\n"); }
+void println(glm::mat<2, 2, float, (glm::qualifier) 3> m) { ULE_TYPES_H_FTAG; print(m); print("\n"); }
+void println(glm::mat<3, 3, float, (glm::qualifier) 3> m) { ULE_TYPES_H_FTAG; print(m); print("\n"); }
+void println(glm::mat<4, 4, float, (glm::qualifier) 3> m) { ULE_TYPES_H_FTAG; print(m); print("\n"); }
+#endif // ULE_CONFIG_OPTION_USE_GLM
+
diff --git a/print.h b/print.h
index 3d1b927..956b935 100644
--- a/print.h
+++ b/print.h
@@ -1,9 +1,10 @@
 
-#ifndef PRINT_H
-#define PRINT_H
+#ifndef ULE_PRINT_H
+#define ULE_PRINT_H
 
 #include <stdarg.h> // va_list
 
+#include "config.h"
 #include "string.h"
 #include "types.h"
 
@@ -135,7 +136,7 @@ extern void println(void* p);
 extern void println(char* s);
 extern void println();
 
-#ifdef _USING_GLM_TYPES__
+#ifdef ULE_CONFIG_OPTION_USE_GLM
 extern void print(glm::vec<2, float, (glm::qualifier) 3>);
 extern void print(glm::vec<3, float, (glm::qualifier) 3>);
 extern void print(glm::vec<4, float, (glm::qualifier) 3>);
diff --git a/serialize.cpp b/serialize.cpp
index bd957a3..258d9a2 100644
--- a/serialize.cpp
+++ b/serialize.cpp
@@ -1,4 +1,6 @@
 
+#include "config.h" // must precede the check below: the README places the ULE_CONFIG_OPTION_* #defines in config.h
+#ifdef ULE_CONFIG_OPTION_SERIALIZATION
 #include <fast_float/fast_float.h>
 
 #include "types.h"
@@ -7,52 +9,52 @@
 #include "print.h"
 
 
-static inline const char* getFormatStringOut(u8     v) { TYPES_H_FTAG; return "%hu\n"; }
-static inline const char* getFormatStringOut(u16    v) { TYPES_H_FTAG; return "%hu\n"; }
-static inline const char* getFormatStringOut(u32    v) { TYPES_H_FTAG; return "%u\n"; }
-static inline const char* getFormatStringOut(u64    v) { TYPES_H_FTAG; return "%llu\n"; }
+static inline const char* getFormatStringOut(u8     v) { ULE_TYPES_H_FTAG; return "%hu\n"; }
+static inline const char* getFormatStringOut(u16    v) { ULE_TYPES_H_FTAG; return "%hu\n"; }
+static inline const char* getFormatStringOut(u32    v) { ULE_TYPES_H_FTAG; return "%u\n"; }
+static inline const char* getFormatStringOut(u64    v) { ULE_TYPES_H_FTAG; return "%llu\n"; }
 
-static inline const char* getFormatStringOut(s8     v) { TYPES_H_FTAG; return "%hd\n"; }
-static inline const char* getFormatStringOut(s16    v) { TYPES_H_FTAG; return "%hd\n"; }
-static inline const char* getFormatStringOut(s32    v) { TYPES_H_FTAG; return "%d\n"; }
-static inline const char* getFormatStringOut(s64    v) { TYPES_H_FTAG; return "%lld\n"; }
+static inline const char* getFormatStringOut(s8     v) { ULE_TYPES_H_FTAG; return "%hd\n"; }
+static inline const char* getFormatStringOut(s16    v) { ULE_TYPES_H_FTAG; return "%hd\n"; }
+static inline const char* getFormatStringOut(s32    v) { ULE_TYPES_H_FTAG; return "%d\n"; }
+static inline const char* getFormatStringOut(s64    v) { ULE_TYPES_H_FTAG; return "%lld\n"; }
 
-static inline const char* getFormatStringOut(float  v) { TYPES_H_FTAG; return "%f\n"; }
-static inline const char* getFormatStringOut(double v) { TYPES_H_FTAG; return "%f\n"; }
+static inline const char* getFormatStringOut(float  v) { ULE_TYPES_H_FTAG; return "%f\n"; }
+static inline const char* getFormatStringOut(double v) { ULE_TYPES_H_FTAG; return "%f\n"; }
 
 // important constraint - strings need to be wrapped in double-quotes.
 // the sentinel value 'null' without quotations is used to denote null values, which means
 // if strings were not wrapped in double quotes, you would not be able to distinguish null
 // values from the literal string "null".
-static inline const char* getFormatStringOut(char*       v) { TYPES_H_FTAG; return "\"%s\"\n"; }
-static inline const char* getFormatStringOut(const char* v) { TYPES_H_FTAG; return "\"%s\"\n"; }
+static inline const char* getFormatStringOut(char*       v) { ULE_TYPES_H_FTAG; return "\"%s\"\n"; }
+static inline const char* getFormatStringOut(const char* v) { ULE_TYPES_H_FTAG; return "\"%s\"\n"; }
 
 #ifdef _USING_GLM_TYPES__
-static inline const char* getFormatStringOut(glm::vec<2, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f\n"; }
-static inline const char* getFormatStringOut(glm::vec<3, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f\n"; }
-static inline const char* getFormatStringOut(glm::vec<4, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::vec<2, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f\n"; }
+static inline const char* getFormatStringOut(glm::vec<3, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::vec<4, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f %f\n"; }
 
-static inline const char* getFormatStringOut(glm::mat<2, 2, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f %f\n"; }
-static inline const char* getFormatStringOut(glm::mat<3, 3, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f %f %f %f %f %f %f\n"; }
-static inline const char* getFormatStringOut(glm::mat<4, 4, float, (glm::qualifier) 3> v) { TYPES_H_FTAG; return "%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::mat<2, 2, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::mat<3, 3, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f %f %f %f %f %f %f\n"; }
+static inline const char* getFormatStringOut(glm::mat<4, 4, float, (glm::qualifier) 3> v) { ULE_TYPES_H_FTAG; return "%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n"; }
 
 #endif
 
 #define SERIALIZE_H_FUNC_BODY str->appendf(getFormatStringOut(v), v);
-void serialize(String* str, u8     v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, u16    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, u32    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, u64    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, s8     v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, s16    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, s32    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, s64    v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, float  v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-void serialize(String* str, double v) { TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
-
-template<typename T> // do I really need a template for this?
+void serialize(String* str, u8     v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, u16    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, u32    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, u64    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, s8     v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, s16    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, s32    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, s64    v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, float  v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+void serialize(String* str, double v) { ULE_TYPES_H_FTAG; SERIALIZE_H_FUNC_BODY }
+
+template<typename T> // file-local helper template (no `extern`: it is defined right here); @TODO do not use a template for this.
 static inline void deserializeInteger(char** buffer, T* v) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     char* _buffer = *buffer;
     T value = 0;
     
@@ -107,7 +109,7 @@ static const u32 BINARY32_MAX_CHARS = 14;
 static const u32 BINARY64_MAX_CHARS = 24;
 
 void deserialize(char** buffer, float*  v) { 
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     char* _buffer = *buffer;
     while (String::isAsciiWhitespace(*_buffer)) _buffer++; 
     fast_float::from_chars_result result = fast_float::from_chars(_buffer, _buffer + BINARY32_MAX_CHARS, *v);
@@ -115,7 +117,7 @@ void deserialize(char** buffer, float*  v) {
     *buffer = (char*) result.ptr;
 }
 void deserialize(char** buffer, double* v) { 
-    TYPES_H_FTAG;  
+    ULE_TYPES_H_FTAG;  
     char* _buffer = *buffer;
     while (String::isAsciiWhitespace(*_buffer)) _buffer++; 
     fast_float::from_chars_result result = fast_float::from_chars(_buffer, _buffer + BINARY64_MAX_CHARS, *v);
@@ -125,19 +127,19 @@ void deserialize(char** buffer, double* v) {
 
 #ifndef _WIN32
 // win32 doesn't treat size_t as different than a u64, which causes ambiguous function calls
-static inline const char* getFormatStringOut(size_t v) { TYPES_H_FTAG; return "%lu\n"; }
+static inline const char* getFormatStringOut(size_t v) { ULE_TYPES_H_FTAG; return "%lu\n"; }
 void serialize(String* str, size_t v) { SERIALIZE_H_FUNC_BODY }
 void deserialize(char** buffer, size_t* v) { SERIALIZE_H_DESERIALIZE_FUNC_BODY }
 #endif
 
 // STRING STUFF
 void serialize(String* str, char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     SERIALIZE_HANDLE_NULL(str, v);
     SERIALIZE_H_FUNC_BODY;
 }
 void serialize(String* str, const char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     SERIALIZE_HANDLE_NULL(str, v);
     SERIALIZE_H_FUNC_BODY;
 }
@@ -160,7 +162,7 @@ void serialize(String* str, const char* v) {
 static char SERIALIZE_SCRATCH_BUFFER[SERIALIZE_SCRATCH_BUFFER_SIZE];
 
 static s32 deserializeString(char** buffer, char* v, s32 vSize) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     char* _buffer = *buffer;
 
     while (String::isAsciiWhitespace(*_buffer)) _buffer++; 
@@ -181,7 +183,7 @@ static s32 deserializeString(char** buffer, char* v, s32 vSize) {
     return i;
 }
 static s32 deserializeString(char** buffer, char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     char* _buffer = *buffer;
     while (String::isAsciiWhitespace(*_buffer)) _buffer++;
     massert(_buffer[0] == '"', "expecting to deserialize a string, but found something other than a double quote");
@@ -199,15 +201,15 @@ static s32 deserializeString(char** buffer, char* v) {
     return i;
 }
 void deserialize(char** buffer, char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     deserializeString(buffer, v);
 }
 void deserialize(char** buffer, const char* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     deserializeString(buffer, (char*) v);
 }
 void deserialize(char** buffer, char** v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     
     DESERIALIZE_HANDLE_NULL(buffer, v);
     
@@ -216,7 +218,7 @@ void deserialize(char** buffer, char** v) {
     *v = String::cpy(SERIALIZE_SCRATCH_BUFFER, (u32) i);
 }
 void deserialize(char** buffer, const char** v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     
     DESERIALIZE_HANDLE_NULL(buffer, (char*) v); // error: readonly variable is not assignable
     
@@ -231,33 +233,33 @@ void deserialize(char** buffer, const char** v) {
 // have that template parameter == 3, so everything below becomes unresolved symbols if 
 // I don't do the nasty template garbage here
 void serialize(String* str, glm::vec<2, float, (glm::qualifier) (glm::qualifier) 3> v) { 
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     str->appendf(getFormatStringOut(v), v[0], v[1]); 
 }
 void serialize(String* str, glm::vec<3, float, (glm::qualifier) (glm::qualifier) 3> v) { 
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     str->appendf(getFormatStringOut(v), v[0], v[1], v[2]); 
 }
 void serialize(String* str, glm::vec<4, float, (glm::qualifier) 3> v) { 
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     str->appendf(getFormatStringOut(v), v[0], v[1], v[2], v[3]); 
 }
 
 void serialize(String* str, glm::mat<2, 2, float, (glm::qualifier) 3> v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     str->appendf(getFormatStringOut(v)
         , v[0][0], v[0][1]
         , v[1][0], v[1][1]);
 }
 void serialize(String* str, glm::mat<3, 3, float, (glm::qualifier) 3> v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     str->appendf(getFormatStringOut(v)
         , v[0][0], v[0][1], v[0][2]
         , v[1][0], v[1][1], v[1][2]
         , v[2][0], v[2][1], v[2][2]);
 }
 void serialize(String* str, glm::mat<4, 4, float, (glm::qualifier) 3> v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     str->appendf(getFormatStringOut(v)
         , v[0][0], v[0][1], v[0][2], v[0][3]
         , v[1][0], v[1][1], v[1][2], v[1][3]
@@ -266,21 +268,21 @@ void serialize(String* str, glm::mat<4, 4, float, (glm::qualifier) 3> v) {
 }
 
 void deserialize(char** buffer, glm::vec<2, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     float* _v = (float*) v;
     for (u32 i = 0; i < 2; i++) {
         deserialize(buffer, _v + i);
     }
 }
 void deserialize(char** buffer, glm::vec<3, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     float* _v = (float*) v;
     for (u32 i = 0; i < 3; i++) {
         deserialize(buffer, _v + i);
     }
 }
 void deserialize(char** buffer, glm::vec<4, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     float* _v = (float*) v;
     for (u32 i = 0; i < 4; i++) {
         deserialize(buffer, _v + i);
@@ -288,21 +290,21 @@ void deserialize(char** buffer, glm::vec<4, float, (glm::qualifier) 3>* v) {
 }
 
 void deserialize(char** buffer, glm::mat<2, 2, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     float* m = (float*) v;
     for (u32 i = 0; i < 4; i++) {
         deserialize(buffer, m + i);
     }
 }
 void deserialize(char** buffer, glm::mat<3, 3, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     float* m = (float*) v;
     for (u32 i = 0; i < 9; i++) {
         deserialize(buffer, m + i);
     }
 }
 void deserialize(char** buffer, glm::mat<4, 4, float, (glm::qualifier) 3>* v) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     float* m = (float*) v;
     for (u32 i = 0; i < 16; i++) {
         deserialize(buffer, m + i);
@@ -313,4 +315,5 @@ void deserialize(char** buffer, glm::mat<4, 4, float, (glm::qualifier) 3>* v) {
 #undef SERIALIZE_H_DESERIALIZE_FUNC_BODY
 
 #endif
+#endif // ULE_CONFIG_OPTION_SERIALIZATION
 
diff --git a/serialize.h b/serialize.h
index 47fc441..88c66f1 100644
--- a/serialize.h
+++ b/serialize.h
@@ -1,12 +1,15 @@
 
-#ifndef SERIALIZE_H
-#define SERIALIZE_H
+#ifndef ULE_SERIALIZE_H
+#define ULE_SERIALIZE_H
 
 
+#include "config.h" // pulled in before the feature check so the result does not depend on include order
+#ifdef ULE_CONFIG_OPTION_SERIALIZATION
 #include "print.h"
 #include "types.h"
 #include "string.h"
 
+
 /*
     NOTES ON SERIALIZATION
     after wrestling with various reflection libraries for a week, I decided to use none of them.
@@ -72,13 +75,13 @@ extern void serialize(String* str, T v); \
 extern void deserialize(char** buffer, T* v); \
 extern void deserialize(char** buffer, T** v); \
 static void serializePrint(T* v) { \
-    TYPES_H_FTAG; \
+    ULE_TYPES_H_FTAG; \
     String str = String(""); \
     serialize(&str, v); \
     println(str.c_str()); \
 } \
 static bool serializeEquals(T* t1, T* t2) { \
-    TYPES_H_FTAG; \
+    ULE_TYPES_H_FTAG; \
     String s1 = String128f(""); \
     String s2 = String128f(""); \
     serialize(&s1, t1); \
@@ -89,7 +92,7 @@ static bool serializeEquals(T* t1, T* t2) { \
 // if you implement deserialize with a T*.
 #define SERIALIZE_H_HELPER_CLONE_T_POINTER(T) \
 static void serializeClone(T* orig, T* destination) { \
-    TYPES_H_FTAG; \
+    ULE_TYPES_H_FTAG; \
     String str = String128f(""); \
     serialize(&str, orig); \
     char* buffer = str.c_str(); \
@@ -99,7 +102,7 @@ static void serializeClone(T* orig, T* destination) { \
 // if you implement deserialize with a T**.
 #define SERIALIZE_H_HELPER_CLONE_T_DOUBLE_POINTER(T) \
 static void serializeClone(T* orig, T** destination) { \
-    TYPES_H_FTAG; \
+    ULE_TYPES_H_FTAG; \
     String str = String128f(""); \
     serialize(&str, orig); \
     char* buffer = str.c_str(); \
@@ -206,3 +209,5 @@ extern void deserialize(char** buffer, glm::mat<4, 4, float, (glm::qualifier) 3>
 
 #endif
 
+#endif
+
diff --git a/signal-handler.h b/signal-handler.h
index 21d52f5..0f3266a 100644
--- a/signal-handler.h
+++ b/signal-handler.h
@@ -1,9 +1,10 @@
 
-#ifndef SIGNAL_HANDLER_H
-#define SIGNAL_HANDLER_H
+#ifndef ULE_SIGNAL_HANDLER_H
+#define ULE_SIGNAL_HANDLER_H
 
 #include <signal.h> // for signal() and the SIG macros
 
+#include "config.h"
 #include "types.h"
 #include "print.h"
 
@@ -11,7 +12,7 @@
 // the running process can receive and respond to a variety of platform-dependent 'signals' during runtime from the OS.
 // freebsd has something like 30 signals, windows has a subset, just 6. we'll just deal with 6.
 static inline void defaultHandler(s32 signal) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG; 
     switch (signal) {
         case SIGSEGV:
         case SIGABRT:
@@ -37,7 +38,7 @@ static inline void defaultHandler(s32 signal) {
 }
 
 static void setSignalHandlers(void(*handler)(s32 signal) = defaultHandler) {
-    TYPES_H_FTAG; 
+    ULE_TYPES_H_FTAG;
     if (signal(SIGSEGV, handler) == SIG_ERR) die("failed to set SIGSEGV handler... zzz...\n");
     if (signal(SIGABRT, handler) == SIG_ERR) die("failed to set SIGABRT handler... zzz...\n");
     if (signal(SIGFPE,  handler) == SIG_ERR) die("failed to set SIGFPE handler... zzz...\n");
diff --git a/sse_mathfun.h b/sse_mathfun.h
new file mode 100644
index 0000000..5d87179
--- /dev/null
+++ b/sse_mathfun.h
@@ -0,0 +1,710 @@
+/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+
+   The default is to use the SSE1 version. If you define USE_SSE2,
+   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
+   not expect any significant performance improvement with SSE2.
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <xmmintrin.h>
+
+/* yes I know, the top of this file is quite ugly */
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+/* __m128 is ugly to write */
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+
+#ifdef USE_SSE2
+# include <emmintrin.h>
+typedef __m128i v4si; // vector of 4 int (sse2)
+#else
+typedef __m64 v2si;   // vector of 2 int (mmx)
+#endif
+
+/* Each macro below declares a static, 16-byte-aligned 4-element array (_ps_<Name> or _pi32_<Name>) holding Val in every element. */
+#define _PS_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* bit pattern of the smallest normalized (non-denormal) positive float */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000);      // bits 23..30, i.e. the binary32 exponent field (despite the name)
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); // sign bit + 23 fraction bits; log_ps uses it to strip the exponent
+
+_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); // bit 31 only
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); // every bit except bit 31; AND-ing with it gives |x|
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f); // 127, the binary32 exponent bias
+
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524); // sqrt(1/2)
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);      // p0..p8: polynomial coefficients evaluated by log_ps
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4);       // q1 + q2 ~= ln(2); log_ps multiplies the exponent by each part separately
+_PS_CONST(cephes_log_q2, 0.693359375);
+
+#ifndef USE_SSE2
+typedef union xmm_mm_union { // overlays one 128-bit __m128 with two 64-bit __m64 values
+  __m128 xmm;
+  __m64 mm[2];
+} xmm_mm_union;
+/* COPY_XMM_TO_MM: write xmm_ into the union, then read its two 64-bit halves back as mm0_ (mm[0]) and mm1_ (mm[1]). */
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+    xmm_mm_union u; u.xmm = xmm_;                   \
+    mm0_ = u.mm[0];                                 \
+    mm1_ = u.mm[1];                                 \
+}
+/* COPY_MM_TO_XMM: the inverse; store mm0_ and mm1_ into mm[0] and mm[1], then read the whole union back as xmm_. */
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+  }
+
+#endif // !USE_SSE2 (helpers for the MMX code path only)
+
+/* log_ps: natural logarithm of each of the 4 packed floats in x.
+   Lanes where x <= 0 come back with every bit set (a NaN), via invalid_mask.
+*/
+v4sf log_ps(v4sf x) { // NOTE(review): non-inline definition in a header; confirm it is included from a single .cpp only
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1; /* 1.0f in all four lanes */
+
+  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); /* all-ones in lanes where x <= 0 */
+
+  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
+
+#ifndef USE_SSE2
+  /* part 1: x = frexpf(x, &e); */
+  COPY_XMM_TO_MM(x, mm0, mm1);
+  mm0 = _mm_srli_pi32(mm0, 23); /* shift the exponent field down to bit 0 (still biased) */
+  mm1 = _mm_srli_pi32(mm1, 23);
+#else
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); /* shift the exponent field down to bit 0 (still biased) */
+#endif
+  /* keep only the fractional part */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+  x = _mm_or_ps(x, *(v4sf*)_ps_0p5); /* OR in the bit pattern of 0.5f, so x now lies in [0.5, 1) */
+
+#ifndef USE_SSE2
+  /* subtract the bias (0x7f): e = mm0:mm1 now holds the unbiased base-2 exponent */
+  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
+  _mm_empty(); /* bye bye mmx */
+#else
+  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f); /* subtract the bias (0x7f) */
+  v4sf e = _mm_cvtepi32_ps(emm0);
+#endif
+
+  e = _mm_add_ps(e, one); /* +1 because the mantissa was normalized to [0.5, 1) rather than [1, 2) */
+
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF); /* all-ones in lanes taking the "x < SQRTHF" branch */
+  v4sf tmp = _mm_and_ps(x, mask);                          /* x in those lanes, 0 elsewhere */
+  x = _mm_sub_ps(x, one);
+  e = _mm_sub_ps(e, _mm_and_ps(one, mask));                /* e -= 1 in masked lanes only */
+  x = _mm_add_ps(x, tmp);                                  /* x + x - 1 in masked lanes, x - 1 otherwise */
+
+
+  v4sf z = _mm_mul_ps(x,x);
+
+  v4sf y = *(v4sf*)_ps_cephes_log_p0; /* Horner evaluation of P(x) with coefficients p0..p8 */
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
+  y = _mm_mul_ps(y, x);
+
+  y = _mm_mul_ps(y, z); /* y = x^3 * P(x) */
+  
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
+  y = _mm_add_ps(y, tmp); /* y += e * q1 */
+
+
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp); /* y -= 0.5 * x^2 */
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
+  x = _mm_add_ps(x, y);
+  x = _mm_add_ps(x, tmp); /* result = x + y + e * q2 */
+  x = _mm_or_ps(x, invalid_mask); // lanes with x <= 0 get all bits set, i.e. NaN
+  return x;
+}
+
+_PS_CONST(exp_hi,	88.3762626647949f);  // upper clamp applied to x in exp_ps
+_PS_CONST(exp_lo,	-88.3762626647949f); // lower clamp applied to x in exp_ps
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341); // log2(e)
+_PS_CONST(cephes_exp_C1, 0.693359375);         // C1 + C2 ~= ln(2); exp_ps subtracts fx*C1 and fx*C2 separately
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);     // p0..p5: polynomial coefficients evaluated by exp_ps
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+/* exp_ps: e^x for each of the 4 packed floats in x; x is first clamped to [exp_lo, exp_hi]. */
+v4sf exp_ps(v4sf x) { // NOTE(review): non-inline definition in a header; confirm it is included from a single .cpp only
+  v4sf tmp = _mm_setzero_ps(), fx;
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1; /* 1.0f in all four lanes */
+
+  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
+  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5); /* fx = x * log2(e) + 0.5 */
+
+  /* how to perform a floorf with SSE: just below */
+#ifndef USE_SSE2
+  /* step 1 : cast to int */
+  tmp = _mm_movehl_ps(tmp, fx); /* upper two lanes of fx moved into the lower two lanes of tmp */
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(tmp);
+  /* step 2 : cast back to float */
+  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+#else
+  emm0 = _mm_cvttps_epi32(fx);   /* truncate toward zero */
+  tmp  = _mm_cvtepi32_ps(emm0);
+#endif
+  /* if the truncated value is greater than fx, subtract 1 */
+  v4sf mask = _mm_cmpgt_ps(tmp, fx);    
+  mask = _mm_and_ps(mask, one); /* 1.0f where tmp > fx, 0 elsewhere */
+  fx = _mm_sub_ps(tmp, mask);   /* fx = floor(fx), i.e. n */
+
+  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
+  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
+  x = _mm_sub_ps(x, tmp);
+  x = _mm_sub_ps(x, z); /* x = g = x - n*C1 - n*C2 */
+
+  z = _mm_mul_ps(x,x);
+  
+  v4sf y = *(v4sf*)_ps_cephes_exp_p0; /* Horner evaluation of P(x) with coefficients p0..p5 */
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, x);
+  y = _mm_add_ps(y, one); /* y = 1 + x + x^2 * P(x) */
+
+  /* build 2^n */
+#ifndef USE_SSE2
+  z = _mm_movehl_ps(z, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(z);
+  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f); /* add the exponent bias (0x7f) */
+  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+  mm0 = _mm_slli_pi32(mm0, 23); /* move the biased exponent up to bit 23, where the exponent field starts */
+  mm1 = _mm_slli_pi32(mm1, 23);
+  
+  v4sf pow2n; 
+  COPY_MM_TO_XMM(mm0, mm1, pow2n);
+  _mm_empty();
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f); /* add the exponent bias (0x7f) */
+  emm0 = _mm_slli_epi32(emm0, 23);                /* move it up to bit 23, where the exponent field starts */
+  v4sf pow2n = _mm_castsi128_ps(emm0);            /* reinterpret the bits as float: 2^n */
+#endif
+  y = _mm_mul_ps(y, pow2n); /* exp(x) = exp(g) * 2^n */
+  return y;
+}
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);               // DP1 + DP2 + DP3 ~= Pi/4; stored negated so the reduction can use adds
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);                 // sincof_*: coefficients of the sine polynomial
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);          // coscof_*: coefficients of the cosine polynomial
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
+   it runs also on old athlons XPs and the pentium III of your grand
+   mother.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Performance is also surprisingly good, 1.33 times faster than the
+   macos vsinf SSE2 function, and 1.5 times faster than the
+   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
+   too bad for an SSE1 function (with no special tuning) !
+   However the latter libraries probably have a much better handling of NaN,
+   Inf, denormalized and other special arguments..
+
+   On my core 1 duo, the execution of this function takes approximately 95 cycles.
+
+   From what I have observed on the experiments with Intel AMath lib, switching to an
+   SSE2 version would improve the perf by only 10%.
+
+   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
+   deliver full speed.
+*/
+v4sf sin_ps(v4sf x) { // any x; NOTE(review): non-inline definition in a header; confirm it is included from a single .cpp only
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29); /* bit 2 of j (value 4) shifted into bit 31, the float sign bit */
+  /* get the polynom selection mask 
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); /* all-ones where (j & 2) == 0 */
+  
+  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  /* get the swap sign flag */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29); /* bit 2 of j (value 4) shifted into bit 31, the float sign bit */
+  mm1 = _mm_slli_pi32(mm1, 29);
+  /* get the polynom selection mask */
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); /* all-ones where (j & 2) == 0 */
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf swap_sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1; /* the constants are stored negated, hence the adds below */
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the cosine polynomial (coscof coefficients): y = 1 - z/2 + z^2 * P(z), with z = x^2 */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the sine polynomial (sincof coefficients): y2 = x + x * z * Q(z) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); // keep the sine polynomial where poly_mask is set
+  y = _mm_andnot_ps(xmm3, y); /* keep the cosine polynomial in the other lanes */
+  y = _mm_add_ps(y,y2);       /* the lane sets are disjoint, so the add merges them */
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+  return y;
+}
+
+/* cos_ps: packed-float cosine, one result per lane of x; almost the same as sin_ps */
+v4sf cos_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  /* take the absolute value (the original sign of x is not used again below) */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+  
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 (truncating conversion) */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2); /* the flags below are derived from j - _pi32_2, not from j itself */
+  
+  /* get the swap sign flag: (~emm2) & _pi32_4 */
+  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29); /* assuming _pi32_4 holds 4 (bit 2), the shift by 29 moves it to bit 31, the float sign bit */
+  /* get the polynom selection mask: all ones in lanes where (emm2 & _pi32_2) == 0 */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 (low two lanes in mm2, high two lanes in mm3) */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2); /* the flags below are derived from j - _pi32_2, not from j itself */
+  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+  /* get the swap sign flag in mm0:mm1 and the 
+     polynom selection mask in mm2:mm3 */
+
+  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29); /* assuming _pi32_4 holds 4 (bit 2), the shift by 29 moves it to bit 31, the float sign bit */
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  v4sf sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; (done with adds: the _ps_minus_cephes_DP* constants are presumably stored negated) */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+  /* with z = x*x and the coscof coefficients: y = z*z*(p0*z*z + p1*z + p2) - z*_ps_0p5 + _ps_1 */
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynom  (presumably -Pi/4 <= x <= 0; the range as originally written was empty) */
+  /* with the sincof coefficients: y2 = x + x*z*(p0*z*z + p1*z + p2) */
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms: lanes with poly_mask set keep y2, the others keep y */
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2); /* each lane is zero in exactly one operand, so the add merges the two selections */
+  /* update the sign (the XOR flips the sign in lanes where sign_bit is set) */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
+   it is almost as fast, and gives you a free cosine with your sine (sine is written to *s, cosine to *c) */
+void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2, emm4;
+#else
+  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+#endif
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) of the original x; it only feeds the sine result */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 (truncating conversion) */
+  emm2 = _mm_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm4 = emm2; /* keep a copy of j: the cosine sign flag is derived from it further down */
+
+  /* get the swap sign flag for the sine */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29); /* assuming _pi32_4 holds 4 (bit 2), the shift by 29 moves it to bit 31, the float sign bit */
+  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+  /* get the polynom selection mask for the sine: all ones in lanes where (j & _pi32_2) == 0 */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 (low two lanes in mm2, high two lanes in mm3) */
+  xmm3 = _mm_movehl_ps(xmm3, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm3);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+  mm4 = mm2; /* keep a copy of j in mm4:mm5: the cosine sign flag is derived from it further down */
+  mm5 = mm3;
+
+  /* get the swap sign flag for the sine */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29); /* assuming _pi32_4 holds 4 (bit 2), the shift by 29 moves it to bit 31, the float sign bit */
+  mm1 = _mm_slli_pi32(mm1, 29);
+  v4sf swap_sign_bit_sin;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+  /* get the polynom selection mask for the sine */
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf poly_mask;
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+#endif
+
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; (done with adds: the _ps_minus_cephes_DP* constants are presumably stored negated) */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+#ifdef USE_SSE2
+  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2); /* get the sign flag for the cosine, from the saved j minus _pi32_2 */
+  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+  emm4 = _mm_slli_epi32(emm4, 29);
+  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
+#else
+  /* get the sign flag for the cosine, from the saved j minus _pi32_2 */
+  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+  mm4 = _mm_slli_pi32(mm4, 29);
+  mm5 = _mm_slli_pi32(mm5, 29);
+  v4sf sign_bit_cos;
+  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+  _mm_empty(); /* good-bye mmx */
+#endif
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* final sine sign = sign of the input XOR the swap flag */
+
+  
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  v4sf z = _mm_mul_ps(x,x);
+  y = *(v4sf*)_ps_coscof_p0;
+  /* with z = x*x and the coscof coefficients: y = z*z*(p0*z*z + p1*z + p2) - z*_ps_0p5 + _ps_1 */
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynom  (presumably -Pi/4 <= x <= 0; the range as originally written was empty) */
+  /* with the sincof coefficients: y2 = x + x*z*(p0*z*z + p1*z + p2) */
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms: the sine takes y2 where poly_mask is set and y elsewhere; the cosine takes the opposite choice */
+  xmm3 = poly_mask;
+  v4sf ysin2 = _mm_and_ps(xmm3, y2);
+  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2); /* removes the part the sine took: y2 now survives only where poly_mask is clear */
+  y = _mm_sub_ps(y, ysin1); /* likewise: y now survives only where poly_mask is set */
+
+  xmm1 = _mm_add_ps(ysin1,ysin2); /* sine magnitude */
+  xmm2 = _mm_add_ps(y,y2); /* cosine magnitude */
+ 
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
diff --git a/string.h b/string.h
index cbf78a0..5d20deb 100644
--- a/string.h
+++ b/string.h
@@ -1,11 +1,12 @@
 
-#ifndef STRING_H
-#define STRING_H
+#ifndef ULE_STRING_H
+#define ULE_STRING_H
 
+#include "config.h"
 #include "types.h"
 #include "alloc.h"
 
-#include <string.h> // @TODO remove this
+//#include <string.h> // @TODO remove this
 
 #define STB_SPRINTF_IMPLEMENTATION
 #define STB_SPRINTF_STATIC
@@ -21,6 +22,7 @@
 
 // 'String' is a datatype, but it also is a namespace for a bunch of static 'char*' operations that 
 // you would normally find in the <cstring> or <string.h> header
+// The datatype is a modified version of a string class developed by Omar Cornut: https://github.com/ocornut/str
 class String {
 public:
     // Static empty buffer we can point to for empty strings
@@ -49,7 +51,7 @@ public:
     };
 
     static inline s32 sprintf(char* buffer, const char* format, ...) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         va_list args;
         va_start(args, format);
 
@@ -59,7 +61,7 @@ public:
         return code;
     }
     static inline s32 snprintf(char* buffer, s32 count, const char* format, ...) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         va_list args;
         va_start(args, format);
 
@@ -73,31 +75,31 @@ public:
     }
 
     static inline bool isDigit(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return (c >= '0') && (c <= '9');
     }
     static inline bool isAlpha(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return (c >= 'A' && c <= 'Z')
             || (c >= 'a' && c <= 'z');
     }
     static inline bool isHexDigit(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return ((c >= '0') && (c <= '9'))
             || ((c >= 'A') && (c <= 'F'))
             || ((c >= 'a') && (c <= 'f'));
     }
     static inline bool isOctalDigit(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return (c >= '0') && (c <= '7');
     }
     static inline bool isBinaryDigit(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return c == '0' || c == '1';
     }
 
     static inline char* intToString(u64 integer) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 capacity = 10;
         u32* remainders = (u32*) pMalloc(sizeof (u32) * capacity);
 
@@ -124,7 +126,7 @@ public:
     }
 
     static inline u64 hexStringToInt(const char* str) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u64 out = 0;
 
         while (*str != '\0') {
@@ -148,7 +150,7 @@ public:
     }
 
     static inline u32 len(const char* string) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         const char* start = string;
         while (*string++ != '\0') {}
         return (u32) (string - start);
@@ -156,7 +158,7 @@ public:
 
     // returns true if null-terminated strings |s1| and |s2| are equal
     static inline bool eq(const char* s1, const char* s2) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 l1 = String::len(s1);
         u32 l2 = String::len(s2);
 
@@ -173,7 +175,7 @@ public:
 
     // same as |eq|, but handles |s1| and/or |s2| being null
     static inline bool eqNullCheck(const char* s1, const char* s2) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         if (s1 == null) {
             if (s2 == null) {
                 return true;
@@ -189,7 +191,7 @@ public:
 
     // heap allocates a copy of |string| and returns a pointer to it.
     static inline char* cpy(const char* string, u32 length, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         char* buffer = (char*) allocator->mallocate(sizeof (char) * (length + 1), allocator->state);
 
         u32 i = 0;
@@ -202,18 +204,18 @@ public:
 
     // heap allocates a copy of |string| and returns a pointer to it.
     static inline char* cpy(const char* string, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 len = String::len(string);
 
         return String::cpy(string, len, allocator = Allocator::GetDefault());
     }
 
     static inline bool memeq(const unsigned char* m1, const unsigned char* m2, size_t length) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return memcmp(m1, m2, length) == 0;
     }
     static inline bool memeq(const unsigned char* m1, size_t l1, const unsigned char* m2, size_t l2) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         if (l1 != l2) return false;
 
         return memeq(m1, m2, l1);
@@ -221,13 +223,13 @@ public:
 
     #ifdef _WIN32
     static inline size_t wcharToChar(wchar_t* wstring, char* buffer, size_t maxBufferLength) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return wcstombs(buffer, wstring, maxBufferLength);
     }
     #endif
 
     static inline void* memset(void* p, char c, u32 length) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         //__stosb((unsigned char*) p, c, length);
         char* a = (char*) p;
         for (u32 i = 0; i < length; i++) a[i] = c;
@@ -236,7 +238,7 @@ public:
 
 
     static inline void memcpy(void* dest, void* src, u32 size) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u8* dest_ = (u8*) dest;
         u8* src_ = (u8*) src;
 
@@ -247,7 +249,7 @@ public:
 
     // replace all instances of |c1| in |string| with |c2|
     static inline void replaceC(char* string, u32 length, char c1, char c2) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (u32 i = 0; i < length; i++) {
             if (string[i] == c1) {
                 string[i] = c2;
@@ -256,7 +258,7 @@ public:
     }
 
     static inline const char* firstCharOccurence(const char* string, u32 length, char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (s32 i = 0; i < length; i++) {
             const char* s = string + i;
             if (*s == c) {
@@ -267,12 +269,12 @@ public:
     }
 
     static inline const char* firstCharOccurence(const char* string, char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return String::firstCharOccurence(string, String::len(string), c);
     }
 
     static inline const char* lastCharOccurence(const char* string, u32 length, char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (s32 i = length - 1; i >= 0; i--) { // @NOTE 'i' needs to be a signed int here...
             if (*(string + i) == c) {
                 return string + i;
@@ -282,19 +284,19 @@ public:
     }
 
     static inline const char* lastCharOccurence(const char* string, char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return String::lastCharOccurence(string, String::len(string), c);
     }
 
     static inline bool hasSuffix(const char* string, const char* suffix) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         const char* p = String::lastCharOccurence(string, String::len(string), suffix[0]);
         if (p) return String::eq(p, suffix);
         return false;
     }
 
     static inline u32 countLines(const char* buffer) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 lines = 0;
         char c;
 
@@ -306,7 +308,7 @@ public:
     }
 
     static inline bool isAscii(const char* buffer, u32 length) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         const unsigned char* ubuffer = (const unsigned char*) buffer;
         for (u32 i = 0; i < length; i++) {
             if (ubuffer[i] & 128) { // binary: 0b 1000 0000
@@ -317,7 +319,7 @@ public:
     }
 
     static inline bool isAsciiWhitespace(char c) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         switch (c) {
             //case '\b':
             //case '\v':
@@ -339,7 +341,7 @@ public:
     //static inline char* trimStart(const char* str, u32 count);
     //static inline char* trimEnd(const char* str, u32 count);
     static inline char* trim(const char* str, u32 count, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 length = String::len(str);
 
         if (length <= count) {
@@ -359,7 +361,7 @@ public:
     }
 
     static inline char* asciiToLower(const char* str, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 length = String::len(str);
         char* buffer = (char*) allocator->mallocate(sizeof (char) * length + 1, allocator->state);
         u32 i = 0;
@@ -371,7 +373,7 @@ public:
     }
 
     static inline char* asciiToUpper(const char* str, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 length = String::len(str);
         char* buffer = (char*) allocator->mallocate(sizeof (char) * length + 1, allocator->state);
         u32 i = 0;
@@ -383,7 +385,7 @@ public:
     }
 
     static inline char* concat(const char* str1, const char* str2, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 l1 = String::len(str1);
         u32 l2 = String::len(str2);
         u32 newLength = l1 + l2;
@@ -401,7 +403,7 @@ public:
     }
 
     static inline u32 write(char* dest, const char* src, u32 length) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 i = 0;
         for (; i < length; i++) {
             dest[i] = src[i];
@@ -412,13 +414,13 @@ public:
 
     // returns the number of characters written.
     static inline u32 write(char* dest, const char* src) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         u32 length = String::len(src);
         return String::write(dest, src, length);
     }
 
     static inline char* read(const char* buffer, u32 length, Allocator* allocator = Allocator::GetDefault()) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         char* tk = (char*) allocator->mallocate(sizeof (char) * length + 1, allocator->state);
         u32 i = 0;
         while (i < length) {
@@ -434,38 +436,38 @@ public:
     int                 LocalBufSize : 10;      // Max 1023 bytes
     unsigned int        Owned : 1;              // Set when we have ownership of the pointed data (most common, unless using set_ref() method or StringRef constructor)
 
-    inline char*        c_str()                                 { TYPES_H_FTAG; return Data; }
-    inline const char*  c_str() const                           { TYPES_H_FTAG; return Data; }
-    inline bool         empty() const                           { TYPES_H_FTAG; return Data[0] == 0; }
-    inline int          length() const                          { TYPES_H_FTAG; return (int)strlen(Data); }    // by design, allow user to write into the buffer at any time
-    inline int          capacity() const                        { TYPES_H_FTAG; return Capacity; }
-    inline bool         owned() const                           { TYPES_H_FTAG; return Owned ? true : false; }
+    inline char*        c_str()                                 { ULE_TYPES_H_FTAG; return Data; }
+    inline const char*  c_str() const                           { ULE_TYPES_H_FTAG; return Data; }
+    inline bool         empty() const                           { ULE_TYPES_H_FTAG; return Data[0] == 0; }
+    inline int          length() const                          { ULE_TYPES_H_FTAG; return (int)strlen(Data); }    // by design, allow user to write into the buffer at any time
+    inline int          capacity() const                        { ULE_TYPES_H_FTAG; return Capacity; }
+    inline bool         owned() const                           { ULE_TYPES_H_FTAG; return Owned ? true : false; }
 
-    inline char&        operator[](size_t i)                    { TYPES_H_FTAG; return Data[i]; }
-    inline char         operator[](size_t i) const              { TYPES_H_FTAG; return Data[i]; }
-    inline String&      operator=(const String& rhs)            { TYPES_H_FTAG; set(rhs); return *this; }
-    inline bool         operator==(const String& rhs) const     { TYPES_H_FTAG; return strcmp(c_str(), rhs.c_str()) == 0; }
-    inline String&      operator=(const char* rhs)              { TYPES_H_FTAG; set(rhs); return *this; }
-    inline bool         operator==(const char* rhs) const       { TYPES_H_FTAG; return strcmp(c_str(), rhs) == 0; }
+    inline char&        operator[](size_t i)                    { ULE_TYPES_H_FTAG; return Data[i]; }
+    inline char         operator[](size_t i) const              { ULE_TYPES_H_FTAG; return Data[i]; }
+    inline String&      operator=(const String& rhs)            { ULE_TYPES_H_FTAG; set(rhs); return *this; }
+    inline bool         operator==(const String& rhs) const     { ULE_TYPES_H_FTAG; return strcmp(c_str(), rhs.c_str()) == 0; }
+    inline String&      operator=(const char* rhs)              { ULE_TYPES_H_FTAG; set(rhs); return *this; }
+    inline bool         operator==(const char* rhs) const       { ULE_TYPES_H_FTAG; return strcmp(c_str(), rhs) == 0; }
 
     inline String() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         Data = EmptyBuffer; // Shared READ-ONLY initial buffer for 0 capacity
         Capacity = 0;
         LocalBufSize = 0;
         Owned = 0;
     }
     inline String(const String& rhs) : String() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         set(rhs);
     }
     inline String(const char* rhs) : String() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         set(rhs);
     }
 
     inline void set_ref(const char* src) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         if (Owned && !is_using_local_buf())
             STR_MEMFREE(Data);
         Data = src ? (char*)src : EmptyBuffer;
@@ -473,7 +475,7 @@ public:
         Owned = 0;
     }
     inline void set(const String& src) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         int buf_len = (int)strlen(src.c_str())+1;
         if ((int)Capacity < buf_len)
             reserve_discard(buf_len);
@@ -482,7 +484,7 @@ public:
     }
 
     inline void set(const char* src) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         // We allow set(NULL) or via = operator to clear the string.
         if (src == NULL)
         {
@@ -497,7 +499,7 @@ public:
     }
 
     inline void set(const char* src, const char* src_end) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         STR_ASSERT(src != NULL && src_end >= src);
         int buf_len = (int)(src_end-src)+1;
         if ((int)Capacity < buf_len)
@@ -509,7 +511,7 @@ public:
 
     // Clear
     inline void clear() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         if (Owned && !is_using_local_buf())
             STR_MEMFREE(Data);
         if (LocalBufSize) {
@@ -526,7 +528,7 @@ public:
 
     // Reserve memory, preserving the current of the buffer
     inline void reserve(int new_capacity) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         if (new_capacity <= Capacity)
             return;
 
@@ -558,7 +560,7 @@ public:
 
     // Reserve memory, discarding the current of the buffer (if we expect to be fully rewritten)
     inline void reserve_discard(int new_capacity) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         if (new_capacity <= Capacity)
             return;
 
@@ -578,7 +580,7 @@ public:
     }
 
     inline void shrink_to_fit() {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         if (!Owned || is_using_local_buf()) return;
         int new_capacity = length() + 1;
         if (Capacity <= new_capacity) return;
@@ -592,7 +594,7 @@ public:
 
     // FIXME: merge setfv() and appendfv()?
     inline int setfv(const char* fmt, va_list args) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         // Needed for portability on platforms where va_list are passed by reference and modified by functions
         va_list args2;
         va_copy(args2, args);
@@ -612,7 +614,7 @@ public:
     }
 
     inline int setf(const char* fmt, ...) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         va_list args;
         va_start(args, fmt);
         int len = setfv(fmt, args);
@@ -621,7 +623,7 @@ public:
     }
 
     inline int setfv_nogrow(const char* fmt, va_list args) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         STR_ASSERT(Owned);
 
         if (Capacity == 0) return 0;
@@ -633,7 +635,7 @@ public:
     }
 
     inline int setf_nogrow(const char* fmt, ...) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         va_list args;
         va_start(args, fmt);
         int len = setfv_nogrow(fmt, args);
@@ -642,7 +644,7 @@ public:
     }
 
     inline int append_from(int idx, char c) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         int add_len = 1;
         if (Capacity < idx + add_len + 1)
             reserve(idx + add_len + 1);
@@ -653,7 +655,7 @@ public:
     }
 
     inline int append_from(int idx, const char* s, const char* s_end) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         if (!s_end) s_end = s + strlen(s);
         int add_len = (int)(s_end - s);
         if (Capacity < idx + add_len + 1) reserve(idx + add_len + 1);
@@ -665,7 +667,7 @@ public:
 
     // FIXME: merge setfv() and appendfv()?
     inline int appendfv_from(int idx, const char* fmt, va_list args) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         // Needed for portability on platforms where va_list are passed by reference and modified by functions
         va_list args2;
         va_copy(args2, args);
@@ -684,7 +686,7 @@ public:
     }
 
     inline int appendf_from(int idx, const char* fmt, ...) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         va_list args;
         va_start(args, fmt);
         int len = appendfv_from(idx, fmt, args);
@@ -693,25 +695,25 @@ public:
     }
 
     inline int append(char c) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         int cur_len = length();
         return append_from(cur_len, c);
     }
 
     inline int append(const char* s, const char* s_end = null) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         int cur_len = length();
         return append_from(cur_len, s, s_end);
     }
 
     inline int appendfv(const char* fmt, va_list args) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         int cur_len = length();
         return appendfv_from(cur_len, fmt, args);
     }
 
     int appendf(const char* fmt, ...) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         va_list args;
         va_start(args, fmt);
         int len = appendfv(fmt, args);
@@ -727,13 +729,13 @@ public:
     }
 
 protected:
-    inline char*        local_buf()                             { TYPES_H_FTAG; return (char*)this + sizeof(String); }
-    inline const char*  local_buf() const                       { TYPES_H_FTAG; return (char*)this + sizeof(String); }
-    inline bool         is_using_local_buf() const              { TYPES_H_FTAG; return Data == local_buf() && LocalBufSize != 0; }
+    inline char*        local_buf()                             { ULE_TYPES_H_FTAG; return (char*)this + sizeof(String); }
+    inline const char*  local_buf() const                       { ULE_TYPES_H_FTAG; return (char*)this + sizeof(String); }
+    inline bool         is_using_local_buf() const              { ULE_TYPES_H_FTAG; return Data == local_buf() && LocalBufSize != 0; }
 
     // Constructor for StringXXX variants with local buffer
     String(unsigned short local_buf_size) {
-        TYPES_H_FTAG; 
+        ULE_TYPES_H_FTAG; 
         STR_ASSERT(local_buf_size < 1024);
         Data = local_buf();
         Data[0] = '\0';
@@ -746,7 +748,7 @@ protected:
 // Literal/reference string
 class StringRef : public String {
 public:
-    StringRef(const char* s) : String() { TYPES_H_FTAG; set_ref(s); }
+    StringRef(const char* s) : String() { ULE_TYPES_H_FTAG; set_ref(s); }
 };
 
 // Types embedding a local buffer
@@ -757,12 +759,12 @@ class TYPENAME : public String
     char local_buf[LOCALBUFSIZE];                                                                  \
 public:                                                                                            \
     TYPENAME() : String(LOCALBUFSIZE) {}                                                           \
-    TYPENAME(const String& rhs) : String(LOCALBUFSIZE)   { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME(const char* rhs) : String(LOCALBUFSIZE)     { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME(const TYPENAME& rhs) : String(LOCALBUFSIZE) { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME&   operator=(const char* rhs)               { TYPES_H_FTAG; set(rhs); return *this; } \
-    TYPENAME&   operator=(const String& rhs)             { TYPES_H_FTAG; set(rhs); return *this; } \
-    TYPENAME&   operator=(const TYPENAME& rhs)           { TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME(const String& rhs) : String(LOCALBUFSIZE)   { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME(const char* rhs) : String(LOCALBUFSIZE)     { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME(const TYPENAME& rhs) : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME&   operator=(const char* rhs)               { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME&   operator=(const String& rhs)             { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME&   operator=(const TYPENAME& rhs)           { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
 };
 
 // Disable PVS-Studio warning V730: Not all members of a class are initialized inside the constructor (local_buf is not initialized and that is fine)
@@ -773,7 +775,7 @@ public:
 class TYPENAME_F : public TYPENAME                                                  \
 {                                                                                   \
 public:                                                                             \
-    TYPENAME_F(const char* fmt, ...) : TYPENAME() { TYPES_H_FTAG; va_list args; va_start(args, fmt); setfv(fmt, args); va_end(args); } \
+    TYPENAME_F(const char* fmt, ...) : TYPENAME() { ULE_TYPES_H_FTAG; va_list args; va_start(args, fmt); setfv(fmt, args); va_end(args); } \
 };
 
 #ifdef __clang__
@@ -812,16 +814,16 @@ STR_DEFINETYPE_F(String32, String32f)
 class TYPENAME : public String {                                                    \
     char local_buf[LOCALBUFSIZE];                                                   \
 public:                                                                             \
-    TYPENAME(const char* fmt, ...) : String(LOCALBUFSIZE) { TYPES_H_FTAG; va_list args; va_start(args, fmt); setfv(fmt, args); va_end(args); } \
-    TYPENAME()                     : String(LOCALBUFSIZE) { TYPES_H_FTAG; }                         \
-    TYPENAME(const String& rhs)    : String(LOCALBUFSIZE) { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME(const char* rhs)      : String(LOCALBUFSIZE) { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME(const TYPENAME& rhs)  : String(LOCALBUFSIZE) { TYPES_H_FTAG; set(rhs); }               \
-    TYPENAME& operator=(const char* rhs)                  { TYPES_H_FTAG; set(rhs); return *this; } \
-    TYPENAME& operator=(const String& rhs)                { TYPES_H_FTAG; set(rhs); return *this; } \
-    TYPENAME& operator=(const TYPENAME& rhs)              { TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME(const char* fmt, ...) : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; va_list args; va_start(args, fmt); setfv(fmt, args); va_end(args); } \
+    TYPENAME()                     : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; }                         \
+    TYPENAME(const String& rhs)    : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME(const char* rhs)      : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME(const TYPENAME& rhs)  : String(LOCALBUFSIZE) { ULE_TYPES_H_FTAG; set(rhs); }               \
+    TYPENAME& operator=(const char* rhs)                  { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME& operator=(const String& rhs)                { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
+    TYPENAME& operator=(const TYPENAME& rhs)              { ULE_TYPES_H_FTAG; set(rhs); return *this; } \
     void reserve(int new_capacity) { \
-        TYPES_H_FTAG; \
+        ULE_TYPES_H_FTAG; \
         if (new_capacity <= Capacity) \
             return; \
         char* new_data; \
@@ -840,7 +842,7 @@ public:
         Owned = 1; \
     } \
     void reserve_discard(int new_capacity) { \
-        TYPES_H_FTAG; \
+        ULE_TYPES_H_FTAG; \
         if (new_capacity <= Capacity) \
             return; \
         if (Owned && !is_using_local_buf()) \
@@ -921,7 +923,7 @@ bool isUnicodeSpaceSeparator(char c) {
 //};
 ////================================================================================ 
 //StringBuffer::StringBuffer(u32 initialSize) {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    this->length = 0;
 //    this->capacity = initialSize;
 //    this->data = (char*) pMalloc(sizeof(char) * this->capacity);
@@ -938,7 +940,7 @@ bool isUnicodeSpaceSeparator(char c) {
 //}
 //
 //void StringBuffer::checkIfShouldGrow() {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    if (this->isFull()) {
 //        // optimal number as you approach infinite elements approaches PHI, but 1.5 sometimes works better for finite sizes
 //        // more testing is probably needed
@@ -948,17 +950,17 @@ bool isUnicodeSpaceSeparator(char c) {
 //}
 //
 //bool StringBuffer::isEmpty() const {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    return this->length == 0;
 //}
 //
 //bool StringBuffer::isFull() const {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    return this->length == this->capacity;
 //}
 //
 //char StringBuffer::pop() {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    if (this->isEmpty()) {
 //        die("empty");
 //    }
@@ -967,7 +969,7 @@ bool isUnicodeSpaceSeparator(char c) {
 //}
 //
 //u32 StringBuffer::append(char e) {
-//    TYPES_H_FTAG;
+//    ULE_TYPES_H_FTAG;
 //    this->checkIfShouldGrow();
 //
 //    this->data[this->length++] = e;
diff --git a/table.hpp b/table.hpp
index 0cd21ec..09de137 100644
--- a/table.hpp
+++ b/table.hpp
@@ -1,17 +1,17 @@
 
-#ifndef TABLE_H
-#define TABLE_H
+#ifndef ULE_TABLE_H
+#define ULE_TABLE_H
 
 #include <new> // new
 #include <functional> // std::function for traversal
 #include <type_traits> // std::enable_if
 
+#include "config.h"
 #include "alloc.h"
 #include "string.h"
 #include "types.h"
 
 
-
 // what follows is a collection of hash functions taken from: https://www.partow.net/programming/hashfunctions/#:~:text=The%20hash%20functions%20in%20this,containers%20such%20as%20hash%2Dtables.
 //
 // Available Hash Functions
@@ -203,7 +203,7 @@ static inline u32 fastModuloReductionDanielLemire(u32 v, u32 c) {
 }
 
 static inline u32 hash(const char* key, u32 keyLength, u32 capacity) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
 
     u32 value = APHash(key, keyLength);
 
@@ -233,18 +233,18 @@ struct Table {
     TableEntry<V>** entries;
 
     Table<V>(u32 _lanes = 16) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->lanes = _lanes;
         this->length = 0;
         this->entries = (TableEntry<V>**) pCalloc(sizeof(TableEntry<V>*), this->lanes);
     }
     void* operator new(size_t size) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return (Table<V>*) pMalloc(sizeof(Table<V>));
     }
 
     V insert(const char* key, u32 keyLength, V value) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         TableEntry<V>* entry = this->lookup(key, keyLength);
 
         if (!entry) { // no entry with that key exists
@@ -270,7 +270,7 @@ struct Table {
     }
 
     TableEntry<V>* lookup(const char* key, u32 keyLength) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         TableEntry<V>* entry = this->entries[hash(key, keyLength, lanes)];
 
         for (; entry != null; entry = entry->next) {
@@ -283,7 +283,7 @@ struct Table {
     }
 
     V lookupWithDefault(const char* key, u32 keyLength, V defaultValue) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         auto entry = this->lookup(key, keyLength);
 
         if (entry == null) return defaultValue;
@@ -294,7 +294,7 @@ struct Table {
-    // do not set |freeValues| to true unless the template parameter 'T' is a pointer,
+    // do not set |freeValues| to true unless the template parameter 'V' is a pointer,
     // and the table is responsible for freeing the memory.
     void clear(bool freeValues = false) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (u32 i = 0; i < this->lanes; i++) {
             TableEntry<V>** lane = &this->entries[i];
             TableEntry<V>* entry = *lane;
@@ -334,7 +334,7 @@ struct Table {
     }
 
     void traverse(const std::function <void (TableEntry<V>*)>& entryCallback) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (u32 i = 0; i < this->lanes; i++) {
             TableEntry<V>* entry = this->entries[i];
 
@@ -346,9 +346,10 @@ struct Table {
     }
 };
 
+#ifdef ULE_CONFIG_OPTION_SERIALIZATION
 template <typename T>
 static void serialize(String* str, Table<T> table) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     serialize(str, table.lanes);
     serialize(str, table.length);
     for (u32 i = 0; i < table.lanes; i++) {
@@ -364,7 +365,7 @@ static void serialize(String* str, Table<T> table) {
 
 template <typename T>
 static void serialize(String* str, Table<T>* table) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     SERIALIZE_HANDLE_NULL(str, table);
     serialize(str, table->lanes);
     serialize(str, table->length);
@@ -381,7 +382,7 @@ static void serialize(String* str, Table<T>* table) {
 
 template <typename T>
 static void deserialize(char** buffer, Table<T>* table) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     deserialize(buffer, &table->lanes);
     u32 length;
     deserialize(buffer, &length);
@@ -398,7 +399,7 @@ static void deserialize(char** buffer, Table<T>* table) {
 
 template <typename T>
 static void deserialize(char** buffer, Table<T>** table) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     DESERIALIZE_HANDLE_NULL(buffer, table);
     u32 lanes;
     deserialize(buffer, &lanes);
@@ -416,6 +417,7 @@ static void deserialize(char** buffer, Table<T>** table) {
     _table->length = length;
     *table = _table;
 }
+#endif // ULE_CONFIG_OPTION_SERIALIZATION
 
 //================================================================================ 
 // Fixed-key size table.
@@ -436,7 +438,7 @@ static void deserialize(char** buffer, Table<T>** table) {
 //#include <mmintrin.h>
 template <size_t KEY_SIZE, typename std::enable_if<KEY_SIZE == 64>::type* = nullptr>
 static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
 
     // AVX512:
     //__mmask32 result = _mm512_cmpeq_epi16_mask (*((__m512i*)m1), *((__m512i*)m2));
@@ -465,7 +467,7 @@ static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
 }
 template <size_t KEY_SIZE, typename std::enable_if<KEY_SIZE == 32>::type* = nullptr>
 static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     //sse4.2:
     //int result = 0;
     //for (u32 i = 0; i < 4; i++) {
@@ -490,7 +492,7 @@ static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
 }
 template <size_t KEY_SIZE, typename std::enable_if<KEY_SIZE == 16>::type* = nullptr>
 static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     // MMX: (this one is barely nanoseconds (~1-10ns) faster than String::memeq)
     //__m64 result = _mm_cmpeq_pi32(*((__m64*)m1), *((__m64*)m2));
     //return ((u64)result) == ~0ULL;
@@ -499,7 +501,7 @@ static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
 }
 template <size_t KEY_SIZE, typename std::enable_if<KEY_SIZE != 64 && KEY_SIZE != 32 && KEY_SIZE != 16>::type* = nullptr>
 static inline bool fixedKeySizeMemEq(u8* m1, u8* m2) {
-    TYPES_H_FTAG;
+    ULE_TYPES_H_FTAG;
     return String::memeq(m1, m2, KEY_SIZE);
 }
 
@@ -518,18 +520,18 @@ struct FixedKeySizeTable {
     FixedKeySizeTableEntry<KEY_SIZE, V>** entries;
 
     FixedKeySizeTable<KEY_SIZE, V>(u32 _lanes = 16) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->lanes = _lanes;
         this->length = 0;
         this->entries = (FixedKeySizeTableEntry<KEY_SIZE, V>**) pCalloc(sizeof(FixedKeySizeTableEntry<KEY_SIZE, V>*), this->lanes);
     }
     void* operator new(size_t size) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         return (FixedKeySizeTable<KEY_SIZE, V>*) pMalloc(sizeof(FixedKeySizeTable<KEY_SIZE, V>));
     }
 
     V insert(const char* key, V value) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         FixedKeySizeTableEntry<KEY_SIZE, V>* entry = this->lookup(key);
 
         if (!entry) { // no entry with that key exists
@@ -554,7 +556,7 @@ struct FixedKeySizeTable {
     }
 
     FixedKeySizeTableEntry<KEY_SIZE, V>* lookup(const char* key) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         FixedKeySizeTableEntry<KEY_SIZE, V>* entry = this->entries[hash(key, KEY_SIZE, lanes)];
 
         for (; entry != null; entry = entry->next) {
@@ -567,7 +569,7 @@ struct FixedKeySizeTable {
     }
 
     V lookupWithDefault(const char* key, V defaultValue) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         auto entry = this->lookup(key);
 
         if (entry == null) return defaultValue;
@@ -578,7 +580,7 @@ struct FixedKeySizeTable {
-    // do not set |freeValues| to true unless the template parameter 'T' is a pointer,
+    // do not set |freeValues| to true unless the template parameter 'V' is a pointer,
     // and the table is responsible for freeing the memory.
     void clear(bool freeValues = false) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (u32 i = 0; i < this->lanes; i++) {
             FixedKeySizeTableEntry<KEY_SIZE, V>** lane = &this->entries[i];
             FixedKeySizeTableEntry<KEY_SIZE, V>* entry = *lane;
@@ -617,7 +619,7 @@ struct FixedKeySizeTable {
     }
 
     void traverse(const std::function <void (FixedKeySizeTableEntry<KEY_SIZE, V>*)>& entryCallback) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         for (u32 i = 0; i < this->lanes; i++) {
             FixedKeySizeTableEntry<KEY_SIZE, V>* entry = this->entries[i];
 
@@ -643,14 +645,14 @@ struct CacheTable {
     CacheTableEntry* entries; // n and p are the dimensions of the array. n is first.
 
     CacheTable(u32 _n = 8, u32 _p = 8) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
         this->n       = _n;
         this->p       = _p;
         this->entries = (CacheTableEntry*) pCalloc(this->n*this->p, sizeof(CacheTableEntry));
     }
 
     void* insert(const char* key, u32 keyLength, void* value) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
 
         CacheTableEntry* row = this->entries + hash(key, keyLength, this->n) * this->n;
         // We're going to insert in 'row'. We need some policy to decide which column to evict.
@@ -682,7 +684,7 @@ struct CacheTable {
     }
 
     CacheTableEntry* lookup(const char* key, u32 keyLength) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
 
         CacheTableEntry* row = this->entries + hash(key, keyLength, this->n) * this->n;
 
@@ -698,7 +700,7 @@ struct CacheTable {
     }
 
     void clear(bool freeValues = false) {
-        TYPES_H_FTAG;
+        ULE_TYPES_H_FTAG;
 
         for (u32 i = 0; i < this->n; i++) {
             CacheTableEntry* row = this->entries + i * this->n;
diff --git a/types.h b/types.h
index bfd9902..cbe0ec3 100644
--- a/types.h
+++ b/types.h
@@ -1,25 +1,17 @@
 
-#ifndef TYPES_H
-#define TYPES_H
+#ifndef ULE_TYPES_H
+#define ULE_TYPES_H
 
 #include <stddef.h> // size_t
 
 #define null 0
 
-// long term, it would be nice to not have to '#include' tracy here,
-// a client using the library should include it and use a define to instruct
-// the library what to put at the beginning of function calls for profiling needs, 
-// but i've had trouble implementing that.
-#ifndef TYPES_H_FTAG
-#include <Tracy.hpp>
-#define TYPES_H_FTAG ZoneScoped
+#ifndef ULE_TYPES_H_FTAG // tag placed at the top of function bodies; honored as-is if already defined before this point
+#ifdef ULE_CONFIG_OPTION_FTAG // otherwise take the tag from the ULE_CONFIG_OPTION_FTAG option, when defined
+#define ULE_TYPES_H_FTAG ULE_CONFIG_OPTION_FTAG
+#else // no tag configured
+#define ULE_TYPES_H_FTAG // expands to nothing, so `ULE_TYPES_H_FTAG;` becomes an empty statement
 #endif
-
-// bool is included by default for C++11
-#ifndef __cplusplus
-    typedef _Bool bool;
-    #define true 1
-    #define false 0
 #endif
 
 // The restrict declspec is used on functions that return unaliased pointers. This keyword is used for the C-Runtime Library implementation of malloc since it will never return a pointer value that is already in use in the current program (unless you are doing something illegal, such as using memory after it has been freed).
@@ -47,12 +39,12 @@
 typedef uint64_t u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
-typedef uint8_t u8;
+typedef uint8_t  u8;
 
 typedef int64_t s64;
 typedef int32_t s32;
 typedef int16_t s16;
-typedef int8_t s8;
+typedef int8_t  s8;
 
 //typedef size_t size_t;
 
@@ -62,9 +54,7 @@ typedef int8_t s8;
 //typedef long double extended;
 
 // if we're using the glm vector/matrix types, or other types, define them here
-#define _USING_GLM_TYPES__
-
-#ifdef _USING_GLM_TYPES__
+#ifdef ULE_CONFIG_OPTION_USE_GLM
 // force high precision for everything
 #define GLM_PRECISION_HIGHP_FLOAT
 #define GLM_PRECISION_HIGHP_DOUBLE
diff --git a/util.h b/util.h
index a2f4542..cc594bb 100644
--- a/util.h
+++ b/util.h
@@ -1,6 +1,6 @@
 
-#ifndef UTIL_H
-#define UTIL_H
+#ifndef ULE_UTIL_H
+#define ULE_UTIL_H
 
 
 #define STATIC_ARRAY_LENGTH(a) (sizeof(a)/sizeof(a[0]))