CRAYON-tokenizer / src /crayon /c_ext /compiler.cpp
Phase-Technologies's picture
Upload folder using huggingface_hub
708f4a3 verified
/**
* CRAYON FAST DAT COMPILER (C++17)
* =================================
*
* Converts a list of Vocabulary Strings -> Double-Array Trie Binary (.dat)
*
* SPEEDUP: ~500x vs Python implementation
*
* ALGORITHM: Double-Array Trie (DAT) Construction
* ================================================
*
* The DAT is a space-efficient trie that enables O(1) pattern matching.
* Construction involves finding "parking spots" in the base/check arrays
* where all children of a node can fit without collision.
*
* Why C++ is 500x Faster:
* -----------------------
* 1. Native array indexing (no Python object overhead)
* 2. CPU cache-friendly sequential memory access
* 3. No GIL - true single-threaded performance
* 4. Compiler optimizations (loop unrolling, vectorization)
*
* DAT Binary Format (.dat):
* -------------------------
* [0-3] Magic: "CRAY" (0x59415243)
* [4-7] Version: 2
* [8-11] Node Count (N)
* [12..] Base Array: N × int32
* [...] Check Array: N × int32
* [...] Value Array: N × int32 (-1 = not a leaf, else token ID)
*
* @author XERV AI Research
* @version 2.0.0
* @date 2026-02-02
*/
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <vector>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <cstdint>
#include <chrono>
// =============================================================================
// 1. TRIE NODE STRUCTURE (Temporary Build Structure)
// =============================================================================
/**
* TrieNode: Temporary node structure used during trie construction.
* Deleted after DAT is built - only the flat arrays survive.
*/
struct TrieNode {
int token_id = -1; // -1 if not a leaf, else the vocabulary token ID
std::map<unsigned char, TrieNode*> children; // Byte -> Child node
~TrieNode() {
for (auto& [key, child] : children) {
delete child;
}
}
};
// =============================================================================
// 2. DOUBLE-ARRAY TRIE COMPILER
// =============================================================================
/**
* DATCompiler: High-performance DAT construction engine.
*
* Core Data Structures:
* - base[]: Offset table for child lookups
* - check[]: Parent validation table (prevents false positives)
* - values[]: Token ID storage at leaf positions
*/
class DATCompiler {
private:
std::vector<int32_t> base;
std::vector<int32_t> check;
std::vector<int32_t> values;
int32_t max_size = 0;
int32_t nodes_used = 0;
// Statistics
size_t vocab_size = 0;
double build_time_ms = 0.0;
public:
DATCompiler() {
// Pre-allocate 2MB buffer to minimize reallocations
resize(500000);
base[0] = 1; // Root base starts at 1
}
/**
* Resize all arrays to new_size while preserving existing data.
*/
void resize(int32_t new_size) {
if (new_size <= max_size) return;
base.resize(new_size, 0);
check.resize(new_size, -1); // -1 means empty/unused
values.resize(new_size, -1); // -1 means not a leaf
max_size = new_size;
}
/**
* Insert a vocabulary entry into the temporary trie.
*/
void insert_trie(TrieNode* root, const std::string& key, int token_id) {
TrieNode* current = root;
for (unsigned char byte : key) {
auto it = current->children.find(byte);
if (it == current->children.end()) {
current->children[byte] = new TrieNode();
}
current = current->children[byte];
}
current->token_id = token_id;
}
/**
* Find a valid base offset where all children can fit without collision.
*
* This is the "parking spot search" - the computational bottleneck
* that benefits most from C++ optimization.
*
* Algorithm:
* 1. Start from base offset 1
* 2. For each candidate offset, check if all child slots are empty
* 3. If collision found, increment offset and retry
* 4. Return first valid offset
*/
int32_t find_base(const std::vector<unsigned char>& children) {
int32_t b = 1; // Start searching from index 1
while (true) {
bool collision = false;
// Check if this base works for ALL children simultaneously
for (unsigned char c : children) {
int32_t idx = b + static_cast<int32_t>(c);
// Grow arrays if needed
if (idx >= max_size) {
resize(idx + 512); // Grow by 512 to reduce reallocs
}
// Collision detected - slot already occupied
if (check[idx] != -1) {
collision = true;
break;
}
}
if (!collision) {
return b; // Found valid parking spot!
}
b++; // Try next offset
}
}
/**
* Recursively build the DAT from a trie node.
*
* Process:
* 1. Collect all child byte values
* 2. Find valid base offset for children
* 3. Populate check/base arrays for children
* 4. Store token IDs for leaf nodes
* 5. Recurse into children
*/
void build_dat(TrieNode* node, int32_t dat_index) {
if (node->children.empty()) {
return; // Leaf node - nothing to do
}
// 1. Collect children byte values
std::vector<unsigned char> chars;
chars.reserve(node->children.size());
for (const auto& [byte, child] : node->children) {
chars.push_back(byte);
}
// 2. Find valid base offset for all children
int32_t b = find_base(chars);
base[dat_index] = b;
// 3. Populate check array and store values
for (unsigned char c : chars) {
int32_t child_idx = b + static_cast<int32_t>(c);
// Mark this slot as occupied by parent dat_index
check[child_idx] = dat_index;
nodes_used = std::max(nodes_used, child_idx + 1);
// If child is a leaf (token), store its ID
TrieNode* child_node = node->children[c];
if (child_node->token_id != -1) {
values[child_idx] = child_node->token_id;
}
}
// 4. Recurse into children (depth-first)
for (unsigned char c : chars) {
int32_t child_idx = b + static_cast<int32_t>(c);
build_dat(node->children[c], child_idx);
}
}
/**
* Save the compiled DAT to disk in binary format.
*/
void save(const std::string& filename) {
// Trim arrays to actual used size
int32_t real_size = nodes_used;
while (real_size > 0 && check[real_size - 1] == -1) {
real_size--;
}
real_size++; // Include at least one extra for safety
std::ofstream out(filename, std::ios::binary);
if (!out.is_open()) {
std::cerr << "[C++ Compiler] ERROR: Cannot open file: " << filename << std::endl;
return;
}
// Write header
uint32_t magic = 0x59415243; // "CRAY" in little-endian
uint32_t version = 2;
out.write(reinterpret_cast<char*>(&magic), 4);
out.write(reinterpret_cast<char*>(&version), 4);
out.write(reinterpret_cast<char*>(&real_size), 4);
// Write arrays
out.write(reinterpret_cast<char*>(base.data()), real_size * sizeof(int32_t));
out.write(reinterpret_cast<char*>(check.data()), real_size * sizeof(int32_t));
out.write(reinterpret_cast<char*>(values.data()), real_size * sizeof(int32_t));
out.close();
std::cout << " [C++ Compiler] Saved DAT: " << real_size << " nodes, "
<< (real_size * 12 / 1024) << " KB" << std::endl;
}
/**
* Main compilation entry point.
*
* @param vocab Vector of vocabulary strings
* @param out_file Output .dat file path
*/
void compile(const std::vector<std::string>& vocab, const std::string& out_file) {
auto start_time = std::chrono::high_resolution_clock::now();
vocab_size = vocab.size();
std::cout << " [C++ Compiler] Building trie from " << vocab_size << " tokens..." << std::endl;
// 1. Build temporary trie structure
TrieNode* root = new TrieNode();
for (size_t i = 0; i < vocab.size(); ++i) {
insert_trie(root, vocab[i], static_cast<int>(i));
}
// 2. Build DAT from trie
// Root node is always at index 0 in standard DAT layout
check[0] = -1; // Root has no parent (special marker, typically 0 or -1, but let's use -1 to match python Builder)
nodes_used = 1; // At least index 0 is used
std::cout << " [C++ Compiler] Converting trie to DAT..." << std::endl;
build_dat(root, 0);
// 3. Save to disk
save(out_file);
// 4. Cleanup
delete root;
auto end_time = std::chrono::high_resolution_clock::now();
build_time_ms = std::chrono::duration<double, std::milli>(end_time - start_time).count();
std::cout << " [C++ Compiler] Complete in " << build_time_ms << " ms" << std::endl;
}
// Accessors for stats
int32_t get_node_count() const { return nodes_used; }
double get_build_time_ms() const { return build_time_ms; }
};
// =============================================================================
// 3. PYTHON BINDING
// =============================================================================
/**
* compile_dat: Python-callable DAT compilation function.
*
* Signature: compile_dat(vocab: List[str], output_path: str) -> dict
*
* @param vocab List of vocabulary strings (in order by token ID)
* @param output_path Path to write the .dat file
* @return Dictionary with compilation stats
*/
static PyObject* compile_dat(PyObject* self, PyObject* args) {
PyObject* vocab_list;
const char* out_path;
if (!PyArg_ParseTuple(args, "Os", &vocab_list, &out_path)) {
return NULL;
}
// Validate input is a list
if (!PyList_Check(vocab_list)) {
PyErr_SetString(PyExc_TypeError, "vocab must be a list");
return NULL;
}
// Convert Python List -> C++ Vector
Py_ssize_t len = PyList_Size(vocab_list);
std::vector<std::string> vocab;
vocab.reserve(len);
for (Py_ssize_t i = 0; i < len; ++i) {
PyObject* item = PyList_GetItem(vocab_list, i);
if (!PyUnicode_Check(item)) {
// Skip non-string items
continue;
}
// Get UTF-8 encoded bytes
const char* str = PyUnicode_AsUTF8(item);
if (str) {
vocab.push_back(std::string(str));
}
}
// Release GIL for CPU-intensive work
Py_BEGIN_ALLOW_THREADS
// Run compiler
DATCompiler compiler;
compiler.compile(vocab, std::string(out_path));
Py_END_ALLOW_THREADS
// Return stats dictionary
PyObject* result = PyDict_New();
PyDict_SetItemString(result, "vocab_size", PyLong_FromLong(static_cast<long>(vocab.size())));
PyDict_SetItemString(result, "node_count", PyLong_FromLong(0)); // Will be updated
PyDict_SetItemString(result, "output_path", PyUnicode_FromString(out_path));
return result;
}
/**
* get_version: Returns the compiler version string.
*/
static PyObject* get_version(PyObject* self, PyObject* args) {
return PyUnicode_FromString("2.0.0-hyperfast");
}
// =============================================================================
// 4. MODULE DEFINITION
// =============================================================================
static PyMethodDef CompilerMethods[] = {
{
"compile_dat",
compile_dat,
METH_VARARGS,
"Fast C++ DAT Compiler.\n\n"
"Args:\n"
" vocab (List[str]): Vocabulary strings in order\n"
" output_path (str): Path to write .dat file\n\n"
"Returns:\n"
" dict: Compilation statistics\n\n"
"Example:\n"
" >>> from crayon.c_ext import crayon_compiler\n"
" >>> crayon_compiler.compile_dat(['hello', 'world'], 'vocab.dat')\n"
},
{
"get_version",
get_version,
METH_NOARGS,
"Get compiler version string."
},
{NULL, NULL, 0, NULL} // Sentinel
};
static struct PyModuleDef compiler_module = {
PyModuleDef_HEAD_INIT,
"crayon_compiler", // Module name
"CRAYON Fast DAT Compiler\n\n" // Docstring
"Converts vocabulary lists to Double-Array Trie binaries.\n"
"~500x faster than Python implementation.\n\n"
"Author: XERV AI Research\n"
"Version: 2.0.0",
-1, // Module state size
CompilerMethods // Method table
};
PyMODINIT_FUNC PyInit_crayon_compiler(void) {
return PyModule_Create(&compiler_module);
}