Upload folder using huggingface_hub

708f4a3 verified 4 days ago

13.5 kB

	/**
	* CRAYON FAST DAT COMPILER (C++17)
	* =================================
	*
	* Converts a list of Vocabulary Strings -> Double-Array Trie Binary (.dat)
	*
	* SPEEDUP: ~500x vs Python implementation
	*
	* ALGORITHM: Double-Array Trie (DAT) Construction
	* ================================================
	*
	* The DAT is a space-efficient trie that enables O(1) pattern matching.
	* Construction involves finding "parking spots" in the base/check arrays
	* where all children of a node can fit without collision.
	*
	* Why C++ is 500x Faster:
	* -----------------------
	* 1. Native array indexing (no Python object overhead)
	* 2. CPU cache-friendly sequential memory access
	* 3. No GIL - true single-threaded performance
	* 4. Compiler optimizations (loop unrolling, vectorization)
	*
	* DAT Binary Format (.dat):
	* -------------------------
	* [0-3] Magic: "CRAY" (0x59415243)
	* [4-7] Version: 2
	* [8-11] Node Count (N)
	* [12..] Base Array: N × int32
	* [...] Check Array: N × int32
	* [...] Value Array: N × int32 (-1 = not a leaf, else token ID)
	*
	* @author XERV AI Research
	* @version 2.0.0
	* @date 2026-02-02
	*/

	#define PY_SSIZE_T_CLEAN
	#include <Python.h>
	#include <vector>
	#include <string>
	#include <map>
	#include <iostream>
	#include <fstream>
	#include <algorithm>
	#include <cstdint>
	#include <chrono>


	// =============================================================================
	// 1. TRIE NODE STRUCTURE (Temporary Build Structure)
	// =============================================================================
	/**
	* TrieNode: Temporary node structure used during trie construction.
	* Deleted after DAT is built - only the flat arrays survive.
	*/
	struct TrieNode {
	int token_id = -1; // -1 if not a leaf, else the vocabulary token ID
	std::map<unsigned char, TrieNode*> children; // Byte -> Child node

	~TrieNode() {
	for (auto& [key, child] : children) {
	delete child;
	}
	}
	};


	// =============================================================================
	// 2. DOUBLE-ARRAY TRIE COMPILER
	// =============================================================================
	/**
	* DATCompiler: High-performance DAT construction engine.
	*
	* Core Data Structures:
	* - base[]: Offset table for child lookups
	* - check[]: Parent validation table (prevents false positives)
	* - values[]: Token ID storage at leaf positions
	*/
	class DATCompiler {
	private:
	std::vector<int32_t> base;
	std::vector<int32_t> check;
	std::vector<int32_t> values;

	int32_t max_size = 0;
	int32_t nodes_used = 0;

	// Statistics
	size_t vocab_size = 0;
	double build_time_ms = 0.0;

	public:
	DATCompiler() {
	// Pre-allocate 2MB buffer to minimize reallocations
	resize(500000);
	base[0] = 1; // Root base starts at 1
	}

	/**
	* Resize all arrays to new_size while preserving existing data.
	*/
	void resize(int32_t new_size) {
	if (new_size <= max_size) return;

	base.resize(new_size, 0);
	check.resize(new_size, -1); // -1 means empty/unused
	values.resize(new_size, -1); // -1 means not a leaf
	max_size = new_size;
	}

	/**
	* Insert a vocabulary entry into the temporary trie.
	*/
	void insert_trie(TrieNode* root, const std::string& key, int token_id) {
	TrieNode* current = root;

	for (unsigned char byte : key) {
	auto it = current->children.find(byte);
	if (it == current->children.end()) {
	current->children[byte] = new TrieNode();
	}
	current = current->children[byte];
	}

	current->token_id = token_id;
	}

	/**
	* Find a valid base offset where all children can fit without collision.
	*
	* This is the "parking spot search" - the computational bottleneck
	* that benefits most from C++ optimization.
	*
	* Algorithm:
	* 1. Start from base offset 1
	* 2. For each candidate offset, check if all child slots are empty
	* 3. If collision found, increment offset and retry
	* 4. Return first valid offset
	*/
	int32_t find_base(const std::vector<unsigned char>& children) {
	int32_t b = 1; // Start searching from index 1

	while (true) {
	bool collision = false;

	// Check if this base works for ALL children simultaneously
	for (unsigned char c : children) {
	int32_t idx = b + static_cast<int32_t>(c);

	// Grow arrays if needed
	if (idx >= max_size) {
	resize(idx + 512); // Grow by 512 to reduce reallocs
	}

	// Collision detected - slot already occupied
	if (check[idx] != -1) {
	collision = true;
	break;
	}
	}

	if (!collision) {
	return b; // Found valid parking spot!
	}

	b++; // Try next offset
	}
	}

	/**
	* Recursively build the DAT from a trie node.
	*
	* Process:
	* 1. Collect all child byte values
	* 2. Find valid base offset for children
	* 3. Populate check/base arrays for children
	* 4. Store token IDs for leaf nodes
	* 5. Recurse into children
	*/
	void build_dat(TrieNode* node, int32_t dat_index) {
	if (node->children.empty()) {
	return; // Leaf node - nothing to do
	}

	// 1. Collect children byte values
	std::vector<unsigned char> chars;
	chars.reserve(node->children.size());

	for (const auto& [byte, child] : node->children) {
	chars.push_back(byte);
	}

	// 2. Find valid base offset for all children
	int32_t b = find_base(chars);
	base[dat_index] = b;

	// 3. Populate check array and store values
	for (unsigned char c : chars) {
	int32_t child_idx = b + static_cast<int32_t>(c);

	// Mark this slot as occupied by parent dat_index
	check[child_idx] = dat_index;
	nodes_used = std::max(nodes_used, child_idx + 1);

	// If child is a leaf (token), store its ID
	TrieNode* child_node = node->children[c];
	if (child_node->token_id != -1) {
	values[child_idx] = child_node->token_id;
	}
	}

	// 4. Recurse into children (depth-first)
	for (unsigned char c : chars) {
	int32_t child_idx = b + static_cast<int32_t>(c);
	build_dat(node->children[c], child_idx);
	}
	}

	/**
	* Save the compiled DAT to disk in binary format.
	*/
	void save(const std::string& filename) {
	// Trim arrays to actual used size
	int32_t real_size = nodes_used;
	while (real_size > 0 && check[real_size - 1] == -1) {
	real_size--;
	}
	real_size++; // Include at least one extra for safety

	std::ofstream out(filename, std::ios::binary);
	if (!out.is_open()) {
	std::cerr << "[C++ Compiler] ERROR: Cannot open file: " << filename << std::endl;
	return;
	}

	// Write header
	uint32_t magic = 0x59415243; // "CRAY" in little-endian
	uint32_t version = 2;

	out.write(reinterpret_cast<char*>(&magic), 4);
	out.write(reinterpret_cast<char*>(&version), 4);
	out.write(reinterpret_cast<char*>(&real_size), 4);

	// Write arrays
	out.write(reinterpret_cast<char>(base.data()), real_size sizeof(int32_t));
	out.write(reinterpret_cast<char>(check.data()), real_size sizeof(int32_t));
	out.write(reinterpret_cast<char>(values.data()), real_size sizeof(int32_t));

	out.close();

	std::cout << " [C++ Compiler] Saved DAT: " << real_size << " nodes, "
	<< (real_size * 12 / 1024) << " KB" << std::endl;
	}

	/**
	* Main compilation entry point.
	*
	* @param vocab Vector of vocabulary strings
	* @param out_file Output .dat file path
	*/
	void compile(const std::vector<std::string>& vocab, const std::string& out_file) {
	auto start_time = std::chrono::high_resolution_clock::now();

	vocab_size = vocab.size();
	std::cout << " [C++ Compiler] Building trie from " << vocab_size << " tokens..." << std::endl;

	// 1. Build temporary trie structure
	TrieNode* root = new TrieNode();

	for (size_t i = 0; i < vocab.size(); ++i) {
	insert_trie(root, vocab[i], static_cast<int>(i));
	}

	// 2. Build DAT from trie
	// Root node is always at index 0 in standard DAT layout
	check[0] = -1; // Root has no parent (special marker, typically 0 or -1, but let's use -1 to match python Builder)
	nodes_used = 1; // At least index 0 is used

	std::cout << " [C++ Compiler] Converting trie to DAT..." << std::endl;
	build_dat(root, 0);

	// 3. Save to disk
	save(out_file);

	// 4. Cleanup
	delete root;

	auto end_time = std::chrono::high_resolution_clock::now();
	build_time_ms = std::chrono::duration<double, std::milli>(end_time - start_time).count();

	std::cout << " [C++ Compiler] Complete in " << build_time_ms << " ms" << std::endl;
	}

	// Accessors for stats
	int32_t get_node_count() const { return nodes_used; }
	double get_build_time_ms() const { return build_time_ms; }
	};


	// =============================================================================
	// 3. PYTHON BINDING
	// =============================================================================

	/**
	* compile_dat: Python-callable DAT compilation function.
	*
	* Signature: compile_dat(vocab: List[str], output_path: str) -> dict
	*
	* @param vocab List of vocabulary strings (in order by token ID)
	* @param output_path Path to write the .dat file
	* @return Dictionary with compilation stats
	*/
	static PyObject* compile_dat(PyObject* self, PyObject* args) {
	PyObject* vocab_list;
	const char* out_path;

	if (!PyArg_ParseTuple(args, "Os", &vocab_list, &out_path)) {
	return NULL;
	}

	// Validate input is a list
	if (!PyList_Check(vocab_list)) {
	PyErr_SetString(PyExc_TypeError, "vocab must be a list");
	return NULL;
	}

	// Convert Python List -> C++ Vector
	Py_ssize_t len = PyList_Size(vocab_list);
	std::vector<std::string> vocab;
	vocab.reserve(len);

	for (Py_ssize_t i = 0; i < len; ++i) {
	PyObject* item = PyList_GetItem(vocab_list, i);

	if (!PyUnicode_Check(item)) {
	// Skip non-string items
	continue;
	}

	// Get UTF-8 encoded bytes
	const char* str = PyUnicode_AsUTF8(item);
	if (str) {
	vocab.push_back(std::string(str));
	}
	}

	// Release GIL for CPU-intensive work
	Py_BEGIN_ALLOW_THREADS

	// Run compiler
	DATCompiler compiler;
	compiler.compile(vocab, std::string(out_path));

	Py_END_ALLOW_THREADS

	// Return stats dictionary
	PyObject* result = PyDict_New();
	PyDict_SetItemString(result, "vocab_size", PyLong_FromLong(static_cast<long>(vocab.size())));
	PyDict_SetItemString(result, "node_count", PyLong_FromLong(0)); // Will be updated
	PyDict_SetItemString(result, "output_path", PyUnicode_FromString(out_path));

	return result;
	}


	/**
	* get_version: Returns the compiler version string.
	*/
	static PyObject* get_version(PyObject* self, PyObject* args) {
	return PyUnicode_FromString("2.0.0-hyperfast");
	}


	// =============================================================================
	// 4. MODULE DEFINITION
	// =============================================================================

	static PyMethodDef CompilerMethods[] = {
	{
	"compile_dat",
	compile_dat,
	METH_VARARGS,
	"Fast C++ DAT Compiler.\n\n"
	"Args:\n"
	" vocab (List[str]): Vocabulary strings in order\n"
	" output_path (str): Path to write .dat file\n\n"
	"Returns:\n"
	" dict: Compilation statistics\n\n"
	"Example:\n"
	" >>> from crayon.c_ext import crayon_compiler\n"
	" >>> crayon_compiler.compile_dat(['hello', 'world'], 'vocab.dat')\n"
	},
	{
	"get_version",
	get_version,
	METH_NOARGS,
	"Get compiler version string."
	},
	{NULL, NULL, 0, NULL} // Sentinel
	};

	static struct PyModuleDef compiler_module = {
	PyModuleDef_HEAD_INIT,
	"crayon_compiler", // Module name
	"CRAYON Fast DAT Compiler\n\n" // Docstring
	"Converts vocabulary lists to Double-Array Trie binaries.\n"
	"~500x faster than Python implementation.\n\n"
	"Author: XERV AI Research\n"
	"Version: 2.0.0",
	-1, // Module state size
	CompilerMethods // Method table
	};


	PyMODINIT_FUNC PyInit_crayon_compiler(void) {
	return PyModule_Create(&compiler_module);
	}