|
|
|
|
|
|
| import json
|
| from pathlib import Path
|
|
|
|
|
|
|
# Training configuration for the MAP-NEO Mini model.
# Sized for a small-VRAM GPU: batch_size=1 with 32 gradient-accumulation
# steps gives an effective batch of 32 sequences per optimizer step.
TRAINING_CONFIG = {
    "model": {
        "vocab_size": 50257,   # GPT-2 tokenizer vocabulary size
        "max_seq_len": 2048,   # maximum sequence length the model supports
        "dim": 1024,           # embedding / hidden dimension
        "n_layers": 16,        # number of transformer layers
        "n_heads": 16,         # number of attention heads
        "hidden_dim": 2736,    # feed-forward inner dimension
        "dropout": 0.0         # dropout disabled
    },
    "training": {
        "batch_size": 1,                    # micro-batch per forward pass
        "gradient_accumulation_steps": 32,  # effective batch = 1 * 32
        "max_steps": 50000,
        "warmup_steps": 2000,               # LR warmup duration
        "learning_rate": 3e-4,
        "weight_decay": 0.01,
        "grad_clip": 1.0,                   # gradient clipping threshold
        "mixed_precision": "bf16",          # bfloat16 mixed precision
        "gradient_checkpointing": True      # trades recompute for lower VRAM
    },
    "data": {
        "seq_length": 1024,  # training sequence length (<= model max_seq_len)
        "data_path": "data/tokens/packed_1024.txt"
    },
    "hardware": {
        "device": "cuda",
        "compile_model": False  # model compilation disabled
    },
    "logging": {
        "log_interval": 10,     # steps between log lines
        "save_interval": 2000,  # steps between checkpoint saves
        "output_dir": "checkpoints"
    }
}
|
|
|
|
|
# Data-preparation configuration; num_docs and seq_length mirror the
# data_prep.py CLI flags used by the generated run_training.py script.
DATA_CONFIG = {
    "num_docs": 20000,        # number of documents to process
    "seq_length": 1024,       # packed sequence length (matches training config)
    "tokenizer": "gpt2",
    "output_dir": "data",
    "min_text_length": 50,    # presumably filters out docs shorter than this — confirm in data_prep.py
    "max_text_length": 10000  # presumably caps doc length — confirm in data_prep.py
}
|
|
|
|
|
def setup_project(directories=None):
    """Create the project directory structure.

    Args:
        directories: Optional iterable of directory paths to create.
            Defaults to the standard MAP-NEO Mini project layout, so
            calling with no arguments behaves exactly as before.
    """
    if directories is None:
        directories = [
            "data/shards",
            "data/processed",
            "data/tokens",
            "checkpoints",
            "configs",
            "logs",
            "notebooks",
        ]

    for dir_path in directories:
        # parents=True creates intermediate dirs; exist_ok=True makes reruns safe.
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {dir_path}")
|
|
|
|
|
def save_configs(training_config=None, data_config=None):
    """Save configuration dictionaries as JSON files under configs/.

    Args:
        training_config: Mapping written to configs/training_config.json;
            defaults to the module-level TRAINING_CONFIG.
        data_config: Mapping written to configs/data_config.json;
            defaults to the module-level DATA_CONFIG.
    """
    if training_config is None:
        training_config = TRAINING_CONFIG
    if data_config is None:
        data_config = DATA_CONFIG

    # Previously this crashed with FileNotFoundError when configs/ did not
    # exist (i.e. if setup_project() had not run first); create it here.
    config_dir = Path("configs")
    config_dir.mkdir(parents=True, exist_ok=True)

    with open(config_dir / "training_config.json", "w", encoding="utf-8") as f:
        json.dump(training_config, f, indent=2)

    with open(config_dir / "data_config.json", "w", encoding="utf-8") as f:
        json.dump(data_config, f, indent=2)

    print("Configuration files saved to configs/")
|
|
|
|
|
def create_requirements_txt():
    """Write requirements.txt with the project's minimum dependency versions."""
    requirements = [
        "torch>=2.0.0",
        "transformers>=4.35.0",
        "tokenizers>=0.14.0",
        "datasets>=2.14.0",
        "accelerate>=0.24.0",
        "sentencepiece>=0.1.99",
        "langdetect>=1.0.9",
        "zstandard>=0.21.0",
        "tqdm>=4.65.0",
        "numpy>=1.24.0",
        "matplotlib>=3.6.0",
        "tensorboard>=2.14.0",
    ]

    # Explicit encoding, and a trailing newline so the output is a
    # well-formed POSIX text file (the original omitted both).
    with open("requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(requirements) + "\n")

    print("Created requirements.txt")
|
|
|
|
|
def create_run_script():
    """Create a simple run script for training.

    Writes run_training.py: a standalone script that runs data_prep.py when
    the packed token file is missing, then launches train_neo.py.
    """
    # The generated script's source is held in a triple-quoted literal.
    # "\\n" inside it stays escaped here so it is written out as "\n" and
    # only interpreted when run_training.py itself executes.
    run_script = '''#!/usr/bin/env python3
# Run MAP-NEO Mini training pipeline

import subprocess
import sys
from pathlib import Path

def run_command(cmd, description):
    """Run a command and handle errors"""
    print(f"\\n{'='*50}")
    print(f"Running: {description}")
    print(f"Command: {cmd}")
    print(f"{'='*50}")

    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"Error in {description}:")
        print(result.stderr)
        sys.exit(1)
    else:
        print(f"Success: {description}")
        if result.stdout:
            print(result.stdout)

def main():
    print("MAP-NEO Mini Training Pipeline")
    print("Optimized for RTX 5070 8GB VRAM")

    # Step 1: Data preprocessing
    if not Path("data/tokens/packed_1024.txt").exists():
        print("\\nStep 1: Data preprocessing")
        run_command(
            "python data_prep.py --num_docs 20000 --seq_length 1024",
            "Data preprocessing"
        )
    else:
        print("\\nSkipping data preprocessing (data exists)")

    # Step 2: Model training
    print("\\nStep 2: Starting model training")
    run_command(
        "python train_neo.py",
        "Model training"
    )

    print("\\n" + "="*50)
    print("Training pipeline completed!")
    print("Check checkpoints/ directory for saved models")
    print("="*50)

if __name__ == "__main__":
    main()
'''

    with open("run_training.py", "w") as f:
        f.write(run_script)

    print("Created run_training.py script")
|
|
|
|
|
if __name__ == "__main__":
    print("Setting up MAP-NEO Mini project...")

    # Run each setup stage in order: directories, configs, deps, run script.
    for stage in (setup_project, save_configs, create_requirements_txt, create_run_script):
        stage()

    print("\nProject setup complete!")
    print("\nNext steps:")
    print("1. Run: python data_prep.py --num_docs 10000")
    print("2. Run: python train_neo.py")
    print("3. Or use: python run_training.py")