import gc

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel


class GraphCodeBERT:
    """Embeds code snippets with a small CodeBERT-family model.

    The model is dynamically quantized to int8 so it fits in a
    memory-constrained environment (Render Free Tier, ~512MB RAM).
    """

    # Fallback embedding width if the model config lacks hidden_size.
    _DEFAULT_HIDDEN_SIZE = 768

    def __init__(self):
        # Using "Small" model to fit in Render Free Tier (512MB RAM)
        self.model_name = "huggingface/CodeBERTa-small-v1"
        print(f"Loading Analyzer Model: {self.model_name}...")

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model_fp32 = AutoModel.from_pretrained(self.model_name)

        # Record the embedding width so get_embedding can return a
        # correctly sized zero vector on failure (768 for this model).
        self.hidden_size = getattr(
            model_fp32.config, "hidden_size", self._DEFAULT_HIDDEN_SIZE
        )

        # QUANTIZATION MAGIC: Compress model to int8 (4x smaller RAM)
        print("📉 Quantizing model to reduce memory usage for Render...")
        # FIX: 'qnnpack' is required for ARM64/Mac/container compatibility,
        # but not every torch build ships it — setting an unsupported engine
        # raises RuntimeError, so only switch when it is actually available.
        if "qnnpack" in torch.backends.quantized.supported_engines:
            torch.backends.quantized.engine = "qnnpack"
        self.model = torch.quantization.quantize_dynamic(
            model_fp32, {torch.nn.Linear}, dtype=torch.qint8
        )

        # Drop the full-precision copy immediately to keep peak RSS low.
        del model_fp32
        gc.collect()

        self.model.eval()  # inference only: disables dropout etc.

    def get_embedding(self, code_snippet):
        """Convert a string of code into a dense vector (embedding).

        Returns a 1-D numpy array (mean-pooled last hidden state), or a
        zero vector of the same width if the input is invalid or
        inference fails.
        """
        if not code_snippet or not isinstance(code_snippet, str):
            return np.zeros(self.hidden_size)  # empty vector for bad input

        try:
            inputs = self.tokenizer(
                code_snippet,
                return_tensors="pt",
                truncation=True,
                max_length=512,
            )
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Mean pooling over the sequence dimension captures the
            # overall semantic meaning of the snippet.
            return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        except Exception as e:
            # Best-effort: log and return a neutral vector rather than crash.
            print(f"Error generating embedding: {e}")
            return np.zeros(self.hidden_size)


# --- EXPORTED FUNCTIONS (To fix the ImportError) ---

# 1. Global singleton so the model is loaded exactly once per process.
_bert_instance = GraphCodeBERT()


# 2. Expose the function so scorer.py can import it
def get_embedding(code_snippet):
    """Embed *code_snippet* using the shared model instance."""
    return _bert_instance.get_embedding(code_snippet)


# 3. Expose the similarity function
def compute_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a float.

    Returns 0.0 when either embedding is None or all-zero — a zero
    vector is exactly what get_embedding yields on failure, and cosine
    similarity is undefined (NaN) for it.
    """
    if embedding1 is None or embedding2 is None:
        return 0.0
    # Ensure they are 2-D numpy arrays as sklearn expects.
    e1 = np.array(embedding1).reshape(1, -1)
    e2 = np.array(embedding2).reshape(1, -1)
    # FIX: guard zero-norm inputs; sklearn would emit NaN plus warnings.
    if not np.any(e1) or not np.any(e2):
        return 0.0
    return float(cosine_similarity(e1, e2)[0][0])