"""Stream a 4-bit quantized image-text-to-text generation to stdout.

Flat, notebook-style script: it drops any ``pipe`` / ``outputs`` objects left
over from an earlier run, loads ``Xerv-AI/tarn`` through a ``transformers``
pipeline configured with a 4-bit bitsandbytes quantization config, and prints
the generated text as it is produced via a ``TextStreamer``.
"""

import gc

import torch
from transformers import BitsAndBytesConfig, TextStreamer, pipeline

# 1. Clear GPU memory to recover from a previous OOM.
# At module / notebook-cell scope ``locals()`` is the global namespace, so
# these checks find objects left behind by an earlier execution of the script.
if 'pipe' in locals():
    del pipe
if 'outputs' in locals():
    del outputs
# Collect first so the dropped objects are actually freed, then ask the CUDA
# caching allocator to hand its unused cached blocks back to the device.
gc.collect()
torch.cuda.empty_cache()

# 2. Configure 4-bit quantization: NF4 quant type, double quantization
# enabled, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model_id = "Xerv-AI/tarn"

# 3. Build the pipeline.
# NOTE(review): trust_remote_code=True executes Python code shipped in the
# model repository -- confirm the repo is trusted (ideally pin a revision).
pipe = pipeline(
    "image-text-to-text",
    model=model_id,
    device_map="auto",
    trust_remote_code=True,
    model_kwargs={
        "quantization_config": bnb_config,
        "torch_dtype": torch.float16,
    },
)

# skip_prompt=True: only newly generated text is echoed, not the prompt.
streamer = TextStreamer(pipe.tokenizer, skip_prompt=True)

# 4. A single user turn in chat format: one image (by URL) plus one text
# instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
            },
            {
                "type": "text",
                "text": "Analyze the visual artifacts present in this image and define the principles of triboelectricity.",
            },
        ],
    },
]

print("=== Initiating Real-Time Telemetry Stream (Quantized) ===")

# 5. Generate. The streamer is passed via generate_kwargs so text is printed
# while generation runs; the pipeline's return value is also kept in
# ``outputs``.
outputs = pipe(
    text=messages,
    generate_kwargs={
        "max_new_tokens": 512,
        "streamer": streamer,
    },
)