import torch
import gc
from transformers import TextStreamer, pipeline, BitsAndBytesConfig

# 1. Free whatever a previous (possibly OOM-ed) run left behind.
# Deletion is attempted directly: on a fresh session the name is not bound,
# `del` raises NameError, and there is simply nothing to release.
try:
    del pipe
except NameError:
    pass

try:
    del outputs
except NameError:
    pass

# Collect the now-unreferenced Python objects first, then ask PyTorch to
# hand back the CUDA memory it is still caching.
gc.collect()
torch.cuda.empty_cache()

# 2. Quantization settings for loading the custom architecture in 4-bit:
# NF4 quantization type, double quantization enabled, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "Xerv-AI/tarn"

# Build the multimodal pipeline. The quantization config and dtype are not
# top-level arguments here; they are handed over through `model_kwargs`.
# NOTE: trust_remote_code=True allows code shipped with the model repository
# to run locally -- keep it only for repositories you trust.
pipe = pipeline(
    task="image-text-to-text",
    model=model_id,
    model_kwargs={
        "quantization_config": bnb_config,
        "torch_dtype": torch.float16,
    },
    device_map="auto",
    trust_remote_code=True,
)

# A single user turn in chat format: one image (referenced by URL) followed
# by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
            },
            {
                "type": "text",
                "text": "Analyze the visual artifacts present in this image and define the principles of triboelectricity.",
            },
        ],
    },
]

# Streams decoded text using the pipeline's tokenizer; skip_prompt=True keeps
# the prompt itself out of the streamed output.
streamer = TextStreamer(tokenizer=pipe.tokenizer, skip_prompt=True)

# Banner goes out first so it precedes anything the streamer prints.
print("=== Initiating Real-Time Telemetry Stream (Quantized) ===")

# Run generation on the chat messages. The generation options request at most
# 512 new tokens and attach the streamer; the pipeline's return value is kept
# in `outputs`.
outputs = pipe(
    text=messages,
    generate_kwargs=dict(max_new_tokens=512, streamer=streamer),
)