Spaces:

TIGER-Lab
/

RationalRewards-Demo

Running on Zero

App Files Files Community

RationalRewards-Demo / app.py

JasperHaozhe

Update app.py

d2f0440 verified 3 days ago

raw

history blame contribute delete

37.1 kB

	import spaces
	import gradio as gr
	from diffusers import FluxKontextPipeline
	from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
	from transformers.image_utils import load_image
	from threading import Thread
	import torch
	import os
	from transformers import BitsAndBytesConfig

	from serve_constants import html_header, bibtext, learn_more_markdown, tos_markdown

	# Choose where each model runs. With more than one visible CUDA device the
	# VLM and the Flux generator get a GPU each; in every other case both names
	# point at the default "cuda" device.
	device_vlm, device_gen = (
	    ("cuda:0", "cuda:1")
	    if torch.cuda.is_available() and torch.cuda.device_count() > 1
	    else ("cuda", "cuda")
	)

	# Memory & quantization knobs for the VLM, plus the model identifiers.
	# VLM_MAX_MEMORY is a per-GPU cap (e.g. "14GiB" for GPU 0) that is only
	# populated when both models share the single "cuda" device.
	if device_vlm == "cuda":
	    VLM_MAX_MEMORY = {0: "14GiB"}
	else:
	    VLM_MAX_MEMORY = None
	# Load the VLM in 4-bit to save memory.
	VLM_QUANTIZATION_4BIT = False
	# Load the VLM in 8-bit to save memory (mutually exclusive with 4-bit).
	VLM_QUANTIZATION_8BIT = False

	MODEL_ID = "JasperHaozhe/RationalRewards-Both-Demo"
	# Alternative checkpoint: "black-forest-labs/FLUX.1-Kontext-dev"
	FLUX_MODEL_ID = "camenduru/flux1-kontext-dev_fp8_e4m3fn_diffusers"

	# Processor used to build chat prompts and model inputs for the VLM.
	processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

	# Build the bitsandbytes config requested by the quantization flags.
	# 4-bit is checked first, so it wins if both flags are set; with neither
	# flag set the model is loaded unquantized (config stays None).
	if VLM_QUANTIZATION_4BIT:
	    quantization_config = BitsAndBytesConfig(
	        load_in_4bit=True,
	        bnb_4bit_compute_dtype=torch.bfloat16,
	        bnb_4bit_use_double_quant=True,
	        bnb_4bit_quant_type="nf4",
	    )
	elif VLM_QUANTIZATION_8BIT:
	    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
	else:
	    quantization_config = None

	# Load the VLM once at import time.
	# Note: when device_map="auto" or quantization is in use, avoid manual
	# .to(device) calls on the mapped model.
	model_kwargs = dict(
	    trust_remote_code=True,
	    torch_dtype=torch.bfloat16,
	    quantization_config=quantization_config,
	)

	# Disabled: automatic placement driven by VLM_MAX_MEMORY / quantization.
	# if VLM_MAX_MEMORY or VLM_QUANTIZATION_4BIT or VLM_QUANTIZATION_8BIT:
	#     model_kwargs["device_map"] = "auto"
	#     if VLM_MAX_MEMORY:
	#         model_kwargs["max_memory"] = VLM_MAX_MEMORY

	model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, **model_kwargs)

	# Disabled: parking the model on CPU when no device_map/quantization is used.
	# if not (VLM_MAX_MEMORY or VLM_QUANTIZATION_4BIT or VLM_QUANTIZATION_8BIT):
	#     model.to("cpu").eval()
	# else:
	# Inference only: switch to eval mode.
	model.eval()

	# Load Flux Pipeline
	# The pipeline is created once at import time in bfloat16 and reused by
	# generate_image() for every request.
	flux_pipeline = FluxKontextPipeline.from_pretrained(
	    FLUX_MODEL_ID,
	    torch_dtype=torch.bfloat16
	)
	# Fix VAE precision for Flux to avoid artifacts (currently disabled)
	# flux_pipeline.vae.to(dtype=torch.float32)
	# flux_pipeline.enable_attention_slicing() # Enable attention slicing to save memory during inference
	# Assume we can load both models simultaneously (User request)
	# No CPU offloading logic here.
	# Place the whole pipeline on the generator device chosen above.
	flux_pipeline.to(device_gen)

	# Task names offered in the UI, in display order: each evaluation mode is
	# paired with "Image Editing" first and "T2I Generation" second, e.g.
	# "Prompt Tuning - Image Editing", "Prompt Tuning - T2I Generation", ...
	TASK_CHOICES = [
	    f"{mode} - {target}"
	    for mode in ("Prompt Tuning", "Pointwise", "Pairwise")
	    for target in ("Image Editing", "T2I Generation")
	]

	# ============================================================
	# Instruction Templates
	# ============================================================

	# Prompt template for pointwise (single result) image-editing evaluation.
	# `{prompt}` is filled in by create_instruction() via str.format, and the two
	# `<image>` markers (source image, edited image) are the positions where
	# model_inference() splices in the loaded images, in order.
	# NOTE(review): presumably this wording matches what the reward model was
	# trained on - confirm before editing any of the text.
	POINTWISE_EDITING_INSTRUCTION = """You are an expert image editing evaluator. Your task is to evaluate the quality of an edited image based on a source image and a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better image edits (if any).

	User Instruction: {prompt}
	You are provided with two images:
	1. Source Image <image>
	2. Edited Image <image>

	Your task is to evaluate the Edited Image against the Source Image and the User Instruction.
	To do this, you must first assess the image on four critical aspects, provide justifications and absolute scores in 1-4 scale.

	### Critical Aspects & Scoring Rubric
	1. Text Faithfulness (How accurately does the output follow the instruction?)
	- 4 (Full match): All key elements (objects, colors, actions) are represented exactly as described. No hallucinations or unrequested changes.
	- 3 (Minor mismatch): Most key elements are present, but minor details are missing, incorrect, or slightly inaccurate.
	- 2 (Some mismatch): Some key elements are missing, altered, or interpreted incorrectly.
	- 1 (Major deviations): Key elements are completely missing, altered, or contradicted. Instruction is ignored.

	2. Image Faithfulness (How well are the non-edited parts and key input elements preserved?)
	- 4 (Uses input fully): All relevant elements from the input (background, style, lighting, identity) are accurately preserved or transformed as instructed.
	- 3 (Minor mismatch): Most relevant elements are preserved, but a few aspects (e.g., background details, lighting consistency) are missing or incorrectly handled.
	- 2 (Partial mismatch): Some elements are carried over, but key aspects of the original image are lost or distorted.
	- 1 (Fails to use input): Key elements of the input image are ignored, misinterpreted, or destroyed.

	3. Physical and Visual Quality (Technical errors, composition, realism, and physics)
	- 4 (No noticeable flaws): The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
	- 3 (Minor flaws): Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
	- 2 (Some flaws): Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
	- 1 (Severe flaws): Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).

	4. Text Rendering (Only if the instruction involves generating text)
	- 4 (Full match): Text is correct, legible, and integrated well.
	- 3 (Mostly match): Minor misspellings or inconsistent capitalization.
	- 2 (Partial match): Major misspellings or distorted text.
	- 1 (Major deviations): Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).

	### Scoring Methodology (CRITICAL)
	During assessment for each aspect, recall the initial user request, source image and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
	1. Anchor: Have a global inspection based on the user request and the resulting generation. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided .
	2. Justify and Adjust: Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
	- Example: deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.

	Afterwards, try to construct a refined user request that helps the visual generation model to produce better image edits.
	Think of the weaknesses identified in the judgement, then map them to instruction details and apply specific fixes.
	Provide a final new user request that enrich the initial user request.

	Output your evaluation in the following format:
	# User Request Analysis
	[ understanding the user request, try to analyze or decompose the user request deeper. Think of what the request might imply or what needs to be inferred to successfully execute the request. ]
	# Detailed Judgement
	1. Text Faithfulness:
	## Justification: [ Analysis of the user request and the assessment of the resulting generation. How it comes to a final score. ]
	## Score: [ float score ]
	2. Image Faithfulness:
	## Justification: [ Similar to above. Analysis and assessment. ]
	## Score: [ float score ]
	3. Physical and Visual Quality:
	## Justification: [ Similar to above. Analysis and assessment. ]
	## Score: [ float score ]
	4. Text Rendering:
	## Justification: [ Similar to above. Analysis and assessment. ]
	## Score: [ float score or N/A ]
	# Summary: [ Summary of the evaluation ]

	# User Request Refinement:
	## Refinement Comments: [Specific suggestions for improving the user request]
	## Refined Request: [The improved, more specific user request for editing like a standard user instruction]"""

	# Prompt template for pointwise (single result) text-to-image evaluation.
	# `{prompt}` is filled in by create_instruction() via str.format, and the
	# single `<image>` marker is the position where model_inference() splices in
	# the generated image.
	# NOTE(review): presumably this wording matches what the reward model was
	# trained on - confirm before editing any of the text.
	POINTWISE_T2I_INSTRUCTION = """You are an expert image generation evaluator. Your task is to evaluate the quality of a generated image based on a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better image generation (if any).

	User Instruction: {prompt}
	Generated Image: <image>

	Your task is to evaluate the Generated Image against the User Instruction.
	To do this, you must first assess the image on three critical aspects, provide justifications and absolute scores in 1-4 scale.

	### Critical Aspects & Scoring Rubric
	1. Text Faithfulness (How accurately does the output follow the instruction?)
	- 4 (Full match): All key elements (objects, colors, actions) are represented exactly as described. No hallucinations or unrequested changes.
	- 3 (Minor mismatch): Most key elements are present, but minor details are missing, incorrect, or slightly inaccurate.
	- 2 (Some mismatch): Some key elements are missing, altered, or interpreted incorrectly.
	- 1 (Major deviations): Key elements are completely missing, altered, or contradicted. Instruction is ignored.

	2. Physical and Visual Quality (Technical errors, composition, realism, and physics)
	- 4 (No noticeable flaws): The image is physically plausible (correct lighting, shadows, geometry, human anatomy). No visible artifacts (seams, blurring, noise). And all elements work together cohesively.
	- 3 (Minor flaws): Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
	- 2 (Some flaws): Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
	- 1 (Severe flaws): Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).

	3. Text Rendering (Only if the instruction involves generating text)
	- 4 (Full match): Text is correct, legible, and integrated well.
	- 3 (Mostly match): Minor misspellings or inconsistent capitalization.
	- 2 (Partial match): Major misspellings or distorted text.
	- 1 (Major deviations): Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).

	### Scoring Methodology (CRITICAL)
	During assessment for each aspect, recall the initial user request, source image and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
	1. Anchor: Have a global inspection based on the user request and the resulting generation. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided .
	2. Justify and Adjust: Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
	- Example: deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.

	Afterwards, try to construct a refined user request that helps the visual generation model to produce better image edits.
	Think of the weaknesses identified in the judgement, then map them to instruction details and apply specific fixes.
	Provide a final new user request that enrich the initial user request.

	Output your evaluation in the following format:
	# User Request Analysis
	[ understanding the user request, try to analyze or decompose the user request deeper. Think of what the request might imply or what needs to be inferred to successfully execute the request. ]
	# Detailed Judgement
	1. Text Faithfulness:
	## Justification: [ Analysis of the user request and the assessment of the resulting generation. How it comes to a final score. ]
	## Score: [ float score ]
	2. Physical and Visual Quality:
	## Justification: [ Similar to above. Analysis and assessment. ]
	## Score: [ float score ]
	3. Text Rendering:
	## Justification: [ Similar to above. Analysis and assessment. ]
	## Score: [ float score or N/A ]
	# Summary: [ Summary of the evaluation ]

	# User Request Refinement:
	## Refinement Comments: [ Specific suggestions for improving generation quality ]
	## Refined Request: [ The improved, more specific user request ]"""

	# Prompt template for pairwise (A vs B) image-editing evaluation.
	# `{prompt}` is filled in by create_instruction() via str.format, and the
	# three `<image>` markers (source, edited A, edited B) are the positions where
	# model_inference() splices in the loaded images, in order.
	# NOTE(review): presumably this wording matches what the reward model was
	# trained on - confirm before editing any of the text.
	PAIRWISE_EDITING_INSTRUCTION = """You are an expert image editing evaluator. Your task is to evaluate the quality of an edited image based on a source image and a user instruction.

	User Instruction: {prompt}
	You are provided with three images:
	1. Source Image <image>
	2. Edited Image A <image>
	3. Edited Image B <image>

	Your task is to compare the two Edited Images according to the User Instruction and source image.
	To do this, you must compare the image on four critical aspects, provide absolute scores for each image and determine who wins.

	### Critical Aspects & Scoring Rubric
	1. Text Faithfulness (How accurately does the output follow the instruction?)
	- 4 (Full match): All key elements (objects, colors, actions) are represented exactly as described. No hallucinations or unrequested changes.
	- 3 (Minor mismatch): Most key elements are present, but minor details are missing, incorrect, or slightly inaccurate.
	- 2 (Some mismatch): Some key elements are missing, altered, or interpreted incorrectly.
	- 1 (Major deviations): Key elements are completely missing, altered, or contradicted. Instruction is ignored.

	2. Image Faithfulness (How well are the non-edited parts and key input elements preserved?)
	- 4 (Uses input fully): All relevant elements from the input (background, style, lighting, identity) are accurately preserved or transformed as instructed.
	- 3 (Minor mismatch): Most relevant elements are preserved, but a few aspects (e.g., background details, lighting consistency) are missing or incorrectly handled.
	- 2 (Partial mismatch): Some elements are carried over, but key aspects of the original image are lost or distorted.
	- 1 (Fails to use input): Key elements of the input image are ignored, misinterpreted, or destroyed.

	3. Physical and Visual Quality (Technical errors, composition, realism, and physics)
	- 4 (No noticeable flaws): The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
	- 3 (Minor flaws): Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
	- 2 (Some flaws): Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
	- 1 (Severe flaws): Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).

	4. Text Rendering (Only if the instruction involves generating text)
	- 4 (Full match): Text is correct, legible, and integrated well.
	- 3 (Mostly match): Minor misspellings or inconsistent capitalization.
	- 2 (Partial match): Major misspellings or distorted text.
	- 1 (Major deviations): Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).

	### Scoring Methodology (CRITICAL)
	During assessment for each aspect, recall the initial user request, source image and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
	1. Anchor: Have a global inspection. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided (you can also refer to the given human preference or rating).
	2. Justify and Adjust: Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
	- Example: deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.
	3. Compare: Ensure the difference between Score A and Score B reflects the correct preference.

	Output your evaluation in the following format:
	# User Request Analysis
	[ understanding the user request, and what needs to be considered during image editing ]
	# Detailed Judgement
	1. Text Faithfulness:
	## Justification: [ Comparative Analysis: Given the request, source image and the scoring rubrics, which image is better in this dimension? Provide concrete evidence and scoring logic. e.g., Image A is roughly [X] score level because [reason]. Deduct/Add points for [specific details] to reach final score. ]
	## Score A: [float score for Image A]
	## Score B: [float score for Image B]
	## Winner: [Image A or Image B or It's a tie]
	2. Image Faithfulness:
	## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic for image faithfulness. ]
	## Score A: [float score for Image A]
	## Score B: [float score for Image B]
	## Winner: [Image A or Image B or It's a tie]
	3. Physical and Visual Quality:
	## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since physical/visual quality is often not perfect, give 4.0 sparingly only when it is perfectly realistic. ]
	## Score A: [float score for Image A]
	## Score B: [float score for Image B]
	## Winner: [Image A or Image B or It's a tie]
	4. Text Rendering:
	## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since text rendering is often challenging, give 4.0 sparingly only if it is perfect. ]
	## Score A: [float score for Image A]
	## Score B: [float score for Image B]
	## Winner: [N/A or Image A or Image B or It's a tie]
	# Summary: [Summary of the evaluation]"""

	# Prompt template for pairwise (A vs B) text-to-image evaluation.
	# `{prompt}` is filled in by create_instruction() via str.format, and the two
	# `<image>` markers (image A, image B) are the positions where
	# model_inference() splices in the loaded images, in order.
	# NOTE(review): presumably this wording matches what the reward model was
	# trained on - confirm before editing any of the text.
	PAIRWISE_T2I_INSTRUCTION = """You are an expert image evaluator. Your task is to evaluate the quality of two generated images based on a user instruction.

	User Instruction: {prompt}
	You are provided with two images:
	1. Generated Image A <image>
	2. Generated Image B <image>

	Your task is to compare the two Generated Images according to the User Instruction.
	To do this, you must compare the image on three critical aspects, provide absolute scores for each image and determine who wins.

	### Critical Aspects & Scoring Rubric
	1. Text Faithfulness (How accurately does the output follow the instruction?)
	- 4 (Full match): All key elements (objects, colors, actions) are represented exactly as described. No hallucinations or unrequested changes.
	- 3 (Minor mismatch): Most key elements are present, but minor details are missing, incorrect, or slightly inaccurate.
	- 2 (Some mismatch): Some key elements are missing, altered, or interpreted incorrectly.
	- 1 (Major deviations): Key elements are completely missing, altered, or contradicted. Instruction is ignored.

	2. Physical and Visual Quality (Technical errors, composition, realism, and physics)
	- 4 (No noticeable flaws): The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
	- 3 (Minor flaws): Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
	- 2 (Some flaws): Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
	- 1 (Severe flaws): Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).

	3. Text Rendering (Only if the instruction involves generating text)
	- 4 (Full match): Text is correct, legible, and integrated well.
	- 3 (Mostly match): Minor misspellings or inconsistent capitalization.
	- 2 (Partial match): Major misspellings or distorted text.
	- 1 (Major deviations): Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).

	### Scoring Methodology (CRITICAL)
	For every aspect, you must first recap the initial user request and the scoring rubrics of the aspect, then follow this "Anchor and Adjust" process to compare and score the two images:
	1. Anchor: Determine the rough integer score level (1, 2, 3, or 4) based on the definitions provided.
	2. Adjust: Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
	- Example: deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.
	3. Compare: Ensure the difference between Score A and Score B reflects the magnitude of the preference. (e.g., A large gap implies one is significantly better; if one is only slightly better, the fine-grained scorings based on identified flaws help explain the preference).

	Output your evaluation in the following format:
	# User Request Analysis
	[ understanding the user request, try to analyze or decompose the user request deeper. Think of what the request might imply or what needs to be inferred to successfully execute the request. ]
	# Detailed Judgement
	1. Text Faithfulness:
	## Justification: [ Comparative Analysis: Given the request and the scoring rubrics, which image is better in this dimension? Provide concrete evidence and scoring logic. e.g., Image A is roughly [X] score level because [reason]. Deduct/Add points for [specific details] to reach final score. ]
	## Score A: [float score for Image A]
	## Score B: [float score for Image B]
	## Winner: [Image A or Image B or It's a tie]
	2. Physical and Visual Quality:
	## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since physical/visual quality is often not perfect, give 4.0 sparingly only when it is perfectly realistic. ]
	## Score A: [float score for Image A]
	## Score B: [float score for Image B]
	## Winner: [Image A or Image B or It's a tie]
	3. Text Rendering:
	## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since text rendering is often challenging, give 4.0 sparingly only if it is perfect. ]
	## Score A: [float score for Image A]
	## Score B: [float score for Image B]
	## Winner: [N/A or Image A or Image B or It's a tie]
	# Summary: [Summary of the evaluation]"""

	def create_instruction(prompt, task_type):
	    """Return the evaluation prompt for ``task_type`` with ``prompt`` filled in.

	    Only the four pointwise/pairwise task names are accepted; any other
	    value raises ``ValueError``.
	    """
	    supported_tasks = (
	        "Pointwise - Image Editing",
	        "Pointwise - T2I Generation",
	        "Pairwise - Image Editing",
	        "Pairwise - T2I Generation",
	    )
	    # Guard first so an unknown task fails before any template is touched.
	    if task_type not in supported_tasks:
	        raise ValueError(f"Unknown task type: {task_type}")

	    # Listed in the same order as supported_tasks, so one index picks the pair.
	    templates = (
	        POINTWISE_EDITING_INSTRUCTION,
	        POINTWISE_T2I_INSTRUCTION,
	        PAIRWISE_EDITING_INSTRUCTION,
	        PAIRWISE_T2I_INSTRUCTION,
	    )
	    chosen = templates[supported_tasks.index(task_type)]
	    return chosen.format(prompt=prompt)

	def update_ui_for_task(task_type):
	    """Return the UI updates that reconfigure the inputs for ``task_type``.

	    Args:
	        task_type: One of the six task names offered by the task selector.

	    Returns:
	        A 5-tuple of ``gr.update(...)`` values, in the order the task selector
	        is wired to its outputs: first image slot, second image slot, third
	        image slot, instruction textbox, generate button.

	    Raises:
	        ValueError: If ``task_type`` is not one of the known task names.
	    """
	    if task_type == "Prompt Tuning - Image Editing":
	        return (
	            gr.update(visible=True, label="Source Image"),
	            # The second slot only displays the generated result here, so it is
	            # cleared and made read-only.
	            gr.update(visible=True, label="Generated Image", interactive=False, value=None),
	            gr.update(visible=False, label="(unused)", value=None),
	            gr.update(label="Editing Instruction", placeholder="Enter the instruction for editing..."),
	            gr.update(visible=True),  # generate_btn visible
	        )
	    elif task_type == "Prompt Tuning - T2I Generation":
	        return (
	            gr.update(visible=True, label="Generated Image"),
	            gr.update(visible=False, label="(unused)", value=None),
	            gr.update(visible=False, label="(unused)", value=None),
	            gr.update(label="T2I Prompt", placeholder="Enter the text-to-image generation prompt…"),
	            gr.update(visible=False),  # generate_btn hidden (no T2I pipeline)
	        )
	    elif task_type == "Pointwise - Image Editing":
	        return (
	            gr.update(visible=True, label="Source Image"),
	            # The second slot is a user upload for this task. It must be
	            # re-enabled explicitly: an update that omits `interactive` keeps
	            # whatever read-only state the slot had before.
	            gr.update(visible=True, label="Edited Image", interactive=True),
	            gr.update(visible=False, label="Image B", value=None),
	            gr.update(label="Editing Instruction", placeholder="Describe the edit that was applied to the source image…"),
	            gr.update(visible=False),  # generate_btn hidden
	        )
	    elif task_type == "Pointwise - T2I Generation":
	        return (
	            gr.update(visible=True, label="Generated Image"),
	            gr.update(visible=False, label="(unused)", value=None),
	            gr.update(visible=False, label="(unused)", value=None),
	            gr.update(label="Text-to-Image Prompt", placeholder="Enter the text-to-image generation prompt…"),
	            gr.update(visible=False),  # generate_btn hidden
	        )
	    elif task_type == "Pairwise - Image Editing":
	        return (
	            gr.update(visible=True, label="Source Image"),
	            gr.update(visible=True, label="Image A", interactive=True),  # user upload
	            gr.update(visible=True, label="Image B"),
	            gr.update(label="Editing Instruction", placeholder="Describe the edit that was applied to the source image…"),
	            gr.update(visible=False),  # generate_btn hidden
	        )
	    elif task_type == "Pairwise - T2I Generation":
	        return (
	            gr.update(visible=True, label="Image A"),
	            gr.update(visible=True, label="Image B", interactive=True),  # user upload
	            gr.update(visible=False, label="(unused)", value=None),
	            gr.update(label="Text-to-Image Prompt", placeholder="Enter the text-to-image generation prompt…"),
	            gr.update(visible=False),  # generate_btn hidden
	        )
	    else:
	        raise ValueError(f"Unknown task type: {task_type}")

	import time
	import os

	@spaces.GPU(duration=120)
	def run_vlm_evaluation(messages, loaded_images):
	    """Run the VLM on the chat messages and stream the generated text.

	    Args:
	        messages: Chat-format message list passed to the processor's chat template.
	        loaded_images: Images handed to the processor alongside the prompt text.

	    Yields:
	        The accumulated output text after each newly decoded chunk.

	    Raises:
	        Exception: Whatever ``model.generate`` raised in the worker thread is
	            re-raised here once the stream has ended.
	    """
	    # Ensure the model is on the evaluation device before building inputs.
	    model.to(device_vlm)

	    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    inputs = processor(
	        text=[prompt],
	        images=loaded_images,
	        return_tensors="pt",
	        padding=True,
	    ).to(device_vlm)

	    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
	    # NOTE(review): temperature only has an effect when sampling is enabled;
	    # do_sample is not passed here, so this relies on the model's own
	    # generation config - confirm.
	    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048, temperature=0.3)

	    worker_errors = []

	    def _generate():
	        # Runs in the worker thread. Without this guard a failure inside
	        # generate() would never signal end-of-stream, and the consumer loop
	        # below (the streamer has no timeout) would block forever.
	        try:
	            model.generate(**generation_kwargs)
	        except Exception as exc:
	            worker_errors.append(exc)
	            streamer.end()

	    thread = Thread(target=_generate)
	    thread.start()

	    buffer = ""
	    for new_text in streamer:
	        buffer += new_text
	        yield buffer

	    # The stream has ended, so the worker is finishing; wait for it and then
	    # surface any generation failure to the caller.
	    thread.join()
	    if worker_errors:
	        raise worker_errors[0]

	@spaces.GPU(duration=60)
	def generate_image(task_type, instruction_text, image1, progress=gr.Progress()):
	    """Generate an edited image with the Flux (Kontext) pipeline and return its path.

	    Args:
	        task_type: Selected task; must be "Prompt Tuning - Image Editing".
	        instruction_text: Editing instruction passed to the pipeline as the prompt.
	        image1: Source image reference accepted by ``load_image``.
	        progress: Gradio progress tracker, updated after every inference step.

	    Returns:
	        Path of the PNG file written under ``generated_images/``.

	    Raises:
	        gr.Error: If the task is not prompt-tuning image editing, or the
	            source image or the instruction is missing.
	    """
	    import uuid

	    if task_type != "Prompt Tuning - Image Editing":
	        raise gr.Error("Generate is only available for Prompt Tuning – Image Editing.")
	    if not image1:
	        raise gr.Error("Please upload the Source Image first.")
	    if not instruction_text:
	        raise gr.Error("Please enter an editing instruction first.")

	    source_img = load_image(image1)
	    # Request an output with the same dimensions as the source image.
	    width, height = source_img.size

	    num_steps = 10
	    # Fixed seed: the same source image and instruction use the same noise.
	    generator = torch.Generator(device_gen).manual_seed(42)

	    def step_progress(pipe, step_index, timestep, callback_kwargs):
	        # Called by the pipeline after each step; must hand back callback_kwargs.
	        progress((step_index + 1) / num_steps, desc=f"Generating… Step {step_index + 1}/{num_steps}")
	        return callback_kwargs

	    progress(0, desc="Starting Flux generation…")
	    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
	        output = flux_pipeline(
	            prompt=instruction_text,
	            image=source_img,
	            guidance_scale=3.5,
	            num_inference_steps=num_steps,
	            width=width,
	            height=height,
	            generator=generator,
	            callback_on_step_end=step_progress,
	        )
	    generated_image = output.images[0]

	    # A whole-second timestamp alone is not unique: two requests finishing in
	    # the same second would overwrite each other's file. The random suffix
	    # keeps every result in its own file.
	    timestamp = int(time.time())
	    os.makedirs("generated_images", exist_ok=True)
	    generated_image_path = f"generated_images/flux_edit_{timestamp}_{uuid.uuid4().hex}.png"
	    generated_image.save(generated_image_path)

	    return generated_image_path

	@spaces.GPU(duration=300)
	def model_inference(task_type, instruction_text, image1, image2, image3, progress=gr.Progress()):
	    """Validate the inputs for the selected task and stream the VLM evaluation.

	    Args:
	        task_type: Task name chosen in the UI.
	        instruction_text: User instruction or prompt inserted into the template.
	        image1: First image slot (source image, or the only/first generated image).
	        image2: Second image slot (edited/generated image, or image A/B).
	        image3: Third image slot (image B for pairwise editing).
	        progress: Gradio progress tracker.

	    Yields:
	        Either a single "Error: ..." message when validation fails, or the
	        accumulated evaluation text as it is generated.
	    """
	    # The prompt-tuning tasks are evaluated with the matching pointwise template.
	    pointwise_editing = (
	        "Pointwise - Image Editing",
	        [image1, image2],
	        "Error: Please upload Source Image and Edited/Generated Image.",
	    )
	    pointwise_t2i = (
	        "Pointwise - T2I Generation",
	        [image1],
	        "Error: Please upload the Generated Image.",
	    )
	    # task -> (template task, images that must be present, message if one is missing)
	    requirements = {
	        "Pointwise - Image Editing": pointwise_editing,
	        "Prompt Tuning - Image Editing": pointwise_editing,
	        "Pointwise - T2I Generation": pointwise_t2i,
	        "Prompt Tuning - T2I Generation": pointwise_t2i,
	        "Pairwise - Image Editing": (
	            "Pairwise - Image Editing",
	            [image1, image2, image3],
	            "Error: Please upload Source Image, Image A, and Image B.",
	        ),
	        "Pairwise - T2I Generation": (
	            "Pairwise - T2I Generation",
	            [image1, image2],
	            "Error: Please upload both Image A and Image B.",
	        ),
	    }

	    requirement = requirements.get(task_type) if isinstance(task_type, str) else None
	    if requirement is None:
	        yield "Error: Unknown task type selected."
	        return

	    task_for_template, files, missing_message = requirement
	    if not all(files):
	        yield missing_message
	        return
	    loaded_images = [load_image(img) for img in files]

	    # Split the template on its own <image> markers BEFORE inserting the user
	    # text. Formatting first would let an "<image>" typed by the user be taken
	    # for a marker, shifting every image slot and dropping the literal text.
	    user_prompt = "" if instruction_text is None else str(instruction_text)
	    prompt_slot = "\x00user-prompt\x00"
	    while prompt_slot in user_prompt:
	        prompt_slot += "\x00"
	    template = create_instruction(prompt_slot, task_for_template)

	    # Interleave text parts and images: one image after each of the first
	    # len(loaded_images) text parts.
	    content = []
	    for i, part in enumerate(template.split("<image>")):
	        content.append({"type": "text", "text": part.replace(prompt_slot, user_prompt)})
	        if i < len(loaded_images):
	            content.append({"type": "image", "image": loaded_images[i]})

	    messages = [{"role": "user", "content": content}]

	    # Run VLM evaluation on GPU (streaming)
	    progress(0.9, desc="Evaluating...")
	    for text in run_vlm_evaluation(messages, loaded_images):
	        yield text

	# ============================================================
	# Gradio UI
	# ============================================================

	# Markdown shown at the top of the demo: a heading, a table describing the
	# six tasks, and a pointer to the examples. Table cells are separated by plain
	# `|`; a backslash before the pipe is not a valid Python escape and would end
	# up in the rendered text as a literal backslash.
	OVERVIEW_MD = """
	### 📋 Task Overview

	This demo supports six tasks. Select one to get started:

	| Task | Description |
	|------|-------------|
	| Prompt Tuning – Image Editing | Generate an edit using Flux (Kontext) from a source image and instruction, then evaluate it. Use the refinement to tune your prompt. |
	| Prompt Tuning – T2I Generation | Upload a generated image and a text-to-image prompt, then evaluate it. Use the refinement to iteratively improve your prompt. |
	| Pointwise – Image Editing | Rate a single edited image against its source image and the editing instruction. Produces per-aspect scores and a refined request. |
	| Pointwise – T2I Generation | Rate a single generated image against a text-to-image prompt. Produces per-aspect scores and a refined prompt. |
	| Pairwise – Image Editing | Compare two edited images (A vs B) given a source image and editing instruction. Determines which edit is better per aspect. |
	| Pairwise – T2I Generation | Compare two generated images (A vs B) given a text-to-image prompt. Determines which generation is better per aspect. |

	Try the examples on the right - they're basically begging to be clicked! 🎯
	"""

	# Build the Gradio page: header and overview on top, a two-column row (inputs
	# on the left, clickable examples on the right), the evaluation output below,
	# then the event wiring and the footer markdown.
	with gr.Blocks(css="""
	#input-panel { max-height: 85vh; overflow-y: auto; padding-right: 8px; }
	""") as demo:
	    gr.HTML(html_header)

	    # ---- Overview ----
	    gr.Markdown(OVERVIEW_MD)

	    with gr.Row(equal_height=True):
	        # ============ LEFT COLUMN – inputs ============
	        with gr.Column(scale=1, elem_id="input-panel"):
	            task_selector = gr.Radio(
	                choices=TASK_CHOICES,
	                value="Prompt Tuning - Image Editing",
	                label="Task Type",
	                info="Select the evaluation task",
	            )

	            # ---- Image upload row ----
	            with gr.Row():
	                with gr.Column(scale=1, min_width=160):
	                    image1 = gr.Image(
	                        label="Source Image",
	                        type="filepath",
	                        sources=["upload", "clipboard"],
	                    )
	                with gr.Column(scale=1, min_width=160):
	                    # Starts read-only: for the default task this slot shows
	                    # the generated result rather than an upload.
	                    image2 = gr.Image(
	                        label="Generated Image",
	                        type="filepath",
	                        sources=["upload", "clipboard"],
	                        interactive=False,
	                    )
	                with gr.Column(scale=1, min_width=160):
	                    # Hidden until a task that needs a third image is selected.
	                    image3 = gr.Image(
	                        label="Image B",
	                        type="filepath",
	                        sources=["upload", "clipboard"],
	                        visible=False,
	                    )

	            # ---- Instruction + Buttons ----
	            instruction = gr.Textbox(
	                label="Editing Instruction",
	                lines=3,
	                placeholder="Enter the instruction for editing...",
	            )
	            with gr.Row():
	                generate_btn = gr.Button("Generate Image", variant="secondary", visible=True)
	                submit_btn = gr.Button("Evaluate", variant="primary")

	        # ============ RIGHT COLUMN – examples ============
	        with gr.Column(scale=1):
	            # Each example row fills, in order: task, instruction, image1, image2, image3.
	            gr.Examples(
	                examples=[
	                    ["Prompt Tuning - Image Editing", "make it into comic style", "example_images/edit_source.png", None, None],
	                    ["Prompt Tuning - T2I Generation", "A black and white picture of a busy street, only red is in color, a red double decker bus drives down the road.", "example_images/t2i_a.png", None, None],
	                    ["Pointwise - Image Editing", "make it into comic style", "example_images/edit_source.png", "example_images/edit_a.png", None],
	                    ["Pointwise - T2I Generation", "A black and white picture of a busy street, only red is in color, a red double decker bus drives down the road.", "example_images/t2i_a.png", None, None],
	                    ["Pairwise - Image Editing", "make it into comic style", "example_images/edit_source.png", "example_images/edit_a.png", "example_images/edit_b.png"],
	                    ["Pairwise - T2I Generation", "A black and white picture of a busy street, only red is in color, a red double decker bus drives down the road.", "example_images/t2i_a.png", "example_images/t2i_b.png", None],
	                ],
	                inputs=[task_selector, instruction, image1, image2, image3],
	            )

	    # ---- Evaluation result (full width, below the input/examples row) ----
	    output = gr.Textbox(label="Evaluation Result", lines=20)

	    # ---- Wire task selector to update image visibility/labels, instruction label, and generate button ----
	    # The outputs list must stay in the same order as the tuple returned by update_ui_for_task.
	    task_selector.change(
	        fn=update_ui_for_task,
	        inputs=[task_selector],
	        outputs=[image1, image2, image3, instruction, generate_btn],
	    )

	    # ---- Wire generate button (Prompt Tuning – Image Editing only) ----
	    # The returned file path is shown in the second image slot.
	    generate_btn.click(
	        fn=generate_image,
	        inputs=[task_selector, instruction, image1],
	        outputs=[image2],
	    )

	    # ---- Wire evaluate button ----
	    # model_inference is a generator, so the textbox is updated as text streams in.
	    submit_btn.click(
	        fn=model_inference,
	        inputs=[task_selector, instruction, image1, image2, image3],
	        outputs=[output],
	    )

	    gr.Markdown(tos_markdown)
	    gr.Markdown(learn_more_markdown)
	    gr.Markdown(bibtext)

	# Start the app when the module is executed.
	demo.launch(debug=True)