Update app.py
app.py CHANGED
--- a/app.py
+++ b/app.py
@@ -1,6 +1,6 @@
 # app.py
-# FINAL
-#
+# FINAL, DEFINITIVE VERSION
+# Corrects all model loading paths and relies on the stable requirements.txt

 import os
 import torch
@@ -8,43 +8,29 @@ import gradio as gr
 import numpy as np
 from PIL import Image
 import json
-from io import BytesIO

-# Hugging Face Hub utilities for downloading model files
 from huggingface_hub import hf_hub_download
-
-# Core libraries for the models
 from ultralytics import YOLO
 from transformers import AutoProcessor, AutoModelForCausalLM, set_seed
 from paddleocr import PaddleOCR
-
-# Computer vision utilities
 import supervision as sv
-
-# Suppress unnecessary warnings for a cleaner log
 import warnings
+
 warnings.filterwarnings("ignore")

 # --- Global Configuration ---
 REPO_ID = "microsoft/OmniParser-v2.0"
-
-
+# CORRECTED file paths as they exist in the Hugging Face repository
+DETECTION_MODEL_FILENAME = "icon_detect/model.pt"
+CAPTION_MODEL_SUBFOLDER = "icon_caption"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# This specific revision of the Florence-2 model code is known to work on CPU.
-FLORENCE_REVISION = "e149e62e3c88b64e42138e12b9a04683b5d1e26e"
-
 print(f"INFO: Using device: {DEVICE}")
 set_seed(42)

-#
-# PART 2: AUTOMATED MODEL LOADING FROM THE HUB
-# =====================================================================================
-
+# --- Model Loading ---
 def load_detection_model():
     print("INFO: Loading detection model...")
     try:
-        # Note: The original repo structure might use a local path 'weights/detection/model.pt'
-        # hf_hub_download ensures it's available.
         model_path = hf_hub_download(repo_id=REPO_ID, filename=DETECTION_MODEL_FILENAME)
         model = YOLO(model_path)
         print("INFO: Detection model loaded successfully.")
@@ -56,44 +42,24 @@ def load_detection_model():
 def load_caption_model():
     print("INFO: Loading captioning model...")
     try:
-        #
-        # We load the weights from the subfolder but pin the *remote code* to a specific stable revision.
+        # CORRECTED loading logic using repo_id and subfolder arguments
         model = AutoModelForCausalLM.from_pretrained(
-
+            REPO_ID,
+            subfolder=CAPTION_MODEL_SUBFOLDER,
             torch_dtype=torch.float32,
             trust_remote_code=True,
-
+            attn_implementation="eager"
         ).to(DEVICE)
-
         processor = AutoProcessor.from_pretrained(
-
-
-
+            REPO_ID,
+            subfolder=CAPTION_MODEL_SUBFOLDER,
+            trust_remote_code=True
         )
         print("INFO: Captioning model loaded successfully.")
         return model, processor
     except Exception as e:
-
-
-        try:
-            # Fallback to the main repo ID if the subfolder path fails
-            model = AutoModelForCausalLM.from_pretrained(
-                "microsoft/Florence-2-base-ft",
-                torch_dtype=torch.float32,
-                trust_remote_code=True,
-                revision=FLORENCE_REVISION
-            ).to(DEVICE)
-            processor = AutoProcessor.from_pretrained(
-                "microsoft/Florence-2-base-ft",
-                trust_remote_code=True,
-                revision=FLORENCE_REVISION
-            )
-            print("INFO: Captioning model loaded successfully via fallback.")
-            return model, processor
-        except Exception as fallback_e:
-            print(f"ERROR: Fallback also failed. Main error: {e}, Fallback error: {fallback_e}")
-            return None, None
-
+        print(f"ERROR: Failed to load captioning model: {e}")
+        return None, None

 def load_ocr_model():
     print("INFO: Loading OCR model...")
@@ -105,15 +71,11 @@ def load_ocr_model():
         print(f"ERROR: Failed to load OCR model: {e}")
         return None

-# --- Initialize Models at Application Startup ---
 detection_model = load_detection_model()
 caption_model, caption_processor = load_caption_model()
 ocr_model = load_ocr_model()

-#
-# PART 3: HELPER AND PREDICTION FUNCTIONS
-# =====================================================================================
-
+# --- Inference Pipeline ---
 def run_captioning(image, text, model, processor):
     prompt = f"<OD> <ref> {text} </ref>"
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(DEVICE)
@@ -129,14 +91,22 @@ def run_captioning(image, text, model, processor):
     final_caption_list = parsed_text.get('<OD>', {}).get('labels', [])
     return final_caption_list[0] if final_caption_list else "No description available"

-def predict_func(input_image: Image.Image):
+def is_box_contained(outer_box, inner_box):
+    return (outer_box[0] <= inner_box[0] and
+            outer_box[1] <= inner_box[1] and
+            outer_box[2] >= inner_box[2] and
+            outer_box[3] >= inner_box[3])
+
+def predict(input_image: Image.Image):
     if not all([detection_model, caption_model, ocr_model]):
-
-
+        error_messages = []
+        if not detection_model: error_messages.append("Detection model failed.")
+        if not caption_model: error_messages.append("Captioning model failed.")
+        if not ocr_model: error_messages.append("OCR model failed.")
+        return {"error": " ".join(error_messages) + " Check container logs for details."}

     image_np = np.array(input_image.convert("RGB"))

-    # OCR
     ocr_results = ocr_model.ocr(image_np, cls=True)[0]
     ocr_texts = []
     if ocr_results:
@@ -144,42 +114,48 @@ def predict_func(input_image: Image.Image):
             points, (text, conf) = line
             x_coords = [p[0] for p in points]
             y_coords = [p[1] for p in points]
-            ocr_texts.append({"box": [min(x_coords), min(y_coords), max(x_coords), max(y_coords)], "text": text})
+            ocr_texts.append({"box": [min(x_coords), min(y_coords), max(x_coords), max(y_coords)], "text": text, "conf": conf})

-    # Object Detection
     detection_results = detection_model(image_np, verbose=False)[0]
     detections = sv.Detections.from_ultralytics(detection_results)

-    # Process and combine
     parsed_elements = []
+    element_id_counter = 0
+
     for i in range(len(detections)):
         box = detections.xyxy[i].astype(int)
+        confidence = detections.confidence[i]
         class_name = detection_model.model.names[detections.class_id[i]]

         cropped_image = input_image.crop(tuple(box))
         caption = run_captioning(cropped_image, f"Describe this UI element.", caption_model, caption_processor)

-        contained_text = " ".join([o["text"] for o in ocr_texts if (box
+        contained_text = " ".join([o["text"] for o in ocr_texts if is_box_contained(box.tolist(), o["box"])])

         parsed_elements.append({
-            "box_2d": box.tolist(), "
-            "
+            "id": element_id_counter, "box_2d": box.tolist(), "type": class_name,
+            "text": contained_text.strip(), "description": caption, "confidence": float(confidence)
         })
+        element_id_counter += 1
+
+    for ocr in ocr_texts:
+        if not any(is_box_contained(el["box_2d"], ocr["box"]) for el in parsed_elements):
+            parsed_elements.append({
+                "id": element_id_counter, "box_2d": [int(p) for p in ocr["box"]], "type": "text_label",
+                "text": ocr["text"], "description": "A text label.", "confidence": float(ocr["conf"])
+            })
+            element_id_counter += 1

     return {"parsed_elements": parsed_elements}

-#
-
-#
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Microsoft OmniParser-v2 API Endpoint")
+# --- Gradio Interface ---
+with gr.Blocks(css="footer {display: none!important}") as demo:
+    gr.Markdown("# Microsoft OmniParser-v2 API Endpoint\nUpload a UI screenshot to get a parsed JSON output.")
     with gr.Row():
         image_input = gr.Image(type="pil", label="Input UI Screenshot")
         json_output = gr.JSON(label="Parsed UI Elements")

     submit_button = gr.Button("Parse UI", variant="primary")
-    submit_button.click(fn=
+    submit_button.click(fn=predict, inputs=[image_input], outputs=[json_output], api_name="predict")

-
-demo.launch()
+demo.launch()
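Because the updated click handler registers the endpoint under `api_name="predict"`, the Space can be queried programmatically once it is running. Below is a minimal sketch using the `gradio_client` package; the Space id and the screenshot path are placeholder assumptions, not values from this commit.

```python
# Minimal sketch of calling the Space's /predict endpoint with gradio_client.
# "YOUR-USERNAME/omniparser-space" and "screenshot.png" are hypothetical
# placeholders -- substitute the actual Space id and a real image file.
from gradio_client import Client, handle_file

client = Client("YOUR-USERNAME/omniparser-space")  # hypothetical Space id
result = client.predict(
    handle_file("screenshot.png"),  # local UI screenshot to parse
    api_name="/predict",            # matches api_name="predict" in app.py
)
print(result)  # -> {"parsed_elements": [{"id": 0, "box_2d": [...], ...}, ...]}
```

Each entry in `parsed_elements` carries an `id`, a `box_2d` in pixel coordinates, a `type` (a detector class name or `text_label`), the OCR `text` contained in the box, a Florence-2 `description`, and a `confidence` score.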