diff --git a/.gitattributes b/.gitattributes index 34832b75648fde4a03b5ee7b1db774ffe3495a9a..0d8380cabc5d2971fabdaadc7e1735ba49aac4e8 100644 --- a/.gitattributes +++ b/.gitattributes @@ -52,3 +52,173 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text 10samples/sample_0007/overlays/overlay_accepted.png filter=lfs diff=lfs merge=lfs -text 10samples/sample_0007/overlays/overlay_intended.png filter=lfs diff=lfs merge=lfs -text 10samples/sample_0007/overlays/overlay_measured.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_metal_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_metal_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_metal_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_city_buildings.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_twilight_sky.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_vehicle_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_city_buildings.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_twilight_sky.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_vehicle_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_city_buildings.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_drainage_grate.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_parked_car_left.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_parked_suv_right.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_pedestrian_walking.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_shop_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_storefront_sign.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_signs.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_traveling_dark_suv.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_twilight_sky.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_vehicle_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_white_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_yellow_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_black_sedan.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_waiting_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_black_sedan.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_waiting_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_black_sedan.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_waiting_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_street_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_street_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_delivery_truck.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_street_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_traffic_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_walker.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_concrete_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_concrete_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_uniformed_officer.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_bystander_in_suit.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_concrete_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_firefighter.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_traffic_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_uniformed_officer.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_dark_building_facade.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_double_solid_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_dark_building_facade.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_double_solid_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_white_panel_van.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_awning_building_corner.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_dark_building_facade.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_distant_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_double_solid_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_emergency_vehicle.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_green_street_sign.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_vertical_illuminated_sign.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_white_panel_van.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_brick_building_right.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_crosswalk_markings.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_iron_balcony.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_multi_story_building_left.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_overhead_wires.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_street_light_pole.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_iron_balcony.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_multi_story_building_left.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_overhead_wires.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_street_light_pole.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_brick_building_right.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_bunch_of_balloons.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_crosswalk_markings.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dark_car_1.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dark_car_2.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_iron_balcony.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_multi_story_building_left.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_overhead_wires.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_white_garbage_bag.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_white_sedan.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_woman_in_dark_dress.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_walking_away.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_street_lamp.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_walking_away.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_sign_holder.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_street_lamp.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_woman_waiting.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_young_man_waiting.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_businessman_waiting.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_crossing_right.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_standing.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_walking_away.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_walking_away_sidewalk.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_sign_holder.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_street_lamp.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_woman_waiting.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_young_man_waiting.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_metal_railing.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_overpass.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_metal_railing.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_overpass.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_street_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_metal_railing.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_overpass.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_pedestrian_in_suit.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_black_suv.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_man_in_grey_sweater.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_light_blue.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_striped_shirt.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_walking_away.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_black_suv.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_man_in_grey_sweater.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_man_in_pink_shirt.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_light_blue.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_striped_shirt.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_walking_away.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_with_backpack.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_black_suv.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_man_in_grey_sweater.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_man_in_pink_shirt.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_light_blue.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_light_jacket.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_red.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_striped_shirt.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_walking_away.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_with_backpack.png filter=lfs diff=lfs merge=lfs -text diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/README.md b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/README.md new file mode 100644 index 0000000000000000000000000000000000000000..874cae8d08f4d741479240ec6bd426b1e974d281 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/README.md @@ -0,0 +1,16 @@ +# samples_v8 + +Generated with `data_recipe_v8.md`: vocabulary-first planning, adaptive canvas selection, structured JSON compose prompts, no identity verification, no gate, SAM white-background reference postprocessing, and strict reference-completeness verification with regenerate-until-pass behavior. + +- chat model: `gcp/google/gemini-3.1-pro-preview` +- image model: `gcp/google/gemini-3-pro-image-preview` +- people references: `white_bg_full_body_front` +- non-person references: `white_bg_encyclopedia_photo` +- SAM postprocess: every generated reference is segmented with `sam_vit_b` and pasted onto pure `#ffffff` background +- reference verify max attempts per subject: `10` +- allowed canvases: `[{"aspect_ratio": "1:1", "size": [1024, 1024], "style": "photorealistic"}, {"aspect_ratio": "4:3", "size": [1152, 864], "style": "photorealistic"}, {"aspect_ratio": "3:4", "size": [864, 1152], "style": "photorealistic"}, {"aspect_ratio": "3:2", "size": [1248, 832], "style": "photorealistic"}, {"aspect_ratio": "2:3", "size": [832, 1248], "style": "photorealistic"}, {"aspect_ratio": "16:9", "size": [1280, 720], "style": "photorealistic"}, {"aspect_ratio": "9:16", "size": [720, 1280], "style": "photorealistic"}]` +- scenario mode: `driving` +- pools: `vocab_task_pool`, `plan_pool`, `scene_pool`, `detection_pool`, `reference_pool` +- bbox overlay: `bbox_overlay.png` draws every planned subject bbox; a sample is rejected and regenerated if any planned subject is still missing after VLM detection retries +- detection max attempts per subject: `3` +- launch args: `{"compose_workers": 3, "detect_max_attempts": 3, "detect_workers": 3, "emit_workers": 4, "idle_sleep": 1.0, "image_inflight": 32, "image_interval": 0.05, "image_max_retries": 8, "max_retries": 3, "no_topup": false, "plan_workers": 6, "ref_verify_max_attempts": 10, "reference_workers": 6, "requeue_in_progress": true, "seed": 1781927993, "status_interval": 30.0, "subject_detect_workers": 24, "target_samples": 10}` diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/dataset.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/dataset.json new file mode 100644 index 0000000000000000000000000000000000000000..bc3dd7073085c1e4642d80ddcbe8404775220451 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/dataset.json @@ -0,0 +1,3706 @@ +[ + { + "sample_id": "sample_000001", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", + "sub_caption": "pedestrian: A person wearing a dark coat and trousers, walking confidently.. Scene role: Walking along the curbside near the barrier.", + "measured_bbox": [ + 0.1528, + 0.301, + 0.2511, + 0.7071 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 330.0, + 42.0, + 698.0, + 1007.0 + ], + "mask_score": 3.413244, + "mask_area_ratio": 0.159381, + "elapsed_seconds": 33.2771 + } + }, + { + "name": "parked_dark_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", + "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the street near the curb in the background right.", + "measured_bbox": [ + 0.5163, + 0.3897, + 0.9968, + 0.9244 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_dark_car.png", + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_01.png", + "reference_verify": "references/reference_verify_parked_dark_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_01.png", + "output": "references/ref_parked_dark_car.png", + "mask": "references/sam_mask_parked_dark_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 315.0, + 976.0, + 694.0 + ], + "mask_score": 3.4345, + "mask_area_ratio": 0.180014, + "elapsed_seconds": 7.1991 + } + }, + { + "name": "metal_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", + "sub_caption": "metal barrier: A silver metal barricade placed along the street.. Scene role: Positioned along the curb to section off the pedestrian area from the road.", + "measured_bbox": [ + 0.3454, + 0.4302, + 0.5465, + 0.8402 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_barrier.png", + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "reference_verify": "references/reference_verify_metal_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "references/ref_metal_barrier.png", + "mask": "references/sam_mask_metal_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 2.0, + 107.0, + 1009.0, + 986.0 + ], + "mask_score": 1.555076, + "mask_area_ratio": 0.845579, + "elapsed_seconds": 7.2854 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000002", + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "canvas_size": [ + 1152, + 864 + ], + "canvas_aspect_ratio": "4:3", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 15, + "n_detected": 15, + "n_subjects": 15, + "subjects": [ + { + "name": "pedestrian_walking", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", + "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: walking along the sidewalk on the right side of the street", + "measured_bbox": [ + 0.7497, + 0.4757, + 0.7954, + 0.6192 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_walking_attempt_01.png", + "output": "references/ref_pedestrian_walking.png", + "mask": "references/sam_mask_pedestrian_walking.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 63.0, + 695.0, + 972.0 + ], + "mask_score": 3.459152, + "mask_area_ratio": 0.145545, + "elapsed_seconds": 8.3331 + } + }, + { + "name": "shop_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", + "sub_caption": "pedestrian: Person standing near a shop entrance on the right, partially obscured.. Scene role: standing on the sidewalk near the storefronts on the right", + "measured_bbox": [ + 0.9337, + 0.4752, + 0.9695, + 0.6107 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shop_pedestrian.png", + "raw_ref_image": "references/raw_ref_shop_pedestrian_attempt_02.png", + "reference_verify": "references/reference_verify_shop_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_shop_pedestrian_attempt_02.png", + "output": "references/ref_shop_pedestrian.png", + "mask": "references/sam_mask_shop_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 312.0, + 43.0, + 719.0, + 1020.0 + ], + "mask_score": 3.162079, + "mask_area_ratio": 0.167512, + "elapsed_seconds": 7.2283 + } + }, + { + "name": "city_buildings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", + "sub_caption": "building: Various city buildings of different heights forming the skyline and lining the street.. Scene role: framing the street and forming the background skyline", + "measured_bbox": [ + 0.0, + 0.0, + 1.0, + 0.6084 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_city_buildings.png", + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "reference_verify": "references/reference_verify_city_buildings.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "references/ref_city_buildings.png", + "mask": "references/sam_mask_city_buildings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 14.0, + 171.0, + 1009.0, + 883.0 + ], + "mask_score": 3.176814, + "mask_area_ratio": 0.327415, + "elapsed_seconds": 7.146 + } + }, + { + "name": "street_signs", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", + "sub_caption": "street signs: Various blank street signs attached to a pole on the right side of the street.. Scene role: mounted on a pole next to the right sidewalk", + "measured_bbox": [ + 0.641, + 0.165, + 0.744, + 0.408 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_signs.png", + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "reference_verify": "references/reference_verify_street_signs.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "references/ref_street_signs.png", + "mask": "references/sam_mask_street_signs.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 224.0, + 0.0, + 744.0, + 1023.0 + ], + "mask_score": 3.332549, + "mask_area_ratio": 0.190769, + "elapsed_seconds": 7.1886 + } + }, + { + "name": "storefront_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", + "sub_caption": "storefront sign: A dark hanging sign framework attached to a building on the right, devoid of readable text.. Scene role: hanging above the shop entrance on the right side of the road", + "measured_bbox": [ + 0.7854, + 0.1934, + 0.9082, + 0.2906 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_storefront_sign.png", + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "reference_verify": "references/reference_verify_storefront_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "references/ref_storefront_sign.png", + "mask": "references/sam_mask_storefront_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 46.0, + 0.0, + 1023.0, + 811.0 + ], + "mask_score": 3.296373, + "mask_area_ratio": 0.447847, + "elapsed_seconds": 7.3102 + } + }, + { + "name": "parked_suv_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", + "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: parked alongside the right curb", + "measured_bbox": [ + 0.5507, + 0.4879, + 0.6783, + 0.6234 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_suv_right.png", + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "reference_verify": "references/reference_verify_parked_suv_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "references/ref_parked_suv_right.png", + "mask": "references/sam_mask_parked_suv_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 156.0, + 150.0, + 868.0, + 812.0 + ], + "mask_score": 3.463227, + "mask_area_ratio": 0.291222, + "elapsed_seconds": 7.2583 + } + }, + { + "name": "parked_car_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", + "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: parked alongside the left curb", + "measured_bbox": [ + 0.0, + 0.5102, + 0.1259, + 0.5998 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_car_left.png", + "raw_ref_image": "references/raw_ref_parked_car_left_attempt_01.png", + "reference_verify": "references/reference_verify_parked_car_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_car_left_attempt_01.png", + "output": "references/ref_parked_car_left.png", + "mask": "references/sam_mask_parked_car_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 319.0, + 1023.0, + 695.0 + ], + "mask_score": 3.122119, + "mask_area_ratio": 0.19451, + "elapsed_seconds": 8.5738 + } + }, + { + "name": "traveling_dark_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", + "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible red taillights reflecting the twilight.. Scene role: driving in the adjacent lane", + "measured_bbox": [ + 0.2594, + 0.4853, + 0.417, + 0.6419 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traveling_dark_suv.png", + "raw_ref_image": "references/raw_ref_traveling_dark_suv_attempt_01.png", + "reference_verify": "references/reference_verify_traveling_dark_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_traveling_dark_suv_attempt_01.png", + "output": "references/ref_traveling_dark_suv.png", + "mask": "references/sam_mask_traveling_dark_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 119.0, + 198.0, + 910.0, + 810.0 + ], + "mask_score": 3.470329, + "mask_area_ratio": 0.300606, + "elapsed_seconds": 8.5072 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", + "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road from the right side.. Scene role: providing illumination from the right sidewalk", + "measured_bbox": [ + 0.5577, + 0.0219, + 0.6964, + 0.588 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 330.0, + 17.0, + 688.0, + 996.0 + ], + "mask_score": 3.395182, + "mask_area_ratio": 0.033435, + "elapsed_seconds": 7.0701 + } + }, + { + "name": "vehicle_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", + "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground.. Scene role: anchoring the bottom of the frame to establish a driver's perspective", + "measured_bbox": [ + 0.0, + 0.8881, + 1.0, + 1.0 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vehicle_dashboard.png", + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "reference_verify": "references/reference_verify_vehicle_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "references/ref_vehicle_dashboard.png", + "mask": "references/sam_mask_vehicle_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 223.0, + 1023.0, + 700.0 + ], + "mask_score": 2.938032, + "mask_area_ratio": 0.282133, + "elapsed_seconds": 7.1679 + } + }, + { + "name": "drainage_grate", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b4d0e72d-3b208072:object:16", + "source_name": "drainage grate", + "source_description": "A metal drainage grate on the edge of the road on the right. Source dataset: BDD100K. Scene context: A driving scene on a multi-lane highway with a dark red minivan in the left lane, under a partly cloudy sky.", + "sub_caption": "drainage grate: A metal drainage grate on the edge of the road on the right.. Scene role: embedded in the road surface near the right curb", + "measured_bbox": [ + 0.5682, + 0.6773, + 0.8089, + 0.73 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_drainage_grate.png", + "raw_ref_image": "references/raw_ref_drainage_grate_attempt_01.png", + "reference_verify": "references/reference_verify_drainage_grate.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_drainage_grate_attempt_01.png", + "output": "references/ref_drainage_grate.png", + "mask": "references/sam_mask_drainage_grate.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 250.0, + 1023.0, + 773.0 + ], + "mask_score": 3.366042, + "mask_area_ratio": 0.379179, + "elapsed_seconds": 8.3171 + } + }, + { + "name": "white_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", + "sub_caption": "white car: A white car visible further down the road in the right lane.. Scene role: driving ahead in the same lane", + "measured_bbox": [ + 0.4356, + 0.5036, + 0.4784, + 0.548 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_car.png", + "raw_ref_image": "references/raw_ref_white_car_attempt_01.png", + "reference_verify": "references/reference_verify_white_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_attempt_01.png", + "output": "references/ref_white_car.png", + "mask": "references/sam_mask_white_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 137.0, + 215.0, + 884.0, + 819.0 + ], + "mask_score": 3.442096, + "mask_area_ratio": 0.295652, + "elapsed_seconds": 7.1564 + } + }, + { + "name": "yellow_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", + "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: painted down the center of the road", + "measured_bbox": [ + 0.0, + 0.622, + 0.2642, + 0.7692 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_lines.png", + "raw_ref_image": "references/raw_ref_yellow_lines_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_yellow_lines_attempt_01.png", + "output": "references/ref_yellow_lines.png", + "mask": "references/sam_mask_yellow_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 3.166027, + "mask_area_ratio": 0.242679, + "elapsed_seconds": 7.0941 + } + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", + "sub_caption": "trees: Numerous trees with dense green foliage lining both sides of the road.. Scene role: growing along the sidewalks, adding greenery", + "measured_bbox": [ + 0.2664, + 0.0, + 0.7141, + 0.5127 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 35.0, + 55.0, + 1002.0, + 1000.0 + ], + "mask_score": 3.226043, + "mask_area_ratio": 0.439437, + "elapsed_seconds": 7.0986 + } + }, + { + "name": "twilight_sky", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", + "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: visible above the buildings and trees at the end of the road", + "measured_bbox": [ + 0.188, + 0.0, + 0.862, + 0.4846 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_twilight_sky.png", + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "reference_verify": "references/reference_verify_twilight_sky.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "references/ref_twilight_sky.png", + "mask": "references/sam_mask_twilight_sky.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 72.0, + 72.0, + 951.0, + 951.0 + ], + "mask_score": 3.471577, + "mask_area_ratio": 0.631801, + "elapsed_seconds": 7.5016 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000003", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "waiting_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", + "sub_caption": "shopper: A pedestrian wearing a dark top and dark pants, standing upright with a natural posture.. Scene role: Waiting at the curb near the crosswalk on the left side of the street.", + "measured_bbox": [ + 0.0928, + 0.1174, + 0.205, + 0.9401 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_waiting_pedestrian.png", + "raw_ref_image": "references/raw_ref_waiting_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_waiting_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_waiting_pedestrian_attempt_01.png", + "output": "references/ref_waiting_pedestrian.png", + "mask": "references/sam_mask_waiting_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 48.0, + 704.0, + 1015.0 + ], + "mask_score": 3.427649, + "mask_area_ratio": 0.155239, + "elapsed_seconds": 6.9951 + } + }, + { + "name": "black_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", + "sub_caption": "black sedan: A glossy black sedan with visible headlights and a detailed front grille.. Scene role: Approaching the crosswalk in the center traffic lane.", + "measured_bbox": [ + 0.3895, + 0.2431, + 0.591, + 0.5084 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_sedan.png", + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "reference_verify": "references/reference_verify_black_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "references/ref_black_sedan.png", + "mask": "references/sam_mask_black_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 221.0, + 1023.0, + 796.0 + ], + "mask_score": 3.446312, + "mask_area_ratio": 0.340465, + "elapsed_seconds": 7.2258 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", + "sub_caption": "silver car: A metallic silver car reflecting daylight.. Scene role: Driving in the right lane slightly ahead of the black sedan, approaching the intersection.", + "measured_bbox": [ + 0.6628, + 0.2419, + 0.9089, + 0.4999 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 220.0, + 1011.0, + 811.0 + ], + "mask_score": 3.077144, + "mask_area_ratio": 0.338042, + "elapsed_seconds": 7.0902 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000004", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "walker", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", + "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Standing at the edge of the sidewalk near the crosswalk, waiting to cross the street.", + "measured_bbox": [ + 0.7914, + 0.2893, + 0.834, + 0.4815 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_walker.png", + "raw_ref_image": "references/raw_ref_walker_attempt_01.png", + "reference_verify": "references/reference_verify_walker.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_walker_attempt_01.png", + "output": "references/ref_walker.png", + "mask": "references/sam_mask_walker.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 51.0, + 688.0, + 1005.0 + ], + "mask_score": 3.433924, + "mask_area_ratio": 0.16005, + "elapsed_seconds": 7.2846 + } + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", + "sub_caption": "traffic light: A set of traffic lights suspended over the intersection, showing a red light.. Scene role: Hanging high above the intersection in the upper-center of the frame.", + "measured_bbox": [ + 0.4425, + 0.023, + 0.467, + 0.1052 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 160.0, + 93.0, + 864.0, + 930.0 + ], + "mask_score": 3.437579, + "mask_area_ratio": 0.253583, + "elapsed_seconds": 7.0663 + } + }, + { + "name": "delivery_truck", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", + "sub_caption": "delivery truck: A large, plain white box delivery truck.. Scene role: Parked alongside the right curb in the background, past the intersection.", + "measured_bbox": [ + 0.576, + 0.1929, + 0.7135, + 0.4081 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_delivery_truck.png", + "raw_ref_image": "references/raw_ref_delivery_truck_attempt_01.png", + "reference_verify": "references/reference_verify_delivery_truck.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_delivery_truck_attempt_01.png", + "output": "references/ref_delivery_truck.png", + "mask": "references/sam_mask_delivery_truck.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 95.0, + 100.0, + 910.0, + 932.0 + ], + "mask_score": 3.445823, + "mask_area_ratio": 0.476913, + "elapsed_seconds": 7.1923 + } + }, + { + "name": "dark_parked_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", + "sub_caption": "dark parked car: A dark-colored sedan.. Scene role: Parked on the right side of the street near the sidewalk in the mid-ground.", + "measured_bbox": [ + 0.8414, + 0.3717, + 0.9967, + 0.7454 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_parked_car.png", + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "reference_verify": "references/reference_verify_dark_parked_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "references/ref_dark_parked_car.png", + "mask": "references/sam_mask_dark_parked_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 19.0, + 336.0, + 1003.0, + 700.0 + ], + "mask_score": 3.408233, + "mask_area_ratio": 0.181406, + "elapsed_seconds": 8.4178 + } + }, + { + "name": "street_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", + "sub_caption": "street lines: Double yellow lines separating traffic directions and crisp white painted lines indicating lanes and a crosswalk.. Scene role: Painted on the asphalt road surface, extending from the foreground toward the intersection.", + "measured_bbox": [ + 0.003, + 0.3541, + 0.915, + 0.8612 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lines.png", + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "reference_verify": "references/reference_verify_street_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "references/ref_street_lines.png", + "mask": "references/sam_mask_street_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 38.0, + 225.0, + 985.0, + 799.0 + ], + "mask_score": 3.287982, + "mask_area_ratio": 0.400985, + "elapsed_seconds": 7.2613 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000005", + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 6, + "n_detected": 6, + "n_subjects": 6, + "subjects": [ + { + "name": "firefighter", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_2/282555,e0af90003451118a.jpg:person:8", + "source_name": "firefighter", + "source_description": "Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away near the fire truck. Source dataset: CrowdHuman. Scene context: Emergency response personnel, including firefighters and ambulance crew, are gathered outside a large classical building with pillars and banners, accompanied by emergency vehicles.", + "sub_caption": "firefighter: Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away.. Scene role: Assisting with incident management, positioned near the stopped car and barrier.", + "measured_bbox": [ + 0.2626, + 0.3463, + 0.3289, + 0.6561 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_firefighter.png", + "raw_ref_image": "references/raw_ref_firefighter_attempt_01.png", + "reference_verify": "references/reference_verify_firefighter.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_firefighter_attempt_01.png", + "output": "references/ref_firefighter.png", + "mask": "references/sam_mask_firefighter.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 34.0, + 709.0, + 1009.0 + ], + "mask_score": 3.445343, + "mask_area_ratio": 0.178691, + "elapsed_seconds": 7.0362 + } + }, + { + "name": "uniformed_officer", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,f68c20007e0bf148.jpg:person:3", + "source_name": "uniformed officer", + "source_description": "wearing a khaki uniform and helmet, holding a baton, looking towards the left Source dataset: CrowdHuman. Scene context: A large crowd of people, including some in uniform with batons and helmets, stands in front of a red and yellow building.", + "sub_caption": "uniformed officer: Wearing a khaki uniform and helmet, holding a baton, looking towards the left.. Scene role: Directing surrounding traffic away from the stopped vehicle using a baton.", + "measured_bbox": [ + 0.0497, + 0.3566, + 0.1691, + 0.6118 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_uniformed_officer.png", + "raw_ref_image": "references/raw_ref_uniformed_officer_attempt_01.png", + "reference_verify": "references/reference_verify_uniformed_officer.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_uniformed_officer_attempt_01.png", + "output": "references/ref_uniformed_officer.png", + "mask": "references/sam_mask_uniformed_officer.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 24.0, + 689.0, + 1005.0 + ], + "mask_score": 3.475629, + "mask_area_ratio": 0.156165, + "elapsed_seconds": 7.0984 + } + }, + { + "name": "bystander_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", + "sub_caption": "crowd member: A person wearing a suit.. Scene role: Standing off to the right side of the road behind the barrier, acting as the driver or an involved pedestrian.", + "measured_bbox": [ + 0.7467, + 0.3318, + 0.8036, + 0.5111 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_bystander_in_suit.png", + "raw_ref_image": "references/raw_ref_bystander_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_bystander_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_bystander_in_suit_attempt_01.png", + "output": "references/ref_bystander_in_suit.png", + "mask": "references/sam_mask_bystander_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 59.0, + 677.0, + 996.0 + ], + "mask_score": 3.480669, + "mask_area_ratio": 0.144797, + "elapsed_seconds": 7.0242 + } + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", + "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Hanging overhead or mounted prominently on a pole at the intersection.", + "measured_bbox": [ + 0.5381, + 0.0316, + 0.5856, + 0.2076 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 272.0, + 15.0, + 750.0, + 1006.0 + ], + "mask_score": 3.448339, + "mask_area_ratio": 0.303974, + "elapsed_seconds": 8.3734 + } + }, + { + "name": "concrete_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c946c532-07177e0a:object:11", + "source_name": "concrete barrier", + "source_description": "A continuous low concrete wall acting as a barrier on the right side of the road. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a multi-lane highway during the day, with construction or industrial sites visible alongside.", + "sub_caption": "concrete barrier: A continuous low concrete wall acting as a barrier on the right side of the road.. Scene role: Lining the right side of the street, separating the pedestrian walkway or construction zone from the active traffic lane.", + "measured_bbox": [ + 0.6322, + 0.4972, + 0.9964, + 0.6985 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_concrete_barrier.png", + "raw_ref_image": "references/raw_ref_concrete_barrier_attempt_01.png", + "reference_verify": "references/reference_verify_concrete_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_concrete_barrier_attempt_01.png", + "output": "references/ref_concrete_barrier.png", + "mask": "references/sam_mask_concrete_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 53.0, + 219.0, + 970.0, + 811.0 + ], + "mask_score": 3.469119, + "mask_area_ratio": 0.3653, + "elapsed_seconds": 7.0274 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", + "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection with illuminated brake lights.. Scene role: Stopped in the active lane near the barrier, serving as the focal point of the traffic response.", + "measured_bbox": [ + 0.3396, + 0.3754, + 0.6399, + 0.6647 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 46.0, + 215.0, + 976.0, + 829.0 + ], + "mask_score": 3.457698, + "mask_area_ratio": 0.330622, + "elapsed_seconds": 7.0933 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000006", + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 8, + "n_detected": 8, + "n_subjects": 8, + "subjects": [ + { + "name": "distant_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", + "sub_caption": "pedestrian: A person walking across the street in the distant background.. Scene role: Crossing the crosswalk in the distance ahead of the approaching vehicles.", + "measured_bbox": [ + 0.3877, + 0.478, + 0.4204, + 0.5881 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_distant_pedestrian.png", + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_distant_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "references/ref_distant_pedestrian.png", + "mask": "references/sam_mask_distant_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 324.0, + 9.0, + 705.0, + 1015.0 + ], + "mask_score": 3.338419, + "mask_area_ratio": 0.174056, + "elapsed_seconds": 8.694 + } + }, + { + "name": "vertical_illuminated_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb1b7e42-9608265e:object:6", + "source_name": "street sign", + "source_description": "A vertical 'PARK' sign illuminated on the right side of the street, indicating a parking garage. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a city street with tall buildings on both sides, following a yellow taxi, with other cars parked and driving.", + "sub_caption": "street sign: A vertical illuminated neon sign with abstract shapes, glowing brightly.. Scene role: Mounted on the building facade on the right side of the street, adding ambient night lighting.", + "measured_bbox": [ + 0.7683, + 0.0355, + 0.8177, + 0.2837 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vertical_illuminated_sign.png", + "raw_ref_image": "references/raw_ref_vertical_illuminated_sign_attempt_01.png", + "reference_verify": "references/reference_verify_vertical_illuminated_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_vertical_illuminated_sign_attempt_01.png", + "output": "references/ref_vertical_illuminated_sign.png", + "mask": "references/sam_mask_vertical_illuminated_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 305.0, + 20.0, + 728.0, + 1002.0 + ], + "mask_score": 3.37343, + "mask_area_ratio": 0.273593, + "elapsed_seconds": 7.1332 + } + }, + { + "name": "emergency_vehicle", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", + "sub_caption": "vehicle: A dark-colored vehicle with bright blue emergency lights flashing.. Scene role: Parked on the left side of the street near the intersection.", + "measured_bbox": [ + 0.1031, + 0.4564, + 0.2827, + 0.6497 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_emergency_vehicle.png", + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "reference_verify": "references/reference_verify_emergency_vehicle.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "references/ref_emergency_vehicle.png", + "mask": "references/sam_mask_emergency_vehicle.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 8.0, + 237.0, + 1015.0, + 828.0 + ], + "mask_score": 3.468468, + "mask_area_ratio": 0.355034, + "elapsed_seconds": 7.0896 + } + }, + { + "name": "white_panel_van", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", + "sub_caption": "white van: A large white panel van with red taillights illuminated.. Scene role: Driving in the lane directly ahead of the camera perspective.", + "measured_bbox": [ + 0.4556, + 0.3288, + 0.5926, + 0.6597 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_panel_van.png", + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "reference_verify": "references/reference_verify_white_panel_van.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "references/ref_white_panel_van.png", + "mask": "references/sam_mask_white_panel_van.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 181.0, + 63.0, + 843.0, + 937.0 + ], + "mask_score": 2.636854, + "mask_area_ratio": 0.376409, + "elapsed_seconds": 7.1379 + } + }, + { + "name": "double_solid_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", + "sub_caption": "double solid white line: Two continuous white painted lines on the dark asphalt road surface.. Scene role: Separating the traffic lanes on the dark road, leading toward the intersection.", + "measured_bbox": [ + 0.1922, + 0.6133, + 0.4541, + 1.0 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_double_solid_line.png", + "raw_ref_image": "references/raw_ref_double_solid_line_attempt_01.png", + "reference_verify": "references/reference_verify_double_solid_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_line_attempt_01.png", + "output": "references/ref_double_solid_line.png", + "mask": "references/sam_mask_double_solid_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 186.0, + 1001.0, + 837.0 + ], + "mask_score": 3.460181, + "mask_area_ratio": 0.372935, + "elapsed_seconds": 8.3174 + } + }, + { + "name": "dark_building_facade", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c807cb19-7e09cb11:object:8", + "source_name": "building facade", + "source_description": "Dark outlines of buildings lining the street on both sides, with some lit windows. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane city street with traffic lights and vehicles ahead.", + "sub_caption": "building facade: Dark outlines of buildings with scattered, warm-toned lit windows.. Scene role: Forming the urban backdrop along the left side of the street.", + "measured_bbox": [ + 0.1397, + 0.0, + 0.366, + 0.5427 + ], + "detection_confidence": 0.8, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_building_facade.png", + "raw_ref_image": "references/raw_ref_dark_building_facade_attempt_01.png", + "reference_verify": "references/reference_verify_dark_building_facade.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_dark_building_facade_attempt_01.png", + "output": "references/ref_dark_building_facade.png", + "mask": "references/sam_mask_dark_building_facade.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 128.0, + 0.0, + 887.0, + 1000.0 + ], + "mask_score": 2.829968, + "mask_area_ratio": 0.624767, + "elapsed_seconds": 7.1675 + } + }, + { + "name": "awning_building_corner", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c06d23aa-cb9ae751:object:6", + "source_name": "building corner", + "source_description": "The corner of a building on the right side, with an orange or red awning and some lit signs. Source dataset: BDD100K. Scene context: Nighttime driving scene at an intersection with a stop sign and a large black SUV passing on the right.", + "sub_caption": "building corner: The corner of a building featuring an awning and brightly lit abstract signboards.. Scene role: Anchoring the right side of the intersection with a warm architectural glow.", + "measured_bbox": [ + 0.6102, + 0.3347, + 0.7867, + 0.5412 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_awning_building_corner.png", + "raw_ref_image": "references/raw_ref_awning_building_corner_attempt_01.png", + "reference_verify": "references/reference_verify_awning_building_corner.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_awning_building_corner_attempt_01.png", + "output": "references/ref_awning_building_corner.png", + "mask": "references/sam_mask_awning_building_corner.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 27.0, + 975.0, + 980.0 + ], + "mask_score": 3.458235, + "mask_area_ratio": 0.594922, + "elapsed_seconds": 7.3072 + } + }, + { + "name": "green_street_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", + "sub_caption": "street sign: A standard green street sign without any readable text.. Scene role: Hanging from a traffic light pole near the intersection.", + "measured_bbox": [ + 0.5754, + 0.1583, + 0.6522, + 0.1884 + ], + "detection_confidence": 100, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_green_street_sign.png", + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "reference_verify": "references/reference_verify_green_street_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "references/ref_green_street_sign.png", + "mask": "references/sam_mask_green_street_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 61.0, + 378.0, + 962.0, + 645.0 + ], + "mask_score": 3.379525, + "mask_area_ratio": 0.536634, + "elapsed_seconds": 7.1734 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000007", + "target_total": 14, + "target_people": 1, + "target_objects": 13, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 14, + "n_detected": 14, + "n_subjects": 14, + "subjects": [ + { + "name": "woman_in_dark_dress", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "source_name": "bridesmaid", + "source_description": "A woman with dark hair wearing a dark knee-length dress, walking along the path. Source dataset: CrowdHuman. Scene context: A bride in a white gown and her bridesmaids in dark dresses are walking along a paved path next to a building with stairs, surrounded by trees and a white fence in a sunlit outdoor setting.", + "sub_caption": "bridesmaid: A woman with dark hair wearing a dark knee-length dress.. Scene role: walking along the left sidewalk, approaching the crosswalk", + "measured_bbox": [ + 0.1595, + 0.5229, + 0.2058, + 0.7308 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_woman_in_dark_dress.png", + "raw_ref_image": "references/raw_ref_woman_in_dark_dress_attempt_01.png", + "reference_verify": "references/reference_verify_woman_in_dark_dress.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_woman_in_dark_dress_attempt_01.png", + "output": "references/ref_woman_in_dark_dress.png", + "mask": "references/sam_mask_woman_in_dark_dress.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 339.0, + 6.0, + 680.0, + 1019.0 + ], + "mask_score": 3.415794, + "mask_area_ratio": 0.169896, + "elapsed_seconds": 8.5595 + } + }, + { + "name": "dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "source_name": "dashboard", + "source_description": "The dashboard of the camera vehicle, visible at the bottom of the frame. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching a tunnel or underpass, with buildings on the left and a retaining wall on the right.", + "sub_caption": "dashboard: The dashboard of the camera vehicle, visible at the bottom of the frame.. Scene role: anchoring the bottom foreground of the frame to establish the interior car viewpoint", + "measured_bbox": [ + 0.0, + 0.7874, + 1.0, + 1.0 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dashboard.png", + "raw_ref_image": "references/raw_ref_dashboard_attempt_02.png", + "reference_verify": "references/reference_verify_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dashboard_attempt_02.png", + "output": "references/ref_dashboard.png", + "mask": "references/sam_mask_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 267.0, + 1023.0, + 746.0 + ], + "mask_score": 3.179413, + "mask_area_ratio": 0.226381, + "elapsed_seconds": 8.7463 + } + }, + { + "name": "overhead_wires", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "source_name": "overhead wires", + "source_description": "Power and communication lines stretching across the sky above the street. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential street lined with parked cars and houses.", + "sub_caption": "overhead wires: Power and communication lines stretching across the sky.. Scene role: strung overhead across the sky, connecting the buildings on either side", + "measured_bbox": [ + 0.0656, + 0.0, + 0.9992, + 0.338 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overhead_wires.png", + "raw_ref_image": "references/raw_ref_overhead_wires_attempt_01.png", + "reference_verify": "references/reference_verify_overhead_wires.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_overhead_wires_attempt_01.png", + "output": "references/ref_overhead_wires.png", + "mask": "references/sam_mask_overhead_wires.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 152.0, + 1023.0, + 791.0 + ], + "mask_score": 2.72423, + "mask_area_ratio": 0.290783, + "elapsed_seconds": 7.2274 + } + }, + { + "name": "bunch_of_balloons", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_68/273278,d37b500038386e31.jpg:object:0", + "source_name": "bunch of balloons", + "source_description": "A bunch of heart-shaped balloons, some pink and some red, tied to a wooden utility pole. Source dataset: CrowdHuman. Scene context: A group of people standing in a circle, holding hands on a street in a residential area.", + "sub_caption": "bunch of balloons: A bunch of heart-shaped balloons, some pink and some red.. Scene role: tied to a metal pole on the left sidewalk", + "measured_bbox": [ + 0.2318, + 0.3806, + 0.2869, + 0.4973 + ], + "detection_confidence": 100, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_bunch_of_balloons.png", + "raw_ref_image": "references/raw_ref_bunch_of_balloons_attempt_01.png", + "reference_verify": "references/reference_verify_bunch_of_balloons.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_bunch_of_balloons_attempt_01.png", + "output": "references/ref_bunch_of_balloons.png", + "mask": "references/sam_mask_bunch_of_balloons.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 213.0, + 104.0, + 830.0, + 1023.0 + ], + "mask_score": 3.440433, + "mask_area_ratio": 0.246776, + "elapsed_seconds": 7.1532 + } + }, + { + "name": "white_garbage_bag", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_52/283081,13fff00018862889.jpg:object:3", + "source_name": "white garbage bag", + "source_description": "large white plastic bag on the bottom right corner Source dataset: CrowdHuman. Scene context: A group of fifteen people posing for a photo in front of a colorful graffiti wall.", + "sub_caption": "white garbage bag: A large white plastic bag.. Scene role: placed on the curb near the crosswalk on the right side", + "measured_bbox": [ + 0.8062, + 0.6476, + 0.8807, + 0.7562 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_garbage_bag.png", + "raw_ref_image": "references/raw_ref_white_garbage_bag_attempt_01.png", + "reference_verify": "references/reference_verify_white_garbage_bag.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_garbage_bag_attempt_01.png", + "output": "references/ref_white_garbage_bag.png", + "mask": "references/sam_mask_white_garbage_bag.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 77.0, + 963.0, + 989.0 + ], + "mask_score": 3.477571, + "mask_area_ratio": 0.521497, + "elapsed_seconds": 7.4222 + } + }, + { + "name": "multi_story_building_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "source_name": "building on left", + "source_description": "Multi-story brick buildings with numerous windows and fire escapes on the left side. Source dataset: BDD100K. Scene context: A narrow city street lined with parked cars on both sides, with residential and commercial buildings featuring fire escapes and awnings, illuminated by sunlight filtering through mature trees.", + "sub_caption": "building on left: Multi-story brick buildings with numerous windows and fire escapes.. Scene role: forming the street facade along the left side of the frame", + "measured_bbox": [ + 0.1156, + 0.0, + 0.416, + 0.6004 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_multi_story_building_left.png", + "raw_ref_image": "references/raw_ref_multi_story_building_left_attempt_01.png", + "reference_verify": "references/reference_verify_multi_story_building_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_multi_story_building_left_attempt_01.png", + "output": "references/ref_multi_story_building_left.png", + "mask": "references/sam_mask_multi_story_building_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 21.0, + 18.0, + 1013.0, + 988.0 + ], + "mask_score": 2.993486, + "mask_area_ratio": 0.685524, + "elapsed_seconds": 7.4739 + } + }, + { + "name": "street_light_pole", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "source_name": "street light pole", + "source_description": "A tall, curved metal street light pole on the right side of the road, supporting the overhead sign. Source dataset: BDD100K. Scene context: A view from a car driving on a multi-lane road bordered by trees, with other vehicles and road signs visible.", + "sub_caption": "street light pole: A tall, curved metal street light pole.. Scene role: standing on the right sidewalk, leaning over the roadway", + "measured_bbox": [ + 0.548, + 0.0288, + 0.7884, + 0.7106 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light_pole.png", + "raw_ref_image": "references/raw_ref_street_light_pole_attempt_01.png", + "reference_verify": "references/reference_verify_street_light_pole.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_street_light_pole_attempt_01.png", + "output": "references/ref_street_light_pole.png", + "mask": "references/sam_mask_street_light_pole.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 204.0, + 10.0, + 816.0, + 1018.0 + ], + "mask_score": 3.426132, + "mask_area_ratio": 0.025422, + "elapsed_seconds": 7.2131 + } + }, + { + "name": "white_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c754ce77-a105a975:object:3", + "source_name": "white sedan", + "source_description": "A white passenger car partially visible in the right lane next to the gold SUV. Source dataset: BDD100K. Scene context: View from inside a car driving in city traffic on a sunny day with multiple vehicles and urban infrastructure visible.", + "sub_caption": "white sedan: A white passenger car.. Scene role: driving in the forward lane just past the crosswalk", + "measured_bbox": [ + 0.494, + 0.5481, + 0.6384, + 0.6346 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_sedan.png", + "raw_ref_image": "references/raw_ref_white_sedan_attempt_01.png", + "reference_verify": "references/reference_verify_white_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_sedan_attempt_01.png", + "output": "references/ref_white_sedan.png", + "mask": "references/sam_mask_white_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 10.0, + 331.0, + 1014.0, + 694.0 + ], + "mask_score": 2.789065, + "mask_area_ratio": 0.197716, + "elapsed_seconds": 7.2389 + } + }, + { + "name": "dark_car_1", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "source_name": "dark car", + "source_description": "A dark-colored sedan visible on the left side of the street, partially obscured by rain. Source dataset: BDD100K. Scene context: View from inside a vehicle through a heavily rain-covered windshield, looking at city traffic and buildings.", + "sub_caption": "dark car: A dark-colored sedan.. Scene role: driving in the opposing traffic lane to the left", + "measured_bbox": [ + 0.3126, + 0.5583, + 0.4593, + 0.6372 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_car_1.png", + "raw_ref_image": "references/raw_ref_dark_car_1_attempt_01.png", + "reference_verify": "references/reference_verify_dark_car_1.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_1_attempt_01.png", + "output": "references/ref_dark_car_1.png", + "mask": "references/sam_mask_dark_car_1.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 317.0, + 1007.0, + 664.0 + ], + "mask_score": 3.079951, + "mask_area_ratio": 0.171859, + "elapsed_seconds": 7.1713 + } + }, + { + "name": "dark_car_2", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "source_name": "dark car 2", + "source_description": "A dark-colored car parked on the right side of the street, ahead of the other dark car. Source dataset: BDD100K. Scene context: A dashcam view driving down a city street lined with parked cars on both sides and multi-story brick apartment buildings under a partly cloudy sky.", + "sub_caption": "dark car 2: A dark-colored car.. Scene role: parked alongside the curb on the right side of the street", + "measured_bbox": [ + 0.7955, + 0.5535, + 0.9254, + 0.6345 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_car_2.png", + "raw_ref_image": "references/raw_ref_dark_car_2_attempt_01.png", + "reference_verify": "references/reference_verify_dark_car_2.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_2_attempt_01.png", + "output": "references/ref_dark_car_2.png", + "mask": "references/sam_mask_dark_car_2.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 353.0, + 1023.0, + 727.0 + ], + "mask_score": 3.072596, + "mask_area_ratio": 0.191711, + "elapsed_seconds": 7.2503 + } + }, + { + "name": "brick_building_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c2186a76-5444a563:object:5", + "source_name": "brick building", + "source_description": "A tall, multi-story red brick building on the left side of the street, featuring arched windows and a storefront. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a city street lined with parked cars and multi-story brick buildings.", + "sub_caption": "brick building: A tall, multi-story red brick building featuring arched windows and a storefront.. Scene role: lining the street on the right side of the frame", + "measured_bbox": [ + 0.9184, + 0.0073, + 0.9492, + 0.6625 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_brick_building_right.png", + "raw_ref_image": "references/raw_ref_brick_building_right_attempt_01.png", + "reference_verify": "references/reference_verify_brick_building_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_brick_building_right_attempt_01.png", + "output": "references/ref_brick_building_right.png", + "mask": "references/sam_mask_brick_building_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 100.0, + 0.0, + 930.0, + 1023.0 + ], + "mask_score": 2.148493, + "mask_area_ratio": 0.586381, + "elapsed_seconds": 7.383 + } + }, + { + "name": "metal_pole", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c411687d-73471431:object:14", + "source_name": "pole", + "source_description": "A thin, straight metal pole standing upright on the sidewalk near the park area on the left. Source dataset: BDD100K. Scene context: A dashcam view looking down a slightly sloped residential city street with cars parked on both sides, trees bordering a park area to the left, and a tall building to the right, under a cloudy, overcast sky.", + "sub_caption": "pole: A thin, straight metal pole standing upright.. Scene role: standing on the left sidewalk, serving as a mounting point for the balloons", + "measured_bbox": [ + 0.215, + 0.375, + 0.23, + 0.734 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_pole.png", + "raw_ref_image": "references/raw_ref_metal_pole_attempt_01.png", + "reference_verify": "references/reference_verify_metal_pole.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_metal_pole_attempt_01.png", + "output": "references/ref_metal_pole.png", + "mask": "references/sam_mask_metal_pole.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 443.0, + 15.0, + 574.0, + 1015.0 + ], + "mask_score": 3.415272, + "mask_area_ratio": 0.028519, + "elapsed_seconds": 7.2129 + } + }, + { + "name": "crosswalk_markings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "source_name": "crosswalk markings", + "source_description": "White painted lines on the road surface indicating a pedestrian crosswalk. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with traffic lights and a crosswalk.", + "sub_caption": "crosswalk markings: White painted lines on the road surface indicating a pedestrian crosswalk.. Scene role: painted across the road directly ahead of the camera vehicle", + "measured_bbox": [ + 0.3161, + 0.6787, + 0.7102, + 0.7212 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_crosswalk_markings.png", + "raw_ref_image": "references/raw_ref_crosswalk_markings_attempt_01.png", + "reference_verify": "references/reference_verify_crosswalk_markings.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_crosswalk_markings_attempt_01.png", + "output": "references/ref_crosswalk_markings.png", + "mask": "references/sam_mask_crosswalk_markings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 104.0, + 1023.0, + 866.0 + ], + "mask_score": 3.308171, + "mask_area_ratio": 0.469022, + "elapsed_seconds": 7.2159 + } + }, + { + "name": "iron_balcony", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "source_name": "balcony", + "source_description": "A dark, wrought-iron balcony on a building. Source dataset: CrowdHuman. Scene context: People walk down a narrow, sunlit street lined with tall buildings.", + "sub_caption": "balcony: A dark, wrought-iron balcony.. Scene role: attached to the facade of the multi-story building on the left", + "measured_bbox": [ + 0.0089, + 0.0773, + 0.1849, + 0.3213 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_iron_balcony.png", + "raw_ref_image": "references/raw_ref_iron_balcony_attempt_01.png", + "reference_verify": "references/reference_verify_iron_balcony.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_iron_balcony_attempt_01.png", + "output": "references/ref_iron_balcony.png", + "mask": "references/sam_mask_iron_balcony.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 76.0, + 0.0, + 946.0, + 952.0 + ], + "mask_score": 3.194017, + "mask_area_ratio": 0.432758, + "elapsed_seconds": 7.1961 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000008", + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 10, + "n_detected": 10, + "n_subjects": 10, + "subjects": [ + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", + "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath.. Scene role: walking away from the camera on the crosswalk", + "measured_bbox": [ + 0.258, + 0.308, + 0.368, + 0.725 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png" + } + }, + { + "name": "woman_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", + "sub_caption": "shopper: A woman wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: standing on the right curb, looking towards the street traffic", + "measured_bbox": [ + 0.5514, + 0.2961, + 0.618, + 0.6443 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_woman_waiting.png", + "raw_ref_image": "references/raw_ref_woman_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_woman_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_woman_waiting.png", + "mask": "references/sam_mask_woman_waiting.png" + } + }, + { + "name": "pedestrian_standing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", + "sub_caption": "shopper: A person wearing a blue jacket and blue jeans.. Scene role: standing on the sidewalk near the intersection", + "measured_bbox": [ + 0.6289, + 0.2934, + 0.6784, + 0.6567 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_standing.png", + "raw_ref_image": "references/raw_ref_pedestrian_standing_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_standing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_standing.png", + "mask": "references/sam_mask_pedestrian_standing.png" + } + }, + { + "name": "sign_holder", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", + "sub_caption": "protester holding sign in back: A person holding up a large white sign.. Scene role: standing on the sidewalk holding a sign near the crosswalk", + "measured_bbox": [ + 0.4795, + 0.28, + 0.5703, + 0.5944 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_sign_holder.png", + "raw_ref_image": "references/raw_ref_sign_holder_attempt_01.png", + "reference_verify": "references/reference_verify_sign_holder.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_sign_holder.png", + "mask": "references/sam_mask_sign_holder.png" + } + }, + { + "name": "pedestrian_crossing_right", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", + "sub_caption": "pedestrian: A person in a white top and dark pants.. Scene role: walking across the street from left to right in the crosswalk", + "measured_bbox": [ + 0.2006, + 0.3034, + 0.2705, + 0.5763 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_crossing_right.png", + "raw_ref_image": "references/raw_ref_pedestrian_crossing_right_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_crossing_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_crossing_right.png", + "mask": "references/sam_mask_pedestrian_crossing_right.png" + } + }, + { + "name": "pedestrian_walking_away_sidewalk", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", + "sub_caption": "pedestrian: A person wearing a white top and dark pants.. Scene role: walking away from the camera on the distant sidewalk", + "measured_bbox": [ + 0.2006, + 0.3029, + 0.2706, + 0.5728 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away_sidewalk.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_sidewalk_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away_sidewalk.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_walking_away_sidewalk.png", + "mask": "references/sam_mask_pedestrian_walking_away_sidewalk.png" + } + }, + { + "name": "young_man_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", + "sub_caption": "young man: A young man wearing a dark blue hoodie.. Scene role: waiting on the corner for the pedestrian signal", + "measured_bbox": [ + 0.6911, + 0.2718, + 0.7496, + 0.6983 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_young_man_waiting.png", + "raw_ref_image": "references/raw_ref_young_man_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_young_man_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_young_man_waiting.png", + "mask": "references/sam_mask_young_man_waiting.png" + } + }, + { + "name": "businessman_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", + "sub_caption": "adult in dark suit: An adult wearing a dark suit and tie.. Scene role: standing near the crosswalk amongst the crowd on the curb", + "measured_bbox": [ + 0.7491, + 0.2604, + 0.8005, + 0.6281 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_businessman_waiting.png", + "raw_ref_image": "references/raw_ref_businessman_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_businessman_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_businessman_waiting.png", + "mask": "references/sam_mask_businessman_waiting.png" + } + }, + { + "name": "street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", + "sub_caption": "street lamp: A tall street lamp pole.. Scene role: standing on the right corner of the intersection, near the curb", + "measured_bbox": [ + 0.777, + 0.0, + 0.859, + 0.747 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lamp.png", + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "reference_verify": "references/reference_verify_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "references/ref_street_lamp.png", + "mask": "references/sam_mask_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 278.0, + 0.0, + 808.0, + 1023.0 + ], + "mask_score": 3.224807, + "mask_area_ratio": 0.054686, + "elapsed_seconds": 7.0333 + } + }, + { + "name": "dashboard_reflection", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", + "sub_caption": "vehicle dashboard reflection: A faint reflection of an interior dashboard and an object on the windshield.. Scene role: overlaid on the bottom portion of the view, establishing the perspective from inside a car", + "measured_bbox": [ + 0.0, + 0.616, + 0.608, + 0.818 + ], + "detection_confidence": 0.88, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dashboard_reflection.png", + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_02.png", + "reference_verify": "references/reference_verify_dashboard_reflection.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_02.png", + "output": "references/ref_dashboard_reflection.png", + "mask": "references/sam_mask_dashboard_reflection.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 239.0, + 997.0, + 784.0 + ], + "mask_score": 3.469732, + "mask_area_ratio": 0.384025, + "elapsed_seconds": 8.6424 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000009", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "pedestrian_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", + "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking on the sidewalk adjacent to the road, safely behind the metal railing", + "measured_bbox": [ + 0.7699, + 0.4157, + 0.8109, + 0.6322 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_suit.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "references/ref_pedestrian_in_suit.png", + "mask": "references/sam_mask_pedestrian_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 31.0, + 680.0, + 1016.0 + ], + "mask_score": 3.479882, + "mask_area_ratio": 0.150441, + "elapsed_seconds": 7.1831 + } + }, + { + "name": "yellow_lane_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", + "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: running along the asphalt road in the foreground and midground", + "measured_bbox": [ + 0.5116, + 0.511, + 0.7953, + 0.8681 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_lane_line.png", + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_lane_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "references/ref_yellow_lane_line.png", + "mask": "references/sam_mask_yellow_lane_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 163.0, + 1023.0, + 845.0 + ], + "mask_score": 3.12139, + "mask_area_ratio": 0.132687, + "elapsed_seconds": 8.5948 + } + }, + { + "name": "overpass", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", + "sub_caption": "overpass: A concrete bridge structure spanning across the highway ahead, casting a shadow over the road.. Scene role: spanning horizontally across the upper midground of the view", + "measured_bbox": [ + 0.0, + 0.1444, + 1.0, + 0.5433 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overpass.png", + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "reference_verify": "references/reference_verify_overpass.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "references/ref_overpass.png", + "mask": "references/sam_mask_overpass.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 71.0, + 1023.0, + 823.0 + ], + "mask_score": 2.496995, + "mask_area_ratio": 0.373877, + "elapsed_seconds": 7.1818 + } + }, + { + "name": "metal_railing", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_18/282555,101ffe000a8c8717f.jpg:object:11", + "source_name": "metal railing", + "source_description": "A metal railing visible on the far right edge of the scene. Source dataset: CrowdHuman. Scene context: A group of people, possibly a tour group, is walking outdoors on a paved area near a stone building, with one person holding a green flag and another speaking into a microphone.", + "sub_caption": "metal railing: A metal railing visible on the far right edge of the scene.. Scene role: acting as a safety barrier between the pedestrian sidewalk and the road on the right side", + "measured_bbox": [ + 0.512, + 0.5, + 0.999, + 0.913 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_railing.png", + "raw_ref_image": "references/raw_ref_metal_railing_attempt_01.png", + "reference_verify": "references/reference_verify_metal_railing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_metal_railing_attempt_01.png", + "output": "references/ref_metal_railing.png", + "mask": "references/sam_mask_metal_railing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 261.0, + 1017.0, + 934.0 + ], + "mask_score": 3.402974, + "mask_area_ratio": 0.273856, + "elapsed_seconds": 7.2968 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", + "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: tall lamp post positioned near the overpass, illuminating the surrounding area", + "measured_bbox": [ + 0.51, + 0.0165, + 0.6141, + 0.5516 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 176.0, + 12.0, + 847.0, + 1014.0 + ], + "mask_score": 3.41464, + "mask_area_ratio": 0.029787, + "elapsed_seconds": 7.0955 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000010", + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 9, + "n_detected": 9, + "n_subjects": 9, + "subjects": [ + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", + "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away down the sidewalk on the right", + "measured_bbox": [ + 0.8776, + 0.2931, + 0.9906, + 0.7623 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 40.0, + 683.0, + 1002.0 + ], + "mask_score": 3.42052, + "mask_area_ratio": 0.145487, + "elapsed_seconds": 7.0967 + } + }, + { + "name": "pedestrian_with_backpack", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", + "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: crossing the street in front of the SUV", + "measured_bbox": [ + 0.2259, + 0.2854, + 0.322, + 0.6604 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_with_backpack.png", + "raw_ref_image": "references/raw_ref_pedestrian_with_backpack_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_with_backpack.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_with_backpack_attempt_01.png", + "output": "references/ref_pedestrian_with_backpack.png", + "mask": "references/sam_mask_pedestrian_with_backpack.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 36.0, + 682.0, + 1012.0 + ], + "mask_score": 3.441997, + "mask_area_ratio": 0.151945, + "elapsed_seconds": 8.5941 + } + }, + { + "name": "pedestrian_in_red", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", + "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: standing at the street corner waiting to cross", + "measured_bbox": [ + 0.6576, + 0.3338, + 0.6899, + 0.5607 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_red.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_red_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_red.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_red_attempt_01.png", + "output": "references/ref_pedestrian_in_red.png", + "mask": "references/sam_mask_pedestrian_in_red.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 52.0, + 699.0, + 1007.0 + ], + "mask_score": 3.430953, + "mask_area_ratio": 0.159512, + "elapsed_seconds": 7.0834 + } + }, + { + "name": "pedestrian_in_striped_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", + "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: walking briskly across the crosswalk", + "measured_bbox": [ + 0.433, + 0.3315, + 0.5713, + 0.6823 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_striped_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_striped_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", + "output": "references/ref_pedestrian_in_striped_shirt.png", + "mask": "references/sam_mask_pedestrian_in_striped_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 23.0, + 676.0, + 1011.0 + ], + "mask_score": 3.472095, + "mask_area_ratio": 0.152217, + "elapsed_seconds": 7.27 + } + }, + { + "name": "man_in_pink_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", + "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: standing near the curb waiting for a light", + "measured_bbox": [ + 0.7524, + 0.2743, + 0.8106, + 0.6687 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_in_pink_shirt.png", + "raw_ref_image": "references/raw_ref_man_in_pink_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_man_in_pink_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_pink_shirt_attempt_01.png", + "output": "references/ref_man_in_pink_shirt.png", + "mask": "references/sam_mask_man_in_pink_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 24.0, + 708.0, + 1000.0 + ], + "mask_score": 3.415589, + "mask_area_ratio": 0.161095, + "elapsed_seconds": 7.2651 + } + }, + { + "name": "man_in_grey_sweater", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_8/284193,476300039ef5826.jpg:person:3", + "source_name": "man", + "source_description": "Man wearing a grey sweater. Source dataset: CrowdHuman. Scene context: People are walking through an airport terminal with prominent overhead signage.", + "sub_caption": "man: Man wearing a grey sweater.. Scene role: walking towards the camera on the crosswalk", + "measured_bbox": [ + 0.3541, + 0.2895, + 0.4483, + 0.7382 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_in_grey_sweater.png", + "raw_ref_image": "references/raw_ref_man_in_grey_sweater_attempt_01.png", + "reference_verify": "references/reference_verify_man_in_grey_sweater.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_grey_sweater_attempt_01.png", + "output": "references/ref_man_in_grey_sweater.png", + "mask": "references/sam_mask_man_in_grey_sweater.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 47.0, + 683.0, + 1003.0 + ], + "mask_score": 3.491882, + "mask_area_ratio": 0.143696, + "elapsed_seconds": 7.1775 + } + }, + { + "name": "pedestrian_in_light_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", + "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: walking on the sidewalk in the midground", + "measured_bbox": [ + 0.5797, + 0.3113, + 0.6425, + 0.5493 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_light_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_light_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", + "output": "references/ref_pedestrian_in_light_jacket.png", + "mask": "references/sam_mask_pedestrian_in_light_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 313.0, + 33.0, + 685.0, + 1017.0 + ], + "mask_score": 3.458198, + "mask_area_ratio": 0.174406, + "elapsed_seconds": 7.237 + } + }, + { + "name": "pedestrian_in_light_blue", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", + "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: crossing the street away from the camera's view", + "measured_bbox": [ + 0.0034, + 0.2952, + 0.1205, + 0.6424 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_light_blue.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_light_blue_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_light_blue.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_blue_attempt_01.png", + "output": "references/ref_pedestrian_in_light_blue.png", + "mask": "references/sam_mask_pedestrian_in_light_blue.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 47.0, + 672.0, + 989.0 + ], + "mask_score": 3.478225, + "mask_area_ratio": 0.140584, + "elapsed_seconds": 7.065 + } + }, + { + "name": "black_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", + "sub_caption": "black suv: A black SUV.. Scene role: stopped at the crosswalk yielding to pedestrians", + "measured_bbox": [ + 0.0797, + 0.2941, + 0.5997, + 0.5875 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_suv.png", + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "reference_verify": "references/reference_verify_black_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "references/ref_black_suv.png", + "mask": "references/sam_mask_black_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 1.0, + 273.0, + 1023.0, + 701.0 + ], + "mask_score": 3.159418, + "mask_area_ratio": 0.229866, + "elapsed_seconds": 7.2921 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/dataset.jsonl b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/dataset.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b945a39f7cbb3b0d7d47d4e98defeb79927f119d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/dataset.jsonl @@ -0,0 +1,10 @@ +{"sample_id": "sample_000001", "target_total": 3, "target_people": 1, "target_objects": 2, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 3, "n_detected": 3, "n_subjects": 3, "subjects": [{"name": "pedestrian", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", "source_name": "pedestrian", "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", "sub_caption": "pedestrian: A person wearing a dark coat and trousers, walking confidently.. Scene role: Walking along the curbside near the barrier.", "measured_bbox": [0.1528, 0.301, 0.2511, 0.7071], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian.png", "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", "output": "references/ref_pedestrian.png", "mask": "references/sam_mask_pedestrian.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [330.0, 42.0, 698.0, 1007.0], "mask_score": 3.413244, "mask_area_ratio": 0.159381, "elapsed_seconds": 33.2771}}, {"name": "parked_dark_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", "source_name": "parked dark car", "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the street near the curb in the background right.", "measured_bbox": [0.5163, 0.3897, 0.9968, 0.9244], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_parked_dark_car.png", "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_01.png", "reference_verify": "references/reference_verify_parked_dark_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_01.png", "output": "references/ref_parked_dark_car.png", "mask": "references/sam_mask_parked_dark_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [47.0, 315.0, 976.0, 694.0], "mask_score": 3.4345, "mask_area_ratio": 0.180014, "elapsed_seconds": 7.1991}}, {"name": "metal_barrier", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", "source_name": "metal barrier", "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", "sub_caption": "metal barrier: A silver metal barricade placed along the street.. Scene role: Positioned along the curb to section off the pedestrian area from the road.", "measured_bbox": [0.3454, 0.4302, 0.5465, 0.8402], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_metal_barrier.png", "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", "reference_verify": "references/reference_verify_metal_barrier.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", "output": "references/ref_metal_barrier.png", "mask": "references/sam_mask_metal_barrier.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [2.0, 107.0, 1009.0, 986.0], "mask_score": 1.555076, "mask_area_ratio": 0.845579, "elapsed_seconds": 7.2854}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000002", "target_total": 15, "target_people": 2, "target_objects": 13, "canvas_size": [1152, 864], "canvas_aspect_ratio": "4:3", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 15, "n_detected": 15, "n_subjects": 15, "subjects": [{"name": "pedestrian_walking", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", "source_name": "pedestrian", "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: walking along the sidewalk on the right side of the street", "measured_bbox": [0.7497, 0.4757, 0.7954, 0.6192], "detection_confidence": 100, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_walking.png", "raw_ref_image": "references/raw_ref_pedestrian_walking_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_walking.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_walking_attempt_01.png", "output": "references/ref_pedestrian_walking.png", "mask": "references/sam_mask_pedestrian_walking.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [341.0, 63.0, 695.0, 972.0], "mask_score": 3.459152, "mask_area_ratio": 0.145545, "elapsed_seconds": 8.3331}}, {"name": "shop_pedestrian", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "BDD100K:b714a088-861a043b:person:2", "source_name": "pedestrian", "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", "sub_caption": "pedestrian: Person standing near a shop entrance on the right, partially obscured.. Scene role: standing on the sidewalk near the storefronts on the right", "measured_bbox": [0.9337, 0.4752, 0.9695, 0.6107], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_shop_pedestrian.png", "raw_ref_image": "references/raw_ref_shop_pedestrian_attempt_02.png", "reference_verify": "references/reference_verify_shop_pedestrian.json", "reference_verify_passed": true, "reference_attempts": 2, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_shop_pedestrian_attempt_02.png", "output": "references/ref_shop_pedestrian.png", "mask": "references/sam_mask_shop_pedestrian.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [312.0, 43.0, 719.0, 1020.0], "mask_score": 3.162079, "mask_area_ratio": 0.167512, "elapsed_seconds": 7.2283}}, {"name": "city_buildings", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", "source_name": "building", "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", "sub_caption": "building: Various city buildings of different heights forming the skyline and lining the street.. Scene role: framing the street and forming the background skyline", "measured_bbox": [0.0, 0.0, 1.0, 0.6084], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_city_buildings.png", "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", "reference_verify": "references/reference_verify_city_buildings.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", "output": "references/ref_city_buildings.png", "mask": "references/sam_mask_city_buildings.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [14.0, 171.0, 1009.0, 883.0], "mask_score": 3.176814, "mask_area_ratio": 0.327415, "elapsed_seconds": 7.146}}, {"name": "street_signs", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", "source_name": "street signs", "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", "sub_caption": "street signs: Various blank street signs attached to a pole on the right side of the street.. Scene role: mounted on a pole next to the right sidewalk", "measured_bbox": [0.641, 0.165, 0.744, 0.408], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_signs.png", "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", "reference_verify": "references/reference_verify_street_signs.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", "output": "references/ref_street_signs.png", "mask": "references/sam_mask_street_signs.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [224.0, 0.0, 744.0, 1023.0], "mask_score": 3.332549, "mask_area_ratio": 0.190769, "elapsed_seconds": 7.1886}}, {"name": "storefront_sign", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", "source_name": "storefront sign", "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", "sub_caption": "storefront sign: A dark hanging sign framework attached to a building on the right, devoid of readable text.. Scene role: hanging above the shop entrance on the right side of the road", "measured_bbox": [0.7854, 0.1934, 0.9082, 0.2906], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_storefront_sign.png", "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", "reference_verify": "references/reference_verify_storefront_sign.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", "output": "references/ref_storefront_sign.png", "mask": "references/sam_mask_storefront_sign.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [46.0, 0.0, 1023.0, 811.0], "mask_score": 3.296373, "mask_area_ratio": 0.447847, "elapsed_seconds": 7.3102}}, {"name": "parked_suv_right", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c54441e6-400c221e:object:4", "source_name": "parked SUV", "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: parked alongside the right curb", "measured_bbox": [0.5507, 0.4879, 0.6783, 0.6234], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_parked_suv_right.png", "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", "reference_verify": "references/reference_verify_parked_suv_right.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", "output": "references/ref_parked_suv_right.png", "mask": "references/sam_mask_parked_suv_right.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [156.0, 150.0, 868.0, 812.0], "mask_score": 3.463227, "mask_area_ratio": 0.291222, "elapsed_seconds": 7.2583}}, {"name": "parked_car_left", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", "source_name": "car", "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: parked alongside the left curb", "measured_bbox": [0.0, 0.5102, 0.1259, 0.5998], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_parked_car_left.png", "raw_ref_image": "references/raw_ref_parked_car_left_attempt_01.png", "reference_verify": "references/reference_verify_parked_car_left.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_car_left_attempt_01.png", "output": "references/ref_parked_car_left.png", "mask": "references/sam_mask_parked_car_left.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 319.0, 1023.0, 695.0], "mask_score": 3.122119, "mask_area_ratio": 0.19451, "elapsed_seconds": 8.5738}}, {"name": "traveling_dark_suv", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", "source_name": "dark SUV", "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible red taillights reflecting the twilight.. Scene role: driving in the adjacent lane", "measured_bbox": [0.2594, 0.4853, 0.417, 0.6419], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_traveling_dark_suv.png", "raw_ref_image": "references/raw_ref_traveling_dark_suv_attempt_01.png", "reference_verify": "references/reference_verify_traveling_dark_suv.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_traveling_dark_suv_attempt_01.png", "output": "references/ref_traveling_dark_suv.png", "mask": "references/sam_mask_traveling_dark_suv.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [119.0, 198.0, 910.0, 810.0], "mask_score": 3.470329, "mask_area_ratio": 0.300606, "elapsed_seconds": 8.5072}}, {"name": "street_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", "source_name": "street light", "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road from the right side.. Scene role: providing illumination from the right sidewalk", "measured_bbox": [0.5577, 0.0219, 0.6964, 0.588], "detection_confidence": 1.0, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_light.png", "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", "reference_verify": "references/reference_verify_street_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", "output": "references/ref_street_light.png", "mask": "references/sam_mask_street_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [330.0, 17.0, 688.0, 996.0], "mask_score": 3.395182, "mask_area_ratio": 0.033435, "elapsed_seconds": 7.0701}}, {"name": "vehicle_dashboard", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", "source_name": "dashboard", "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground.. Scene role: anchoring the bottom of the frame to establish a driver's perspective", "measured_bbox": [0.0, 0.8881, 1.0, 1.0], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_vehicle_dashboard.png", "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", "reference_verify": "references/reference_verify_vehicle_dashboard.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", "output": "references/ref_vehicle_dashboard.png", "mask": "references/sam_mask_vehicle_dashboard.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 223.0, 1023.0, 700.0], "mask_score": 2.938032, "mask_area_ratio": 0.282133, "elapsed_seconds": 7.1679}}, {"name": "drainage_grate", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b4d0e72d-3b208072:object:16", "source_name": "drainage grate", "source_description": "A metal drainage grate on the edge of the road on the right. Source dataset: BDD100K. Scene context: A driving scene on a multi-lane highway with a dark red minivan in the left lane, under a partly cloudy sky.", "sub_caption": "drainage grate: A metal drainage grate on the edge of the road on the right.. Scene role: embedded in the road surface near the right curb", "measured_bbox": [0.5682, 0.6773, 0.8089, 0.73], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_drainage_grate.png", "raw_ref_image": "references/raw_ref_drainage_grate_attempt_01.png", "reference_verify": "references/reference_verify_drainage_grate.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_drainage_grate_attempt_01.png", "output": "references/ref_drainage_grate.png", "mask": "references/sam_mask_drainage_grate.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 250.0, 1023.0, 773.0], "mask_score": 3.366042, "mask_area_ratio": 0.379179, "elapsed_seconds": 8.3171}}, {"name": "white_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", "source_name": "white car", "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", "sub_caption": "white car: A white car visible further down the road in the right lane.. Scene role: driving ahead in the same lane", "measured_bbox": [0.4356, 0.5036, 0.4784, 0.548], "detection_confidence": "high", "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_white_car.png", "raw_ref_image": "references/raw_ref_white_car_attempt_01.png", "reference_verify": "references/reference_verify_white_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_attempt_01.png", "output": "references/ref_white_car.png", "mask": "references/sam_mask_white_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [137.0, 215.0, 884.0, 819.0], "mask_score": 3.442096, "mask_area_ratio": 0.295652, "elapsed_seconds": 7.1564}}, {"name": "yellow_lines", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c417a291-7802692d:object:8", "source_name": "yellow lines", "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: painted down the center of the road", "measured_bbox": [0.0, 0.622, 0.2642, 0.7692], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_yellow_lines.png", "raw_ref_image": "references/raw_ref_yellow_lines_attempt_01.png", "reference_verify": "references/reference_verify_yellow_lines.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_yellow_lines_attempt_01.png", "output": "references/ref_yellow_lines.png", "mask": "references/sam_mask_yellow_lines.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 0.0, 1023.0, 1023.0], "mask_score": 3.166027, "mask_area_ratio": 0.242679, "elapsed_seconds": 7.0941}}, {"name": "street_trees", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", "source_name": "trees", "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", "sub_caption": "trees: Numerous trees with dense green foliage lining both sides of the road.. Scene role: growing along the sidewalks, adding greenery", "measured_bbox": [0.2664, 0.0, 0.7141, 0.5127], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_trees.png", "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", "reference_verify": "references/reference_verify_street_trees.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", "output": "references/ref_street_trees.png", "mask": "references/sam_mask_street_trees.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [35.0, 55.0, 1002.0, 1000.0], "mask_score": 3.226043, "mask_area_ratio": 0.439437, "elapsed_seconds": 7.0986}}, {"name": "twilight_sky", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", "source_name": "sky", "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: visible above the buildings and trees at the end of the road", "measured_bbox": [0.188, 0.0, 0.862, 0.4846], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_twilight_sky.png", "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", "reference_verify": "references/reference_verify_twilight_sky.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", "output": "references/ref_twilight_sky.png", "mask": "references/sam_mask_twilight_sky.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [72.0, 72.0, 951.0, 951.0], "mask_score": 3.471577, "mask_area_ratio": 0.631801, "elapsed_seconds": 7.5016}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000003", "target_total": 3, "target_people": 1, "target_objects": 2, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 3, "n_detected": 3, "n_subjects": 3, "subjects": [{"name": "waiting_pedestrian", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", "source_name": "shopper", "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", "sub_caption": "shopper: A pedestrian wearing a dark top and dark pants, standing upright with a natural posture.. Scene role: Waiting at the curb near the crosswalk on the left side of the street.", "measured_bbox": [0.0928, 0.1174, 0.205, 0.9401], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_waiting_pedestrian.png", "raw_ref_image": "references/raw_ref_waiting_pedestrian_attempt_01.png", "reference_verify": "references/reference_verify_waiting_pedestrian.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_waiting_pedestrian_attempt_01.png", "output": "references/ref_waiting_pedestrian.png", "mask": "references/sam_mask_waiting_pedestrian.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [348.0, 48.0, 704.0, 1015.0], "mask_score": 3.427649, "mask_area_ratio": 0.155239, "elapsed_seconds": 6.9951}}, {"name": "black_sedan", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", "source_name": "black sedan", "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", "sub_caption": "black sedan: A glossy black sedan with visible headlights and a detailed front grille.. Scene role: Approaching the crosswalk in the center traffic lane.", "measured_bbox": [0.3895, 0.2431, 0.591, 0.5084], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_black_sedan.png", "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", "reference_verify": "references/reference_verify_black_sedan.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", "output": "references/ref_black_sedan.png", "mask": "references/sam_mask_black_sedan.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 221.0, 1023.0, 796.0], "mask_score": 3.446312, "mask_area_ratio": 0.340465, "elapsed_seconds": 7.2258}}, {"name": "silver_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", "source_name": "silver car", "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", "sub_caption": "silver car: A metallic silver car reflecting daylight.. Scene role: Driving in the right lane slightly ahead of the black sedan, approaching the intersection.", "measured_bbox": [0.6628, 0.2419, 0.9089, 0.4999], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_silver_car.png", "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", "reference_verify": "references/reference_verify_silver_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", "output": "references/ref_silver_car.png", "mask": "references/sam_mask_silver_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [13.0, 220.0, 1011.0, 811.0], "mask_score": 3.077144, "mask_area_ratio": 0.338042, "elapsed_seconds": 7.0902}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000004", "target_total": 5, "target_people": 1, "target_objects": 4, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 5, "n_detected": 5, "n_subjects": 5, "subjects": [{"name": "walker", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", "source_name": "walker", "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Standing at the edge of the sidewalk near the crosswalk, waiting to cross the street.", "measured_bbox": [0.7914, 0.2893, 0.834, 0.4815], "detection_confidence": 1.0, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_walker.png", "raw_ref_image": "references/raw_ref_walker_attempt_01.png", "reference_verify": "references/reference_verify_walker.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_walker_attempt_01.png", "output": "references/ref_walker.png", "mask": "references/sam_mask_walker.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [336.0, 51.0, 688.0, 1005.0], "mask_score": 3.433924, "mask_area_ratio": 0.16005, "elapsed_seconds": 7.2846}}, {"name": "traffic_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", "source_name": "traffic light", "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", "sub_caption": "traffic light: A set of traffic lights suspended over the intersection, showing a red light.. Scene role: Hanging high above the intersection in the upper-center of the frame.", "measured_bbox": [0.4425, 0.023, 0.467, 0.1052], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_traffic_light.png", "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", "reference_verify": "references/reference_verify_traffic_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_traffic_light_attempt_01.png", "output": "references/ref_traffic_light.png", "mask": "references/sam_mask_traffic_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [160.0, 93.0, 864.0, 930.0], "mask_score": 3.437579, "mask_area_ratio": 0.253583, "elapsed_seconds": 7.0663}}, {"name": "delivery_truck", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", "source_name": "delivery truck", "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", "sub_caption": "delivery truck: A large, plain white box delivery truck.. Scene role: Parked alongside the right curb in the background, past the intersection.", "measured_bbox": [0.576, 0.1929, 0.7135, 0.4081], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_delivery_truck.png", "raw_ref_image": "references/raw_ref_delivery_truck_attempt_01.png", "reference_verify": "references/reference_verify_delivery_truck.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_delivery_truck_attempt_01.png", "output": "references/ref_delivery_truck.png", "mask": "references/sam_mask_delivery_truck.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [95.0, 100.0, 910.0, 932.0], "mask_score": 3.445823, "mask_area_ratio": 0.476913, "elapsed_seconds": 7.1923}}, {"name": "dark_parked_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", "source_name": "dark parked car", "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", "sub_caption": "dark parked car: A dark-colored sedan.. Scene role: Parked on the right side of the street near the sidewalk in the mid-ground.", "measured_bbox": [0.8414, 0.3717, 0.9967, 0.7454], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dark_parked_car.png", "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", "reference_verify": "references/reference_verify_dark_parked_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", "output": "references/ref_dark_parked_car.png", "mask": "references/sam_mask_dark_parked_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [19.0, 336.0, 1003.0, 700.0], "mask_score": 3.408233, "mask_area_ratio": 0.181406, "elapsed_seconds": 8.4178}}, {"name": "street_lines", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", "source_name": "street lines", "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", "sub_caption": "street lines: Double yellow lines separating traffic directions and crisp white painted lines indicating lanes and a crosswalk.. Scene role: Painted on the asphalt road surface, extending from the foreground toward the intersection.", "measured_bbox": [0.003, 0.3541, 0.915, 0.8612], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_lines.png", "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", "reference_verify": "references/reference_verify_street_lines.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", "output": "references/ref_street_lines.png", "mask": "references/sam_mask_street_lines.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [38.0, 225.0, 985.0, 799.0], "mask_score": 3.287982, "mask_area_ratio": 0.400985, "elapsed_seconds": 7.2613}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000005", "target_total": 6, "target_people": 3, "target_objects": 3, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 6, "n_detected": 6, "n_subjects": 6, "subjects": [{"name": "firefighter", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_2/282555,e0af90003451118a.jpg:person:8", "source_name": "firefighter", "source_description": "Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away near the fire truck. Source dataset: CrowdHuman. Scene context: Emergency response personnel, including firefighters and ambulance crew, are gathered outside a large classical building with pillars and banners, accompanied by emergency vehicles.", "sub_caption": "firefighter: Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away.. Scene role: Assisting with incident management, positioned near the stopped car and barrier.", "measured_bbox": [0.2626, 0.3463, 0.3289, 0.6561], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_firefighter.png", "raw_ref_image": "references/raw_ref_firefighter_attempt_01.png", "reference_verify": "references/reference_verify_firefighter.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_firefighter_attempt_01.png", "output": "references/ref_firefighter.png", "mask": "references/sam_mask_firefighter.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [317.0, 34.0, 709.0, 1009.0], "mask_score": 3.445343, "mask_area_ratio": 0.178691, "elapsed_seconds": 7.0362}}, {"name": "uniformed_officer", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_1/273275,f68c20007e0bf148.jpg:person:3", "source_name": "uniformed officer", "source_description": "wearing a khaki uniform and helmet, holding a baton, looking towards the left Source dataset: CrowdHuman. Scene context: A large crowd of people, including some in uniform with batons and helmets, stands in front of a red and yellow building.", "sub_caption": "uniformed officer: Wearing a khaki uniform and helmet, holding a baton, looking towards the left.. Scene role: Directing surrounding traffic away from the stopped vehicle using a baton.", "measured_bbox": [0.0497, 0.3566, 0.1691, 0.6118], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_uniformed_officer.png", "raw_ref_image": "references/raw_ref_uniformed_officer_attempt_01.png", "reference_verify": "references/reference_verify_uniformed_officer.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_uniformed_officer_attempt_01.png", "output": "references/ref_uniformed_officer.png", "mask": "references/sam_mask_uniformed_officer.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [331.0, 24.0, 689.0, 1005.0], "mask_score": 3.475629, "mask_area_ratio": 0.156165, "elapsed_seconds": 7.0984}}, {"name": "bystander_in_suit", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", "source_name": "crowd member", "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", "sub_caption": "crowd member: A person wearing a suit.. Scene role: Standing off to the right side of the road behind the barrier, acting as the driver or an involved pedestrian.", "measured_bbox": [0.7467, 0.3318, 0.8036, 0.5111], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_bystander_in_suit.png", "raw_ref_image": "references/raw_ref_bystander_in_suit_attempt_01.png", "reference_verify": "references/reference_verify_bystander_in_suit.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_bystander_in_suit_attempt_01.png", "output": "references/ref_bystander_in_suit.png", "mask": "references/sam_mask_bystander_in_suit.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [341.0, 59.0, 677.0, 996.0], "mask_score": 3.480669, "mask_area_ratio": 0.144797, "elapsed_seconds": 7.0242}}, {"name": "traffic_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", "source_name": "traffic light", "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Hanging overhead or mounted prominently on a pole at the intersection.", "measured_bbox": [0.5381, 0.0316, 0.5856, 0.2076], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_traffic_light.png", "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", "reference_verify": "references/reference_verify_traffic_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", "output": "references/ref_traffic_light.png", "mask": "references/sam_mask_traffic_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [272.0, 15.0, 750.0, 1006.0], "mask_score": 3.448339, "mask_area_ratio": 0.303974, "elapsed_seconds": 8.3734}}, {"name": "concrete_barrier", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c946c532-07177e0a:object:11", "source_name": "concrete barrier", "source_description": "A continuous low concrete wall acting as a barrier on the right side of the road. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a multi-lane highway during the day, with construction or industrial sites visible alongside.", "sub_caption": "concrete barrier: A continuous low concrete wall acting as a barrier on the right side of the road.. Scene role: Lining the right side of the street, separating the pedestrian walkway or construction zone from the active traffic lane.", "measured_bbox": [0.6322, 0.4972, 0.9964, 0.6985], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_concrete_barrier.png", "raw_ref_image": "references/raw_ref_concrete_barrier_attempt_01.png", "reference_verify": "references/reference_verify_concrete_barrier.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_concrete_barrier_attempt_01.png", "output": "references/ref_concrete_barrier.png", "mask": "references/sam_mask_concrete_barrier.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [53.0, 219.0, 970.0, 811.0], "mask_score": 3.469119, "mask_area_ratio": 0.3653, "elapsed_seconds": 7.0274}}, {"name": "silver_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", "source_name": "silver car", "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection with illuminated brake lights.. Scene role: Stopped in the active lane near the barrier, serving as the focal point of the traffic response.", "measured_bbox": [0.3396, 0.3754, 0.6399, 0.6647], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_silver_car.png", "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", "reference_verify": "references/reference_verify_silver_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", "output": "references/ref_silver_car.png", "mask": "references/sam_mask_silver_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [46.0, 215.0, 976.0, 829.0], "mask_score": 3.457698, "mask_area_ratio": 0.330622, "elapsed_seconds": 7.0933}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000006", "target_total": 8, "target_people": 1, "target_objects": 7, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 8, "n_detected": 8, "n_subjects": 8, "subjects": [{"name": "distant_pedestrian", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", "source_name": "pedestrian", "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", "sub_caption": "pedestrian: A person walking across the street in the distant background.. Scene role: Crossing the crosswalk in the distance ahead of the approaching vehicles.", "measured_bbox": [0.3877, 0.478, 0.4204, 0.5881], "detection_confidence": 0.9, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_distant_pedestrian.png", "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", "reference_verify": "references/reference_verify_distant_pedestrian.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_distant_pedestrian_attempt_01.png", "output": "references/ref_distant_pedestrian.png", "mask": "references/sam_mask_distant_pedestrian.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [324.0, 9.0, 705.0, 1015.0], "mask_score": 3.338419, "mask_area_ratio": 0.174056, "elapsed_seconds": 8.694}}, {"name": "vertical_illuminated_sign", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bb1b7e42-9608265e:object:6", "source_name": "street sign", "source_description": "A vertical 'PARK' sign illuminated on the right side of the street, indicating a parking garage. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a city street with tall buildings on both sides, following a yellow taxi, with other cars parked and driving.", "sub_caption": "street sign: A vertical illuminated neon sign with abstract shapes, glowing brightly.. Scene role: Mounted on the building facade on the right side of the street, adding ambient night lighting.", "measured_bbox": [0.7683, 0.0355, 0.8177, 0.2837], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_vertical_illuminated_sign.png", "raw_ref_image": "references/raw_ref_vertical_illuminated_sign_attempt_01.png", "reference_verify": "references/reference_verify_vertical_illuminated_sign.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_vertical_illuminated_sign_attempt_01.png", "output": "references/ref_vertical_illuminated_sign.png", "mask": "references/sam_mask_vertical_illuminated_sign.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [305.0, 20.0, 728.0, 1002.0], "mask_score": 3.37343, "mask_area_ratio": 0.273593, "elapsed_seconds": 7.1332}}, {"name": "emergency_vehicle", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b99f250d-886111c5:object:5", "source_name": "vehicle", "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", "sub_caption": "vehicle: A dark-colored vehicle with bright blue emergency lights flashing.. Scene role: Parked on the left side of the street near the intersection.", "measured_bbox": [0.1031, 0.4564, 0.2827, 0.6497], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_emergency_vehicle.png", "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", "reference_verify": "references/reference_verify_emergency_vehicle.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", "output": "references/ref_emergency_vehicle.png", "mask": "references/sam_mask_emergency_vehicle.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [8.0, 237.0, 1015.0, 828.0], "mask_score": 3.468468, "mask_area_ratio": 0.355034, "elapsed_seconds": 7.0896}}, {"name": "white_panel_van", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", "source_name": "white van", "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", "sub_caption": "white van: A large white panel van with red taillights illuminated.. Scene role: Driving in the lane directly ahead of the camera perspective.", "measured_bbox": [0.4556, 0.3288, 0.5926, 0.6597], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_white_panel_van.png", "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", "reference_verify": "references/reference_verify_white_panel_van.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", "output": "references/ref_white_panel_van.png", "mask": "references/sam_mask_white_panel_van.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [181.0, 63.0, 843.0, 937.0], "mask_score": 2.636854, "mask_area_ratio": 0.376409, "elapsed_seconds": 7.1379}}, {"name": "double_solid_line", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", "source_name": "double solid white line", "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", "sub_caption": "double solid white line: Two continuous white painted lines on the dark asphalt road surface.. Scene role: Separating the traffic lanes on the dark road, leading toward the intersection.", "measured_bbox": [0.1922, 0.6133, 0.4541, 1.0], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_double_solid_line.png", "raw_ref_image": "references/raw_ref_double_solid_line_attempt_01.png", "reference_verify": "references/reference_verify_double_solid_line.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_line_attempt_01.png", "output": "references/ref_double_solid_line.png", "mask": "references/sam_mask_double_solid_line.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [22.0, 186.0, 1001.0, 837.0], "mask_score": 3.460181, "mask_area_ratio": 0.372935, "elapsed_seconds": 8.3174}}, {"name": "dark_building_facade", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c807cb19-7e09cb11:object:8", "source_name": "building facade", "source_description": "Dark outlines of buildings lining the street on both sides, with some lit windows. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane city street with traffic lights and vehicles ahead.", "sub_caption": "building facade: Dark outlines of buildings with scattered, warm-toned lit windows.. Scene role: Forming the urban backdrop along the left side of the street.", "measured_bbox": [0.1397, 0.0, 0.366, 0.5427], "detection_confidence": 0.8, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dark_building_facade.png", "raw_ref_image": "references/raw_ref_dark_building_facade_attempt_01.png", "reference_verify": "references/reference_verify_dark_building_facade.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_dark_building_facade_attempt_01.png", "output": "references/ref_dark_building_facade.png", "mask": "references/sam_mask_dark_building_facade.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [128.0, 0.0, 887.0, 1000.0], "mask_score": 2.829968, "mask_area_ratio": 0.624767, "elapsed_seconds": 7.1675}}, {"name": "awning_building_corner", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c06d23aa-cb9ae751:object:6", "source_name": "building corner", "source_description": "The corner of a building on the right side, with an orange or red awning and some lit signs. Source dataset: BDD100K. Scene context: Nighttime driving scene at an intersection with a stop sign and a large black SUV passing on the right.", "sub_caption": "building corner: The corner of a building featuring an awning and brightly lit abstract signboards.. Scene role: Anchoring the right side of the intersection with a warm architectural glow.", "measured_bbox": [0.6102, 0.3347, 0.7867, 0.5412], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_awning_building_corner.png", "raw_ref_image": "references/raw_ref_awning_building_corner_attempt_01.png", "reference_verify": "references/reference_verify_awning_building_corner.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_awning_building_corner_attempt_01.png", "output": "references/ref_awning_building_corner.png", "mask": "references/sam_mask_awning_building_corner.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [27.0, 27.0, 975.0, 980.0], "mask_score": 3.458235, "mask_area_ratio": 0.594922, "elapsed_seconds": 7.3072}}, {"name": "green_street_sign", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", "source_name": "street sign", "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", "sub_caption": "street sign: A standard green street sign without any readable text.. Scene role: Hanging from a traffic light pole near the intersection.", "measured_bbox": [0.5754, 0.1583, 0.6522, 0.1884], "detection_confidence": 100, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_green_street_sign.png", "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", "reference_verify": "references/reference_verify_green_street_sign.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", "output": "references/ref_green_street_sign.png", "mask": "references/sam_mask_green_street_sign.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [61.0, 378.0, 962.0, 645.0], "mask_score": 3.379525, "mask_area_ratio": 0.536634, "elapsed_seconds": 7.1734}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000007", "target_total": 14, "target_people": 1, "target_objects": 13, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 14, "n_detected": 14, "n_subjects": 14, "subjects": [{"name": "woman_in_dark_dress", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", "source_name": "bridesmaid", "source_description": "A woman with dark hair wearing a dark knee-length dress, walking along the path. Source dataset: CrowdHuman. Scene context: A bride in a white gown and her bridesmaids in dark dresses are walking along a paved path next to a building with stairs, surrounded by trees and a white fence in a sunlit outdoor setting.", "sub_caption": "bridesmaid: A woman with dark hair wearing a dark knee-length dress.. Scene role: walking along the left sidewalk, approaching the crosswalk", "measured_bbox": [0.1595, 0.5229, 0.2058, 0.7308], "detection_confidence": 100, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_woman_in_dark_dress.png", "raw_ref_image": "references/raw_ref_woman_in_dark_dress_attempt_01.png", "reference_verify": "references/reference_verify_woman_in_dark_dress.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_woman_in_dark_dress_attempt_01.png", "output": "references/ref_woman_in_dark_dress.png", "mask": "references/sam_mask_woman_in_dark_dress.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [339.0, 6.0, 680.0, 1019.0], "mask_score": 3.415794, "mask_area_ratio": 0.169896, "elapsed_seconds": 8.5595}}, {"name": "dashboard", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", "source_name": "dashboard", "source_description": "The dashboard of the camera vehicle, visible at the bottom of the frame. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching a tunnel or underpass, with buildings on the left and a retaining wall on the right.", "sub_caption": "dashboard: The dashboard of the camera vehicle, visible at the bottom of the frame.. Scene role: anchoring the bottom foreground of the frame to establish the interior car viewpoint", "measured_bbox": [0.0, 0.7874, 1.0, 1.0], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dashboard.png", "raw_ref_image": "references/raw_ref_dashboard_attempt_02.png", "reference_verify": "references/reference_verify_dashboard.json", "reference_verify_passed": true, "reference_attempts": 2, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dashboard_attempt_02.png", "output": "references/ref_dashboard.png", "mask": "references/sam_mask_dashboard.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 267.0, 1023.0, 746.0], "mask_score": 3.179413, "mask_area_ratio": 0.226381, "elapsed_seconds": 8.7463}}, {"name": "overhead_wires", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", "source_name": "overhead wires", "source_description": "Power and communication lines stretching across the sky above the street. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential street lined with parked cars and houses.", "sub_caption": "overhead wires: Power and communication lines stretching across the sky.. Scene role: strung overhead across the sky, connecting the buildings on either side", "measured_bbox": [0.0656, 0.0, 0.9992, 0.338], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_overhead_wires.png", "raw_ref_image": "references/raw_ref_overhead_wires_attempt_01.png", "reference_verify": "references/reference_verify_overhead_wires.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_overhead_wires_attempt_01.png", "output": "references/ref_overhead_wires.png", "mask": "references/sam_mask_overhead_wires.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 152.0, 1023.0, 791.0], "mask_score": 2.72423, "mask_area_ratio": 0.290783, "elapsed_seconds": 7.2274}}, {"name": "bunch_of_balloons", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_68/273278,d37b500038386e31.jpg:object:0", "source_name": "bunch of balloons", "source_description": "A bunch of heart-shaped balloons, some pink and some red, tied to a wooden utility pole. Source dataset: CrowdHuman. Scene context: A group of people standing in a circle, holding hands on a street in a residential area.", "sub_caption": "bunch of balloons: A bunch of heart-shaped balloons, some pink and some red.. Scene role: tied to a metal pole on the left sidewalk", "measured_bbox": [0.2318, 0.3806, 0.2869, 0.4973], "detection_confidence": 100, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_bunch_of_balloons.png", "raw_ref_image": "references/raw_ref_bunch_of_balloons_attempt_01.png", "reference_verify": "references/reference_verify_bunch_of_balloons.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_bunch_of_balloons_attempt_01.png", "output": "references/ref_bunch_of_balloons.png", "mask": "references/sam_mask_bunch_of_balloons.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [213.0, 104.0, 830.0, 1023.0], "mask_score": 3.440433, "mask_area_ratio": 0.246776, "elapsed_seconds": 7.1532}}, {"name": "white_garbage_bag", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_52/283081,13fff00018862889.jpg:object:3", "source_name": "white garbage bag", "source_description": "large white plastic bag on the bottom right corner Source dataset: CrowdHuman. Scene context: A group of fifteen people posing for a photo in front of a colorful graffiti wall.", "sub_caption": "white garbage bag: A large white plastic bag.. Scene role: placed on the curb near the crosswalk on the right side", "measured_bbox": [0.8062, 0.6476, 0.8807, 0.7562], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_white_garbage_bag.png", "raw_ref_image": "references/raw_ref_white_garbage_bag_attempt_01.png", "reference_verify": "references/reference_verify_white_garbage_bag.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_garbage_bag_attempt_01.png", "output": "references/ref_white_garbage_bag.png", "mask": "references/sam_mask_white_garbage_bag.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [27.0, 77.0, 963.0, 989.0], "mask_score": 3.477571, "mask_area_ratio": 0.521497, "elapsed_seconds": 7.4222}}, {"name": "multi_story_building_left", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", "source_name": "building on left", "source_description": "Multi-story brick buildings with numerous windows and fire escapes on the left side. Source dataset: BDD100K. Scene context: A narrow city street lined with parked cars on both sides, with residential and commercial buildings featuring fire escapes and awnings, illuminated by sunlight filtering through mature trees.", "sub_caption": "building on left: Multi-story brick buildings with numerous windows and fire escapes.. Scene role: forming the street facade along the left side of the frame", "measured_bbox": [0.1156, 0.0, 0.416, 0.6004], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_multi_story_building_left.png", "raw_ref_image": "references/raw_ref_multi_story_building_left_attempt_01.png", "reference_verify": "references/reference_verify_multi_story_building_left.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_multi_story_building_left_attempt_01.png", "output": "references/ref_multi_story_building_left.png", "mask": "references/sam_mask_multi_story_building_left.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [21.0, 18.0, 1013.0, 988.0], "mask_score": 2.993486, "mask_area_ratio": 0.685524, "elapsed_seconds": 7.4739}}, {"name": "street_light_pole", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", "source_name": "street light pole", "source_description": "A tall, curved metal street light pole on the right side of the road, supporting the overhead sign. Source dataset: BDD100K. Scene context: A view from a car driving on a multi-lane road bordered by trees, with other vehicles and road signs visible.", "sub_caption": "street light pole: A tall, curved metal street light pole.. Scene role: standing on the right sidewalk, leaning over the roadway", "measured_bbox": [0.548, 0.0288, 0.7884, 0.7106], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_light_pole.png", "raw_ref_image": "references/raw_ref_street_light_pole_attempt_01.png", "reference_verify": "references/reference_verify_street_light_pole.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_street_light_pole_attempt_01.png", "output": "references/ref_street_light_pole.png", "mask": "references/sam_mask_street_light_pole.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [204.0, 10.0, 816.0, 1018.0], "mask_score": 3.426132, "mask_area_ratio": 0.025422, "elapsed_seconds": 7.2131}}, {"name": "white_sedan", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c754ce77-a105a975:object:3", "source_name": "white sedan", "source_description": "A white passenger car partially visible in the right lane next to the gold SUV. Source dataset: BDD100K. Scene context: View from inside a car driving in city traffic on a sunny day with multiple vehicles and urban infrastructure visible.", "sub_caption": "white sedan: A white passenger car.. Scene role: driving in the forward lane just past the crosswalk", "measured_bbox": [0.494, 0.5481, 0.6384, 0.6346], "detection_confidence": 1.0, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_white_sedan.png", "raw_ref_image": "references/raw_ref_white_sedan_attempt_01.png", "reference_verify": "references/reference_verify_white_sedan.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_sedan_attempt_01.png", "output": "references/ref_white_sedan.png", "mask": "references/sam_mask_white_sedan.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [10.0, 331.0, 1014.0, 694.0], "mask_score": 2.789065, "mask_area_ratio": 0.197716, "elapsed_seconds": 7.2389}}, {"name": "dark_car_1", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", "source_name": "dark car", "source_description": "A dark-colored sedan visible on the left side of the street, partially obscured by rain. Source dataset: BDD100K. Scene context: View from inside a vehicle through a heavily rain-covered windshield, looking at city traffic and buildings.", "sub_caption": "dark car: A dark-colored sedan.. Scene role: driving in the opposing traffic lane to the left", "measured_bbox": [0.3126, 0.5583, 0.4593, 0.6372], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dark_car_1.png", "raw_ref_image": "references/raw_ref_dark_car_1_attempt_01.png", "reference_verify": "references/reference_verify_dark_car_1.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_1_attempt_01.png", "output": "references/ref_dark_car_1.png", "mask": "references/sam_mask_dark_car_1.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [16.0, 317.0, 1007.0, 664.0], "mask_score": 3.079951, "mask_area_ratio": 0.171859, "elapsed_seconds": 7.1713}}, {"name": "dark_car_2", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", "source_name": "dark car 2", "source_description": "A dark-colored car parked on the right side of the street, ahead of the other dark car. Source dataset: BDD100K. Scene context: A dashcam view driving down a city street lined with parked cars on both sides and multi-story brick apartment buildings under a partly cloudy sky.", "sub_caption": "dark car 2: A dark-colored car.. Scene role: parked alongside the curb on the right side of the street", "measured_bbox": [0.7955, 0.5535, 0.9254, 0.6345], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dark_car_2.png", "raw_ref_image": "references/raw_ref_dark_car_2_attempt_01.png", "reference_verify": "references/reference_verify_dark_car_2.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_2_attempt_01.png", "output": "references/ref_dark_car_2.png", "mask": "references/sam_mask_dark_car_2.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 353.0, 1023.0, 727.0], "mask_score": 3.072596, "mask_area_ratio": 0.191711, "elapsed_seconds": 7.2503}}, {"name": "brick_building_right", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c2186a76-5444a563:object:5", "source_name": "brick building", "source_description": "A tall, multi-story red brick building on the left side of the street, featuring arched windows and a storefront. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a city street lined with parked cars and multi-story brick buildings.", "sub_caption": "brick building: A tall, multi-story red brick building featuring arched windows and a storefront.. Scene role: lining the street on the right side of the frame", "measured_bbox": [0.9184, 0.0073, 0.9492, 0.6625], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_brick_building_right.png", "raw_ref_image": "references/raw_ref_brick_building_right_attempt_01.png", "reference_verify": "references/reference_verify_brick_building_right.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_brick_building_right_attempt_01.png", "output": "references/ref_brick_building_right.png", "mask": "references/sam_mask_brick_building_right.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [100.0, 0.0, 930.0, 1023.0], "mask_score": 2.148493, "mask_area_ratio": 0.586381, "elapsed_seconds": 7.383}}, {"name": "metal_pole", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c411687d-73471431:object:14", "source_name": "pole", "source_description": "A thin, straight metal pole standing upright on the sidewalk near the park area on the left. Source dataset: BDD100K. Scene context: A dashcam view looking down a slightly sloped residential city street with cars parked on both sides, trees bordering a park area to the left, and a tall building to the right, under a cloudy, overcast sky.", "sub_caption": "pole: A thin, straight metal pole standing upright.. Scene role: standing on the left sidewalk, serving as a mounting point for the balloons", "measured_bbox": [0.215, 0.375, 0.23, 0.734], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_metal_pole.png", "raw_ref_image": "references/raw_ref_metal_pole_attempt_01.png", "reference_verify": "references/reference_verify_metal_pole.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_metal_pole_attempt_01.png", "output": "references/ref_metal_pole.png", "mask": "references/sam_mask_metal_pole.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [443.0, 15.0, 574.0, 1015.0], "mask_score": 3.415272, "mask_area_ratio": 0.028519, "elapsed_seconds": 7.2129}}, {"name": "crosswalk_markings", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", "source_name": "crosswalk markings", "source_description": "White painted lines on the road surface indicating a pedestrian crosswalk. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with traffic lights and a crosswalk.", "sub_caption": "crosswalk markings: White painted lines on the road surface indicating a pedestrian crosswalk.. Scene role: painted across the road directly ahead of the camera vehicle", "measured_bbox": [0.3161, 0.6787, 0.7102, 0.7212], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_crosswalk_markings.png", "raw_ref_image": "references/raw_ref_crosswalk_markings_attempt_01.png", "reference_verify": "references/reference_verify_crosswalk_markings.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_crosswalk_markings_attempt_01.png", "output": "references/ref_crosswalk_markings.png", "mask": "references/sam_mask_crosswalk_markings.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 104.0, 1023.0, 866.0], "mask_score": 3.308171, "mask_area_ratio": 0.469022, "elapsed_seconds": 7.2159}}, {"name": "iron_balcony", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", "source_name": "balcony", "source_description": "A dark, wrought-iron balcony on a building. Source dataset: CrowdHuman. Scene context: People walk down a narrow, sunlit street lined with tall buildings.", "sub_caption": "balcony: A dark, wrought-iron balcony.. Scene role: attached to the facade of the multi-story building on the left", "measured_bbox": [0.0089, 0.0773, 0.1849, 0.3213], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_iron_balcony.png", "raw_ref_image": "references/raw_ref_iron_balcony_attempt_01.png", "reference_verify": "references/reference_verify_iron_balcony.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_iron_balcony_attempt_01.png", "output": "references/ref_iron_balcony.png", "mask": "references/sam_mask_iron_balcony.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [76.0, 0.0, 946.0, 952.0], "mask_score": 3.194017, "mask_area_ratio": 0.432758, "elapsed_seconds": 7.1961}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000008", "target_total": 10, "target_people": 8, "target_objects": 2, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 10, "n_detected": 10, "n_subjects": 10, "subjects": [{"name": "pedestrian_walking_away", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", "source_name": "passenger", "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath.. Scene role: walking away from the camera on the crosswalk", "measured_bbox": [0.258, 0.308, 0.368, 0.725], "detection_confidence": "high", "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_walking_away.png", "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_walking_away.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"cached": true, "output": "references/ref_pedestrian_walking_away.png", "mask": "references/sam_mask_pedestrian_walking_away.png"}}, {"name": "woman_waiting", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", "source_name": "shopper", "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", "sub_caption": "shopper: A woman wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: standing on the right curb, looking towards the street traffic", "measured_bbox": [0.5514, 0.2961, 0.618, 0.6443], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_woman_waiting.png", "raw_ref_image": "references/raw_ref_woman_waiting_attempt_01.png", "reference_verify": "references/reference_verify_woman_waiting.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"cached": true, "output": "references/ref_woman_waiting.png", "mask": "references/sam_mask_woman_waiting.png"}}, {"name": "pedestrian_standing", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", "source_name": "shopper", "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", "sub_caption": "shopper: A person wearing a blue jacket and blue jeans.. Scene role: standing on the sidewalk near the intersection", "measured_bbox": [0.6289, 0.2934, 0.6784, 0.6567], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_standing.png", "raw_ref_image": "references/raw_ref_pedestrian_standing_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_standing.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"cached": true, "output": "references/ref_pedestrian_standing.png", "mask": "references/sam_mask_pedestrian_standing.png"}}, {"name": "sign_holder", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", "source_name": "protester holding sign in back", "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", "sub_caption": "protester holding sign in back: A person holding up a large white sign.. Scene role: standing on the sidewalk holding a sign near the crosswalk", "measured_bbox": [0.4795, 0.28, 0.5703, 0.5944], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_sign_holder.png", "raw_ref_image": "references/raw_ref_sign_holder_attempt_01.png", "reference_verify": "references/reference_verify_sign_holder.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"cached": true, "output": "references/ref_sign_holder.png", "mask": "references/sam_mask_sign_holder.png"}}, {"name": "pedestrian_crossing_right", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", "source_name": "pedestrian", "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", "sub_caption": "pedestrian: A person in a white top and dark pants.. Scene role: walking across the street from left to right in the crosswalk", "measured_bbox": [0.2006, 0.3034, 0.2705, 0.5763], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_crossing_right.png", "raw_ref_image": "references/raw_ref_pedestrian_crossing_right_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_crossing_right.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"cached": true, "output": "references/ref_pedestrian_crossing_right.png", "mask": "references/sam_mask_pedestrian_crossing_right.png"}}, {"name": "pedestrian_walking_away_sidewalk", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", "source_name": "pedestrian", "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", "sub_caption": "pedestrian: A person wearing a white top and dark pants.. Scene role: walking away from the camera on the distant sidewalk", "measured_bbox": [0.2006, 0.3029, 0.2706, 0.5728], "detection_confidence": 0.9, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_walking_away_sidewalk.png", "raw_ref_image": "references/raw_ref_pedestrian_walking_away_sidewalk_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_walking_away_sidewalk.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"cached": true, "output": "references/ref_pedestrian_walking_away_sidewalk.png", "mask": "references/sam_mask_pedestrian_walking_away_sidewalk.png"}}, {"name": "young_man_waiting", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", "source_name": "young man", "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", "sub_caption": "young man: A young man wearing a dark blue hoodie.. Scene role: waiting on the corner for the pedestrian signal", "measured_bbox": [0.6911, 0.2718, 0.7496, 0.6983], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_young_man_waiting.png", "raw_ref_image": "references/raw_ref_young_man_waiting_attempt_01.png", "reference_verify": "references/reference_verify_young_man_waiting.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"cached": true, "output": "references/ref_young_man_waiting.png", "mask": "references/sam_mask_young_man_waiting.png"}}, {"name": "businessman_waiting", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", "source_name": "adult in dark suit", "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", "sub_caption": "adult in dark suit: An adult wearing a dark suit and tie.. Scene role: standing near the crosswalk amongst the crowd on the curb", "measured_bbox": [0.7491, 0.2604, 0.8005, 0.6281], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_businessman_waiting.png", "raw_ref_image": "references/raw_ref_businessman_waiting_attempt_01.png", "reference_verify": "references/reference_verify_businessman_waiting.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"cached": true, "output": "references/ref_businessman_waiting.png", "mask": "references/sam_mask_businessman_waiting.png"}}, {"name": "street_lamp", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", "source_name": "street lamp", "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", "sub_caption": "street lamp: A tall street lamp pole.. Scene role: standing on the right corner of the intersection, near the curb", "measured_bbox": [0.777, 0.0, 0.859, 0.747], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_lamp.png", "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", "reference_verify": "references/reference_verify_street_lamp.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", "output": "references/ref_street_lamp.png", "mask": "references/sam_mask_street_lamp.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [278.0, 0.0, 808.0, 1023.0], "mask_score": 3.224807, "mask_area_ratio": 0.054686, "elapsed_seconds": 7.0333}}, {"name": "dashboard_reflection", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", "source_name": "vehicle dashboard reflection", "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", "sub_caption": "vehicle dashboard reflection: A faint reflection of an interior dashboard and an object on the windshield.. Scene role: overlaid on the bottom portion of the view, establishing the perspective from inside a car", "measured_bbox": [0.0, 0.616, 0.608, 0.818], "detection_confidence": 0.88, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dashboard_reflection.png", "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_02.png", "reference_verify": "references/reference_verify_dashboard_reflection.json", "reference_verify_passed": true, "reference_attempts": 2, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_02.png", "output": "references/ref_dashboard_reflection.png", "mask": "references/sam_mask_dashboard_reflection.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [28.0, 239.0, 997.0, 784.0], "mask_score": 3.469732, "mask_area_ratio": 0.384025, "elapsed_seconds": 8.6424}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000009", "target_total": 5, "target_people": 1, "target_objects": 4, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 5, "n_detected": 5, "n_subjects": 5, "subjects": [{"name": "pedestrian_in_suit", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", "source_name": "pedestrian in suit", "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking on the sidewalk adjacent to the road, safely behind the metal railing", "measured_bbox": [0.7699, 0.4157, 0.8109, 0.6322], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_in_suit.png", "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_in_suit.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", "output": "references/ref_pedestrian_in_suit.png", "mask": "references/sam_mask_pedestrian_in_suit.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [348.0, 31.0, 680.0, 1016.0], "mask_score": 3.479882, "mask_area_ratio": 0.150441, "elapsed_seconds": 7.1831}}, {"name": "yellow_lane_line", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", "source_name": "yellow lane line", "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: running along the asphalt road in the foreground and midground", "measured_bbox": [0.5116, 0.511, 0.7953, 0.8681], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_yellow_lane_line.png", "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", "reference_verify": "references/reference_verify_yellow_lane_line.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", "output": "references/ref_yellow_lane_line.png", "mask": "references/sam_mask_yellow_lane_line.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 163.0, 1023.0, 845.0], "mask_score": 3.12139, "mask_area_ratio": 0.132687, "elapsed_seconds": 8.5948}}, {"name": "overpass", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", "source_name": "overpass", "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", "sub_caption": "overpass: A concrete bridge structure spanning across the highway ahead, casting a shadow over the road.. Scene role: spanning horizontally across the upper midground of the view", "measured_bbox": [0.0, 0.1444, 1.0, 0.5433], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_overpass.png", "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", "reference_verify": "references/reference_verify_overpass.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", "output": "references/ref_overpass.png", "mask": "references/sam_mask_overpass.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 71.0, 1023.0, 823.0], "mask_score": 2.496995, "mask_area_ratio": 0.373877, "elapsed_seconds": 7.1818}}, {"name": "metal_railing", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_18/282555,101ffe000a8c8717f.jpg:object:11", "source_name": "metal railing", "source_description": "A metal railing visible on the far right edge of the scene. Source dataset: CrowdHuman. Scene context: A group of people, possibly a tour group, is walking outdoors on a paved area near a stone building, with one person holding a green flag and another speaking into a microphone.", "sub_caption": "metal railing: A metal railing visible on the far right edge of the scene.. Scene role: acting as a safety barrier between the pedestrian sidewalk and the road on the right side", "measured_bbox": [0.512, 0.5, 0.999, 0.913], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_metal_railing.png", "raw_ref_image": "references/raw_ref_metal_railing_attempt_01.png", "reference_verify": "references/reference_verify_metal_railing.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_metal_railing_attempt_01.png", "output": "references/ref_metal_railing.png", "mask": "references/sam_mask_metal_railing.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [18.0, 261.0, 1017.0, 934.0], "mask_score": 3.402974, "mask_area_ratio": 0.273856, "elapsed_seconds": 7.2968}}, {"name": "street_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", "source_name": "street light", "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: tall lamp post positioned near the overpass, illuminating the surrounding area", "measured_bbox": [0.51, 0.0165, 0.6141, 0.5516], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_light.png", "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", "reference_verify": "references/reference_verify_street_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", "output": "references/ref_street_light.png", "mask": "references/sam_mask_street_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [176.0, 12.0, 847.0, 1014.0], "mask_score": 3.41464, "mask_area_ratio": 0.029787, "elapsed_seconds": 7.0955}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000010", "target_total": 9, "target_people": 8, "target_objects": 1, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 9, "n_detected": 9, "n_subjects": 9, "subjects": [{"name": "pedestrian_walking_away", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", "source_name": "pedestrian", "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away down the sidewalk on the right", "measured_bbox": [0.8776, 0.2931, 0.9906, 0.7623], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_walking_away.png", "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_walking_away.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_walking_away_attempt_01.png", "output": "references/ref_pedestrian_walking_away.png", "mask": "references/sam_mask_pedestrian_walking_away.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [341.0, 40.0, 683.0, 1002.0], "mask_score": 3.42052, "mask_area_ratio": 0.145487, "elapsed_seconds": 7.0967}}, {"name": "pedestrian_with_backpack", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", "source_name": "pedestrian", "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: crossing the street in front of the SUV", "measured_bbox": [0.2259, 0.2854, 0.322, 0.6604], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_with_backpack.png", "raw_ref_image": "references/raw_ref_pedestrian_with_backpack_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_with_backpack.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_with_backpack_attempt_01.png", "output": "references/ref_pedestrian_with_backpack.png", "mask": "references/sam_mask_pedestrian_with_backpack.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [344.0, 36.0, 682.0, 1012.0], "mask_score": 3.441997, "mask_area_ratio": 0.151945, "elapsed_seconds": 8.5941}}, {"name": "pedestrian_in_red", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", "source_name": "pedestrian standing", "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: standing at the street corner waiting to cross", "measured_bbox": [0.6576, 0.3338, 0.6899, 0.5607], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_in_red.png", "raw_ref_image": "references/raw_ref_pedestrian_in_red_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_in_red.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_red_attempt_01.png", "output": "references/ref_pedestrian_in_red.png", "mask": "references/sam_mask_pedestrian_in_red.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [331.0, 52.0, 699.0, 1007.0], "mask_score": 3.430953, "mask_area_ratio": 0.159512, "elapsed_seconds": 7.0834}}, {"name": "pedestrian_in_striped_shirt", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", "source_name": "pedestrian", "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: walking briskly across the crosswalk", "measured_bbox": [0.433, 0.3315, 0.5713, 0.6823], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_in_striped_shirt.png", "raw_ref_image": "references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_in_striped_shirt.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", "output": "references/ref_pedestrian_in_striped_shirt.png", "mask": "references/sam_mask_pedestrian_in_striped_shirt.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [343.0, 23.0, 676.0, 1011.0], "mask_score": 3.472095, "mask_area_ratio": 0.152217, "elapsed_seconds": 7.27}}, {"name": "man_in_pink_shirt", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", "source_name": "man talking to young man", "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: standing near the curb waiting for a light", "measured_bbox": [0.7524, 0.2743, 0.8106, 0.6687], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_man_in_pink_shirt.png", "raw_ref_image": "references/raw_ref_man_in_pink_shirt_attempt_01.png", "reference_verify": "references/reference_verify_man_in_pink_shirt.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_pink_shirt_attempt_01.png", "output": "references/ref_man_in_pink_shirt.png", "mask": "references/sam_mask_man_in_pink_shirt.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [332.0, 24.0, 708.0, 1000.0], "mask_score": 3.415589, "mask_area_ratio": 0.161095, "elapsed_seconds": 7.2651}}, {"name": "man_in_grey_sweater", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_8/284193,476300039ef5826.jpg:person:3", "source_name": "man", "source_description": "Man wearing a grey sweater. Source dataset: CrowdHuman. Scene context: People are walking through an airport terminal with prominent overhead signage.", "sub_caption": "man: Man wearing a grey sweater.. Scene role: walking towards the camera on the crosswalk", "measured_bbox": [0.3541, 0.2895, 0.4483, 0.7382], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_man_in_grey_sweater.png", "raw_ref_image": "references/raw_ref_man_in_grey_sweater_attempt_01.png", "reference_verify": "references/reference_verify_man_in_grey_sweater.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_grey_sweater_attempt_01.png", "output": "references/ref_man_in_grey_sweater.png", "mask": "references/sam_mask_man_in_grey_sweater.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [344.0, 47.0, 683.0, 1003.0], "mask_score": 3.491882, "mask_area_ratio": 0.143696, "elapsed_seconds": 7.1775}}, {"name": "pedestrian_in_light_jacket", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", "source_name": "pedestrian", "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: walking on the sidewalk in the midground", "measured_bbox": [0.5797, 0.3113, 0.6425, 0.5493], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_in_light_jacket.png", "raw_ref_image": "references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_in_light_jacket.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", "output": "references/ref_pedestrian_in_light_jacket.png", "mask": "references/sam_mask_pedestrian_in_light_jacket.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [313.0, 33.0, 685.0, 1017.0], "mask_score": 3.458198, "mask_area_ratio": 0.174406, "elapsed_seconds": 7.237}}, {"name": "pedestrian_in_light_blue", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", "source_name": "pedestrian", "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: crossing the street away from the camera's view", "measured_bbox": [0.0034, 0.2952, 0.1205, 0.6424], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_in_light_blue.png", "raw_ref_image": "references/raw_ref_pedestrian_in_light_blue_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_in_light_blue.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_blue_attempt_01.png", "output": "references/ref_pedestrian_in_light_blue.png", "mask": "references/sam_mask_pedestrian_in_light_blue.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [341.0, 47.0, 672.0, 989.0], "mask_score": 3.478225, "mask_area_ratio": 0.140584, "elapsed_seconds": 7.065}}, {"name": "black_suv", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", "source_name": "black suv", "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", "sub_caption": "black suv: A black SUV.. Scene role: stopped at the crosswalk yielding to pedestrians", "measured_bbox": [0.0797, 0.2941, 0.5997, 0.5875], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_black_suv.png", "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", "reference_verify": "references/reference_verify_black_suv.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", "output": "references/ref_black_suv.png", "mask": "references/sam_mask_black_suv.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [1.0, 273.0, 1023.0, 701.0], "mask_score": 3.159418, "mask_area_ratio": 0.229866, "elapsed_seconds": 7.2921}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..81475a4c46befd8e86b7c3c010e1f989074cbac2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:097cd068e4799b27434c9bb78ad29d5d4835830eff59f4eb283ecbfa0fb83c9e +size 1849836 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..a46a73777d2f6f680fb0535f0a1f118174ef07fd --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/compose_prompt.txt @@ -0,0 +1,63 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A curbside along a city street on an overcast day", + "activity": "A pedestrian is walking along the edge of the street near a parked dark car and a temporary metal barricade", + "composition": "Medium wide shot, eye-level perspective. The pedestrian is situated in the foreground-left, walking parallel to the curb. A silver metal barrier forms a line separating the walking area from the street, and a dark sedan is parked on the right side of the frame.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "description": "A person wearing a dark coat and trousers, walking confidently.", + "role_in_scene": "Walking along the curbside near the barrier." + } + ], + "objects": [ + { + "name": "parked_dark_car", + "source_index": 1, + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "description": "A dark-colored sedan.", + "role_in_scene": "Parked on the street near the curb in the background right." + }, + { + "name": "metal_barrier", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "description": "A silver metal barricade placed along the street.", + "role_in_scene": "Positioned along the curb to section off the pedestrian area from the road." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_metal_barrier.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_metal_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..257d90bed9be3780b36e4867a18ce67319f6fa1a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_metal_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a72498911b19facb22000398e7f2a701a92b4f9f210076b7aab63e53244b57 +size 270726 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_parked_dark_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_parked_dark_car.png new file mode 100644 index 0000000000000000000000000000000000000000..6b10e9a4f226b4b7cbb85a263cfe7d81aa514841 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_parked_dark_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ac9006ccc530d840fb80a81033ea294c3b3ec6b1bebc76eb1553ea4d3bea12c +size 643624 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..905b1966db12d40970a239b15e7b556243ab9ef7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/detect_refine_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98c08b2c5deea60144452248b248bd8d4e680458f683e3b4522ca3c0eceb3bf +size 108871 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_metal_barrier.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_metal_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..368bb381617cf4ba7bb4e8163579311f32d7554e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_metal_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95f34ce9a314e71405c10a79993fd360a17cb1514fe0959ca27508bc803731f3 +size 324326 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_parked_dark_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_parked_dark_car.png new file mode 100644 index 0000000000000000000000000000000000000000..73319176fadb359a5462c542b6248cfe79826ce8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_parked_dark_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:771731f41dc4d6db87c321477730018716747b629551495827d0572b60d56252 +size 792617 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..ee9939de87f2bcc205dabd1b4bf35876a4d12349 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/crops/diversify_input_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be52573804260ebeb57fc396218903f4a027d4faed9cde2bfa7d3115ae2c074 +size 209129 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..3b19cd466e3d43980a28447e286c6039429c0d4d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/detections.json @@ -0,0 +1,59 @@ +[ + { + "name": "pedestrian", + "present": true, + "bbox": [ + 0.1528, + 0.301, + 0.2511, + 0.7071 + ], + "confidence": 0.98, + "notes": "The closest and most prominent pedestrian matches the description provided perfectly.", + "coarse_bbox": [ + 0.15, + 0.303, + 0.251, + 0.709 + ], + "refine_crop": "crops/detect_refine_pedestrian.png" + }, + { + "name": "parked_dark_car", + "present": true, + "bbox": [ + 0.5163, + 0.3897, + 0.9968, + 0.9244 + ], + "confidence": 0.98, + "notes": "The large dark gray sedan taking up most of the crop matches the description of 'parked dark car'.", + "coarse_bbox": [ + 0.516, + 0.39, + 0.995, + 0.917 + ], + "refine_crop": "crops/detect_refine_parked_dark_car.png" + }, + { + "name": "metal_barrier", + "present": true, + "bbox": [ + 0.3454, + 0.4302, + 0.5465, + 0.8402 + ], + "confidence": 0.98, + "notes": "A prominent silver metal barricade is clearly visible in the foreground, taking up a significant portion of the image. The bounding box tightly encompasses this main barricade.", + "coarse_bbox": [ + 0.339, + 0.432, + 0.556, + 0.839 + ], + "refine_crop": "crops/detect_refine_metal_barrier.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..6276c7d1dbdbed0e0c9e2e0f29ba0f8f675cc649 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1272577e1b327c2ffc706ae0db42696fe1d2b9af2cfd177752d491cd3eec5fb6 +size 1903250 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..fc33e1a6ddfc9e00a94f99e671d851fdea1b7f24 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/plan.json @@ -0,0 +1,107 @@ +{ + "sample_id": "sample_000001", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A curbside along a city street on an overcast day", + "activity": "A pedestrian is walking along the edge of the street near a parked dark car and a temporary metal barricade", + "composition": "Medium wide shot, eye-level perspective. The pedestrian is situated in the foreground-left, walking parallel to the curb. A silver metal barrier forms a line separating the walking area from the street, and a dark sedan is parked on the right side of the frame.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "description": "A person wearing a dark coat and trousers, walking confidently.", + "role_in_scene": "Walking along the curbside near the barrier." + } + ], + "objects": [ + { + "name": "parked_dark_car", + "source_index": 1, + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "description": "A dark-colored sedan.", + "role_in_scene": "Parked on the street near the curb in the background right." + }, + { + "name": "metal_barrier", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "description": "A silver metal barricade placed along the street.", + "role_in_scene": "Positioned along the curb to section off the pedestrian area from the road." + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", + "sub_caption": "pedestrian: A person wearing a dark coat and trousers, walking confidently.. Scene role: Walking along the curbside near the barrier.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "parked_dark_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", + "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the street near the curb in the background right.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "metal_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", + "sub_caption": "metal barrier: A silver metal barricade placed along the street.. Scene role: Positioned along the curb to section off the pedestrian area from the road.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000001/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references.json new file mode 100644 index 0000000000000000000000000000000000000000..5ffc5e33460c6958079d20ddce0bcbf5f12536ad --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references.json @@ -0,0 +1,101 @@ +{ + "references": [ + { + "name": "pedestrian", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 330.0, + 42.0, + 698.0, + 1007.0 + ], + "mask_score": 3.413244, + "mask_area_ratio": 0.159381, + "elapsed_seconds": 33.2771 + }, + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "parked_dark_car", + "ref_image": "references/ref_parked_dark_car.png", + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_01.png", + "diversify_input": "crops/diversify_input_parked_dark_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_01.png", + "output": "references/ref_parked_dark_car.png", + "mask": "references/sam_mask_parked_dark_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 315.0, + 976.0, + 694.0 + ], + "mask_score": 3.4345, + "mask_area_ratio": 0.180014, + "elapsed_seconds": 7.1991 + }, + "reference_verify": "references/reference_verify_parked_dark_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "metal_barrier", + "ref_image": "references/ref_metal_barrier.png", + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "diversify_input": "crops/diversify_input_metal_barrier.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "references/ref_metal_barrier.png", + "mask": "references/sam_mask_metal_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 2.0, + 107.0, + 1009.0, + 986.0 + ], + "mask_score": 1.555076, + "mask_area_ratio": 0.845579, + "elapsed_seconds": 7.2854 + }, + "reference_verify": "references/reference_verify_metal_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_metal_barrier.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_metal_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..e194f6c27915417c2bf1deb357ee4238abc1aaf5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_metal_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc4e592252f9501ef3c05c0b389edb302abe33f6540e65a96048f1763393313c +size 681911 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_parked_dark_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_parked_dark_car.png new file mode 100644 index 0000000000000000000000000000000000000000..3b17561e277dda5a4b6f9d5b54a697f9edc204c9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_parked_dark_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47c26b93b8565164aec5ecbdf267c62ec820f68ebc3f5cb3cacea0b811bb173 +size 365625 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..c40b63165d9c950c9e7876099b389bb98293eaf4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/ref_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b4489c45d746740dad4cb39066eae692f9dd079033217a026fab4e437e57b8 +size 282737 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_metal_barrier.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_metal_barrier.json new file mode 100644 index 0000000000000000000000000000000000000000..714521d8fd1018f84ac197e31148c9215e512ffc --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_metal_barrier.json @@ -0,0 +1,46 @@ +{ + "name": "metal_barrier", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_metal_barrier_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_metal_barrier_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_ref_metal_barrier_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_sam_mask_metal_barrier_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 2.0, + 107.0, + 1009.0, + 986.0 + ], + "mask_score": 1.555076, + "mask_area_ratio": 0.845579, + "elapsed_seconds": 7.2854 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The metal barrier is fully visible and isolated against a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_parked_dark_car.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_parked_dark_car.json new file mode 100644 index 0000000000000000000000000000000000000000..21bc9722db8644073bba814f2e0ba7cef2e07851 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_parked_dark_car.json @@ -0,0 +1,46 @@ +{ + "name": "parked_dark_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_parked_dark_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_parked_dark_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_ref_parked_dark_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_sam_mask_parked_dark_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 315.0, + 976.0, + 694.0 + ], + "mask_score": 3.4345, + "mask_area_ratio": 0.180014, + "elapsed_seconds": 7.1991 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete dark-colored sedan isolated on a white background, which perfectly fits the subject description." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_pedestrian.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_pedestrian.json new file mode 100644 index 0000000000000000000000000000000000000000..b0b137e5bb9f7292bcc9f41692eb8ab1740ca79a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/reference_verify_pedestrian.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_ref_pedestrian_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_sam_mask_pedestrian_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 330.0, + 42.0, + 698.0, + 1007.0 + ], + "mask_score": 3.413244, + "mask_area_ratio": 0.159381, + "elapsed_seconds": 33.2771 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a person on a white background with no cropping. It is a suitable reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_metal_barrier.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_metal_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..ab8e019a669d6fd8104953cea02a5bbdbf966353 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_metal_barrier.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_parked_dark_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_parked_dark_car.png new file mode 100644 index 0000000000000000000000000000000000000000..d95860a94a6ba2db67912da7b621314cd574f310 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_parked_dark_car.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..c1da7cb297d1efb9871011830e8bec0af849bbf9 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/references/sam_mask_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/row.json new file mode 100644 index 0000000000000000000000000000000000000000..2c2e2e43560e8deb051cd990950d1e1a78a551bc --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/row.json @@ -0,0 +1,164 @@ +{ + "sample_id": "sample_000001", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", + "sub_caption": "pedestrian: A person wearing a dark coat and trousers, walking confidently.. Scene role: Walking along the curbside near the barrier.", + "measured_bbox": [ + 0.1528, + 0.301, + 0.2511, + 0.7071 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 330.0, + 42.0, + 698.0, + 1007.0 + ], + "mask_score": 3.413244, + "mask_area_ratio": 0.159381, + "elapsed_seconds": 33.2771 + } + }, + { + "name": "parked_dark_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", + "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the street near the curb in the background right.", + "measured_bbox": [ + 0.5163, + 0.3897, + 0.9968, + 0.9244 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_dark_car.png", + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_01.png", + "reference_verify": "references/reference_verify_parked_dark_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_01.png", + "output": "references/ref_parked_dark_car.png", + "mask": "references/sam_mask_parked_dark_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 315.0, + 976.0, + 694.0 + ], + "mask_score": 3.4345, + "mask_area_ratio": 0.180014, + "elapsed_seconds": 7.1991 + } + }, + { + "name": "metal_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", + "sub_caption": "metal barrier: A silver metal barricade placed along the street.. Scene role: Positioned along the curb to section off the pedestrian area from the road.", + "measured_bbox": [ + 0.3454, + 0.4302, + 0.5465, + 0.8402 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_barrier.png", + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "reference_verify": "references/reference_verify_metal_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "references/ref_metal_barrier.png", + "mask": "references/sam_mask_metal_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 2.0, + 107.0, + 1009.0, + 986.0 + ], + "mask_score": 1.555076, + "mask_area_ratio": 0.845579, + "elapsed_seconds": 7.2854 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..874543dd7b50acc9b90ad0dfcf81578eff46efeb --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000001/vocab_task.json @@ -0,0 +1,56 @@ +{ + "task_id": "sample_000001", + "sample_id": "sample_000001", + "sample_index": 1, + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 13962, + "image_id": "CrowdHuman:data/data_13/282555,65d1d00050480dce.jpg:person:2", + "name": "toddler", + "description": "A young child with short brown hair, wearing a light blue patterned sweater, being carried by the woman in blue. Source dataset: CrowdHuman. Scene context: Two women, one carrying a toddler and the other walking hand-in-hand with a young girl, are crossing a street with parked cars in the background." + }, + { + "candidate_index": 1, + "source_offset": 171812, + "image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "name": "pedestrian", + "description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 129279, + "image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "name": "metal barrier", + "description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows." + }, + { + "candidate_index": 1, + "source_offset": 182609, + "image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "name": "parked dark car", + "description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it." + }, + { + "candidate_index": 2, + "source_offset": 92980, + "image_id": "CrowdHuman:data/data_56/273278,97d7f00040d24761.jpg:object:4", + "name": "paved path", + "description": "A textured paved walkway the group is walking on. Source dataset: CrowdHuman. Scene context: A group of five adults walking together and conversing in a park-like outdoor setting." + }, + { + "candidate_index": 3, + "source_offset": 83796, + "image_id": "CrowdHuman:data/data_51/273278,23a1a000c26da45e.jpg:object:0", + "name": "plastic bag", + "description": "White plastic shopping bag held by the seated woman. Source dataset: CrowdHuman. Scene context: People are standing and sitting inside a brightly lit train or subway car." + } + ], + "rng_seed": 1782032722, + "created_at": 1782292413.1601257 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..82ba97e4b0c28df8ef1f17596b614d91b8fa7e63 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ed83322ed982afd7035973db8d6dd53324aded568663e4804b102e556d84b48 +size 1347836 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb00883e10c32a3aab5d5a0d5b1d818628b411e1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/compose_prompt.txt @@ -0,0 +1,159 @@ +Render the following JSON scene specification as a photorealistic 1152x864 image using a true 4:3 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1152, + 864 + ], + "aspect_ratio": "4:3", + "style": "photorealistic" + }, + "scene": { + "setting": "city street at twilight viewed from inside a moving vehicle", + "activity": "driving down a two-way street with parked and moving cars, while pedestrians walk on the adjacent sidewalk", + "composition": "first-person dashboard perspective, looking down the road, clear depth of field receding to the horizon, road centralized, dashboard anchors the bottom frame", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 4:3 composition", + "final canvas size 1152x864", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_walking", + "source_index": 0, + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "description": "Person walking away from the camera on the right sidewalk, wearing dark clothing.", + "role_in_scene": "walking along the sidewalk on the right side of the street" + }, + { + "name": "shop_pedestrian", + "source_index": 2, + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "description": "Person standing near a shop entrance on the right, partially obscured.", + "role_in_scene": "standing on the sidewalk near the storefronts on the right" + } + ], + "objects": [ + { + "name": "city_buildings", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "description": "Various city buildings of different heights forming the skyline and lining the street.", + "role_in_scene": "framing the street and forming the background skyline" + }, + { + "name": "street_signs", + "source_index": 6, + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "description": "Various blank street signs attached to a pole on the right side of the street.", + "role_in_scene": "mounted on a pole next to the right sidewalk" + }, + { + "name": "storefront_sign", + "source_index": 7, + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "description": "A dark hanging sign framework attached to a building on the right, devoid of readable text.", + "role_in_scene": "hanging above the shop entrance on the right side of the road" + }, + { + "name": "parked_suv_right", + "source_index": 11, + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "description": "Dark-colored SUV parked on the right side of the road.", + "role_in_scene": "parked alongside the right curb" + }, + { + "name": "parked_car_left", + "source_index": 12, + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "description": "A dark car parked along the left curb further ahead.", + "role_in_scene": "parked alongside the left curb" + }, + { + "name": "traveling_dark_suv", + "source_index": 14, + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "description": "A dark SUV traveling in the left lane, with visible red taillights reflecting the twilight.", + "role_in_scene": "driving in the adjacent lane" + }, + { + "name": "street_light", + "source_index": 15, + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "description": "Tall pole with a bright light on top, illuminating the road from the right side.", + "role_in_scene": "providing illumination from the right sidewalk" + }, + { + "name": "vehicle_dashboard", + "source_index": 16, + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground.", + "role_in_scene": "anchoring the bottom of the frame to establish a driver's perspective" + }, + { + "name": "drainage_grate", + "source_index": 17, + "source_image_id": "BDD100K:b4d0e72d-3b208072:object:16", + "source_name": "drainage grate", + "description": "A metal drainage grate on the edge of the road on the right.", + "role_in_scene": "embedded in the road surface near the right curb" + }, + { + "name": "white_car", + "source_index": 20, + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "description": "A white car visible further down the road in the right lane.", + "role_in_scene": "driving ahead in the same lane" + }, + { + "name": "yellow_lines", + "source_index": 22, + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "description": "Double yellow painted lines separating opposite directions of traffic.", + "role_in_scene": "painted down the center of the road" + }, + { + "name": "street_trees", + "source_index": 23, + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "description": "Numerous trees with dense green foliage lining both sides of the road.", + "role_in_scene": "growing along the sidewalks, adding greenery" + }, + { + "name": "twilight_sky", + "source_index": 25, + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top.", + "role_in_scene": "visible above the buildings and trees at the end of the road" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_city_buildings.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_city_buildings.png new file mode 100644 index 0000000000000000000000000000000000000000..5f8c969dd7385c8007a7b9eb007459138bdb9dd3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_city_buildings.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb3c7b1a55117744873f03b55a08c2e042d713752e58bd0b2b92d339165ded5d +size 1027643 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_drainage_grate.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_drainage_grate.png new file mode 100644 index 0000000000000000000000000000000000000000..35ba16bc42069e9c794b14da0166a5cc3cd9a1c5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_drainage_grate.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_parked_car_left.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_parked_car_left.png new file mode 100644 index 0000000000000000000000000000000000000000..23f90355821761db488993f8fab8178e084c9096 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_parked_car_left.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_parked_suv_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_parked_suv_right.png new file mode 100644 index 0000000000000000000000000000000000000000..6d728771fcf39b8383d5bdd763fff669955699f2 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_parked_suv_right.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_pedestrian_walking.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_pedestrian_walking.png new file mode 100644 index 0000000000000000000000000000000000000000..a8f3d3ed512006e093e779f6acfbac096d17d0ff Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_pedestrian_walking.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_shop_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_shop_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..884d77af77f07e8664eb24ef36ded4160d0a07cd Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_shop_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_storefront_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_storefront_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..ac7231e37102cad345217e6c1f1c3c4c52cd8791 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_storefront_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..468ee4aff4e5be30e934ba59408e6be2d50def66 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:855e5c5b6bd08b18b80d48ff2ecee8956b95614dcb94b1fa9b941c41aae54a55 +size 186184 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_signs.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_signs.png new file mode 100644 index 0000000000000000000000000000000000000000..ff693dd527812f3e2f74cb332268317aef886c38 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_signs.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_trees.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..df324eedc3aefff22555508fdcb19db824c02dd9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_street_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8805f457ba5a5d70459bc37a6de7178338e06428d2fb3a0e33f8e1d6e9a01ada +size 625075 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_traveling_dark_suv.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_traveling_dark_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..36da44952b2264ad92d8698621f53e30e564b725 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_traveling_dark_suv.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_twilight_sky.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_twilight_sky.png new file mode 100644 index 0000000000000000000000000000000000000000..883c55b3bebea6824f399782770155dc71371452 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_twilight_sky.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab1ef0c737a5bf5d33b7ef166805c6be5d7c509f72a1f575ea97138dfa71f1c +size 850783 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_vehicle_dashboard.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_vehicle_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..e8e2c6d8b3aab5c75f37217b8602f867bd753dce --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_vehicle_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423b49ffe7f74d77c0fea407f57bf84f64854f6e99b686db69985aed18d7784b +size 225829 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_white_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_white_car.png new file mode 100644 index 0000000000000000000000000000000000000000..54b6787fa5f243610ccd9f9d47877ce4b02c970f Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_white_car.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_yellow_lines.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_yellow_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..87dcb71b8c2ab56003b36f400638239bb3fba754 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/detect_refine_yellow_lines.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_city_buildings.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_city_buildings.png new file mode 100644 index 0000000000000000000000000000000000000000..b0bfa65f2b53507900284c553e1c3425a0e1ec9e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_city_buildings.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae3c41e4ffcf122cd7cd72664b4e14091b293829ec23cc5a46357cbda39b9dc1 +size 1109569 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_drainage_grate.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_drainage_grate.png new file mode 100644 index 0000000000000000000000000000000000000000..c764c7f1001ec7e8be4f97d0f994805a1edc8053 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_drainage_grate.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_parked_car_left.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_parked_car_left.png new file mode 100644 index 0000000000000000000000000000000000000000..a588d7f7148c7aa877fe721ed07e9159172e5618 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_parked_car_left.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_parked_suv_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_parked_suv_right.png new file mode 100644 index 0000000000000000000000000000000000000000..cbdfa4dfb417db6f64dc94b46a7c8449bbabc6ee Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_parked_suv_right.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_pedestrian_walking.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_pedestrian_walking.png new file mode 100644 index 0000000000000000000000000000000000000000..eadbd6d2d468440509bd9b5cfc0ac7bcf6696ea2 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_pedestrian_walking.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_shop_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_shop_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..e8be96d4bca5ca53a350e3b47db53112bf1fbf1e Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_shop_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_storefront_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_storefront_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..8afc8404dc707ba23179ffe9f8dabbf6125edff8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_storefront_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..d634526a53da7d1ed706cd5e36d1bab627e2c962 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a40c86105e28463792dc6e8558ed4ab78ded3bf12c7960d2908f837544e2808c +size 217492 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_signs.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_signs.png new file mode 100644 index 0000000000000000000000000000000000000000..f1623064ee45a3723c521f83a7a79b071f4c98c3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_signs.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_trees.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..cf38c0351f1912bcc6a1a6287f4189f8c0af8f63 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_street_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08c4e45c3b32e122415ca3766236fec9643102f9545a1705fd0de281078d6d2a +size 562666 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_traveling_dark_suv.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_traveling_dark_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..dca604753d8ccffd483bef8593a4cb6923a55482 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_traveling_dark_suv.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_twilight_sky.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_twilight_sky.png new file mode 100644 index 0000000000000000000000000000000000000000..f5c35db818a50259bf9f56cf08580fbd239c2bf1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_twilight_sky.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41ea7a0b8499dbee198a03fe8161fa45bd80bb45c618da9aa7d19fd4aeafe442 +size 817804 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_vehicle_dashboard.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_vehicle_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..be1ff5bbf43f133fdae418f8422668cacd8a7e59 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_vehicle_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee541543c96428eb4f059bd718d39a24798057617497ed9c79aac07b0afd82c1 +size 163642 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_white_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_white_car.png new file mode 100644 index 0000000000000000000000000000000000000000..2fe44873dcfa8e577473da3ef8e56625282a6346 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_white_car.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_yellow_lines.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_yellow_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..2a8d9f41f807a25291c920171cc9a2ae817666fe Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/crops/diversify_input_yellow_lines.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..8b91e38320b49bc3d6d6795baef4fe038c5208c2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/detections.json @@ -0,0 +1,287 @@ +[ + { + "name": "pedestrian_walking", + "present": true, + "bbox": [ + 0.7497, + 0.4757, + 0.7954, + 0.6192 + ], + "confidence": 100, + "notes": "Tight bounding box around the visible pedestrian walking away from the camera, wearing dark clothing.", + "coarse_bbox": [ + 0.747, + 0.476, + 0.794, + 0.616 + ], + "refine_crop": "crops/detect_refine_pedestrian_walking.png" + }, + { + "name": "shop_pedestrian", + "present": true, + "bbox": [ + 0.9337, + 0.4752, + 0.9695, + 0.6107 + ], + "confidence": 0.98, + "notes": "A person standing near a shop entrance.", + "coarse_bbox": [ + 0.934, + 0.474, + 0.968, + 0.608 + ], + "refine_crop": "crops/detect_refine_shop_pedestrian.png" + }, + { + "name": "city_buildings", + "present": true, + "bbox": [ + 0.0, + 0.0, + 1.0, + 0.6084 + ], + "confidence": 0.9, + "notes": "City buildings are visible on both sides of the street and in the distant background, forming a large part of the scene.", + "coarse_bbox": [ + 0.0, + 0.0, + 1.0, + 0.608 + ], + "refine_crop": "crops/detect_refine_city_buildings.png" + }, + { + "name": "street_signs", + "present": true, + "bbox": [ + 0.641, + 0.165, + 0.744, + 0.408 + ], + "confidence": 0.9, + "notes": "refine failed; using coarse bbox", + "coarse_bbox": [ + 0.641, + 0.165, + 0.744, + 0.408 + ], + "refine_crop": "crops/detect_refine_street_signs.png" + }, + { + "name": "storefront_sign", + "present": true, + "bbox": [ + 0.7854, + 0.1934, + 0.9082, + 0.2906 + ], + "confidence": 0.99, + "notes": "A prominent dark hanging sign without readable text matches the description perfectly.", + "coarse_bbox": [ + 0.785, + 0.188, + 0.906, + 0.291 + ], + "refine_crop": "crops/detect_refine_storefront_sign.png" + }, + { + "name": "parked_suv_right", + "present": true, + "bbox": [ + 0.5507, + 0.4879, + 0.6783, + 0.6234 + ], + "confidence": 0.95, + "notes": "The prominent dark-colored SUV parked on the right side of the road is bounded tightly.", + "coarse_bbox": [ + 0.546, + 0.487, + 0.68, + 0.623 + ], + "refine_crop": "crops/detect_refine_parked_suv_right.png" + }, + { + "name": "parked_car_left", + "present": true, + "bbox": [ + 0.0, + 0.5102, + 0.1259, + 0.5998 + ], + "confidence": 0.9, + "notes": "A dark car parked along the left curb.", + "coarse_bbox": [ + 0.001, + 0.509, + 0.124, + 0.6 + ], + "refine_crop": "crops/detect_refine_parked_car_left.png" + }, + { + "name": "traveling_dark_suv", + "present": true, + "bbox": [ + 0.2594, + 0.4853, + 0.417, + 0.6419 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the dark SUV in the image.", + "coarse_bbox": [ + 0.26, + 0.487, + 0.419, + 0.636 + ], + "refine_crop": "crops/detect_refine_traveling_dark_suv.png" + }, + { + "name": "street_light", + "present": true, + "bbox": [ + 0.5577, + 0.0219, + 0.6964, + 0.588 + ], + "confidence": 1.0, + "notes": "Tall pole with bright light on top.", + "coarse_bbox": [ + 0.553, + 0.024, + 0.696, + 0.588 + ], + "refine_crop": "crops/detect_refine_street_light.png" + }, + { + "name": "vehicle_dashboard", + "present": true, + "bbox": [ + 0.0, + 0.8881, + 1.0, + 1.0 + ], + "confidence": 0.9, + "notes": "Tight bounding box capturing the visible dark dashboard interior along the bottom edge.", + "coarse_bbox": [ + 0.0, + 0.842, + 1.0, + 1.0 + ], + "refine_crop": "crops/detect_refine_vehicle_dashboard.png" + }, + { + "name": "drainage_grate", + "present": true, + "bbox": [ + 0.5682, + 0.6773, + 0.8089, + 0.73 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the visible metal drainage grate, including its immediate frame.", + "coarse_bbox": [ + 0.574, + 0.681, + 0.818, + 0.734 + ], + "refine_crop": "crops/detect_refine_drainage_grate.png" + }, + { + "name": "white_car", + "present": true, + "bbox": [ + 0.4356, + 0.5036, + 0.4784, + 0.548 + ], + "confidence": "high", + "notes": "Tight bounding box around the visible white car in the crop.", + "coarse_bbox": [ + 0.435, + 0.5, + 0.478, + 0.546 + ], + "refine_crop": "crops/detect_refine_white_car.png" + }, + { + "name": "yellow_lines", + "present": true, + "bbox": [ + 0.0, + 0.622, + 0.2642, + 0.7692 + ], + "confidence": 0.99, + "notes": "Double yellow lines stretching diagonally across the image.", + "coarse_bbox": [ + 0.0, + 0.623, + 0.263, + 0.772 + ], + "refine_crop": "crops/detect_refine_yellow_lines.png" + }, + { + "name": "street_trees", + "present": true, + "bbox": [ + 0.2664, + 0.0, + 0.7141, + 0.5127 + ], + "confidence": 0.9, + "notes": "Large prominent trees with dense green foliage on both sides of the street.", + "coarse_bbox": [ + 0.004, + 0.0, + 0.717, + 0.466 + ], + "refine_crop": "crops/detect_refine_street_trees.png" + }, + { + "name": "twilight_sky", + "present": true, + "bbox": [ + 0.188, + 0.0, + 0.862, + 0.4846 + ], + "confidence": 0.95, + "notes": "Sky is visible above the buildings and trees at the end of the road.", + "coarse_bbox": [ + 0.0, + 0.0, + 1.0, + 0.495 + ], + "refine_crop": "crops/detect_refine_twilight_sky.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..beae871e453a2e3245ab84440ac767edbd1ad74f --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c44f268998b78003793209bab635b6baff6bab4c22b39be1c63e1f8228d17a3b +size 1490128 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..2f262bc2f17403885f423fe49ac120fc1b504c44 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/plan.json @@ -0,0 +1,335 @@ +{ + "sample_id": "sample_000002", + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1152, + 864 + ], + "aspect_ratio": "4:3", + "style": "photorealistic" + }, + "scene": { + "setting": "city street at twilight viewed from inside a moving vehicle", + "activity": "driving down a two-way street with parked and moving cars, while pedestrians walk on the adjacent sidewalk", + "composition": "first-person dashboard perspective, looking down the road, clear depth of field receding to the horizon, road centralized, dashboard anchors the bottom frame", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 4:3 composition", + "final canvas size 1152x864", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_walking", + "source_index": 0, + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "description": "Person walking away from the camera on the right sidewalk, wearing dark clothing.", + "role_in_scene": "walking along the sidewalk on the right side of the street" + }, + { + "name": "shop_pedestrian", + "source_index": 2, + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "description": "Person standing near a shop entrance on the right, partially obscured.", + "role_in_scene": "standing on the sidewalk near the storefronts on the right" + } + ], + "objects": [ + { + "name": "city_buildings", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "description": "Various city buildings of different heights forming the skyline and lining the street.", + "role_in_scene": "framing the street and forming the background skyline" + }, + { + "name": "street_signs", + "source_index": 6, + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "description": "Various blank street signs attached to a pole on the right side of the street.", + "role_in_scene": "mounted on a pole next to the right sidewalk" + }, + { + "name": "storefront_sign", + "source_index": 7, + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "description": "A dark hanging sign framework attached to a building on the right, devoid of readable text.", + "role_in_scene": "hanging above the shop entrance on the right side of the road" + }, + { + "name": "parked_suv_right", + "source_index": 11, + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "description": "Dark-colored SUV parked on the right side of the road.", + "role_in_scene": "parked alongside the right curb" + }, + { + "name": "parked_car_left", + "source_index": 12, + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "description": "A dark car parked along the left curb further ahead.", + "role_in_scene": "parked alongside the left curb" + }, + { + "name": "traveling_dark_suv", + "source_index": 14, + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "description": "A dark SUV traveling in the left lane, with visible red taillights reflecting the twilight.", + "role_in_scene": "driving in the adjacent lane" + }, + { + "name": "street_light", + "source_index": 15, + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "description": "Tall pole with a bright light on top, illuminating the road from the right side.", + "role_in_scene": "providing illumination from the right sidewalk" + }, + { + "name": "vehicle_dashboard", + "source_index": 16, + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground.", + "role_in_scene": "anchoring the bottom of the frame to establish a driver's perspective" + }, + { + "name": "drainage_grate", + "source_index": 17, + "source_image_id": "BDD100K:b4d0e72d-3b208072:object:16", + "source_name": "drainage grate", + "description": "A metal drainage grate on the edge of the road on the right.", + "role_in_scene": "embedded in the road surface near the right curb" + }, + { + "name": "white_car", + "source_index": 20, + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "description": "A white car visible further down the road in the right lane.", + "role_in_scene": "driving ahead in the same lane" + }, + { + "name": "yellow_lines", + "source_index": 22, + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "description": "Double yellow painted lines separating opposite directions of traffic.", + "role_in_scene": "painted down the center of the road" + }, + { + "name": "street_trees", + "source_index": 23, + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "description": "Numerous trees with dense green foliage lining both sides of the road.", + "role_in_scene": "growing along the sidewalks, adding greenery" + }, + { + "name": "twilight_sky", + "source_index": 25, + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top.", + "role_in_scene": "visible above the buildings and trees at the end of the road" + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian_walking", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", + "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: walking along the sidewalk on the right side of the street", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "shop_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", + "sub_caption": "pedestrian: Person standing near a shop entrance on the right, partially obscured.. Scene role: standing on the sidewalk near the storefronts on the right", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "city_buildings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", + "sub_caption": "building: Various city buildings of different heights forming the skyline and lining the street.. Scene role: framing the street and forming the background skyline", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_signs", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", + "sub_caption": "street signs: Various blank street signs attached to a pole on the right side of the street.. Scene role: mounted on a pole next to the right sidewalk", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "storefront_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", + "sub_caption": "storefront sign: A dark hanging sign framework attached to a building on the right, devoid of readable text.. Scene role: hanging above the shop entrance on the right side of the road", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "parked_suv_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", + "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: parked alongside the right curb", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "parked_car_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", + "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: parked alongside the left curb", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "traveling_dark_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", + "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible red taillights reflecting the twilight.. Scene role: driving in the adjacent lane", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", + "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road from the right side.. Scene role: providing illumination from the right sidewalk", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "vehicle_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", + "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground.. Scene role: anchoring the bottom of the frame to establish a driver's perspective", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "drainage_grate", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b4d0e72d-3b208072:object:16", + "source_name": "drainage grate", + "source_description": "A metal drainage grate on the edge of the road on the right. Source dataset: BDD100K. Scene context: A driving scene on a multi-lane highway with a dark red minivan in the left lane, under a partly cloudy sky.", + "sub_caption": "drainage grate: A metal drainage grate on the edge of the road on the right.. Scene role: embedded in the road surface near the right curb", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "white_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", + "sub_caption": "white car: A white car visible further down the road in the right lane.. Scene role: driving ahead in the same lane", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "yellow_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", + "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: painted down the center of the road", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", + "sub_caption": "trees: Numerous trees with dense green foliage lining both sides of the road.. Scene role: growing along the sidewalks, adding greenery", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "twilight_sky", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", + "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: visible above the buildings and trees at the end of the road", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000002/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references.json new file mode 100644 index 0000000000000000000000000000000000000000..e0a307b1dd5c73491a7f3ca818b985e0a6eb203a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references.json @@ -0,0 +1,485 @@ +{ + "references": [ + { + "name": "pedestrian_walking", + "ref_image": "references/ref_pedestrian_walking.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_walking.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_walking_attempt_01.png", + "output": "references/ref_pedestrian_walking.png", + "mask": "references/sam_mask_pedestrian_walking.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 63.0, + 695.0, + 972.0 + ], + "mask_score": 3.459152, + "mask_area_ratio": 0.145545, + "elapsed_seconds": 8.3331 + }, + "reference_verify": "references/reference_verify_pedestrian_walking.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "shop_pedestrian", + "ref_image": "references/ref_shop_pedestrian.png", + "raw_ref_image": "references/raw_ref_shop_pedestrian_attempt_02.png", + "diversify_input": "crops/diversify_input_shop_pedestrian.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_shop_pedestrian_attempt_02.png", + "output": "references/ref_shop_pedestrian.png", + "mask": "references/sam_mask_shop_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 312.0, + 43.0, + 719.0, + 1020.0 + ], + "mask_score": 3.162079, + "mask_area_ratio": 0.167512, + "elapsed_seconds": 7.2283 + }, + "reference_verify": "references/reference_verify_shop_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 2 + }, + { + "name": "city_buildings", + "ref_image": "references/ref_city_buildings.png", + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "diversify_input": "crops/diversify_input_city_buildings.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "references/ref_city_buildings.png", + "mask": "references/sam_mask_city_buildings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 14.0, + 171.0, + 1009.0, + 883.0 + ], + "mask_score": 3.176814, + "mask_area_ratio": 0.327415, + "elapsed_seconds": 7.146 + }, + "reference_verify": "references/reference_verify_city_buildings.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_signs", + "ref_image": "references/ref_street_signs.png", + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "diversify_input": "crops/diversify_input_street_signs.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "references/ref_street_signs.png", + "mask": "references/sam_mask_street_signs.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 224.0, + 0.0, + 744.0, + 1023.0 + ], + "mask_score": 3.332549, + "mask_area_ratio": 0.190769, + "elapsed_seconds": 7.1886 + }, + "reference_verify": "references/reference_verify_street_signs.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "storefront_sign", + "ref_image": "references/ref_storefront_sign.png", + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "diversify_input": "crops/diversify_input_storefront_sign.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "references/ref_storefront_sign.png", + "mask": "references/sam_mask_storefront_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 46.0, + 0.0, + 1023.0, + 811.0 + ], + "mask_score": 3.296373, + "mask_area_ratio": 0.447847, + "elapsed_seconds": 7.3102 + }, + "reference_verify": "references/reference_verify_storefront_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "parked_suv_right", + "ref_image": "references/ref_parked_suv_right.png", + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "diversify_input": "crops/diversify_input_parked_suv_right.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "references/ref_parked_suv_right.png", + "mask": "references/sam_mask_parked_suv_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 156.0, + 150.0, + 868.0, + 812.0 + ], + "mask_score": 3.463227, + "mask_area_ratio": 0.291222, + "elapsed_seconds": 7.2583 + }, + "reference_verify": "references/reference_verify_parked_suv_right.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "parked_car_left", + "ref_image": "references/ref_parked_car_left.png", + "raw_ref_image": "references/raw_ref_parked_car_left_attempt_01.png", + "diversify_input": "crops/diversify_input_parked_car_left.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_car_left_attempt_01.png", + "output": "references/ref_parked_car_left.png", + "mask": "references/sam_mask_parked_car_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 319.0, + 1023.0, + 695.0 + ], + "mask_score": 3.122119, + "mask_area_ratio": 0.19451, + "elapsed_seconds": 8.5738 + }, + "reference_verify": "references/reference_verify_parked_car_left.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "traveling_dark_suv", + "ref_image": "references/ref_traveling_dark_suv.png", + "raw_ref_image": "references/raw_ref_traveling_dark_suv_attempt_01.png", + "diversify_input": "crops/diversify_input_traveling_dark_suv.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_traveling_dark_suv_attempt_01.png", + "output": "references/ref_traveling_dark_suv.png", + "mask": "references/sam_mask_traveling_dark_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 119.0, + 198.0, + 910.0, + 810.0 + ], + "mask_score": 3.470329, + "mask_area_ratio": 0.300606, + "elapsed_seconds": 8.5072 + }, + "reference_verify": "references/reference_verify_traveling_dark_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_light", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "diversify_input": "crops/diversify_input_street_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 330.0, + 17.0, + 688.0, + 996.0 + ], + "mask_score": 3.395182, + "mask_area_ratio": 0.033435, + "elapsed_seconds": 7.0701 + }, + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "vehicle_dashboard", + "ref_image": "references/ref_vehicle_dashboard.png", + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "diversify_input": "crops/diversify_input_vehicle_dashboard.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "references/ref_vehicle_dashboard.png", + "mask": "references/sam_mask_vehicle_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 223.0, + 1023.0, + 700.0 + ], + "mask_score": 2.938032, + "mask_area_ratio": 0.282133, + "elapsed_seconds": 7.1679 + }, + "reference_verify": "references/reference_verify_vehicle_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "drainage_grate", + "ref_image": "references/ref_drainage_grate.png", + "raw_ref_image": "references/raw_ref_drainage_grate_attempt_01.png", + "diversify_input": "crops/diversify_input_drainage_grate.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_drainage_grate_attempt_01.png", + "output": "references/ref_drainage_grate.png", + "mask": "references/sam_mask_drainage_grate.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 250.0, + 1023.0, + 773.0 + ], + "mask_score": 3.366042, + "mask_area_ratio": 0.379179, + "elapsed_seconds": 8.3171 + }, + "reference_verify": "references/reference_verify_drainage_grate.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "white_car", + "ref_image": "references/ref_white_car.png", + "raw_ref_image": "references/raw_ref_white_car_attempt_01.png", + "diversify_input": "crops/diversify_input_white_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_attempt_01.png", + "output": "references/ref_white_car.png", + "mask": "references/sam_mask_white_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 137.0, + 215.0, + 884.0, + 819.0 + ], + "mask_score": 3.442096, + "mask_area_ratio": 0.295652, + "elapsed_seconds": 7.1564 + }, + "reference_verify": "references/reference_verify_white_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "yellow_lines", + "ref_image": "references/ref_yellow_lines.png", + "raw_ref_image": "references/raw_ref_yellow_lines_attempt_01.png", + "diversify_input": "crops/diversify_input_yellow_lines.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_yellow_lines_attempt_01.png", + "output": "references/ref_yellow_lines.png", + "mask": "references/sam_mask_yellow_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 3.166027, + "mask_area_ratio": 0.242679, + "elapsed_seconds": 7.0941 + }, + "reference_verify": "references/reference_verify_yellow_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_trees", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "diversify_input": "crops/diversify_input_street_trees.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 35.0, + 55.0, + 1002.0, + 1000.0 + ], + "mask_score": 3.226043, + "mask_area_ratio": 0.439437, + "elapsed_seconds": 7.0986 + }, + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "twilight_sky", + "ref_image": "references/ref_twilight_sky.png", + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "diversify_input": "crops/diversify_input_twilight_sky.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "references/ref_twilight_sky.png", + "mask": "references/sam_mask_twilight_sky.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 72.0, + 72.0, + 951.0, + 951.0 + ], + "mask_score": 3.471577, + "mask_area_ratio": 0.631801, + "elapsed_seconds": 7.5016 + }, + "reference_verify": "references/reference_verify_twilight_sky.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_city_buildings.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_city_buildings.png new file mode 100644 index 0000000000000000000000000000000000000000..929a0851bfbe0f21614c3ea3874ef8a0897906d9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_city_buildings.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2a4bf7899a665ca6b5b64962e4597de05ff582010bc8d5a715f7b335ce69a85 +size 748197 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_drainage_grate.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_drainage_grate.png new file mode 100644 index 0000000000000000000000000000000000000000..16c848e349ee4154b44c2b6a337f1dd7f45e7264 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_drainage_grate.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10917bd73b7ae8d538bebe2d54a9b7362029e57908aab07b9941c8d1f66f52b9 +size 699254 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_parked_car_left.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_parked_car_left.png new file mode 100644 index 0000000000000000000000000000000000000000..4a3c363bcf74447dbb99ee847309e5ffd92acebe --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_parked_car_left.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3e44092d9ac0b70fd8b5965ed77af34d1bce52b9c2341fc23354ec16e5d50b +size 350727 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_parked_suv_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_parked_suv_right.png new file mode 100644 index 0000000000000000000000000000000000000000..0b5797665aff391ee310d7a173a49c999fb6891e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_parked_suv_right.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06333f808f6894b9a23e526022f318dbd201b96c93e7150a1a37f9dfdef06d93 +size 488629 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_pedestrian_walking.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_pedestrian_walking.png new file mode 100644 index 0000000000000000000000000000000000000000..e6c757816e863cd3e50387d974673a46899d49c8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_pedestrian_walking.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0efb693d4492ca6a1f1be65435970925bcb2e49aa282ede252acfc6e20c7ece2 +size 257102 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_shop_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_shop_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..bcde27be4ab71bc3777300ca6644b4e7cdf85fc7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_shop_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ccc6d98cc352c7a660bd3af14d656a5dae620e45dec81d1af1f91ac982eb291 +size 333324 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_storefront_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_storefront_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..e4c565b4cad6f1c89f42664655f300077de3f6e7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_storefront_sign.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a16f14e178f8d5fe4b91957c220e7ba73b8c517365444973041035e5897dbb1 +size 822025 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..4a5a7ba017fea379abde5784b879ae7c13c00eca Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_signs.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_signs.png new file mode 100644 index 0000000000000000000000000000000000000000..138d6ec6530d74522e4444a9d11c5d73986fbe13 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_signs.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30c57263fb88e2c1fb740feef6784469457081dc2c83ff91da3477ab44708253 +size 272138 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_trees.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..f461717049e31ca0b926cf9b4199c2612d565244 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_street_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fcb3adffb14887cf4d9ff1b38bd1309540271965714d08b0a84a5d91363363f +size 1207325 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_traveling_dark_suv.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_traveling_dark_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..95b0f2da98521546116fcc027ad7e65a964bf320 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_traveling_dark_suv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519a8e308e9234af7aadc895208980f5d2d7828418045c27bfafc11081e96162 +size 431591 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_twilight_sky.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_twilight_sky.png new file mode 100644 index 0000000000000000000000000000000000000000..47a590acfbf504c704403f6769c473682b32cbbd --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_twilight_sky.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a525ac1c281603cde699aba1a5f36514e099c4e25f9c8dce552599915039c04d +size 779641 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_vehicle_dashboard.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_vehicle_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..5ef33434bd6649ab148618381e32a681502b9d7c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_vehicle_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f730f9e8d3f0f40fad2bae2ef652d94c8fbc3eb27eb9245ead4af57babedb88 +size 620729 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_white_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_white_car.png new file mode 100644 index 0000000000000000000000000000000000000000..04f6c1359c5beb2b9c312f9d2b622e7c5fe68266 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_white_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:184f7d263c28de2de4714d66ebc88b8385ae6f89bd55acc4c18c652fb8fa28a6 +size 473248 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_yellow_lines.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_yellow_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..e367e0db3f9e0c6041e7476385a36abaed3b5ae3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/ref_yellow_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac15a61b9c6e8dd4a77f8ba25ad9a87e86445b8defaa79587de86bddab978238 +size 551919 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_city_buildings.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_city_buildings.json new file mode 100644 index 0000000000000000000000000000000000000000..01bf952850ae484b96c294c8420cdfe45891889b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_city_buildings.json @@ -0,0 +1,46 @@ +{ + "name": "city_buildings", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_city_buildings_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_city_buildings_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_city_buildings_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_city_buildings_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 14.0, + 171.0, + 1009.0, + 883.0 + ], + "mask_score": 3.176814, + "mask_area_ratio": 0.327415, + "elapsed_seconds": 7.146 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The buildings are recognizable and useful as an environmental reference, though there are severe masking artifacts resulting in large white holes in the central buildings." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_drainage_grate.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_drainage_grate.json new file mode 100644 index 0000000000000000000000000000000000000000..0bd75b3c5dd946bc3bb2f4975e96bc436657ba54 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_drainage_grate.json @@ -0,0 +1,46 @@ +{ + "name": "drainage_grate", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_drainage_grate_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_drainage_grate_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_drainage_grate_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_drainage_grate_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_drainage_grate_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_drainage_grate_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 250.0, + 1023.0, + 773.0 + ], + "mask_score": 3.366042, + "mask_area_ratio": 0.379179, + "elapsed_seconds": 8.3171 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The drainage grate is fully visible, isolated on a white background, and not cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_parked_car_left.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_parked_car_left.json new file mode 100644 index 0000000000000000000000000000000000000000..effdb4ee549beeb9d29ddeafeb569d87f9a99afa --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_parked_car_left.json @@ -0,0 +1,46 @@ +{ + "name": "parked_car_left", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_parked_car_left_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_parked_car_left_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_parked_car_left_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_car_left_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_parked_car_left_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_parked_car_left_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 319.0, + 1023.0, + 695.0 + ], + "mask_score": 3.122119, + "mask_area_ratio": 0.19451, + "elapsed_seconds": 8.5738 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a dark blue car entirely visible and centered against a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_parked_suv_right.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_parked_suv_right.json new file mode 100644 index 0000000000000000000000000000000000000000..a3cd84eea9e46730a8e599ae83fbfe35ec843db4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_parked_suv_right.json @@ -0,0 +1,46 @@ +{ + "name": "parked_suv_right", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_parked_suv_right_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_parked_suv_right_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_parked_suv_right_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_parked_suv_right_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 156.0, + 150.0, + 868.0, + 812.0 + ], + "mask_score": 3.463227, + "mask_area_ratio": 0.291222, + "elapsed_seconds": 7.2583 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a dark-colored SUV clearly isolated against a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_pedestrian_walking.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_pedestrian_walking.json new file mode 100644 index 0000000000000000000000000000000000000000..9d5d893cf0315aa888f881d293fc6b3d70b07228 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_pedestrian_walking.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_walking", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_walking_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_walking_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_walking_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_walking_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_pedestrian_walking_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_pedestrian_walking_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 63.0, + 695.0, + 972.0 + ], + "mask_score": 3.459152, + "mask_area_ratio": 0.145545, + "elapsed_seconds": 8.3331 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image successfully shows the full body of a person from head to toe, isolated against a white background with adequate margins." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_shop_pedestrian.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_shop_pedestrian.json new file mode 100644 index 0000000000000000000000000000000000000000..477e5ef380fc7ca4d3be9e66f144da5f536074f5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_shop_pedestrian.json @@ -0,0 +1,87 @@ +{ + "name": "shop_pedestrian", + "passed": true, + "accepted_attempt": 2, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_shop_pedestrian_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_shop_pedestrian_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_shop_pedestrian_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_shop_pedestrian_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_shop_pedestrian_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_shop_pedestrian_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 386.0, + 181.0, + 653.0, + 829.0 + ], + "mask_score": 3.171502, + "mask_area_ratio": 0.058533, + "elapsed_seconds": 7.0432 + }, + "verify": { + "passed": false, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [ + "Face and hands are completely missing or masked out with white pixels" + ], + "notes": "The person is fully within the frame and has a white background, but the face and hands are obscured or entirely deleted by white pixel artifacts/masking, resulting in missing body parts." + } + }, + { + "attempt": 2, + "raw_ref_image": "references/raw_ref_shop_pedestrian_attempt_02.png", + "candidate_ref_image": "references/candidate_ref_shop_pedestrian_attempt_02.png", + "candidate_sam_mask": "references/candidate_sam_mask_shop_pedestrian_attempt_02.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_shop_pedestrian_attempt_02.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_shop_pedestrian_attempt_02.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_shop_pedestrian_attempt_02.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 312.0, + 43.0, + 719.0, + 1020.0 + ], + "mask_score": 3.162079, + "mask_area_ratio": 0.167512, + "elapsed_seconds": 7.2283 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The subject is a person, full body is visible from head to feet, isolated on a white background with sufficient margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_storefront_sign.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_storefront_sign.json new file mode 100644 index 0000000000000000000000000000000000000000..3463aae349c4be8ed4c12b01d58777392aacc24a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_storefront_sign.json @@ -0,0 +1,46 @@ +{ + "name": "storefront_sign", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_storefront_sign_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_storefront_sign_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_storefront_sign_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_storefront_sign_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 46.0, + 0.0, + 1023.0, + 811.0 + ], + "mask_score": 3.296373, + "mask_area_ratio": 0.447847, + "elapsed_seconds": 7.3102 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The storefront sign is fully visible, well-isolated on a white background, and the primary recognizable form is intact. The chain at the top is slightly cropped at the very edge but the sign itself is complete." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_light.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_light.json new file mode 100644 index 0000000000000000000000000000000000000000..f25b9609d45395f87be5f395b8ea5d3b78614ea8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_light.json @@ -0,0 +1,46 @@ +{ + "name": "street_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_street_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_street_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 330.0, + 17.0, + 688.0, + 996.0 + ], + "mask_score": 3.395182, + "mask_area_ratio": 0.033435, + "elapsed_seconds": 7.0701 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The street light is isolated on a white background, visible, complete, and not cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_signs.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_signs.json new file mode 100644 index 0000000000000000000000000000000000000000..31d7f0776089bab38a1b0c5713a0e4987e62dd47 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_signs.json @@ -0,0 +1,46 @@ +{ + "name": "street_signs", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_signs_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_signs_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_street_signs_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_street_signs_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 224.0, + 0.0, + 744.0, + 1023.0 + ], + "mask_score": 3.332549, + "mask_area_ratio": 0.190769, + "elapsed_seconds": 7.1886 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The blank street signs are fully visible and clearly identifiable. The supporting pole is cropped at the top and bottom, which is acceptable for this type of reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_trees.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_trees.json new file mode 100644 index 0000000000000000000000000000000000000000..7d6d230e4e7c333852175643f4d76bd861f11743 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_street_trees.json @@ -0,0 +1,46 @@ +{ + "name": "street_trees", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_trees_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_trees_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_street_trees_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_street_trees_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 35.0, + 55.0, + 1002.0, + 1000.0 + ], + "mask_score": 3.226043, + "mask_area_ratio": 0.439437, + "elapsed_seconds": 7.0986 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The reference image shows a complete tree against a white background, which is an acceptable isolated specimen for 'street_trees' category." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_traveling_dark_suv.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_traveling_dark_suv.json new file mode 100644 index 0000000000000000000000000000000000000000..b572b8422870fd0a66315c2b6462757078b8f9ec --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_traveling_dark_suv.json @@ -0,0 +1,46 @@ +{ + "name": "traveling_dark_suv", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_traveling_dark_suv_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_traveling_dark_suv_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_traveling_dark_suv_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_traveling_dark_suv_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_traveling_dark_suv_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_traveling_dark_suv_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 119.0, + 198.0, + 910.0, + 810.0 + ], + "mask_score": 3.470329, + "mask_area_ratio": 0.300606, + "elapsed_seconds": 8.5072 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete dark SUV from the rear with illuminated taillights on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_twilight_sky.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_twilight_sky.json new file mode 100644 index 0000000000000000000000000000000000000000..54af0db20f81544e60b67368af684aceecc53e0e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_twilight_sky.json @@ -0,0 +1,46 @@ +{ + "name": "twilight_sky", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_twilight_sky_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_twilight_sky_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_twilight_sky_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_twilight_sky_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 72.0, + 72.0, + 951.0, + 951.0 + ], + "mask_score": 3.471577, + "mask_area_ratio": 0.631801, + "elapsed_seconds": 7.5016 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "A square gradient patch representing a twilight sky on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_vehicle_dashboard.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_vehicle_dashboard.json new file mode 100644 index 0000000000000000000000000000000000000000..892dd918f5f3f3d765b9698b5fff67f04b0f145a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_vehicle_dashboard.json @@ -0,0 +1,46 @@ +{ + "name": "vehicle_dashboard", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_vehicle_dashboard_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_vehicle_dashboard_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_vehicle_dashboard_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_vehicle_dashboard_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 223.0, + 1023.0, + 700.0 + ], + "mask_score": 2.938032, + "mask_area_ratio": 0.282133, + "elapsed_seconds": 7.1679 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The dashboard is visible and isolated on a white background. It is cropped on the right edge and has some segmentation artifacts, but remains recognizable and useful as a reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_white_car.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_white_car.json new file mode 100644 index 0000000000000000000000000000000000000000..1ae23d3c1e65e65bfb0dc6feb81e5730cc5247a2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_white_car.json @@ -0,0 +1,46 @@ +{ + "name": "white_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_white_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_white_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_white_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_white_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_white_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 137.0, + 215.0, + 884.0, + 819.0 + ], + "mask_score": 3.442096, + "mask_area_ratio": 0.295652, + "elapsed_seconds": 7.1564 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The white car is clearly visible and isolated on a white background. There is minor cropping at the very bottom of the tires, but the subject remains highly recognizable and useful as a reference, which is acceptable for non-person subjects." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_yellow_lines.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_yellow_lines.json new file mode 100644 index 0000000000000000000000000000000000000000..18f650356ebf47a641e417a020a50fb229404a10 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/reference_verify_yellow_lines.json @@ -0,0 +1,46 @@ +{ + "name": "yellow_lines", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_yellow_lines_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_yellow_lines_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_yellow_lines_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_yellow_lines_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_yellow_lines_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_yellow_lines_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 3.166027, + "mask_area_ratio": 0.242679, + "elapsed_seconds": 7.0941 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Cropped ends are acceptable for a continuous road line." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_city_buildings.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_city_buildings.png new file mode 100644 index 0000000000000000000000000000000000000000..a8704cccc00b1db07323c61a7a1a5ab38d740f22 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_city_buildings.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_drainage_grate.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_drainage_grate.png new file mode 100644 index 0000000000000000000000000000000000000000..03c260e545a3f610eb21197f1033c30f512eba02 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_drainage_grate.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_parked_car_left.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_parked_car_left.png new file mode 100644 index 0000000000000000000000000000000000000000..419fc2833b5161a79a074d70a40934877a4acf5f Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_parked_car_left.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_parked_suv_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_parked_suv_right.png new file mode 100644 index 0000000000000000000000000000000000000000..27a013cc5160bd04b1240156a43eb62eef8d8166 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_parked_suv_right.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_pedestrian_walking.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_pedestrian_walking.png new file mode 100644 index 0000000000000000000000000000000000000000..c6027de26f2fd169d392bf9283d40ddc2e7e8371 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_pedestrian_walking.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_shop_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_shop_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..6ad191a948157b9e7dfa946f9cde43c6c04413e8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_shop_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_storefront_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_storefront_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..8b5389c76202ce1b2ebbd4126e2d2dee3b70d978 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_storefront_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..14f3a5d27f8bcd5de85df490ebc2e295ec3dc081 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_signs.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_signs.png new file mode 100644 index 0000000000000000000000000000000000000000..eb7e7fd9813bb730ebd78bf2e7ee428d497e34f0 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_signs.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_trees.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..9c979828e8fffc8c73625d8497f6780b7fbb9d41 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_street_trees.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_traveling_dark_suv.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_traveling_dark_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..45db5ca410decbf45aaa3ddfa09bdfb3ec6c37f8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_traveling_dark_suv.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_twilight_sky.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_twilight_sky.png new file mode 100644 index 0000000000000000000000000000000000000000..e2d992cf03f2bdb21d462905a1f340707328d92e Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_twilight_sky.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_vehicle_dashboard.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_vehicle_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..17bb747b4d76d7c1208a1a7c36fbb4ceaf4e855e Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_vehicle_dashboard.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_white_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_white_car.png new file mode 100644 index 0000000000000000000000000000000000000000..1458dfc79c068cc7bf46222bca976cef314a8431 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_white_car.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_yellow_lines.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_yellow_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..0b6bedf2c6b09e9e178afe0a20abe078aa9a4849 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/references/sam_mask_yellow_lines.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/row.json new file mode 100644 index 0000000000000000000000000000000000000000..f4f75f049767fd550772ad5e8d26b8661e9fab1b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/row.json @@ -0,0 +1,716 @@ +{ + "sample_id": "sample_000002", + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "canvas_size": [ + 1152, + 864 + ], + "canvas_aspect_ratio": "4:3", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 15, + "n_detected": 15, + "n_subjects": 15, + "subjects": [ + { + "name": "pedestrian_walking", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", + "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: walking along the sidewalk on the right side of the street", + "measured_bbox": [ + 0.7497, + 0.4757, + 0.7954, + 0.6192 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_walking_attempt_01.png", + "output": "references/ref_pedestrian_walking.png", + "mask": "references/sam_mask_pedestrian_walking.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 63.0, + 695.0, + 972.0 + ], + "mask_score": 3.459152, + "mask_area_ratio": 0.145545, + "elapsed_seconds": 8.3331 + } + }, + { + "name": "shop_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", + "sub_caption": "pedestrian: Person standing near a shop entrance on the right, partially obscured.. Scene role: standing on the sidewalk near the storefronts on the right", + "measured_bbox": [ + 0.9337, + 0.4752, + 0.9695, + 0.6107 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shop_pedestrian.png", + "raw_ref_image": "references/raw_ref_shop_pedestrian_attempt_02.png", + "reference_verify": "references/reference_verify_shop_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_shop_pedestrian_attempt_02.png", + "output": "references/ref_shop_pedestrian.png", + "mask": "references/sam_mask_shop_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 312.0, + 43.0, + 719.0, + 1020.0 + ], + "mask_score": 3.162079, + "mask_area_ratio": 0.167512, + "elapsed_seconds": 7.2283 + } + }, + { + "name": "city_buildings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", + "sub_caption": "building: Various city buildings of different heights forming the skyline and lining the street.. Scene role: framing the street and forming the background skyline", + "measured_bbox": [ + 0.0, + 0.0, + 1.0, + 0.6084 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_city_buildings.png", + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "reference_verify": "references/reference_verify_city_buildings.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "references/ref_city_buildings.png", + "mask": "references/sam_mask_city_buildings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 14.0, + 171.0, + 1009.0, + 883.0 + ], + "mask_score": 3.176814, + "mask_area_ratio": 0.327415, + "elapsed_seconds": 7.146 + } + }, + { + "name": "street_signs", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", + "sub_caption": "street signs: Various blank street signs attached to a pole on the right side of the street.. Scene role: mounted on a pole next to the right sidewalk", + "measured_bbox": [ + 0.641, + 0.165, + 0.744, + 0.408 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_signs.png", + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "reference_verify": "references/reference_verify_street_signs.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "references/ref_street_signs.png", + "mask": "references/sam_mask_street_signs.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 224.0, + 0.0, + 744.0, + 1023.0 + ], + "mask_score": 3.332549, + "mask_area_ratio": 0.190769, + "elapsed_seconds": 7.1886 + } + }, + { + "name": "storefront_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", + "sub_caption": "storefront sign: A dark hanging sign framework attached to a building on the right, devoid of readable text.. Scene role: hanging above the shop entrance on the right side of the road", + "measured_bbox": [ + 0.7854, + 0.1934, + 0.9082, + 0.2906 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_storefront_sign.png", + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "reference_verify": "references/reference_verify_storefront_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "references/ref_storefront_sign.png", + "mask": "references/sam_mask_storefront_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 46.0, + 0.0, + 1023.0, + 811.0 + ], + "mask_score": 3.296373, + "mask_area_ratio": 0.447847, + "elapsed_seconds": 7.3102 + } + }, + { + "name": "parked_suv_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", + "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: parked alongside the right curb", + "measured_bbox": [ + 0.5507, + 0.4879, + 0.6783, + 0.6234 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_suv_right.png", + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "reference_verify": "references/reference_verify_parked_suv_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "references/ref_parked_suv_right.png", + "mask": "references/sam_mask_parked_suv_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 156.0, + 150.0, + 868.0, + 812.0 + ], + "mask_score": 3.463227, + "mask_area_ratio": 0.291222, + "elapsed_seconds": 7.2583 + } + }, + { + "name": "parked_car_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", + "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: parked alongside the left curb", + "measured_bbox": [ + 0.0, + 0.5102, + 0.1259, + 0.5998 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_car_left.png", + "raw_ref_image": "references/raw_ref_parked_car_left_attempt_01.png", + "reference_verify": "references/reference_verify_parked_car_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_car_left_attempt_01.png", + "output": "references/ref_parked_car_left.png", + "mask": "references/sam_mask_parked_car_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 319.0, + 1023.0, + 695.0 + ], + "mask_score": 3.122119, + "mask_area_ratio": 0.19451, + "elapsed_seconds": 8.5738 + } + }, + { + "name": "traveling_dark_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", + "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible red taillights reflecting the twilight.. Scene role: driving in the adjacent lane", + "measured_bbox": [ + 0.2594, + 0.4853, + 0.417, + 0.6419 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traveling_dark_suv.png", + "raw_ref_image": "references/raw_ref_traveling_dark_suv_attempt_01.png", + "reference_verify": "references/reference_verify_traveling_dark_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_traveling_dark_suv_attempt_01.png", + "output": "references/ref_traveling_dark_suv.png", + "mask": "references/sam_mask_traveling_dark_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 119.0, + 198.0, + 910.0, + 810.0 + ], + "mask_score": 3.470329, + "mask_area_ratio": 0.300606, + "elapsed_seconds": 8.5072 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", + "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road from the right side.. Scene role: providing illumination from the right sidewalk", + "measured_bbox": [ + 0.5577, + 0.0219, + 0.6964, + 0.588 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 330.0, + 17.0, + 688.0, + 996.0 + ], + "mask_score": 3.395182, + "mask_area_ratio": 0.033435, + "elapsed_seconds": 7.0701 + } + }, + { + "name": "vehicle_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", + "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground.. Scene role: anchoring the bottom of the frame to establish a driver's perspective", + "measured_bbox": [ + 0.0, + 0.8881, + 1.0, + 1.0 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vehicle_dashboard.png", + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "reference_verify": "references/reference_verify_vehicle_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "references/ref_vehicle_dashboard.png", + "mask": "references/sam_mask_vehicle_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 223.0, + 1023.0, + 700.0 + ], + "mask_score": 2.938032, + "mask_area_ratio": 0.282133, + "elapsed_seconds": 7.1679 + } + }, + { + "name": "drainage_grate", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b4d0e72d-3b208072:object:16", + "source_name": "drainage grate", + "source_description": "A metal drainage grate on the edge of the road on the right. Source dataset: BDD100K. Scene context: A driving scene on a multi-lane highway with a dark red minivan in the left lane, under a partly cloudy sky.", + "sub_caption": "drainage grate: A metal drainage grate on the edge of the road on the right.. Scene role: embedded in the road surface near the right curb", + "measured_bbox": [ + 0.5682, + 0.6773, + 0.8089, + 0.73 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_drainage_grate.png", + "raw_ref_image": "references/raw_ref_drainage_grate_attempt_01.png", + "reference_verify": "references/reference_verify_drainage_grate.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_drainage_grate_attempt_01.png", + "output": "references/ref_drainage_grate.png", + "mask": "references/sam_mask_drainage_grate.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 250.0, + 1023.0, + 773.0 + ], + "mask_score": 3.366042, + "mask_area_ratio": 0.379179, + "elapsed_seconds": 8.3171 + } + }, + { + "name": "white_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", + "sub_caption": "white car: A white car visible further down the road in the right lane.. Scene role: driving ahead in the same lane", + "measured_bbox": [ + 0.4356, + 0.5036, + 0.4784, + 0.548 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_car.png", + "raw_ref_image": "references/raw_ref_white_car_attempt_01.png", + "reference_verify": "references/reference_verify_white_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_attempt_01.png", + "output": "references/ref_white_car.png", + "mask": "references/sam_mask_white_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 137.0, + 215.0, + 884.0, + 819.0 + ], + "mask_score": 3.442096, + "mask_area_ratio": 0.295652, + "elapsed_seconds": 7.1564 + } + }, + { + "name": "yellow_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", + "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: painted down the center of the road", + "measured_bbox": [ + 0.0, + 0.622, + 0.2642, + 0.7692 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_lines.png", + "raw_ref_image": "references/raw_ref_yellow_lines_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_yellow_lines_attempt_01.png", + "output": "references/ref_yellow_lines.png", + "mask": "references/sam_mask_yellow_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 3.166027, + "mask_area_ratio": 0.242679, + "elapsed_seconds": 7.0941 + } + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", + "sub_caption": "trees: Numerous trees with dense green foliage lining both sides of the road.. Scene role: growing along the sidewalks, adding greenery", + "measured_bbox": [ + 0.2664, + 0.0, + 0.7141, + 0.5127 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 35.0, + 55.0, + 1002.0, + 1000.0 + ], + "mask_score": 3.226043, + "mask_area_ratio": 0.439437, + "elapsed_seconds": 7.0986 + } + }, + { + "name": "twilight_sky", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", + "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: visible above the buildings and trees at the end of the road", + "measured_bbox": [ + 0.188, + 0.0, + 0.862, + 0.4846 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_twilight_sky.png", + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "reference_verify": "references/reference_verify_twilight_sky.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "references/ref_twilight_sky.png", + "mask": "references/sam_mask_twilight_sky.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 72.0, + 72.0, + 951.0, + 951.0 + ], + "mask_score": 3.471577, + "mask_area_ratio": 0.631801, + "elapsed_seconds": 7.5016 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..4c7ae4dbd464db26c25ccc9c52d94434eb2ffdfc --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000002/vocab_task.json @@ -0,0 +1,224 @@ +{ + "task_id": "sample_000002", + "sample_id": "sample_000002", + "sample_index": 2, + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 200197, + "image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "name": "pedestrian", + "description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain." + }, + { + "candidate_index": 1, + "source_offset": 21685, + "image_id": "CrowdHuman:data/data_16/273278,13c1f3000bd2d0bbe.jpg:person:1", + "name": "shopper", + "description": "Woman holding a bright yellow shopping bag, wearing a white top and grey pants, walking alongside a companion. Source dataset: CrowdHuman. Scene context: A bustling city street at night, lined with brightly lit storefronts and large illuminated signs, filled with pedestrians walking and shopping at street vendor stalls." + }, + { + "candidate_index": 2, + "source_offset": 196602, + "image_id": "BDD100K:b714a088-861a043b:person:2", + "name": "pedestrian", + "description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening" + }, + { + "candidate_index": 3, + "source_offset": 38044, + "image_id": "CrowdHuman:data/data_21/283554,1b06b00011302479.jpg:person:26", + "name": "person standing", + "description": "Person standing behind the right bench, wearing light clothing. Source dataset: CrowdHuman. Scene context: A group of people standing and sitting on benches near a city street, with trees and large buildings in the background." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 97568, + "image_id": "CrowdHuman:data/data_59/273271,1f2b0000eb60e942.jpg:object:0", + "name": "champagne glass", + "description": "A tall, slender glass holding a light-colored bubbly liquid, held by the woman in the black dress. Source dataset: CrowdHuman. Scene context: A group of five adults at a party, three sitting on a couch and two on a confetti-covered floor, holding up champagne glasses in celebration." + }, + { + "candidate_index": 1, + "source_offset": 132505, + "image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "name": "building", + "description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background." + }, + { + "candidate_index": 2, + "source_offset": 75108, + "image_id": "CrowdHuman:data/data_46/283991,1979e0006e725ed3.jpg:object:11", + "name": "bicycle wheel", + "description": "the front wheel of a bicycle visible amidst the crowd Source dataset: CrowdHuman. Scene context: A crowd of people gathers in a city square for a protest or demonstration, with many holding flags and signs." + }, + { + "candidate_index": 3, + "source_offset": 107106, + "image_id": "CrowdHuman:data/data_63/273278,6f9fc00027f3f324.jpg:object:7", + "name": "bag", + "description": "A crumpled white plastic bag lying on the sidewalk near the edge of the street. Source dataset: CrowdHuman. Scene context: People wait at a covered bus stop beside a street with a bus parked behind them, while a large advertisement sign stands prominently in the foreground." + }, + { + "candidate_index": 4, + "source_offset": 89231, + "image_id": "CrowdHuman:data/data_54/273278,126226000f9ec04e2.jpg:object:3", + "name": "backpack", + "description": "A gray backpack with darker straps and details, worn by the pedestrian in the red jacket. Source dataset: CrowdHuman. Scene context: A bustling tree-lined pedestrian street with people walking and browsing souvenir stalls." + }, + { + "candidate_index": 5, + "source_offset": 113102, + "image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", + "name": "pink scooter", + "description": "A prominent pink step-through style motor scooter. Source dataset: CrowdHuman. Scene context: A busy city intersection with many people riding scooters and some cars in the background." + }, + { + "candidate_index": 6, + "source_offset": 151562, + "image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "name": "street signs", + "description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers." + }, + { + "candidate_index": 7, + "source_offset": 60366, + "image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "name": "storefront sign", + "description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic." + }, + { + "candidate_index": 8, + "source_offset": 84198, + "image_id": "CrowdHuman:data/data_51/273278,131e65000ff71f35b.jpg:object:5", + "name": "mannequin", + "description": "Another mannequin torso displaying white lingerie. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly lit with numerous neon signs and storefronts, crowded with pedestrians walking in both directions." + }, + { + "candidate_index": 9, + "source_offset": 41160, + "image_id": "CrowdHuman:data/data_29/283647,1742900042076f47.jpg:object:8", + "name": "car", + "description": "The front portion of a dark-colored car visible in the lower right corner, appearing blurred. Source dataset: CrowdHuman. Scene context: A grand, multi-towered stone building, possibly a cathedral or important civic structure, overlooks a public plaza or square where several pedestrians are walking or congregating around a dark fountain." + }, + { + "candidate_index": 10, + "source_offset": 57282, + "image_id": "CrowdHuman:data/data_38/273278,d172d0001ac1c7d0.jpg:object:7", + "name": "stairs", + "description": "Concrete stairs the students are walking up. Source dataset: CrowdHuman. Scene context: A group of young female students walking up some stairs, some carrying backpacks, folders, or papers, with a textured brown wall in the background." + }, + { + "candidate_index": 11, + "source_offset": 222565, + "image_id": "BDD100K:c54441e6-400c221e:object:4", + "name": "parked SUV", + "description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right." + }, + { + "candidate_index": 12, + "source_offset": 175127, + "image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "name": "car", + "description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk." + }, + { + "candidate_index": 13, + "source_offset": 150969, + "image_id": "BDD100K:b58436bb-5790dfd3:object:5", + "name": "overhead sign 2", + "description": "A second rectangular green highway sign with white text and arrows, mounted next to the first sign. Source dataset: BDD100K. Scene context: View from a moving vehicle driving on a bridge at dusk, with other cars and a yellow taxi ahead, and highway signs overhead." + }, + { + "candidate_index": 14, + "source_offset": 238259, + "image_id": "BDD100K:c889c950-865ca5b6:object:0", + "name": "dark SUV", + "description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights." + }, + { + "candidate_index": 15, + "source_offset": 237241, + "image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "name": "street light", + "description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead." + }, + { + "candidate_index": 16, + "source_offset": 148362, + "image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "name": "dashboard", + "description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings." + }, + { + "candidate_index": 17, + "source_offset": 147478, + "image_id": "BDD100K:b4d0e72d-3b208072:object:16", + "name": "drainage grate", + "description": "A metal drainage grate on the edge of the road on the right. Source dataset: BDD100K. Scene context: A driving scene on a multi-lane highway with a dark red minivan in the left lane, under a partly cloudy sky." + }, + { + "candidate_index": 18, + "source_offset": 47380, + "image_id": "CrowdHuman:data/data_32/283992,1008000116a704e.jpg:object:0", + "name": "police car", + "description": "The front portion of a white police vehicle on the left, with bright blue flashing lights on its roof. Source dataset: CrowdHuman. Scene context: A tense street scene with a police officer and plainclothes armed men moving urgently through a cordoned-off area, with an ambulance in the background." + }, + { + "candidate_index": 19, + "source_offset": 46976, + "image_id": "CrowdHuman:data/data_32/282555,34e4900001d063c5.jpg:object:1", + "name": "curved canopy structure", + "description": "An arched structural framework with beams crossing diagonally, visible above the fence. Source dataset: CrowdHuman. Scene context: A group of eight people pose for a photo at night on a walkway enclosed by wire mesh fencing under a curved structural canopy." + }, + { + "candidate_index": 20, + "source_offset": 142061, + "image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "name": "white car", + "description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right." + }, + { + "candidate_index": 21, + "source_offset": 33363, + "image_id": "CrowdHuman:data/data_26/273271,1ee5700005ba3c28.jpg:object:6", + "name": "fan", + "description": "A black oscillating fan standing in the background. Source dataset: CrowdHuman. Scene context: A group of young adults are gathered in a dimly lit room, many of them sitting at computers and appearing to be engaged in a LAN party or gaming event." + }, + { + "candidate_index": 22, + "source_offset": 218458, + "image_id": "BDD100K:c417a291-7802692d:object:8", + "name": "yellow lines", + "description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background." + }, + { + "candidate_index": 23, + "source_offset": 218847, + "image_id": "BDD100K:c4891df0-24371ae1:object:3", + "name": "trees", + "description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky." + }, + { + "candidate_index": 24, + "source_offset": 126497, + "image_id": "CrowdHuman:data/data_72/283647,12bd000f875dc20.jpg:object:0", + "name": "billboard", + "description": "a large, bright sign on a building on the left side of the street Source dataset: CrowdHuman. Scene context: A wide city street is filled with numerous pedestrians walking, with tall buildings adorned with vibrant billboards lining the sides and a prominent green overpass structure in the distance." + }, + { + "candidate_index": 25, + "source_offset": 203204, + "image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "name": "sky", + "description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky." + } + ], + "rng_seed": 1782137451, + "created_at": 1782292413.308104 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..fe78fd932e2332275caa0200f33bd9184d364584 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46fc2ad9be7b98cba75fcd507c322311c70ec1f06ff5f0d5bbe9873cb4ef14c5 +size 1614013 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..b68ccf86b0a5d9de57f2c25282824cc49ee7ba60 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/compose_prompt.txt @@ -0,0 +1,63 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban intersection with a marked crosswalk on a multi-lane city street during daytime.", + "activity": "A pedestrian stands at the edge of the crosswalk waiting for traffic to clear, while a black sedan and a silver car are driving side-by-side toward the intersection.", + "composition": "Wide-angle street-level view looking across the crosswalk; the pedestrian is positioned on the left side of the frame in the foreground, while the black sedan and silver car approach from the midground on the right, providing clear depth and spatial relationships.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "waiting_pedestrian", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "description": "A pedestrian wearing a dark top and dark pants, standing upright with a natural posture.", + "role_in_scene": "Waiting at the curb near the crosswalk on the left side of the street." + } + ], + "objects": [ + { + "name": "black_sedan", + "source_index": 1, + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "description": "A glossy black sedan with visible headlights and a detailed front grille.", + "role_in_scene": "Approaching the crosswalk in the center traffic lane." + }, + { + "name": "silver_car", + "source_index": 3, + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "description": "A metallic silver car reflecting daylight.", + "role_in_scene": "Driving in the right lane slightly ahead of the black sedan, approaching the intersection." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_black_sedan.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_black_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..997a96efd5239cdc9a6b0add0ba34f7db32823ad --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_black_sedan.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a49c71021b7572595ec6c552ba06032011d1da5121df1b9ec1381d6d757742 +size 128307 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_silver_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..8d8f32aa24a09922df0d95a5d8d997d07cb11c25 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1764c2731f6b82788c7f3ca280351b11ea9cd52ecdb8ddd1a5e0ce63bd44b724 +size 148938 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_waiting_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_waiting_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..8250f6884850ceb8d98630c92c970d7452c03f04 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/detect_refine_waiting_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98ea56d9a74ac857c3da85c9c645fff97208902cba02c0455f4df0443bf8a9a6 +size 198924 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_black_sedan.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_black_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..c9e6dcc186340d1c99f7a5da97e54f9ae36b78bf --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_black_sedan.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c49f6f0817a8008ab97a834204df97ac29d3573e37f08cc3fe24b770af8eb03 +size 163768 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_silver_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..477f2c8d38576e07e2d2af36abfdc5b18ce40b30 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62cc4404e461334f992f3831db8efb49375bf6c357403fa49aabde45def0dd2f +size 189887 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_waiting_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_waiting_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..d0a71131118d01562cae1ff18d223e3dcb36da4b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/crops/diversify_input_waiting_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c71340e5c50ec716989b3107fc0adc4ba5f5793ff60c7ecce5e2533f31100c5e +size 292958 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..5ffbddc4b5ae96e6cc71182e903e25b9b57e44bd --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/detections.json @@ -0,0 +1,59 @@ +[ + { + "name": "waiting_pedestrian", + "present": true, + "bbox": [ + 0.0928, + 0.1174, + 0.205, + 0.9401 + ], + "confidence": 0.95, + "notes": "A tight box around the pedestrian in a dark coat and dark pants standing at the curb.", + "coarse_bbox": [ + 0.091, + 0.115, + 0.205, + 0.938 + ], + "refine_crop": "crops/detect_refine_waiting_pedestrian.png" + }, + { + "name": "black_sedan", + "present": true, + "bbox": [ + 0.3895, + 0.2431, + 0.591, + 0.5084 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the main black sedan in the foreground.", + "coarse_bbox": [ + 0.388, + 0.245, + 0.592, + 0.508 + ], + "refine_crop": "crops/detect_refine_black_sedan.png" + }, + { + "name": "silver_car", + "present": true, + "bbox": [ + 0.6628, + 0.2419, + 0.9089, + 0.4999 + ], + "confidence": 0.99, + "notes": "The large silver car in the foreground is clearly visible and occupies most of the crop.", + "coarse_bbox": [ + 0.661, + 0.24, + 0.91, + 0.493 + ], + "refine_crop": "crops/detect_refine_silver_car.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..f8d63cf0cb8c8a1a76e141ac8e9bec295a02258b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0222e03abfb2a3cd72e4cfb4acb4240582db7f86e5505c27a22a3712d179ea4c +size 1651186 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..5a6e723c4cb58384ad29af1a82b52978d53f3e1c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/plan.json @@ -0,0 +1,107 @@ +{ + "sample_id": "sample_000003", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban intersection with a marked crosswalk on a multi-lane city street during daytime.", + "activity": "A pedestrian stands at the edge of the crosswalk waiting for traffic to clear, while a black sedan and a silver car are driving side-by-side toward the intersection.", + "composition": "Wide-angle street-level view looking across the crosswalk; the pedestrian is positioned on the left side of the frame in the foreground, while the black sedan and silver car approach from the midground on the right, providing clear depth and spatial relationships.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "waiting_pedestrian", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "description": "A pedestrian wearing a dark top and dark pants, standing upright with a natural posture.", + "role_in_scene": "Waiting at the curb near the crosswalk on the left side of the street." + } + ], + "objects": [ + { + "name": "black_sedan", + "source_index": 1, + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "description": "A glossy black sedan with visible headlights and a detailed front grille.", + "role_in_scene": "Approaching the crosswalk in the center traffic lane." + }, + { + "name": "silver_car", + "source_index": 3, + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "description": "A metallic silver car reflecting daylight.", + "role_in_scene": "Driving in the right lane slightly ahead of the black sedan, approaching the intersection." + } + ] + }, + "expected_subjects": [ + { + "name": "waiting_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", + "sub_caption": "shopper: A pedestrian wearing a dark top and dark pants, standing upright with a natural posture.. Scene role: Waiting at the curb near the crosswalk on the left side of the street.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "black_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", + "sub_caption": "black sedan: A glossy black sedan with visible headlights and a detailed front grille.. Scene role: Approaching the crosswalk in the center traffic lane.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", + "sub_caption": "silver car: A metallic silver car reflecting daylight.. Scene role: Driving in the right lane slightly ahead of the black sedan, approaching the intersection.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000003/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references.json new file mode 100644 index 0000000000000000000000000000000000000000..320357adf012d612f009ee78cdbd2cae18c004db --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references.json @@ -0,0 +1,101 @@ +{ + "references": [ + { + "name": "waiting_pedestrian", + "ref_image": "references/ref_waiting_pedestrian.png", + "raw_ref_image": "references/raw_ref_waiting_pedestrian_attempt_01.png", + "diversify_input": "crops/diversify_input_waiting_pedestrian.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_waiting_pedestrian_attempt_01.png", + "output": "references/ref_waiting_pedestrian.png", + "mask": "references/sam_mask_waiting_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 48.0, + 704.0, + 1015.0 + ], + "mask_score": 3.427649, + "mask_area_ratio": 0.155239, + "elapsed_seconds": 6.9951 + }, + "reference_verify": "references/reference_verify_waiting_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "black_sedan", + "ref_image": "references/ref_black_sedan.png", + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "diversify_input": "crops/diversify_input_black_sedan.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "references/ref_black_sedan.png", + "mask": "references/sam_mask_black_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 221.0, + 1023.0, + 796.0 + ], + "mask_score": 3.446312, + "mask_area_ratio": 0.340465, + "elapsed_seconds": 7.2258 + }, + "reference_verify": "references/reference_verify_black_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "silver_car", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "diversify_input": "crops/diversify_input_silver_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 220.0, + 1011.0, + 811.0 + ], + "mask_score": 3.077144, + "mask_area_ratio": 0.338042, + "elapsed_seconds": 7.0902 + }, + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_black_sedan.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_black_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..139f819af97d6888b9ae0989aee42cbad9b1e45f --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_black_sedan.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0582b0aca901b6524d05d1139c390e9f2510d52cd9580c96094254684a7c2ce8 +size 598130 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_silver_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..69a58e82ebe997ff6f5990601cfbd4ef615b3819 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beaf5e9dabfb88abb6569a9eccbde1e819ececa41da72972a3c2fef7245b6bfb +size 590437 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_waiting_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_waiting_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..4ba11caaa52598a5bbf4468d493a270e8f323c0e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/ref_waiting_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:952bf402e8cadd502bc1a34152fd535d4e439e7b09d8ab75a4760c37916c6b9c +size 285547 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_black_sedan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_black_sedan.json new file mode 100644 index 0000000000000000000000000000000000000000..b95a9123a43572a8b42d71c80f497c0ac158b2e9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_black_sedan.json @@ -0,0 +1,46 @@ +{ + "name": "black_sedan", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_black_sedan_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_black_sedan_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_ref_black_sedan_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_sam_mask_black_sedan_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 221.0, + 1023.0, + 796.0 + ], + "mask_score": 3.446312, + "mask_area_ratio": 0.340465, + "elapsed_seconds": 7.2258 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image is a well-isolated reference of a black sedan." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_silver_car.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_silver_car.json new file mode 100644 index 0000000000000000000000000000000000000000..868eba2525c19ba5b3487c44fd977d563fd767a4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_silver_car.json @@ -0,0 +1,46 @@ +{ + "name": "silver_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_silver_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_ref_silver_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 220.0, + 1011.0, + 811.0 + ], + "mask_score": 3.077144, + "mask_area_ratio": 0.338042, + "elapsed_seconds": 7.0902 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "A complete silver car is isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_waiting_pedestrian.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_waiting_pedestrian.json new file mode 100644 index 0000000000000000000000000000000000000000..bf3ec0f021a8b052b631c077c51dd64bdd80c661 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/reference_verify_waiting_pedestrian.json @@ -0,0 +1,46 @@ +{ + "name": "waiting_pedestrian", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_waiting_pedestrian_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_waiting_pedestrian_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_waiting_pedestrian_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_waiting_pedestrian_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_ref_waiting_pedestrian_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_sam_mask_waiting_pedestrian_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 48.0, + 704.0, + 1015.0 + ], + "mask_score": 3.427649, + "mask_area_ratio": 0.155239, + "elapsed_seconds": 6.9951 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image meets all requirements for a person reference. The subject is fully visible from head to toe, isolated on a white background, and centered with sufficient margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_black_sedan.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_black_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..081a86d8e70b153f06eba0ed3eeea3c53b0032ba Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_black_sedan.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_silver_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..c7ee251d9eada30d28484b85ed9d6570a398f24d Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_silver_car.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_waiting_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_waiting_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..6dbaa8b3b609bb9fc5265cac30f87aa08b043857 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/references/sam_mask_waiting_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/row.json new file mode 100644 index 0000000000000000000000000000000000000000..fe38e599efba54ddfc66405680cf3248e5dcb0fb --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/row.json @@ -0,0 +1,164 @@ +{ + "sample_id": "sample_000003", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "waiting_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", + "sub_caption": "shopper: A pedestrian wearing a dark top and dark pants, standing upright with a natural posture.. Scene role: Waiting at the curb near the crosswalk on the left side of the street.", + "measured_bbox": [ + 0.0928, + 0.1174, + 0.205, + 0.9401 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_waiting_pedestrian.png", + "raw_ref_image": "references/raw_ref_waiting_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_waiting_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_waiting_pedestrian_attempt_01.png", + "output": "references/ref_waiting_pedestrian.png", + "mask": "references/sam_mask_waiting_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 48.0, + 704.0, + 1015.0 + ], + "mask_score": 3.427649, + "mask_area_ratio": 0.155239, + "elapsed_seconds": 6.9951 + } + }, + { + "name": "black_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", + "sub_caption": "black sedan: A glossy black sedan with visible headlights and a detailed front grille.. Scene role: Approaching the crosswalk in the center traffic lane.", + "measured_bbox": [ + 0.3895, + 0.2431, + 0.591, + 0.5084 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_sedan.png", + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "reference_verify": "references/reference_verify_black_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "references/ref_black_sedan.png", + "mask": "references/sam_mask_black_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 221.0, + 1023.0, + 796.0 + ], + "mask_score": 3.446312, + "mask_area_ratio": 0.340465, + "elapsed_seconds": 7.2258 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", + "sub_caption": "silver car: A metallic silver car reflecting daylight.. Scene role: Driving in the right lane slightly ahead of the black sedan, approaching the intersection.", + "measured_bbox": [ + 0.6628, + 0.2419, + 0.9089, + 0.4999 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 220.0, + 1011.0, + 811.0 + ], + "mask_score": 3.077144, + "mask_area_ratio": 0.338042, + "elapsed_seconds": 7.0902 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..b25407d18ebbbbfa824efbbba38c7e31afba7e79 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000003/vocab_task.json @@ -0,0 +1,56 @@ +{ + "task_id": "sample_000003", + "sample_id": "sample_000003", + "sample_index": 3, + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 11490, + "image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "name": "shopper", + "description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping." + }, + { + "candidate_index": 1, + "source_offset": 10082, + "image_id": "CrowdHuman:data/data_12/273275,a967f0002c2986ab.jpg:person:10", + "name": "uniformed attendee", + "description": "Person in a dark military-style uniform with a beret, standing near the seated man. Source dataset: CrowdHuman. Scene context: A group of people, including flag bearers in uniform, are gathered outdoors on a dirt path next to a stone building and trees for a ceremony." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 69825, + "image_id": "CrowdHuman:data/data_43/273278,b8637000bf6cd3e6.jpg:object:3", + "name": "sunglasses", + "description": "Dark-lensed sunglasses worn by a fan in the front right. Source dataset: CrowdHuman. Scene context: A large crowd of enthusiastic fans in a stadium bleacher section, many wearing matching blue apparel and holding signs, cheering animatedly." + }, + { + "candidate_index": 1, + "source_offset": 202163, + "image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "name": "black sedan", + "description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side." + }, + { + "candidate_index": 2, + "source_offset": 190614, + "image_id": "BDD100K:be6b4502-e0c95034:object:4", + "name": "distant vehicles", + "description": "Several indistinct vehicles with headlights and taillights visible further down the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle approaching an intersection with a crosswalk and green traffic lights, with several cars ahead." + }, + { + "candidate_index": 3, + "source_offset": 190043, + "image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "name": "silver car", + "description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky." + } + ], + "rng_seed": 1782242180, + "created_at": 1782292413.3249526 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..f845665050012352d09fdb1a8d5f55bbcfc29364 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d9c2c68b64618ef69de84abe84cefad790182c5bbc04a1eee26a1e56965c0a +size 1650549 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..a69acf58c7f77c7cd91dacb7c52b4fc7ec81f645 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/compose_prompt.txt @@ -0,0 +1,79 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A daytime city street intersection viewed from the perspective of an approaching vehicle.", + "activity": "A man is waiting at the crosswalk as the traffic light turns red, while a delivery truck and a dark car are parked along the curbside.", + "composition": "First-person driving view from the center lane. The painted street lines lead the eye toward the intersection. The red traffic light is suspended prominently near the top center. The dark car is parked on the right curb in the mid-ground, with the delivery truck parked further down. The pedestrian stands at the right-side crosswalk.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "walker", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "description": "A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.", + "role_in_scene": "Standing at the edge of the sidewalk near the crosswalk, waiting to cross the street." + } + ], + "objects": [ + { + "name": "traffic_light", + "source_index": 0, + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "description": "A set of traffic lights suspended over the intersection, showing a red light.", + "role_in_scene": "Hanging high above the intersection in the upper-center of the frame." + }, + { + "name": "delivery_truck", + "source_index": 2, + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "description": "A large, plain white box delivery truck.", + "role_in_scene": "Parked alongside the right curb in the background, past the intersection." + }, + { + "name": "dark_parked_car", + "source_index": 3, + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "description": "A dark-colored sedan.", + "role_in_scene": "Parked on the right side of the street near the sidewalk in the mid-ground." + }, + { + "name": "street_lines", + "source_index": 4, + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "description": "Double yellow lines separating traffic directions and crisp white painted lines indicating lanes and a crosswalk.", + "role_in_scene": "Painted on the asphalt road surface, extending from the foreground toward the intersection." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_dark_parked_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..a78397fb94403104d10b0ad9d8334bd4a079b805 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_dark_parked_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52be9139ca889d45ad431b8dc3e54fd230918354f303e9658dd8bf344bdf411c +size 133125 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_delivery_truck.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_delivery_truck.png new file mode 100644 index 0000000000000000000000000000000000000000..e86c568ca48f831deb1f0f131c9f1c8f0127f9a1 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_delivery_truck.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_street_lines.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_street_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..aae305cc1d068e7ac31463a62e2b37752849a297 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_street_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce28fa202c9211a81693b1200a11136418b6b3b4f98b23366fe03e2feaf565fb +size 1036842 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_traffic_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..42ca59b46e5a0c79eef6c440e82d841b96f75f32 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_walker.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_walker.png new file mode 100644 index 0000000000000000000000000000000000000000..7e0547fee5bcaa633471bdb9db2cd69633597919 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/detect_refine_walker.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_dark_parked_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..a365c8bafecd9278106b84f8a13dda9d6dbfe769 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_dark_parked_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:980c370a3e3a87ae9ae2d716771914ea3500634bef1d38d696a4426cd0a222ee +size 163957 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_delivery_truck.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_delivery_truck.png new file mode 100644 index 0000000000000000000000000000000000000000..cfd6c78148931c05c21cffbdc3194742724d754d Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_delivery_truck.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_street_lines.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_street_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..8a2cbf08621f3df9a57935a9e50fb6af3606415c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_street_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ab537facecf3692386cb752d7e204efc246080e49e53a2c9b4f83c5e9e3ba3 +size 1139256 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_traffic_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..96a5f0dd502e271ec4c070d90ba6431093dfa45b Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_walker.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_walker.png new file mode 100644 index 0000000000000000000000000000000000000000..7cb692c328e566c65a2eff84d4882a1fdef35c44 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/crops/diversify_input_walker.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..2324609e93406ab7ddcd206a4455eea83c63f617 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/detections.json @@ -0,0 +1,97 @@ +[ + { + "name": "walker", + "present": true, + "bbox": [ + 0.7914, + 0.2893, + 0.834, + 0.4815 + ], + "confidence": 1.0, + "notes": "The man wearing a blue hoodie and grey pants is clearly visible.", + "coarse_bbox": [ + 0.791, + 0.291, + 0.833, + 0.485 + ], + "refine_crop": "crops/detect_refine_walker.png" + }, + { + "name": "traffic_light", + "present": true, + "bbox": [ + 0.4425, + 0.023, + 0.467, + 0.1052 + ], + "confidence": 0.99, + "notes": "A traffic light showing a red light.", + "coarse_bbox": [ + 0.442, + 0.024, + 0.467, + 0.105 + ], + "refine_crop": "crops/detect_refine_traffic_light.png" + }, + { + "name": "delivery_truck", + "present": true, + "bbox": [ + 0.576, + 0.1929, + 0.7135, + 0.4081 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the white box delivery truck.", + "coarse_bbox": [ + 0.574, + 0.191, + 0.714, + 0.407 + ], + "refine_crop": "crops/detect_refine_delivery_truck.png" + }, + { + "name": "dark_parked_car", + "present": true, + "bbox": [ + 0.8414, + 0.3717, + 0.9967, + 0.7454 + ], + "confidence": 0.98, + "notes": "A dark-colored sedan parked on the street.", + "coarse_bbox": [ + 0.841, + 0.375, + 0.995, + 0.734 + ], + "refine_crop": "crops/detect_refine_dark_parked_car.png" + }, + { + "name": "street_lines", + "present": true, + "bbox": [ + 0.003, + 0.3541, + 0.915, + 0.8612 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the painted street lines visible on the road surface, including double yellow lines, lane markings, and crosswalk lines.", + "coarse_bbox": [ + 0.0, + 0.347, + 0.917, + 0.865 + ], + "refine_crop": "crops/detect_refine_street_lines.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..8630b48f5833dbe2e9a1755e10f9cb3fa07c412f --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9622102c5ee0bb900fd330bfc7785f21a7c344c877ce8996532d858e02d32216 +size 1704806 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..0ba743934439da19eae659f3ae1b789eca463c69 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/plan.json @@ -0,0 +1,145 @@ +{ + "sample_id": "sample_000004", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A daytime city street intersection viewed from the perspective of an approaching vehicle.", + "activity": "A man is waiting at the crosswalk as the traffic light turns red, while a delivery truck and a dark car are parked along the curbside.", + "composition": "First-person driving view from the center lane. The painted street lines lead the eye toward the intersection. The red traffic light is suspended prominently near the top center. The dark car is parked on the right curb in the mid-ground, with the delivery truck parked further down. The pedestrian stands at the right-side crosswalk.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "walker", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "description": "A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.", + "role_in_scene": "Standing at the edge of the sidewalk near the crosswalk, waiting to cross the street." + } + ], + "objects": [ + { + "name": "traffic_light", + "source_index": 0, + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "description": "A set of traffic lights suspended over the intersection, showing a red light.", + "role_in_scene": "Hanging high above the intersection in the upper-center of the frame." + }, + { + "name": "delivery_truck", + "source_index": 2, + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "description": "A large, plain white box delivery truck.", + "role_in_scene": "Parked alongside the right curb in the background, past the intersection." + }, + { + "name": "dark_parked_car", + "source_index": 3, + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "description": "A dark-colored sedan.", + "role_in_scene": "Parked on the right side of the street near the sidewalk in the mid-ground." + }, + { + "name": "street_lines", + "source_index": 4, + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "description": "Double yellow lines separating traffic directions and crisp white painted lines indicating lanes and a crosswalk.", + "role_in_scene": "Painted on the asphalt road surface, extending from the foreground toward the intersection." + } + ] + }, + "expected_subjects": [ + { + "name": "walker", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", + "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Standing at the edge of the sidewalk near the crosswalk, waiting to cross the street.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", + "sub_caption": "traffic light: A set of traffic lights suspended over the intersection, showing a red light.. Scene role: Hanging high above the intersection in the upper-center of the frame.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "delivery_truck", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", + "sub_caption": "delivery truck: A large, plain white box delivery truck.. Scene role: Parked alongside the right curb in the background, past the intersection.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_parked_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", + "sub_caption": "dark parked car: A dark-colored sedan.. Scene role: Parked on the right side of the street near the sidewalk in the mid-ground.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", + "sub_caption": "street lines: Double yellow lines separating traffic directions and crisp white painted lines indicating lanes and a crosswalk.. Scene role: Painted on the asphalt road surface, extending from the foreground toward the intersection.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000004/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references.json new file mode 100644 index 0000000000000000000000000000000000000000..e11dd6da93f9325c4e931c0309c61cd2acd4f5c2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references.json @@ -0,0 +1,165 @@ +{ + "references": [ + { + "name": "walker", + "ref_image": "references/ref_walker.png", + "raw_ref_image": "references/raw_ref_walker_attempt_01.png", + "diversify_input": "crops/diversify_input_walker.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_walker_attempt_01.png", + "output": "references/ref_walker.png", + "mask": "references/sam_mask_walker.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 51.0, + 688.0, + 1005.0 + ], + "mask_score": 3.433924, + "mask_area_ratio": 0.16005, + "elapsed_seconds": 7.2846 + }, + "reference_verify": "references/reference_verify_walker.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "traffic_light", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "diversify_input": "crops/diversify_input_traffic_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 160.0, + 93.0, + 864.0, + 930.0 + ], + "mask_score": 3.437579, + "mask_area_ratio": 0.253583, + "elapsed_seconds": 7.0663 + }, + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "delivery_truck", + "ref_image": "references/ref_delivery_truck.png", + "raw_ref_image": "references/raw_ref_delivery_truck_attempt_01.png", + "diversify_input": "crops/diversify_input_delivery_truck.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_delivery_truck_attempt_01.png", + "output": "references/ref_delivery_truck.png", + "mask": "references/sam_mask_delivery_truck.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 95.0, + 100.0, + 910.0, + 932.0 + ], + "mask_score": 3.445823, + "mask_area_ratio": 0.476913, + "elapsed_seconds": 7.1923 + }, + "reference_verify": "references/reference_verify_delivery_truck.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dark_parked_car", + "ref_image": "references/ref_dark_parked_car.png", + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "diversify_input": "crops/diversify_input_dark_parked_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "references/ref_dark_parked_car.png", + "mask": "references/sam_mask_dark_parked_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 19.0, + 336.0, + 1003.0, + 700.0 + ], + "mask_score": 3.408233, + "mask_area_ratio": 0.181406, + "elapsed_seconds": 8.4178 + }, + "reference_verify": "references/reference_verify_dark_parked_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_lines", + "ref_image": "references/ref_street_lines.png", + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "diversify_input": "crops/diversify_input_street_lines.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "references/ref_street_lines.png", + "mask": "references/sam_mask_street_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 38.0, + 225.0, + 985.0, + 799.0 + ], + "mask_score": 3.287982, + "mask_area_ratio": 0.400985, + "elapsed_seconds": 7.2613 + }, + "reference_verify": "references/reference_verify_street_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_dark_parked_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..3a8c2d6be0e0eda8b3fa7fc736ff8c32a0197f56 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_dark_parked_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc767419f8313b4c5b6df5bdffcca2bd9a95a12d0d32d263438bf43f98c28ab +size 330462 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_delivery_truck.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_delivery_truck.png new file mode 100644 index 0000000000000000000000000000000000000000..ce5c15a91a30ba9c952ea941efca10f1fa29adc1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_delivery_truck.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:429b26b6757a8ed4e5bacfc7046e7e15524fc4a61a74e1c7ce6270af9103a76f +size 738310 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_street_lines.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_street_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..1a95531b6054c36cf31dc367a622510d499362fd --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_street_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b816a3d471ae8e5a14fdc83d11e4435df93f43517185d362409a4dc5e37084 +size 1039172 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_traffic_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..b2515d597e2999c9b0fbbcc4da37d928aaabe366 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_traffic_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713269190075dd90fdabc697a06c67c43a94ac210e40458e64752aec976c1209 +size 550048 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_walker.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_walker.png new file mode 100644 index 0000000000000000000000000000000000000000..4ec995422688f63ae3e214016dfbc8ccb6a4695b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/ref_walker.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ed9c97b556fad32685fadbeb1ae0f6b25b440854e60988e195079a81410d7eb +size 292965 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_dark_parked_car.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_dark_parked_car.json new file mode 100644 index 0000000000000000000000000000000000000000..d8e4280c73188a75f67ce16c9743cc71ec45f467 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_dark_parked_car.json @@ -0,0 +1,46 @@ +{ + "name": "dark_parked_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dark_parked_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dark_parked_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_dark_parked_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_dark_parked_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 19.0, + 336.0, + 1003.0, + 700.0 + ], + "mask_score": 3.408233, + "mask_area_ratio": 0.181406, + "elapsed_seconds": 8.4178 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a dark sedan entirely visible, not cropped, and placed cleanly on a white background. It satisfies all hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_delivery_truck.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_delivery_truck.json new file mode 100644 index 0000000000000000000000000000000000000000..7f8b10b1d8957e6559a38afdbe77234c01437b21 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_delivery_truck.json @@ -0,0 +1,46 @@ +{ + "name": "delivery_truck", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_delivery_truck_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_delivery_truck_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_delivery_truck_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_delivery_truck_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_delivery_truck_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_delivery_truck_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 95.0, + 100.0, + 910.0, + 932.0 + ], + "mask_score": 3.445823, + "mask_area_ratio": 0.476913, + "elapsed_seconds": 7.1923 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated delivery truck against a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_street_lines.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_street_lines.json new file mode 100644 index 0000000000000000000000000000000000000000..8757976316b6751a4c40f21a5e41e9394e533d48 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_street_lines.json @@ -0,0 +1,46 @@ +{ + "name": "street_lines", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_lines_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_lines_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_street_lines_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_street_lines_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 38.0, + 225.0, + 985.0, + 799.0 + ], + "mask_score": 3.287982, + "mask_area_ratio": 0.400985, + "elapsed_seconds": 7.2613 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Image shows a representative crop of street lines on asphalt against a white background. This is acceptable for a surface pattern or environmental feature subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_traffic_light.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_traffic_light.json new file mode 100644 index 0000000000000000000000000000000000000000..73758da3270714266a795e181e62968016bda586 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_traffic_light.json @@ -0,0 +1,46 @@ +{ + "name": "traffic_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_traffic_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_traffic_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_traffic_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_traffic_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_traffic_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 160.0, + 93.0, + 864.0, + 930.0 + ], + "mask_score": 3.437579, + "mask_area_ratio": 0.253583, + "elapsed_seconds": 7.0663 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The traffic light is clearly visible, complete, and isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_walker.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_walker.json new file mode 100644 index 0000000000000000000000000000000000000000..1297173d82e5ae456f682483c9f9cdde87bbc2b9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/reference_verify_walker.json @@ -0,0 +1,46 @@ +{ + "name": "walker", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_walker_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_walker_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_walker_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_walker_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_walker_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_walker_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 51.0, + 688.0, + 1005.0 + ], + "mask_score": 3.433924, + "mask_area_ratio": 0.16005, + "elapsed_seconds": 7.2846 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete full-body view of the person on a white background with ample margins, satisfying all hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_dark_parked_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..d08f1bbe8a28fce955a1fb0af73b7c8c24d854a5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_dark_parked_car.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_delivery_truck.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_delivery_truck.png new file mode 100644 index 0000000000000000000000000000000000000000..7f299ebad546f45560fda26ddc93362514f79f09 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_delivery_truck.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_street_lines.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_street_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..2d72ab15d8d504378260e986fb44281789cb503e Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_street_lines.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_traffic_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..14dc06d2f505639dc6e6c5d1eee29dc43c63b898 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_walker.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_walker.png new file mode 100644 index 0000000000000000000000000000000000000000..95b94e553c0fdca5df9ebd9fd8b3ce491afb2c14 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/references/sam_mask_walker.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/row.json new file mode 100644 index 0000000000000000000000000000000000000000..29b239b2a485e526de0882f089edd5c5bb5becdc --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/row.json @@ -0,0 +1,256 @@ +{ + "sample_id": "sample_000004", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "walker", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", + "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Standing at the edge of the sidewalk near the crosswalk, waiting to cross the street.", + "measured_bbox": [ + 0.7914, + 0.2893, + 0.834, + 0.4815 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_walker.png", + "raw_ref_image": "references/raw_ref_walker_attempt_01.png", + "reference_verify": "references/reference_verify_walker.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_walker_attempt_01.png", + "output": "references/ref_walker.png", + "mask": "references/sam_mask_walker.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 51.0, + 688.0, + 1005.0 + ], + "mask_score": 3.433924, + "mask_area_ratio": 0.16005, + "elapsed_seconds": 7.2846 + } + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", + "sub_caption": "traffic light: A set of traffic lights suspended over the intersection, showing a red light.. Scene role: Hanging high above the intersection in the upper-center of the frame.", + "measured_bbox": [ + 0.4425, + 0.023, + 0.467, + 0.1052 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 160.0, + 93.0, + 864.0, + 930.0 + ], + "mask_score": 3.437579, + "mask_area_ratio": 0.253583, + "elapsed_seconds": 7.0663 + } + }, + { + "name": "delivery_truck", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", + "sub_caption": "delivery truck: A large, plain white box delivery truck.. Scene role: Parked alongside the right curb in the background, past the intersection.", + "measured_bbox": [ + 0.576, + 0.1929, + 0.7135, + 0.4081 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_delivery_truck.png", + "raw_ref_image": "references/raw_ref_delivery_truck_attempt_01.png", + "reference_verify": "references/reference_verify_delivery_truck.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_delivery_truck_attempt_01.png", + "output": "references/ref_delivery_truck.png", + "mask": "references/sam_mask_delivery_truck.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 95.0, + 100.0, + 910.0, + 932.0 + ], + "mask_score": 3.445823, + "mask_area_ratio": 0.476913, + "elapsed_seconds": 7.1923 + } + }, + { + "name": "dark_parked_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", + "sub_caption": "dark parked car: A dark-colored sedan.. Scene role: Parked on the right side of the street near the sidewalk in the mid-ground.", + "measured_bbox": [ + 0.8414, + 0.3717, + 0.9967, + 0.7454 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_parked_car.png", + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "reference_verify": "references/reference_verify_dark_parked_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "references/ref_dark_parked_car.png", + "mask": "references/sam_mask_dark_parked_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 19.0, + 336.0, + 1003.0, + 700.0 + ], + "mask_score": 3.408233, + "mask_area_ratio": 0.181406, + "elapsed_seconds": 8.4178 + } + }, + { + "name": "street_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", + "sub_caption": "street lines: Double yellow lines separating traffic directions and crisp white painted lines indicating lanes and a crosswalk.. Scene role: Painted on the asphalt road surface, extending from the foreground toward the intersection.", + "measured_bbox": [ + 0.003, + 0.3541, + 0.915, + 0.8612 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lines.png", + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "reference_verify": "references/reference_verify_street_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "references/ref_street_lines.png", + "mask": "references/sam_mask_street_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 38.0, + 225.0, + 985.0, + 799.0 + ], + "mask_score": 3.287982, + "mask_area_ratio": 0.400985, + "elapsed_seconds": 7.2613 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..348d157f28acd91ee26a791b1f5889a07ab4feca --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000004/vocab_task.json @@ -0,0 +1,84 @@ +{ + "task_id": "sample_000004", + "sample_id": "sample_000004", + "sample_index": 4, + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 122464, + "image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "name": "walker", + "description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background." + }, + { + "candidate_index": 1, + "source_offset": 38483, + "image_id": "CrowdHuman:data/data_21/283554,2385f000b2018513.jpg:person:44", + "name": "pedestrian", + "description": "Walking in the background, mostly obscured. Source dataset: CrowdHuman. Scene context: A crowded pedestrian area with various shops and a large KFC billboard." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 156913, + "image_id": "BDD100K:b6df605f-51c158b8:object:6", + "name": "traffic light", + "description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk." + }, + { + "candidate_index": 1, + "source_offset": 181614, + "image_id": "BDD100K:bc692855-7c087cf6:object:5", + "name": "turn only sign", + "description": "A white rectangular sign with a black curved arrow indicating 'ONLY' for the lane. Source dataset: BDD100K. Scene context: Nighttime driving scene on a multi-lane city street with several cars and illuminated buildings." + }, + { + "candidate_index": 2, + "source_offset": 141997, + "image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "name": "delivery truck", + "description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky." + }, + { + "candidate_index": 3, + "source_offset": 186080, + "image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "name": "dark parked car", + "description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left." + }, + { + "candidate_index": 4, + "source_offset": 200940, + "image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "name": "street lines", + "description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure." + }, + { + "candidate_index": 5, + "source_offset": 225017, + "image_id": "BDD100K:c5b2506d-6e15c4c3:object:8", + "name": "traffic light", + "description": "A traffic signal visible far ahead at an intersection. Source dataset: BDD100K. Scene context: A view from a car driving down a multi-lane city street flanked by tall buildings, trees, and other vehicles." + }, + { + "candidate_index": 6, + "source_offset": 148655, + "image_id": "BDD100K:b5172858-da5e71cc:object:5", + "name": "red advertisement signs", + "description": "Two rectangular red signs with white text, positioned on the sidewalk near the storefront entrance. Source dataset: BDD100K. Scene context: A nighttime city street scene with cars parked and driving, and a lit storefront on the right where people are standing." + }, + { + "candidate_index": 7, + "source_offset": 238418, + "image_id": "BDD100K:c80cf60a-8bb33a63:object:10", + "name": "street light pole", + "description": "tall metal pole for street lighting along the highway Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a multi-lane highway, with a flatbed tow truck carrying a street sweeper prominently visible on the right." + } + ], + "rng_seed": 1782346909, + "created_at": 1782292413.3386934 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..ec67ebf71ea4801664bff2ac95cd8bc8e20339a3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f566b9fe6ab2483e5406e62a240024ad0fcd3fe9a6557339f01ae2d31c03633 +size 1447337 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69ec08de9e536a452c5e004006b93fd14452191 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/compose_prompt.txt @@ -0,0 +1,87 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban intersection with active traffic management and a temporary lane closure due to an incident.", + "activity": "A silver car is stopped at the intersection near a concrete roadside barrier while a firefighter and a uniformed officer assess the situation and direct traffic. A person in a suit stands nearby, observing the scene.", + "composition": "Eye-level street perspective with a moderate depth of field. The silver car is centered in the midground with brake lights on. The continuous concrete barrier runs along the right side, guiding the eye into the scene. The traffic light hangs clearly in the upper frame. The firefighter and uniformed officer stand in the left-to-center foreground, while the person in the suit stands safely behind the barrier on the right.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "firefighter", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_2/282555,e0af90003451118a.jpg:person:8", + "source_name": "firefighter", + "description": "Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away.", + "role_in_scene": "Assisting with incident management, positioned near the stopped car and barrier." + }, + { + "name": "uniformed_officer", + "source_index": 4, + "source_image_id": "CrowdHuman:data/data_1/273275,f68c20007e0bf148.jpg:person:3", + "source_name": "uniformed officer", + "description": "Wearing a khaki uniform and helmet, holding a baton, looking towards the left.", + "role_in_scene": "Directing surrounding traffic away from the stopped vehicle using a baton." + }, + { + "name": "bystander_in_suit", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "description": "A person wearing a suit.", + "role_in_scene": "Standing off to the right side of the road behind the barrier, acting as the driver or an involved pedestrian." + } + ], + "objects": [ + { + "name": "traffic_light", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "description": "A black multi-lens traffic light fixture mounted on a pole above the street.", + "role_in_scene": "Hanging overhead or mounted prominently on a pole at the intersection." + }, + { + "name": "concrete_barrier", + "source_index": 3, + "source_image_id": "BDD100K:c946c532-07177e0a:object:11", + "source_name": "concrete barrier", + "description": "A continuous low concrete wall acting as a barrier on the right side of the road.", + "role_in_scene": "Lining the right side of the street, separating the pedestrian walkway or construction zone from the active traffic lane." + }, + { + "name": "silver_car", + "source_index": 5, + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "description": "A silver compact hatchback car facing forward, waiting at an intersection with illuminated brake lights.", + "role_in_scene": "Stopped in the active lane near the barrier, serving as the focal point of the traffic response." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_bystander_in_suit.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_bystander_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..effdadf89ae5acb98bc5ca533c84f6b944c05d03 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_bystander_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_concrete_barrier.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_concrete_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..31a40074e7087292ca9f6a07d24ce0c47690da73 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_concrete_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4fe7bda496410a49ed1442f5079e9098c7cbc74790484569fb4eccfa3958d01 +size 176123 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_firefighter.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_firefighter.png new file mode 100644 index 0000000000000000000000000000000000000000..8b9a713aaa2f2331a7a8653fd0c53ebc22dc3685 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_firefighter.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_silver_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..8093f5dc816ed814c09b5dbfb6ce3da7aad098d2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a518640aae2d85021a9459c68ef70caccff9a8856e1f65e3091e0ec263a0325 +size 191042 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_traffic_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..4d8e09402f4b33c2054a01a5e43da45c92d0187f Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_uniformed_officer.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_uniformed_officer.png new file mode 100644 index 0000000000000000000000000000000000000000..c1bf76e7cf0d7a10189a70ca0888f9774b620d63 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/detect_refine_uniformed_officer.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_bystander_in_suit.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_bystander_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..fce33ebc9c0c921ea334815527e09b0ccf15fd0e Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_bystander_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_concrete_barrier.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_concrete_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..bc6c5d0c0ce265c41bc66f5ecec04f3fc0d62dc0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_concrete_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a45dcd1a93ac7688240f9d1c918093f982afaac3634c96fe1f3bb2fe786e67 +size 200686 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_firefighter.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_firefighter.png new file mode 100644 index 0000000000000000000000000000000000000000..0d04ed6da8c1f352e2b012900f227629cf76167c Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_firefighter.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_silver_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..0c4c45555818f41b85a4adb74051d5416cdffdda --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:add2b1404bcef604e5fb386843383d6c28fe9341516b1bd5a1f7bff7b5f3349e +size 262764 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_traffic_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..e0d9008ef258299e3552474f40d01bf6e0fb67a3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_uniformed_officer.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_uniformed_officer.png new file mode 100644 index 0000000000000000000000000000000000000000..468a9a8def04170141d656fc710ef713c99bf1e3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/crops/diversify_input_uniformed_officer.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f20851fbafcd6794317c6d8c5239e65d65c07684238c061353c8c974c73f9043 +size 132608 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..1eaf9cb477a48b5182c9711caa31349bbead4c31 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/detections.json @@ -0,0 +1,116 @@ +[ + { + "name": "firefighter", + "present": true, + "bbox": [ + 0.2626, + 0.3463, + 0.3289, + 0.6561 + ], + "confidence": 0.99, + "notes": "The firefighter is clearly visible in the center of the crop, matching the provided description.", + "coarse_bbox": [ + 0.262, + 0.348, + 0.33, + 0.654 + ], + "refine_crop": "crops/detect_refine_firefighter.png" + }, + { + "name": "uniformed_officer", + "present": true, + "bbox": [ + 0.0497, + 0.3566, + 0.1691, + 0.6118 + ], + "confidence": 0.99, + "notes": "The uniformed officer is clearly visible directing traffic.", + "coarse_bbox": [ + 0.049, + 0.354, + 0.17, + 0.605 + ], + "refine_crop": "crops/detect_refine_uniformed_officer.png" + }, + { + "name": "bystander_in_suit", + "present": true, + "bbox": [ + 0.7467, + 0.3318, + 0.8036, + 0.5111 + ], + "confidence": 0.98, + "notes": "A person in a suit is visible in the crop.", + "coarse_bbox": [ + 0.746, + 0.326, + 0.804, + 0.509 + ], + "refine_crop": "crops/detect_refine_bystander_in_suit.png" + }, + { + "name": "traffic_light", + "present": true, + "bbox": [ + 0.5381, + 0.0316, + 0.5856, + 0.2076 + ], + "confidence": 0.99, + "notes": "A prominent black traffic light with a red light illuminated, occupying most of the frame.", + "coarse_bbox": [ + 0.538, + 0.03, + 0.589, + 0.211 + ], + "refine_crop": "crops/detect_refine_traffic_light.png" + }, + { + "name": "concrete_barrier", + "present": true, + "bbox": [ + 0.6322, + 0.4972, + 0.9964, + 0.6985 + ], + "confidence": 0.99, + "notes": "A continuous low concrete wall acting as a barrier on the right side of the road.", + "coarse_bbox": [ + 0.632, + 0.493, + 0.999, + 0.704 + ], + "refine_crop": "crops/detect_refine_concrete_barrier.png" + }, + { + "name": "silver_car", + "present": true, + "bbox": [ + 0.3396, + 0.3754, + 0.6399, + 0.6647 + ], + "confidence": 0.99, + "notes": "The silver hatchback car is prominently visible occupying the majority of the crop.", + "coarse_bbox": [ + 0.339, + 0.391, + 0.64, + 0.657 + ], + "refine_crop": "crops/detect_refine_silver_car.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..a2c5ca7014c39a802862267a6d2b32a19f815b3a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5babf44e6b10ef2a7f83975729523f948f5fdee9979f7cb112b350eff3984444 +size 1489800 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..2435af8a184b3819b5ab2825f9dc4ca39791e02c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/plan.json @@ -0,0 +1,164 @@ +{ + "sample_id": "sample_000005", + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban intersection with active traffic management and a temporary lane closure due to an incident.", + "activity": "A silver car is stopped at the intersection near a concrete roadside barrier while a firefighter and a uniformed officer assess the situation and direct traffic. A person in a suit stands nearby, observing the scene.", + "composition": "Eye-level street perspective with a moderate depth of field. The silver car is centered in the midground with brake lights on. The continuous concrete barrier runs along the right side, guiding the eye into the scene. The traffic light hangs clearly in the upper frame. The firefighter and uniformed officer stand in the left-to-center foreground, while the person in the suit stands safely behind the barrier on the right.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "firefighter", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_2/282555,e0af90003451118a.jpg:person:8", + "source_name": "firefighter", + "description": "Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away.", + "role_in_scene": "Assisting with incident management, positioned near the stopped car and barrier." + }, + { + "name": "uniformed_officer", + "source_index": 4, + "source_image_id": "CrowdHuman:data/data_1/273275,f68c20007e0bf148.jpg:person:3", + "source_name": "uniformed officer", + "description": "Wearing a khaki uniform and helmet, holding a baton, looking towards the left.", + "role_in_scene": "Directing surrounding traffic away from the stopped vehicle using a baton." + }, + { + "name": "bystander_in_suit", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "description": "A person wearing a suit.", + "role_in_scene": "Standing off to the right side of the road behind the barrier, acting as the driver or an involved pedestrian." + } + ], + "objects": [ + { + "name": "traffic_light", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "description": "A black multi-lens traffic light fixture mounted on a pole above the street.", + "role_in_scene": "Hanging overhead or mounted prominently on a pole at the intersection." + }, + { + "name": "concrete_barrier", + "source_index": 3, + "source_image_id": "BDD100K:c946c532-07177e0a:object:11", + "source_name": "concrete barrier", + "description": "A continuous low concrete wall acting as a barrier on the right side of the road.", + "role_in_scene": "Lining the right side of the street, separating the pedestrian walkway or construction zone from the active traffic lane." + }, + { + "name": "silver_car", + "source_index": 5, + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "description": "A silver compact hatchback car facing forward, waiting at an intersection with illuminated brake lights.", + "role_in_scene": "Stopped in the active lane near the barrier, serving as the focal point of the traffic response." + } + ] + }, + "expected_subjects": [ + { + "name": "firefighter", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_2/282555,e0af90003451118a.jpg:person:8", + "source_name": "firefighter", + "source_description": "Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away near the fire truck. Source dataset: CrowdHuman. Scene context: Emergency response personnel, including firefighters and ambulance crew, are gathered outside a large classical building with pillars and banners, accompanied by emergency vehicles.", + "sub_caption": "firefighter: Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away.. Scene role: Assisting with incident management, positioned near the stopped car and barrier.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "uniformed_officer", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,f68c20007e0bf148.jpg:person:3", + "source_name": "uniformed officer", + "source_description": "wearing a khaki uniform and helmet, holding a baton, looking towards the left Source dataset: CrowdHuman. Scene context: A large crowd of people, including some in uniform with batons and helmets, stands in front of a red and yellow building.", + "sub_caption": "uniformed officer: Wearing a khaki uniform and helmet, holding a baton, looking towards the left.. Scene role: Directing surrounding traffic away from the stopped vehicle using a baton.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "bystander_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", + "sub_caption": "crowd member: A person wearing a suit.. Scene role: Standing off to the right side of the road behind the barrier, acting as the driver or an involved pedestrian.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", + "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Hanging overhead or mounted prominently on a pole at the intersection.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "concrete_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c946c532-07177e0a:object:11", + "source_name": "concrete barrier", + "source_description": "A continuous low concrete wall acting as a barrier on the right side of the road. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a multi-lane highway during the day, with construction or industrial sites visible alongside.", + "sub_caption": "concrete barrier: A continuous low concrete wall acting as a barrier on the right side of the road.. Scene role: Lining the right side of the street, separating the pedestrian walkway or construction zone from the active traffic lane.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", + "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection with illuminated brake lights.. Scene role: Stopped in the active lane near the barrier, serving as the focal point of the traffic response.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000005/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references.json new file mode 100644 index 0000000000000000000000000000000000000000..7e90fb2f2d08004c1bd14e897a932d4b78d0d123 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references.json @@ -0,0 +1,197 @@ +{ + "references": [ + { + "name": "firefighter", + "ref_image": "references/ref_firefighter.png", + "raw_ref_image": "references/raw_ref_firefighter_attempt_01.png", + "diversify_input": "crops/diversify_input_firefighter.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_firefighter_attempt_01.png", + "output": "references/ref_firefighter.png", + "mask": "references/sam_mask_firefighter.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 34.0, + 709.0, + 1009.0 + ], + "mask_score": 3.445343, + "mask_area_ratio": 0.178691, + "elapsed_seconds": 7.0362 + }, + "reference_verify": "references/reference_verify_firefighter.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "uniformed_officer", + "ref_image": "references/ref_uniformed_officer.png", + "raw_ref_image": "references/raw_ref_uniformed_officer_attempt_01.png", + "diversify_input": "crops/diversify_input_uniformed_officer.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_uniformed_officer_attempt_01.png", + "output": "references/ref_uniformed_officer.png", + "mask": "references/sam_mask_uniformed_officer.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 24.0, + 689.0, + 1005.0 + ], + "mask_score": 3.475629, + "mask_area_ratio": 0.156165, + "elapsed_seconds": 7.0984 + }, + "reference_verify": "references/reference_verify_uniformed_officer.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "bystander_in_suit", + "ref_image": "references/ref_bystander_in_suit.png", + "raw_ref_image": "references/raw_ref_bystander_in_suit_attempt_01.png", + "diversify_input": "crops/diversify_input_bystander_in_suit.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_bystander_in_suit_attempt_01.png", + "output": "references/ref_bystander_in_suit.png", + "mask": "references/sam_mask_bystander_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 59.0, + 677.0, + 996.0 + ], + "mask_score": 3.480669, + "mask_area_ratio": 0.144797, + "elapsed_seconds": 7.0242 + }, + "reference_verify": "references/reference_verify_bystander_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "traffic_light", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "diversify_input": "crops/diversify_input_traffic_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 272.0, + 15.0, + 750.0, + 1006.0 + ], + "mask_score": 3.448339, + "mask_area_ratio": 0.303974, + "elapsed_seconds": 8.3734 + }, + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "concrete_barrier", + "ref_image": "references/ref_concrete_barrier.png", + "raw_ref_image": "references/raw_ref_concrete_barrier_attempt_01.png", + "diversify_input": "crops/diversify_input_concrete_barrier.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_concrete_barrier_attempt_01.png", + "output": "references/ref_concrete_barrier.png", + "mask": "references/sam_mask_concrete_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 53.0, + 219.0, + 970.0, + 811.0 + ], + "mask_score": 3.469119, + "mask_area_ratio": 0.3653, + "elapsed_seconds": 7.0274 + }, + "reference_verify": "references/reference_verify_concrete_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "silver_car", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "diversify_input": "crops/diversify_input_silver_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 46.0, + 215.0, + 976.0, + 829.0 + ], + "mask_score": 3.457698, + "mask_area_ratio": 0.330622, + "elapsed_seconds": 7.0933 + }, + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_bystander_in_suit.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_bystander_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..edf575a98d9c708a54f5c1179bd1c661a3a60139 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_bystander_in_suit.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b57c842d8b5704f44d45464210ee8eebfcb8d946e099e9999905a0b16794d0e +size 255501 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_concrete_barrier.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_concrete_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..f13b0228b12e17f0ffc9e5d29949f58f7a30d09d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_concrete_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf59bdf7e4382321a4c8d66a611f2fe77d740195b3acd73c265cdbe285cc172a +size 778492 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_firefighter.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_firefighter.png new file mode 100644 index 0000000000000000000000000000000000000000..f1ae55dd443265b3829925185a7a8c285c62afb7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_firefighter.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f51017bd6a478ac6f458875025ab4c3860537c4ee385b86c92e70b44af970f69 +size 328203 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_silver_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..55ab2b23fa93be88b36911a0f288fc418884b724 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:509df17c9648f1931e6b3fdabde72faf188c369195efb174d8358c047516bdd0 +size 557650 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_traffic_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..93bb4e06203c77793e74cc8c2c9cc9a454240835 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_traffic_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded9d9ee1e1d2dec1155fe9253b5f139e9a42e78d7472923c8c2be12e7e8541f +size 514829 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_uniformed_officer.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_uniformed_officer.png new file mode 100644 index 0000000000000000000000000000000000000000..9e9d612063e6fbd3d769e207a316d43ee8219c7a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/ref_uniformed_officer.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dfff8137a81c45f225d04bc85a3df825e8ba9e84f31da13e58de3c846de626d +size 291180 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_bystander_in_suit.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_bystander_in_suit.json new file mode 100644 index 0000000000000000000000000000000000000000..7b0be809f77dc63da16d3e3cff738b276196bb21 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_bystander_in_suit.json @@ -0,0 +1,46 @@ +{ + "name": "bystander_in_suit", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_bystander_in_suit_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_bystander_in_suit_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_bystander_in_suit_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_bystander_in_suit_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_bystander_in_suit_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_bystander_in_suit_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 59.0, + 677.0, + 996.0 + ], + "mask_score": 3.480669, + "mask_area_ratio": 0.144797, + "elapsed_seconds": 7.0242 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a person wearing a suit on a white background with no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_concrete_barrier.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_concrete_barrier.json new file mode 100644 index 0000000000000000000000000000000000000000..80a779b7b64d541ca93f725d0e28f24c94f2b14a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_concrete_barrier.json @@ -0,0 +1,46 @@ +{ + "name": "concrete_barrier", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_concrete_barrier_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_concrete_barrier_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_concrete_barrier_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_concrete_barrier_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_concrete_barrier_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_concrete_barrier_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 53.0, + 219.0, + 970.0, + 811.0 + ], + "mask_score": 3.469119, + "mask_area_ratio": 0.3653, + "elapsed_seconds": 7.0274 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a single, complete concrete barrier isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_firefighter.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_firefighter.json new file mode 100644 index 0000000000000000000000000000000000000000..22755a5217e67dac8e1004bb2f1f45f0ddc247a2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_firefighter.json @@ -0,0 +1,46 @@ +{ + "name": "firefighter", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_firefighter_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_firefighter_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_firefighter_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_firefighter_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_firefighter_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_firefighter_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 34.0, + 709.0, + 1009.0 + ], + "mask_score": 3.445343, + "mask_area_ratio": 0.178691, + "elapsed_seconds": 7.0362 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body firefighter, well-isolated on a white background, all requirements met." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_silver_car.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_silver_car.json new file mode 100644 index 0000000000000000000000000000000000000000..fc0775c437caf09704001def167f71241efbc392 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_silver_car.json @@ -0,0 +1,46 @@ +{ + "name": "silver_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_silver_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_silver_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 46.0, + 215.0, + 976.0, + 829.0 + ], + "mask_score": 3.457698, + "mask_area_ratio": 0.330622, + "elapsed_seconds": 7.0933 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "A complete silver hatchback car is clearly visible on a white background with illuminated brake lights." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_traffic_light.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_traffic_light.json new file mode 100644 index 0000000000000000000000000000000000000000..febe4e47b23bf62ea057d49d61575c29d85420a4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_traffic_light.json @@ -0,0 +1,46 @@ +{ + "name": "traffic_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_traffic_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_traffic_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_traffic_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_traffic_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 272.0, + 15.0, + 750.0, + 1006.0 + ], + "mask_score": 3.448339, + "mask_area_ratio": 0.303974, + "elapsed_seconds": 8.3734 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a single traffic light completely visible, uncropped, and isolated against a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_uniformed_officer.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_uniformed_officer.json new file mode 100644 index 0000000000000000000000000000000000000000..2c3f9f2fea7ea9df71dd4f66addb4d4707404a05 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/reference_verify_uniformed_officer.json @@ -0,0 +1,46 @@ +{ + "name": "uniformed_officer", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_uniformed_officer_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_uniformed_officer_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_uniformed_officer_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_uniformed_officer_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_uniformed_officer_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_uniformed_officer_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 24.0, + 689.0, + 1005.0 + ], + "mask_score": 3.475629, + "mask_area_ratio": 0.156165, + "elapsed_seconds": 7.0984 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body visible, no cropping, single person on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_bystander_in_suit.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_bystander_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..bc9c882fe678ef1080cb45cfa7d9ea448d5ff8f5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_bystander_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_concrete_barrier.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_concrete_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..382ef3f2d4708950f40a51760018f3fbfa37a73a Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_concrete_barrier.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_firefighter.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_firefighter.png new file mode 100644 index 0000000000000000000000000000000000000000..2acc7c50407edd36c313fe3b474453aa2444ab4a Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_firefighter.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_silver_car.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..62e9ff17e0acb185def4444f38c95c0b8d08b278 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_silver_car.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_traffic_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..3d6bc9a7f58d77d1e8d9ea26fb219215a4bd162f Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_uniformed_officer.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_uniformed_officer.png new file mode 100644 index 0000000000000000000000000000000000000000..aea29ea557096d2a2d2faeea3631cda705c04e4e Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/references/sam_mask_uniformed_officer.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/row.json new file mode 100644 index 0000000000000000000000000000000000000000..917a828ab477bae6cc36f019447ca4ab7faed9cd --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/row.json @@ -0,0 +1,302 @@ +{ + "sample_id": "sample_000005", + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 6, + "n_detected": 6, + "n_subjects": 6, + "subjects": [ + { + "name": "firefighter", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_2/282555,e0af90003451118a.jpg:person:8", + "source_name": "firefighter", + "source_description": "Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away near the fire truck. Source dataset: CrowdHuman. Scene context: Emergency response personnel, including firefighters and ambulance crew, are gathered outside a large classical building with pillars and banners, accompanied by emergency vehicles.", + "sub_caption": "firefighter: Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away.. Scene role: Assisting with incident management, positioned near the stopped car and barrier.", + "measured_bbox": [ + 0.2626, + 0.3463, + 0.3289, + 0.6561 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_firefighter.png", + "raw_ref_image": "references/raw_ref_firefighter_attempt_01.png", + "reference_verify": "references/reference_verify_firefighter.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_firefighter_attempt_01.png", + "output": "references/ref_firefighter.png", + "mask": "references/sam_mask_firefighter.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 34.0, + 709.0, + 1009.0 + ], + "mask_score": 3.445343, + "mask_area_ratio": 0.178691, + "elapsed_seconds": 7.0362 + } + }, + { + "name": "uniformed_officer", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,f68c20007e0bf148.jpg:person:3", + "source_name": "uniformed officer", + "source_description": "wearing a khaki uniform and helmet, holding a baton, looking towards the left Source dataset: CrowdHuman. Scene context: A large crowd of people, including some in uniform with batons and helmets, stands in front of a red and yellow building.", + "sub_caption": "uniformed officer: Wearing a khaki uniform and helmet, holding a baton, looking towards the left.. Scene role: Directing surrounding traffic away from the stopped vehicle using a baton.", + "measured_bbox": [ + 0.0497, + 0.3566, + 0.1691, + 0.6118 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_uniformed_officer.png", + "raw_ref_image": "references/raw_ref_uniformed_officer_attempt_01.png", + "reference_verify": "references/reference_verify_uniformed_officer.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_uniformed_officer_attempt_01.png", + "output": "references/ref_uniformed_officer.png", + "mask": "references/sam_mask_uniformed_officer.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 24.0, + 689.0, + 1005.0 + ], + "mask_score": 3.475629, + "mask_area_ratio": 0.156165, + "elapsed_seconds": 7.0984 + } + }, + { + "name": "bystander_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", + "sub_caption": "crowd member: A person wearing a suit.. Scene role: Standing off to the right side of the road behind the barrier, acting as the driver or an involved pedestrian.", + "measured_bbox": [ + 0.7467, + 0.3318, + 0.8036, + 0.5111 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_bystander_in_suit.png", + "raw_ref_image": "references/raw_ref_bystander_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_bystander_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_bystander_in_suit_attempt_01.png", + "output": "references/ref_bystander_in_suit.png", + "mask": "references/sam_mask_bystander_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 59.0, + 677.0, + 996.0 + ], + "mask_score": 3.480669, + "mask_area_ratio": 0.144797, + "elapsed_seconds": 7.0242 + } + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", + "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Hanging overhead or mounted prominently on a pole at the intersection.", + "measured_bbox": [ + 0.5381, + 0.0316, + 0.5856, + 0.2076 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 272.0, + 15.0, + 750.0, + 1006.0 + ], + "mask_score": 3.448339, + "mask_area_ratio": 0.303974, + "elapsed_seconds": 8.3734 + } + }, + { + "name": "concrete_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c946c532-07177e0a:object:11", + "source_name": "concrete barrier", + "source_description": "A continuous low concrete wall acting as a barrier on the right side of the road. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a multi-lane highway during the day, with construction or industrial sites visible alongside.", + "sub_caption": "concrete barrier: A continuous low concrete wall acting as a barrier on the right side of the road.. Scene role: Lining the right side of the street, separating the pedestrian walkway or construction zone from the active traffic lane.", + "measured_bbox": [ + 0.6322, + 0.4972, + 0.9964, + 0.6985 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_concrete_barrier.png", + "raw_ref_image": "references/raw_ref_concrete_barrier_attempt_01.png", + "reference_verify": "references/reference_verify_concrete_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_concrete_barrier_attempt_01.png", + "output": "references/ref_concrete_barrier.png", + "mask": "references/sam_mask_concrete_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 53.0, + 219.0, + 970.0, + 811.0 + ], + "mask_score": 3.469119, + "mask_area_ratio": 0.3653, + "elapsed_seconds": 7.0274 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", + "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection with illuminated brake lights.. Scene role: Stopped in the active lane near the barrier, serving as the focal point of the traffic response.", + "measured_bbox": [ + 0.3396, + 0.3754, + 0.6399, + 0.6647 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 46.0, + 215.0, + 976.0, + 829.0 + ], + "mask_score": 3.457698, + "mask_area_ratio": 0.330622, + "elapsed_seconds": 7.0933 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..3dd929f515be1de413aa0685bb0ed10f545c1674 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000005/vocab_task.json @@ -0,0 +1,98 @@ +{ + "task_id": "sample_000005", + "sample_id": "sample_000005", + "sample_index": 5, + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 46975, + "image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", + "name": "person", + "description": "Standing, wearing a bright yellow top Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered in front of the Louvre museum and its iconic glass pyramid on a sunny day." + }, + { + "candidate_index": 1, + "source_offset": 55111, + "image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "name": "crowd member", + "description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues." + }, + { + "candidate_index": 2, + "source_offset": 123738, + "image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", + "name": "pedestrian", + "description": "Young girl with brown hair, wearing a blue patterned top. Source dataset: CrowdHuman. Scene context: A sunny outdoor scene featuring the red entrance arch to Navy Pier Beer Garden and a tall brick tower, with a diverse crowd of people walking along the waterfront promenade." + }, + { + "candidate_index": 3, + "source_offset": 32630, + "image_id": "CrowdHuman:data/data_2/282555,e0af90003451118a.jpg:person:8", + "name": "firefighter", + "description": "Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away near the fire truck. Source dataset: CrowdHuman. Scene context: Emergency response personnel, including firefighters and ambulance crew, are gathered outside a large classical building with pillars and banners, accompanied by emergency vehicles." + }, + { + "candidate_index": 4, + "source_offset": 2956, + "image_id": "CrowdHuman:data/data_1/273275,f68c20007e0bf148.jpg:person:3", + "name": "uniformed officer", + "description": "wearing a khaki uniform and helmet, holding a baton, looking towards the left Source dataset: CrowdHuman. Scene context: A large crowd of people, including some in uniform with batons and helmets, stands in front of a red and yellow building." + }, + { + "candidate_index": 5, + "source_offset": 49906, + "image_id": "CrowdHuman:data/data_26/273278,110f89000f6dc4f9f.jpg:person:23", + "name": "pedestrian far left background", + "description": "A person walking in the background on the left. Source dataset: CrowdHuman. Scene context: A slightly elevated view of a city street and sidewalk, showing a subway entrance, pedestrians, runners, cars, and trees." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 6027, + "image_id": "CrowdHuman:data/data_10/273275,36cc4000c1fb4fda.jpg:object:3", + "name": "C'BON Cosmetics sign", + "description": "A large green 'C'BON COSMETICS' sign across the middle of the cylindrical glass building. Source dataset: CrowdHuman. Scene context: A bustling city street corner featuring several tall commercial buildings covered with prominent advertisements and brand logos." + }, + { + "candidate_index": 1, + "source_offset": 233879, + "image_id": "BDD100K:c789ecd3-819d4445:object:9", + "name": "sign", + "description": "A small diamond-shaped yellow sign mounted along the barrier. Source dataset: BDD100K. Scene context: Night driving on a highway with traffic ahead, streetlights, and a barrier wall." + }, + { + "candidate_index": 2, + "source_offset": 53140, + "image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "name": "traffic light", + "description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing." + }, + { + "candidate_index": 3, + "source_offset": 242371, + "image_id": "BDD100K:c946c532-07177e0a:object:11", + "name": "concrete barrier", + "description": "A continuous low concrete wall acting as a barrier on the right side of the road. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a multi-lane highway during the day, with construction or industrial sites visible alongside." + }, + { + "candidate_index": 4, + "source_offset": 57443, + "image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", + "name": "trees", + "description": "Various green trees and shrubs lining the pathway and visible in the background gardens. Source dataset: CrowdHuman. Scene context: A large crowd of tourists walks along the pathway towards the Taj Mahal on a clear day." + }, + { + "candidate_index": 5, + "source_offset": 189425, + "image_id": "BDD100K:be3d3a81-326a032d:object:0", + "name": "silver car", + "description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure." + } + ], + "rng_seed": 1782451638, + "created_at": 1782292413.3527775 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..e2641c3126712fe3a0f4276768093777684c20be --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebd8b4fceebbea82c31b38f8282c71db0c527c1205086ba7f60e40ec4e327d04 +size 1057735 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d0a888b95cd4c5b83afd8263dac4794795dc43c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/compose_prompt.txt @@ -0,0 +1,103 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A nighttime city street intersection surrounded by illuminated building facades and glowing signboards.", + "activity": "A dashcam view driving behind a white van approaching an intersection, while an emergency vehicle with flashing blue lights sits parked on the left, and a pedestrian crosses the road in the distance.", + "composition": "Wide 16:9 dashcam perspective, emphasizing depth with leading lines from the double solid white line guiding the viewer's eye toward the vehicles and the crossing pedestrian.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "distant_pedestrian", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "description": "A person walking across the street in the distant background.", + "role_in_scene": "Crossing the crosswalk in the distance ahead of the approaching vehicles." + } + ], + "objects": [ + { + "name": "vertical_illuminated_sign", + "source_index": 0, + "source_image_id": "BDD100K:bb1b7e42-9608265e:object:6", + "source_name": "street sign", + "description": "A vertical illuminated neon sign with abstract shapes, glowing brightly.", + "role_in_scene": "Mounted on the building facade on the right side of the street, adding ambient night lighting." + }, + { + "name": "emergency_vehicle", + "source_index": 1, + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "description": "A dark-colored vehicle with bright blue emergency lights flashing.", + "role_in_scene": "Parked on the left side of the street near the intersection." + }, + { + "name": "white_panel_van", + "source_index": 5, + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "description": "A large white panel van with red taillights illuminated.", + "role_in_scene": "Driving in the lane directly ahead of the camera perspective." + }, + { + "name": "double_solid_line", + "source_index": 6, + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "description": "Two continuous white painted lines on the dark asphalt road surface.", + "role_in_scene": "Separating the traffic lanes on the dark road, leading toward the intersection." + }, + { + "name": "dark_building_facade", + "source_index": 7, + "source_image_id": "BDD100K:c807cb19-7e09cb11:object:8", + "source_name": "building facade", + "description": "Dark outlines of buildings with scattered, warm-toned lit windows.", + "role_in_scene": "Forming the urban backdrop along the left side of the street." + }, + { + "name": "awning_building_corner", + "source_index": 10, + "source_image_id": "BDD100K:c06d23aa-cb9ae751:object:6", + "source_name": "building corner", + "description": "The corner of a building featuring an awning and brightly lit abstract signboards.", + "role_in_scene": "Anchoring the right side of the intersection with a warm architectural glow." + }, + { + "name": "green_street_sign", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "description": "A standard green street sign without any readable text.", + "role_in_scene": "Hanging from a traffic light pole near the intersection." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_awning_building_corner.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_awning_building_corner.png new file mode 100644 index 0000000000000000000000000000000000000000..3e98bc37fa347b72fd991c1af130553f0d553f91 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_awning_building_corner.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_dark_building_facade.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_dark_building_facade.png new file mode 100644 index 0000000000000000000000000000000000000000..1e3e9755658afc29827ab6eb4e73be84da18c790 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_dark_building_facade.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f88e15fc6ee223f11baac3ce67e248a9749102c24420b0bb8fd73c970354b8d7 +size 309726 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_distant_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_distant_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..ab289b8ba9faa39fa4dada1a6c9550b00aeb64be Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_distant_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_double_solid_line.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_double_solid_line.png new file mode 100644 index 0000000000000000000000000000000000000000..75095f67422289a337b5245e6dbda385e94d4023 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_double_solid_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d795cce14b11771e3ba75b97c70b09d523c88a04cd6d11be0912b65151493328 +size 142916 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_emergency_vehicle.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_emergency_vehicle.png new file mode 100644 index 0000000000000000000000000000000000000000..11a8f7ab8bc52500ad161635869f236cc6602dbe Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_emergency_vehicle.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_green_street_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_green_street_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..2f9979e8c31b80796193d3273b8be350973714af Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_green_street_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_vertical_illuminated_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_vertical_illuminated_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..3467ba8b13dee8af92139668dfb076e39e2facfc Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_vertical_illuminated_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_white_panel_van.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_white_panel_van.png new file mode 100644 index 0000000000000000000000000000000000000000..c22bddf5db24665e3eedc23c9b38aec73b70caff Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/detect_refine_white_panel_van.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_awning_building_corner.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_awning_building_corner.png new file mode 100644 index 0000000000000000000000000000000000000000..030b10605314b949c60c37bc89dcdc9019654ab7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_awning_building_corner.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_dark_building_facade.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_dark_building_facade.png new file mode 100644 index 0000000000000000000000000000000000000000..50b816cc181ce712e20e25bffc73dceb38404653 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_dark_building_facade.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8d86dd6f0288b8649abc055e4c06c48010aa85087ae14deb19d085d6eb6af22 +size 216972 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_distant_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_distant_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..081bb125cd8e0abe60831193a4531a75e8dedeba Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_distant_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_double_solid_line.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_double_solid_line.png new file mode 100644 index 0000000000000000000000000000000000000000..0e5121491731f6210c59fd6e3156cd2df2e128bc --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_double_solid_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11307d18e277edc5e94d6c2ad5541759e53d6e28034195f703ce46ebac9af555 +size 194471 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_emergency_vehicle.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_emergency_vehicle.png new file mode 100644 index 0000000000000000000000000000000000000000..388ea43d309ac727df29205da08ca0a30c6862df Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_emergency_vehicle.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_green_street_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_green_street_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..ba6bc229573b319791e2a98f819bca3fd8a10196 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_green_street_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_vertical_illuminated_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_vertical_illuminated_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..98a680de03a819605984ae660b81c3e3c7c8a4e5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_vertical_illuminated_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_white_panel_van.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_white_panel_van.png new file mode 100644 index 0000000000000000000000000000000000000000..f67a0f04f3b2d803590be4de9ec1ff005af889e0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/crops/diversify_input_white_panel_van.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50f008afcb03527bb00b6bd2e177f4ab61558741829fae6be4121d3e75373d0e +size 109799 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..f33aa2ec3f3e6d8e53773006d2accc73d1b7057c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/detections.json @@ -0,0 +1,154 @@ +[ + { + "name": "distant_pedestrian", + "present": true, + "bbox": [ + 0.3877, + 0.478, + 0.4204, + 0.5881 + ], + "confidence": 0.9, + "notes": "A person walking across the street.", + "coarse_bbox": [ + 0.387, + 0.476, + 0.419, + 0.589 + ], + "refine_crop": "crops/detect_refine_distant_pedestrian.png" + }, + { + "name": "vertical_illuminated_sign", + "present": true, + "bbox": [ + 0.7683, + 0.0355, + 0.8177, + 0.2837 + ], + "confidence": 0.99, + "notes": "A prominent vertical illuminated sign is centered in the crop.", + "coarse_bbox": [ + 0.765, + 0.032, + 0.818, + 0.286 + ], + "refine_crop": "crops/detect_refine_vertical_illuminated_sign.png" + }, + { + "name": "emergency_vehicle", + "present": true, + "bbox": [ + 0.1031, + 0.4564, + 0.2827, + 0.6497 + ], + "confidence": 0.95, + "notes": "Tight box around the dark-colored vehicle with bright blue emergency lights.", + "coarse_bbox": [ + 0.102, + 0.432, + 0.278, + 0.654 + ], + "refine_crop": "crops/detect_refine_emergency_vehicle.png" + }, + { + "name": "white_panel_van", + "present": true, + "bbox": [ + 0.4556, + 0.3288, + 0.5926, + 0.6597 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the visible portion of the white panel van, from the roof down to the tires.", + "coarse_bbox": [ + 0.456, + 0.326, + 0.594, + 0.648 + ], + "refine_crop": "crops/detect_refine_white_panel_van.png" + }, + { + "name": "double_solid_line", + "present": true, + "bbox": [ + 0.1922, + 0.6133, + 0.4541, + 1.0 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the double solid white line.", + "coarse_bbox": [ + 0.215, + 0.611, + 0.444, + 0.993 + ], + "refine_crop": "crops/detect_refine_double_solid_line.png" + }, + { + "name": "dark_building_facade", + "present": true, + "bbox": [ + 0.1397, + 0.0, + 0.366, + 0.5427 + ], + "confidence": 0.8, + "notes": "Tight bounding box around the dark building facade with scattered, warm-toned lit windows in the background.", + "coarse_bbox": [ + 0.0, + 0.003, + 0.419, + 0.556 + ], + "refine_crop": "crops/detect_refine_dark_building_facade.png" + }, + { + "name": "awning_building_corner", + "present": true, + "bbox": [ + 0.6102, + 0.3347, + 0.7867, + 0.5412 + ], + "confidence": 0.9, + "notes": "The crop shows the building corner described, with a brightly lit awning and abstract signboards.", + "coarse_bbox": [ + 0.625, + 0.354, + 0.772, + 0.542 + ], + "refine_crop": "crops/detect_refine_awning_building_corner.png" + }, + { + "name": "green_street_sign", + "present": true, + "bbox": [ + 0.5754, + 0.1583, + 0.6522, + 0.1884 + ], + "confidence": 100, + "notes": "Green street sign without readable text.", + "coarse_bbox": [ + 0.575, + 0.155, + 0.65, + 0.187 + ], + "refine_crop": "crops/detect_refine_green_street_sign.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..a6170d3cae712bf9ae98f8a44d8e8b882d259fd9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b691b80ab6c250183a16c464a76c365e2e0a0255773029d0261fce2ba98630 +size 1128710 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..64faa20c4d7ba125dfb407c70fdf9cd554d979dd --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/plan.json @@ -0,0 +1,202 @@ +{ + "sample_id": "sample_000006", + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A nighttime city street intersection surrounded by illuminated building facades and glowing signboards.", + "activity": "A dashcam view driving behind a white van approaching an intersection, while an emergency vehicle with flashing blue lights sits parked on the left, and a pedestrian crosses the road in the distance.", + "composition": "Wide 16:9 dashcam perspective, emphasizing depth with leading lines from the double solid white line guiding the viewer's eye toward the vehicles and the crossing pedestrian.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "distant_pedestrian", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "description": "A person walking across the street in the distant background.", + "role_in_scene": "Crossing the crosswalk in the distance ahead of the approaching vehicles." + } + ], + "objects": [ + { + "name": "vertical_illuminated_sign", + "source_index": 0, + "source_image_id": "BDD100K:bb1b7e42-9608265e:object:6", + "source_name": "street sign", + "description": "A vertical illuminated neon sign with abstract shapes, glowing brightly.", + "role_in_scene": "Mounted on the building facade on the right side of the street, adding ambient night lighting." + }, + { + "name": "emergency_vehicle", + "source_index": 1, + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "description": "A dark-colored vehicle with bright blue emergency lights flashing.", + "role_in_scene": "Parked on the left side of the street near the intersection." + }, + { + "name": "white_panel_van", + "source_index": 5, + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "description": "A large white panel van with red taillights illuminated.", + "role_in_scene": "Driving in the lane directly ahead of the camera perspective." + }, + { + "name": "double_solid_line", + "source_index": 6, + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "description": "Two continuous white painted lines on the dark asphalt road surface.", + "role_in_scene": "Separating the traffic lanes on the dark road, leading toward the intersection." + }, + { + "name": "dark_building_facade", + "source_index": 7, + "source_image_id": "BDD100K:c807cb19-7e09cb11:object:8", + "source_name": "building facade", + "description": "Dark outlines of buildings with scattered, warm-toned lit windows.", + "role_in_scene": "Forming the urban backdrop along the left side of the street." + }, + { + "name": "awning_building_corner", + "source_index": 10, + "source_image_id": "BDD100K:c06d23aa-cb9ae751:object:6", + "source_name": "building corner", + "description": "The corner of a building featuring an awning and brightly lit abstract signboards.", + "role_in_scene": "Anchoring the right side of the intersection with a warm architectural glow." + }, + { + "name": "green_street_sign", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "description": "A standard green street sign without any readable text.", + "role_in_scene": "Hanging from a traffic light pole near the intersection." + } + ] + }, + "expected_subjects": [ + { + "name": "distant_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", + "sub_caption": "pedestrian: A person walking across the street in the distant background.. Scene role: Crossing the crosswalk in the distance ahead of the approaching vehicles.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "vertical_illuminated_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb1b7e42-9608265e:object:6", + "source_name": "street sign", + "source_description": "A vertical 'PARK' sign illuminated on the right side of the street, indicating a parking garage. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a city street with tall buildings on both sides, following a yellow taxi, with other cars parked and driving.", + "sub_caption": "street sign: A vertical illuminated neon sign with abstract shapes, glowing brightly.. Scene role: Mounted on the building facade on the right side of the street, adding ambient night lighting.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "emergency_vehicle", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", + "sub_caption": "vehicle: A dark-colored vehicle with bright blue emergency lights flashing.. Scene role: Parked on the left side of the street near the intersection.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "white_panel_van", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", + "sub_caption": "white van: A large white panel van with red taillights illuminated.. Scene role: Driving in the lane directly ahead of the camera perspective.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "double_solid_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", + "sub_caption": "double solid white line: Two continuous white painted lines on the dark asphalt road surface.. Scene role: Separating the traffic lanes on the dark road, leading toward the intersection.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_building_facade", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c807cb19-7e09cb11:object:8", + "source_name": "building facade", + "source_description": "Dark outlines of buildings lining the street on both sides, with some lit windows. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane city street with traffic lights and vehicles ahead.", + "sub_caption": "building facade: Dark outlines of buildings with scattered, warm-toned lit windows.. Scene role: Forming the urban backdrop along the left side of the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "awning_building_corner", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c06d23aa-cb9ae751:object:6", + "source_name": "building corner", + "source_description": "The corner of a building on the right side, with an orange or red awning and some lit signs. Source dataset: BDD100K. Scene context: Nighttime driving scene at an intersection with a stop sign and a large black SUV passing on the right.", + "sub_caption": "building corner: The corner of a building featuring an awning and brightly lit abstract signboards.. Scene role: Anchoring the right side of the intersection with a warm architectural glow.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "green_street_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", + "sub_caption": "street sign: A standard green street sign without any readable text.. Scene role: Hanging from a traffic light pole near the intersection.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000006/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references.json new file mode 100644 index 0000000000000000000000000000000000000000..78a099a1d875da98a7c58018f077aacc2cbf3dbd --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references.json @@ -0,0 +1,261 @@ +{ + "references": [ + { + "name": "distant_pedestrian", + "ref_image": "references/ref_distant_pedestrian.png", + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "diversify_input": "crops/diversify_input_distant_pedestrian.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "references/ref_distant_pedestrian.png", + "mask": "references/sam_mask_distant_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 324.0, + 9.0, + 705.0, + 1015.0 + ], + "mask_score": 3.338419, + "mask_area_ratio": 0.174056, + "elapsed_seconds": 8.694 + }, + "reference_verify": "references/reference_verify_distant_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "vertical_illuminated_sign", + "ref_image": "references/ref_vertical_illuminated_sign.png", + "raw_ref_image": "references/raw_ref_vertical_illuminated_sign_attempt_01.png", + "diversify_input": "crops/diversify_input_vertical_illuminated_sign.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_vertical_illuminated_sign_attempt_01.png", + "output": "references/ref_vertical_illuminated_sign.png", + "mask": "references/sam_mask_vertical_illuminated_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 305.0, + 20.0, + 728.0, + 1002.0 + ], + "mask_score": 3.37343, + "mask_area_ratio": 0.273593, + "elapsed_seconds": 7.1332 + }, + "reference_verify": "references/reference_verify_vertical_illuminated_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "emergency_vehicle", + "ref_image": "references/ref_emergency_vehicle.png", + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "diversify_input": "crops/diversify_input_emergency_vehicle.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "references/ref_emergency_vehicle.png", + "mask": "references/sam_mask_emergency_vehicle.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 8.0, + 237.0, + 1015.0, + 828.0 + ], + "mask_score": 3.468468, + "mask_area_ratio": 0.355034, + "elapsed_seconds": 7.0896 + }, + "reference_verify": "references/reference_verify_emergency_vehicle.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "white_panel_van", + "ref_image": "references/ref_white_panel_van.png", + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "diversify_input": "crops/diversify_input_white_panel_van.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "references/ref_white_panel_van.png", + "mask": "references/sam_mask_white_panel_van.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 181.0, + 63.0, + 843.0, + 937.0 + ], + "mask_score": 2.636854, + "mask_area_ratio": 0.376409, + "elapsed_seconds": 7.1379 + }, + "reference_verify": "references/reference_verify_white_panel_van.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "double_solid_line", + "ref_image": "references/ref_double_solid_line.png", + "raw_ref_image": "references/raw_ref_double_solid_line_attempt_01.png", + "diversify_input": "crops/diversify_input_double_solid_line.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_line_attempt_01.png", + "output": "references/ref_double_solid_line.png", + "mask": "references/sam_mask_double_solid_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 186.0, + 1001.0, + 837.0 + ], + "mask_score": 3.460181, + "mask_area_ratio": 0.372935, + "elapsed_seconds": 8.3174 + }, + "reference_verify": "references/reference_verify_double_solid_line.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dark_building_facade", + "ref_image": "references/ref_dark_building_facade.png", + "raw_ref_image": "references/raw_ref_dark_building_facade_attempt_01.png", + "diversify_input": "crops/diversify_input_dark_building_facade.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_dark_building_facade_attempt_01.png", + "output": "references/ref_dark_building_facade.png", + "mask": "references/sam_mask_dark_building_facade.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 128.0, + 0.0, + 887.0, + 1000.0 + ], + "mask_score": 2.829968, + "mask_area_ratio": 0.624767, + "elapsed_seconds": 7.1675 + }, + "reference_verify": "references/reference_verify_dark_building_facade.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "awning_building_corner", + "ref_image": "references/ref_awning_building_corner.png", + "raw_ref_image": "references/raw_ref_awning_building_corner_attempt_01.png", + "diversify_input": "crops/diversify_input_awning_building_corner.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_awning_building_corner_attempt_01.png", + "output": "references/ref_awning_building_corner.png", + "mask": "references/sam_mask_awning_building_corner.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 27.0, + 975.0, + 980.0 + ], + "mask_score": 3.458235, + "mask_area_ratio": 0.594922, + "elapsed_seconds": 7.3072 + }, + "reference_verify": "references/reference_verify_awning_building_corner.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "green_street_sign", + "ref_image": "references/ref_green_street_sign.png", + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "diversify_input": "crops/diversify_input_green_street_sign.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "references/ref_green_street_sign.png", + "mask": "references/sam_mask_green_street_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 61.0, + 378.0, + 962.0, + 645.0 + ], + "mask_score": 3.379525, + "mask_area_ratio": 0.536634, + "elapsed_seconds": 7.1734 + }, + "reference_verify": "references/reference_verify_green_street_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_awning_building_corner.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_awning_building_corner.png new file mode 100644 index 0000000000000000000000000000000000000000..74b59ea14cc00ca66524aa64ffcac293de9c9747 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_awning_building_corner.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f10dd6c9281835aa367443c3996c39877a46225ef30acb140cfc0d4d9d95ea0f +size 956304 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_dark_building_facade.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_dark_building_facade.png new file mode 100644 index 0000000000000000000000000000000000000000..a524080bea12e5da67348d3f7999c4f94b4d94ba --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_dark_building_facade.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03665ac988f6468a5743944df912c5dc13ccb0d906c12b1a00f271ccebd3d1c9 +size 1092515 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_distant_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_distant_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..07dc32ffe381f0d307e5970c7be2e55102bdb0ee --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_distant_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf9c0e5faccd877f4546e8166c8a3977e71578710d74f6dee8a53a8c64d5417 +size 330096 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_double_solid_line.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_double_solid_line.png new file mode 100644 index 0000000000000000000000000000000000000000..e2590330009927505753f6dd73c36070a13ef96c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_double_solid_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68ac6299fd72c1143b3b7cf49c876c1fa9010bf8bb8e6d065c513118f5169e43 +size 958041 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_emergency_vehicle.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_emergency_vehicle.png new file mode 100644 index 0000000000000000000000000000000000000000..e10df2cf414adcb0d2dd8d6ac8f28802d833cf8c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_emergency_vehicle.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72f5cfdebbfaf2f3813d5df58673478241773482cbed14bca8405c26c6dc0052 +size 646271 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_green_street_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_green_street_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..b3ec7f779019887193a403ffda350c2ec4945320 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_green_street_sign.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88c3aeb400e3928fcd32c1d8ab08c8999b3a0a1ef275b3e697810a4a113f92fa +size 317178 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_vertical_illuminated_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_vertical_illuminated_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..3eb5294124457b9c055600faa07379fe5e21d9df --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_vertical_illuminated_sign.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6aa988775d6c089c060baf17847722857c10d5545004161f1415a53169be505 +size 477159 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_white_panel_van.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_white_panel_van.png new file mode 100644 index 0000000000000000000000000000000000000000..d84924a8d5cda2db4dd36c1d1257982b8f294455 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/ref_white_panel_van.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74306c02edc3110f5c700edf5248577787be434f06ad6fb61b5060b828a8a7f0 +size 580183 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_awning_building_corner.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_awning_building_corner.json new file mode 100644 index 0000000000000000000000000000000000000000..dc45135c3095354b463be5aa6441bf1cb3eec17d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_awning_building_corner.json @@ -0,0 +1,46 @@ +{ + "name": "awning_building_corner", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_awning_building_corner_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_awning_building_corner_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_awning_building_corner_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_awning_building_corner_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_awning_building_corner_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_awning_building_corner_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 27.0, + 975.0, + 980.0 + ], + "mask_score": 3.458235, + "mask_area_ratio": 0.594922, + "elapsed_seconds": 7.3072 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image successfully displays the described building corner with awning and illuminated signboards isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_dark_building_facade.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_dark_building_facade.json new file mode 100644 index 0000000000000000000000000000000000000000..5ce183b9ec1b9532c1cb6de119968c6ed0632d3f --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_dark_building_facade.json @@ -0,0 +1,46 @@ +{ + "name": "dark_building_facade", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dark_building_facade_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dark_building_facade_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dark_building_facade_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_dark_building_facade_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_dark_building_facade_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_dark_building_facade_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 128.0, + 0.0, + 887.0, + 1000.0 + ], + "mask_score": 2.829968, + "mask_area_ratio": 0.624767, + "elapsed_seconds": 7.1675 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a representative crop of a dark building facade with lit windows, isolated on a white background. Minor cropping at the top is acceptable for this type of architectural subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_distant_pedestrian.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_distant_pedestrian.json new file mode 100644 index 0000000000000000000000000000000000000000..8c07652cce0295fb1d32d16c201f55067ab6115e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_distant_pedestrian.json @@ -0,0 +1,46 @@ +{ + "name": "distant_pedestrian", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_distant_pedestrian_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_distant_pedestrian_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_distant_pedestrian_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_distant_pedestrian_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 324.0, + 9.0, + 705.0, + 1015.0 + ], + "mask_score": 3.338419, + "mask_area_ratio": 0.174056, + "elapsed_seconds": 8.694 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body of a single person visible with adequate white margin. All requirements met." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_double_solid_line.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_double_solid_line.json new file mode 100644 index 0000000000000000000000000000000000000000..d37120ecc6e380f1bf0adac6f13c3b403663ce96 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_double_solid_line.json @@ -0,0 +1,46 @@ +{ + "name": "double_solid_line", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_double_solid_line_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_double_solid_line_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_double_solid_line_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_line_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_double_solid_line_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_double_solid_line_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 186.0, + 1001.0, + 837.0 + ], + "mask_score": 3.460181, + "mask_area_ratio": 0.372935, + "elapsed_seconds": 8.3174 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Representative specimen of a double solid white line painted on dark asphalt, perfectly isolated on a white background. As a continuous object, the specimen crop is fully acceptable." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_emergency_vehicle.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_emergency_vehicle.json new file mode 100644 index 0000000000000000000000000000000000000000..d9e2e35de40f8a6ccc42e786cf8110348c72fcf5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_emergency_vehicle.json @@ -0,0 +1,46 @@ +{ + "name": "emergency_vehicle", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_emergency_vehicle_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_emergency_vehicle_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_emergency_vehicle_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_emergency_vehicle_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 8.0, + 237.0, + 1015.0, + 828.0 + ], + "mask_score": 3.468468, + "mask_area_ratio": 0.355034, + "elapsed_seconds": 7.0896 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated emergency vehicle on a white background. All requirements are met." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_green_street_sign.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_green_street_sign.json new file mode 100644 index 0000000000000000000000000000000000000000..bb593400a20afce7fa07aa0ff2c11ee100e96703 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_green_street_sign.json @@ -0,0 +1,46 @@ +{ + "name": "green_street_sign", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_green_street_sign_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_green_street_sign_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_green_street_sign_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_green_street_sign_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 61.0, + 378.0, + 962.0, + 645.0 + ], + "mask_score": 3.379525, + "mask_area_ratio": 0.536634, + "elapsed_seconds": 7.1734 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Image clearly shows a complete blank green street sign isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_vertical_illuminated_sign.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_vertical_illuminated_sign.json new file mode 100644 index 0000000000000000000000000000000000000000..40957ef029f74c4288acc9181ff9df0919598c55 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_vertical_illuminated_sign.json @@ -0,0 +1,46 @@ +{ + "name": "vertical_illuminated_sign", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_vertical_illuminated_sign_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_vertical_illuminated_sign_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_vertical_illuminated_sign_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_vertical_illuminated_sign_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_vertical_illuminated_sign_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_vertical_illuminated_sign_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 305.0, + 20.0, + 728.0, + 1002.0 + ], + "mask_score": 3.37343, + "mask_area_ratio": 0.273593, + "elapsed_seconds": 7.1332 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image clearly shows the entire vertical illuminated sign isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_white_panel_van.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_white_panel_van.json new file mode 100644 index 0000000000000000000000000000000000000000..d971697b5caceea63811343ff3db53875f223fa2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/reference_verify_white_panel_van.json @@ -0,0 +1,46 @@ +{ + "name": "white_panel_van", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_white_panel_van_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_white_panel_van_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_white_panel_van_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_white_panel_van_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 181.0, + 63.0, + 843.0, + 937.0 + ], + "mask_score": 2.636854, + "mask_area_ratio": 0.376409, + "elapsed_seconds": 7.1379 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image clearly shows the rear view of a white panel van isolated on a white background. There are some minor masking artifacts along the bottom bumper and where the wheels would be, but the primary subject remains highly recognizable and useful as a reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_awning_building_corner.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_awning_building_corner.png new file mode 100644 index 0000000000000000000000000000000000000000..9871519c100c94e01f3e7896566581e1b6bfd9db Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_awning_building_corner.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_dark_building_facade.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_dark_building_facade.png new file mode 100644 index 0000000000000000000000000000000000000000..a4197372521d1f35ee3395c6745c170c2424b977 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_dark_building_facade.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_distant_pedestrian.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_distant_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..a87fc0319674648502584a1bf32de30809187d16 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_distant_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_double_solid_line.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_double_solid_line.png new file mode 100644 index 0000000000000000000000000000000000000000..e3b3bb99de1bdae6e39a42402e63790e89a6cc91 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_double_solid_line.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_emergency_vehicle.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_emergency_vehicle.png new file mode 100644 index 0000000000000000000000000000000000000000..5a95687bd76a0e32770beb022f904011fbc83c23 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_emergency_vehicle.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_green_street_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_green_street_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..8f021d51fd70865c652e40d336d9faf97a126fe4 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_green_street_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_vertical_illuminated_sign.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_vertical_illuminated_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..8d1205c7f64430cdd5ac934c88b74dcf6b9e9a00 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_vertical_illuminated_sign.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_white_panel_van.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_white_panel_van.png new file mode 100644 index 0000000000000000000000000000000000000000..18f20941ccd0d94eb3ca719bea68a9b509bc6919 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/references/sam_mask_white_panel_van.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/row.json new file mode 100644 index 0000000000000000000000000000000000000000..5b45ee8632419cf76cc791f78d8304d337b61ab8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/row.json @@ -0,0 +1,394 @@ +{ + "sample_id": "sample_000006", + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 8, + "n_detected": 8, + "n_subjects": 8, + "subjects": [ + { + "name": "distant_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", + "sub_caption": "pedestrian: A person walking across the street in the distant background.. Scene role: Crossing the crosswalk in the distance ahead of the approaching vehicles.", + "measured_bbox": [ + 0.3877, + 0.478, + 0.4204, + 0.5881 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_distant_pedestrian.png", + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_distant_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "references/ref_distant_pedestrian.png", + "mask": "references/sam_mask_distant_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 324.0, + 9.0, + 705.0, + 1015.0 + ], + "mask_score": 3.338419, + "mask_area_ratio": 0.174056, + "elapsed_seconds": 8.694 + } + }, + { + "name": "vertical_illuminated_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb1b7e42-9608265e:object:6", + "source_name": "street sign", + "source_description": "A vertical 'PARK' sign illuminated on the right side of the street, indicating a parking garage. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a city street with tall buildings on both sides, following a yellow taxi, with other cars parked and driving.", + "sub_caption": "street sign: A vertical illuminated neon sign with abstract shapes, glowing brightly.. Scene role: Mounted on the building facade on the right side of the street, adding ambient night lighting.", + "measured_bbox": [ + 0.7683, + 0.0355, + 0.8177, + 0.2837 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vertical_illuminated_sign.png", + "raw_ref_image": "references/raw_ref_vertical_illuminated_sign_attempt_01.png", + "reference_verify": "references/reference_verify_vertical_illuminated_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_vertical_illuminated_sign_attempt_01.png", + "output": "references/ref_vertical_illuminated_sign.png", + "mask": "references/sam_mask_vertical_illuminated_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 305.0, + 20.0, + 728.0, + 1002.0 + ], + "mask_score": 3.37343, + "mask_area_ratio": 0.273593, + "elapsed_seconds": 7.1332 + } + }, + { + "name": "emergency_vehicle", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", + "sub_caption": "vehicle: A dark-colored vehicle with bright blue emergency lights flashing.. Scene role: Parked on the left side of the street near the intersection.", + "measured_bbox": [ + 0.1031, + 0.4564, + 0.2827, + 0.6497 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_emergency_vehicle.png", + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "reference_verify": "references/reference_verify_emergency_vehicle.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "references/ref_emergency_vehicle.png", + "mask": "references/sam_mask_emergency_vehicle.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 8.0, + 237.0, + 1015.0, + 828.0 + ], + "mask_score": 3.468468, + "mask_area_ratio": 0.355034, + "elapsed_seconds": 7.0896 + } + }, + { + "name": "white_panel_van", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", + "sub_caption": "white van: A large white panel van with red taillights illuminated.. Scene role: Driving in the lane directly ahead of the camera perspective.", + "measured_bbox": [ + 0.4556, + 0.3288, + 0.5926, + 0.6597 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_panel_van.png", + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "reference_verify": "references/reference_verify_white_panel_van.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "references/ref_white_panel_van.png", + "mask": "references/sam_mask_white_panel_van.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 181.0, + 63.0, + 843.0, + 937.0 + ], + "mask_score": 2.636854, + "mask_area_ratio": 0.376409, + "elapsed_seconds": 7.1379 + } + }, + { + "name": "double_solid_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", + "sub_caption": "double solid white line: Two continuous white painted lines on the dark asphalt road surface.. Scene role: Separating the traffic lanes on the dark road, leading toward the intersection.", + "measured_bbox": [ + 0.1922, + 0.6133, + 0.4541, + 1.0 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_double_solid_line.png", + "raw_ref_image": "references/raw_ref_double_solid_line_attempt_01.png", + "reference_verify": "references/reference_verify_double_solid_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_line_attempt_01.png", + "output": "references/ref_double_solid_line.png", + "mask": "references/sam_mask_double_solid_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 186.0, + 1001.0, + 837.0 + ], + "mask_score": 3.460181, + "mask_area_ratio": 0.372935, + "elapsed_seconds": 8.3174 + } + }, + { + "name": "dark_building_facade", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c807cb19-7e09cb11:object:8", + "source_name": "building facade", + "source_description": "Dark outlines of buildings lining the street on both sides, with some lit windows. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane city street with traffic lights and vehicles ahead.", + "sub_caption": "building facade: Dark outlines of buildings with scattered, warm-toned lit windows.. Scene role: Forming the urban backdrop along the left side of the street.", + "measured_bbox": [ + 0.1397, + 0.0, + 0.366, + 0.5427 + ], + "detection_confidence": 0.8, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_building_facade.png", + "raw_ref_image": "references/raw_ref_dark_building_facade_attempt_01.png", + "reference_verify": "references/reference_verify_dark_building_facade.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_dark_building_facade_attempt_01.png", + "output": "references/ref_dark_building_facade.png", + "mask": "references/sam_mask_dark_building_facade.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 128.0, + 0.0, + 887.0, + 1000.0 + ], + "mask_score": 2.829968, + "mask_area_ratio": 0.624767, + "elapsed_seconds": 7.1675 + } + }, + { + "name": "awning_building_corner", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c06d23aa-cb9ae751:object:6", + "source_name": "building corner", + "source_description": "The corner of a building on the right side, with an orange or red awning and some lit signs. Source dataset: BDD100K. Scene context: Nighttime driving scene at an intersection with a stop sign and a large black SUV passing on the right.", + "sub_caption": "building corner: The corner of a building featuring an awning and brightly lit abstract signboards.. Scene role: Anchoring the right side of the intersection with a warm architectural glow.", + "measured_bbox": [ + 0.6102, + 0.3347, + 0.7867, + 0.5412 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_awning_building_corner.png", + "raw_ref_image": "references/raw_ref_awning_building_corner_attempt_01.png", + "reference_verify": "references/reference_verify_awning_building_corner.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_awning_building_corner_attempt_01.png", + "output": "references/ref_awning_building_corner.png", + "mask": "references/sam_mask_awning_building_corner.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 27.0, + 975.0, + 980.0 + ], + "mask_score": 3.458235, + "mask_area_ratio": 0.594922, + "elapsed_seconds": 7.3072 + } + }, + { + "name": "green_street_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", + "sub_caption": "street sign: A standard green street sign without any readable text.. Scene role: Hanging from a traffic light pole near the intersection.", + "measured_bbox": [ + 0.5754, + 0.1583, + 0.6522, + 0.1884 + ], + "detection_confidence": 100, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_green_street_sign.png", + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "reference_verify": "references/reference_verify_green_street_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "references/ref_green_street_sign.png", + "mask": "references/sam_mask_green_street_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 61.0, + 378.0, + 962.0, + 645.0 + ], + "mask_score": 3.379525, + "mask_area_ratio": 0.536634, + "elapsed_seconds": 7.1734 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..16f95e843fd579324adfbbc29cc6d843ea57d0d2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000006/vocab_task.json @@ -0,0 +1,126 @@ +{ + "task_id": "sample_000006", + "sample_id": "sample_000006", + "sample_index": 6, + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 110679, + "image_id": "CrowdHuman:data/data_47/273278,7956e0000b6bb646.jpg:person:10", + "name": "female athlete", + "description": "Sitting in the front middle, wearing a light blue t-shirt and shorts, with a soccer ball at her feet. Source dataset: CrowdHuman. Scene context: A group portrait of college athletes in uniform, posed against a backdrop of a city skyline at dusk." + }, + { + "candidate_index": 1, + "source_offset": 186464, + "image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "name": "pedestrian", + "description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 174964, + "image_id": "BDD100K:bb1b7e42-9608265e:object:6", + "name": "street sign", + "description": "A vertical 'PARK' sign illuminated on the right side of the street, indicating a parking garage. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a city street with tall buildings on both sides, following a yellow taxi, with other cars parked and driving." + }, + { + "candidate_index": 1, + "source_offset": 168910, + "image_id": "BDD100K:b99f250d-886111c5:object:5", + "name": "vehicle", + "description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals." + }, + { + "candidate_index": 2, + "source_offset": 3756, + "image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", + "name": "trash can", + "description": "dark blue cylindrical bin partially visible in the foreground Source dataset: CrowdHuman. Scene context: A group of children and an adult pose for a photo in front of a roller coaster at an amusement park." + }, + { + "candidate_index": 3, + "source_offset": 31444, + "image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", + "name": "exhibition banner", + "description": "A large, rectangular banner hanging on the building's facade, featuring a maroon background and text. Source dataset: CrowdHuman. Scene context: A large, classical building, likely a museum, with many people sitting and standing on its wide front steps, while a yellow taxi speeds past in the foreground." + }, + { + "candidate_index": 4, + "source_offset": 61592, + "image_id": "CrowdHuman:data/data_4/283991,1ec000a212ec26.jpg:object:6", + "name": "stall sign", + "description": "A bright yellow sign with red text and a drawing of a face. Source dataset: CrowdHuman. Scene context: A bustling night market scene with people walking and looking at stalls, with parked motor scooters in the foreground." + }, + { + "candidate_index": 5, + "source_offset": 148584, + "image_id": "BDD100K:b5047c50-e1facff6:object:2", + "name": "white van", + "description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic." + }, + { + "candidate_index": 6, + "source_offset": 181315, + "image_id": "BDD100K:bc886d37-5b22c313:object:7", + "name": "double solid white line", + "description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car." + }, + { + "candidate_index": 7, + "source_offset": 236118, + "image_id": "BDD100K:c807cb19-7e09cb11:object:8", + "name": "building facade", + "description": "Dark outlines of buildings lining the street on both sides, with some lit windows. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane city street with traffic lights and vehicles ahead." + }, + { + "candidate_index": 8, + "source_offset": 34903, + "image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", + "name": "blooming trees", + "description": "Trees with pink blossoms in the background park area. Source dataset: CrowdHuman. Scene context: People are walking along a sidewalk next to a street lined with trees, some in bloom, with a police officer standing near a parked car." + }, + { + "candidate_index": 9, + "source_offset": 69573, + "image_id": "CrowdHuman:data/data_44/273278,231ab000d9efcb71.jpg:object:1", + "name": "bench", + "description": "A low, dark rectangular seating structure on the floor. Source dataset: CrowdHuman. Scene context: A grand, high-ceilinged indoor train station concourse with a polished tile floor reflecting overhead lights, lined with various small shops, kiosks, and passing pedestrians." + }, + { + "candidate_index": 10, + "source_offset": 198849, + "image_id": "BDD100K:c06d23aa-cb9ae751:object:6", + "name": "building corner", + "description": "The corner of a building on the right side, with an orange or red awning and some lit signs. Source dataset: BDD100K. Scene context: Nighttime driving scene at an intersection with a stop sign and a large black SUV passing on the right." + }, + { + "candidate_index": 11, + "source_offset": 25812, + "image_id": "CrowdHuman:data/data_21/282555,93df2000dd2b5468.jpg:object:5", + "name": "gymnastics floor", + "description": "A light beige, flat, smooth mat covering the competition area. Source dataset: CrowdHuman. Scene context: Five rhythmic gymnasts in matching maroon and pink leotards pose on the floor, holding ribbons and balls, with spectators and a colorful background in the distance." + }, + { + "candidate_index": 12, + "source_offset": 82290, + "image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "name": "street sign", + "description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day." + }, + { + "candidate_index": 13, + "source_offset": 125175, + "image_id": "CrowdHuman:data/data_72/282555,6c6850003beacb74.jpg:object:1", + "name": "paved ground", + "description": "The ground surface made of light-colored, irregularly shaped flat stones or concrete pieces. Source dataset: CrowdHuman. Scene context: A large group of young adults is posing for a group photo in an outdoor paved area, with a massive, intricate steel stadium structure in the background." + } + ], + "rng_seed": 1782556367, + "created_at": 1782292413.3750482 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..860b2872443e7894cb10138ed40ca4bcccfdf340 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bc8b326a1a68cf8c49bd08b6bc9954b3aeed467e2343bbbf3c37399b8f30b7d +size 1384128 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..efa0a61b2887cb1ab073f7cc70f8cb9d743c485a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/compose_prompt.txt @@ -0,0 +1,151 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A typical urban residential and commercial city street during the daytime.", + "activity": "The camera vehicle drives down the street approaching a crosswalk, navigating past parked and moving cars, while a pedestrian walks on the sidewalk next to a pole tied with balloons.", + "composition": "First-person dashcam perspective with the vehicle's dashboard framing the bottom edge. The wide 16:9 aspect ratio captures the full breadth of the street, with multi-story buildings framing the left and right edges, leading to a central vanishing point. Depth is established by foreground curbside objects, midground moving traffic and a crosswalk, and background architecture under a sky crisscrossed with wires.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "woman_in_dark_dress", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "source_name": "bridesmaid", + "description": "A woman with dark hair wearing a dark knee-length dress.", + "role_in_scene": "walking along the left sidewalk, approaching the crosswalk" + } + ], + "objects": [ + { + "name": "dashboard", + "source_index": 0, + "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "source_name": "dashboard", + "description": "The dashboard of the camera vehicle, visible at the bottom of the frame.", + "role_in_scene": "anchoring the bottom foreground of the frame to establish the interior car viewpoint" + }, + { + "name": "overhead_wires", + "source_index": 2, + "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "source_name": "overhead wires", + "description": "Power and communication lines stretching across the sky.", + "role_in_scene": "strung overhead across the sky, connecting the buildings on either side" + }, + { + "name": "bunch_of_balloons", + "source_index": 7, + "source_image_id": "CrowdHuman:data/data_68/273278,d37b500038386e31.jpg:object:0", + "source_name": "bunch of balloons", + "description": "A bunch of heart-shaped balloons, some pink and some red.", + "role_in_scene": "tied to a metal pole on the left sidewalk" + }, + { + "name": "white_garbage_bag", + "source_index": 11, + "source_image_id": "CrowdHuman:data/data_52/283081,13fff00018862889.jpg:object:3", + "source_name": "white garbage bag", + "description": "A large white plastic bag.", + "role_in_scene": "placed on the curb near the crosswalk on the right side" + }, + { + "name": "multi_story_building_left", + "source_index": 12, + "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "source_name": "building on left", + "description": "Multi-story brick buildings with numerous windows and fire escapes.", + "role_in_scene": "forming the street facade along the left side of the frame" + }, + { + "name": "street_light_pole", + "source_index": 13, + "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "source_name": "street light pole", + "description": "A tall, curved metal street light pole.", + "role_in_scene": "standing on the right sidewalk, leaning over the roadway" + }, + { + "name": "white_sedan", + "source_index": 16, + "source_image_id": "BDD100K:c754ce77-a105a975:object:3", + "source_name": "white sedan", + "description": "A white passenger car.", + "role_in_scene": "driving in the forward lane just past the crosswalk" + }, + { + "name": "dark_car_1", + "source_index": 18, + "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "source_name": "dark car", + "description": "A dark-colored sedan.", + "role_in_scene": "driving in the opposing traffic lane to the left" + }, + { + "name": "dark_car_2", + "source_index": 20, + "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "source_name": "dark car 2", + "description": "A dark-colored car.", + "role_in_scene": "parked alongside the curb on the right side of the street" + }, + { + "name": "brick_building_right", + "source_index": 22, + "source_image_id": "BDD100K:c2186a76-5444a563:object:5", + "source_name": "brick building", + "description": "A tall, multi-story red brick building featuring arched windows and a storefront.", + "role_in_scene": "lining the street on the right side of the frame" + }, + { + "name": "metal_pole", + "source_index": 23, + "source_image_id": "BDD100K:c411687d-73471431:object:14", + "source_name": "pole", + "description": "A thin, straight metal pole standing upright.", + "role_in_scene": "standing on the left sidewalk, serving as a mounting point for the balloons" + }, + { + "name": "crosswalk_markings", + "source_index": 24, + "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "source_name": "crosswalk markings", + "description": "White painted lines on the road surface indicating a pedestrian crosswalk.", + "role_in_scene": "painted across the road directly ahead of the camera vehicle" + }, + { + "name": "iron_balcony", + "source_index": 25, + "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "source_name": "balcony", + "description": "A dark, wrought-iron balcony.", + "role_in_scene": "attached to the facade of the multi-story building on the left" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_brick_building_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_brick_building_right.png new file mode 100644 index 0000000000000000000000000000000000000000..ed96cdecdc8a5a173fe851b281d69d4aaa569f4c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_brick_building_right.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b552c44619729088083b3d98162a6802c4054f33064ba36c4e23f4277498e0d +size 393220 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_bunch_of_balloons.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_bunch_of_balloons.png new file mode 100644 index 0000000000000000000000000000000000000000..0661b6a7794f13febd993daea054fdd3878536c2 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_bunch_of_balloons.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_crosswalk_markings.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_crosswalk_markings.png new file mode 100644 index 0000000000000000000000000000000000000000..4e5fd2778c0141f616d9a2b06018027603304706 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_crosswalk_markings.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cf6594ef6f476a965ddd3894bf11f1a4b57c2c509ecb1dd4ae04ed81589188f +size 186401 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dark_car_1.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dark_car_1.png new file mode 100644 index 0000000000000000000000000000000000000000..4bd58e939fb525f14583acad7f151ccd591de998 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dark_car_1.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dark_car_2.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dark_car_2.png new file mode 100644 index 0000000000000000000000000000000000000000..9764907f88c2c9a9d053a638926c2cdcf0c8f577 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dark_car_2.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dashboard.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..c88a22b6742ece58a3ebec4e517ca0cb8691d390 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1abf7a1a29ec938120ae9070bb4c0945ccc64a7c98b82e7b44f6f014535090c +size 264116 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_iron_balcony.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_iron_balcony.png new file mode 100644 index 0000000000000000000000000000000000000000..c6c5c06adda5b649e170deb4d0877465a523d2c5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_iron_balcony.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b552b358bc87743b0d35f10f6158a706b4e65fb77bb98f2675175c1c4d1a894e +size 126847 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_metal_pole.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_metal_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..865cb49a7227b5816340a3c6a6412e310ed52395 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_metal_pole.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_multi_story_building_left.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_multi_story_building_left.png new file mode 100644 index 0000000000000000000000000000000000000000..8e2daeef65d6bf1200d86fc580faf8302f87d357 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_multi_story_building_left.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6193b349fb15bbe59df3825132073256a7bac27b40dc0de4eb7c57f56d636dc +size 413885 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_overhead_wires.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_overhead_wires.png new file mode 100644 index 0000000000000000000000000000000000000000..8652bea16cf2c3690db27954de1acf0cd87a169e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_overhead_wires.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562d9b2b3b1bd2cd3760afa7e56ace6b40af5f5478bd79d3adc5601650e2bd63 +size 597595 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_street_light_pole.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_street_light_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..1d308be1ed117be06b26033e0c43a401cb349ba9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_street_light_pole.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5f3f3685b432517add00d2dae827fb8e5e20626a2edf6b2e34c213f57dc19f6 +size 373676 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_white_garbage_bag.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_white_garbage_bag.png new file mode 100644 index 0000000000000000000000000000000000000000..44ae131629d982d904f559374bd6dd37744c4041 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_white_garbage_bag.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_white_sedan.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_white_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..2df1ee4ec59656045fc136d1df2f39a8802d4c56 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_white_sedan.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_woman_in_dark_dress.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_woman_in_dark_dress.png new file mode 100644 index 0000000000000000000000000000000000000000..99545b9a2e7a6af87bb8fcc43dfa944a1cc130ca Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/detect_refine_woman_in_dark_dress.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_brick_building_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_brick_building_right.png new file mode 100644 index 0000000000000000000000000000000000000000..91e190cd275681f1e74ad310a98e957177597c08 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_brick_building_right.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_bunch_of_balloons.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_bunch_of_balloons.png new file mode 100644 index 0000000000000000000000000000000000000000..a86a41a1079185455f1330c712714b2374ea7c2c Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_bunch_of_balloons.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_crosswalk_markings.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_crosswalk_markings.png new file mode 100644 index 0000000000000000000000000000000000000000..2bfde0bfd2122cdf7b1d6f71e4bd4d06a4ac7c03 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_crosswalk_markings.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dark_car_1.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dark_car_1.png new file mode 100644 index 0000000000000000000000000000000000000000..25c8c24e6d1b537dd9f9fcdf206dda9260a94d0c Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dark_car_1.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dark_car_2.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dark_car_2.png new file mode 100644 index 0000000000000000000000000000000000000000..f83ac3f50abc5822e4cc3468ebc6175a162cd329 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dark_car_2.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dashboard.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..ecc2c0346333ee3e9441ded9baeb9f1cc1dafb81 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4416c92f83fbf66d3c3deb93db15da3d2c575db8cb608f64551bac98958fc171 +size 288317 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_iron_balcony.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_iron_balcony.png new file mode 100644 index 0000000000000000000000000000000000000000..ba3e8fc629bb5b966ea51472ea2d83658c5b9ab3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_iron_balcony.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb50a3873802d306a84430bc31d0bf840b0e4e4569656fe6cfa6d70dcf6f5f58 +size 132524 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_metal_pole.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_metal_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..5cdff22bdef2f7d60709c97e10395bfc8454156b Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_metal_pole.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_multi_story_building_left.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_multi_story_building_left.png new file mode 100644 index 0000000000000000000000000000000000000000..1b4b306f750ddc6c40b5ebba04257f658c79512d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_multi_story_building_left.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:898cddda391f52cbb54026396fef3d11cf99a4c6879efb442d979e7e5181a0b4 +size 522223 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_overhead_wires.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_overhead_wires.png new file mode 100644 index 0000000000000000000000000000000000000000..dfd6016ab4fbb7c495fc3c5deb2b691c578d3e4c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_overhead_wires.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5c2026595798e659c6a86a69fb2e2dd131fbc4f27fd0cb1b22451797937408 +size 689268 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_street_light_pole.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_street_light_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..06caee52a95de1d57a85d645b8b4576e617b2a41 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_street_light_pole.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f681ddf7138b4d42fa047498abcf6b57c24ba9ed022b0d63cd5cb08ef5138c44 +size 440899 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_white_garbage_bag.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_white_garbage_bag.png new file mode 100644 index 0000000000000000000000000000000000000000..36c426b7b5cd8f455d67d591324adcba18792084 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_white_garbage_bag.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_white_sedan.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_white_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..8c176c7f5358559379c46a85ed64dac597832092 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_white_sedan.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_woman_in_dark_dress.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_woman_in_dark_dress.png new file mode 100644 index 0000000000000000000000000000000000000000..b1573f0ad62dba2d82af1984f7bc3b53aa570e0d Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/crops/diversify_input_woman_in_dark_dress.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..16900cd45e95f5b9656470aac101efa2843a5c2d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/detections.json @@ -0,0 +1,268 @@ +[ + { + "name": "woman_in_dark_dress", + "present": true, + "bbox": [ + 0.1595, + 0.5229, + 0.2058, + 0.7308 + ], + "confidence": 100, + "notes": "Woman in dark dress identified successfully.", + "coarse_bbox": [ + 0.158, + 0.524, + 0.206, + 0.731 + ], + "refine_crop": "crops/detect_refine_woman_in_dark_dress.png" + }, + { + "name": "dashboard", + "present": true, + "bbox": [ + 0.0, + 0.7874, + 1.0, + 1.0 + ], + "confidence": 0.99, + "notes": "The large dark gray dashboard structure occupying the bottom portion of the image.", + "coarse_bbox": [ + 0.0, + 0.786, + 0.999, + 0.994 + ], + "refine_crop": "crops/detect_refine_dashboard.png" + }, + { + "name": "overhead_wires", + "present": true, + "bbox": [ + 0.0656, + 0.0, + 0.9992, + 0.338 + ], + "confidence": 0.95, + "notes": "The bounding box covers the extensive network of overhead wires stretching across the scene, including those descending diagonally from the top left and horizontal lines spanning the width.", + "coarse_bbox": [ + 0.144, + 0.03, + 0.921, + 0.339 + ], + "refine_crop": "crops/detect_refine_overhead_wires.png" + }, + { + "name": "bunch_of_balloons", + "present": true, + "bbox": [ + 0.2318, + 0.3806, + 0.2869, + 0.4973 + ], + "confidence": 100, + "notes": "A bunch of heart-shaped balloons, some pink and some red.", + "coarse_bbox": [ + 0.231, + 0.384, + 0.286, + 0.494 + ], + "refine_crop": "crops/detect_refine_bunch_of_balloons.png" + }, + { + "name": "white_garbage_bag", + "present": true, + "bbox": [ + 0.8062, + 0.6476, + 0.8807, + 0.7562 + ], + "confidence": 0.98, + "notes": "Large white garbage bag on the curb.", + "coarse_bbox": [ + 0.804, + 0.65, + 0.881, + 0.754 + ], + "refine_crop": "crops/detect_refine_white_garbage_bag.png" + }, + { + "name": "multi_story_building_left", + "present": true, + "bbox": [ + 0.1156, + 0.0, + 0.416, + 0.6004 + ], + "confidence": 0.95, + "notes": "The large multi-story brick building taking up the majority of the background and the left side of the frame.", + "coarse_bbox": [ + 0.144, + 0.003, + 0.426, + 0.599 + ], + "refine_crop": "crops/detect_refine_multi_story_building_left.png" + }, + { + "name": "street_light_pole", + "present": true, + "bbox": [ + 0.548, + 0.0288, + 0.7884, + 0.7106 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the tall, curved metal street light pole.", + "coarse_bbox": [ + 0.547, + 0.034, + 0.786, + 0.711 + ], + "refine_crop": "crops/detect_refine_street_light_pole.png" + }, + { + "name": "white_sedan", + "present": true, + "bbox": [ + 0.494, + 0.5481, + 0.6384, + 0.6346 + ], + "confidence": 1.0, + "notes": "Tight bounding box around the white passenger car in the foreground.", + "coarse_bbox": [ + 0.493, + 0.548, + 0.638, + 0.636 + ], + "refine_crop": "crops/detect_refine_white_sedan.png" + }, + { + "name": "dark_car_1", + "present": true, + "bbox": [ + 0.3126, + 0.5583, + 0.4593, + 0.6372 + ], + "confidence": 0.98, + "notes": "The dark-colored sedan matches the visual description.", + "coarse_bbox": [ + 0.312, + 0.548, + 0.46, + 0.639 + ], + "refine_crop": "crops/detect_refine_dark_car_1.png" + }, + { + "name": "dark_car_2", + "present": true, + "bbox": [ + 0.7955, + 0.5535, + 0.9254, + 0.6345 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the dark-colored car visible in the image.", + "coarse_bbox": [ + 0.794, + 0.552, + 0.927, + 0.635 + ], + "refine_crop": "crops/detect_refine_dark_car_2.png" + }, + { + "name": "brick_building_right", + "present": true, + "bbox": [ + 0.9184, + 0.0073, + 0.9492, + 0.6625 + ], + "confidence": 0.9, + "notes": "Tight bounding box around the tall, multi-story red brick building lining the right side of the frame.", + "coarse_bbox": [ + 0.628, + 0.0, + 0.92, + 0.603 + ], + "refine_crop": "crops/detect_refine_brick_building_right.png" + }, + { + "name": "metal_pole", + "present": true, + "bbox": [ + 0.215, + 0.375, + 0.23, + 0.734 + ], + "confidence": 0.9, + "notes": "A thin, straight metal pole standing upright.", + "coarse_bbox": [ + 0.215, + 0.375, + 0.23, + 0.734 + ], + "refine_crop": "crops/detect_refine_metal_pole.png" + }, + { + "name": "crosswalk_markings", + "present": true, + "bbox": [ + 0.3161, + 0.6787, + 0.7102, + 0.7212 + ], + "confidence": 0.95, + "notes": "Bounding the main zebra stripes of the crosswalk painted across the road directly ahead.", + "coarse_bbox": [ + 0.188, + 0.63, + 0.77, + 0.772 + ], + "refine_crop": "crops/detect_refine_crosswalk_markings.png" + }, + { + "name": "iron_balcony", + "present": true, + "bbox": [ + 0.0089, + 0.0773, + 0.1849, + 0.3213 + ], + "confidence": 0.95, + "notes": "The dark, wrought-iron balcony (fire escape platform) attached to the facade on the left is clearly visible.", + "coarse_bbox": [ + 0.008, + 0.002, + 0.184, + 0.309 + ], + "refine_crop": "crops/detect_refine_iron_balcony.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..a767dab8ee72fdb69d256791bc46cfccb41a7f45 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0f6ae5837dbed2b4605df8460875d090815b174c620c93da73064a2604262ee +size 1563600 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..c49b3772373436695141af4743e9ec10f3820a37 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/plan.json @@ -0,0 +1,316 @@ +{ + "sample_id": "sample_000007", + "target_total": 14, + "target_people": 1, + "target_objects": 13, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A typical urban residential and commercial city street during the daytime.", + "activity": "The camera vehicle drives down the street approaching a crosswalk, navigating past parked and moving cars, while a pedestrian walks on the sidewalk next to a pole tied with balloons.", + "composition": "First-person dashcam perspective with the vehicle's dashboard framing the bottom edge. The wide 16:9 aspect ratio captures the full breadth of the street, with multi-story buildings framing the left and right edges, leading to a central vanishing point. Depth is established by foreground curbside objects, midground moving traffic and a crosswalk, and background architecture under a sky crisscrossed with wires.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "woman_in_dark_dress", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "source_name": "bridesmaid", + "description": "A woman with dark hair wearing a dark knee-length dress.", + "role_in_scene": "walking along the left sidewalk, approaching the crosswalk" + } + ], + "objects": [ + { + "name": "dashboard", + "source_index": 0, + "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "source_name": "dashboard", + "description": "The dashboard of the camera vehicle, visible at the bottom of the frame.", + "role_in_scene": "anchoring the bottom foreground of the frame to establish the interior car viewpoint" + }, + { + "name": "overhead_wires", + "source_index": 2, + "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "source_name": "overhead wires", + "description": "Power and communication lines stretching across the sky.", + "role_in_scene": "strung overhead across the sky, connecting the buildings on either side" + }, + { + "name": "bunch_of_balloons", + "source_index": 7, + "source_image_id": "CrowdHuman:data/data_68/273278,d37b500038386e31.jpg:object:0", + "source_name": "bunch of balloons", + "description": "A bunch of heart-shaped balloons, some pink and some red.", + "role_in_scene": "tied to a metal pole on the left sidewalk" + }, + { + "name": "white_garbage_bag", + "source_index": 11, + "source_image_id": "CrowdHuman:data/data_52/283081,13fff00018862889.jpg:object:3", + "source_name": "white garbage bag", + "description": "A large white plastic bag.", + "role_in_scene": "placed on the curb near the crosswalk on the right side" + }, + { + "name": "multi_story_building_left", + "source_index": 12, + "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "source_name": "building on left", + "description": "Multi-story brick buildings with numerous windows and fire escapes.", + "role_in_scene": "forming the street facade along the left side of the frame" + }, + { + "name": "street_light_pole", + "source_index": 13, + "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "source_name": "street light pole", + "description": "A tall, curved metal street light pole.", + "role_in_scene": "standing on the right sidewalk, leaning over the roadway" + }, + { + "name": "white_sedan", + "source_index": 16, + "source_image_id": "BDD100K:c754ce77-a105a975:object:3", + "source_name": "white sedan", + "description": "A white passenger car.", + "role_in_scene": "driving in the forward lane just past the crosswalk" + }, + { + "name": "dark_car_1", + "source_index": 18, + "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "source_name": "dark car", + "description": "A dark-colored sedan.", + "role_in_scene": "driving in the opposing traffic lane to the left" + }, + { + "name": "dark_car_2", + "source_index": 20, + "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "source_name": "dark car 2", + "description": "A dark-colored car.", + "role_in_scene": "parked alongside the curb on the right side of the street" + }, + { + "name": "brick_building_right", + "source_index": 22, + "source_image_id": "BDD100K:c2186a76-5444a563:object:5", + "source_name": "brick building", + "description": "A tall, multi-story red brick building featuring arched windows and a storefront.", + "role_in_scene": "lining the street on the right side of the frame" + }, + { + "name": "metal_pole", + "source_index": 23, + "source_image_id": "BDD100K:c411687d-73471431:object:14", + "source_name": "pole", + "description": "A thin, straight metal pole standing upright.", + "role_in_scene": "standing on the left sidewalk, serving as a mounting point for the balloons" + }, + { + "name": "crosswalk_markings", + "source_index": 24, + "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "source_name": "crosswalk markings", + "description": "White painted lines on the road surface indicating a pedestrian crosswalk.", + "role_in_scene": "painted across the road directly ahead of the camera vehicle" + }, + { + "name": "iron_balcony", + "source_index": 25, + "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "source_name": "balcony", + "description": "A dark, wrought-iron balcony.", + "role_in_scene": "attached to the facade of the multi-story building on the left" + } + ] + }, + "expected_subjects": [ + { + "name": "woman_in_dark_dress", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "source_name": "bridesmaid", + "source_description": "A woman with dark hair wearing a dark knee-length dress, walking along the path. Source dataset: CrowdHuman. Scene context: A bride in a white gown and her bridesmaids in dark dresses are walking along a paved path next to a building with stairs, surrounded by trees and a white fence in a sunlit outdoor setting.", + "sub_caption": "bridesmaid: A woman with dark hair wearing a dark knee-length dress.. Scene role: walking along the left sidewalk, approaching the crosswalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "source_name": "dashboard", + "source_description": "The dashboard of the camera vehicle, visible at the bottom of the frame. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching a tunnel or underpass, with buildings on the left and a retaining wall on the right.", + "sub_caption": "dashboard: The dashboard of the camera vehicle, visible at the bottom of the frame.. Scene role: anchoring the bottom foreground of the frame to establish the interior car viewpoint", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "overhead_wires", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "source_name": "overhead wires", + "source_description": "Power and communication lines stretching across the sky above the street. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential street lined with parked cars and houses.", + "sub_caption": "overhead wires: Power and communication lines stretching across the sky.. Scene role: strung overhead across the sky, connecting the buildings on either side", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "bunch_of_balloons", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_68/273278,d37b500038386e31.jpg:object:0", + "source_name": "bunch of balloons", + "source_description": "A bunch of heart-shaped balloons, some pink and some red, tied to a wooden utility pole. Source dataset: CrowdHuman. Scene context: A group of people standing in a circle, holding hands on a street in a residential area.", + "sub_caption": "bunch of balloons: A bunch of heart-shaped balloons, some pink and some red.. Scene role: tied to a metal pole on the left sidewalk", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "white_garbage_bag", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_52/283081,13fff00018862889.jpg:object:3", + "source_name": "white garbage bag", + "source_description": "large white plastic bag on the bottom right corner Source dataset: CrowdHuman. Scene context: A group of fifteen people posing for a photo in front of a colorful graffiti wall.", + "sub_caption": "white garbage bag: A large white plastic bag.. Scene role: placed on the curb near the crosswalk on the right side", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "multi_story_building_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "source_name": "building on left", + "source_description": "Multi-story brick buildings with numerous windows and fire escapes on the left side. Source dataset: BDD100K. Scene context: A narrow city street lined with parked cars on both sides, with residential and commercial buildings featuring fire escapes and awnings, illuminated by sunlight filtering through mature trees.", + "sub_caption": "building on left: Multi-story brick buildings with numerous windows and fire escapes.. Scene role: forming the street facade along the left side of the frame", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_light_pole", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "source_name": "street light pole", + "source_description": "A tall, curved metal street light pole on the right side of the road, supporting the overhead sign. Source dataset: BDD100K. Scene context: A view from a car driving on a multi-lane road bordered by trees, with other vehicles and road signs visible.", + "sub_caption": "street light pole: A tall, curved metal street light pole.. Scene role: standing on the right sidewalk, leaning over the roadway", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "white_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c754ce77-a105a975:object:3", + "source_name": "white sedan", + "source_description": "A white passenger car partially visible in the right lane next to the gold SUV. Source dataset: BDD100K. Scene context: View from inside a car driving in city traffic on a sunny day with multiple vehicles and urban infrastructure visible.", + "sub_caption": "white sedan: A white passenger car.. Scene role: driving in the forward lane just past the crosswalk", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_car_1", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "source_name": "dark car", + "source_description": "A dark-colored sedan visible on the left side of the street, partially obscured by rain. Source dataset: BDD100K. Scene context: View from inside a vehicle through a heavily rain-covered windshield, looking at city traffic and buildings.", + "sub_caption": "dark car: A dark-colored sedan.. Scene role: driving in the opposing traffic lane to the left", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_car_2", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "source_name": "dark car 2", + "source_description": "A dark-colored car parked on the right side of the street, ahead of the other dark car. Source dataset: BDD100K. Scene context: A dashcam view driving down a city street lined with parked cars on both sides and multi-story brick apartment buildings under a partly cloudy sky.", + "sub_caption": "dark car 2: A dark-colored car.. Scene role: parked alongside the curb on the right side of the street", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "brick_building_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c2186a76-5444a563:object:5", + "source_name": "brick building", + "source_description": "A tall, multi-story red brick building on the left side of the street, featuring arched windows and a storefront. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a city street lined with parked cars and multi-story brick buildings.", + "sub_caption": "brick building: A tall, multi-story red brick building featuring arched windows and a storefront.. Scene role: lining the street on the right side of the frame", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "metal_pole", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c411687d-73471431:object:14", + "source_name": "pole", + "source_description": "A thin, straight metal pole standing upright on the sidewalk near the park area on the left. Source dataset: BDD100K. Scene context: A dashcam view looking down a slightly sloped residential city street with cars parked on both sides, trees bordering a park area to the left, and a tall building to the right, under a cloudy, overcast sky.", + "sub_caption": "pole: A thin, straight metal pole standing upright.. Scene role: standing on the left sidewalk, serving as a mounting point for the balloons", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "crosswalk_markings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "source_name": "crosswalk markings", + "source_description": "White painted lines on the road surface indicating a pedestrian crosswalk. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with traffic lights and a crosswalk.", + "sub_caption": "crosswalk markings: White painted lines on the road surface indicating a pedestrian crosswalk.. Scene role: painted across the road directly ahead of the camera vehicle", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "iron_balcony", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "source_name": "balcony", + "source_description": "A dark, wrought-iron balcony on a building. Source dataset: CrowdHuman. Scene context: People walk down a narrow, sunlit street lined with tall buildings.", + "sub_caption": "balcony: A dark, wrought-iron balcony.. Scene role: attached to the facade of the multi-story building on the left", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000007/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references.json new file mode 100644 index 0000000000000000000000000000000000000000..efc1eb9d12248165d5d13a1dccf023250d719207 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references.json @@ -0,0 +1,453 @@ +{ + "references": [ + { + "name": "woman_in_dark_dress", + "ref_image": "references/ref_woman_in_dark_dress.png", + "raw_ref_image": "references/raw_ref_woman_in_dark_dress_attempt_01.png", + "diversify_input": "crops/diversify_input_woman_in_dark_dress.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_woman_in_dark_dress_attempt_01.png", + "output": "references/ref_woman_in_dark_dress.png", + "mask": "references/sam_mask_woman_in_dark_dress.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 339.0, + 6.0, + 680.0, + 1019.0 + ], + "mask_score": 3.415794, + "mask_area_ratio": 0.169896, + "elapsed_seconds": 8.5595 + }, + "reference_verify": "references/reference_verify_woman_in_dark_dress.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dashboard", + "ref_image": "references/ref_dashboard.png", + "raw_ref_image": "references/raw_ref_dashboard_attempt_02.png", + "diversify_input": "crops/diversify_input_dashboard.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dashboard_attempt_02.png", + "output": "references/ref_dashboard.png", + "mask": "references/sam_mask_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 267.0, + 1023.0, + 746.0 + ], + "mask_score": 3.179413, + "mask_area_ratio": 0.226381, + "elapsed_seconds": 8.7463 + }, + "reference_verify": "references/reference_verify_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 2 + }, + { + "name": "overhead_wires", + "ref_image": "references/ref_overhead_wires.png", + "raw_ref_image": "references/raw_ref_overhead_wires_attempt_01.png", + "diversify_input": "crops/diversify_input_overhead_wires.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_overhead_wires_attempt_01.png", + "output": "references/ref_overhead_wires.png", + "mask": "references/sam_mask_overhead_wires.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 152.0, + 1023.0, + 791.0 + ], + "mask_score": 2.72423, + "mask_area_ratio": 0.290783, + "elapsed_seconds": 7.2274 + }, + "reference_verify": "references/reference_verify_overhead_wires.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "bunch_of_balloons", + "ref_image": "references/ref_bunch_of_balloons.png", + "raw_ref_image": "references/raw_ref_bunch_of_balloons_attempt_01.png", + "diversify_input": "crops/diversify_input_bunch_of_balloons.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_bunch_of_balloons_attempt_01.png", + "output": "references/ref_bunch_of_balloons.png", + "mask": "references/sam_mask_bunch_of_balloons.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 213.0, + 104.0, + 830.0, + 1023.0 + ], + "mask_score": 3.440433, + "mask_area_ratio": 0.246776, + "elapsed_seconds": 7.1532 + }, + "reference_verify": "references/reference_verify_bunch_of_balloons.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "white_garbage_bag", + "ref_image": "references/ref_white_garbage_bag.png", + "raw_ref_image": "references/raw_ref_white_garbage_bag_attempt_01.png", + "diversify_input": "crops/diversify_input_white_garbage_bag.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_garbage_bag_attempt_01.png", + "output": "references/ref_white_garbage_bag.png", + "mask": "references/sam_mask_white_garbage_bag.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 77.0, + 963.0, + 989.0 + ], + "mask_score": 3.477571, + "mask_area_ratio": 0.521497, + "elapsed_seconds": 7.4222 + }, + "reference_verify": "references/reference_verify_white_garbage_bag.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "multi_story_building_left", + "ref_image": "references/ref_multi_story_building_left.png", + "raw_ref_image": "references/raw_ref_multi_story_building_left_attempt_01.png", + "diversify_input": "crops/diversify_input_multi_story_building_left.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_multi_story_building_left_attempt_01.png", + "output": "references/ref_multi_story_building_left.png", + "mask": "references/sam_mask_multi_story_building_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 21.0, + 18.0, + 1013.0, + 988.0 + ], + "mask_score": 2.993486, + "mask_area_ratio": 0.685524, + "elapsed_seconds": 7.4739 + }, + "reference_verify": "references/reference_verify_multi_story_building_left.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_light_pole", + "ref_image": "references/ref_street_light_pole.png", + "raw_ref_image": "references/raw_ref_street_light_pole_attempt_01.png", + "diversify_input": "crops/diversify_input_street_light_pole.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_street_light_pole_attempt_01.png", + "output": "references/ref_street_light_pole.png", + "mask": "references/sam_mask_street_light_pole.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 204.0, + 10.0, + 816.0, + 1018.0 + ], + "mask_score": 3.426132, + "mask_area_ratio": 0.025422, + "elapsed_seconds": 7.2131 + }, + "reference_verify": "references/reference_verify_street_light_pole.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "white_sedan", + "ref_image": "references/ref_white_sedan.png", + "raw_ref_image": "references/raw_ref_white_sedan_attempt_01.png", + "diversify_input": "crops/diversify_input_white_sedan.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_sedan_attempt_01.png", + "output": "references/ref_white_sedan.png", + "mask": "references/sam_mask_white_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 10.0, + 331.0, + 1014.0, + 694.0 + ], + "mask_score": 2.789065, + "mask_area_ratio": 0.197716, + "elapsed_seconds": 7.2389 + }, + "reference_verify": "references/reference_verify_white_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dark_car_1", + "ref_image": "references/ref_dark_car_1.png", + "raw_ref_image": "references/raw_ref_dark_car_1_attempt_01.png", + "diversify_input": "crops/diversify_input_dark_car_1.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_1_attempt_01.png", + "output": "references/ref_dark_car_1.png", + "mask": "references/sam_mask_dark_car_1.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 317.0, + 1007.0, + 664.0 + ], + "mask_score": 3.079951, + "mask_area_ratio": 0.171859, + "elapsed_seconds": 7.1713 + }, + "reference_verify": "references/reference_verify_dark_car_1.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dark_car_2", + "ref_image": "references/ref_dark_car_2.png", + "raw_ref_image": "references/raw_ref_dark_car_2_attempt_01.png", + "diversify_input": "crops/diversify_input_dark_car_2.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_2_attempt_01.png", + "output": "references/ref_dark_car_2.png", + "mask": "references/sam_mask_dark_car_2.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 353.0, + 1023.0, + 727.0 + ], + "mask_score": 3.072596, + "mask_area_ratio": 0.191711, + "elapsed_seconds": 7.2503 + }, + "reference_verify": "references/reference_verify_dark_car_2.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "brick_building_right", + "ref_image": "references/ref_brick_building_right.png", + "raw_ref_image": "references/raw_ref_brick_building_right_attempt_01.png", + "diversify_input": "crops/diversify_input_brick_building_right.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_brick_building_right_attempt_01.png", + "output": "references/ref_brick_building_right.png", + "mask": "references/sam_mask_brick_building_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 100.0, + 0.0, + 930.0, + 1023.0 + ], + "mask_score": 2.148493, + "mask_area_ratio": 0.586381, + "elapsed_seconds": 7.383 + }, + "reference_verify": "references/reference_verify_brick_building_right.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "metal_pole", + "ref_image": "references/ref_metal_pole.png", + "raw_ref_image": "references/raw_ref_metal_pole_attempt_01.png", + "diversify_input": "crops/diversify_input_metal_pole.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_metal_pole_attempt_01.png", + "output": "references/ref_metal_pole.png", + "mask": "references/sam_mask_metal_pole.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 443.0, + 15.0, + 574.0, + 1015.0 + ], + "mask_score": 3.415272, + "mask_area_ratio": 0.028519, + "elapsed_seconds": 7.2129 + }, + "reference_verify": "references/reference_verify_metal_pole.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "crosswalk_markings", + "ref_image": "references/ref_crosswalk_markings.png", + "raw_ref_image": "references/raw_ref_crosswalk_markings_attempt_01.png", + "diversify_input": "crops/diversify_input_crosswalk_markings.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_crosswalk_markings_attempt_01.png", + "output": "references/ref_crosswalk_markings.png", + "mask": "references/sam_mask_crosswalk_markings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 104.0, + 1023.0, + 866.0 + ], + "mask_score": 3.308171, + "mask_area_ratio": 0.469022, + "elapsed_seconds": 7.2159 + }, + "reference_verify": "references/reference_verify_crosswalk_markings.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "iron_balcony", + "ref_image": "references/ref_iron_balcony.png", + "raw_ref_image": "references/raw_ref_iron_balcony_attempt_01.png", + "diversify_input": "crops/diversify_input_iron_balcony.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_iron_balcony_attempt_01.png", + "output": "references/ref_iron_balcony.png", + "mask": "references/sam_mask_iron_balcony.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 76.0, + 0.0, + 946.0, + 952.0 + ], + "mask_score": 3.194017, + "mask_area_ratio": 0.432758, + "elapsed_seconds": 7.1961 + }, + "reference_verify": "references/reference_verify_iron_balcony.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_brick_building_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_brick_building_right.png new file mode 100644 index 0000000000000000000000000000000000000000..4eba1d28890bb77ac8390c517dac6dacd2dd0f50 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_brick_building_right.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21dda6f497fa7ca8e73726a79802d1a6297f16c70db1c92594a7e57f92809623 +size 1149514 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_bunch_of_balloons.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_bunch_of_balloons.png new file mode 100644 index 0000000000000000000000000000000000000000..339a56646f3ca1868f46c7aea2196faeaca4b8a6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_bunch_of_balloons.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c630463b069930f50253bd61127b32714be1c772b2f1ba3bdb2bf0213d55a773 +size 514354 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_crosswalk_markings.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_crosswalk_markings.png new file mode 100644 index 0000000000000000000000000000000000000000..31386ef105225bfa1d77e32768e736764f7b6e9b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_crosswalk_markings.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50859f4bc1167737ed4a0ad49726f3e2bd1f9f456d8b7fde2c05f088b9bbad1b +size 215105 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dark_car_1.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dark_car_1.png new file mode 100644 index 0000000000000000000000000000000000000000..8659dc5d032ef7ba87fb4eefeefe3f69cd2b5cb2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dark_car_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:379a34592d97c9acd64d920787cd976214878c809a5f8c632267931194bb2546 +size 334251 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dark_car_2.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dark_car_2.png new file mode 100644 index 0000000000000000000000000000000000000000..93507eb55915c2f160073837fe596006fa2772a2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dark_car_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d58e6bf956427d0b371d1cfb9288ba6e204818dc3770b8f9447496b5ea7fa22 +size 364470 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dashboard.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..145788ff1da75f507af20953bf6e39711ddd9c88 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71c1b2ff25fdfae00427ce2330b53ea055e8eed38099fdfc867eb3fd6b268915 +size 426713 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_iron_balcony.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_iron_balcony.png new file mode 100644 index 0000000000000000000000000000000000000000..81439f4da109784642f4aaadbd87d5d9b3525dc2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_iron_balcony.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46252ea7bbbe56fe928e8d257d7d3960574d5601b2de284d6fcf4847bd0a6720 +size 671603 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_metal_pole.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_metal_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..5ef9e6f4c2015ad521fe9113810cf09e468dd0a4 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_metal_pole.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_multi_story_building_left.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_multi_story_building_left.png new file mode 100644 index 0000000000000000000000000000000000000000..f1776a363843707a2d42e195c79795b84b6a6298 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_multi_story_building_left.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db636f3d3528ff1aab6abcdadaea2d03700fd20233dab2f565907a62d93be6a +size 1439588 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_overhead_wires.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_overhead_wires.png new file mode 100644 index 0000000000000000000000000000000000000000..7d1b532650ed9b343ea768816ab4482ce9733713 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_overhead_wires.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8baacedfc79f635c0671d46f7c956f10476aff221e71605b00c78f3508048f5a +size 496692 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_street_light_pole.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_street_light_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..2c9eddfe82cf8ff2df81fa7955c2426d8e5ce766 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_street_light_pole.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_white_garbage_bag.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_white_garbage_bag.png new file mode 100644 index 0000000000000000000000000000000000000000..1ad1ac79f85b1139f519b5cd1c7a9213d37115f1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_white_garbage_bag.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8aa0841da79ca8ed68e4a346090e3a4c4ce7b8deb66af0fd40287f4f84ce485 +size 760533 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_white_sedan.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_white_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..5f49c7e20aca3b8f72d0d154d5dfa2884a4411d7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_white_sedan.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4b2ecb7eb3c2b2e68971f5d7bb9b0ee7a2bfb7fc370d228dcca75ee66c8b3e +size 347216 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_woman_in_dark_dress.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_woman_in_dark_dress.png new file mode 100644 index 0000000000000000000000000000000000000000..e123a677512a15b31e5d49ac2bb7a204931b71a2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/ref_woman_in_dark_dress.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce901032479ea3e28e3574f66a0c59f791306eebd53ba5e63d333a43e64e3692 +size 305992 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_brick_building_right.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_brick_building_right.json new file mode 100644 index 0000000000000000000000000000000000000000..5e54ae2e9c37c04874b11cba94a7f6ed97845d17 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_brick_building_right.json @@ -0,0 +1,46 @@ +{ + "name": "brick_building_right", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_brick_building_right_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_brick_building_right_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_brick_building_right_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_brick_building_right_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_brick_building_right_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_brick_building_right_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 100.0, + 0.0, + 930.0, + 1023.0 + ], + "mask_score": 2.148493, + "mask_area_ratio": 0.586381, + "elapsed_seconds": 7.383 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a cropped section of a brick building with windows and a storefront awning. As a large environmental feature, a representative crop is acceptable." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_bunch_of_balloons.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_bunch_of_balloons.json new file mode 100644 index 0000000000000000000000000000000000000000..4e2fae67ba3ff6d5aeab4b863f261006fb65e72e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_bunch_of_balloons.json @@ -0,0 +1,46 @@ +{ + "name": "bunch_of_balloons", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_bunch_of_balloons_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_bunch_of_balloons_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_bunch_of_balloons_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_bunch_of_balloons_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_bunch_of_balloons_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_bunch_of_balloons_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 213.0, + 104.0, + 830.0, + 1023.0 + ], + "mask_score": 3.440433, + "mask_area_ratio": 0.246776, + "elapsed_seconds": 7.1532 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The bunch of balloons is clearly visible and isolated on a white background. The strings are slightly cropped at the bottom edge, but this does not affect the usability of the reference image." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_crosswalk_markings.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_crosswalk_markings.json new file mode 100644 index 0000000000000000000000000000000000000000..1e3403cdc423b899577b1687ea50391f63cbd138 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_crosswalk_markings.json @@ -0,0 +1,46 @@ +{ + "name": "crosswalk_markings", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_crosswalk_markings_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_crosswalk_markings_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_crosswalk_markings_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_crosswalk_markings_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_crosswalk_markings_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_crosswalk_markings_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 104.0, + 1023.0, + 866.0 + ], + "mask_score": 3.308171, + "mask_area_ratio": 0.469022, + "elapsed_seconds": 7.2159 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a set of crosswalk markings isolated on a white background. It serves as an acceptable dataset reference for the subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dark_car_1.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dark_car_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f81e5936909d5e6f3aa7791c19266fce8da87ae3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dark_car_1.json @@ -0,0 +1,46 @@ +{ + "name": "dark_car_1", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dark_car_1_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dark_car_1_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dark_car_1_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_1_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_dark_car_1_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_dark_car_1_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 317.0, + 1007.0, + 664.0 + ], + "mask_score": 3.079951, + "mask_area_ratio": 0.171859, + "elapsed_seconds": 7.1713 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated dark-colored sedan on a white background, perfectly suitable as a subject reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dark_car_2.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dark_car_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bf233a9b48bde21e31d17ee3003ee010541334aa --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dark_car_2.json @@ -0,0 +1,46 @@ +{ + "name": "dark_car_2", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dark_car_2_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dark_car_2_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dark_car_2_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_2_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_dark_car_2_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_dark_car_2_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 353.0, + 1023.0, + 727.0 + ], + "mask_score": 3.072596, + "mask_area_ratio": 0.191711, + "elapsed_seconds": 7.2503 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "A complete, side-profile view of a dark car isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dashboard.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dashboard.json new file mode 100644 index 0000000000000000000000000000000000000000..10bb8d807ba18a814707d3bb57ac961c5ada307b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_dashboard.json @@ -0,0 +1,87 @@ +{ + "name": "dashboard", + "passed": true, + "accepted_attempt": 2, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dashboard_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dashboard_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dashboard_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dashboard_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_dashboard_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_dashboard_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 327.0, + 1023.0, + 740.0 + ], + "mask_score": 2.797969, + "mask_area_ratio": 0.161998, + "elapsed_seconds": 7.154 + }, + "verify": { + "passed": false, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [ + "subject is highly fragmented with a large chunk missing where the steering wheel would be" + ], + "notes": "The dashboard has a massive hole cut out of it where the steering wheel was removed during isolation. This leaves the object too fragmented and incomplete to serve as a high quality reference." + } + }, + { + "attempt": 2, + "raw_ref_image": "references/raw_ref_dashboard_attempt_02.png", + "candidate_ref_image": "references/candidate_ref_dashboard_attempt_02.png", + "candidate_sam_mask": "references/candidate_sam_mask_dashboard_attempt_02.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dashboard_attempt_02.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_dashboard_attempt_02.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_dashboard_attempt_02.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 267.0, + 1023.0, + 746.0 + ], + "mask_score": 3.179413, + "mask_area_ratio": 0.226381, + "elapsed_seconds": 8.7463 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Car dashboard assembly successfully isolated on a white background. Minor cropping on the A-pillars is acceptable." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_iron_balcony.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_iron_balcony.json new file mode 100644 index 0000000000000000000000000000000000000000..0b9832c99f1faa1e6632fdd2ca402f956674984e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_iron_balcony.json @@ -0,0 +1,46 @@ +{ + "name": "iron_balcony", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_iron_balcony_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_iron_balcony_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_iron_balcony_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_iron_balcony_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_iron_balcony_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_iron_balcony_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 76.0, + 0.0, + 946.0, + 952.0 + ], + "mask_score": 3.194017, + "mask_area_ratio": 0.432758, + "elapsed_seconds": 7.1961 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a dark wrought-iron balcony or fire escape landing on a white background. The attached stairs and ladder are cropped at the image boundaries, but the main balcony structure is highly recognizable and serves as a good reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_metal_pole.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_metal_pole.json new file mode 100644 index 0000000000000000000000000000000000000000..a39f68161b9a7283bedc495d5978f6a86254b0cc --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_metal_pole.json @@ -0,0 +1,46 @@ +{ + "name": "metal_pole", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_metal_pole_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_metal_pole_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_metal_pole_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_metal_pole_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_metal_pole_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_metal_pole_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 443.0, + 15.0, + 574.0, + 1015.0 + ], + "mask_score": 3.415272, + "mask_area_ratio": 0.028519, + "elapsed_seconds": 7.2129 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The metal pole is clearly visible, complete, isolated on a white background, and satisfies all requirements for a non-person subject reference image." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_multi_story_building_left.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_multi_story_building_left.json new file mode 100644 index 0000000000000000000000000000000000000000..52abb63e50fa87583aed7e415cbaaeb89ff91b20 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_multi_story_building_left.json @@ -0,0 +1,46 @@ +{ + "name": "multi_story_building_left", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_multi_story_building_left_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_multi_story_building_left_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_multi_story_building_left_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_multi_story_building_left_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_multi_story_building_left_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_multi_story_building_left_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 21.0, + 18.0, + 1013.0, + 988.0 + ], + "mask_score": 2.993486, + "mask_area_ratio": 0.685524, + "elapsed_seconds": 7.4739 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The multi-story brick building is perfectly isolated, fully visible, and not cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_overhead_wires.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_overhead_wires.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d76cd09347ed3e16e6327c2e8985a49a841285 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_overhead_wires.json @@ -0,0 +1,46 @@ +{ + "name": "overhead_wires", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_overhead_wires_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_overhead_wires_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_overhead_wires_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_overhead_wires_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_overhead_wires_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_overhead_wires_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 152.0, + 1023.0, + 791.0 + ], + "mask_score": 2.72423, + "mask_area_ratio": 0.290783, + "elapsed_seconds": 7.2274 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows overhead wires on a white background. Since it's an extensive environmental feature, a representative crop like this is fully acceptable and recognizable." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_street_light_pole.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_street_light_pole.json new file mode 100644 index 0000000000000000000000000000000000000000..2b903e228cadbe65a7163625b310d80a42200574 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_street_light_pole.json @@ -0,0 +1,46 @@ +{ + "name": "street_light_pole", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_light_pole_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_light_pole_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_light_pole_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_street_light_pole_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_street_light_pole_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_street_light_pole_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 204.0, + 10.0, + 816.0, + 1018.0 + ], + "mask_score": 3.426132, + "mask_area_ratio": 0.025422, + "elapsed_seconds": 7.2131 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated street light pole on a white background, serving as a good reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_white_garbage_bag.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_white_garbage_bag.json new file mode 100644 index 0000000000000000000000000000000000000000..39f959821b4e43ff8d605f26736f547c1b7ca321 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_white_garbage_bag.json @@ -0,0 +1,46 @@ +{ + "name": "white_garbage_bag", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_white_garbage_bag_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_white_garbage_bag_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_white_garbage_bag_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_garbage_bag_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_white_garbage_bag_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_white_garbage_bag_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 77.0, + 963.0, + 989.0 + ], + "mask_score": 3.477571, + "mask_area_ratio": 0.521497, + "elapsed_seconds": 7.4222 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The white garbage bag is fully visible, isolated, complete, and on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_white_sedan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_white_sedan.json new file mode 100644 index 0000000000000000000000000000000000000000..00f1844079fb91bba07099e0304547a065fbb0d5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_white_sedan.json @@ -0,0 +1,46 @@ +{ + "name": "white_sedan", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_white_sedan_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_white_sedan_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_white_sedan_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_sedan_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_white_sedan_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_white_sedan_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 10.0, + 331.0, + 1014.0, + 694.0 + ], + "mask_score": 2.789065, + "mask_area_ratio": 0.197716, + "elapsed_seconds": 7.2389 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete white sedan isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_woman_in_dark_dress.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_woman_in_dark_dress.json new file mode 100644 index 0000000000000000000000000000000000000000..dc72567069bffda897c90a8a824efd73d73ba8b3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/reference_verify_woman_in_dark_dress.json @@ -0,0 +1,46 @@ +{ + "name": "woman_in_dark_dress", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_woman_in_dark_dress_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_woman_in_dark_dress_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_woman_in_dark_dress_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_woman_in_dark_dress_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_ref_woman_in_dark_dress_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/candidate_sam_mask_woman_in_dark_dress_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 339.0, + 6.0, + 680.0, + 1019.0 + ], + "mask_score": 3.415794, + "mask_area_ratio": 0.169896, + "elapsed_seconds": 8.5595 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a single person with a complete white background. There are no truncated body parts, satisfying all the given hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_brick_building_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_brick_building_right.png new file mode 100644 index 0000000000000000000000000000000000000000..c102242acff3622eb4b7710535c08ce936cf1e51 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_brick_building_right.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_bunch_of_balloons.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_bunch_of_balloons.png new file mode 100644 index 0000000000000000000000000000000000000000..215ebc7c6ac84c8d97d4729972be4dd8ca6aecd0 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_bunch_of_balloons.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_crosswalk_markings.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_crosswalk_markings.png new file mode 100644 index 0000000000000000000000000000000000000000..060b6495976d52189375fe330cf6a297f1978f36 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_crosswalk_markings.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dark_car_1.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dark_car_1.png new file mode 100644 index 0000000000000000000000000000000000000000..d0510918d16576e7513320e94c8bd3382758bf9f Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dark_car_1.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dark_car_2.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dark_car_2.png new file mode 100644 index 0000000000000000000000000000000000000000..28020557fc7d0f73bd4e63ff80e207f84a87061e Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dark_car_2.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dashboard.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..bca09692f789230881d63832c73d22e83f26a039 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_dashboard.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_iron_balcony.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_iron_balcony.png new file mode 100644 index 0000000000000000000000000000000000000000..6f2af43d0bfc1c35955e7a59f17fcdeebd55f8a6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_iron_balcony.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_metal_pole.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_metal_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..bd717ad33c85083c12e52f4e7fbb8b0b8b9618dd Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_metal_pole.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_multi_story_building_left.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_multi_story_building_left.png new file mode 100644 index 0000000000000000000000000000000000000000..49414abd362e0ae3b2cf0f3c4a1f04a64143125a Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_multi_story_building_left.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_overhead_wires.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_overhead_wires.png new file mode 100644 index 0000000000000000000000000000000000000000..1dad6285d517aabd410123920d84996206d70963 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_overhead_wires.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_street_light_pole.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_street_light_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..66b3d25aad1d7ea53a8e12577ee208f1afe0b201 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_street_light_pole.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_white_garbage_bag.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_white_garbage_bag.png new file mode 100644 index 0000000000000000000000000000000000000000..6ae92626c5c713e4b4cab6afeb22fbef02d2518f Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_white_garbage_bag.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_white_sedan.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_white_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..30825e0ffb2d79b24bcccaa0bc85393da5be7968 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_white_sedan.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_woman_in_dark_dress.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_woman_in_dark_dress.png new file mode 100644 index 0000000000000000000000000000000000000000..8d925e84dc23cdca324d833825c20732fefc1afa Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/references/sam_mask_woman_in_dark_dress.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/row.json new file mode 100644 index 0000000000000000000000000000000000000000..89c862dafae1332c87e6e6ea153e68dd4f6e418e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/row.json @@ -0,0 +1,670 @@ +{ + "sample_id": "sample_000007", + "target_total": 14, + "target_people": 1, + "target_objects": 13, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 14, + "n_detected": 14, + "n_subjects": 14, + "subjects": [ + { + "name": "woman_in_dark_dress", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "source_name": "bridesmaid", + "source_description": "A woman with dark hair wearing a dark knee-length dress, walking along the path. Source dataset: CrowdHuman. Scene context: A bride in a white gown and her bridesmaids in dark dresses are walking along a paved path next to a building with stairs, surrounded by trees and a white fence in a sunlit outdoor setting.", + "sub_caption": "bridesmaid: A woman with dark hair wearing a dark knee-length dress.. Scene role: walking along the left sidewalk, approaching the crosswalk", + "measured_bbox": [ + 0.1595, + 0.5229, + 0.2058, + 0.7308 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_woman_in_dark_dress.png", + "raw_ref_image": "references/raw_ref_woman_in_dark_dress_attempt_01.png", + "reference_verify": "references/reference_verify_woman_in_dark_dress.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_woman_in_dark_dress_attempt_01.png", + "output": "references/ref_woman_in_dark_dress.png", + "mask": "references/sam_mask_woman_in_dark_dress.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 339.0, + 6.0, + 680.0, + 1019.0 + ], + "mask_score": 3.415794, + "mask_area_ratio": 0.169896, + "elapsed_seconds": 8.5595 + } + }, + { + "name": "dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "source_name": "dashboard", + "source_description": "The dashboard of the camera vehicle, visible at the bottom of the frame. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching a tunnel or underpass, with buildings on the left and a retaining wall on the right.", + "sub_caption": "dashboard: The dashboard of the camera vehicle, visible at the bottom of the frame.. Scene role: anchoring the bottom foreground of the frame to establish the interior car viewpoint", + "measured_bbox": [ + 0.0, + 0.7874, + 1.0, + 1.0 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dashboard.png", + "raw_ref_image": "references/raw_ref_dashboard_attempt_02.png", + "reference_verify": "references/reference_verify_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dashboard_attempt_02.png", + "output": "references/ref_dashboard.png", + "mask": "references/sam_mask_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 267.0, + 1023.0, + 746.0 + ], + "mask_score": 3.179413, + "mask_area_ratio": 0.226381, + "elapsed_seconds": 8.7463 + } + }, + { + "name": "overhead_wires", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "source_name": "overhead wires", + "source_description": "Power and communication lines stretching across the sky above the street. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential street lined with parked cars and houses.", + "sub_caption": "overhead wires: Power and communication lines stretching across the sky.. Scene role: strung overhead across the sky, connecting the buildings on either side", + "measured_bbox": [ + 0.0656, + 0.0, + 0.9992, + 0.338 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overhead_wires.png", + "raw_ref_image": "references/raw_ref_overhead_wires_attempt_01.png", + "reference_verify": "references/reference_verify_overhead_wires.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_overhead_wires_attempt_01.png", + "output": "references/ref_overhead_wires.png", + "mask": "references/sam_mask_overhead_wires.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 152.0, + 1023.0, + 791.0 + ], + "mask_score": 2.72423, + "mask_area_ratio": 0.290783, + "elapsed_seconds": 7.2274 + } + }, + { + "name": "bunch_of_balloons", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_68/273278,d37b500038386e31.jpg:object:0", + "source_name": "bunch of balloons", + "source_description": "A bunch of heart-shaped balloons, some pink and some red, tied to a wooden utility pole. Source dataset: CrowdHuman. Scene context: A group of people standing in a circle, holding hands on a street in a residential area.", + "sub_caption": "bunch of balloons: A bunch of heart-shaped balloons, some pink and some red.. Scene role: tied to a metal pole on the left sidewalk", + "measured_bbox": [ + 0.2318, + 0.3806, + 0.2869, + 0.4973 + ], + "detection_confidence": 100, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_bunch_of_balloons.png", + "raw_ref_image": "references/raw_ref_bunch_of_balloons_attempt_01.png", + "reference_verify": "references/reference_verify_bunch_of_balloons.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_bunch_of_balloons_attempt_01.png", + "output": "references/ref_bunch_of_balloons.png", + "mask": "references/sam_mask_bunch_of_balloons.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 213.0, + 104.0, + 830.0, + 1023.0 + ], + "mask_score": 3.440433, + "mask_area_ratio": 0.246776, + "elapsed_seconds": 7.1532 + } + }, + { + "name": "white_garbage_bag", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_52/283081,13fff00018862889.jpg:object:3", + "source_name": "white garbage bag", + "source_description": "large white plastic bag on the bottom right corner Source dataset: CrowdHuman. Scene context: A group of fifteen people posing for a photo in front of a colorful graffiti wall.", + "sub_caption": "white garbage bag: A large white plastic bag.. Scene role: placed on the curb near the crosswalk on the right side", + "measured_bbox": [ + 0.8062, + 0.6476, + 0.8807, + 0.7562 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_garbage_bag.png", + "raw_ref_image": "references/raw_ref_white_garbage_bag_attempt_01.png", + "reference_verify": "references/reference_verify_white_garbage_bag.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_garbage_bag_attempt_01.png", + "output": "references/ref_white_garbage_bag.png", + "mask": "references/sam_mask_white_garbage_bag.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 77.0, + 963.0, + 989.0 + ], + "mask_score": 3.477571, + "mask_area_ratio": 0.521497, + "elapsed_seconds": 7.4222 + } + }, + { + "name": "multi_story_building_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "source_name": "building on left", + "source_description": "Multi-story brick buildings with numerous windows and fire escapes on the left side. Source dataset: BDD100K. Scene context: A narrow city street lined with parked cars on both sides, with residential and commercial buildings featuring fire escapes and awnings, illuminated by sunlight filtering through mature trees.", + "sub_caption": "building on left: Multi-story brick buildings with numerous windows and fire escapes.. Scene role: forming the street facade along the left side of the frame", + "measured_bbox": [ + 0.1156, + 0.0, + 0.416, + 0.6004 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_multi_story_building_left.png", + "raw_ref_image": "references/raw_ref_multi_story_building_left_attempt_01.png", + "reference_verify": "references/reference_verify_multi_story_building_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_multi_story_building_left_attempt_01.png", + "output": "references/ref_multi_story_building_left.png", + "mask": "references/sam_mask_multi_story_building_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 21.0, + 18.0, + 1013.0, + 988.0 + ], + "mask_score": 2.993486, + "mask_area_ratio": 0.685524, + "elapsed_seconds": 7.4739 + } + }, + { + "name": "street_light_pole", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "source_name": "street light pole", + "source_description": "A tall, curved metal street light pole on the right side of the road, supporting the overhead sign. Source dataset: BDD100K. Scene context: A view from a car driving on a multi-lane road bordered by trees, with other vehicles and road signs visible.", + "sub_caption": "street light pole: A tall, curved metal street light pole.. Scene role: standing on the right sidewalk, leaning over the roadway", + "measured_bbox": [ + 0.548, + 0.0288, + 0.7884, + 0.7106 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light_pole.png", + "raw_ref_image": "references/raw_ref_street_light_pole_attempt_01.png", + "reference_verify": "references/reference_verify_street_light_pole.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_street_light_pole_attempt_01.png", + "output": "references/ref_street_light_pole.png", + "mask": "references/sam_mask_street_light_pole.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 204.0, + 10.0, + 816.0, + 1018.0 + ], + "mask_score": 3.426132, + "mask_area_ratio": 0.025422, + "elapsed_seconds": 7.2131 + } + }, + { + "name": "white_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c754ce77-a105a975:object:3", + "source_name": "white sedan", + "source_description": "A white passenger car partially visible in the right lane next to the gold SUV. Source dataset: BDD100K. Scene context: View from inside a car driving in city traffic on a sunny day with multiple vehicles and urban infrastructure visible.", + "sub_caption": "white sedan: A white passenger car.. Scene role: driving in the forward lane just past the crosswalk", + "measured_bbox": [ + 0.494, + 0.5481, + 0.6384, + 0.6346 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_sedan.png", + "raw_ref_image": "references/raw_ref_white_sedan_attempt_01.png", + "reference_verify": "references/reference_verify_white_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_white_sedan_attempt_01.png", + "output": "references/ref_white_sedan.png", + "mask": "references/sam_mask_white_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 10.0, + 331.0, + 1014.0, + 694.0 + ], + "mask_score": 2.789065, + "mask_area_ratio": 0.197716, + "elapsed_seconds": 7.2389 + } + }, + { + "name": "dark_car_1", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "source_name": "dark car", + "source_description": "A dark-colored sedan visible on the left side of the street, partially obscured by rain. Source dataset: BDD100K. Scene context: View from inside a vehicle through a heavily rain-covered windshield, looking at city traffic and buildings.", + "sub_caption": "dark car: A dark-colored sedan.. Scene role: driving in the opposing traffic lane to the left", + "measured_bbox": [ + 0.3126, + 0.5583, + 0.4593, + 0.6372 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_car_1.png", + "raw_ref_image": "references/raw_ref_dark_car_1_attempt_01.png", + "reference_verify": "references/reference_verify_dark_car_1.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_1_attempt_01.png", + "output": "references/ref_dark_car_1.png", + "mask": "references/sam_mask_dark_car_1.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 317.0, + 1007.0, + 664.0 + ], + "mask_score": 3.079951, + "mask_area_ratio": 0.171859, + "elapsed_seconds": 7.1713 + } + }, + { + "name": "dark_car_2", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "source_name": "dark car 2", + "source_description": "A dark-colored car parked on the right side of the street, ahead of the other dark car. Source dataset: BDD100K. Scene context: A dashcam view driving down a city street lined with parked cars on both sides and multi-story brick apartment buildings under a partly cloudy sky.", + "sub_caption": "dark car 2: A dark-colored car.. Scene role: parked alongside the curb on the right side of the street", + "measured_bbox": [ + 0.7955, + 0.5535, + 0.9254, + 0.6345 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_car_2.png", + "raw_ref_image": "references/raw_ref_dark_car_2_attempt_01.png", + "reference_verify": "references/reference_verify_dark_car_2.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_dark_car_2_attempt_01.png", + "output": "references/ref_dark_car_2.png", + "mask": "references/sam_mask_dark_car_2.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 353.0, + 1023.0, + 727.0 + ], + "mask_score": 3.072596, + "mask_area_ratio": 0.191711, + "elapsed_seconds": 7.2503 + } + }, + { + "name": "brick_building_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c2186a76-5444a563:object:5", + "source_name": "brick building", + "source_description": "A tall, multi-story red brick building on the left side of the street, featuring arched windows and a storefront. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a city street lined with parked cars and multi-story brick buildings.", + "sub_caption": "brick building: A tall, multi-story red brick building featuring arched windows and a storefront.. Scene role: lining the street on the right side of the frame", + "measured_bbox": [ + 0.9184, + 0.0073, + 0.9492, + 0.6625 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_brick_building_right.png", + "raw_ref_image": "references/raw_ref_brick_building_right_attempt_01.png", + "reference_verify": "references/reference_verify_brick_building_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_brick_building_right_attempt_01.png", + "output": "references/ref_brick_building_right.png", + "mask": "references/sam_mask_brick_building_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 100.0, + 0.0, + 930.0, + 1023.0 + ], + "mask_score": 2.148493, + "mask_area_ratio": 0.586381, + "elapsed_seconds": 7.383 + } + }, + { + "name": "metal_pole", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c411687d-73471431:object:14", + "source_name": "pole", + "source_description": "A thin, straight metal pole standing upright on the sidewalk near the park area on the left. Source dataset: BDD100K. Scene context: A dashcam view looking down a slightly sloped residential city street with cars parked on both sides, trees bordering a park area to the left, and a tall building to the right, under a cloudy, overcast sky.", + "sub_caption": "pole: A thin, straight metal pole standing upright.. Scene role: standing on the left sidewalk, serving as a mounting point for the balloons", + "measured_bbox": [ + 0.215, + 0.375, + 0.23, + 0.734 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_pole.png", + "raw_ref_image": "references/raw_ref_metal_pole_attempt_01.png", + "reference_verify": "references/reference_verify_metal_pole.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_metal_pole_attempt_01.png", + "output": "references/ref_metal_pole.png", + "mask": "references/sam_mask_metal_pole.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 443.0, + 15.0, + 574.0, + 1015.0 + ], + "mask_score": 3.415272, + "mask_area_ratio": 0.028519, + "elapsed_seconds": 7.2129 + } + }, + { + "name": "crosswalk_markings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "source_name": "crosswalk markings", + "source_description": "White painted lines on the road surface indicating a pedestrian crosswalk. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with traffic lights and a crosswalk.", + "sub_caption": "crosswalk markings: White painted lines on the road surface indicating a pedestrian crosswalk.. Scene role: painted across the road directly ahead of the camera vehicle", + "measured_bbox": [ + 0.3161, + 0.6787, + 0.7102, + 0.7212 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_crosswalk_markings.png", + "raw_ref_image": "references/raw_ref_crosswalk_markings_attempt_01.png", + "reference_verify": "references/reference_verify_crosswalk_markings.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_crosswalk_markings_attempt_01.png", + "output": "references/ref_crosswalk_markings.png", + "mask": "references/sam_mask_crosswalk_markings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 104.0, + 1023.0, + 866.0 + ], + "mask_score": 3.308171, + "mask_area_ratio": 0.469022, + "elapsed_seconds": 7.2159 + } + }, + { + "name": "iron_balcony", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "source_name": "balcony", + "source_description": "A dark, wrought-iron balcony on a building. Source dataset: CrowdHuman. Scene context: People walk down a narrow, sunlit street lined with tall buildings.", + "sub_caption": "balcony: A dark, wrought-iron balcony.. Scene role: attached to the facade of the multi-story building on the left", + "measured_bbox": [ + 0.0089, + 0.0773, + 0.1849, + 0.3213 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_iron_balcony.png", + "raw_ref_image": "references/raw_ref_iron_balcony_attempt_01.png", + "reference_verify": "references/reference_verify_iron_balcony.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000007/references/raw_ref_iron_balcony_attempt_01.png", + "output": "references/ref_iron_balcony.png", + "mask": "references/sam_mask_iron_balcony.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 76.0, + 0.0, + 946.0, + 952.0 + ], + "mask_score": 3.194017, + "mask_area_ratio": 0.432758, + "elapsed_seconds": 7.1961 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..5e61c5142c437b7c791e01ef1e4513a45aae2a18 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000007/vocab_task.json @@ -0,0 +1,210 @@ +{ + "task_id": "sample_000007", + "sample_id": "sample_000007", + "sample_index": 7, + "target_total": 14, + "target_people": 1, + "target_objects": 13, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 43568, + "image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "name": "bridesmaid", + "description": "A woman with dark hair wearing a dark knee-length dress, walking along the path. Source dataset: CrowdHuman. Scene context: A bride in a white gown and her bridesmaids in dark dresses are walking along a paved path next to a building with stairs, surrounded by trees and a white fence in a sunlit outdoor setting." + }, + { + "candidate_index": 1, + "source_offset": 109525, + "image_id": "CrowdHuman:data/data_47/273275,11e9620009f0fe3f7.jpg:person:5", + "name": "hiker in light shirt", + "description": "A hiker partially visible wearing a light-colored long-sleeved shirt. Source dataset: CrowdHuman. Scene context: A group of hikers navigates a rocky, forested area with yellow wildflowers." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 226933, + "image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "name": "dashboard", + "description": "The dashboard of the camera vehicle, visible at the bottom of the frame. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching a tunnel or underpass, with buildings on the left and a retaining wall on the right." + }, + { + "candidate_index": 1, + "source_offset": 236099, + "image_id": "BDD100K:c807d32d-e5383e74:object:6", + "name": "airport terminal building", + "description": "Large modern building on the right with a curved, overhanging roof and tall dark glass windows. Source dataset: BDD100K. Scene context: View from a vehicle driving slowly past an airport terminal with a line of metal bollards separating the roadway from the sidewalk." + }, + { + "candidate_index": 2, + "source_offset": 181248, + "image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "name": "overhead wires", + "description": "Power and communication lines stretching across the sky above the street. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential street lined with parked cars and houses." + }, + { + "candidate_index": 3, + "source_offset": 211638, + "image_id": "BDD100K:c2f92f94-43481d10:object:5", + "name": "building lights", + "description": "Illuminated signs and windows from buildings lining the right side of the street. Source dataset: BDD100K. Scene context: A dark, empty street at night viewed from inside a car, illuminated sparsely by streetlights and vehicle headlights." + }, + { + "candidate_index": 4, + "source_offset": 121991, + "image_id": "CrowdHuman:data/data_7/282555,1205160005cee3251.jpg:object:1", + "name": "stone gallery", + "description": "A long stone building extending along the left side, characterized by a dark, intricately carved exterior and a continuous wall with multiple large windows containing densely packed, turned stone pillars. Source dataset: CrowdHuman. Scene context: Tourists explore a large ancient stone temple complex featuring a prominent tiered tower and long columned galleries under a partly cloudy sky." + }, + { + "candidate_index": 5, + "source_offset": 131395, + "image_id": "CrowdHuman:data/data_9/273275,7ed7b0000bbb63c7.jpg:object:2", + "name": "sunglasses", + "description": "Dark sunglasses worn by the woman in the gray shirt. Source dataset: CrowdHuman. Scene context: A group of nine people, including adults and children, pose for a photo outdoors in front of a large, old tree." + }, + { + "candidate_index": 6, + "source_offset": 19389, + "image_id": "CrowdHuman:data/data_19/273271,2d803000c90cec0a.jpg:object:3", + "name": "motorized wheelchair", + "description": "green motorized wheelchair with a black metal cage-like guard attached to the front. Source dataset: CrowdHuman. Scene context: Several individuals in specialized motorized wheelchairs are playing power soccer on an outdoor court." + }, + { + "candidate_index": 7, + "source_offset": 116248, + "image_id": "CrowdHuman:data/data_68/273278,d37b500038386e31.jpg:object:0", + "name": "bunch of balloons", + "description": "A bunch of heart-shaped balloons, some pink and some red, tied to a wooden utility pole. Source dataset: CrowdHuman. Scene context: A group of people standing in a circle, holding hands on a street in a residential area." + }, + { + "candidate_index": 8, + "source_offset": 51323, + "image_id": "CrowdHuman:data/data_33/273275,a22dc000cd21038b.jpg:object:29", + "name": "wooden post", + "description": "A wooden post supporting the string lights or part of a fence. Source dataset: CrowdHuman. Scene context: A large group of people gathered outdoors on a paved area, waving and posing for a picture, with a small pond, bridge, and picnic tables in the background." + }, + { + "candidate_index": 9, + "source_offset": 90451, + "image_id": "CrowdHuman:data/data_55/273275,1dfaa0001f3d5a76.jpg:object:11", + "name": "flag", + "description": "the French national flag flying atop a building Source dataset: CrowdHuman. Scene context: A view of Paris with an equestrian statue in the foreground and the Eiffel Tower in the distance under a cloudy sky with sun rays." + }, + { + "candidate_index": 10, + "source_offset": 42073, + "image_id": "CrowdHuman:data/data_3/282555,5e3b40007052b80f.jpg:object:33", + "name": "black garment", + "description": "A black garment hanging on a rack. Source dataset: CrowdHuman. Scene context: A group of people standing in a line inside a brightly lit retail clothing store, with a mother carrying a baby in the foreground." + }, + { + "candidate_index": 11, + "source_offset": 86280, + "image_id": "CrowdHuman:data/data_52/283081,13fff00018862889.jpg:object:3", + "name": "white garbage bag", + "description": "large white plastic bag on the bottom right corner Source dataset: CrowdHuman. Scene context: A group of fifteen people posing for a photo in front of a colorful graffiti wall." + }, + { + "candidate_index": 12, + "source_offset": 169799, + "image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "name": "building on left", + "description": "Multi-story brick buildings with numerous windows and fire escapes on the left side. Source dataset: BDD100K. Scene context: A narrow city street lined with parked cars on both sides, with residential and commercial buildings featuring fire escapes and awnings, illuminated by sunlight filtering through mature trees." + }, + { + "candidate_index": 13, + "source_offset": 217845, + "image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "name": "street light pole", + "description": "A tall, curved metal street light pole on the right side of the road, supporting the overhead sign. Source dataset: BDD100K. Scene context: A view from a car driving on a multi-lane road bordered by trees, with other vehicles and road signs visible." + }, + { + "candidate_index": 14, + "source_offset": 74994, + "image_id": "CrowdHuman:data/data_47/273278,13690500030bdbb93.jpg:object:1", + "name": "illuminated store sign", + "description": "Vertical, brightly lit signs in various colors like red, blue, purple, and white, with Korean and English text attached to building facades. Source dataset: CrowdHuman. Scene context: A crowded, brightly lit shopping street at night filled with pedestrians and lined with numerous colorful illuminated store signs and street stalls." + }, + { + "candidate_index": 15, + "source_offset": 7349, + "image_id": "CrowdHuman:data/data_12/273275,76ecb000e38a985e.jpg:object:6", + "name": "painting 4", + "description": "A rectangular framed painting near the doorway on the red wall. Source dataset: CrowdHuman. Scene context: A large group of tourists visits an ornate museum gallery filled with classic paintings and large chandeliers." + }, + { + "candidate_index": 16, + "source_offset": 233025, + "image_id": "BDD100K:c754ce77-a105a975:object:3", + "name": "white sedan", + "description": "A white passenger car partially visible in the right lane next to the gold SUV. Source dataset: BDD100K. Scene context: View from inside a car driving in city traffic on a sunny day with multiple vehicles and urban infrastructure visible." + }, + { + "candidate_index": 17, + "source_offset": 23374, + "image_id": "CrowdHuman:data/data_20/273275,8c907000fceb0e02.jpg:object:0", + "name": "stone steps", + "description": "wide, light-colored stone steps where the group is seated Source dataset: CrowdHuman. Scene context: A group of people sitting on outdoor steps listening to a man with a long white beard dressed in a dark robe." + }, + { + "candidate_index": 18, + "source_offset": 178939, + "image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "name": "dark car", + "description": "A dark-colored sedan visible on the left side of the street, partially obscured by rain. Source dataset: BDD100K. Scene context: View from inside a vehicle through a heavily rain-covered windshield, looking at city traffic and buildings." + }, + { + "candidate_index": 19, + "source_offset": 153727, + "image_id": "BDD100K:b61f19ba-2f34ba9f:object:17", + "name": "skyline", + "description": "Silhouettes of buildings visible in the distance against the twilight sky. Source dataset: BDD100K. Scene context: A street scene at dusk with cars stopped at a red traffic light, snow on the ground, and various commercial buildings alongside the road." + }, + { + "candidate_index": 20, + "source_offset": 174624, + "image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "name": "dark car 2", + "description": "A dark-colored car parked on the right side of the street, ahead of the other dark car. Source dataset: BDD100K. Scene context: A dashcam view driving down a city street lined with parked cars on both sides and multi-story brick apartment buildings under a partly cloudy sky." + }, + { + "candidate_index": 21, + "source_offset": 112956, + "image_id": "CrowdHuman:data/data_66/283647,8133000946f9f6f.jpg:object:5", + "name": "camera", + "description": "A camera held by the photographer in the dark coat. Source dataset: CrowdHuman. Scene context: A group of men, including military personnel and civilians in long coats, walking across an airfield with airplanes in the background." + }, + { + "candidate_index": 22, + "source_offset": 208304, + "image_id": "BDD100K:c2186a76-5444a563:object:5", + "name": "brick building", + "description": "A tall, multi-story red brick building on the left side of the street, featuring arched windows and a storefront. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a city street lined with parked cars and multi-story brick buildings." + }, + { + "candidate_index": 23, + "source_offset": 217110, + "image_id": "BDD100K:c411687d-73471431:object:14", + "name": "pole", + "description": "A thin, straight metal pole standing upright on the sidewalk near the park area on the left. Source dataset: BDD100K. Scene context: A dashcam view looking down a slightly sloped residential city street with cars parked on both sides, trees bordering a park area to the left, and a tall building to the right, under a cloudy, overcast sky." + }, + { + "candidate_index": 24, + "source_offset": 172239, + "image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "name": "crosswalk markings", + "description": "White painted lines on the road surface indicating a pedestrian crosswalk. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with traffic lights and a crosswalk." + }, + { + "candidate_index": 25, + "source_offset": 91774, + "image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "name": "balcony", + "description": "A dark, wrought-iron balcony on a building. Source dataset: CrowdHuman. Scene context: People walk down a narrow, sunlit street lined with tall buildings." + } + ], + "rng_seed": 1782661096, + "created_at": 1782292413.3904371 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..ae197e6e6328dc660a9d853eb898399e12fe0746 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee9eeefde41ba518613e143f384568033446941a657ced9721051401891f7907 +size 1440646 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc5a8b22e75ad3b6e2f9cd568d70991f9f9997ca --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/compose_prompt.txt @@ -0,0 +1,119 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A busy city street intersection viewed from the driver's perspective inside a stopped car.", + "activity": "Pedestrians are crossing the street and waiting at the curbside crosswalk while the car waits at a red light.", + "composition": "The camera frames the scene through a car windshield, with a faint dashboard reflection visible at the bottom edge. In the midground, a few pedestrians are actively crossing the street in front of the car, while others are clustered on the right-side curb waiting to cross, anchored by a tall street lamp. The depth extends down the street where people are walking away.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_walking_away", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath.", + "role_in_scene": "walking away from the camera on the crosswalk" + }, + { + "name": "woman_waiting", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "description": "A woman wearing a black jacket, dark trousers, and carrying a brown handbag.", + "role_in_scene": "standing on the right curb, looking towards the street traffic" + }, + { + "name": "pedestrian_standing", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "description": "A person wearing a blue jacket and blue jeans.", + "role_in_scene": "standing on the sidewalk near the intersection" + }, + { + "name": "sign_holder", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "description": "A person holding up a large white sign.", + "role_in_scene": "standing on the sidewalk holding a sign near the crosswalk" + }, + { + "name": "pedestrian_crossing_right", + "source_index": 9, + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "description": "A person in a white top and dark pants.", + "role_in_scene": "walking across the street from left to right in the crosswalk" + }, + { + "name": "pedestrian_walking_away_sidewalk", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "description": "A person wearing a white top and dark pants.", + "role_in_scene": "walking away from the camera on the distant sidewalk" + }, + { + "name": "young_man_waiting", + "source_index": 13, + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "description": "A young man wearing a dark blue hoodie.", + "role_in_scene": "waiting on the corner for the pedestrian signal" + }, + { + "name": "businessman_waiting", + "source_index": 15, + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "description": "An adult wearing a dark suit and tie.", + "role_in_scene": "standing near the crosswalk amongst the crowd on the curb" + } + ], + "objects": [ + { + "name": "street_lamp", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "description": "A tall street lamp pole.", + "role_in_scene": "standing on the right corner of the intersection, near the curb" + }, + { + "name": "dashboard_reflection", + "source_index": 2, + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "description": "A faint reflection of an interior dashboard and an object on the windshield.", + "role_in_scene": "overlaid on the bottom portion of the view, establishing the perspective from inside a car" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_businessman_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_businessman_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..5489a96719c02430446e5d82b44b8d47cd899c4b Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_businessman_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_dashboard_reflection.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_dashboard_reflection.png new file mode 100644 index 0000000000000000000000000000000000000000..9b5da107bf44feadcb181b9396bf5ad511a47fb5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_dashboard_reflection.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b1acb011974c2bc9ab5d4bdec782e916c413dea89f28b4539cdc52641e7637 +size 261583 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_crossing_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_crossing_right.png new file mode 100644 index 0000000000000000000000000000000000000000..0a75275128f1ad08375348066dcd8ce417ca45a6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_crossing_right.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_standing.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_standing.png new file mode 100644 index 0000000000000000000000000000000000000000..c2c1781429b26dd329976e2ffbbe05d50c0bc71c Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_standing.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..dd5c6a57f017b90688bc205ae567027a191a593a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_walking_away.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc59a2782492b17af4b8a7779b92cb13071defbda321c1ec0f861aa904d600e1 +size 117536 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_walking_away_sidewalk.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_walking_away_sidewalk.png new file mode 100644 index 0000000000000000000000000000000000000000..c3eeb404aee82c81387ed927a20fd73d240313c3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_pedestrian_walking_away_sidewalk.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_sign_holder.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_sign_holder.png new file mode 100644 index 0000000000000000000000000000000000000000..fe58e51dabfe3e8a975f70472175267f9809a173 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_sign_holder.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_street_lamp.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..acceed826bfa3933ad36863608fbd68f01fffa88 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_street_lamp.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eba5bb32070e6142bac3c5dae23cef90a164ac2de5b7c61e9e0ae961d6cb667b +size 127363 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_woman_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_woman_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..7a57bf7db389ed6ff706f3b31a0fd7642aea39bf Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_woman_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_young_man_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_young_man_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..d0a360d735dd086ac446bec7a63c6852caef6bd3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/detect_refine_young_man_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_businessman_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_businessman_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..ec2d938f14bd9fd4c758a8f9a8def6e74baab872 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_businessman_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_dashboard_reflection.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_dashboard_reflection.png new file mode 100644 index 0000000000000000000000000000000000000000..6536ef2b991d4fe831b52dd67cd68a60aa504cf6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_dashboard_reflection.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06c4df7a184b09f7caa15d6c694bba9ceff307c680c845a2ba693982eae0a55a +size 312995 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_crossing_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_crossing_right.png new file mode 100644 index 0000000000000000000000000000000000000000..c1efbc54c8d8f2c062f1913be9d062f24a51fab6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_crossing_right.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_standing.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_standing.png new file mode 100644 index 0000000000000000000000000000000000000000..000037e3f024fa00b0060e70d0b87f5d3791d882 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_standing.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..21140e9496cf9703d7ae801c0fd59dbc9e6affd7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_walking_away.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d68dd9cd9c271889a27a1e204218b2c5184d81fbeb1d12875fd1aec486095f43 +size 217585 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_walking_away_sidewalk.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_walking_away_sidewalk.png new file mode 100644 index 0000000000000000000000000000000000000000..436a35669abbed5cf1ff7242add991ec1d5d6a60 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_pedestrian_walking_away_sidewalk.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_sign_holder.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_sign_holder.png new file mode 100644 index 0000000000000000000000000000000000000000..4693604a8da377d3ce9c6d685185efb04aa3ef5a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_sign_holder.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:005e533fc9c8eaaceeb176d6fc492e735fc166291026186bbfa232cf8efddbd9 +size 143447 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_street_lamp.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..648366b03f2ad50ee02345205591177e0b0da1b9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_street_lamp.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:315b7063315f0af7818d80ec336e2e8f52e5d9fc309ffa5a0dea1f813cc8474f +size 151984 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_woman_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_woman_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..891272641d3f1ffd6d0559ffcc606dad521c41be --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_woman_waiting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5c1af63f0bc87502e4697ec39f1fdf4b4ada95a6982b6493a5ef8f97a693f49 +size 120276 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_young_man_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_young_man_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..0f64eb27be5292127543e23489caef8a06661d01 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/crops/diversify_input_young_man_waiting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81fca937ccb1d558825dec5bd4a6ff4b561f709fa4fe6cb356115437c044708c +size 119447 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..33fc569d1074a75d3d17d8622267970ce026112f --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/detections.json @@ -0,0 +1,192 @@ +[ + { + "name": "pedestrian_walking_away", + "present": true, + "bbox": [ + 0.258, + 0.308, + 0.368, + 0.725 + ], + "confidence": "high", + "notes": "refine failed; using coarse bbox", + "coarse_bbox": [ + 0.258, + 0.308, + 0.368, + 0.725 + ], + "refine_crop": "crops/detect_refine_pedestrian_walking_away.png" + }, + { + "name": "woman_waiting", + "present": true, + "bbox": [ + 0.5514, + 0.2961, + 0.618, + 0.6443 + ], + "confidence": 0.98, + "notes": "Woman in a black jacket, dark trousers, and carrying a brown handbag standing on the curb as described.", + "coarse_bbox": [ + 0.55, + 0.295, + 0.63, + 0.644 + ], + "refine_crop": "crops/detect_refine_woman_waiting.png" + }, + { + "name": "pedestrian_standing", + "present": true, + "bbox": [ + 0.6289, + 0.2934, + 0.6784, + 0.6567 + ], + "confidence": 0.99, + "notes": "A person standing wearing a blue jacket and blue jeans fits the sub_caption.", + "coarse_bbox": [ + 0.626, + 0.296, + 0.682, + 0.651 + ], + "refine_crop": "crops/detect_refine_pedestrian_standing.png" + }, + { + "name": "sign_holder", + "present": true, + "bbox": [ + 0.4795, + 0.28, + 0.5703, + 0.5944 + ], + "confidence": 0.95, + "notes": "The person holding the large white sign is clearly visible, facing away from the camera and wearing a grey hoodie and blue jeans.", + "coarse_bbox": [ + 0.481, + 0.291, + 0.574, + 0.594 + ], + "refine_crop": "crops/detect_refine_sign_holder.png" + }, + { + "name": "pedestrian_crossing_right", + "present": true, + "bbox": [ + 0.2006, + 0.3034, + 0.2705, + 0.5763 + ], + "confidence": 0.95, + "notes": "The primary pedestrian in a white top and dark pants is clearly visible, walking from left to right.", + "coarse_bbox": [ + 0.198, + 0.306, + 0.266, + 0.574 + ], + "refine_crop": "crops/detect_refine_pedestrian_crossing_right.png" + }, + { + "name": "pedestrian_walking_away_sidewalk", + "present": true, + "bbox": [ + 0.2006, + 0.3029, + 0.2706, + 0.5728 + ], + "confidence": 0.9, + "notes": "A person wearing a white top and dark pants is visible.", + "coarse_bbox": [ + 0.201, + 0.325, + 0.27, + 0.574 + ], + "refine_crop": "crops/detect_refine_pedestrian_walking_away_sidewalk.png" + }, + { + "name": "young_man_waiting", + "present": true, + "bbox": [ + 0.6911, + 0.2718, + 0.7496, + 0.6983 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the young man wearing the dark blue hoodie, covering him from the top of his head to the bottom of his shoes.", + "coarse_bbox": [ + 0.691, + 0.275, + 0.754, + 0.692 + ], + "refine_crop": "crops/detect_refine_young_man_waiting.png" + }, + { + "name": "businessman_waiting", + "present": true, + "bbox": [ + 0.7491, + 0.2604, + 0.8005, + 0.6281 + ], + "confidence": 0.99, + "notes": "An adult wearing a dark suit and tie, standing near the crosswalk.", + "coarse_bbox": [ + 0.747, + 0.261, + 0.804, + 0.618 + ], + "refine_crop": "crops/detect_refine_businessman_waiting.png" + }, + { + "name": "street_lamp", + "present": true, + "bbox": [ + 0.777, + 0.0, + 0.859, + 0.747 + ], + "confidence": 0.95, + "notes": "refine failed; using coarse bbox", + "coarse_bbox": [ + 0.777, + 0.0, + 0.859, + 0.747 + ], + "refine_crop": "crops/detect_refine_street_lamp.png" + }, + { + "name": "dashboard_reflection", + "present": true, + "bbox": [ + 0.0, + 0.616, + 0.608, + 0.818 + ], + "confidence": 0.88, + "notes": "refine failed; using coarse bbox", + "coarse_bbox": [ + 0.0, + 0.616, + 0.608, + 0.818 + ], + "refine_crop": "crops/detect_refine_dashboard_reflection.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..d3edb17b1dd79b38d85d787cb908064713f7b821 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d90fefd30f43ac53fe062f2efb6fe5a6bebdc51fd8fa7fee8f9fd271001c44 +size 1536579 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..3d91d28c9f9eebef8ca1398b587a927a7f408e3b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/plan.json @@ -0,0 +1,240 @@ +{ + "sample_id": "sample_000008", + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A busy city street intersection viewed from the driver's perspective inside a stopped car.", + "activity": "Pedestrians are crossing the street and waiting at the curbside crosswalk while the car waits at a red light.", + "composition": "The camera frames the scene through a car windshield, with a faint dashboard reflection visible at the bottom edge. In the midground, a few pedestrians are actively crossing the street in front of the car, while others are clustered on the right-side curb waiting to cross, anchored by a tall street lamp. The depth extends down the street where people are walking away.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_walking_away", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath.", + "role_in_scene": "walking away from the camera on the crosswalk" + }, + { + "name": "woman_waiting", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "description": "A woman wearing a black jacket, dark trousers, and carrying a brown handbag.", + "role_in_scene": "standing on the right curb, looking towards the street traffic" + }, + { + "name": "pedestrian_standing", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "description": "A person wearing a blue jacket and blue jeans.", + "role_in_scene": "standing on the sidewalk near the intersection" + }, + { + "name": "sign_holder", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "description": "A person holding up a large white sign.", + "role_in_scene": "standing on the sidewalk holding a sign near the crosswalk" + }, + { + "name": "pedestrian_crossing_right", + "source_index": 9, + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "description": "A person in a white top and dark pants.", + "role_in_scene": "walking across the street from left to right in the crosswalk" + }, + { + "name": "pedestrian_walking_away_sidewalk", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "description": "A person wearing a white top and dark pants.", + "role_in_scene": "walking away from the camera on the distant sidewalk" + }, + { + "name": "young_man_waiting", + "source_index": 13, + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "description": "A young man wearing a dark blue hoodie.", + "role_in_scene": "waiting on the corner for the pedestrian signal" + }, + { + "name": "businessman_waiting", + "source_index": 15, + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "description": "An adult wearing a dark suit and tie.", + "role_in_scene": "standing near the crosswalk amongst the crowd on the curb" + } + ], + "objects": [ + { + "name": "street_lamp", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "description": "A tall street lamp pole.", + "role_in_scene": "standing on the right corner of the intersection, near the curb" + }, + { + "name": "dashboard_reflection", + "source_index": 2, + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "description": "A faint reflection of an interior dashboard and an object on the windshield.", + "role_in_scene": "overlaid on the bottom portion of the view, establishing the perspective from inside a car" + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", + "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath.. Scene role: walking away from the camera on the crosswalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "woman_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", + "sub_caption": "shopper: A woman wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: standing on the right curb, looking towards the street traffic", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_standing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", + "sub_caption": "shopper: A person wearing a blue jacket and blue jeans.. Scene role: standing on the sidewalk near the intersection", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "sign_holder", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", + "sub_caption": "protester holding sign in back: A person holding up a large white sign.. Scene role: standing on the sidewalk holding a sign near the crosswalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_crossing_right", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", + "sub_caption": "pedestrian: A person in a white top and dark pants.. Scene role: walking across the street from left to right in the crosswalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_walking_away_sidewalk", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", + "sub_caption": "pedestrian: A person wearing a white top and dark pants.. Scene role: walking away from the camera on the distant sidewalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "young_man_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", + "sub_caption": "young man: A young man wearing a dark blue hoodie.. Scene role: waiting on the corner for the pedestrian signal", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "businessman_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", + "sub_caption": "adult in dark suit: An adult wearing a dark suit and tie.. Scene role: standing near the crosswalk amongst the crowd on the curb", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", + "sub_caption": "street lamp: A tall street lamp pole.. Scene role: standing on the right corner of the intersection, near the curb", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dashboard_reflection", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", + "sub_caption": "vehicle dashboard reflection: A faint reflection of an interior dashboard and an object on the windshield.. Scene role: overlaid on the bottom portion of the view, establishing the perspective from inside a car", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000008/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references.json new file mode 100644 index 0000000000000000000000000000000000000000..7897ee820f79e5f8844e4609ef3bd394109feb18 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references.json @@ -0,0 +1,189 @@ +{ + "references": [ + { + "name": "pedestrian_walking_away", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_walking_away.png", + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png" + }, + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "cached_reference": true + }, + { + "name": "woman_waiting", + "ref_image": "references/ref_woman_waiting.png", + "raw_ref_image": "references/raw_ref_woman_waiting_attempt_01.png", + "diversify_input": "crops/diversify_input_woman_waiting.png", + "sam_white_bg": { + "cached": true, + "output": "references/ref_woman_waiting.png", + "mask": "references/sam_mask_woman_waiting.png" + }, + "reference_verify": "references/reference_verify_woman_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "cached_reference": true + }, + { + "name": "pedestrian_standing", + "ref_image": "references/ref_pedestrian_standing.png", + "raw_ref_image": "references/raw_ref_pedestrian_standing_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_standing.png", + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_standing.png", + "mask": "references/sam_mask_pedestrian_standing.png" + }, + "reference_verify": "references/reference_verify_pedestrian_standing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "cached_reference": true + }, + { + "name": "sign_holder", + "ref_image": "references/ref_sign_holder.png", + "raw_ref_image": "references/raw_ref_sign_holder_attempt_01.png", + "diversify_input": "crops/diversify_input_sign_holder.png", + "sam_white_bg": { + "cached": true, + "output": "references/ref_sign_holder.png", + "mask": "references/sam_mask_sign_holder.png" + }, + "reference_verify": "references/reference_verify_sign_holder.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "cached_reference": true + }, + { + "name": "pedestrian_crossing_right", + "ref_image": "references/ref_pedestrian_crossing_right.png", + "raw_ref_image": "references/raw_ref_pedestrian_crossing_right_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_crossing_right.png", + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_crossing_right.png", + "mask": "references/sam_mask_pedestrian_crossing_right.png" + }, + "reference_verify": "references/reference_verify_pedestrian_crossing_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "cached_reference": true + }, + { + "name": "pedestrian_walking_away_sidewalk", + "ref_image": "references/ref_pedestrian_walking_away_sidewalk.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_sidewalk_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_walking_away_sidewalk.png", + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_walking_away_sidewalk.png", + "mask": "references/sam_mask_pedestrian_walking_away_sidewalk.png" + }, + "reference_verify": "references/reference_verify_pedestrian_walking_away_sidewalk.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "cached_reference": true + }, + { + "name": "young_man_waiting", + "ref_image": "references/ref_young_man_waiting.png", + "raw_ref_image": "references/raw_ref_young_man_waiting_attempt_01.png", + "diversify_input": "crops/diversify_input_young_man_waiting.png", + "sam_white_bg": { + "cached": true, + "output": "references/ref_young_man_waiting.png", + "mask": "references/sam_mask_young_man_waiting.png" + }, + "reference_verify": "references/reference_verify_young_man_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "cached_reference": true + }, + { + "name": "businessman_waiting", + "ref_image": "references/ref_businessman_waiting.png", + "raw_ref_image": "references/raw_ref_businessman_waiting_attempt_01.png", + "diversify_input": "crops/diversify_input_businessman_waiting.png", + "sam_white_bg": { + "cached": true, + "output": "references/ref_businessman_waiting.png", + "mask": "references/sam_mask_businessman_waiting.png" + }, + "reference_verify": "references/reference_verify_businessman_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "cached_reference": true + }, + { + "name": "street_lamp", + "ref_image": "references/ref_street_lamp.png", + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "diversify_input": "crops/diversify_input_street_lamp.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "references/ref_street_lamp.png", + "mask": "references/sam_mask_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 278.0, + 0.0, + 808.0, + 1023.0 + ], + "mask_score": 3.224807, + "mask_area_ratio": 0.054686, + "elapsed_seconds": 7.0333 + }, + "reference_verify": "references/reference_verify_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dashboard_reflection", + "ref_image": "references/ref_dashboard_reflection.png", + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_02.png", + "diversify_input": "crops/diversify_input_dashboard_reflection.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_02.png", + "output": "references/ref_dashboard_reflection.png", + "mask": "references/sam_mask_dashboard_reflection.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 239.0, + 997.0, + 784.0 + ], + "mask_score": 3.469732, + "mask_area_ratio": 0.384025, + "elapsed_seconds": 8.6424 + }, + "reference_verify": "references/reference_verify_dashboard_reflection.json", + "reference_verify_passed": true, + "reference_attempts": 2 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_businessman_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_businessman_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..38360e21f8b3061da82ab904176944314e70043e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_businessman_waiting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a8ef4ede1ec170b7432b3b6a15629b285553368caed7cc3ffb4900f3a671ee +size 305485 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_dashboard_reflection.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_dashboard_reflection.png new file mode 100644 index 0000000000000000000000000000000000000000..06af565baa27eecd135546144bb21e2b8f5b5cba --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_dashboard_reflection.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c08b6c3570e093a7ef3be4d89d6bcd772c06f4f64a18f2a17236c358151face +size 494707 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_crossing_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_crossing_right.png new file mode 100644 index 0000000000000000000000000000000000000000..8bef49bfd1c372cffacc7086226d8297865e2635 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_crossing_right.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43af028c266d46141c01eb14cf8a0b05f24cd55c43c6c90da5008efa54ffe04 +size 250843 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_standing.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_standing.png new file mode 100644 index 0000000000000000000000000000000000000000..76cb7d62ece12fea39c7c60d99179ecb90677ed2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_standing.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6e68d83c38cb9b5cb173e6a4c620620749a22c8e594f1cf72ff8922b4fe1abf +size 306740 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..275a18dcdeb601a36517091773ee9dbb1a4a531a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_walking_away.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aa6f680e7af1c91a9690a9a5a454b851327a447efe78725f5769c6ab7909490 +size 226721 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_walking_away_sidewalk.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_walking_away_sidewalk.png new file mode 100644 index 0000000000000000000000000000000000000000..03943b5853a618f2d86393ad7876f956b2c72c81 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_pedestrian_walking_away_sidewalk.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbef4a1c801ceb4b283f6559930d5b2577ab53ef9186b5b2eb6dabdf2871b517 +size 260664 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_sign_holder.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_sign_holder.png new file mode 100644 index 0000000000000000000000000000000000000000..f598301d059187b294a37e81bfc836275fb10802 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_sign_holder.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27ce95fc92b9527b225912ab886769f3b82f719db4e3a0d8a282dd99982d5f1 +size 287147 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_street_lamp.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..a808f14ecc46cfcc2772e92c8e07efec48d64d31 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_street_lamp.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1d2e8a6a5a88cbc250a3e37f4bfb65d705fbb7717a217881f72688838841d0 +size 114158 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_woman_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_woman_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..0074bb01faaa81dd580bc1a926a2056de4b0cf82 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_woman_waiting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e947f7db1ac4077122ff6ee41524b2bc48166b61bdc90634909b24195e94c552 +size 272232 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_young_man_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_young_man_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..fe389bd8bef66ecfc3e85b0d82f9279dbf076f65 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/ref_young_man_waiting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:288c0ba0805e6296d67f424a3155b7614918b9f614d842d6cc9df6bae97ea524 +size 262042 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_businessman_waiting.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_businessman_waiting.json new file mode 100644 index 0000000000000000000000000000000000000000..45e568cb20b790f7371cfb9fca5501f431b96c45 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_businessman_waiting.json @@ -0,0 +1,46 @@ +{ + "name": "businessman_waiting", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_businessman_waiting_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_businessman_waiting_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_businessman_waiting_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_businessman_waiting_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_businessman_waiting_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_businessman_waiting_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 322.0, + 37.0, + 700.0, + 1013.0 + ], + "mask_score": 3.396758, + "mask_area_ratio": 0.169758, + "elapsed_seconds": 8.3694 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image meets all requirements: it shows a full-body view of a single person in a dark suit with a tie, isolated on a white background with ample margins, and no parts are cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_dashboard_reflection.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_dashboard_reflection.json new file mode 100644 index 0000000000000000000000000000000000000000..440cf6c2afca011662f21833b3da7d6ac1b15b9b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_dashboard_reflection.json @@ -0,0 +1,88 @@ +{ + "name": "dashboard_reflection", + "passed": true, + "accepted_attempt": 2, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dashboard_reflection_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dashboard_reflection_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_dashboard_reflection_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_dashboard_reflection_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 195.0, + 1023.0, + 746.0 + ], + "mask_score": 2.508488, + "mask_area_ratio": 0.474533, + "elapsed_seconds": 7.2414 + }, + "verify": { + "passed": false, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": false, + "white_background": true, + "failure_reasons": [ + "The subject 'dashboard_reflection' is not clearly isolated as a single main subject.", + "The image includes a person walking on a crosswalk in the background, which is a major distracting element and violates the single main subject requirement." + ], + "notes": "The image shows a view from inside a car looking out at a crosswalk with a person walking. While there is a dashboard at the bottom and some reflection visible, the scene is not an isolated reference of a dashboard reflection. The person and the street scene are prominent." + } + }, + { + "attempt": 2, + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_02.png", + "candidate_ref_image": "references/candidate_ref_dashboard_reflection_attempt_02.png", + "candidate_sam_mask": "references/candidate_sam_mask_dashboard_reflection_attempt_02.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_02.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_dashboard_reflection_attempt_02.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_dashboard_reflection_attempt_02.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 239.0, + 997.0, + 784.0 + ], + "mask_score": 3.469732, + "mask_area_ratio": 0.384025, + "elapsed_seconds": 8.6424 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a faint, semi-transparent car interior representing a dashboard reflection on a white background, matching the subject requirements perfectly." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_crossing_right.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_crossing_right.json new file mode 100644 index 0000000000000000000000000000000000000000..9a941c4dd6dd9a7692f4d2baf144360df2da8788 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_crossing_right.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_crossing_right", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_crossing_right_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_crossing_right_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_crossing_right_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_crossing_right_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_pedestrian_crossing_right_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_pedestrian_crossing_right_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 333.0, + 81.0, + 678.0, + 995.0 + ], + "mask_score": 3.485898, + "mask_area_ratio": 0.144193, + "elapsed_seconds": 7.2503 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a person isolated on a white background with no cropping. It is suitable for a dataset subject reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_standing.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_standing.json new file mode 100644 index 0000000000000000000000000000000000000000..cff7f40088a0f6aaba431242ab108686d0e5916f --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_standing.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_standing", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_standing_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_standing_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_standing_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_standing_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_pedestrian_standing_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_pedestrian_standing_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 338.0, + 48.0, + 692.0, + 995.0 + ], + "mask_score": 3.477333, + "mask_area_ratio": 0.156459, + "elapsed_seconds": 8.3132 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete full-body view of a person with no cropping and sufficient white margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_walking_away.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_walking_away.json new file mode 100644 index 0000000000000000000000000000000000000000..7b01a83c278ef8878bd44e1e3ce5ba7de73bfe75 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_walking_away.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_walking_away", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_walking_away_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_walking_away_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_pedestrian_walking_away_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_pedestrian_walking_away_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 315.0, + 32.0, + 717.0, + 1003.0 + ], + "mask_score": 2.280319, + "mask_area_ratio": 0.10415, + "elapsed_seconds": 8.3894 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The man is fully visible from head to toe with a white background and sufficient margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_walking_away_sidewalk.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_walking_away_sidewalk.json new file mode 100644 index 0000000000000000000000000000000000000000..50920991d32bcfe42c09aad5c8f5d69211ee2250 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_pedestrian_walking_away_sidewalk.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_walking_away_sidewalk", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_sidewalk_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_walking_away_sidewalk_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_walking_away_sidewalk_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_walking_away_sidewalk_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_pedestrian_walking_away_sidewalk_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_pedestrian_walking_away_sidewalk_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 53.0, + 676.0, + 993.0 + ], + "mask_score": 3.401822, + "mask_area_ratio": 0.141634, + "elapsed_seconds": 7.1823 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image satisfies all requirements. It is a full-body shot with enough white margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_sign_holder.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_sign_holder.json new file mode 100644 index 0000000000000000000000000000000000000000..9d5f853e9d21349f8b977200e00814db3164b16a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_sign_holder.json @@ -0,0 +1,46 @@ +{ + "name": "sign_holder", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_sign_holder_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_sign_holder_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_sign_holder_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_sign_holder_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_sign_holder_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_sign_holder_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 357.0, + 44.0, + 709.0, + 1004.0 + ], + "mask_score": 3.368555, + "mask_area_ratio": 0.146316, + "elapsed_seconds": 8.6809 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body visible, well isolated with white background. The subject is missing the sign mentioned in the caption, but meets all structural hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_street_lamp.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_street_lamp.json new file mode 100644 index 0000000000000000000000000000000000000000..875c59aa3c212efb456afe0c16c3c84ff252d9bf --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_street_lamp.json @@ -0,0 +1,46 @@ +{ + "name": "street_lamp", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_lamp_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_lamp_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_street_lamp_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_street_lamp_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 278.0, + 0.0, + 808.0, + 1023.0 + ], + "mask_score": 3.224807, + "mask_area_ratio": 0.054686, + "elapsed_seconds": 7.0333 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete street lamp on a white background, perfectly suitable for a reference image." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_woman_waiting.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_woman_waiting.json new file mode 100644 index 0000000000000000000000000000000000000000..bc0614ae0e4b8c2a023d7941976143397bd0fac6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_woman_waiting.json @@ -0,0 +1,46 @@ +{ + "name": "woman_waiting", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_woman_waiting_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_woman_waiting_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_woman_waiting_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_woman_waiting_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_woman_waiting_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_woman_waiting_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 353.0, + 45.0, + 731.0, + 1002.0 + ], + "mask_score": 3.200432, + "mask_area_ratio": 0.136518, + "elapsed_seconds": 6.9669 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The person is fully visible with no cropping and adequate white margin. The handbag has some background removal artifacts, but the person's body completeness is fully satisfied." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_young_man_waiting.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_young_man_waiting.json new file mode 100644 index 0000000000000000000000000000000000000000..7f45f115a7c486571772ca1303f1f2db445d4150 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/reference_verify_young_man_waiting.json @@ -0,0 +1,46 @@ +{ + "name": "young_man_waiting", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_young_man_waiting_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_young_man_waiting_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_young_man_waiting_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_young_man_waiting_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_young_man_waiting_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_young_man_waiting_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 335.0, + 37.0, + 681.0, + 1013.0 + ], + "mask_score": 3.465394, + "mask_area_ratio": 0.145208, + "elapsed_seconds": 7.15 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body visible, clean white background, sufficient margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_businessman_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_businessman_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..8a7b3df9a5346515f75ae7e612ebc2dd1dfe0847 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_businessman_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_dashboard_reflection.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_dashboard_reflection.png new file mode 100644 index 0000000000000000000000000000000000000000..fab81a9e8b89d494fbb84269f6c4cb5bdd6ab451 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_dashboard_reflection.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_crossing_right.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_crossing_right.png new file mode 100644 index 0000000000000000000000000000000000000000..45003716035d2cc69cb4eb1c9917ed8db462a4cd Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_crossing_right.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_standing.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_standing.png new file mode 100644 index 0000000000000000000000000000000000000000..0629e396c2ad118e70e28093738c899963e78484 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_standing.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..4c6010e6f5b56299409d9f60165140530e906bf5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_walking_away.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_walking_away_sidewalk.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_walking_away_sidewalk.png new file mode 100644 index 0000000000000000000000000000000000000000..486de0e38ea397d33f84631e609e5fe1b1ded4e1 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_pedestrian_walking_away_sidewalk.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_sign_holder.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_sign_holder.png new file mode 100644 index 0000000000000000000000000000000000000000..8151d0e9d7e224adf58dfbf630d150ae778ff088 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_sign_holder.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_street_lamp.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..2455939bc836f65e8bb5f4f5aca24089660fa2aa Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_street_lamp.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_woman_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_woman_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..7696fd66074c1a7ddb1916bdd0651e230552f461 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_woman_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_young_man_waiting.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_young_man_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..6f88e1be0f66999e70694fe90a4b17583acdbe13 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/references/sam_mask_young_man_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/row.json new file mode 100644 index 0000000000000000000000000000000000000000..04f01a831aad8ca6c40a9ad53d93a495acbdbafa --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/row.json @@ -0,0 +1,342 @@ +{ + "sample_id": "sample_000008", + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 10, + "n_detected": 10, + "n_subjects": 10, + "subjects": [ + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", + "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath.. Scene role: walking away from the camera on the crosswalk", + "measured_bbox": [ + 0.258, + 0.308, + 0.368, + 0.725 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png" + } + }, + { + "name": "woman_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", + "sub_caption": "shopper: A woman wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: standing on the right curb, looking towards the street traffic", + "measured_bbox": [ + 0.5514, + 0.2961, + 0.618, + 0.6443 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_woman_waiting.png", + "raw_ref_image": "references/raw_ref_woman_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_woman_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_woman_waiting.png", + "mask": "references/sam_mask_woman_waiting.png" + } + }, + { + "name": "pedestrian_standing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", + "sub_caption": "shopper: A person wearing a blue jacket and blue jeans.. Scene role: standing on the sidewalk near the intersection", + "measured_bbox": [ + 0.6289, + 0.2934, + 0.6784, + 0.6567 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_standing.png", + "raw_ref_image": "references/raw_ref_pedestrian_standing_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_standing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_standing.png", + "mask": "references/sam_mask_pedestrian_standing.png" + } + }, + { + "name": "sign_holder", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", + "sub_caption": "protester holding sign in back: A person holding up a large white sign.. Scene role: standing on the sidewalk holding a sign near the crosswalk", + "measured_bbox": [ + 0.4795, + 0.28, + 0.5703, + 0.5944 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_sign_holder.png", + "raw_ref_image": "references/raw_ref_sign_holder_attempt_01.png", + "reference_verify": "references/reference_verify_sign_holder.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_sign_holder.png", + "mask": "references/sam_mask_sign_holder.png" + } + }, + { + "name": "pedestrian_crossing_right", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", + "sub_caption": "pedestrian: A person in a white top and dark pants.. Scene role: walking across the street from left to right in the crosswalk", + "measured_bbox": [ + 0.2006, + 0.3034, + 0.2705, + 0.5763 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_crossing_right.png", + "raw_ref_image": "references/raw_ref_pedestrian_crossing_right_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_crossing_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_crossing_right.png", + "mask": "references/sam_mask_pedestrian_crossing_right.png" + } + }, + { + "name": "pedestrian_walking_away_sidewalk", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", + "sub_caption": "pedestrian: A person wearing a white top and dark pants.. Scene role: walking away from the camera on the distant sidewalk", + "measured_bbox": [ + 0.2006, + 0.3029, + 0.2706, + 0.5728 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away_sidewalk.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_sidewalk_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away_sidewalk.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_pedestrian_walking_away_sidewalk.png", + "mask": "references/sam_mask_pedestrian_walking_away_sidewalk.png" + } + }, + { + "name": "young_man_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", + "sub_caption": "young man: A young man wearing a dark blue hoodie.. Scene role: waiting on the corner for the pedestrian signal", + "measured_bbox": [ + 0.6911, + 0.2718, + 0.7496, + 0.6983 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_young_man_waiting.png", + "raw_ref_image": "references/raw_ref_young_man_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_young_man_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_young_man_waiting.png", + "mask": "references/sam_mask_young_man_waiting.png" + } + }, + { + "name": "businessman_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", + "sub_caption": "adult in dark suit: An adult wearing a dark suit and tie.. Scene role: standing near the crosswalk amongst the crowd on the curb", + "measured_bbox": [ + 0.7491, + 0.2604, + 0.8005, + 0.6281 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_businessman_waiting.png", + "raw_ref_image": "references/raw_ref_businessman_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_businessman_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "cached": true, + "output": "references/ref_businessman_waiting.png", + "mask": "references/sam_mask_businessman_waiting.png" + } + }, + { + "name": "street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", + "sub_caption": "street lamp: A tall street lamp pole.. Scene role: standing on the right corner of the intersection, near the curb", + "measured_bbox": [ + 0.777, + 0.0, + 0.859, + 0.747 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lamp.png", + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "reference_verify": "references/reference_verify_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "references/ref_street_lamp.png", + "mask": "references/sam_mask_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 278.0, + 0.0, + 808.0, + 1023.0 + ], + "mask_score": 3.224807, + "mask_area_ratio": 0.054686, + "elapsed_seconds": 7.0333 + } + }, + { + "name": "dashboard_reflection", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", + "sub_caption": "vehicle dashboard reflection: A faint reflection of an interior dashboard and an object on the windshield.. Scene role: overlaid on the bottom portion of the view, establishing the perspective from inside a car", + "measured_bbox": [ + 0.0, + 0.616, + 0.608, + 0.818 + ], + "detection_confidence": 0.88, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dashboard_reflection.png", + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_02.png", + "reference_verify": "references/reference_verify_dashboard_reflection.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_02.png", + "output": "references/ref_dashboard_reflection.png", + "mask": "references/sam_mask_dashboard_reflection.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 239.0, + 997.0, + 784.0 + ], + "mask_score": 3.469732, + "mask_area_ratio": 0.384025, + "elapsed_seconds": 8.6424 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..60b01173e8d3b3220048a3d4330d814d73961711 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000008/vocab_task.json @@ -0,0 +1,154 @@ +{ + "task_id": "sample_000008", + "sample_id": "sample_000008", + "sample_index": 8, + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 78660, + "image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "name": "passenger", + "description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train." + }, + { + "candidate_index": 1, + "source_offset": 20565, + "image_id": "CrowdHuman:data/data_16/273275,5f6b50004676029f.jpg:person:6", + "name": "woman looking up", + "description": "A woman sitting on the grass, wearing a light beige jacket or shirt, looking up and smiling towards the right. Source dataset: CrowdHuman. Scene context: A group of young adults is relaxing and interacting on a grassy slope next to a modern building, some using laptops and others playing with a football." + }, + { + "candidate_index": 2, + "source_offset": 90506, + "image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "name": "shopper", + "description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors." + }, + { + "candidate_index": 3, + "source_offset": 144829, + "image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "name": "shopper", + "description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground." + }, + { + "candidate_index": 4, + "source_offset": 143218, + "image_id": "CrowdHuman:data/data_59/273271,2ced70001bb62011.jpg:person:28", + "name": "child in white cardigan", + "description": "Kneeling, wearing a white cardigan over a red top. Source dataset: CrowdHuman. Scene context: A large group of children and a few adults are posed for a group photo in what appears to be a school gym or hall, some holding props or instruments, in front of a decorated banner." + }, + { + "candidate_index": 5, + "source_offset": 39864, + "image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "name": "protester holding sign in back", + "description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march." + }, + { + "candidate_index": 6, + "source_offset": 97495, + "image_id": "CrowdHuman:data/data_40/282555,2c9d000ea25c638.jpg:person:3", + "name": "person sitting", + "description": "person sitting on the stairs, wearing a dark top and light pants Source dataset: CrowdHuman. Scene context: A large crowd of people sits and stands on the wide stone steps in front of a grand, classical building, viewed through the arching jets of a fountain in the foreground." + }, + { + "candidate_index": 7, + "source_offset": 82251, + "image_id": "CrowdHuman:data/data_37/283991,1970600072fc59c6.jpg:person:4", + "name": "cafe patron", + "description": "Seated at a table in the foreground, facing right, wearing a light long-sleeved shirt and appearing to interact with someone or something off-camera. Source dataset: CrowdHuman. Scene context: A black and white view of a lively outdoor pedestrian area lined with trees, outdoor cafes with large umbrellas, and numerous people walking and sitting." + }, + { + "candidate_index": 8, + "source_offset": 88856, + "image_id": "CrowdHuman:data/data_4/273278,1f14b00090e94205.jpg:person:1", + "name": "person on screen", + "description": "Partially visible on the right digital display, a person with long brown hair, seen from behind. Source dataset: CrowdHuman. Scene context: A man stands in front of a digital display showing a service change announcement for the Long Island Rail Road." + }, + { + "candidate_index": 9, + "source_offset": 2586, + "image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "name": "pedestrian", + "description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk." + }, + { + "candidate_index": 10, + "source_offset": 159889, + "image_id": "CrowdHuman:data/data_64/283081,dccc00022c623e8.jpg:person:5", + "name": "man in gray jacket", + "description": "A man seated in the lower right foreground with an afro, wearing a gray velvety jacket over a dark shirt and blue jeans. Source dataset: CrowdHuman. Scene context: A group portrait of six men posing together against a plain white background, dressed in varied retro-style clothing including hats, suits, and jackets." + }, + { + "candidate_index": 11, + "source_offset": 9906, + "image_id": "CrowdHuman:data/data_12/273275,106895000eb7b7132.jpg:person:5", + "name": "person sitting on right", + "description": "Sitting on the right side of the table, wearing a panda hoodie with hood up, using a laptop. Source dataset: CrowdHuman. Scene context: A group of people wearing panda-themed hoodies are sitting around a circular table in an office setting, working on laptops." + }, + { + "candidate_index": 12, + "source_offset": 165119, + "image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "name": "pedestrian", + "description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix." + }, + { + "candidate_index": 13, + "source_offset": 41919, + "image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "name": "young man", + "description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building." + }, + { + "candidate_index": 14, + "source_offset": 8187, + "image_id": "CrowdHuman:data/data_11/273275,f16410005b938064.jpg:person:27", + "name": "child in dark jacket", + "description": "A young child crouching in the front, wearing a dark jacket. Source dataset: CrowdHuman. Scene context: A large group of people posing for a group photo outdoors with ancient ruins and a large mountain in the background." + }, + { + "candidate_index": 15, + "source_offset": 138906, + "image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "name": "adult in dark suit", + "description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 4011, + "image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "name": "street lamp", + "description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays." + }, + { + "candidate_index": 1, + "source_offset": 3235, + "image_id": "CrowdHuman:data/data_1/283992,13ecf00046b443c7.jpg:object:3", + "name": "camera", + "description": "black digital camera held by the woman in the center Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers around and interacts with a bronze statue of a mermaid sitting on a large rock." + }, + { + "candidate_index": 2, + "source_offset": 209406, + "image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "name": "vehicle dashboard reflection", + "description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right." + }, + { + "candidate_index": 3, + "source_offset": 109024, + "image_id": "CrowdHuman:data/data_64/273278,edaf2000081eef33.jpg:object:9", + "name": "shoulder bag", + "description": "A light-colored shoulder bag carried by the woman in the teal jacket. Source dataset: CrowdHuman. Scene context: A group of people walks on a grassy area in front of a large, two-story house with a porch." + } + ], + "rng_seed": 1782765825, + "created_at": 1782292413.40541 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..54f794ec76990050c32cd8c1a3b31b014d0dc98a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ec14d83dbcda95035a8fc33026f3ad10b8767146d13d5c8412548bf3d882d44 +size 1205626 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fde938bf28ce9cda34a3530d3f79f29a89a0ee0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/compose_prompt.txt @@ -0,0 +1,79 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban city road approaching a concrete overpass during late afternoon", + "activity": "A dashcam-style view of driving on a multi-lane city road, with a pedestrian walking on the elevated sidewalk alongside the overpass", + "composition": "Wide landscape framing simulating a driver's perspective. The yellow lane line leads the eye forward towards the center-framed overpass. The metal railing and pedestrian are positioned on the right side of the frame in the midground, with a street light towering above.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_in_suit", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "description": "A person walking, wearing a dark suit.", + "role_in_scene": "walking on the sidewalk adjacent to the road, safely behind the metal railing" + } + ], + "objects": [ + { + "name": "yellow_lane_line", + "source_index": 1, + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "description": "A solid yellow line painted on the road surface indicating the edge of the lane.", + "role_in_scene": "running along the asphalt road in the foreground and midground" + }, + { + "name": "overpass", + "source_index": 3, + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road.", + "role_in_scene": "spanning horizontally across the upper midground of the view" + }, + { + "name": "metal_railing", + "source_index": 6, + "source_image_id": "CrowdHuman:data/data_18/282555,101ffe000a8c8717f.jpg:object:11", + "source_name": "metal railing", + "description": "A metal railing visible on the far right edge of the scene.", + "role_in_scene": "acting as a safety barrier between the pedestrian sidewalk and the road on the right side" + }, + { + "name": "street_light", + "source_index": 7, + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "description": "Bright, glowing street lights illuminating the road.", + "role_in_scene": "tall lamp post positioned near the overpass, illuminating the surrounding area" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_metal_railing.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_metal_railing.png new file mode 100644 index 0000000000000000000000000000000000000000..e2f9160ec3000e20d0bf8b52a79abcd23d126c29 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_metal_railing.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5453521662230842f59a53ff8343300c41abb49a8dd9f8c8d8c2f8d5d850beea +size 361357 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_overpass.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_overpass.png new file mode 100644 index 0000000000000000000000000000000000000000..e20fb408ab22ed6cf9a7d4fc1eeaabb277100654 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_overpass.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0827c75feec9334f63e92f4958ee61c4a5dd7d804b466a1bf426c9e861b9bbb6 +size 639643 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_pedestrian_in_suit.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_pedestrian_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..95ae2f02950d7f955633c2951704817496b1c3fe Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_pedestrian_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_street_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..c25563eb52814b84da94705d2c24b68d7211d5dd Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_yellow_lane_line.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_yellow_lane_line.png new file mode 100644 index 0000000000000000000000000000000000000000..879271ac91dc9ad77f068586ee8320184e597d27 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/detect_refine_yellow_lane_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28eace342b08aaa20aa2d1d504eac0543d717a5e85b119da4d9f548987eb9dd6 +size 423290 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_metal_railing.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_metal_railing.png new file mode 100644 index 0000000000000000000000000000000000000000..cb1a03e3ba7de557fd9e481f7a1f5c3987fe08d1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_metal_railing.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e37eaa3efceb6ac277debc5164bb8649cca530bf5b536180eef8ea1205c68163 +size 433607 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_overpass.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_overpass.png new file mode 100644 index 0000000000000000000000000000000000000000..8d14677bc5f4313b730d1277a14020000bbc0320 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_overpass.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8424615008ef5e1d3dbf18c5a38db992b35510ef05393e1243487413119f522 +size 735092 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_pedestrian_in_suit.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_pedestrian_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..da2e4d8dc2f7e008909c716a000adfe26e74abff Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_pedestrian_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_street_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..a98d547ecf86ad91d8db5f13797248572e12383b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_street_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426c9f4402c41b069a6802eaf7c32ebcc78fbc09b777669aaafb8b5a181e9fe5 +size 125820 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_yellow_lane_line.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_yellow_lane_line.png new file mode 100644 index 0000000000000000000000000000000000000000..c5d19a2f0933165ffc73f68cb31a3c39b87bdcb1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/crops/diversify_input_yellow_lane_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a20ff00f5051894dddfccb3e78b619433e4ebc47ee7e05b757c3ea2a612f4a0 +size 261995 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..e36ef372b72c49a98aed373f4799924d2d565aef --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/detections.json @@ -0,0 +1,97 @@ +[ + { + "name": "pedestrian_in_suit", + "present": true, + "bbox": [ + 0.7699, + 0.4157, + 0.8109, + 0.6322 + ], + "confidence": 0.98, + "notes": "Tight bounding box around the visible portion of the pedestrian in a dark suit walking on the sidewalk.", + "coarse_bbox": [ + 0.771, + 0.416, + 0.814, + 0.623 + ], + "refine_crop": "crops/detect_refine_pedestrian_in_suit.png" + }, + { + "name": "yellow_lane_line", + "present": true, + "bbox": [ + 0.5116, + 0.511, + 0.7953, + 0.8681 + ], + "confidence": 0.95, + "notes": "Solid yellow lane line running along the right side of the road.", + "coarse_bbox": [ + 0.511, + 0.25, + 0.877, + 0.813 + ], + "refine_crop": "crops/detect_refine_yellow_lane_line.png" + }, + { + "name": "overpass", + "present": true, + "bbox": [ + 0.0, + 0.1444, + 1.0, + 0.5433 + ], + "confidence": 0.95, + "notes": "The overpass structure clearly stretches across the top half of the entire image and its supports extend to the bottom edge. Tight box includes the bridge decking and piers.", + "coarse_bbox": [ + 0.0, + 0.182, + 1.0, + 0.556 + ], + "refine_crop": "crops/detect_refine_overpass.png" + }, + { + "name": "metal_railing", + "present": true, + "bbox": [ + 0.512, + 0.5, + 0.999, + 0.913 + ], + "confidence": 0.95, + "notes": "refine failed; using coarse bbox", + "coarse_bbox": [ + 0.512, + 0.5, + 0.999, + 0.913 + ], + "refine_crop": "crops/detect_refine_metal_railing.png" + }, + { + "name": "street_light", + "present": true, + "bbox": [ + 0.51, + 0.0165, + 0.6141, + 0.5516 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the tall lamp post and its glowing light fixtures.", + "coarse_bbox": [ + 0.512, + 0.03, + 0.605, + 0.519 + ], + "refine_crop": "crops/detect_refine_street_light.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..e1e7cc25e7b94bdda891875b9c0ec65934617513 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f110182d231a3f30a89f7822d7d22ffc3d629f10a4cb4e49abf95b78368779cf +size 1259730 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..a14b592302a55fd4856e9f64e1848ddaeb78f3d4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/plan.json @@ -0,0 +1,145 @@ +{ + "sample_id": "sample_000009", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban city road approaching a concrete overpass during late afternoon", + "activity": "A dashcam-style view of driving on a multi-lane city road, with a pedestrian walking on the elevated sidewalk alongside the overpass", + "composition": "Wide landscape framing simulating a driver's perspective. The yellow lane line leads the eye forward towards the center-framed overpass. The metal railing and pedestrian are positioned on the right side of the frame in the midground, with a street light towering above.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_in_suit", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "description": "A person walking, wearing a dark suit.", + "role_in_scene": "walking on the sidewalk adjacent to the road, safely behind the metal railing" + } + ], + "objects": [ + { + "name": "yellow_lane_line", + "source_index": 1, + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "description": "A solid yellow line painted on the road surface indicating the edge of the lane.", + "role_in_scene": "running along the asphalt road in the foreground and midground" + }, + { + "name": "overpass", + "source_index": 3, + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road.", + "role_in_scene": "spanning horizontally across the upper midground of the view" + }, + { + "name": "metal_railing", + "source_index": 6, + "source_image_id": "CrowdHuman:data/data_18/282555,101ffe000a8c8717f.jpg:object:11", + "source_name": "metal railing", + "description": "A metal railing visible on the far right edge of the scene.", + "role_in_scene": "acting as a safety barrier between the pedestrian sidewalk and the road on the right side" + }, + { + "name": "street_light", + "source_index": 7, + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "description": "Bright, glowing street lights illuminating the road.", + "role_in_scene": "tall lamp post positioned near the overpass, illuminating the surrounding area" + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", + "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking on the sidewalk adjacent to the road, safely behind the metal railing", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "yellow_lane_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", + "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: running along the asphalt road in the foreground and midground", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "overpass", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", + "sub_caption": "overpass: A concrete bridge structure spanning across the highway ahead, casting a shadow over the road.. Scene role: spanning horizontally across the upper midground of the view", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "metal_railing", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_18/282555,101ffe000a8c8717f.jpg:object:11", + "source_name": "metal railing", + "source_description": "A metal railing visible on the far right edge of the scene. Source dataset: CrowdHuman. Scene context: A group of people, possibly a tour group, is walking outdoors on a paved area near a stone building, with one person holding a green flag and another speaking into a microphone.", + "sub_caption": "metal railing: A metal railing visible on the far right edge of the scene.. Scene role: acting as a safety barrier between the pedestrian sidewalk and the road on the right side", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", + "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: tall lamp post positioned near the overpass, illuminating the surrounding area", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000009/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references.json new file mode 100644 index 0000000000000000000000000000000000000000..988e0328217cff5f9961bb3c781288b2a6eaaeed --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references.json @@ -0,0 +1,165 @@ +{ + "references": [ + { + "name": "pedestrian_in_suit", + "ref_image": "references/ref_pedestrian_in_suit.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_in_suit.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "references/ref_pedestrian_in_suit.png", + "mask": "references/sam_mask_pedestrian_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 31.0, + 680.0, + 1016.0 + ], + "mask_score": 3.479882, + "mask_area_ratio": 0.150441, + "elapsed_seconds": 7.1831 + }, + "reference_verify": "references/reference_verify_pedestrian_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "yellow_lane_line", + "ref_image": "references/ref_yellow_lane_line.png", + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "diversify_input": "crops/diversify_input_yellow_lane_line.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "references/ref_yellow_lane_line.png", + "mask": "references/sam_mask_yellow_lane_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 163.0, + 1023.0, + 845.0 + ], + "mask_score": 3.12139, + "mask_area_ratio": 0.132687, + "elapsed_seconds": 8.5948 + }, + "reference_verify": "references/reference_verify_yellow_lane_line.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "overpass", + "ref_image": "references/ref_overpass.png", + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "diversify_input": "crops/diversify_input_overpass.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "references/ref_overpass.png", + "mask": "references/sam_mask_overpass.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 71.0, + 1023.0, + 823.0 + ], + "mask_score": 2.496995, + "mask_area_ratio": 0.373877, + "elapsed_seconds": 7.1818 + }, + "reference_verify": "references/reference_verify_overpass.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "metal_railing", + "ref_image": "references/ref_metal_railing.png", + "raw_ref_image": "references/raw_ref_metal_railing_attempt_01.png", + "diversify_input": "crops/diversify_input_metal_railing.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_metal_railing_attempt_01.png", + "output": "references/ref_metal_railing.png", + "mask": "references/sam_mask_metal_railing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 261.0, + 1017.0, + 934.0 + ], + "mask_score": 3.402974, + "mask_area_ratio": 0.273856, + "elapsed_seconds": 7.2968 + }, + "reference_verify": "references/reference_verify_metal_railing.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_light", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "diversify_input": "crops/diversify_input_street_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 176.0, + 12.0, + 847.0, + 1014.0 + ], + "mask_score": 3.41464, + "mask_area_ratio": 0.029787, + "elapsed_seconds": 7.0955 + }, + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_metal_railing.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_metal_railing.png new file mode 100644 index 0000000000000000000000000000000000000000..8d444cd19203e4516f1fbca39cd4bae00cb8d033 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_metal_railing.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82bff3f8aa14efd3c6e120efec0c4f2f7cb24f0068347af7ee885d44c663ed45 +size 537349 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_overpass.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_overpass.png new file mode 100644 index 0000000000000000000000000000000000000000..00c7d64a9e8b753decb518cbf032f8526dbd98f5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_overpass.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09fd4ee00ab8e3b241c211e8ecec63fd90a8e9fd9e67b8c3fc43fd987d9038ad +size 662761 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_pedestrian_in_suit.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_pedestrian_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..b82bbf1134495937e0f06c4696b3b31860b871e8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_pedestrian_in_suit.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52f1ea96e7c8cdafe38bf4165ff7c907faf129f40d5b335e52caf12f7666f107 +size 262558 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_street_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..817d32231a5955730df9763e3713aec1177b5832 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_yellow_lane_line.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_yellow_lane_line.png new file mode 100644 index 0000000000000000000000000000000000000000..67fec66f2e6022313d1d2f54d4f0fc80a093eaf4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/ref_yellow_lane_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00d431b654c3491c84c8f2d1f3e9d0417dc11bb379156b179ae353c322360921 +size 306537 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_metal_railing.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_metal_railing.json new file mode 100644 index 0000000000000000000000000000000000000000..259acca0bc6cf7e4baea5ee9a60cc4ae9ab94729 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_metal_railing.json @@ -0,0 +1,46 @@ +{ + "name": "metal_railing", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_metal_railing_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_metal_railing_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_metal_railing_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_metal_railing_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_metal_railing_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_metal_railing_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 261.0, + 1017.0, + 934.0 + ], + "mask_score": 3.402974, + "mask_area_ratio": 0.273856, + "elapsed_seconds": 7.2968 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The metal railing is clearly visible, complete within the typical framing of such an object, isolated on a white background, and acts as a good reference image." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_overpass.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_overpass.json new file mode 100644 index 0000000000000000000000000000000000000000..91c53ac038790581bcb1aac42d10f6b2731a2964 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_overpass.json @@ -0,0 +1,46 @@ +{ + "name": "overpass", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_overpass_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_overpass_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_overpass_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_overpass_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 71.0, + 1023.0, + 823.0 + ], + "mask_score": 2.496995, + "mask_area_ratio": 0.373877, + "elapsed_seconds": 7.1818 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The overpass is visible and recognizable. Even though it is cropped at the left and right edges (typical for a continuous structure), a representative and identifiable portion is shown against a white background, which makes it acceptable according to the policy for large environmental features." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_pedestrian_in_suit.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_pedestrian_in_suit.json new file mode 100644 index 0000000000000000000000000000000000000000..72e7701cc5fa4acdad3d67747a59d65b5067732d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_pedestrian_in_suit.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_in_suit", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_in_suit_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_in_suit_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_pedestrian_in_suit_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_pedestrian_in_suit_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 31.0, + 680.0, + 1016.0 + ], + "mask_score": 3.479882, + "mask_area_ratio": 0.150441, + "elapsed_seconds": 7.1831 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The reference image contains a single person wearing a dark suit with a full body view, clear margins, and a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_street_light.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_street_light.json new file mode 100644 index 0000000000000000000000000000000000000000..99d95a6c4a6d44c12648c66756b62977a797ed91 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_street_light.json @@ -0,0 +1,46 @@ +{ + "name": "street_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_street_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_street_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 176.0, + 12.0, + 847.0, + 1014.0 + ], + "mask_score": 3.41464, + "mask_area_ratio": 0.029787, + "elapsed_seconds": 7.0955 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The street light is fully visible, isolated on a white background, and not cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_yellow_lane_line.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_yellow_lane_line.json new file mode 100644 index 0000000000000000000000000000000000000000..4f84ce4b91173c92097b938542924217e47feb81 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/reference_verify_yellow_lane_line.json @@ -0,0 +1,46 @@ +{ + "name": "yellow_lane_line", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_yellow_lane_line_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_yellow_lane_line_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_yellow_lane_line_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_yellow_lane_line_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 163.0, + 1023.0, + 845.0 + ], + "mask_score": 3.12139, + "mask_area_ratio": 0.132687, + "elapsed_seconds": 8.5948 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a representative crop of a yellow lane line. Since it is a continuous feature, cropping at the ends is acceptable." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_metal_railing.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_metal_railing.png new file mode 100644 index 0000000000000000000000000000000000000000..b4fd108089dfd64fe053538c40510e9809250b28 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_metal_railing.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_overpass.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_overpass.png new file mode 100644 index 0000000000000000000000000000000000000000..366a075818b7f67fa5099f944749debaad7e04a2 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_overpass.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_pedestrian_in_suit.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_pedestrian_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..ef3cd8ea1ac27f9748c3fca62b6bd189c9e57ed5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_pedestrian_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_street_light.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..3e44dd835fb08dc89d76cd0f7be675456db6a1cd Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_yellow_lane_line.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_yellow_lane_line.png new file mode 100644 index 0000000000000000000000000000000000000000..c7360ad3b5e7594c90fb0d61dbf80c7fe2e34b54 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/references/sam_mask_yellow_lane_line.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/row.json new file mode 100644 index 0000000000000000000000000000000000000000..941a44a7e98fcd4063edf569189b2d18b0656814 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/row.json @@ -0,0 +1,256 @@ +{ + "sample_id": "sample_000009", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "pedestrian_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", + "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking on the sidewalk adjacent to the road, safely behind the metal railing", + "measured_bbox": [ + 0.7699, + 0.4157, + 0.8109, + 0.6322 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_suit.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "references/ref_pedestrian_in_suit.png", + "mask": "references/sam_mask_pedestrian_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 31.0, + 680.0, + 1016.0 + ], + "mask_score": 3.479882, + "mask_area_ratio": 0.150441, + "elapsed_seconds": 7.1831 + } + }, + { + "name": "yellow_lane_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", + "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: running along the asphalt road in the foreground and midground", + "measured_bbox": [ + 0.5116, + 0.511, + 0.7953, + 0.8681 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_lane_line.png", + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_lane_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "references/ref_yellow_lane_line.png", + "mask": "references/sam_mask_yellow_lane_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 163.0, + 1023.0, + 845.0 + ], + "mask_score": 3.12139, + "mask_area_ratio": 0.132687, + "elapsed_seconds": 8.5948 + } + }, + { + "name": "overpass", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", + "sub_caption": "overpass: A concrete bridge structure spanning across the highway ahead, casting a shadow over the road.. Scene role: spanning horizontally across the upper midground of the view", + "measured_bbox": [ + 0.0, + 0.1444, + 1.0, + 0.5433 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overpass.png", + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "reference_verify": "references/reference_verify_overpass.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "references/ref_overpass.png", + "mask": "references/sam_mask_overpass.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 71.0, + 1023.0, + 823.0 + ], + "mask_score": 2.496995, + "mask_area_ratio": 0.373877, + "elapsed_seconds": 7.1818 + } + }, + { + "name": "metal_railing", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_18/282555,101ffe000a8c8717f.jpg:object:11", + "source_name": "metal railing", + "source_description": "A metal railing visible on the far right edge of the scene. Source dataset: CrowdHuman. Scene context: A group of people, possibly a tour group, is walking outdoors on a paved area near a stone building, with one person holding a green flag and another speaking into a microphone.", + "sub_caption": "metal railing: A metal railing visible on the far right edge of the scene.. Scene role: acting as a safety barrier between the pedestrian sidewalk and the road on the right side", + "measured_bbox": [ + 0.512, + 0.5, + 0.999, + 0.913 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_railing.png", + "raw_ref_image": "references/raw_ref_metal_railing_attempt_01.png", + "reference_verify": "references/reference_verify_metal_railing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_metal_railing_attempt_01.png", + "output": "references/ref_metal_railing.png", + "mask": "references/sam_mask_metal_railing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 261.0, + 1017.0, + 934.0 + ], + "mask_score": 3.402974, + "mask_area_ratio": 0.273856, + "elapsed_seconds": 7.2968 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", + "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: tall lamp post positioned near the overpass, illuminating the surrounding area", + "measured_bbox": [ + 0.51, + 0.0165, + 0.6141, + 0.5516 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 176.0, + 12.0, + 847.0, + 1014.0 + ], + "mask_score": 3.41464, + "mask_area_ratio": 0.029787, + "elapsed_seconds": 7.0955 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..184031b5e6a78a0b4692080362efeaa9a83d3528 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000009/vocab_task.json @@ -0,0 +1,84 @@ +{ + "task_id": "sample_000009", + "sample_id": "sample_000009", + "sample_index": 9, + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 8921, + "image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "name": "pedestrian in suit", + "description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path." + }, + { + "candidate_index": 1, + "source_offset": 24385, + "image_id": "CrowdHuman:data/data_17/282555,b049a000a718515c.jpg:person:9", + "name": "spectator", + "description": "person in the background crowd on the upper level or further back, wearing a blue jacket. Source dataset: CrowdHuman. Scene context: A large crowd gathered in an indoor mall watching a staged hockey-like game played on the floor." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 49557, + "image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", + "name": "yellow building", + "description": "A multi-story building with a yellow ochre facade and numerous shuttered windows visible in the background on the far left. Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered around a monumental, ornate stone fountain with large statues and cascading water, situated in a city square surrounded by buildings." + }, + { + "candidate_index": 1, + "source_offset": 240928, + "image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "name": "yellow lane line", + "description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier." + }, + { + "candidate_index": 2, + "source_offset": 198760, + "image_id": "BDD100K:bfbe2ad2-ec5dea9d:object:9", + "name": "windshield reflection", + "description": "A blurry, bright green rectangular reflection appearing on the vehicle's windshield on the left side. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving on a wet city street behind a dark SUV, with a construction zone marked by barricades on the right side." + }, + { + "candidate_index": 3, + "source_offset": 183256, + "image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "name": "overpass", + "description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead." + }, + { + "candidate_index": 4, + "source_offset": 44513, + "image_id": "CrowdHuman:data/data_31/273275,83e380001691cba1.jpg:object:2", + "name": "wooden cabinet", + "description": "A tall, dark wood cabinet or hutch located behind the people on the left side. Source dataset: CrowdHuman. Scene context: A group of people, including priests and nuns, poses for a photograph behind a long dining table set for a meal in a wood-paneled room." + }, + { + "candidate_index": 5, + "source_offset": 61366, + "image_id": "CrowdHuman:data/data_4/283554,13b0000ade9cfd6.jpg:object:1", + "name": "sunglasses", + "description": "Dark sunglasses worn by the woman in the grey sweater. Source dataset: CrowdHuman. Scene context: Three women walk across a paved urban street, with one carrying a motorcycle helmet." + }, + { + "candidate_index": 6, + "source_offset": 18898, + "image_id": "CrowdHuman:data/data_18/282555,101ffe000a8c8717f.jpg:object:11", + "name": "metal railing", + "description": "A metal railing visible on the far right edge of the scene. Source dataset: CrowdHuman. Scene context: A group of people, possibly a tour group, is walking outdoors on a paved area near a stone building, with one person holding a green flag and another speaking into a microphone." + }, + { + "candidate_index": 7, + "source_offset": 185323, + "image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "name": "street light", + "description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals." + } + ], + "rng_seed": 1782870554, + "created_at": 1782292413.4124064 +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/bbox_overlay.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..a00a8cf748309553bd97f42ec2ff728b9ab0a391 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8d2a057286b878932521a1fba30aa7565d4a48bf5efa5f7ec98ee49fbce03 +size 1526435 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/compose_prompt.txt b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..98c7f33ed440a167ae1d6bd2f1865404f2ab367b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/compose_prompt.txt @@ -0,0 +1,111 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A bustling city intersection with crosswalks and wide sidewalks", + "activity": "Several pedestrians are crossing the street and walking along the curbside while a black SUV waits at the crosswalk", + "composition": "Eye-level camera view from the street level, framing the black SUV in the midground and pedestrians spread across the foreground and midground crossing paths", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_walking_away", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "description": "Person walking away, wearing a black jacket and dark pants.", + "role_in_scene": "walking away down the sidewalk on the right" + }, + { + "name": "pedestrian_with_backpack", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "description": "A man wearing a blue t-shirt and a backpack.", + "role_in_scene": "crossing the street in front of the SUV" + }, + { + "name": "pedestrian_in_red", + "source_index": 4, + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "description": "A person standing, wearing a bright red jacket.", + "role_in_scene": "standing at the street corner waiting to cross" + }, + { + "name": "pedestrian_in_striped_shirt", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "description": "Person wearing a striped shirt and dark pants.", + "role_in_scene": "walking briskly across the crosswalk" + }, + { + "name": "man_in_pink_shirt", + "source_index": 9, + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "description": "Man wearing a pink shirt and dark shorts.", + "role_in_scene": "standing near the curb waiting for a light" + }, + { + "name": "man_in_grey_sweater", + "source_index": 10, + "source_image_id": "CrowdHuman:data/data_8/284193,476300039ef5826.jpg:person:3", + "source_name": "man", + "description": "Man wearing a grey sweater.", + "role_in_scene": "walking towards the camera on the crosswalk" + }, + { + "name": "pedestrian_in_light_jacket", + "source_index": 11, + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "description": "Individual wearing a light-colored jacket.", + "role_in_scene": "walking on the sidewalk in the midground" + }, + { + "name": "pedestrian_in_light_blue", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "description": "A person in a light blue shirt walking away from the camera.", + "role_in_scene": "crossing the street away from the camera's view" + } + ], + "objects": [ + { + "name": "black_suv", + "source_index": 1, + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "description": "A black SUV.", + "role_in_scene": "stopped at the crosswalk yielding to pedestrians" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_black_suv.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_black_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..218e3c7e93af303f06678ee7ed210aabf13c397c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_black_suv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09349757ebd512a2a834ca9b7b4310b23725edff6c469a1a5b99bf1d22135e5 +size 414955 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_man_in_grey_sweater.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_man_in_grey_sweater.png new file mode 100644 index 0000000000000000000000000000000000000000..fbdf8c48b95f6c6acd4e8b41df64077617adbe68 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_man_in_grey_sweater.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f03d2170ee7220d473eb9eef95235c599fa06cee5ba64b76aafea576b57c4790 +size 112379 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_man_in_pink_shirt.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_man_in_pink_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..015b42b385b38875fdc3db2a6662f8acb30d7808 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_man_in_pink_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_light_blue.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_light_blue.png new file mode 100644 index 0000000000000000000000000000000000000000..09658791cc3486815dcee77ca582e2c55fe80a4f --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_light_blue.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc10648518fa16bb1e6dee3963de50296e217ea2278834ac4511906cc1d2ff8f +size 101403 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_light_jacket.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_light_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..dc3ab5a885813a876020942360f7574c61d9667b Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_light_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_red.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_red.png new file mode 100644 index 0000000000000000000000000000000000000000..1649ffdf6220ea546c2c06be832573bcdd00aede Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_red.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_striped_shirt.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_striped_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..1e919090b0664d6d3f5619714ac3de91e0c3de9d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_in_striped_shirt.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18ce3963798cea89fca97d77f24d82992b2fe4d3108272a7be581263d9703bac +size 136549 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..34396e269b49fa568fd18f814a4175b4a5696c08 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_walking_away.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72af29068c5267297e8c62eac41628e0015b84261563aa3e2d9a918f49aa72fd +size 111807 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_with_backpack.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_with_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..50d839f0cf741a3bfa2b87a37ee61991db8881df Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/detect_refine_pedestrian_with_backpack.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_black_suv.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_black_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..cd4e9ab2c11f89e2736b1f2c4a1c0588929f0365 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_black_suv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98460aa2da3b4efd8edf5533e0a8a12327e0aab550353fed3a7a545dd6326886 +size 516193 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_man_in_grey_sweater.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_man_in_grey_sweater.png new file mode 100644 index 0000000000000000000000000000000000000000..76ed3d35f30b765614c21b9cdca7a7a58aacd582 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_man_in_grey_sweater.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9a5126c95fa1219e54be4af6bf62c10d3938df7206bba23420f2f4b6566348f +size 207008 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_man_in_pink_shirt.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_man_in_pink_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..cac97d32f3f00d1d7fb2413130b4eb1032a5988e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_man_in_pink_shirt.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0d0dbd431cb11d65c418a20cdac279be0d58619a1c75fd6db99492b71c51b7 +size 112679 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_light_blue.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_light_blue.png new file mode 100644 index 0000000000000000000000000000000000000000..5bd2ea665b1ebfd08c8ebcb4541a79ca66091b0a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_light_blue.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b1abd5727db4008d0b18a7edff730837e90e6988d5fc7ca37a9dc277ff841fb +size 171066 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_light_jacket.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_light_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..68230fea8885f9d5103d980eab2f4bafb4789ce6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_light_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_red.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_red.png new file mode 100644 index 0000000000000000000000000000000000000000..3966d8218ced29ea52d817fe77ce9b4a9c4af63a Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_red.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_striped_shirt.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_striped_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..30ef28bc06838579c4d8cf34a72eadd6a9e1df1c --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_in_striped_shirt.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f23a80aa4b19f4f720821869eda96ca797fe27792560bffdd81ffb84a1868e0 +size 263408 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..685e8dddccbcc3af1234d7c6e6e177773605315a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_walking_away.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:398d57262a202de3dcfb2b5ce91e1d85c70cd1221157cb5f7fe9241f91fc9a52 +size 189349 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_with_backpack.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_with_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..981dc45a56b46a100ae9385cb3de3f4366c074f9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/crops/diversify_input_pedestrian_with_backpack.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a501e3b2cccca32bcdfcd827869eb3a0cec342ec25f6efb7367874ec19f7b92d +size 187167 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/detections.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..eaa29d8edacd39bbb0c216fef7fb2824b107acc8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/detections.json @@ -0,0 +1,173 @@ +[ + { + "name": "pedestrian_walking_away", + "present": true, + "bbox": [ + 0.8776, + 0.2931, + 0.9906, + 0.7623 + ], + "confidence": 0.98, + "notes": "A person walking away wearing a black jacket and dark pants.", + "coarse_bbox": [ + 0.877, + 0.294, + 0.991, + 0.762 + ], + "refine_crop": "crops/detect_refine_pedestrian_walking_away.png" + }, + { + "name": "pedestrian_with_backpack", + "present": true, + "bbox": [ + 0.2259, + 0.2854, + 0.322, + 0.6604 + ], + "confidence": 0.98, + "notes": "The closest same-category instance of 'pedestrian_with_backpack' is clearly visible in the crop, and a tight bounding box has been drawn around him.", + "coarse_bbox": [ + 0.227, + 0.286, + 0.318, + 0.657 + ], + "refine_crop": "crops/detect_refine_pedestrian_with_backpack.png" + }, + { + "name": "pedestrian_in_red", + "present": true, + "bbox": [ + 0.6576, + 0.3338, + 0.6899, + 0.5607 + ], + "confidence": 0.99, + "notes": "Pedestrian in the red jacket standing in the foreground.", + "coarse_bbox": [ + 0.657, + 0.334, + 0.687, + 0.551 + ], + "refine_crop": "crops/detect_refine_pedestrian_in_red.png" + }, + { + "name": "pedestrian_in_striped_shirt", + "present": true, + "bbox": [ + 0.433, + 0.3315, + 0.5713, + 0.6823 + ], + "confidence": 0.98, + "notes": "Tight crop around the person wearing a striped shirt and dark pants walking briskly across the crosswalk.", + "coarse_bbox": [ + 0.438, + 0.332, + 0.573, + 0.686 + ], + "refine_crop": "crops/detect_refine_pedestrian_in_striped_shirt.png" + }, + { + "name": "man_in_pink_shirt", + "present": true, + "bbox": [ + 0.7524, + 0.2743, + 0.8106, + 0.6687 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the man wearing a pink shirt and dark shorts.", + "coarse_bbox": [ + 0.747, + 0.275, + 0.808, + 0.668 + ], + "refine_crop": "crops/detect_refine_man_in_pink_shirt.png" + }, + { + "name": "man_in_grey_sweater", + "present": true, + "bbox": [ + 0.3541, + 0.2895, + 0.4483, + 0.7382 + ], + "confidence": 0.99, + "notes": "tight box containing the full visible body of the man.", + "coarse_bbox": [ + 0.353, + 0.289, + 0.447, + 0.734 + ], + "refine_crop": "crops/detect_refine_man_in_grey_sweater.png" + }, + { + "name": "pedestrian_in_light_jacket", + "present": true, + "bbox": [ + 0.5797, + 0.3113, + 0.6425, + 0.5493 + ], + "confidence": 0.99, + "notes": "A prominent individual walking, wearing a light-colored jacket.", + "coarse_bbox": [ + 0.582, + 0.309, + 0.644, + 0.55 + ], + "refine_crop": "crops/detect_refine_pedestrian_in_light_jacket.png" + }, + { + "name": "pedestrian_in_light_blue", + "present": true, + "bbox": [ + 0.0034, + 0.2952, + 0.1205, + 0.6424 + ], + "confidence": 0.95, + "notes": "Tight bounding box drawn around the entire visible pedestrian in the light blue shirt walking.", + "coarse_bbox": [ + 0.002, + 0.296, + 0.119, + 0.638 + ], + "refine_crop": "crops/detect_refine_pedestrian_in_light_blue.png" + }, + { + "name": "black_suv", + "present": true, + "bbox": [ + 0.0797, + 0.2941, + 0.5997, + 0.5875 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the prominent black SUV in the scene, which matches the object description.", + "coarse_bbox": [ + 0.077, + 0.294, + 0.597, + 0.589 + ], + "refine_crop": "crops/detect_refine_black_suv.png" + } +] diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/main_image.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..3bc0c0431a565535055ff7d431576f01b348227d --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8b6074e290062cefd97091b7fe0b7b99b998f4be1b8edfd09edb036d33c558a +size 1629947 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/plan.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..588ded9ea4472c09838fef1fef50e1876493b331 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/plan.json @@ -0,0 +1,221 @@ +{ + "sample_id": "sample_000010", + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A bustling city intersection with crosswalks and wide sidewalks", + "activity": "Several pedestrians are crossing the street and walking along the curbside while a black SUV waits at the crosswalk", + "composition": "Eye-level camera view from the street level, framing the black SUV in the midground and pedestrians spread across the foreground and midground crossing paths", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_walking_away", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "description": "Person walking away, wearing a black jacket and dark pants.", + "role_in_scene": "walking away down the sidewalk on the right" + }, + { + "name": "pedestrian_with_backpack", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "description": "A man wearing a blue t-shirt and a backpack.", + "role_in_scene": "crossing the street in front of the SUV" + }, + { + "name": "pedestrian_in_red", + "source_index": 4, + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "description": "A person standing, wearing a bright red jacket.", + "role_in_scene": "standing at the street corner waiting to cross" + }, + { + "name": "pedestrian_in_striped_shirt", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "description": "Person wearing a striped shirt and dark pants.", + "role_in_scene": "walking briskly across the crosswalk" + }, + { + "name": "man_in_pink_shirt", + "source_index": 9, + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "description": "Man wearing a pink shirt and dark shorts.", + "role_in_scene": "standing near the curb waiting for a light" + }, + { + "name": "man_in_grey_sweater", + "source_index": 10, + "source_image_id": "CrowdHuman:data/data_8/284193,476300039ef5826.jpg:person:3", + "source_name": "man", + "description": "Man wearing a grey sweater.", + "role_in_scene": "walking towards the camera on the crosswalk" + }, + { + "name": "pedestrian_in_light_jacket", + "source_index": 11, + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "description": "Individual wearing a light-colored jacket.", + "role_in_scene": "walking on the sidewalk in the midground" + }, + { + "name": "pedestrian_in_light_blue", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "description": "A person in a light blue shirt walking away from the camera.", + "role_in_scene": "crossing the street away from the camera's view" + } + ], + "objects": [ + { + "name": "black_suv", + "source_index": 1, + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "description": "A black SUV.", + "role_in_scene": "stopped at the crosswalk yielding to pedestrians" + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", + "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away down the sidewalk on the right", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_with_backpack", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", + "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: crossing the street in front of the SUV", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_in_red", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", + "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: standing at the street corner waiting to cross", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_in_striped_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", + "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: walking briskly across the crosswalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "man_in_pink_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", + "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: standing near the curb waiting for a light", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "man_in_grey_sweater", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_8/284193,476300039ef5826.jpg:person:3", + "source_name": "man", + "source_description": "Man wearing a grey sweater. Source dataset: CrowdHuman. Scene context: People are walking through an airport terminal with prominent overhead signage.", + "sub_caption": "man: Man wearing a grey sweater.. Scene role: walking towards the camera on the crosswalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_in_light_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", + "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: walking on the sidewalk in the midground", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_in_light_blue", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", + "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: crossing the street away from the camera's view", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "black_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", + "sub_caption": "black suv: A black SUV.. Scene role: stopped at the crosswalk yielding to pedestrians", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000010/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references.json new file mode 100644 index 0000000000000000000000000000000000000000..079521d795c510ace5330bb03e6fbca250b4b133 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references.json @@ -0,0 +1,293 @@ +{ + "references": [ + { + "name": "pedestrian_walking_away", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_walking_away.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 40.0, + 683.0, + 1002.0 + ], + "mask_score": 3.42052, + "mask_area_ratio": 0.145487, + "elapsed_seconds": 7.0967 + }, + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_with_backpack", + "ref_image": "references/ref_pedestrian_with_backpack.png", + "raw_ref_image": "references/raw_ref_pedestrian_with_backpack_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_with_backpack.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_with_backpack_attempt_01.png", + "output": "references/ref_pedestrian_with_backpack.png", + "mask": "references/sam_mask_pedestrian_with_backpack.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 36.0, + 682.0, + 1012.0 + ], + "mask_score": 3.441997, + "mask_area_ratio": 0.151945, + "elapsed_seconds": 8.5941 + }, + "reference_verify": "references/reference_verify_pedestrian_with_backpack.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_in_red", + "ref_image": "references/ref_pedestrian_in_red.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_red_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_in_red.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_red_attempt_01.png", + "output": "references/ref_pedestrian_in_red.png", + "mask": "references/sam_mask_pedestrian_in_red.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 52.0, + 699.0, + 1007.0 + ], + "mask_score": 3.430953, + "mask_area_ratio": 0.159512, + "elapsed_seconds": 7.0834 + }, + "reference_verify": "references/reference_verify_pedestrian_in_red.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_in_striped_shirt", + "ref_image": "references/ref_pedestrian_in_striped_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_in_striped_shirt.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", + "output": "references/ref_pedestrian_in_striped_shirt.png", + "mask": "references/sam_mask_pedestrian_in_striped_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 23.0, + 676.0, + 1011.0 + ], + "mask_score": 3.472095, + "mask_area_ratio": 0.152217, + "elapsed_seconds": 7.27 + }, + "reference_verify": "references/reference_verify_pedestrian_in_striped_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "man_in_pink_shirt", + "ref_image": "references/ref_man_in_pink_shirt.png", + "raw_ref_image": "references/raw_ref_man_in_pink_shirt_attempt_01.png", + "diversify_input": "crops/diversify_input_man_in_pink_shirt.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_pink_shirt_attempt_01.png", + "output": "references/ref_man_in_pink_shirt.png", + "mask": "references/sam_mask_man_in_pink_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 24.0, + 708.0, + 1000.0 + ], + "mask_score": 3.415589, + "mask_area_ratio": 0.161095, + "elapsed_seconds": 7.2651 + }, + "reference_verify": "references/reference_verify_man_in_pink_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "man_in_grey_sweater", + "ref_image": "references/ref_man_in_grey_sweater.png", + "raw_ref_image": "references/raw_ref_man_in_grey_sweater_attempt_01.png", + "diversify_input": "crops/diversify_input_man_in_grey_sweater.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_grey_sweater_attempt_01.png", + "output": "references/ref_man_in_grey_sweater.png", + "mask": "references/sam_mask_man_in_grey_sweater.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 47.0, + 683.0, + 1003.0 + ], + "mask_score": 3.491882, + "mask_area_ratio": 0.143696, + "elapsed_seconds": 7.1775 + }, + "reference_verify": "references/reference_verify_man_in_grey_sweater.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_in_light_jacket", + "ref_image": "references/ref_pedestrian_in_light_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_in_light_jacket.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", + "output": "references/ref_pedestrian_in_light_jacket.png", + "mask": "references/sam_mask_pedestrian_in_light_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 313.0, + 33.0, + 685.0, + 1017.0 + ], + "mask_score": 3.458198, + "mask_area_ratio": 0.174406, + "elapsed_seconds": 7.237 + }, + "reference_verify": "references/reference_verify_pedestrian_in_light_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_in_light_blue", + "ref_image": "references/ref_pedestrian_in_light_blue.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_light_blue_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_in_light_blue.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_blue_attempt_01.png", + "output": "references/ref_pedestrian_in_light_blue.png", + "mask": "references/sam_mask_pedestrian_in_light_blue.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 47.0, + 672.0, + 989.0 + ], + "mask_score": 3.478225, + "mask_area_ratio": 0.140584, + "elapsed_seconds": 7.065 + }, + "reference_verify": "references/reference_verify_pedestrian_in_light_blue.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "black_suv", + "ref_image": "references/ref_black_suv.png", + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "diversify_input": "crops/diversify_input_black_suv.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "references/ref_black_suv.png", + "mask": "references/sam_mask_black_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 1.0, + 273.0, + 1023.0, + 701.0 + ], + "mask_score": 3.159418, + "mask_area_ratio": 0.229866, + "elapsed_seconds": 7.2921 + }, + "reference_verify": "references/reference_verify_black_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_black_suv.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_black_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..ca0d06b866cf2801638421d4c5a841e828ca477f --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_black_suv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d9c64ac06aea0a84a2c2efbb00d9bc0eaccf80c9850a4729219c14b45f6ab3 +size 404005 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_man_in_grey_sweater.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_man_in_grey_sweater.png new file mode 100644 index 0000000000000000000000000000000000000000..6892b09362ebdf0dc604c17495541380b4da2f9a --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_man_in_grey_sweater.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c9715920e040601247b219c452bf4a76d25b9ac9987172604f8bf9f80243064 +size 315173 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_man_in_pink_shirt.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_man_in_pink_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..fc9a3a922533c46aca115ce112599449a621d498 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_man_in_pink_shirt.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8411b2733d0fb85bd960b7c02defb7b39fac407a8d6503e5d7261185d01bb659 +size 325333 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_light_blue.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_light_blue.png new file mode 100644 index 0000000000000000000000000000000000000000..7e73e7a9310c9ed19a2801b46162eda8975d8541 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_light_blue.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e522acfefedf127d98f2dc154a8be6c29578ebe05593a452ba9a7281edf886e +size 265484 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_light_jacket.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_light_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..e00a208c6c15038fbfdab717b827d2de9c6a6198 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_light_jacket.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72925592a51e30f70c5ce4681582105619421cdfda2057b3047cf03da9efebc4 +size 315367 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_red.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_red.png new file mode 100644 index 0000000000000000000000000000000000000000..9bb5ef6e6b84712b759092b718749d7c6d948f8b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_red.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:164a8ab843bb134d418b0d6a41f386cc0aad4d4751f8918ad40ff27dc231bc5c +size 320864 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_striped_shirt.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_striped_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..0dd23e947c45fb5afde5594fc64e332288645ef9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_in_striped_shirt.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edde6e28ddef78d4dafea3445bec0b732c2b848c71a1da890f50710ab4d9094b +size 293782 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..15000818ccc74e2f559754c9ea605d63017ea167 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_walking_away.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd6cf42ca545d7ffe31f72083bc78aade34b9fbf1100bd768b4776bbe73611d +size 263215 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_with_backpack.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_with_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..4a216f909d89785cbec6ab9cf9a1bc646efeba98 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/ref_pedestrian_with_backpack.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:723f1213da21840f9f212e9eff50139eb6c12b2b20dd7622fe9f947c00c70e87 +size 322451 diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_black_suv.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_black_suv.json new file mode 100644 index 0000000000000000000000000000000000000000..a520baa43e7aaae11846cbcc334cd2e0d536530b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_black_suv.json @@ -0,0 +1,46 @@ +{ + "name": "black_suv", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_black_suv_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_black_suv_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_black_suv_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_black_suv_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 1.0, + 273.0, + 1023.0, + 701.0 + ], + "mask_score": 3.159418, + "mask_area_ratio": 0.229866, + "elapsed_seconds": 7.2921 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated black SUV on a white background, which fully satisfies the hard requirements for a non-person subject reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_man_in_grey_sweater.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_man_in_grey_sweater.json new file mode 100644 index 0000000000000000000000000000000000000000..c954cf808ffb90f7b67aed9102be1333e7e4f636 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_man_in_grey_sweater.json @@ -0,0 +1,46 @@ +{ + "name": "man_in_grey_sweater", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_man_in_grey_sweater_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_man_in_grey_sweater_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_man_in_grey_sweater_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_grey_sweater_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_man_in_grey_sweater_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_man_in_grey_sweater_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 47.0, + 683.0, + 1003.0 + ], + "mask_score": 3.491882, + "mask_area_ratio": 0.143696, + "elapsed_seconds": 7.1775 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full-body view of a man wearing a grey sweater on a white background with sufficient margins." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_man_in_pink_shirt.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_man_in_pink_shirt.json new file mode 100644 index 0000000000000000000000000000000000000000..6e30c63fc8c1c8f033cbc3817d97bc03b2f3824b --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_man_in_pink_shirt.json @@ -0,0 +1,46 @@ +{ + "name": "man_in_pink_shirt", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_man_in_pink_shirt_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_man_in_pink_shirt_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_man_in_pink_shirt_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_pink_shirt_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_man_in_pink_shirt_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_man_in_pink_shirt_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 24.0, + 708.0, + 1000.0 + ], + "mask_score": 3.415589, + "mask_area_ratio": 0.161095, + "elapsed_seconds": 7.2651 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image meets all requirements for a full-body person reference on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_light_blue.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_light_blue.json new file mode 100644 index 0000000000000000000000000000000000000000..edc958cd82d0eb239c06c2d4d90a61ce3da2eb99 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_light_blue.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_in_light_blue", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_in_light_blue_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_in_light_blue_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_in_light_blue_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_blue_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_in_light_blue_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_in_light_blue_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 47.0, + 672.0, + 989.0 + ], + "mask_score": 3.478225, + "mask_area_ratio": 0.140584, + "elapsed_seconds": 7.065 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image meets all hard requirements for a person reference. The full body is visible with no cropping and sufficient white margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_light_jacket.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_light_jacket.json new file mode 100644 index 0000000000000000000000000000000000000000..970fe98af8a5b5314fb50ae4d60ad026336c7046 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_light_jacket.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_in_light_jacket", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_in_light_jacket_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_in_light_jacket_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_in_light_jacket_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_in_light_jacket_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 313.0, + 33.0, + 685.0, + 1017.0 + ], + "mask_score": 3.458198, + "mask_area_ratio": 0.174406, + "elapsed_seconds": 7.237 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a single person wearing a light-colored jacket, with no cropping and adequate white margin on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_red.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_red.json new file mode 100644 index 0000000000000000000000000000000000000000..020839a7f1e3afce6cc48860975bf4b60a3d83a5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_red.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_in_red", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_in_red_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_in_red_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_in_red_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_red_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_in_red_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_in_red_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 52.0, + 699.0, + 1007.0 + ], + "mask_score": 3.430953, + "mask_area_ratio": 0.159512, + "elapsed_seconds": 7.0834 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a single person wearing a red jacket, isolated on a white background, with no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_striped_shirt.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_striped_shirt.json new file mode 100644 index 0000000000000000000000000000000000000000..50879c7e384ce75b9963544058df600f79213667 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_in_striped_shirt.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_in_striped_shirt", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_in_striped_shirt_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_in_striped_shirt_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_in_striped_shirt_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_in_striped_shirt_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 23.0, + 676.0, + 1011.0 + ], + "mask_score": 3.472095, + "mask_area_ratio": 0.152217, + "elapsed_seconds": 7.27 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image perfectly shows the full body of the subject with a white background and sufficient margins." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_walking_away.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_walking_away.json new file mode 100644 index 0000000000000000000000000000000000000000..398fff8c9408381d0f5bbf36ba63ee81a9c7d661 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_walking_away.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_walking_away", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_walking_away_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_walking_away_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_walking_away_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_walking_away_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 40.0, + 683.0, + 1002.0 + ], + "mask_score": 3.42052, + "mask_area_ratio": 0.145487, + "elapsed_seconds": 7.0967 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The person is facing forward rather than walking away, but the image fully satisfies all hard requirements for completeness, isolation, margin, and lack of cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_with_backpack.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_with_backpack.json new file mode 100644 index 0000000000000000000000000000000000000000..d124faae2a46e725488c6df5fd5754de765f2415 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/reference_verify_pedestrian_with_backpack.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_with_backpack", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_with_backpack_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_with_backpack_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_with_backpack_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_with_backpack_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_with_backpack_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_with_backpack_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 36.0, + 682.0, + 1012.0 + ], + "mask_score": 3.441997, + "mask_area_ratio": 0.151945, + "elapsed_seconds": 8.5941 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The subject is fully visible from head to toe, alone on a white background, with no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_black_suv.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_black_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..cc5b6f1ba3f92eaeb8e6d2785dfd875c19c30713 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_black_suv.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_man_in_grey_sweater.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_man_in_grey_sweater.png new file mode 100644 index 0000000000000000000000000000000000000000..af44c868fcfa5fcd02e93a6586e159dc403a1e26 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_man_in_grey_sweater.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_man_in_pink_shirt.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_man_in_pink_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..4be66b4c5436020f98f50d2c034d0d9084fcf32d Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_man_in_pink_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_light_blue.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_light_blue.png new file mode 100644 index 0000000000000000000000000000000000000000..add669de228529f08f5fe8807c776f0d1e76d56e Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_light_blue.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_light_jacket.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_light_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..b7eb78b41083efa40ed1c78e6ab3622d5635e24a Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_light_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_red.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_red.png new file mode 100644 index 0000000000000000000000000000000000000000..a81755f76f0d2073ed58d45a24919a24147d0688 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_red.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_striped_shirt.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_striped_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..0a24463e108b0f2aaf54ffcbe8afc1e4fe0da9ae Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_in_striped_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..c162e1f11e54f3d6252943d62137f557e1d8882b Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_walking_away.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_with_backpack.png b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_with_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..1396c9e9dfaca2e20dec7e97c6f6f0f9f1d03439 Binary files /dev/null and b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/references/sam_mask_pedestrian_with_backpack.png differ diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/row.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/row.json new file mode 100644 index 0000000000000000000000000000000000000000..85aab57baeabc304cea7ec54a4356e94bb4b320e --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/row.json @@ -0,0 +1,440 @@ +{ + "sample_id": "sample_000010", + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 9, + "n_detected": 9, + "n_subjects": 9, + "subjects": [ + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", + "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away down the sidewalk on the right", + "measured_bbox": [ + 0.8776, + 0.2931, + 0.9906, + 0.7623 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 40.0, + 683.0, + 1002.0 + ], + "mask_score": 3.42052, + "mask_area_ratio": 0.145487, + "elapsed_seconds": 7.0967 + } + }, + { + "name": "pedestrian_with_backpack", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", + "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: crossing the street in front of the SUV", + "measured_bbox": [ + 0.2259, + 0.2854, + 0.322, + 0.6604 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_with_backpack.png", + "raw_ref_image": "references/raw_ref_pedestrian_with_backpack_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_with_backpack.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_with_backpack_attempt_01.png", + "output": "references/ref_pedestrian_with_backpack.png", + "mask": "references/sam_mask_pedestrian_with_backpack.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 36.0, + 682.0, + 1012.0 + ], + "mask_score": 3.441997, + "mask_area_ratio": 0.151945, + "elapsed_seconds": 8.5941 + } + }, + { + "name": "pedestrian_in_red", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", + "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: standing at the street corner waiting to cross", + "measured_bbox": [ + 0.6576, + 0.3338, + 0.6899, + 0.5607 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_red.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_red_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_red.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_red_attempt_01.png", + "output": "references/ref_pedestrian_in_red.png", + "mask": "references/sam_mask_pedestrian_in_red.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 331.0, + 52.0, + 699.0, + 1007.0 + ], + "mask_score": 3.430953, + "mask_area_ratio": 0.159512, + "elapsed_seconds": 7.0834 + } + }, + { + "name": "pedestrian_in_striped_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", + "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: walking briskly across the crosswalk", + "measured_bbox": [ + 0.433, + 0.3315, + 0.5713, + 0.6823 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_striped_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_striped_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_striped_shirt_attempt_01.png", + "output": "references/ref_pedestrian_in_striped_shirt.png", + "mask": "references/sam_mask_pedestrian_in_striped_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 23.0, + 676.0, + 1011.0 + ], + "mask_score": 3.472095, + "mask_area_ratio": 0.152217, + "elapsed_seconds": 7.27 + } + }, + { + "name": "man_in_pink_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", + "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: standing near the curb waiting for a light", + "measured_bbox": [ + 0.7524, + 0.2743, + 0.8106, + 0.6687 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_in_pink_shirt.png", + "raw_ref_image": "references/raw_ref_man_in_pink_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_man_in_pink_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_pink_shirt_attempt_01.png", + "output": "references/ref_man_in_pink_shirt.png", + "mask": "references/sam_mask_man_in_pink_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 24.0, + 708.0, + 1000.0 + ], + "mask_score": 3.415589, + "mask_area_ratio": 0.161095, + "elapsed_seconds": 7.2651 + } + }, + { + "name": "man_in_grey_sweater", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_8/284193,476300039ef5826.jpg:person:3", + "source_name": "man", + "source_description": "Man wearing a grey sweater. Source dataset: CrowdHuman. Scene context: People are walking through an airport terminal with prominent overhead signage.", + "sub_caption": "man: Man wearing a grey sweater.. Scene role: walking towards the camera on the crosswalk", + "measured_bbox": [ + 0.3541, + 0.2895, + 0.4483, + 0.7382 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_in_grey_sweater.png", + "raw_ref_image": "references/raw_ref_man_in_grey_sweater_attempt_01.png", + "reference_verify": "references/reference_verify_man_in_grey_sweater.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_in_grey_sweater_attempt_01.png", + "output": "references/ref_man_in_grey_sweater.png", + "mask": "references/sam_mask_man_in_grey_sweater.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 47.0, + 683.0, + 1003.0 + ], + "mask_score": 3.491882, + "mask_area_ratio": 0.143696, + "elapsed_seconds": 7.1775 + } + }, + { + "name": "pedestrian_in_light_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", + "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: walking on the sidewalk in the midground", + "measured_bbox": [ + 0.5797, + 0.3113, + 0.6425, + 0.5493 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_light_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_light_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_jacket_attempt_01.png", + "output": "references/ref_pedestrian_in_light_jacket.png", + "mask": "references/sam_mask_pedestrian_in_light_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 313.0, + 33.0, + 685.0, + 1017.0 + ], + "mask_score": 3.458198, + "mask_area_ratio": 0.174406, + "elapsed_seconds": 7.237 + } + }, + { + "name": "pedestrian_in_light_blue", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", + "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: crossing the street away from the camera's view", + "measured_bbox": [ + 0.0034, + 0.2952, + 0.1205, + 0.6424 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_light_blue.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_light_blue_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_light_blue.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_in_light_blue_attempt_01.png", + "output": "references/ref_pedestrian_in_light_blue.png", + "mask": "references/sam_mask_pedestrian_in_light_blue.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 341.0, + 47.0, + 672.0, + 989.0 + ], + "mask_score": 3.478225, + "mask_area_ratio": 0.140584, + "elapsed_seconds": 7.065 + } + }, + { + "name": "black_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", + "sub_caption": "black suv: A black SUV.. Scene role: stopped at the crosswalk yielding to pedestrians", + "measured_bbox": [ + 0.0797, + 0.2941, + 0.5997, + 0.5875 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_suv.png", + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "reference_verify": "references/reference_verify_black_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "references/ref_black_suv.png", + "mask": "references/sam_mask_black_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/dataset/tmp/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 1.0, + 273.0, + 1023.0, + 701.0 + ], + "mask_score": 3.159418, + "mask_area_ratio": 0.229866, + "elapsed_seconds": 7.2921 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/vocab_task.json b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..c4df4441bcf7c41dc8c0fde70228277376390b85 --- /dev/null +++ b/samples_v8/driving/BDD100K_CityPersons_CrowdHuman_samples_clean/sample_000010/vocab_task.json @@ -0,0 +1,140 @@ +{ + "task_id": "sample_000010", + "sample_id": "sample_000010", + "sample_index": 10, + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 73165, + "image_id": "CrowdHuman:data/data_34/273275,e38390009eb9d542.jpg:person:3", + "name": "adult man", + "description": "Standing in the back center, taller than the others. Wearing a dark blue t-shirt with 'Red Sox' in red lettering. Smiling and holding up two fingers on both hands. Source dataset: CrowdHuman. Scene context: A group of young people and an adult posing for a photo outdoors near a pond and greenery." + }, + { + "candidate_index": 1, + "source_offset": 141052, + "image_id": "CrowdHuman:data/data_58/273275,81578000b3bc0044.jpg:person:8", + "name": "athlete in maroon", + "description": "Standing near the middle back, wearing a maroon jersey with white and green accents. Source dataset: CrowdHuman. Scene context: A large group of female athletes in team uniforms poses together for a team photo on a grassy field." + }, + { + "candidate_index": 2, + "source_offset": 171457, + "image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "name": "pedestrian", + "description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral." + }, + { + "candidate_index": 3, + "source_offset": 193812, + "image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "name": "pedestrian", + "description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide." + }, + { + "candidate_index": 4, + "source_offset": 58750, + "image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "name": "pedestrian standing", + "description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground." + }, + { + "candidate_index": 5, + "source_offset": 23389, + "image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "name": "pedestrian", + "description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day." + }, + { + "candidate_index": 6, + "source_offset": 32556, + "image_id": "CrowdHuman:data/data_2/282555,955000086c7869b.jpg:person:5", + "name": "visitor", + "description": "A smaller figure, likely a child, wearing a red top and light-colored pants or shorts, standing next to a pedestal. Source dataset: CrowdHuman. Scene context: A large indoor sculpture gallery with classical and neoclassical statues displayed on pedestals, surrounded by ornate architecture and visitors walking and admiring the art." + }, + { + "candidate_index": 7, + "source_offset": 136805, + "image_id": "CrowdHuman:data/data_56/273278,d7bf10008b6d941a.jpg:person:14", + "name": "person being held", + "description": "A person whose legs are being held by the woman in the green jacket. They are wearing blue jeans and black and white sneakers. Source dataset: CrowdHuman. Scene context: A large group of people posing for a photo on and around the back of a red trolley bus on a city street." + }, + { + "candidate_index": 8, + "source_offset": 113079, + "image_id": "CrowdHuman:data/data_48/273278,dee9a00056a41b83.jpg:person:9", + "name": "medical professional", + "description": "Standing in the second row, wearing a white lab coat. Source dataset: CrowdHuman. Scene context: A large group of medical professionals is posing for a photograph on the outdoor steps of a brick building." + }, + { + "candidate_index": 9, + "source_offset": 95844, + "image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "name": "man talking to young man", + "description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through." + }, + { + "candidate_index": 10, + "source_offset": 191693, + "image_id": "CrowdHuman:data/data_8/284193,476300039ef5826.jpg:person:3", + "name": "man", + "description": "Man wearing a grey sweater. Source dataset: CrowdHuman. Scene context: People are walking through an airport terminal with prominent overhead signage." + }, + { + "candidate_index": 11, + "source_offset": 20832, + "image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "name": "pedestrian", + "description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below." + }, + { + "candidate_index": 12, + "source_offset": 88333, + "image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "name": "pedestrian", + "description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky." + }, + { + "candidate_index": 13, + "source_offset": 78183, + "image_id": "CrowdHuman:data/data_35/273278,10d613000f22b872d.jpg:person:2", + "name": "woman", + "description": "Standing on the left side of the promenade, wearing a sleeveless pink and white patterned dress. Source dataset: CrowdHuman. Scene context: A bustling waterfront promenade with people walking, dining under awnings, and a clock tower in the background on a sunny day." + }, + { + "candidate_index": 14, + "source_offset": 147258, + "image_id": "CrowdHuman:data/data_6/283554,110da0008553b110.jpg:person:1", + "name": "male student in yellow", + "description": "A young man on the left side, wearing a bright yellow t-shirt and black shorts, holding onto a barre and practicing a pose. Source dataset: CrowdHuman. Scene context: A female instructor is leading a group of young men in what appears to be a ballet or dance exercise using barres in a studio." + }, + { + "candidate_index": 15, + "source_offset": 39909, + "image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", + "name": "distant pedestrian", + "description": "A person walking in the background. Source dataset: CrowdHuman. Scene context: People walk through an outdoor plaza area with modern architecture, an outdoor seating section with red chairs on the left, and planters with yellow and blue flowers on the right." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 156087, + "image_id": "BDD100K:b6b616b3-979e75c3:object:6", + "name": "street sign", + "description": "A rectangular street sign is visible overhead on the right side of the street, partially obscured by the dirty windshield. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street, looking through a dirty or condensation-covered windshield at traffic and buildings ahead." + }, + { + "candidate_index": 1, + "source_offset": 166049, + "image_id": "BDD100K:b8fe1054-42625c45:object:1", + "name": "black suv", + "description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day." + } + ], + "rng_seed": 1782975283, + "created_at": 1782292413.4255238 +}