|
|
| """Utility classes and functions for image processing and ROI operations.
|
|
|
| Copyright (c) Microsoft Corporation.
|
|
|
| MIT License
|
|
|
| Permission is hereby granted, free of charge, to any person obtaining a copy
|
| of this software and associated documentation files (the "Software"), to deal
|
| in the Software without restriction, including without limitation the rights
|
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| copies of the Software, and to permit persons to whom the Software is
|
| furnished to do so, subject to the following conditions:
|
|
|
| The above copyright notice and this permission notice shall be included in all
|
| copies or substantial portions of the Software.
|
|
|
| THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| SOFTWARE.
|
| """
|
|
|
| import cv2
|
| import numpy as np
|
|
|
| ONNX_EP = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
| UINT8_MAX = np.iinfo(np.uint8).max
|
| UINT16_MAX = np.iinfo(np.uint16).max
|
|
|
|
|
| class ImageFormatError(Exception):
|
| """Exception raised for invalid image formats."""
|
|
|
| pass
|
|
|
|
|
| class ModelNotFoundError(Exception):
|
| """Exception raised when model file is not found."""
|
|
|
| pass
|
|
|
|
|
| def preprocess_img(img: np.ndarray) -> np.ndarray:
|
| """Preprocesses a BGR image for DNN. Turning to float if not already and normalizing to [0, 1].
|
|
|
| Normalization of uint images is done by dividing by brightest possible value (e.g. 255 for uint8).
|
|
|
| Arguments:
|
| img: The image to preprocess, can be uint8, uint16, float16, float32 or float64.
|
|
|
| Returns:
|
| The preprocessed image in np.float32 format.
|
|
|
| Raises:
|
| ImageFormatError: If the image is not three channels or not uint8, uint16, float16, float32 or float64.
|
| """
|
| if img.ndim != 3 or img.shape[2] != 3:
|
| raise ImageFormatError("image must be 3 channels, got shape: {img.shape}")
|
| if img.dtype not in [np.uint8, np.uint16, np.float16, np.float32, np.float64]:
|
| raise ImageFormatError("image must be uint8 or float16, float32, float64")
|
|
|
| if img.dtype == np.uint8:
|
| img = img.astype(np.float32) / UINT8_MAX
|
| if img.dtype == np.uint16:
|
| img = img.astype(np.float32) / UINT16_MAX
|
| img = np.clip(img, 0, 1)
|
| return img.astype(np.float32)
|
|
|
|
|
| def prepare_image_for_model(image: np.ndarray, roi_size: int = 512) -> tuple[np.ndarray, dict]:
|
| """Prepare any input image for model inference by resizing to roi_size x roi_size.
|
|
|
| This function takes an image of any size and prepares it for a model that expects
|
| a square input (e.g., 512x512). It handles aspect ratio preservation by padding
|
| with replicated border values.
|
|
|
| Args:
|
| image: Input image of any size
|
| roi_size: Target size for the model (default 512)
|
|
|
| Returns:
|
| tuple: (preprocessed_image, metadata_dict)
|
| - preprocessed_image: Image resized to roi_size x roi_size
|
| - metadata_dict: Contains information needed to composite back to original size
|
| """
|
|
|
| original_shape = image.shape[:2]
|
|
|
|
|
| if original_shape[0] < original_shape[1]:
|
| pad_h = (original_shape[1] - original_shape[0]) // 2
|
| pad_w = 0
|
| pad_h_extra = original_shape[1] - original_shape[0] - pad_h
|
| pad_w_extra = 0
|
| elif original_shape[0] > original_shape[1]:
|
| pad_w = (original_shape[0] - original_shape[1]) // 2
|
| pad_h = 0
|
| pad_w_extra = original_shape[0] - original_shape[1] - pad_w
|
| pad_h_extra = 0
|
| else:
|
| pad_h = pad_w = pad_h_extra = pad_w_extra = 0
|
|
|
|
|
| padded_image = cv2.copyMakeBorder(
|
| image,
|
| top=pad_h,
|
| bottom=pad_h_extra,
|
| left=pad_w,
|
| right=pad_w_extra,
|
| borderType=cv2.BORDER_REPLICATE,
|
| )
|
|
|
| square_shape = padded_image.shape[:2]
|
|
|
| while padded_image.shape[1] > roi_size * 3 and padded_image.shape[0] > roi_size * 3:
|
| padded_image = cv2.pyrDown(padded_image)
|
|
|
| resized_image = cv2.resize(padded_image, (roi_size, roi_size), interpolation=cv2.INTER_LINEAR)
|
|
|
| metadata = {
|
| "original_shape": original_shape,
|
| "square_shape": square_shape,
|
| "original_padding": (pad_h, pad_w, pad_h_extra, pad_w_extra),
|
| }
|
|
|
| return resized_image, metadata
|
|
|
|
|
| def composite_model_output_to_image(
|
| model_output: np.ndarray, metadata: dict, interp_mode: int = cv2.INTER_NEAREST
|
| ) -> np.ndarray:
|
| """Composite model output back to the original image size.
|
|
|
| Takes the model output (which should be roi_size x roi_size) and composites it
|
| back to the original image dimensions using the metadata from prepare_image_for_model.
|
|
|
| Args:
|
| model_output: Output from the model (roi_size x roi_size)
|
| metadata: Metadata dict returned from prepare_image_for_model
|
| interp_mode: Interpolation mode for resizing (default INTER_NEAREST for discrete outputs)
|
|
|
| Returns:
|
| np.ndarray: Output composited to original image size
|
| """
|
| pad_h, pad_w, pad_h_extra, pad_w_extra = metadata["original_padding"]
|
|
|
|
|
| square_shape = metadata["square_shape"]
|
| resized_to_square = cv2.resize(model_output, (square_shape[1], square_shape[0]), interpolation=interp_mode)
|
|
|
|
|
| if pad_h > 0 or pad_h_extra > 0:
|
| final_output = resized_to_square[pad_h : square_shape[0] - pad_h_extra, :]
|
| elif pad_w > 0 or pad_w_extra > 0:
|
| final_output = resized_to_square[:, pad_w : square_shape[1] - pad_w_extra]
|
| else:
|
| final_output = resized_to_square
|
|
|
| return final_output
|
|
|