MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/image-object-detection/yolo_onnx_preprocessing_utils.py
import cv2
import numpy as np
import torch
import time
import torchvision
from PIL import Image
from typing import Any, Dict, List


def letterbox(
img,
new_shape=(640, 640),
color=(114, 114, 114),
auto=True,
scaleFill=False,
scaleup=True,
):
"""Resize image to a 32-pixel-multiple rectangle
https://github.com/ultralytics/yolov3/issues/232
:param img: an image
:type img: <class 'numpy.ndarray'>
:param new_shape: target shape in [height, width]
:type new_shape: <class 'int'>
:param color: color for pad area
:type color: <class 'tuple'>
:param auto: minimum rectangle
:type auto: bool
:param scaleFill: stretch the image without pad
:type scaleFill: bool
:param scaleup: scale up
:type scaleup: bool
:return: letterbox image, scale ratio, padded area in (width, height) in each side
:rtype: <class 'numpy.ndarray'>, <class 'tuple'>, <class 'tuple'>
"""
shape = img.shape[:2] # current shape [height, width]
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
if not scaleup: # only scale down, do not scale up (for better test mAP)
r = min(r, 1.0)
# Compute padding
ratio = r, r # width, height ratios
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
if auto: # minimum rectangle
dw, dh = np.mod(dw, 64), np.mod(dh, 64) # wh padding
elif scaleFill: # stretch
dw, dh = 0.0, 0.0
new_unpad = (new_shape[1], new_shape[0])
ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
dw /= 2 # divide padding into 2 sides
dh /= 2
if shape[::-1] != new_unpad: # resize
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
) # add border
return img, ratio, (dw, dh)
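
# Usage sketch for letterbox (hedged): `frame` below is a hypothetical
# 480x640 BGR array such as cv2.imread would return. With auto=False and
# scaleup=False, a 480x640 input maps onto a 640x640 canvas with 80 px of
# gray (114) padding added to the top and bottom:
#
#     frame = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
#     boxed, ratio, (dw, dh) = letterbox(frame, new_shape=640, auto=False, scaleup=False)
#     # boxed.shape == (640, 640, 3); ratio == (1.0, 1.0); (dw, dh) == (0.0, 80.0)
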
def clip_coords(boxes, img_shape):
"""Clip bounding xyxy bounding boxes to image shape (height, width)
:param boxes: bbox
:type boxes: <class 'torch.Tensor'>
:return: img_shape: image shape
:rtype: img_shape: <class 'tuple'>: (height, width)
"""
boxes[:, 0].clamp_(0, img_shape[1]) # x1
boxes[:, 1].clamp_(0, img_shape[0]) # y1
boxes[:, 2].clamp_(0, img_shape[1]) # x2
boxes[:, 3].clamp_(0, img_shape[0]) # y2
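
# Example (hedged): clip_coords mutates its input in place; a hypothetical
# out-of-bounds box is clamped to a 480x640 image:
#
#     b = torch.tensor([[-10.0, 5.0, 700.0, 500.0]])
#     clip_coords(b, (480, 640))
#     # b is now [[0.0, 5.0, 640.0, 480.0]]
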
def unpad_bbox(boxes, img_shape, pad):
"""Correct bbox coordinates by removing the padded area from letterbox image
:param boxes: bbox absolute coordinates from prediction
:type boxes: <class 'torch.Tensor'>
:param img_shape: image shape
:type img_shape: <class 'tuple'>: (height, width)
:param pad: pad used in letterbox image for inference
:type pad: <class 'tuple'>: (width, height)
:return: (unpadded) image height and width
:rtype: <class 'tuple'>: (height, width)
"""
dw, dh = pad
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
img_width = img_shape[1] - (left + right)
img_height = img_shape[0] - (top + bottom)
if boxes is not None:
boxes[:, 0] -= left
boxes[:, 1] -= top
boxes[:, 2] -= left
boxes[:, 3] -= top
clip_coords(boxes, (img_height, img_width))
return img_height, img_width
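
# Example (hedged), continuing the letterbox sketch above: with padding
# (dw, dh) == (0.0, 80.0) on a 640x640 letterboxed image, boxes shift up
# by 80 px and the effective image size is recovered as 480x640:
#
#     b = torch.tensor([[0.0, 80.0, 640.0, 560.0]])
#     h, w = unpad_bbox(b, (640, 640), (0.0, 80.0))
#     # (h, w) == (480, 640); b is now [[0.0, 0.0, 640.0, 480.0]]
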
def _convert_to_rcnn_output(output, height, width, pad):
    """Convert NMS output (nx6: x1, y1, x2, y2, conf, cls) to a Faster R-CNN-style dict of boxes, labels and scores."""
rcnn_label: Dict[str, List[Any]] = {"boxes": [], "labels": [], "scores": []}
# Adjust bbox to effective image bounds
img_height, img_width = unpad_bbox(
output[:, :4] if output is not None else None, (height, width), pad
)
if output is not None:
rcnn_label["boxes"] = output[:, :4]
rcnn_label["labels"] = output[:, 5:6].long()
rcnn_label["scores"] = output[:, 4:5]
return rcnn_label, (img_height, img_width)
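
# Example (hedged): one NMS row [x1, y1, x2, y2, conf, cls] becomes a
# Faster R-CNN-style dict; the box coordinates are unpadded in place:
#
#     out = torch.tensor([[10.0, 20.0, 110.0, 220.0, 0.9, 2.0]])
#     label, (h, w) = _convert_to_rcnn_output(out, 640, 640, (0.0, 80.0))
#     # (h, w) == (480, 640); label["labels"] == tensor([[2]]); label["scores"] == tensor([[0.9000]])
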
def xywh2xyxy(x):
"""Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
:param x: bbox coordinates in [x center, y center, w, h]
:type x: <class 'numpy.ndarray'> or torch.Tensor
:return: new bbox coordinates in [x1, y1, x2, y2]
:rtype: <class 'numpy.ndarray'> or torch.Tensor
"""
y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
return y
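
# Example (hedged): a 50x20 box centered at (100, 100) in xywh form
# converts to corner (xyxy) form:
#
#     xywh2xyxy(torch.tensor([[100.0, 100.0, 50.0, 20.0]]))
#     # -> tensor([[ 75.,  90., 125., 110.]])
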
def box_iou(box1, box2):
"""Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
:param box1: bbox in (Tensor[N, 4]), N for multiple bboxes and 4 for the box coordinates
:type box1: <class 'torch.Tensor'>
:param box2: bbox in (Tensor[M, 4]), M is for multiple bboxes
:type box2: <class 'torch.Tensor'>
    :return: iou of box1 to box2 in (Tensor[N, M]), the NxM matrix containing the pairwise
        IoU values for every pair of boxes in box1 and box2
:rtype: <class 'torch.Tensor'>
"""
def box_area(box):
# box = 4xn
return (box[2] - box[0]) * (box[3] - box[1])
area1 = box_area(box1.t())
area2 = box_area(box2.t())
# inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
inter = (
(
torch.min(box1[:, None, 2:], box2[:, 2:])
- torch.max(box1[:, None, :2], box2[:, :2])
)
.clamp(0)
.prod(2)
)
return inter / (
area1[:, None] + area2 - inter
) # iou = inter / (area1 + area2 - inter)
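
# Example (hedged): two 10x10 boxes overlapping in a 5x5 corner give
# IoU 25 / (100 + 100 - 25) ≈ 0.1429:
#
#     a = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
#     b = torch.tensor([[5.0, 5.0, 15.0, 15.0]])
#     box_iou(a, b)  # -> tensor([[0.1429]])
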
def non_max_suppression(
prediction,
conf_thres=0.1,
iou_thres=0.6,
multi_label=False,
merge=False,
classes=None,
agnostic=False,
):
"""Performs per-class Non-Maximum Suppression (NMS) on inference results
:param prediction: predictions
:type prediction: <class 'torch.Tensor'>
:param conf_thres: confidence threshold
:type conf_thres: float
:param iou_thres: IoU threshold
:type iou_thres: float
    :param multi_label: whether to allow multiple labels per box
    :type multi_label: bool
    :param merge: use merge-NMS (boxes merged using weighted mean)
    :type merge: bool
    :param classes: keep only detections of these target classes (None keeps all)
    :type classes: <class 'list'>
    :param agnostic: enable class-agnostic NMS
    :type agnostic: bool
:return: detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
:rtype: <class 'list'>
"""
if prediction.dtype is torch.float16:
prediction = prediction.float() # to FP32
nc = prediction[0].shape[1] - 5 # number of classes
xc = prediction[..., 4] > conf_thres # candidates
# min_wh = 2
max_wh = 4096 # (pixels) maximum box width and height
max_det = 300 # maximum number of detections per image
time_limit = 10.0 # seconds to quit after
redundant = True # require redundant detections
if multi_label and nc < 2:
multi_label = False # multiple labels per box (adds 0.5ms/img)
t = time.time()
output = [None] * prediction.shape[0]
for xi, x in enumerate(prediction): # image index, image inference
# Apply constraints
# x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
x = x[xc[xi]] # confidence
# If none remain process next image
if not x.shape[0]:
continue
# Compute conf
x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
# Box (center x, center y, width, height) to (x1, y1, x2, y2)
box = xywh2xyxy(x[:, :4])
# Detections matrix nx6 (xyxy, conf, cls)
if multi_label:
i, j = (x[:, 5:] > conf_thres).nonzero().t()
x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
else: # best class only
conf, j = x[:, 5:].max(1, keepdim=True)
x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
# Filter by class
if classes:
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
# Apply finite constraint
# if not torch.isfinite(x).all():
# x = x[torch.isfinite(x).all(1)]
# If none remain process next image
n = x.shape[0] # number of boxes
if not n:
continue
# Sort by confidence
# x = x[x[:, 4].argsort(descending=True)]
# Batched NMS
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
i = torchvision.ops.boxes.nms(boxes, scores, iou_thres)
if i.shape[0] > max_det: # limit detections
i = i[:max_det]
if merge and (1 < n < 3e3):
try: # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
weights = iou * scores[None] # box weights
x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(
1, keepdim=True
) # merged boxes
if redundant:
i = i[iou.sum(1) > 1] # require redundancy
            except Exception:
                # possible CUDA error https://github.com/ultralytics/yolov3/issues/1139
                print(
                    "[WARNING: possible CUDA error ({} {} {} {})]".format(
                        x, i, x.shape, i.shape
                    )
                )
output[xi] = x[i]
if (time.time() - t) > time_limit:
break # time limit exceeded
return output
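
# Usage sketch (hedged): a raw YOLO head output has shape
# (batch, num_boxes, 5 + num_classes), holding [cx, cy, w, h, obj, cls...].
# Two heavily overlapping candidates for class 0 collapse to one detection:
#
#     pred = torch.tensor([[
#         [100.0, 100.0, 50.0, 50.0, 0.90, 0.9, 0.1],
#         [102.0, 101.0, 50.0, 50.0, 0.80, 0.8, 0.2],
#     ]])
#     dets = non_max_suppression(pred, conf_thres=0.1, iou_thres=0.5)
#     # dets[0] is an nx6 tensor (x1, y1, x2, y2, conf, cls); here n == 1
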
def _read_image(ignore_data_errors: bool, image_url: str, use_cv2: bool = False):
try:
if use_cv2:
# cv2 can return None in some error cases
img = cv2.imread(image_url) # BGR
if img is None:
print("cv2.imread returned None")
return img
else:
image = Image.open(image_url).convert("RGB")
return image
except Exception as ex:
if ignore_data_errors:
msg = "Exception occurred when trying to read the image. This file will be ignored."
print(msg)
        else:
            print(str(ex))
        return None
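
# Note (hedged; "sample.jpg" is a hypothetical path): with use_cv2=True the
# helper returns a BGR numpy array (or None on failure); otherwise it
# returns an RGB PIL.Image:
#
#     img = _read_image(ignore_data_errors=True, image_url="sample.jpg", use_cv2=True)
#     # img is an HxWx3 uint8 BGR array, or None if the file is unreadable
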
def preprocess(image_url, img_size=640):
    """Read an image and letterbox it, returning a normalized NCHW batch and the letterbox padding.
    :param image_url: path to the image
    :type image_url: <class 'str'>
    :param img_size: target letterbox size
    :type img_size: <class 'int'>
    :return: preprocessed image as 1 x 3 x img_size x img_size float32 array in [0, 1], and pad
    :rtype: <class 'numpy.ndarray'>, <class 'tuple'>
    """
    img0 = _read_image(
        ignore_data_errors=False, image_url=image_url, use_cv2=True
    )  # BGR
    if img0 is None:
        return None, None
    img, ratio, pad = letterbox(img0, new_shape=img_size, auto=False, scaleup=False)
    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW
    img = np.ascontiguousarray(img)
    np_image = np.expand_dims(img, axis=0)  # add batch dimension
    np_image = np_image.astype(np.float32) / 255.0  # normalize to [0, 1]
    return np_image, pad
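
# End-to-end sketch (hedged): "yolo_model.onnx" and "sample.jpg" are
# hypothetical paths, and the input/output layout depends on the actual
# export. The raw model output is run through non_max_suppression, then
# mapped back to the original image bounds via _convert_to_rcnn_output:
#
#     import onnxruntime
#     sess = onnxruntime.InferenceSession("yolo_model.onnx")
#     np_image, pad = preprocess("sample.jpg", img_size=640)
#     raw = sess.run(None, {sess.get_inputs()[0].name: np_image})[0]
#     dets = non_max_suppression(torch.from_numpy(raw), conf_thres=0.1, iou_thres=0.5)
#     label, (h, w) = _convert_to_rcnn_output(dets[0], 640, 640, pad)
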