From 8b40feb982226470277c250721fea3dcb1d69746 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@outlook.com>
Date: Tue, 5 Oct 2021 07:43:44 -0400
Subject: [PATCH 01/16] Add Visualizer from detectron2

---
 yolort/utils/visualizer.py | 444 +++++++++++++++++++++++++++++++++++++
 1 file changed, 444 insertions(+)
 create mode 100644 yolort/utils/visualizer.py

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
new file mode 100644
index 00000000..040e94ac
--- /dev/null
+++ b/yolort/utils/visualizer.py
@@ -0,0 +1,444 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import colorsys
+import logging
+import matplotlib as mpl
+import matplotlib.figure as mplfigure
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+import matplotlib.colors as mplc
+import numpy as np
+
+from torchvision.ops.boxes import box_convert
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["VisImage", "Visualizer"]
+
+_SMALL_OBJECT_AREA_THRESH = 1000
+
+
+def _create_text_labels(classes, scores, class_names, is_crowd=None):
+    """
+    Args:
+        classes (list[int] or None):
+        scores (list[float] or None):
+        class_names (list[str] or None):
+        is_crowd (list[bool] or None):
+
+    Returns:
+        list[str] or None
+    """
+    labels = None
+    if classes is not None:
+        if class_names is not None and len(class_names) > 0:
+            labels = [class_names[i] for i in classes]
+        else:
+            labels = [str(i) for i in classes]
+    if scores is not None:
+        if labels is None:
+            labels = ["{:.0f}%".format(s * 100) for s in scores]
+        else:
+            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
+    if labels is not None and is_crowd is not None:
+        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
+    return labels
+
+
+class VisImage:
+    def __init__(self, img, scale=1.0):
+        """
+        Args:
+            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
+            scale (float): scale the input image
+        """
+        self.img = img
+        self.scale = scale
+        self.width, self.height = img.shape[1], img.shape[0]
+        self._setup_figure(img)
+
+    def _setup_figure(self, img):
+        """
+        Args:
+            Same as in :meth:`__init__()`.
+
+        Returns:
+            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
+            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
+        """
+        fig = mplfigure.Figure(frameon=False)
+        self.dpi = fig.get_dpi()
+        # add a small 1e-2 to avoid precision loss due to matplotlib's truncation
+        # (https://github.com/matplotlib/matplotlib/issues/15363)
+        fig.set_size_inches(
+            (self.width * self.scale + 1e-2) / self.dpi,
+            (self.height * self.scale + 1e-2) / self.dpi,
+        )
+        self.canvas = FigureCanvasAgg(fig)
+        # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
+        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
+        ax.axis("off")
+        self.fig = fig
+        self.ax = ax
+        self.reset_image(img)
+
+    def reset_image(self, img):
+        """
+        Args:
+            img: same as in __init__
+        """
+        img = img.astype("uint8")
+        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
+
+    def save(self, filepath):
+        """
+        Args:
+            filepath (str): a string that contains the absolute path, including the file name, where
+                the visualized image will be saved.
+        """
+        self.fig.savefig(filepath)
+
+    def get_image(self):
+        """
+        Returns:
+            ndarray:
+                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
+                The shape is scaled w.r.t the input image using the given `scale` argument.
+        """
+        canvas = self.canvas
+        s, (width, height) = canvas.print_to_buffer()
+        # buf = io.BytesIO()  # works for cairo backend
+        # canvas.print_rgba(buf)
+        # width, height = self.width, self.height
+        # s = buf.getvalue()
+
+        buffer = np.frombuffer(s, dtype="uint8")
+
+        img_rgba = buffer.reshape(height, width, 4)
+        rgb, _ = np.split(img_rgba, [3], axis=2)
+        return rgb.astype("uint8")
+
+
+class Visualizer:
+    """
+    Visualizer that draws data about detection on images.
+
+    It contains methods like `draw_{text,box}`
+    that draw primitive objects to images, as well as high-level wrappers like
+    `draw_{instance_predictions,dataset_dict}`
+    that draw composite data in some pre-defined style.
+
+    Note that the exact visualization style for the high-level wrappers is subject to change.
+    Style such as color, opacity, label contents, visibility of labels, or even the visibility
+    of objects themselves (e.g. when the object is too small) may change according
+    to different heuristics, as long as the results still look visually reasonable.
+
+    To obtain a consistent style, you can implement custom drawing functions with the
+    abovementioned primitive methods instead. If you need more customized visualization
+    styles, you can process the data yourself following their format documented in
+    tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
+    intend to satisfy everyone's preference on drawing styles.
+
+    This visualizer focuses on high rendering quality rather than performance. It is not
+    designed to be used for real-time applications.
+    """
+
+    def __init__(self, img_rgb, metadata=None, scale=1.0):
+        """
+        Args:
+            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
+                the height and width of the image respectively. C is the number of
+                color channels. The image is required to be in RGB format since that
+                is a requirement of the Matplotlib library. The image is also expected
+                to be in the range [0, 255].
+            metadata (Metadata): dataset metadata (e.g. class names and colors)
+        """
+        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
+        if metadata is None:
+            metadata = MetadataCatalog.get("__nonexist__")
+        self.metadata = metadata
+        self.output = VisImage(self.img, scale=scale)
+
+        # too small texts are useless, therefore clamp to a minimum of 10 // scale
+        self._default_font_size = max(
+            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
+        )
+
+    def draw_instance_predictions(self, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores".
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+
+        colors = None
+
+        self.overlay_instances(boxes=boxes, labels=labels, assigned_colors=colors)
+        return self.output
+
+    def draw_dataset_dict(self, dic):
+        """
+        Draw annotations/segmentations in Detectron2 Dataset format.
+
+        Args:
+            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        annos = dic.get("annotations", None)
+        if annos:
+            boxes = [
+                box_convert(x["bbox"], x["bbox_mode"])
+                if len(x["bbox"]) == 4
+                else x["bbox"]
+                for x in annos
+            ]
+
+            colors = None
+            category_ids = [x["category_id"] for x in annos]
+            if self.metadata.get("thing_colors"):
+                colors = [
+                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
+                    for c in category_ids
+                ]
+            names = self.metadata.get("thing_classes", None)
+            labels = _create_text_labels(
+                category_ids,
+                scores=None,
+                class_names=names,
+                is_crowd=[x.get("iscrowd", 0) for x in annos],
+            )
+            self.overlay_instances(labels=labels, boxes=boxes, assigned_colors=colors)
+
+        return self.output
+
+    def overlay_instances(
+        self,
+        *,
+        boxes=None,
+        labels=None,
+        assigned_colors=None,
+    ):
+        """
+        Args:
+            boxes (Boxes or ndarray): either a :class:`Boxes`,
+                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
+                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image,
+            labels (list[str]): the text to be displayed for each instance.
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = 0
+        if boxes is not None:
+            boxes = self._convert_boxes(boxes)
+            num_instances = len(boxes)
+        if labels is not None:
+            assert len(labels) == num_instances
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+
+        # Display in largest to smallest order to reduce occlusion.
+        areas = None
+        if boxes is not None:
+            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+
+            if labels is not None:
+                # first get a box
+                x0, y0, x1, y1 = boxes[i]
+                text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                horiz_align = "left"
+
+                # for small objects, draw text at the side to avoid occlusion
+                instance_area = (y1 - y0) * (x1 - x0)
+                if (
+                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+                    or y1 - y0 < 40 * self.output.scale
+                ):
+                    if y1 >= self.output.height - 5:
+                        text_pos = (x1, y0)
+                    else:
+                        text_pos = (x0, y1)
+
+                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                font_size = (
+                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+                    * 0.5
+                    * self._default_font_size
+                )
+                self.draw_text(
+                    labels[i],
+                    text_pos,
+                    color=lighter_color,
+                    horizontal_alignment=horiz_align,
+                    font_size=font_size,
+                )
+
+        return self.output
+
+    """
+    Primitive drawing functions:
+    """
+
+    def draw_text(
+        self,
+        text,
+        position,
+        *,
+        font_size=None,
+        color="g",
+        horizontal_alignment="center",
+        rotation=0
+    ):
+        """
+        Args:
+            text (str): class label
+            position (tuple): a tuple of the x and y coordinates to place text on image.
+            font_size (int, optional): font size of the text. If not provided, the default
+                font size derived from the image dimensions in `__init__` is used.
+            color: color of the text. Refer to `matplotlib.colors` for full list
+                of formats that are accepted.
+            horizontal_alignment (str): see `matplotlib.text.Text`
+            rotation: rotation angle in degrees CCW
+
+        Returns:
+            output (VisImage): image object with text drawn.
+        """
+        if not font_size:
+            font_size = self._default_font_size
+
+        # since the text background is dark, we don't want the text to be dark
+        color = np.maximum(list(mplc.to_rgb(color)), 0.2)
+        color[np.argmax(color)] = max(0.8, np.max(color))
+
+        x, y = position
+        self.output.ax.text(
+            x,
+            y,
+            text,
+            size=font_size * self.output.scale,
+            family="sans-serif",
+            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
+            verticalalignment="top",
+            horizontalalignment=horizontal_alignment,
+            color=color,
+            zorder=10,
+            rotation=rotation,
+        )
+        return self.output
+
+    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+        """
+        Args:
+            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+                are the coordinates of the box's top left corner. x1 and y1 are the
+                coordinates of the box's bottom right corner.
+            alpha (float): blending coefficient. Smaller values lead to a more transparent box.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x0, y0, x1, y1 = box_coord
+        width = x1 - x0
+        height = y1 - y0
+
+        linewidth = max(self._default_font_size / 4, 1)
+
+        self.output.ax.add_patch(
+            mpl.patches.Rectangle(
+                (x0, y0),
+                width,
+                height,
+                fill=False,
+                edgecolor=edge_color,
+                linewidth=linewidth * self.output.scale,
+                alpha=alpha,
+                linestyle=line_style,
+            )
+        )
+        return self.output
+
+    """
+    Internal methods:
+    """
+
+    def _jitter(self, color):
+        """
+        Randomly modifies given color to produce a slightly different color than the color given.
+
+        Args:
+            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
+                picked. The values in the list are in the [0.0, 1.0] range.
+
+        Returns:
+            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
+                color after being jittered. The values in the list are in the [0.0, 1.0] range.
+        """
+        color = mplc.to_rgb(color)
+        vec = np.random.rand(3)
+        # better to do it in another color space
+        vec = vec / np.linalg.norm(vec) * 0.5
+        res = np.clip(vec + color, 0, 1)
+        return tuple(res)
+
+    def _convert_boxes(self, boxes):
+        """
+        Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
+        """
+        return np.asarray(boxes)
+
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Depending on the brightness_factor, gives a lighter or darker color, i.e. a color with
+        more or less HLS lightness than the original color.
+
+        Args:
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+                0 will correspond to no change, a factor in [-1.0, 0) range will result in
+                a darker color and a factor in (0, 1.0] range will result in a lighter color.
+
+        Returns:
+            modified_color (tuple[double]): a tuple containing the RGB values of the
+                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
+        color = mplc.to_rgb(color)
+        polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+        modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
+        modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
+        modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
+        modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
+        return modified_color

From 8e762cd3613f146220994d607162e06eed9cea0a Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Mon, 28 Feb 2022 22:41:48 +0800
Subject: [PATCH 02/16] Refactor

---
 yolort/data/builtin_meta.py  | 154 ++++++++++
 yolort/utils/builtin_meta.py | 578 -----------------------------------
 yolort/utils/visualizer.py   | 446 ++++++++-------------------
 yolort/utils/yolo2coco.py    |   6 +-
 4 files changed, 286 insertions(+), 898 deletions(-)
 create mode 100644 yolort/data/builtin_meta.py
 delete mode 100644 yolort/utils/builtin_meta.py

diff --git a/yolort/data/builtin_meta.py b/yolort/data/builtin_meta.py
new file mode 100644
index 00000000..be2fc7ab
--- /dev/null
+++ b/yolort/data/builtin_meta.py
@@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+"""
+Note:
+For your custom dataset, there is no need to hard-code metadata anywhere in the code.
+For example, for COCO-format dataset, metadata will be obtained automatically
+when calling `load_coco_json`. For other datasets, metadata may also be obtained in other ways
+during loading.
+
+However, we hard-coded metadata for a few common datasets here.
+The only goal is to allow users who don't have these datasets to use pre-trained models.
+Users don't have to download a COCO json (which contains metadata), in order to visualize a
+COCO model (with correct class names and colors).
+"""
+
+
+# All coco categories, together with their nice-looking visualization colors
+# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
+COCO_CATEGORIES = [
+    {"id": 1, "color": [220, 20, 60], "isthing": 1, "name": "person"},
+    {"id": 2, "color": [119, 11, 32], "isthing": 1, "name": "bicycle"},
+    {"id": 3, "color": [0, 0, 142], "isthing": 1, "name": "car"},
+    {"id": 4, "color": [0, 0, 230], "isthing": 1, "name": "motorcycle"},
+    {"id": 5, "color": [106, 0, 228], "isthing": 1, "name": "airplane"},
+    {"id": 6, "color": [0, 60, 100], "isthing": 1, "name": "bus"},
+    {"id": 7, "color": [0, 80, 100], "isthing": 1, "name": "train"},
+    {"id": 8, "color": [0, 0, 70], "isthing": 1, "name": "truck"},
+    {"id": 9, "color": [0, 0, 192], "isthing": 1, "name": "boat"},
+    {"id": 10, "color": [250, 170, 30], "isthing": 1, "name": "traffic light"},
+    {"id": 11, "color": [100, 170, 30], "isthing": 1, "name": "fire hydrant"},
+    {"id": 13, "color": [220, 220, 0], "isthing": 1, "name": "stop sign"},
+    {"id": 14, "color": [175, 116, 175], "isthing": 1, "name": "parking meter"},
+    {"id": 15, "color": [250, 0, 30], "isthing": 1, "name": "bench"},
+    {"id": 16, "color": [165, 42, 42], "isthing": 1, "name": "bird"},
+    {"id": 17, "color": [255, 77, 255], "isthing": 1, "name": "cat"},
+    {"id": 18, "color": [0, 226, 252], "isthing": 1, "name": "dog"},
+    {"id": 19, "color": [182, 182, 255], "isthing": 1, "name": "horse"},
+    {"id": 20, "color": [0, 82, 0], "isthing": 1, "name": "sheep"},
+    {"id": 21, "color": [120, 166, 157], "isthing": 1, "name": "cow"},
+    {"id": 22, "color": [110, 76, 0], "isthing": 1, "name": "elephant"},
+    {"id": 23, "color": [174, 57, 255], "isthing": 1, "name": "bear"},
+    {"id": 24, "color": [199, 100, 0], "isthing": 1, "name": "zebra"},
+    {"id": 25, "color": [72, 0, 118], "isthing": 1, "name": "giraffe"},
+    {"id": 27, "color": [255, 179, 240], "isthing": 1, "name": "backpack"},
+    {"id": 28, "color": [0, 125, 92], "isthing": 1, "name": "umbrella"},
+    {"id": 31, "color": [209, 0, 151], "isthing": 1, "name": "handbag"},
+    {"id": 32, "color": [188, 208, 182], "isthing": 1, "name": "tie"},
+    {"id": 33, "color": [0, 220, 176], "isthing": 1, "name": "suitcase"},
+    {"id": 34, "color": [255, 99, 164], "isthing": 1, "name": "frisbee"},
+    {"id": 35, "color": [92, 0, 73], "isthing": 1, "name": "skis"},
+    {"id": 36, "color": [133, 129, 255], "isthing": 1, "name": "snowboard"},
+    {"id": 37, "color": [78, 180, 255], "isthing": 1, "name": "sports ball"},
+    {"id": 38, "color": [0, 228, 0], "isthing": 1, "name": "kite"},
+    {"id": 39, "color": [174, 255, 243], "isthing": 1, "name": "baseball bat"},
+    {"id": 40, "color": [45, 89, 255], "isthing": 1, "name": "baseball glove"},
+    {"id": 41, "color": [134, 134, 103], "isthing": 1, "name": "skateboard"},
+    {"id": 42, "color": [145, 148, 174], "isthing": 1, "name": "surfboard"},
+    {"id": 43, "color": [255, 208, 186], "isthing": 1, "name": "tennis racket"},
+    {"id": 44, "color": [197, 226, 255], "isthing": 1, "name": "bottle"},
+    {"id": 46, "color": [171, 134, 1], "isthing": 1, "name": "wine glass"},
+    {"id": 47, "color": [109, 63, 54], "isthing": 1, "name": "cup"},
+    {"id": 48, "color": [207, 138, 255], "isthing": 1, "name": "fork"},
+    {"id": 49, "color": [151, 0, 95], "isthing": 1, "name": "knife"},
+    {"id": 50, "color": [9, 80, 61], "isthing": 1, "name": "spoon"},
+    {"id": 51, "color": [84, 105, 51], "isthing": 1, "name": "bowl"},
+    {"id": 52, "color": [74, 65, 105], "isthing": 1, "name": "banana"},
+    {"id": 53, "color": [166, 196, 102], "isthing": 1, "name": "apple"},
+    {"id": 54, "color": [208, 195, 210], "isthing": 1, "name": "sandwich"},
+    {"id": 55, "color": [255, 109, 65], "isthing": 1, "name": "orange"},
+    {"id": 56, "color": [0, 143, 149], "isthing": 1, "name": "broccoli"},
+    {"id": 57, "color": [179, 0, 194], "isthing": 1, "name": "carrot"},
+    {"id": 58, "color": [209, 99, 106], "isthing": 1, "name": "hot dog"},
+    {"id": 59, "color": [5, 121, 0], "isthing": 1, "name": "pizza"},
+    {"id": 60, "color": [227, 255, 205], "isthing": 1, "name": "donut"},
+    {"id": 61, "color": [147, 186, 208], "isthing": 1, "name": "cake"},
+    {"id": 62, "color": [153, 69, 1], "isthing": 1, "name": "chair"},
+    {"id": 63, "color": [3, 95, 161], "isthing": 1, "name": "couch"},
+    {"id": 64, "color": [163, 255, 0], "isthing": 1, "name": "potted plant"},
+    {"id": 65, "color": [119, 0, 170], "isthing": 1, "name": "bed"},
+    {"id": 67, "color": [0, 182, 199], "isthing": 1, "name": "dining table"},
+    {"id": 70, "color": [0, 165, 120], "isthing": 1, "name": "toilet"},
+    {"id": 72, "color": [183, 130, 88], "isthing": 1, "name": "tv"},
+    {"id": 73, "color": [95, 32, 0], "isthing": 1, "name": "laptop"},
+    {"id": 74, "color": [130, 114, 135], "isthing": 1, "name": "mouse"},
+    {"id": 75, "color": [110, 129, 133], "isthing": 1, "name": "remote"},
+    {"id": 76, "color": [166, 74, 118], "isthing": 1, "name": "keyboard"},
+    {"id": 77, "color": [219, 142, 185], "isthing": 1, "name": "cell phone"},
+    {"id": 78, "color": [79, 210, 114], "isthing": 1, "name": "microwave"},
+    {"id": 79, "color": [178, 90, 62], "isthing": 1, "name": "oven"},
+    {"id": 80, "color": [65, 70, 15], "isthing": 1, "name": "toaster"},
+    {"id": 81, "color": [127, 167, 115], "isthing": 1, "name": "sink"},
+    {"id": 82, "color": [59, 105, 106], "isthing": 1, "name": "refrigerator"},
+    {"id": 84, "color": [142, 108, 45], "isthing": 1, "name": "book"},
+    {"id": 85, "color": [196, 172, 0], "isthing": 1, "name": "clock"},
+    {"id": 86, "color": [95, 54, 80], "isthing": 1, "name": "vase"},
+    {"id": 87, "color": [128, 76, 255], "isthing": 1, "name": "scissors"},
+    {"id": 88, "color": [201, 57, 1], "isthing": 1, "name": "teddy bear"},
+    {"id": 89, "color": [246, 0, 122], "isthing": 1, "name": "hair drier"},
+    {"id": 90, "color": [191, 162, 208], "isthing": 1, "name": "toothbrush"},
+    {"id": 92, "color": [255, 255, 128], "isthing": 0, "name": "banner"},
+    {"id": 93, "color": [147, 211, 203], "isthing": 0, "name": "blanket"},
+    {"id": 95, "color": [150, 100, 100], "isthing": 0, "name": "bridge"},
+    {"id": 100, "color": [168, 171, 172], "isthing": 0, "name": "cardboard"},
+    {"id": 107, "color": [146, 112, 198], "isthing": 0, "name": "counter"},
+    {"id": 109, "color": [210, 170, 100], "isthing": 0, "name": "curtain"},
+    {"id": 112, "color": [92, 136, 89], "isthing": 0, "name": "door-stuff"},
+    {"id": 118, "color": [218, 88, 184], "isthing": 0, "name": "floor-wood"},
+    {"id": 119, "color": [241, 129, 0], "isthing": 0, "name": "flower"},
+    {"id": 122, "color": [217, 17, 255], "isthing": 0, "name": "fruit"},
+    {"id": 125, "color": [124, 74, 181], "isthing": 0, "name": "gravel"},
+    {"id": 128, "color": [70, 70, 70], "isthing": 0, "name": "house"},
+    {"id": 130, "color": [255, 228, 255], "isthing": 0, "name": "light"},
+    {"id": 133, "color": [154, 208, 0], "isthing": 0, "name": "mirror-stuff"},
+    {"id": 138, "color": [193, 0, 92], "isthing": 0, "name": "net"},
+    {"id": 141, "color": [76, 91, 113], "isthing": 0, "name": "pillow"},
+    {"id": 144, "color": [255, 180, 195], "isthing": 0, "name": "platform"},
+    {"id": 145, "color": [106, 154, 176], "isthing": 0, "name": "playingfield"},
+    {"id": 147, "color": [230, 150, 140], "isthing": 0, "name": "railroad"},
+    {"id": 148, "color": [60, 143, 255], "isthing": 0, "name": "river"},
+    {"id": 149, "color": [128, 64, 128], "isthing": 0, "name": "road"},
+    {"id": 151, "color": [92, 82, 55], "isthing": 0, "name": "roof"},
+    {"id": 154, "color": [254, 212, 124], "isthing": 0, "name": "sand"},
+    {"id": 155, "color": [73, 77, 174], "isthing": 0, "name": "sea"},
+    {"id": 156, "color": [255, 160, 98], "isthing": 0, "name": "shelf"},
+    {"id": 159, "color": [255, 255, 255], "isthing": 0, "name": "snow"},
+    {"id": 161, "color": [104, 84, 109], "isthing": 0, "name": "stairs"},
+    {"id": 166, "color": [169, 164, 131], "isthing": 0, "name": "tent"},
+    {"id": 168, "color": [225, 199, 255], "isthing": 0, "name": "towel"},
+    {"id": 171, "color": [137, 54, 74], "isthing": 0, "name": "wall-brick"},
+    {"id": 175, "color": [135, 158, 223], "isthing": 0, "name": "wall-stone"},
+    {"id": 176, "color": [7, 246, 231], "isthing": 0, "name": "wall-tile"},
+    {"id": 177, "color": [107, 255, 200], "isthing": 0, "name": "wall-wood"},
+    {"id": 178, "color": [58, 41, 149], "isthing": 0, "name": "water-other"},
+    {"id": 180, "color": [183, 121, 142], "isthing": 0, "name": "window-blind"},
+    {"id": 181, "color": [255, 73, 97], "isthing": 0, "name": "window-other"},
+    {"id": 184, "color": [107, 142, 35], "isthing": 0, "name": "tree-merged"},
+    {"id": 185, "color": [190, 153, 153], "isthing": 0, "name": "fence-merged"},
+    {"id": 186, "color": [146, 139, 141], "isthing": 0, "name": "ceiling-merged"},
+    {"id": 187, "color": [70, 130, 180], "isthing": 0, "name": "sky-other-merged"},
+    {"id": 188, "color": [134, 199, 156], "isthing": 0, "name": "cabinet-merged"},
+    {"id": 189, "color": [209, 226, 140], "isthing": 0, "name": "table-merged"},
+    {"id": 190, "color": [96, 36, 108], "isthing": 0, "name": "floor-other-merged"},
+    {"id": 191, "color": [96, 96, 96], "isthing": 0, "name": "pavement-merged"},
+    {"id": 192, "color": [64, 170, 64], "isthing": 0, "name": "mountain-merged"},
+    {"id": 193, "color": [152, 251, 152], "isthing": 0, "name": "grass-merged"},
+    {"id": 194, "color": [208, 229, 228], "isthing": 0, "name": "dirt-merged"},
+    {"id": 195, "color": [206, 186, 171], "isthing": 0, "name": "paper-merged"},
+    {"id": 196, "color": [152, 161, 64], "isthing": 0, "name": "food-other-merged"},
+    {"id": 197, "color": [116, 112, 0], "isthing": 0, "name": "building-other-merged"},
+    {"id": 198, "color": [0, 114, 143], "isthing": 0, "name": "rock-merged"},
+    {"id": 199, "color": [102, 102, 156], "isthing": 0, "name": "wall-other-merged"},
+    {"id": 200, "color": [250, 141, 255], "isthing": 0, "name": "rug-merged"},
+]
diff --git a/yolort/utils/builtin_meta.py b/yolort/utils/builtin_meta.py
deleted file mode 100644
index a55d8d10..00000000
--- a/yolort/utils/builtin_meta.py
+++ /dev/null
@@ -1,578 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) Facebook, Inc. and its affiliates.
-
-"""
-Note:
-For your custom dataset, there is no need to hard-code metadata anywhere in the code.
-For example, for COCO-format dataset, metadata will be obtained automatically
-when calling `load_coco_json`. For other dataset, metadata may also be obtained in other ways
-during loading.
-However, we hard-coded metadata for a few common dataset here.
-The only goal is to allow users who don't have these dataset to use pre-trained models.
-Users don't have to download a COCO json (which contains metadata), in order to visualize a
-COCO model (with correct class names and colors).
-"""
-
-
-COCO_CATEGORIES = [
-    {
-        "id": 1,
-        "color": [220, 20, 60],
-        "isthing": 1,
-        "name": "person",
-        "supercategory": "person",
-    },
-    {
-        "id": 2,
-        "color": [119, 11, 32],
-        "isthing": 1,
-        "name": "bicycle",
-        "supercategory": "vehicle",
-    },
-    {
-        "id": 3,
-        "color": [0, 0, 142],
-        "isthing": 1,
-        "name": "car",
-        "supercategory": "vehicle",
-    },
-    {
-        "id": 4,
-        "color": [0, 0, 230],
-        "isthing": 1,
-        "name": "motorcycle",
-        "supercategory": "vehicle",
-    },
-    {
-        "id": 5,
-        "color": [106, 0, 228],
-        "isthing": 1,
-        "name": "airplane",
-        "supercategory": "vehicle",
-    },
-    {
-        "id": 6,
-        "color": [0, 60, 100],
-        "isthing": 1,
-        "name": "bus",
-        "supercategory": "vehicle",
-    },
-    {
-        "id": 7,
-        "color": [0, 80, 100],
-        "isthing": 1,
-        "name": "train",
-        "supercategory": "vehicle",
-    },
-    {
-        "id": 8,
-        "color": [0, 0, 70],
-        "isthing": 1,
-        "name": "truck",
-        "supercategory": "vehicle",
-    },
-    {
-        "id": 9,
-        "color": [0, 0, 192],
-        "isthing": 1,
-        "name": "boat",
-        "supercategory": "vehicle",
-    },
-    {
-        "id": 10,
-        "color": [250, 170, 30],
-        "isthing": 1,
-        "name": "traffic light",
-        "supercategory": "outdoor",
-    },
-    {
-        "id": 11,
-        "color": [100, 170, 30],
-        "isthing": 1,
-        "name": "fire hydrant",
-        "supercategory": "outdoor",
-    },
-    {
-        "id": 13,
-        "color": [220, 220, 0],
-        "isthing": 1,
-        "name": "stop sign",
-        "supercategory": "outdoor",
-    },
-    {
-        "id": 14,
-        "color": [175, 116, 175],
-        "isthing": 1,
-        "name": "parking meter",
-        "supercategory": "outdoor",
-    },
-    {
-        "id": 15,
-        "color": [250, 0, 30],
-        "isthing": 1,
-        "name": "bench",
-        "supercategory": "outdoor",
-    },
-    {
-        "id": 16,
-        "color": [165, 42, 42],
-        "isthing": 1,
-        "name": "bird",
-        "supercategory": "animal",
-    },
-    {
-        "id": 17,
-        "color": [255, 77, 255],
-        "isthing": 1,
-        "name": "cat",
-        "supercategory": "animal",
-    },
-    {
-        "id": 18,
-        "color": [0, 226, 252],
-        "isthing": 1,
-        "name": "dog",
-        "supercategory": "animal",
-    },
-    {
-        "id": 19,
-        "color": [182, 182, 255],
-        "isthing": 1,
-        "name": "horse",
-        "supercategory": "animal",
-    },
-    {
-        "id": 20,
-        "color": [0, 82, 0],
-        "isthing": 1,
-        "name": "sheep",
-        "supercategory": "animal",
-    },
-    {
-        "id": 21,
-        "color": [120, 166, 157],
-        "isthing": 1,
-        "name": "cow",
-        "supercategory": "animal",
-    },
-    {
-        "id": 22,
-        "color": [110, 76, 0],
-        "isthing": 1,
-        "name": "elephant",
-        "supercategory": "animal",
-    },
-    {
-        "id": 23,
-        "color": [174, 57, 255],
-        "isthing": 1,
-        "name": "bear",
-        "supercategory": "animal",
-    },
-    {
-        "id": 24,
-        "color": [199, 100, 0],
-        "isthing": 1,
-        "name": "zebra",
-        "supercategory": "animal",
-    },
-    {
-        "id": 25,
-        "color": [72, 0, 118],
-        "isthing": 1,
-        "name": "giraffe",
-        "supercategory": "animal",
-    },
-    {
-        "id": 27,
-        "color": [255, 179, 240],
-        "isthing": 1,
-        "name": "backpack",
-        "supercategory": "accessory",
-    },
-    {
-        "id": 28,
-        "color": [0, 125, 92],
-        "isthing": 1,
-        "name": "umbrella",
-        "supercategory": "accessory",
-    },
-    {
-        "id": 31,
-        "color": [209, 0, 151],
-        "isthing": 1,
-        "name": "handbag",
-        "supercategory": "accessory",
-    },
-    {
-        "id": 32,
-        "color": [188, 208, 182],
-        "isthing": 1,
-        "name": "tie",
-        "supercategory": "accessory",
-    },
-    {
-        "id": 33,
-        "color": [0, 220, 176],
-        "isthing": 1,
-        "name": "suitcase",
-        "supercategory": "accessory",
-    },
-    {
-        "id": 34,
-        "color": [255, 99, 164],
-        "isthing": 1,
-        "name": "frisbee",
-        "supercategory": "sports",
-    },
-    {
-        "id": 35,
-        "color": [92, 0, 73],
-        "isthing": 1,
-        "name": "skis",
-        "supercategory": "sports",
-    },
-    {
-        "id": 36,
-        "color": [133, 129, 255],
-        "isthing": 1,
-        "name": "snowboard",
-        "supercategory": "sports",
-    },
-    {
-        "id": 37,
-        "color": [78, 180, 255],
-        "isthing": 1,
-        "name": "sports ball",
-        "supercategory": "sports",
-    },
-    {
-        "id": 38,
-        "color": [0, 228, 0],
-        "isthing": 1,
-        "name": "kite",
-        "supercategory": "sports",
-    },
-    {
-        "id": 39,
-        "color": [174, 255, 243],
-        "isthing": 1,
-        "name": "baseball bat",
-        "supercategory": "sports",
-    },
-    {
-        "id": 40,
-        "color": [45, 89, 255],
-        "isthing": 1,
-        "name": "baseball glove",
-        "supercategory": "sports",
-    },
-    {
-        "id": 41,
-        "color": [134, 134, 103],
-        "isthing": 1,
-        "name": "skateboard",
-        "supercategory": "sports",
-    },
-    {
-        "id": 42,
-        "color": [145, 148, 174],
-        "isthing": 1,
-        "name": "surfboard",
-        "supercategory": "sports",
-    },
-    {
-        "id": 43,
-        "color": [255, 208, 186],
-        "isthing": 1,
-        "name": "tennis racket",
-        "supercategory": "sports",
-    },
-    {
-        "id": 44,
-        "color": [197, 226, 255],
-        "isthing": 1,
-        "name": "bottle",
-        "supercategory": "kitchen",
-    },
-    {
-        "id": 46,
-        "color": [171, 134, 1],
-        "isthing": 1,
-        "name": "wine glass",
-        "supercategory": "kitchen",
-    },
-    {
-        "id": 47,
-        "color": [109, 63, 54],
-        "isthing": 1,
-        "name": "cup",
-        "supercategory": "kitchen",
-    },
-    {
-        "id": 48,
-        "color": [207, 138, 255],
-        "isthing": 1,
-        "name": "fork",
-        "supercategory": "kitchen",
-    },
-    {
-        "id": 49,
-        "color": [151, 0, 95],
-        "isthing": 1,
-        "name": "knife",
-        "supercategory": "kitchen",
-    },
-    {
-        "id": 50,
-        "color": [9, 80, 61],
-        "isthing": 1,
-        "name": "spoon",
-        "supercategory": "kitchen",
-    },
-    {
-        "id": 51,
-        "color": [84, 105, 51],
-        "isthing": 1,
-        "name": "bowl",
-        "supercategory": "kitchen",
-    },
-    {
-        "id": 52,
-        "color": [74, 65, 105],
-        "isthing": 1,
-        "name": "banana",
-        "supercategory": "food",
-    },
-    {
-        "id": 53,
-        "color": [166, 196, 102],
-        "isthing": 1,
-        "name": "apple",
-        "supercategory": "food",
-    },
-    {
-        "id": 54,
-        "color": [208, 195, 210],
-        "isthing": 1,
-        "name": "sandwich",
-        "supercategory": "food",
-    },
-    {
-        "id": 55,
-        "color": [255, 109, 65],
-        "isthing": 1,
-        "name": "orange",
-        "supercategory": "food",
-    },
-    {
-        "id": 56,
-        "color": [0, 143, 149],
-        "isthing": 1,
-        "name": "broccoli",
-        "supercategory": "food",
-    },
-    {
-        "id": 57,
-        "color": [179, 0, 194],
-        "isthing": 1,
-        "name": "carrot",
-        "supercategory": "food",
-    },
-    {
-        "id": 58,
-        "color": [209, 99, 106],
-        "isthing": 1,
-        "name": "hot dog",
-        "supercategory": "food",
-    },
-    {
-        "id": 59,
-        "color": [5, 121, 0],
-        "isthing": 1,
-        "name": "pizza",
-        "supercategory": "food",
-    },
-    {
-        "id": 60,
-        "color": [227, 255, 205],
-        "isthing": 1,
-        "name": "donut",
-        "supercategory": "food",
-    },
-    {
-        "id": 61,
-        "color": [147, 186, 208],
-        "isthing": 1,
-        "name": "cake",
-        "supercategory": "food",
-    },
-    {
-        "id": 62,
-        "color": [153, 69, 1],
-        "isthing": 1,
-        "name": "chair",
-        "supercategory": "furniture",
-    },
-    {
-        "id": 63,
-        "color": [3, 95, 161],
-        "isthing": 1,
-        "name": "couch",
-        "supercategory": "furniture",
-    },
-    {
-        "id": 64,
-        "color": [163, 255, 0],
-        "isthing": 1,
-        "name": "potted plant",
-        "supercategory": "furniture",
-    },
-    {
-        "id": 65,
-        "color": [119, 0, 170],
-        "isthing": 1,
-        "name": "bed",
-        "supercategory": "furniture",
-    },
-    {
-        "id": 67,
-        "color": [0, 182, 199],
-        "isthing": 1,
-        "name": "dining table",
-        "supercategory": "furniture",
-    },
-    {
-        "id": 70,
-        "color": [0, 165, 120],
-        "isthing": 1,
-        "name": "toilet",
-        "supercategory": "furniture",
-    },
-    {
-        "id": 72,
-        "color": [183, 130, 88],
-        "isthing": 1,
-        "name": "tv",
-        "supercategory": "electronic",
-    },
-    {
-        "id": 73,
-        "color": [95, 32, 0],
-        "isthing": 1,
-        "name": "laptop",
-        "supercategory": "electronic",
-    },
-    {
-        "id": 74,
-        "color": [130, 114, 135],
-        "isthing": 1,
-        "name": "mouse",
-        "supercategory": "electronic",
-    },
-    {
-        "id": 75,
-        "color": [110, 129, 133],
-        "isthing": 1,
-        "name": "remote",
-        "supercategory": "electronic",
-    },
-    {
-        "id": 76,
-        "color": [166, 74, 118],
-        "isthing": 1,
-        "name": "keyboard",
-        "supercategory": "electronic",
-    },
-    {
-        "id": 77,
-        "color": [219, 142, 185],
-        "isthing": 1,
-        "name": "cell phone",
-        "supercategory": "electronic",
-    },
-    {
-        "id": 78,
-        "color": [79, 210, 114],
-        "isthing": 1,
-        "name": "microwave",
-        "supercategory": "appliance",
-    },
-    {
-        "id": 79,
-        "color": [178, 90, 62],
-        "isthing": 1,
-        "name": "oven",
-        "supercategory": "appliance",
-    },
-    {
-        "id": 80,
-        "color": [65, 70, 15],
-        "isthing": 1,
-        "name": "toaster",
-        "supercategory": "appliance",
-    },
-    {
-        "id": 81,
-        "color": [127, 167, 115],
-        "isthing": 1,
-        "name": "sink",
-        "supercategory": "appliance",
-    },
-    {
-        "id": 82,
-        "color": [59, 105, 106],
-        "isthing": 1,
-        "name": "refrigerator",
-        "supercategory": "appliance",
-    },
-    {
-        "id": 84,
-        "color": [142, 108, 45],
-        "isthing": 1,
-        "name": "book",
-        "supercategory": "indoor",
-    },
-    {
-        "id": 85,
-        "color": [196, 172, 0],
-        "isthing": 1,
-        "name": "clock",
-        "supercategory": "indoor",
-    },
-    {
-        "id": 86,
-        "color": [95, 54, 80],
-        "isthing": 1,
-        "name": "vase",
-        "supercategory": "indoor",
-    },
-    {
-        "id": 87,
-        "color": [128, 76, 255],
-        "isthing": 1,
-        "name": "scissors",
-        "supercategory": "indoor",
-    },
-    {
-        "id": 88,
-        "color": [201, 57, 1],
-        "isthing": 1,
-        "name": "teddy bear",
-        "supercategory": "indoor",
-    },
-    {
-        "id": 89,
-        "color": [246, 0, 122],
-        "isthing": 1,
-        "name": "hair drier",
-        "supercategory": "indoor",
-    },
-    {
-        "id": 90,
-        "color": [191, 162, 208],
-        "isthing": 1,
-        "name": "toothbrush",
-        "supercategory": "indoor",
-    },
-]
diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index 040e94ac..ad821f18 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -1,46 +1,34 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-import colorsys
-import logging
-import matplotlib as mpl
+from enum import Enum, unique
+from typing import List, Optional, Tuple, Union
+
 import matplotlib.figure as mplfigure
-from matplotlib.backends.backend_agg import FigureCanvasAgg
-import matplotlib.colors as mplc
 import numpy as np
-
-from torchvision.ops.boxes import box_convert
-
-logger = logging.getLogger(__name__)
-
-__all__ = ["VisImage", "Visualizer"]
-
-_SMALL_OBJECT_AREA_THRESH = 1000
+import torch
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from PIL import Image, ImageColor, ImageDraw, ImageFont
 
 
-def _create_text_labels(classes, scores, class_names, is_crowd=None):
+@unique
+class ColorMode(Enum):
+    """
+    Enum of different color modes to use for instance visualizations.
     """
-    Args:
-        classes (list[int] or None):
-        scores (list[float] or None):
-        class_names (list[str] or None):
-        is_crowd (list[bool] or None):
 
-    Returns:
-        list[str] or None
+    IMAGE = 0
+    """
+    Picks a random color for every instance and overlays segmentations with low opacity.
+    """
+    SEGMENTATION = 1
+    """
+    Let instances of the same category have similar colors
+    (from metadata.thing_colors), and overlay them with
+    high opacity. This provides more attention on the quality of segmentation.
+    """
+    IMAGE_BW = 2
+    """
+    Same as IMAGE, but convert all areas without masks to gray-scale.
+    Only available for drawing per-instance mask predictions.
     """
-    labels = None
-    if classes is not None:
-        if class_names is not None and len(class_names) > 0:
-            labels = [class_names[i] for i in classes]
-        else:
-            labels = [str(i) for i in classes]
-    if scores is not None:
-        if labels is None:
-            labels = ["{:.0f}%".format(s * 100) for s in scores]
-        else:
-            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
-    if labels is not None and is_crowd is not None:
-        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
-    return labels
 
 
 class VisImage:
@@ -113,13 +101,18 @@ def get_image(self):
         buffer = np.frombuffer(s, dtype="uint8")
 
         img_rgba = buffer.reshape(height, width, 4)
-        rgb, _ = np.split(img_rgba, [3], axis=2)
+        rgb, alpha = np.split(img_rgba, [3], axis=2)
         return rgb.astype("uint8")
 
 
+def _generate_color_palette(num_objects: int):
+    palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
+    return [tuple((i * palette) % 255) for i in range(num_objects)]
+
+
 class Visualizer:
     """
-    Visualizer that draws data about detection on images.
+    Visualizer that draws data about detection/segmentation on images.
 
     It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
     that draw primitive objects to images, as well as high-level wrappers like
@@ -128,8 +121,8 @@ class Visualizer:
 
     Note that the exact visualization style for the high-level wrappers are subject to change.
     Style such as color, opacity, label contents, visibility of labels, or even the visibility
-    of objects themselves (e.g. when the object is too small) may change according
-    to different heuristics, as long as the results still look visually reasonable.
+    of objects themselves (e.g. when the object is too small) may change according to different
+    heuristics, as long as the results still look visually reasonable.
 
     To obtain a consistent style, you can implement custom drawing functions with the
     abovementioned primitive methods instead. If you need more customized visualization
@@ -139,306 +132,125 @@ class Visualizer:
 
     This visualizer focuses on high rendering quality rather than performance. It is not
     designed to be used for real-time applications.
+
+    Args:
+        img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
+            the height and width of the image respectively. C is the number of
+            color channels. The image is required to be in RGB format since that
+            is a requirement of the Matplotlib library. The image is also expected
+            to be in the range [0, 255].
+        metadata (Metadata): dataset metadata (e.g. class names and colors)
+        instance_mode (ColorMode): defines one of the pre-defined styles for drawing
+            instances on an image.
     """
 
-    def __init__(self, img_rgb, metadata=None, scale=1.0):
-        """
-        Args:
-            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
-                the height and width of the image respectively. C is the number of
-                color channels. The image is required to be in RGB format since that
-                is a requirement of the Matplotlib library. The image is also expected
-                to be in the range [0, 255].
-            metadata (Metadata): dataset metadata (e.g. class names and colors)
-        """
+    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
+
         self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
-        if metadata is None:
-            metadata = MetadataCatalog.get("__nonexist__")
-        self.metadata = metadata
         self.output = VisImage(self.img, scale=scale)
+        self.cpu_device = torch.device("cpu")
 
         # too small texts are useless, therefore clamp to 9
-        self._default_font_size = max(
-            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
-        )
-
-    def draw_instance_predictions(self, predictions):
+        self._default_font_size = max(np.sqrt(self.output.height * self.output.width) // 90, 10 // scale)
+        self._instance_mode = instance_mode
+
+    @torch.no_grad()
+    def draw_bounding_boxes(
+        image: torch.Tensor,
+        boxes: torch.Tensor,
+        labels: Optional[List[str]] = None,
+        colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None,
+        fill: Optional[bool] = False,
+        width: int = 1,
+        font: Optional[str] = None,
+        font_size: int = 10,
+    ) -> torch.Tensor:
         """
-        Draw instance-level prediction results on an image.
+        Draws bounding boxes on given image.
+        The values of the input image should be uint8 between 0 and 255.
+        If fill is True, the resulting Tensor should be saved as a PNG image.
+
+        Adapted from https://github.com/pytorch/vision/blob/1fc53b2/torchvision/utils.py#L159-L195
 
         Args:
-            predictions (Instances): the output of an instance detection/segmentation
-                model. Following fields will be used to draw:
-                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+            image (Tensor): Tensor of shape (C x H x W) and dtype uint8.
+            boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax)
+                format. Note that the boxes are absolute coordinates with respect to the image. In other
+                words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`.
+            labels (List[str]): List containing the labels of bounding boxes.
+            colors (color or list of colors, optional): List containing the colors
+                of the boxes or single color for all boxes. The color can be represented as
+                PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
+                By default, random colors are generated for boxes.
+            fill (bool): If `True` fills the bounding box with specified color.
+            width (int): Width of bounding box.
+            font (str): A filename containing a TrueType font. If the file is not found in this filename,
+                the loader may also search in other directories, such as the `fonts/` directory on Windows
+                or `/Library/Fonts/`, `/System/Library/Fonts/` and `~/Library/Fonts/` on macOS.
+            font_size (int): The requested font size in points.
 
         Returns:
-            output (VisImage): image object with visualizations.
+            img (Tensor[C, H, W]): Image Tensor of dtype uint8 with bounding boxes plotted.
         """
-        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
-        scores = predictions.scores if predictions.has("scores") else None
-        classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
-        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
 
-        colors = None
+        if not isinstance(image, torch.Tensor):
+            raise TypeError(f"Tensor expected, got {type(image)}")
+        elif image.dtype != torch.uint8:
+            raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
+        elif image.dim() != 3:
+            raise ValueError("Pass individual images, not batches")
+        elif image.size(0) not in {1, 3}:
+            raise ValueError("Only grayscale and RGB images are supported")
 
-        self.overlay_instances(boxes=boxes, labels=labels, assigned_colors=colors)
-        return self.output
+        num_boxes = boxes.shape[0]
 
-    def draw_dataset_dict(self, dic):
-        """
-        Draw annotations/segmentaions in Detectron2 Dataset format.
-
-        Args:
-            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
-
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        annos = dic.get("annotations", None)
-        if annos:
-            boxes = [
-                box_convert(x["bbox"], x["bbox_mode"])
-                if len(x["bbox"]) == 4
-                else x["bbox"]
-                for x in annos
-            ]
-
-            colors = None
-            category_ids = [x["category_id"] for x in annos]
-            if self.metadata.get("thing_colors"):
-                colors = [
-                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
-                    for c in category_ids
-                ]
-            names = self.metadata.get("thing_classes", None)
-            labels = _create_text_labels(
-                category_ids,
-                scores=None,
-                class_names=names,
-                is_crowd=[x.get("iscrowd", 0) for x in annos],
+        if labels is None:
+            labels: Union[List[str], List[None]] = [None] * num_boxes  # type: ignore[no-redef]
+        if len(labels) != num_boxes:
+            raise ValueError(
+                f"Number of boxes ({num_boxes}) and labels ({len(labels)}) mismatch. "
+                "Please specify labels for each box."
             )
-            self.overlay_instances(labels=labels, boxes=boxes, assigned_colors=colors)
 
-        return self.output
-
-    def overlay_instances(
-        self,
-        *,
-        boxes=None,
-        labels=None,
-        assigned_colors=None,
-    ):
-        """
-        Args:
-            boxes (Boxes or ndarray): either a :class:`Boxes`,
-                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
-                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
-                for the N objects in a single image,
-            labels (list[str]): the text to be displayed for each instance.
-            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
-                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
-                for full list of formats that the colors are accepted in.
-
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        num_instances = 0
-        if boxes is not None:
-            boxes = self._convert_boxes(boxes)
-            num_instances = len(boxes)
-        if labels is not None:
-            assert len(labels) == num_instances
-        if assigned_colors is None:
-            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
-        if num_instances == 0:
-            return self.output
-
-        # Display in largest to smallest order to reduce occlusion.
-        areas = None
-        if boxes is not None:
-            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
-
-        if areas is not None:
-            sorted_idxs = np.argsort(-areas).tolist()
-            # Re-order overlapped instances in descending order.
-            boxes = boxes[sorted_idxs] if boxes is not None else None
-            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
-            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
-
-        for i in range(num_instances):
-            color = assigned_colors[i]
-            if boxes is not None:
-                self.draw_box(boxes[i], edge_color=color)
-
-            if labels is not None:
-                # first get a box
-                x0, y0, x1, y1 = boxes[i]
-                text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
-                horiz_align = "left"
-
-                # for small objects, draw text at the side to avoid occlusion
-                instance_area = (y1 - y0) * (x1 - x0)
-                if (
-                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
-                    or y1 - y0 < 40 * self.output.scale
-                ):
-                    if y1 >= self.output.height - 5:
-                        text_pos = (x1, y0)
-                    else:
-                        text_pos = (x0, y1)
-
-                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
-                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
-                font_size = (
-                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
-                    * 0.5
-                    * self._default_font_size
-                )
-                self.draw_text(
-                    labels[i],
-                    text_pos,
-                    color=lighter_color,
-                    horizontal_alignment=horiz_align,
-                    font_size=font_size,
+        if colors is None:
+            colors = _generate_color_palette(num_boxes)
+        elif isinstance(colors, list):
+            if len(colors) < num_boxes:
+                raise ValueError(
+                    f"Number of colors ({len(colors)}) is less than number of boxes ({num_boxes}).",
                 )
+        else:  # colors specifies a single color for all boxes
+            colors = [colors] * num_boxes
 
-        return self.output
-
-    """
-    Primitive drawing functions:
-    """
-
-    def draw_text(
-        self,
-        text,
-        position,
-        *,
-        font_size=None,
-        color="g",
-        horizontal_alignment="center",
-        rotation=0
-    ):
-        """
-        Args:
-            text (str): class label
-            position (tuple): a tuple of the x and y coordinates to place text on image.
-            font_size (int, optional): font of the text. If not provided, a font size
-                proportional to the image width is calculated and used.
-            color: color of the text. Refer to `matplotlib.colors` for full list
-                of formats that are accepted.
-            horizontal_alignment (str): see `matplotlib.text.Text`
-            rotation: rotation angle in degrees CCW
-
-        Returns:
-            output (VisImage): image object with text drawn.
-        """
-        if not font_size:
-            font_size = self._default_font_size
-
-        # since the text background is dark, we don't want the text to be dark
-        color = np.maximum(list(mplc.to_rgb(color)), 0.2)
-        color[np.argmax(color)] = max(0.8, np.max(color))
-
-        x, y = position
-        self.output.ax.text(
-            x,
-            y,
-            text,
-            size=font_size * self.output.scale,
-            family="sans-serif",
-            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
-            verticalalignment="top",
-            horizontalalignment=horizontal_alignment,
-            color=color,
-            zorder=10,
-            rotation=rotation,
-        )
-        return self.output
-
-    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
-        """
-        Args:
-            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
-                are the coordinates of the image's top left corner. x1 and y1 are the
-                coordinates of the image's bottom right corner.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
-                for full list of formats that are accepted.
-            line_style (string): the string to use to create the outline of the boxes.
-
-        Returns:
-            output (VisImage): image object with box drawn.
-        """
-        x0, y0, x1, y1 = box_coord
-        width = x1 - x0
-        height = y1 - y0
-
-        linewidth = max(self._default_font_size / 4, 1)
-
-        self.output.ax.add_patch(
-            mpl.patches.Rectangle(
-                (x0, y0),
-                width,
-                height,
-                fill=False,
-                edgecolor=edge_color,
-                linewidth=linewidth * self.output.scale,
-                alpha=alpha,
-                linestyle=line_style,
-            )
-        )
-        return self.output
+        colors = [(ImageColor.getrgb(color) if isinstance(color, str) else color) for color in colors]
 
-    """
-    Internal methods:
-    """
+        # Handle Grayscale images
+        if image.size(0) == 1:
+            image = torch.tile(image, (3, 1, 1))
 
-    def _jitter(self, color):
-        """
-        Randomly modifies given color to produce a slightly different color than the color given.
+        ndarr = image.permute(1, 2, 0).cpu().numpy()
+        img_to_draw = Image.fromarray(ndarr)
+        img_boxes = boxes.to(torch.int64).tolist()
 
-        Args:
-            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
-                picked. The values in the list are in the [0.0, 1.0] range.
+        if fill:
+            draw = ImageDraw.Draw(img_to_draw, "RGBA")
+        else:
+            draw = ImageDraw.Draw(img_to_draw)
 
-        Returns:
-            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
-                color after being jittered. The values in the list are in the [0.0, 1.0] range.
-        """
-        color = mplc.to_rgb(color)
-        vec = np.random.rand(3)
-        # better to do it in another color space
-        vec = vec / np.linalg.norm(vec) * 0.5
-        res = np.clip(vec + color, 0, 1)
-        return tuple(res)
-
-    def _convert_boxes(self, boxes):
-        """
-        Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
-        """
-        return np.asarray(boxes)
+        try:
+            txt_font = ImageFont.truetype(font=font, size=font_size)
+        except IOError:
+            txt_font = ImageFont.load_default()
 
-    def _change_color_brightness(self, color, brightness_factor):
-        """
-        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
-        less or more saturation than the original color.
+        for bbox, color, label in zip(img_boxes, colors, labels):  # type: ignore[arg-type]
+            if fill:
+                fill_color = color + (100,)
+                draw.rectangle(bbox, width=width, outline=color, fill=fill_color)
+            else:
+                draw.rectangle(bbox, width=width, outline=color)
 
-        Args:
-            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
-                0 will correspond to no change, a factor in [-1.0, 0) range will result in
-                a darker color and a factor in (0, 1.0] range will result in a lighter color.
+            if label is not None:
+                margin = width + 1
+                draw.text((bbox[0] + margin, bbox[1] + margin), label, fill=color, font=txt_font)
 
-        Returns:
-            modified_color (tuple[double]): a tuple containing the RGB values of the
-                modified color. Each value in the tuple is in the [0.0, 1.0] range.
-        """
-        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
-        color = mplc.to_rgb(color)
-        polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
-        modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
-        modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
-        modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
-        modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
-        return modified_color
+        return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8)
diff --git a/yolort/utils/yolo2coco.py b/yolort/utils/yolo2coco.py
index c9896e9c..df7ee045 100644
--- a/yolort/utils/yolo2coco.py
+++ b/yolort/utils/yolo2coco.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2020, Zhiqiang Wang. All Rights Reserved.
+# Copyright (c) 2020, yolort team. All Rights Reserved.
+
 import argparse
 import json
 from pathlib import Path
 
 from PIL import Image
-
-from .builtin_meta import COCO_CATEGORIES
+from yolort.data.builtin_meta import COCO_CATEGORIES
 
 
 class YOLO2COCO:

From dfadb53875ec46d43b77df72a25e731d85ad4379 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Tue, 1 Mar 2022 01:30:47 +0800
Subject: [PATCH 03/16] Move display into cv2_imshow()

---
 yolort/utils/image_utils.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/yolort/utils/image_utils.py b/yolort/utils/image_utils.py
index 90d7b99f..2f0d5308 100644
--- a/yolort/utils/image_utils.py
+++ b/yolort/utils/image_utils.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2020, yolort team. All rights reserved.
+
 import logging
 from io import BytesIO
 from pathlib import Path
@@ -7,7 +9,6 @@
 import numpy as np
 import requests
 import torch
-from IPython.display import display
 from PIL import Image
 from torch import Tensor
 from torchvision.ops.boxes import box_convert
@@ -48,7 +49,7 @@ def plot_one_box(box, img, color=None, label=None, line_thickness=None):
 
 
 def cv2_imshow(
-    image: np.ndarray,
+    img_bgr: np.ndarray,
     imshow_scale: Optional[float] = None,
     convert_bgr_to_rgb: bool = True,
 ) -> None:
@@ -56,23 +57,26 @@ def cv2_imshow(
     A replacement of cv2.imshow() for using in Jupyter notebooks.
 
     Args:
-        image (np.ndarray):. shape (N, M) or (N, M, 1) is an NxM grayscale image. shape (N, M, 3)
+        img_bgr (np.ndarray): shape (N, M) or (N, M, 1) is an NxM grayscale image. shape (N, M, 3)
             is an NxM BGR color image. shape (N, M, 4) is an NxM BGRA color image.
         imshow_scale (Optional[float]): zoom ratio to show the image
         convert_bgr_to_rgb (bool): switch to convert BGR to RGB channel.
     """
-    image = image.clip(0, 255).astype("uint8")
+
+    from IPython.display import display
+
+    img_bgr = img_bgr.clip(0, 255).astype("uint8")
     # cv2 stores colors as BGR; convert to RGB
-    if convert_bgr_to_rgb and image.ndim == 3:
-        if image.shape[2] == 4:
-            image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)
+    if convert_bgr_to_rgb and img_bgr.ndim == 3:
+        if img_bgr.shape[2] == 4:
+            img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_BGRA2RGBA)
         else:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
 
     if imshow_scale is not None:
-        image = cv2.resize(image, None, fx=imshow_scale, fy=imshow_scale)
+        img_bgr = cv2.resize(img_bgr, None, fx=imshow_scale, fy=imshow_scale)
 
-    display(Image.fromarray(image))
+    display(Image.fromarray(img_bgr))
 
 
 def color_list():

From f35d002632f1ca746001f13efa10b7d8d022f49f Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Tue, 1 Mar 2022 17:18:00 +0800
Subject: [PATCH 04/16] Use OpenCV to draw bounding boxes

---
 yolort/utils/visualizer.py | 142 ++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 87 deletions(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index ad821f18..bda8ec37 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -1,11 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) 2022, yolort team. All rights reserved.
+
 from enum import Enum, unique
 from typing import List, Optional, Tuple, Union
 
+import cv2
 import matplotlib.figure as mplfigure
 import numpy as np
 import torch
 from matplotlib.backends.backend_agg import FigureCanvasAgg
-from PIL import Image, ImageColor, ImageDraw, ImageFont
+from torch import Tensor
 
 
 @unique
@@ -32,7 +36,7 @@ class ColorMode(Enum):
 
 
 class VisImage:
-    def __init__(self, img, scale=1.0):
+    def __init__(self, img: np.ndarray, scale: float = 1.0):
         """
         Args:
             img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
@@ -105,11 +109,6 @@ def get_image(self):
         return rgb.astype("uint8")
 
 
-def _generate_color_palette(num_objects: int):
-    palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
-    return [tuple((i * palette) % 255) for i in range(num_objects)]
-
-
 class Visualizer:
     """
     Visualizer that draws data about detection/segmentation on images.
@@ -134,46 +133,57 @@ class Visualizer:
     designed to be used for real-time applications.
 
     Args:
-        img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
-            the height and width of the image respectively. C is the number of
-            color channels. The image is required to be in RGB format since that
-            is a requirement of the Matplotlib library. The image is also expected
-            to be in the range [0, 255].
-        metadata (Metadata): dataset metadata (e.g. class names and colors)
+        image (Union[torch.Tensor, numpy.ndarray]): Tensor of shape (C x H x W) or ndarray of
+            shape (H x W x C) with dtype uint8.
         instance_mode (ColorMode): defines one of the pre-defined style for drawing
             instances on an image.
     """
 
-    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
+    def __init__(
+        self,
+        image: Union[Tensor, np.ndarray],
+        scale: float = 1.0,
+        line_width: Optional[int] = None,
+    ):
+
+        if isinstance(image, torch.Tensor):
+            if image.dtype != torch.uint8:
+                raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
+            if image.dim() != 3:
+                raise ValueError("Pass individual images, not batches")
+            if image.size(0) not in {1, 3}:
+                raise ValueError("Only grayscale and RGB images are supported")
+            # Handle Grayscale images
+            if image.size(0) == 1:
+                image = torch.tile(image, (3, 1, 1))
+            self.img = image.permute(1, 2, 0).cpu().numpy()
+        elif isinstance(image, np.ndarray):
+            if image.dtype != np.uint8:
+                raise ValueError(f"Numpy uint8 expected, got {image.dtype}")
+            if image.ndim != 3:
+                raise ValueError("Currently only RGB images are supported")
+            self.img = image
+        else:
+            raise TypeError(f"Tensor or numpy.ndarray expected, got {type(image)}")
 
-        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
         self.output = VisImage(self.img, scale=scale)
         self.cpu_device = torch.device("cpu")
-
-        # too small texts are useless, therefore clamp to 9
-        self._default_font_size = max(np.sqrt(self.output.height * self.output.width) // 90, 10 // scale)
-        self._instance_mode = instance_mode
+        self.line_width = line_width or max(round(sum(self.img.shape) / 2 * 0.003), 2)
 
     @torch.no_grad()
     def draw_bounding_boxes(
-        image: torch.Tensor,
+        self,
         boxes: torch.Tensor,
         labels: Optional[List[str]] = None,
         colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None,
-        fill: Optional[bool] = False,
-        width: int = 1,
-        font: Optional[str] = None,
-        font_size: int = 10,
+        txt_colors: Tuple[int, int, int] = (255, 255, 255),
     ) -> torch.Tensor:
         """
         Draws bounding boxes on given image.
         The values of the input image should be uint8 between 0 and 255.
         If fill is True, Resulting Tensor should be saved as PNG image.
 
-        Adapted from https://github.com/pytorch/vision/blob/1fc53b2/torchvision/utils.py#L159-L195
-
         Args:
-            image (Tensor): Tensor of shape (C x H x W) and dtype uint8.
             boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax)
                 format. Note that the boxes are absolute coordinates with respect to the image. In other
                 words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`.
@@ -193,64 +203,22 @@ def draw_bounding_boxes(
             img (Tensor[C, H, W]): Image Tensor of dtype uint8 with bounding boxes plotted.
         """
 
-        if not isinstance(image, torch.Tensor):
-            raise TypeError(f"Tensor expected, got {type(image)}")
-        elif image.dtype != torch.uint8:
-            raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
-        elif image.dim() != 3:
-            raise ValueError("Pass individual images, not batches")
-        elif image.size(0) not in {1, 3}:
-            raise ValueError("Only grayscale and RGB images are supported")
-
-        num_boxes = boxes.shape[0]
-
-        if labels is None:
-            labels: Union[List[str], List[None]] = [None] * num_boxes  # type: ignore[no-redef]
-        if len(labels) != num_boxes:
-            raise ValueError(
-                f"Number of boxes ({num_boxes}) and labels ({len(labels)}) mismatch. "
-                "Please specify labels for each box."
+        p1, p2 = (int(boxes[0]), int(boxes[1])), (int(boxes[2]), int(boxes[3]))
+        cv2.rectangle(self.img, p1, p2, colors, thickness=self.line_width, lineType=cv2.LINE_AA)
+        if labels:
+            tf = max(self.line_width - 1, 1)  # font thickness
+            w, h = cv2.getTextSize(labels, 0, fontScale=self.line_width / 3, thickness=tf)[0]
+            outside = p1[1] - h - 3 >= 0  # labels fits outside box
+            p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3
+            cv2.rectangle(self.img, p1, p2, colors, -1, cv2.LINE_AA)  # filled
+            cv2.putText(
+                self.img,
+                labels,
+                (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
+                0,
+                self.line_width / 3,
+                txt_colors,
+                thickness=tf,
+                lineType=cv2.LINE_AA,
             )
-
-        if colors is None:
-            colors = _generate_color_palette(num_boxes)
-        elif isinstance(colors, list):
-            if len(colors) < num_boxes:
-                raise ValueError(
-                    f"Number of colors ({len(colors)}) is less than number of boxes ({num_boxes}).",
-                )
-        else:  # colors specifies a single color for all boxes
-            colors = [colors] * num_boxes
-
-        colors = [(ImageColor.getrgb(color) if isinstance(color, str) else color) for color in colors]
-
-        # Handle Grayscale images
-        if image.size(0) == 1:
-            image = torch.tile(image, (3, 1, 1))
-
-        ndarr = image.permute(1, 2, 0).cpu().numpy()
-        img_to_draw = Image.fromarray(ndarr)
-        img_boxes = boxes.to(torch.int64).tolist()
-
-        if fill:
-            draw = ImageDraw.Draw(img_to_draw, "RGBA")
-        else:
-            draw = ImageDraw.Draw(img_to_draw)
-
-        try:
-            txt_font = ImageFont.truetype(font=font, size=font_size)
-        except IOError:
-            txt_font = ImageFont.load_default()
-
-        for bbox, color, label in zip(img_boxes, colors, labels):  # type: ignore[arg-type]
-            if fill:
-                fill_color = color + (100,)
-                draw.rectangle(bbox, width=width, outline=color, fill=fill_color)
-            else:
-                draw.rectangle(bbox, width=width, outline=color)
-
-            if label is not None:
-                margin = width + 1
-                draw.text((bbox[0] + margin, bbox[1] + margin), label, fill=color, font=txt_font)
-
-        return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8)
+        return self.img

From 21506159e834ba13b6c7317c743400b75e7814b5 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Wed, 2 Mar 2022 12:33:30 +0800
Subject: [PATCH 05/16] Cleanup

---
 yolort/utils/visualizer.py | 120 +------------------------------------
 1 file changed, 1 insertion(+), 119 deletions(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index bda8ec37..eb047716 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -1,133 +1,16 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
 # Copyright (c) 2022, yolort team. All rights reserved.
 
-from enum import Enum, unique
 from typing import List, Optional, Tuple, Union
 
 import cv2
-import matplotlib.figure as mplfigure
 import numpy as np
 import torch
-from matplotlib.backends.backend_agg import FigureCanvasAgg
 from torch import Tensor
 
 
-@unique
-class ColorMode(Enum):
-    """
-    Enum of different color modes to use for instance visualizations.
-    """
-
-    IMAGE = 0
-    """
-    Picks a random color for every instance and overlay segmentations with low opacity.
-    """
-    SEGMENTATION = 1
-    """
-    Let instances of the same category have similar colors
-    (from metadata.thing_colors), and overlay them with
-    high opacity. This provides more attention on the quality of segmentation.
-    """
-    IMAGE_BW = 2
-    """
-    Same as IMAGE, but convert all areas without masks to gray-scale.
-    Only available for drawing per-instance mask predictions.
-    """
-
-
-class VisImage:
-    def __init__(self, img: np.ndarray, scale: float = 1.0):
-        """
-        Args:
-            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
-            scale (float): scale the input image
-        """
-        self.img = img
-        self.scale = scale
-        self.width, self.height = img.shape[1], img.shape[0]
-        self._setup_figure(img)
-
-    def _setup_figure(self, img):
-        """
-        Args:
-            Same as in :meth:`__init__()`.
-
-        Returns:
-            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
-            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
-        """
-        fig = mplfigure.Figure(frameon=False)
-        self.dpi = fig.get_dpi()
-        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
-        # (https://github.com/matplotlib/matplotlib/issues/15363)
-        fig.set_size_inches(
-            (self.width * self.scale + 1e-2) / self.dpi,
-            (self.height * self.scale + 1e-2) / self.dpi,
-        )
-        self.canvas = FigureCanvasAgg(fig)
-        # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
-        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
-        ax.axis("off")
-        self.fig = fig
-        self.ax = ax
-        self.reset_image(img)
-
-    def reset_image(self, img):
-        """
-        Args:
-            img: same as in __init__
-        """
-        img = img.astype("uint8")
-        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
-
-    def save(self, filepath):
-        """
-        Args:
-            filepath (str): a string that contains the absolute path, including the file name, where
-                the visualized image will be saved.
-        """
-        self.fig.savefig(filepath)
-
-    def get_image(self):
-        """
-        Returns:
-            ndarray:
-                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
-                The shape is scaled w.r.t the input image using the given `scale` argument.
-        """
-        canvas = self.canvas
-        s, (width, height) = canvas.print_to_buffer()
-        # buf = io.BytesIO()  # works for cairo backend
-        # canvas.print_rgba(buf)
-        # width, height = self.width, self.height
-        # s = buf.getvalue()
-
-        buffer = np.frombuffer(s, dtype="uint8")
-
-        img_rgba = buffer.reshape(height, width, 4)
-        rgb, alpha = np.split(img_rgba, [3], axis=2)
-        return rgb.astype("uint8")
-
-
 class Visualizer:
     """
-    Visualizer that draws data about detection/segmentation on images.
-
-    It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
-    that draw primitive objects to images, as well as high-level wrappers like
-    `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
-    that draw composite data in some pre-defined style.
-
-    Note that the exact visualization style for the high-level wrappers are subject to change.
-    Style such as color, opacity, label contents, visibility of labels, or even the visibility
-    of objects themselves (e.g. when the object is too small) may change according to different
-    heuristics, as long as the results still look visually reasonable.
-
-    To obtain a consistent style, you can implement custom drawing functions with the
-    abovementioned primitive methods instead. If you need more customized visualization
-    styles, you can process the data yourself following their format documented in
-    tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
-    intend to satisfy everyone's preference on drawing styles.
+    Visualizer that draws data about detection on images.
 
     This visualizer focuses on high rendering quality rather than performance. It is not
     designed to be used for real-time applications.
@@ -166,7 +49,6 @@ def __init__(
         else:
             raise TypeError(f"Tensor or numpy.ndarray expected, got {type(image)}")
 
-        self.output = VisImage(self.img, scale=scale)
         self.cpu_device = torch.device("cpu")
         self.line_width = line_width or max(round(sum(self.img.shape) / 2 * 0.003), 2)
 

From 4806c856ca4822ed39e84802af91049af078bd8c Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Wed, 2 Mar 2022 22:34:04 +0800
Subject: [PATCH 06/16] Refactor the method in Visualizer

---
 yolort/utils/visualizer.py | 223 +++++++++++++++++++++++++++++++------
 1 file changed, 191 insertions(+), 32 deletions(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index eb047716..b41d574a 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2022, yolort team. All rights reserved.
 
-from typing import List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import cv2
 import numpy as np
@@ -51,25 +51,40 @@ def __init__(
 
         self.cpu_device = torch.device("cpu")
         self.line_width = line_width or max(round(sum(self.img.shape) / 2 * 0.003), 2)
+        self.output = self.img
 
-    @torch.no_grad()
-    def draw_bounding_boxes(
+    def draw_instance_predictions(self, predictions: Dict):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            predictions (dict): the output of an instance detection model. Following
+                fields will be used to draw: "boxes", "labels", "scores".
+
+        Returns:
+            np.ndarray: image object with visualizations.
+        """
+        boxes = predictions.get("boxes", None)
+        scores = predictions.get("scores", None)
+        labels = predictions.get("labels", None)
+        labels = self._create_text_labels(labels, scores)
+
+        self.overlay_instances(boxes=boxes, labels=labels, assigned_colors=None)
+        return self.output
+
+    def overlay_instances(
         self,
-        boxes: torch.Tensor,
+        *,
+        boxes: Optional[Union[Tensor, np.ndarray]] = None,
         labels: Optional[List[str]] = None,
-        colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None,
-        txt_colors: Tuple[int, int, int] = (255, 255, 255),
-    ) -> torch.Tensor:
+        assigned_colors: Optional[List[str]] = None,
+    ):
         """
-        Draws bounding boxes on given image.
-        The values of the input image should be uint8 between 0 and 255.
-        If fill is True, Resulting Tensor should be saved as PNG image.
-
         Args:
             boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax)
                 format. Note that the boxes are absolute coordinates with respect to the image. In other
                 words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`.
-            labels (List[str]): List containing the labels of bounding boxes.
+            labels (List[string]): List containing the labels of bounding boxes.
             colors (color or list of colors, optional): List containing the colors
                 of the boxes or single color for all boxes. The color can be represented as
                 PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
@@ -81,26 +96,170 @@ def draw_bounding_boxes(
                 or `/Library/Fonts/`, `/System/Library/Fonts/` and `~/Library/Fonts/` on macOS.
             font_size (int): The requested font size in points.
 
+        Args:
+            boxes (Tensor or ndarray): Tensor or numpy array of size (N, 4) containing
+                bounding boxes in (xmin, ymin, xmax, ymax) format for the N objects in
+                a single image. Note that the boxes are absolute coordinates with respect
+                to the image. In other words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`.
+            labels (List[string]): List containing the text to be displayed for each instance.
+            colors (color or list of colors, optional): List containing the colors
+                of the boxes or single color for all boxes. The color can be represented as
+                PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
+                By default, random colors are generated for boxes.
+
         Returns:
-            img (Tensor[C, H, W]): Image Tensor of dtype uint8 with bounding boxes plotted.
+            np.ndarray: image object with visualizations.
         """
+        num_instances = 0
+        if boxes is not None:
+            boxes = self._convert_boxes(boxes)
+            num_instances = len(boxes)
+        if labels is not None:
+            assert len(labels) == num_instances
+        if num_instances == 0:
+            return self.output
+
+        # Display in largest to smallest order to reduce occlusion.
+        areas = None
+        if boxes is not None:
+            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+
+            if labels is not None:
+                # first get a box
+                if boxes is not None:
+                    x0, y0, x1, y1 = boxes[i]
+                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                    horiz_align = "left"
+                else:
+                    continue  # drawing the box confidence for keypoints isn't very useful.
 
-        p1, p2 = (int(boxes[0]), int(boxes[1])), (int(boxes[2]), int(boxes[3]))
-        cv2.rectangle(self.img, p1, p2, colors, thickness=self.line_width, lineType=cv2.LINE_AA)
-        if labels:
-            tf = max(self.line_width - 1, 1)  # font thickness
-            w, h = cv2.getTextSize(labels, 0, fontScale=self.line_width / 3, thickness=tf)[0]
-            outside = p1[1] - h - 3 >= 0  # labels fits outside box
-            p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3
-            cv2.rectangle(self.img, p1, p2, colors, -1, cv2.LINE_AA)  # filled
-            cv2.putText(
-                self.img,
-                labels,
-                (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
-                0,
-                self.line_width / 3,
-                txt_colors,
-                thickness=tf,
-                lineType=cv2.LINE_AA,
-            )
-        return self.img
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                self.draw_text(labels[i], text_pos, color=lighter_color)
+
+        return self.output
+
+    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+        """
+        Draws bounding boxes on given image.
+        The values of the input image should be uint8 between 0 and 255.
+
+        Args:
+            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+                are the coordinates of the image's top left corner. x1 and y1 are the
+                coordinates of the image's bottom right corner.
+            alpha (float): blending coefficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+
+        Returns:
+            np.ndarray: image object with box drawn.
+        """
+        p1, p2 = (int(box_coord[0]), int(box_coord[1])), (int(box_coord[2]), int(box_coord[3]))
+        cv2.rectangle(self.output, p1, p2, edge_color, thickness=self.line_width, lineType=cv2.LINE_AA)
+        return self.output
+
+    def draw_text(
+        self,
+        text: str,
+        position: Tuple,
+        *,
+        font_size: Optional[int] = None,
+        color: str = "g",
+        txt_colors: Tuple[int, int, int] = (255, 255, 255),
+    ):
+        """
+        Args:
+            text (str): class label
+            position (tuple): a tuple of the x and y coordinates to place text on image.
+            font_size (int, optional): font of the text. If not provided, a font size
+                proportional to the image width is calculated and used.
+            color: color of the text. Refer to `matplotlib.colors` for full list
+                of formats that are accepted.
+
+        Returns:
+            np.ndarray: image object with text drawn.
+        """
+        p1, p2 = (int(position[0]), int(position[1])), (int(position[2]), int(position[3]))
+
+        if font_size is None:
+            font_size = max(self.line_width - 1, 1)  # font thickness
+        w, h = cv2.getTextSize(text, 0, fontScale=self.line_width / 3, thickness=font_size)[0]
+        outside = p1[1] - h - 3 >= 0  # text fits outside box
+        p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3
+        cv2.rectangle(self.output, p1, p2, color, -1, cv2.LINE_AA)  # filled
+        cv2.putText(
+            self.output,
+            text,
+            (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
+            0,
+            self.line_width / 3,
+            txt_colors,
+            thickness=font_size,
+            lineType=cv2.LINE_AA,
+        )
+        return self.output
+
+    @staticmethod
+    def _create_text_labels(classes, scores, class_names, is_crowd=None):
+        """
+        Args:
+            classes (list[int] or None):
+            scores (list[float] or None):
+            class_names (list[string] or None):
+            is_crowd (list[bool] or None):
+
+        Returns:
+            list[string] or None
+        """
+        labels = None
+        if classes is not None:
+            if class_names is not None and len(class_names) > 0:
+                labels = [class_names[i] for i in classes]
+            else:
+                labels = [str(i) for i in classes]
+        if scores is not None:
+            if labels is None:
+                labels = ["{:.0f}%".format(s * 100) for s in scores]
+            else:
+                labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
+        if labels is not None and is_crowd is not None:
+            labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
+        return labels
+
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
+        less or more saturation than the original color.
+
+        Args:
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+                0 will correspond to no change, a factor in [-1.0, 0) range will result in
+                a darker color and a factor in (0, 1.0] range will result in a lighter color.
+
+        Returns:
+            modified_color (tuple[double]): a tuple containing the RGB values of the
+                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
+        return color
+
+    def _convert_boxes(self, boxes: Tensor):
+        """
+        Convert different format of boxes to an Nx4 array.
+        """
+        return boxes.cpu().detach().numpy()

From 544f4dee49482c48ee10d43119258e7aaa3bf242 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Thu, 3 Mar 2022 14:17:44 +0800
Subject: [PATCH 07/16] Fix docstring

---
 yolort/utils/visualizer.py | 64 ++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index b41d574a..8f65f44e 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -12,14 +12,23 @@ class Visualizer:
     """
     Visualizer that draws data about detection on images.
 
+    It contains methods like `draw_{text,box}` that draw primitive objects to images, as well as
+    high-level wrappers like `draw_{instance_predictions,dataset_dict}` that draw composite data
+    in some pre-defined style.
+
     This visualizer focuses on high rendering quality rather than performance. It is not
     designed to be used for real-time applications.
 
+    Reference:
+        We have followed most of the interfaces of detectron2 here, but the implementation will be
+        a bit different. Check out the following for more details.
+        https://github.com/facebookresearch/detectron2/blob/9258799/detectron2/utils/visualizer.py
+
     Args:
-        image (Union[torch.Tensor, numpy.ndarray]): Tensor of shape (C x H x W) or ndarray of
+        image (torch.Tensor or numpy.ndarray): Tensor of shape (C x H x W) or ndarray of
             shape (H x W x C) with dtype uint8.
-        instance_mode (ColorMode): defines one of the pre-defined style for drawing
-            instances on an image.
+        instance_mode (int, optional): defines one of the pre-defined style for drawing
+            instances on an image. Default: None
     """
 
     def __init__(
@@ -80,32 +89,20 @@ def overlay_instances(
         assigned_colors: Optional[List[str]] = None,
     ):
         """
-        Args:
-            boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax)
-                format. Note that the boxes are absolute coordinates with respect to the image. In other
-                words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`.
-            labels (List[string]): List containing the labels of bounding boxes.
-            colors (color or list of colors, optional): List containing the colors
-                of the boxes or single color for all boxes. The color can be represented as
-                PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
-                By default, random colors are generated for boxes.
-            fill (bool): If `True` fills the bounding box with specified color.
-            width (int): Width of bounding box.
-            font (str): A filename containing a TrueType font. If the file is not found in this filename,
-                the loader may also search in other directories, such as the `fonts/` directory on Windows
-                or `/Library/Fonts/`, `/System/Library/Fonts/` and `~/Library/Fonts/` on macOS.
-            font_size (int): The requested font size in points.
+        Overlay bounding boxes and labels on input image.
 
         Args:
-            boxes (Tensor or ndarray): Tensor or numpy array of size (N, 4) containing
+            boxes (Tensor or ndarray, optional): Tensor or numpy array of size (N, 4) containing
                 bounding boxes in (xmin, ymin, xmax, ymax) format for the N objects in
                 a single image. Note that the boxes are absolute coordinates with respect
                 to the image. In other words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`.
-            labels (List[string]): List containing the text to be displayed for each instance.
+                Default: None
+            labels (List[string], optional): List containing the text to be displayed for each
+                instance. Default: None
             colors (color or list of colors, optional): List containing the colors
                 of the boxes or single color for all boxes. The color can be represented as
                 PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
-                By default, random colors are generated for boxes.
+                By default, random colors are generated for boxes. Default: None
 
         Returns:
             np.ndarray: image object with visualizations.
@@ -181,11 +178,13 @@ def draw_text(
         txt_colors: Tuple[int, int, int] = (255, 255, 255),
     ):
         """
+        Draws text on given image.
+
         Args:
-            text (str): class label
+            text (string): class label
             position (tuple): a tuple of the x and y coordinates to place text on image.
             font_size (int, optional): font of the text. If not provided, a font size
-                proportional to the image width is calculated and used.
+                proportional to the image width is calculated and used. Default: None
             color: color of the text. Refer to `matplotlib.colors` for full list
                 of formats that are accepted.
 
@@ -213,16 +212,15 @@ def draw_text(
         return self.output
 
     @staticmethod
-    def _create_text_labels(classes, scores, class_names, is_crowd=None):
+    def _create_text_labels(
+        classes: Optional[List[int]] = None,
+        scores: Optional[List[float]] = None,
+        class_names: Optional[List[str]] = None,
+        is_crowd: Optional[List[bool]] = None,
+    ):
         """
-        Args:
-            classes (list[int] or None):
-            scores (list[float] or None):
-            class_names (list[string] or None):
-            is_crowd (list[bool] or None):
-
-        Returns:
-            list[string] or None
+        Build one display string per instance from its class and score, using the class name
+        instead of the class index if concrete class names are provided.
         """
         labels = None
         if classes is not None:
@@ -239,7 +237,7 @@ def _create_text_labels(classes, scores, class_names, is_crowd=None):
             labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
         return labels
 
-    def _change_color_brightness(self, color, brightness_factor):
+    def _change_color_brightness(self, color: Tuple[int, int, int], brightness_factor: float):
         """
         Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
         less or more saturation than the original color.

From 2ece383a548db21d9638ae1922c05e7a00a849e3 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Thu, 3 Mar 2022 15:27:59 +0800
Subject: [PATCH 08/16] Add metadata attribute in Visualizer

---
 yolort/utils/visualizer.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index 8f65f44e..5f6a0a7e 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -27,6 +27,7 @@ class Visualizer:
     Args:
         image (torch.Tensor or numpy.ndarray): Tensor of shape (C x H x W) or ndarray of
             shape (H x W x C) with dtype uint8.
+        metalabels (string, optional): Path to a text file with one class name per line. Default: None
         instance_mode (int, optional): defines one of the pre-defined style for drawing
             instances on an image. Default: None
     """
@@ -34,9 +35,11 @@ class Visualizer:
     def __init__(
         self,
         image: Union[Tensor, np.ndarray],
+        *,
+        metalabels: Optional[str] = None,
         scale: float = 1.0,
         line_width: Optional[int] = None,
-    ):
+    ) -> None:
 
         if isinstance(image, torch.Tensor):
             if image.dtype != torch.uint8:
@@ -58,6 +61,12 @@ def __init__(
         else:
             raise TypeError(f"Tensor or numpy.ndarray expected, got {type(image)}")
 
+        # Set dataset metadata (e.g. class names)
+        self.metadata = None
+        if metalabels is not None:
+            self.metadata = np.loadtxt(metalabels, dtype='str', delimiter='\n')
+
+        self.scale = scale
         self.cpu_device = torch.device("cpu")
         self.line_width = line_width or max(round(sum(self.img.shape) / 2 * 0.003), 2)
         self.output = self.img

From 78c156b91e2d0ea9b39d4eb9b6f00616b04019e1 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Thu, 3 Mar 2022 16:08:37 +0800
Subject: [PATCH 09/16] Fix Visualizer._create_text_labels()

---
 yolort/utils/visualizer.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index 5f6a0a7e..8dea5182 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -71,7 +71,7 @@ def __init__(
         self.line_width = line_width or max(round(sum(self.img.shape) / 2 * 0.003), 2)
         self.output = self.img
 
-    def draw_instance_predictions(self, predictions: Dict):
+    def draw_instance_predictions(self, predictions: Dict[str, Tensor]):
         """
         Draw instance-level prediction results on an image.
 
@@ -82,10 +82,11 @@ def draw_instance_predictions(self, predictions: Dict):
         Returns:
             np.ndarray: image object with visualizations.
         """
-        boxes = predictions.get("boxes", None)
-        scores = predictions.get("scores", None)
-        labels = predictions.get("labels", None)
+        boxes = predictions["boxes"].round().tolist()
+        labels = predictions["labels"].tolist()
+        scores = predictions["scores"].tolist()
         labels = self._create_text_labels(labels, scores)
+        print(labels)
 
         self.overlay_instances(boxes=boxes, labels=labels, assigned_colors=None)
         return self.output
@@ -220,11 +221,10 @@ def draw_text(
         )
         return self.output
 
-    @staticmethod
     def _create_text_labels(
+        self,
         classes: Optional[List[int]] = None,
         scores: Optional[List[float]] = None,
-        class_names: Optional[List[str]] = None,
         is_crowd: Optional[List[bool]] = None,
     ):
         """
@@ -233,17 +233,17 @@ def _create_text_labels(
         """
         labels = None
         if classes is not None:
-            if class_names is not None and len(class_names) > 0:
-                labels = [class_names[i] for i in classes]
+            if self.metadata is not None and len(self.metadata) > 0:
+                labels = [self.metadata[i] for i in classes]
             else:
                 labels = [str(i) for i in classes]
         if scores is not None:
             if labels is None:
-                labels = ["{:.0f}%".format(s * 100) for s in scores]
+                labels = [f"{score * 100:.0f}%" for score in scores]
             else:
-                labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
+                labels = [f"{label} {score * 100:.0f}%" for label, score in zip(labels, scores)]
         if labels is not None and is_crowd is not None:
-            labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
+            labels = [label + ("|crowd" if crowd else "") for label, crowd in zip(labels, is_crowd)]
         return labels
 
     def _change_color_brightness(self, color: Tuple[int, int, int], brightness_factor: float):

From 2cba497797f6e8983d96ed5f44751ae78e30726e Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Thu, 3 Mar 2022 16:10:49 +0800
Subject: [PATCH 10/16] Apply pre-commit

---
 yolort/utils/visualizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index 8dea5182..93495479 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -64,7 +64,7 @@ def __init__(
         # Set dataset metadata (e.g. class names)
         self.metadata = None
         if metalabels is not None:
-            self.metadata = np.loadtxt(metalabels, dtype='str', delimiter='\n')
+            self.metadata = np.loadtxt(metalabels, dtype="str", delimiter="\n")
 
         self.scale = scale
         self.cpu_device = torch.device("cpu")

From b27fcb27ca54bb47a47591cadd2955374263e5d7 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Thu, 3 Mar 2022 23:53:06 +0800
Subject: [PATCH 11/16] Fix Visualizer.overlay_instances()

---
 yolort/utils/visualizer.py | 92 +++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 45 deletions(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index 93495479..9ce1e2e2 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -6,6 +6,7 @@
 import numpy as np
 import torch
 from torch import Tensor
+from yolort.v5.utils.plots import Colors
 
 
 class Visualizer:
@@ -52,12 +53,14 @@ def __init__(
             if image.size(0) == 1:
                 image = torch.tile(image, (3, 1, 1))
             self.img = image.permute(1, 2, 0).cpu().numpy()
+            self.is_bgr = False
         elif isinstance(image, np.ndarray):
             if image.dtype != np.uint8:
                 raise ValueError(f"Numpy uint8 expected, got {image.dtype}")
             if image.ndim != 3:
-                raise ValueError("Currently only RGB images are supported")
+                raise ValueError("Currently only BGR images are supported")
             self.img = image
+            self.is_bgr = True
         else:
             raise TypeError(f"Tensor or numpy.ndarray expected, got {type(image)}")
 
@@ -69,6 +72,7 @@ def __init__(
         self.scale = scale
         self.cpu_device = torch.device("cpu")
         self.line_width = line_width or max(round(sum(self.img.shape) / 2 * 0.003), 2)
+        self.assigned_colors = Colors()
         self.output = self.img
 
     def draw_instance_predictions(self, predictions: Dict[str, Tensor]):
@@ -82,44 +86,37 @@ def draw_instance_predictions(self, predictions: Dict[str, Tensor]):
         Returns:
             np.ndarray: image object with visualizations.
         """
-        boxes = predictions["boxes"].round().tolist()
+        boxes = self._convert_boxes(predictions["boxes"])
         labels = predictions["labels"].tolist()
+        colors = self._create_colors(labels)
         scores = predictions["scores"].tolist()
         labels = self._create_text_labels(labels, scores)
-        print(labels)
 
-        self.overlay_instances(boxes=boxes, labels=labels, assigned_colors=None)
+        self.overlay_instances(boxes=boxes, labels=labels, colors=colors)
         return self.output
 
     def overlay_instances(
         self,
-        *,
-        boxes: Optional[Union[Tensor, np.ndarray]] = None,
+        boxes: Optional[np.ndarray] = None,
         labels: Optional[List[str]] = None,
-        assigned_colors: Optional[List[str]] = None,
+        colors: Optional[List[Tuple[int, int, int]]] = None,
     ):
         """
         Overlay bounding boxes and labels on input image.
 
         Args:
-            boxes (Tensor or ndarray, optional): Tensor or numpy array of size (N, 4) containing
-                bounding boxes in (xmin, ymin, xmax, ymax) format for the N objects in
-                a single image. Note that the boxes are absolute coordinates with respect
-                to the image. In other words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`.
-                Default: None
+            boxes (ndarray, optional): Numpy array of size (N, 4) containing bounding boxes
+                in (xmin, ymin, xmax, ymax) format for the N objects in a single image.
+                Note that the boxes are absolute coordinates with respect to the image. In
+                other words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`. Default: None
             labels (List[string], optional): List containing the text to be displayed for each
                 instance. Default: None
-            colors (color or list of colors, optional): List containing the colors
-                of the boxes or single color for all boxes. The color can be represented as
-                PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
-                By default, random colors are generated for boxes. Default: None
 
         Returns:
             np.ndarray: image object with visualizations.
         """
         num_instances = 0
         if boxes is not None:
-            boxes = self._convert_boxes(boxes)
             num_instances = len(boxes)
         if labels is not None:
             assert len(labels) == num_instances
@@ -136,28 +133,20 @@ def overlay_instances(
             # Re-order overlapped instances in descending order.
             boxes = boxes[sorted_idxs] if boxes is not None else None
             labels = [labels[k] for k in sorted_idxs] if labels is not None else None
-            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+            colors = [colors[k] for k in sorted_idxs ] if colors is not None else None
 
         for i in range(num_instances):
-            color = assigned_colors[i]
+            color = colors[i]
             if boxes is not None:
                 self.draw_box(boxes[i], edge_color=color)
 
             if labels is not None:
-                # first get a box
-                if boxes is not None:
-                    x0, y0, x1, y1 = boxes[i]
-                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
-                    horiz_align = "left"
-                else:
-                    continue  # drawing the box confidence for keypoints isn't very useful.
-
                 lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
-                self.draw_text(labels[i], text_pos, color=lighter_color)
+                self.draw_text(labels[i], boxes[i], color=lighter_color)
 
         return self.output
 
-    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+    def draw_box(self, box_coord: List[float], edge_color: Tuple[int, int, int] = (229, 160, 21)):
         """
         Draws bounding boxes on given image.
         The values of the input image should be uint8 between 0 and 255.
@@ -166,10 +155,7 @@ def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
             box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
                 are the coordinates of the image's top left corner. x1 and y1 are the
                 coordinates of the image's bottom right corner.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
-                for full list of formats that are accepted.
-            line_style (string): the string to use to create the outline of the boxes.
+            edge_color: color of the outline of the box.
 
         Returns:
             np.ndarray: image object with box drawn.
@@ -184,7 +170,7 @@ def draw_text(
         position: Tuple,
         *,
         font_size: Optional[int] = None,
-        color: str = "g",
+        color: Tuple[int, int, int] = (229, 160, 21),
         txt_colors: Tuple[int, int, int] = (255, 255, 255),
     ):
         """
@@ -221,6 +207,15 @@ def draw_text(
         )
         return self.output
 
+    def _convert_boxes(self, boxes: Union[Tensor, np.ndarray]):
+        """
+        Convert different format of boxes to an Nx4 array.
+        """
+        if isinstance(boxes, Tensor):
+            return boxes.cpu().detach().numpy()
+        else:
+            return boxes
+
     def _create_text_labels(
         self,
         classes: Optional[List[int]] = None,
@@ -246,27 +241,34 @@ def _create_text_labels(
             labels = [label + ("|crowd" if crowd else "") for label, crowd in zip(labels, is_crowd)]
         return labels
 
-    def _change_color_brightness(self, color: Tuple[int, int, int], brightness_factor: float):
+    def _create_colors(self, labels: Optional[List[int]] = None):
+        """
+        Generate colors that match the labels.
+        """
+        colors = None
+        if labels is not None:
+            colors = [self.assigned_colors(label, bgr=self.is_bgr) for label in labels]
+        return colors
+
+    def _change_color_brightness(
+        self,
+        color: Tuple[int, int, int],
+        brightness_factor: float,
+    ) -> Tuple[int, int, int]:
         """
         Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
         less or more saturation than the original color.
 
         Args:
-            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
+            color: color of the polygon.
             brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
                 0 will correspond to no change, a factor in [-1.0, 0) range will result in
                 a darker color and a factor in (0, 1.0] range will result in a lighter color.
 
         Returns:
-            modified_color (tuple[double]): a tuple containing the RGB values of the
-                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+            modified_color (tuple[int]): a tuple containing the RGB values of the
+                modified color.
         """
         assert brightness_factor >= -1.0 and brightness_factor <= 1.0
+        # TODO: Implement the details in a follow-up PR
         return color
-
-    def _convert_boxes(self, boxes: Tensor):
-        """
-        Convert different format of boxes to an Nx4 array.
-        """
-        return boxes.cpu().detach().numpy()

From 39a7469612f8cf3f7f534f41956efc236ac0751f Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Thu, 3 Mar 2022 23:59:38 +0800
Subject: [PATCH 12/16] Apply pre-commit

---
 yolort/utils/visualizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index 9ce1e2e2..3f1d3f7e 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -133,7 +133,7 @@ def overlay_instances(
             # Re-order overlapped instances in descending order.
             boxes = boxes[sorted_idxs] if boxes is not None else None
             labels = [labels[k] for k in sorted_idxs] if labels is not None else None
-            colors = [colors[k] for k in sorted_idxs ] if colors is not None else None
+            colors = [colors[k] for k in sorted_idxs] if colors is not None else None
 
         for i in range(num_instances):
             color = colors[i]

From 9284a2288717ddb832bafd34eb65c0732ed2f33a Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Fri, 4 Mar 2022 00:47:27 +0800
Subject: [PATCH 13/16] Cleanup

---
 yolort/utils/visualizer.py | 115 ++++++++++++++++++++-----------------
 1 file changed, 61 insertions(+), 54 deletions(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index 3f1d3f7e..dfad5df6 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -2,12 +2,16 @@
 
 from typing import Dict, List, Optional, Tuple, Union
 
-import cv2
 import numpy as np
 import torch
 from torch import Tensor
 from yolort.v5.utils.plots import Colors
 
+try:
+    import cv2
+except ImportError:
+    cv2 = None
+
 
 class Visualizer:
     """
@@ -86,6 +90,10 @@ def draw_instance_predictions(self, predictions: Dict[str, Tensor]):
         Returns:
             np.ndarray: image object with visualizations.
         """
+
+        if cv2 is None:
+            raise ImportError("OpenCV is not installed, please install it first.")
+
         boxes = self._convert_boxes(predictions["boxes"])
         labels = predictions["labels"].tolist()
         colors = self._create_colors(labels)
@@ -97,111 +105,110 @@ def draw_instance_predictions(self, predictions: Dict[str, Tensor]):
 
     def overlay_instances(
         self,
-        boxes: Optional[np.ndarray] = None,
-        labels: Optional[List[str]] = None,
-        colors: Optional[List[Tuple[int, int, int]]] = None,
-    ):
+        boxes: np.ndarray,
+        labels: List[str],
+        colors: List[Tuple[int, int, int]],
+    ) -> np.ndarray:
         """
         Overlay bounding boxes and labels on input image.
 
         Args:
-            boxes (ndarray, optional): Numpy array of size (N, 4) containing bounding boxes
+            boxes (np.ndarray): Numpy array of size (N, 4) containing bounding boxes
                 in (xmin, ymin, xmax, ymax) format for the N objects in a single image.
-                Note that the boxes are absolute coordinates with respect to the image. In
-                other words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`. Default: None
-            labels (List[string], optional): List containing the text to be displayed for each
-                instance. Default: None
+                Note that the boxes are absolute coordinates with respect to the image.
+                In other words: `0 <= xmin < xmax < W` and `0 <= ymin < ymax < H`.
+            labels (List[string]): List containing the text to be displayed for each
+                instance.
+            colors (List[Tuple[int, int, int]]): List containing, for each instance, the color
+                used to draw its box outline and its label background.
 
         Returns:
             np.ndarray: image object with visualizations.
         """
-        num_instances = 0
-        if boxes is not None:
-            num_instances = len(boxes)
-        if labels is not None:
-            assert len(labels) == num_instances
+
+        num_instances = len(boxes)
+        assert len(labels) == num_instances
         if num_instances == 0:
             return self.output
 
         # Display in largest to smallest order to reduce occlusion.
-        areas = None
-        if boxes is not None:
-            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
-
-        if areas is not None:
-            sorted_idxs = np.argsort(-areas).tolist()
-            # Re-order overlapped instances in descending order.
-            boxes = boxes[sorted_idxs] if boxes is not None else None
-            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
-            colors = [colors[k] for k in sorted_idxs] if colors is not None else None
-
-        for i in range(num_instances):
-            color = colors[i]
-            if boxes is not None:
-                self.draw_box(boxes[i], edge_color=color)
-
-            if labels is not None:
-                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
-                self.draw_text(labels[i], boxes[i], color=lighter_color)
+        areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+
+        sorted_idxs = np.argsort(-areas).tolist()
+        # Re-order overlapped instances in descending order.
+        boxes = boxes[sorted_idxs] if boxes is not None else None
+        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+        colors = [colors[k] for k in sorted_idxs] if colors is not None else None
+
+        for box, label, color in zip(boxes, labels, colors):
+            pt1, pt2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
+            self.draw_box(pt1, pt2, color=color)
+
+            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+            self.draw_text(label, pt1, pt2, color=lighter_color)
 
         return self.output
 
-    def draw_box(self, box_coord: List[float], edge_color: Tuple[int, int, int] = (229, 160, 21)):
+    def draw_box(
+        self,
+        pt1: Tuple[int, int],
+        pt2: Tuple[int, int],
+        color: Tuple[int, int, int] = (229, 160, 21),
+    ) -> np.ndarray:
         """
         Draws bounding boxes on given image.
         The values of the input image should be uint8 between 0 and 255.
 
         Args:
-            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
-                are the coordinates of the image's top left corner. x1 and y1 are the
-                coordinates of the image's bottom right corner.
-            edge_color: color of the outline of the box.
+            pt1 (Tuple[int, int]): Vertex of the rectangle (top left corner).
+            pt2 (Tuple[int, int]): Vertex of the rectangle opposite to pt1 (bottom right corner).
+            color (Tuple[int, int, int]): color of the outline of the box.
 
         Returns:
             np.ndarray: image object with box drawn.
         """
-        p1, p2 = (int(box_coord[0]), int(box_coord[1])), (int(box_coord[2]), int(box_coord[3]))
-        cv2.rectangle(self.output, p1, p2, edge_color, thickness=self.line_width, lineType=cv2.LINE_AA)
+        cv2.rectangle(self.output, pt1, pt2, color, thickness=self.line_width, lineType=cv2.LINE_AA)
         return self.output
 
     def draw_text(
         self,
         text: str,
-        position: Tuple,
+        pt1: Tuple[int, int],
+        pt2: Tuple[int, int],
         *,
         font_size: Optional[int] = None,
         color: Tuple[int, int, int] = (229, 160, 21),
-        txt_colors: Tuple[int, int, int] = (255, 255, 255),
+        txt_color: Tuple[int, int, int] = (255, 255, 255),
     ):
         """
         Draws text on given image.
 
         Args:
             text (string): class label
-            position (tuple): a tuple of the x and y coordinates to place text on image.
+            pt1 (Tuple[int, int]): Vertex of the rectangle (top left corner).
+            pt2 (Tuple[int, int]): Vertex of the rectangle opposite to pt1 (bottom right corner).
             font_size (int, optional): font of the text. If not provided, a font size
                 proportional to the image width is calculated and used. Default: None
-            color: color of the text. Refer to `matplotlib.colors` for full list
-                of formats that are accepted.
+            color (Tuple[int, int, int]): color of the filled rectangle drawn behind the text.
+            txt_color (Tuple[int, int, int]): color of the text.
 
         Returns:
             np.ndarray: image object with text drawn.
         """
-        p1, p2 = (int(position[0]), int(position[1])), (int(position[2]), int(position[3]))
 
         if font_size is None:
             font_size = max(self.line_width - 1, 1)  # font thickness
         w, h = cv2.getTextSize(text, 0, fontScale=self.line_width / 3, thickness=font_size)[0]
-        outside = p1[1] - h - 3 >= 0  # text fits outside box
-        p2 = p1[0] + w, p1[1] - h - 3 if outside else p1[1] + h + 3
-        cv2.rectangle(self.output, p1, p2, color, -1, cv2.LINE_AA)  # filled
+        outside = pt1[1] - h - 3 >= 0  # text fits outside box
+        pt2 = pt1[0] + w, pt1[1] - h - 3 if outside else pt1[1] + h + 3
+        cv2.rectangle(self.output, pt1, pt2, color, -1, cv2.LINE_AA)  # filled
         cv2.putText(
             self.output,
             text,
-            (p1[0], p1[1] - 2 if outside else p1[1] + h + 2),
+            (pt1[0], pt1[1] - 2 if outside else pt1[1] + h + 2),
             0,
             self.line_width / 3,
-            txt_colors,
+            txt_color,
             thickness=font_size,
             lineType=cv2.LINE_AA,
         )
@@ -260,13 +267,13 @@ def _change_color_brightness(
         less or more saturation than the original color.
 
         Args:
-            color: color of the polygon.
+            color (Tuple[int, int, int]): color of the polygon.
             brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
                 0 will correspond to no change, a factor in [-1.0, 0) range will result in
                 a darker color and a factor in (0, 1.0] range will result in a lighter color.
 
         Returns:
-            modified_color (tuple[int]): a tuple containing the RGB values of the
+            modified_color (Tuple[int, int, int]): a tuple containing the RGB/BGR values of the
                 modified color.
         """
         assert brightness_factor >= -1.0 and brightness_factor <= 1.0

From b0e2f7c4657085abe56bd2a06a95a37617414d0f Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Fri, 4 Mar 2022 01:07:05 +0800
Subject: [PATCH 14/16] Add test_visualizer

---
 test/test_utils.py       | 19 ++++++++++++++++++-
 yolort/utils/__init__.py |  2 ++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index ba5726d8..3bbe7452 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -4,13 +4,15 @@
 import pytest
 import torch
 from torch import Tensor
+from torchvision.io import read_image
 from yolort import models
 from yolort.models import YOLO
 from yolort.utils import (
-    FeatureExtractor,
     get_image_from_url,
     load_from_ultralytics,
     read_image_to_tensor,
+    FeatureExtractor,
+    Visualizer,
 )
 from yolort.utils.image_utils import box_cxcywh_to_xyxy
 from yolort.v5 import (
@@ -22,6 +24,21 @@
 )
 
 
+
+@pytest.mark.parametrize("arch", ["yolov5n"])
+def test_visualizer(arch):
+    model = models.__dict__[arch](pretrained=True, size=(320, 320), score_thresh=0.45)
+    model = model.eval()
+    img_path = "test/assets/zidane.jpg"
+    preds = model.predict(img_path)
+
+    metalabels_path = "notebooks/assets/coco.names"
+    image = read_image(img_path)
+    v = Visualizer(image, metalabels=metalabels_path)
+    output = v.draw_instance_predictions(preds[0])
+    assert isinstance(output, np.ndarray)
+
+
 @pytest.mark.parametrize(
     "arch, version, upstream_version, hash_prefix, use_p6",
     [
diff --git a/yolort/utils/__init__.py b/yolort/utils/__init__.py
index ecdaf098..ea8479ef 100644
--- a/yolort/utils/__init__.py
+++ b/yolort/utils/__init__.py
@@ -8,6 +8,7 @@
 from .hooks import FeatureExtractor
 from .image_utils import cv2_imshow, get_image_from_url, read_image_to_tensor
 from .update_module_state import convert_yolov5_to_yolort, load_from_ultralytics
+from .visualizer import Visualizer
 
 
 __all__ = [
@@ -19,6 +20,7 @@
     "load_from_ultralytics",
     "load_state_dict_from_url",
     "read_image_to_tensor",
+    "Visualizer",
 ]
 
 

From 9386c160eff218b68ba5be8cb371f36fd745e265 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Fri, 4 Mar 2022 01:08:25 +0800
Subject: [PATCH 15/16] Apply pre-commit

---
 test/test_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 3bbe7452..bfbacfcf 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -24,7 +24,6 @@
 )
 
 
-
 @pytest.mark.parametrize("arch", ["yolov5n"])
 def test_visualizer(arch):
     model = models.__dict__[arch](pretrained=True, size=(320, 320), score_thresh=0.45)

From 88b830a6f1a18679638e760db5713a2f305a2d53 Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang <zhiqwang@foxmail.com>
Date: Fri, 4 Mar 2022 01:27:43 +0800
Subject: [PATCH 16/16] Add Visualizer.imshow()

---
 yolort/utils/visualizer.py | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/yolort/utils/visualizer.py b/yolort/utils/visualizer.py
index dfad5df6..ba4d3f85 100644
--- a/yolort/utils/visualizer.py
+++ b/yolort/utils/visualizer.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import torch
+from PIL import Image
 from torch import Tensor
 from yolort.v5.utils.plots import Colors
 
@@ -37,14 +38,7 @@ class Visualizer:
             instances on an image. Default: None
     """
 
-    def __init__(
-        self,
-        image: Union[Tensor, np.ndarray],
-        *,
-        metalabels: Optional[str] = None,
-        scale: float = 1.0,
-        line_width: Optional[int] = None,
-    ) -> None:
+    def __init__(self, image: Union[Tensor, np.ndarray], *, metalabels: Optional[str] = None) -> None:
 
         if isinstance(image, torch.Tensor):
             if image.dtype != torch.uint8:
@@ -73,9 +67,7 @@ def __init__(
         if metalabels is not None:
             self.metadata = np.loadtxt(metalabels, dtype="str", delimiter="\n")
 
-        self.scale = scale
-        self.cpu_device = torch.device("cpu")
-        self.line_width = line_width or max(round(sum(self.img.shape) / 2 * 0.003), 2)
+        self.line_width = max(round(sum(self.img.shape) / 2 * 0.003), 2)
         self.assigned_colors = Colors()
         self.output = self.img
 
@@ -103,6 +95,30 @@ def draw_instance_predictions(self, predictions: Dict[str, Tensor]):
         self.overlay_instances(boxes=boxes, labels=labels, colors=colors)
         return self.output
 
+    def imshow(self, scale: Optional[float] = None):
+        """
+        Display ``self.output`` inline; a replacement for cv2.imshow() in Jupyter notebooks.
+
+        Args:
+            scale (float, optional): resize factor for display; no resize if None. Default: None
+        """
+        from IPython.display import display  # local import: IPython is needed only when imshow() runs
+
+        img = self.output
+
+        img = img.clip(0, 255).astype("uint8")
+        # cv2 stores colors as BGR; convert to RGB(A) before handing the array to PIL
+        if self.is_bgr and img.ndim == 3:
+            if img.shape[2] == 4:
+                img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGBA)
+            else:
+                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+        if scale is not None:
+            img = cv2.resize(img, None, fx=scale, fy=scale)
+
+        display(Image.fromarray(img))
+
     def overlay_instances(
         self,
         boxes: np.ndarray,