Palm detection using YOLO with OAK-D pipeline
Kovelja009 committed Apr 1, 2024
1 parent ac6c438 commit af6956f
Showing 7 changed files with 354 additions and 52 deletions.
4 changes: 2 additions & 2 deletions .gitignore
@@ -41,15 +41,15 @@ results*.csv
coco/
coco128/
VOC/

hagrid/
# MATLAB GitIgnore -----------------------------------------------------------------------------------------------------
*.m~
*.mat
!targets*.mat

# Neural Network weights -----------------------------------------------------------------------------------------------
*.weights
*.pt
# *.pt
*.pb
*.onnx
*.engine
80 changes: 80 additions & 0 deletions OAK_D_api.py
@@ -0,0 +1,80 @@
import cv2
import depthai as dai
import time


class FPSHandler:
    def __init__(self):
        # Offset the first timestamp so fps() never divides by zero before a frame arrives
        self.timestamp = time.time() + 1
        self.start = time.time()
        self.frame_cnt = 0

        # Text overlay settings for the FPS counter
        self._coordinates = (20, 20)
        self._font = cv2.FONT_HERSHEY_SIMPLEX
        self._font_scale = 0.7
        self._color = (0, 0, 255)
        self._thickness = 1

    def next_iter(self):
        self.timestamp = time.time()
        self.frame_cnt += 1

    def fps(self):
        return self.frame_cnt / (self.timestamp - self.start)

    def show_fps(self, frame, fps):
        return cv2.putText(frame, str(fps), self._coordinates, self._font, self._font_scale, self._color,
                           self._thickness, cv2.LINE_AA)


class OAK_D:
    def __init__(self, fps=24, width=1920, height=1080):
        # Create pipeline
        self._pipeline = dai.Pipeline()

        # Define source and output
        self._camRgb = self._pipeline.create(dai.node.ColorCamera)
        self._xoutVideo = self._pipeline.create(dai.node.XLinkOut)

        self._xoutVideo.setStreamName("video")

        # Properties
        self._camRgb.setBoardSocket(dai.CameraBoardSocket.RGB)
        self._camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
        self._camRgb.setVideoSize(width, height)
        self._camRgb.setFps(fps)

        self._xoutVideo.input.setBlocking(False)
        self._xoutVideo.input.setQueueSize(1)

        # Linking
        self._camRgb.video.link(self._xoutVideo.input)

        # Connect to device and start pipeline
        self._device = dai.Device(self._pipeline)
        self._video = self._device.getOutputQueue(name="video", maxSize=1, blocking=False)
        self.fps_handler = FPSHandler()
        self.height = self._camRgb.getVideoHeight()
        self.width = self._camRgb.getVideoWidth()

    def get_color_frame(self, show_fps=False):
        video_in = self._video.get()
        # Convert the NV12-encoded video frame to a BGR frame for OpenCV;
        # visualizing the frame on slower hosts may add overhead
        cv_frame = video_in.getCvFrame()
        if show_fps:
            self.fps_handler.next_iter()
            return self.fps_handler.show_fps(cv_frame, round(self.fps_handler.fps(), 2))
        else:
            return cv_frame


if __name__ == '__main__':
    oak_d = OAK_D(fps=60, width=300, height=300)
    while True:
        frame = oak_d.get_color_frame(show_fps=True)
        cv2.imshow("VidraCar", frame)
        if cv2.waitKey(1) == ord('q'):
            break
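
For reference, a minimal sketch (not part of this commit) of an alternative DepthAI pipeline that exposes a "preview" output already sized for the detector input, so the host does not have to letterbox full 1080p frames; the stream name and sizes below are illustrative assumptions:

import depthai as dai

pipeline = dai.Pipeline()
cam = pipeline.create(dai.node.ColorCamera)
cam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
cam.setPreviewSize(640, 640)  # matches the YOLOv5 inference size (illustrative)
cam.setInterleaved(False)

xout = pipeline.create(dai.node.XLinkOut)
xout.setStreamName("preview")
cam.preview.link(xout.input)

with dai.Device(pipeline) as device:
    queue = device.getOutputQueue(name="preview", maxSize=1, blocking=False)
    frame = queue.get().getCvFrame()  # 640x640 BGR frame, ready for inference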
91 changes: 91 additions & 0 deletions inference.py
@@ -0,0 +1,91 @@
import torch
from ultralytics.utils.plotting import Annotator, colors

from utils.dataloaders import OakDLoadImages
from utils.general import (
    Profile,
    check_img_size,
    non_max_suppression,
    scale_boxes,
    xyxy2xywh,
)
from utils.torch_utils import smart_inference_mode


@smart_inference_mode()
def run(
    frame=None,  # OpenCV image
    imgsz=(640, 640),  # inference size (height, width)
    conf_thres=0.25,  # confidence threshold
    iou_thres=0.45,  # NMS IoU threshold
    max_det=1000,  # maximum detections per image
    device="",  # cuda device, i.e. 0 or 0,1,2,3 or cpu
    classes=None,  # filter by class: --class 0, or --class 0 2 3
    agnostic_nms=False,  # class-agnostic NMS
    line_thickness=3,  # bounding box thickness (pixels)
    hide_labels=False,  # hide labels
    hide_conf=False,  # hide confidences
    half=False,  # use FP16 half-precision inference
    dnn=False,  # use OpenCV DNN for ONNX inference
    model=None,  # preloaded DetectMultiBackend model
):
    stride, names, pt = model.stride, model.names, model.pt
    imgsz = check_img_size(imgsz, s=stride)  # check image size

    # Dataloader
    bs = 1  # batch_size
    dataset = OakDLoadImages(frame, img_size=imgsz[0], stride=stride, auto=pt)

    # Run inference
    model.warmup(imgsz=(1 if pt or model.triton else bs, 3, *imgsz))  # warmup
    seen, _, dt = 0, [], (Profile(device=device), Profile(device=device), Profile(device=device))
    for im, im0 in dataset:
        with dt[0]:
            im = torch.from_numpy(im).to(model.device)
            im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
            im /= 255  # 0 - 255 to 0.0 - 1.0
            if len(im.shape) == 3:
                im = im[None]  # expand for batch dim
            if model.xml and im.shape[0] > 1:
                ims = torch.chunk(im, im.shape[0], 0)

        # Inference
        with dt[1]:
            if model.xml and im.shape[0] > 1:
                pred = None
                for image in ims:
                    if pred is None:
                        pred = model(image, augment=False, visualize=False).unsqueeze(0)
                    else:
                        pred = torch.cat((pred, model(image, augment=False, visualize=False).unsqueeze(0)), dim=0)
                pred = [pred, None]
            else:
                pred = model(im, augment=False, visualize=False)

        # NMS
        with dt[2]:
            pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)

        results_for_bounding_boxes = []
        # Process predictions
        for _, det in enumerate(pred):  # per image
            seen += 1
            annotator = Annotator(im0, line_width=line_thickness, example=str(names))
            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0.shape).round()

                # Write results
                for *xyxy, conf, cls in reversed(det):
                    # Add bbox to image
                    c = int(cls)  # integer class
                    label = None if hide_labels else (names[c] if hide_conf else f"{names[c]} {conf:.2f}")
                    annotator.box_label(xyxy, label, color=colors(c, True))

                    # TODO: check whether normalized xyxy or pixel xyxy is needed here
                    results_for_bounding_boxes.append(xyxy)

        return im0, results_for_bounding_boxes
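
Regarding the TODO above: after scale_boxes() the returned xyxy values appear to be pixel coordinates in the original frame. A hedged helper sketch (hypothetical, not part of the commit) for converting them to (x_center, y_center, width, height), optionally normalized to [0, 1]:

def to_xywh(results_for_bounding_boxes, frame_shape, normalize=False):
    # Each entry is a list of four scalar tensors [x1, y1, x2, y2] in pixels
    h, w = frame_shape[:2]
    out = []
    for xyxy in results_for_bounding_boxes:
        x1, y1, x2, y2 = (float(v) for v in xyxy)
        xc, yc, bw, bh = (x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1
        if normalize:
            xc, bw, yc, bh = xc / w, bw / w, yc / h, bh / h
        out.append((xc, yc, bw, bh))
    return out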
49 changes: 49 additions & 0 deletions models/custom_yolov5s.yaml
@@ -0,0 +1,49 @@
# YOLOv5 🚀 by Ultralytics, AGPL-3.0 license

# Parameters
nc: 3 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
anchors:
  - [10, 13, 16, 30, 33, 23] # P3/8
  - [30, 61, 62, 45, 59, 119] # P4/16
  - [116, 90, 156, 198, 373, 326] # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [
    [-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
    [-1, 1, Conv, [128, 3, 2]], # 1-P2/4
    [-1, 3, C3, [128]],
    [-1, 1, Conv, [256, 3, 2]], # 3-P3/8
    [-1, 6, C3, [256]],
    [-1, 1, Conv, [512, 3, 2]], # 5-P4/16
    [-1, 9, C3, [512]],
    [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
    [-1, 3, C3, [1024]],
    [-1, 1, SPPF, [1024, 5]], # 9
  ]

# YOLOv5 v6.0 head
head: [
    [-1, 1, Conv, [512, 1, 1]],
    [-1, 1, nn.Upsample, [None, 2, "nearest"]],
    [[-1, 6], 1, Concat, [1]], # cat backbone P4
    [-1, 3, C3, [512, False]], # 13

    [-1, 1, Conv, [256, 1, 1]],
    [-1, 1, nn.Upsample, [None, 2, "nearest"]],
    [[-1, 4], 1, Concat, [1]], # cat backbone P3
    [-1, 3, C3, [256, False]], # 17 (P3/8-small)

    [-1, 1, Conv, [256, 3, 2]],
    [[-1, 14], 1, Concat, [1]], # cat head P4
    [-1, 3, C3, [512, False]], # 20 (P4/16-medium)

    [-1, 1, Conv, [512, 3, 2]],
    [[-1, 10], 1, Concat, [1]], # cat head P5
    [-1, 3, C3, [1024, False]], # 23 (P5/32-large)

    [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
  ]
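
A hedged sketch of how this config would typically be plugged into YOLOv5 training; the dataset yaml path and hyperparameters are illustrative assumptions (the dataset yaml is not part of this commit), while the run name mirrors the runs/train/yolov5s_results* path loaded by rivian.py below:

# Illustrative only: assumes the standard YOLOv5 repo layout and a dataset
# definition at data/hagrid.yaml (not included in this commit).
import train  # YOLOv5's train.py

train.run(
    data="data/hagrid.yaml",           # assumed dataset yaml (3 classes, matching nc: 3 above)
    cfg="models/custom_yolov5s.yaml",  # the custom config added in this commit
    weights="yolov5s.pt",              # start from the pretrained small model
    imgsz=640,
    epochs=100,                        # illustrative
    batch_size=16,                     # illustrative
    name="yolov5s_results",            # matches runs/train/yolov5s_results* used by rivian.py
)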
109 changes: 59 additions & 50 deletions requirements.txt
@@ -1,50 +1,59 @@
-# YOLOv5 requirements
-# Usage: pip install -r requirements.txt
-
-# Base ------------------------------------------------------------------------
-gitpython>=3.1.30
-matplotlib>=3.3
-numpy>=1.23.5
-opencv-python>=4.1.1
-Pillow>=9.4.0
-psutil # system resources
-PyYAML>=5.3.1
-requests>=2.23.0
-scipy>=1.4.1
-thop>=0.1.1 # FLOPs computation
-torch>=1.8.0 # see https://pytorch.org/get-started/locally (recommended)
-torchvision>=0.9.0
-tqdm>=4.64.0
-ultralytics>=8.0.232
-# protobuf<=3.20.1 # https://github.com/ultralytics/yolov5/issues/8012
-
-# Logging ---------------------------------------------------------------------
-# tensorboard>=2.4.1
-# clearml>=1.2.0
-# comet
-
-# Plotting --------------------------------------------------------------------
-pandas>=1.1.4
-seaborn>=0.11.0
-
-# Export ----------------------------------------------------------------------
-# coremltools>=6.0 # CoreML export
-# onnx>=1.10.0 # ONNX export
-# onnx-simplifier>=0.4.1 # ONNX simplifier
-# nvidia-pyindex # TensorRT export
-# nvidia-tensorrt # TensorRT export
-# scikit-learn<=1.1.2 # CoreML quantization
-# tensorflow>=2.4.0,<=2.13.1 # TF exports (-cpu, -aarch64, -macos)
-# tensorflowjs>=3.9.0 # TF.js export
-# openvino-dev>=2023.0 # OpenVINO export
-
-# Deploy ----------------------------------------------------------------------
-setuptools>=65.5.1 # Snyk vulnerability fix
-# tritonclient[all]~=2.24.0
-
-# Extras ----------------------------------------------------------------------
-# ipython # interactive notebook
-# mss # screenshots
-# albumentations>=1.0.3
-# pycocotools>=2.0.6 # COCO mAP
-wheel>=0.38.0 # not directly required, pinned by Snyk to avoid a vulnerability
+certifi==2024.2.2
+charset-normalizer==3.3.2
+clip @ git+https://github.com/ultralytics/CLIP.git@e17416a36b45d040758327936a1ea150c13fe3d1
+contourpy==1.2.0
+cycler==0.12.1
+depthai==2.25.0.0
+filelock==3.13.3
+fonttools==4.50.0
+fsspec==2024.3.1
+ftfy==6.2.0
+gitdb==4.0.11
+GitPython==3.1.43
+idna==3.6
+Jinja2==3.1.3
+kiwisolver==1.4.5
+MarkupSafe==2.1.5
+matplotlib==3.8.3
+mpmath==1.3.0
+networkx==3.2.1
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.19.3
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.1.105
+opencv-python==4.9.0.80
+packaging==24.0
+pandas==2.2.1
+pillow==10.2.0
+psutil==5.9.8
+py-cpuinfo==9.0.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+scipy==1.12.0
+seaborn==0.13.2
+six==1.16.0
+smmap==5.0.1
+sympy==1.12
+thop==0.1.1.post2209072238
+torch==2.2.2
+torchvision==0.17.2
+tqdm==4.66.2
+triton==2.2.0
+typing_extensions==4.10.0
+tzdata==2024.1
+ultralytics==8.1.39
+urllib3==2.2.1
+wcwidth==0.2.13
32 changes: 32 additions & 0 deletions rivian.py
@@ -0,0 +1,32 @@
import OAK_D_api as oak
import cv2
import torch
from inference import run
from models.common import DetectMultiBackend


def run_object_detection():
    # Load model
    weights_path = './runs/train/yolov5s_results3/weights/best.pt'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = DetectMultiBackend(weights_path, device=device, dnn=False, fp16=False)
    model.eval()

    # Camera setup
    oak_d = oak.OAK_D(fps=60, width=1920, height=1080)

    while True:
        frame = oak_d.get_color_frame(show_fps=True)
        img, results_for_bounding_boxes = run(frame=frame, classes=[1], model=model)

        cv2.imshow("Levi", img)

        # Break the loop if 'q' key is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            cv2.destroyAllWindows()
            break


if __name__ == '__main__':
    run_object_detection()
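
A hypothetical follow-up helper (not part of this commit) for consuming the boxes that run_object_detection() currently receives but does not use further: it crops each detected region out of the original frame for later processing. The function name and padding value are illustrative.

def crop_detections(frame, results_for_bounding_boxes, pad=10):
    # frame: BGR image from OAK_D; boxes: pixel-space xyxy returned by run()
    h, w = frame.shape[:2]
    crops = []
    for xyxy in results_for_bounding_boxes:
        x1, y1, x2, y2 = (int(v) for v in xyxy)
        x1, y1 = max(x1 - pad, 0), max(y1 - pad, 0)  # clamp the padded box to the frame
        x2, y2 = min(x2 + pad, w), min(y2 + pad, h)
        crops.append(frame[y1:y2, x1:x2].copy())
    return crops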
