From 5247cb78f4497ea83edcd0f5ff3ecb6d253756d1 Mon Sep 17 00:00:00 2001
From: mthodoris
Date: Mon, 13 Nov 2023 09:56:39 +0200
Subject: [PATCH 01/15] Add new functions for adaptive high-resolution pose
 estimation. Remove some leftover offset variables, which became obsolete
 after changes in the annotation file and in the sample image used for
 evaluation.

---
 .../high_resolution_learner.py | 932 +++++++++++++++++-
 1 file changed, 915 insertions(+), 17 deletions(-)

diff --git a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
index 43600eeeec..ed2d25beab 100644
--- a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
+++ b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
@@ -45,9 +45,9 @@ class HighResolutionPoseEstimationLearner(LightweightOpenPoseLearner):
     def __init__(self, device='cuda', backbone='mobilenet',
                  temp_path='temp', mobilenet_use_stride=True, mobilenetv2_width=1.0, shufflenet_groups=3,
                  num_refinement_stages=2, batches_per_iter=1, base_height=256,
-                 first_pass_height=360, second_pass_height=540, percentage_around_crop=0.3, heatmap_threshold=0.1,
-                 experiment_name='default', num_workers=8, weights_only=True, output_name='detections.json',
-                 multiscale=False, scales=None, visualize=False,
+                 first_pass_height=360, second_pass_height=540, method='adaptive', percentage_around_crop=0.3,
+                 heatmap_threshold=0.1, experiment_name='default', num_workers=8, weights_only=True,
+                 output_name='detections.json', multiscale=False, scales=None, visualize=False,
                  img_mean=np.array([128, 128, 128], np.float32), img_scale=np.float32(1 / 256),
                  pad_value=(0, 0, 0), half_precision=False):
@@ -67,14 +67,32 @@ def __init__(self, device='cuda', backbone='mobilenet',
         self.first_pass_height = first_pass_height
         self.second_pass_height = second_pass_height
+        self.method = method
         self.perc = percentage_around_crop
         self.threshold = heatmap_threshold
-        self.xmin = None
-        self.ymin = None
-        self.xmax = None
-        self.ymax = None
-        self.counter = 0
         self.prev_heatmap = np.array([])
+        self.counter = 0
+        if self.method == 'primary':
+            self.xmin = None
+            self.ymin = None
+            self.xmax = None
+            self.ymax = None
+
+        elif self.method == 'adaptive':
+            self.xmin = None
+            self.ymin = None
+            self.xmax = None
+            self.ymax = None
+
+            self.x1min = None
+            self.x1max = None
+            self.y1min = None
+            self.y1max = None
+
+            self.x2min = None
+            self.x2max = None
+            self.y2min = None
+            self.y2max = None
 
     def __first_pass(self, img):
         """
@@ -177,6 +195,193 @@ def __pooling(self, img, kernel):  # Pooling on input image for dimension reduct
         pool_img = pool_img.squeeze(0).permute(1, 2, 0).cpu().float().numpy()
         return pool_img
 
+    def __crop_heatmap(self, heatmap):
+        """ This method takes the generated heatmap and crops it around the desired ROI, using its nonzero values.
+        Parameters
+        ----------
+        :param heatmap: the heatmap generated by the __first_pass function
+        :type heatmap: numpy.array
+        Returns
+        -------
+        heatmap_dims: a tuple that contains the boundaries of the cropped region, returned together with a
+            detection flag
+        """
+        detection = False
+
+        if heatmap.nonzero()[0].size > 10 and heatmap.nonzero()[1].size > 10:
+            detection = True
+            xmin = min(heatmap.nonzero()[1])
+            ymin = min(heatmap.nonzero()[0])
+            xmax = max(heatmap.nonzero()[1])
+            ymax = max(heatmap.nonzero()[0])
+        else:
+            xmin, ymin, xmax, ymax = 0, 0, 0, 0
+
+        heatmap_dims = (int(xmin), int(xmax), int(ymin), int(ymax))
+        return heatmap_dims, detection
+
+    def __check_for_split(self, cropped_heatmap):
+        """ This function checks whether or not the cropped heatmap needs further processing for an extra crop.
+        More specifically, it returns a boolean with the decision for a further crop; the decision depends on the
+        distance between the target subjects.
+        :param cropped_heatmap: the cropped area from the original heatmap
+
+        Returns: boolean
+
+        """
+        sum_rows = cropped_heatmap.sum(axis=1)
+        sum_col = cropped_heatmap.sum(axis=0)
+
+        heatmap_activation_area = len(sum_col.nonzero()[0]) * len(sum_rows.nonzero()[0])
+        crop_total_area = cropped_heatmap.shape[0] * cropped_heatmap.shape[1]  # heatmap total area
+
+        if crop_total_area != 0:
+            crop_rule1 = (heatmap_activation_area / crop_total_area * 100 < 80) and (
+                    heatmap_activation_area / crop_total_area * 100 > 5)
+            if crop_rule1:
+                return True
+            else:
+                return False
+        else:
+            return False
+
+    def __split_process(self, cropped_heatmap):
+        """ This function uses the cropped heatmap that was created by the __crop_heatmap function and splits it in parts.
+        :param cropped_heatmap: the cropped area from the original heatmap
+
+        Returns:
+            crops: A list with the new dimensions of the split parts
+            :type crops: list
+        """
+        max_diff_c, max_diff_r = 0, 0
+        y_crop_l = cropped_heatmap.shape[1]
+        y_crop_r = 0
+        sum_col = cropped_heatmap.sum(axis=0)
+        x_crop_u = 0
+        sum_rows = cropped_heatmap.sum(axis=1)
+
+        for ind in range(len(sum_col.nonzero()[0]) - 1):
+            diff_c = abs(sum_col.nonzero()[0][ind + 1] - sum_col.nonzero()[0][ind])
+            if (diff_c > max_diff_c) and (diff_c > 5):  # nonzero columns have at least 5px difference in heatmap
+                max_diff_c = diff_c
+                y_crop_l = round(sum_col.nonzero()[0][ind])
+                y_crop_r = round(sum_col.nonzero()[0][ind + 1])
+
+        for ind in range(len(sum_rows.nonzero()[0]) - 1):
+            diff_r = abs(sum_rows.nonzero()[0][ind + 1] - sum_rows.nonzero()[0][ind])
+            if (diff_r > max_diff_r) and (diff_r > 5):  # nonzero rows have at least 5px difference in heatmap
+                max_diff_r = diff_r
+                x_crop_u = round(sum_rows.nonzero()[0][ind])
+
+        if max_diff_c >= max_diff_r and max_diff_c > 0:  # vertical cut
+            y1_i = 0
+            y1_f = cropped_heatmap.shape[0]
+            x1_i = 0
+            x1_f = int(y_crop_l)
+
+            y2_i = 0
+            y2_f = cropped_heatmap.shape[0]
+            x2_i = int(y_crop_r)
+            x2_f = cropped_heatmap.shape[1]
+
+            crop1 = cropped_heatmap[y1_i:y1_f, x1_i:x1_f]
+            crop2 = cropped_heatmap[y2_i:y2_f, x2_i:x2_f]
+
+        elif max_diff_r > max_diff_c and max_diff_r > 0:  # horizontal cut
+            y1_i = 0
+            y1_f = int(x_crop_u)
+            x1_i = 0
+            x1_f = cropped_heatmap.shape[1]
+
+            y2_i = int(x_crop_u + 3)
+            y2_f = cropped_heatmap.shape[0]
+            x2_i = 0
+            x2_f = cropped_heatmap.shape[1]
+
+            crop1 = cropped_heatmap[y1_i:y1_f, x1_i:x1_f]
+            crop2 = cropped_heatmap[y2_i:y2_f, x2_i:x2_f]
+
+        else:
+            return [[cropped_heatmap, 0, cropped_heatmap.shape[1], 0, cropped_heatmap.shape[0]]]
+
+        crops = [[crop1, x1_i, x1_f, y1_i, y1_f], [crop2, x2_i, x2_f, y2_i, y2_f]]
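+        # Each element of crops has the form [crop_array, x_start, x_end, y_start, y_end],
+        # with the coordinates expressed in the cropped heatmap's own coordinate system.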
+        return crops
+
+    def __crop_ecclosing_bbox(self, crop):
+        """ This function creates the enclosing bounding box for each split part.
+
+        Args:
+            crop: A split part from the original heatmap
+
+        Returns:
+            xmin, xmax, ymin, ymax: Dimensions of the enclosing bounding box
+        """
+        if crop.nonzero()[0].size > 0 and crop.nonzero()[1].size > 0:
+            xmin = min(np.unique(crop.nonzero()[1]))
+            ymin = min(np.unique(crop.nonzero()[0]))
+            xmax = max(np.unique(crop.nonzero()[1]))
+            ymax = max(np.unique(crop.nonzero()[0]))
+        else:
+            xmin, xmax, ymin, ymax = 0, 0, 0, 0
+        return xmin, xmax, ymin, ymax
+
+    def __crop_image_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, heatmap, perc, detection):
+        """ This function crops the region of interest (ROI) from the original image, to be used in the next steps.
+        Args:
+            xmin, ymin: top-left corner coordinates of the split part in the original heatmap
+            xmax, ymax: bottom-right corner coordinates of the split part in the original heatmap
+            pool_img: the resized, pooled input image
+            original_image: the original input image
+            heatmap: the heatmap generated by the __first_pass function
+            perc: percentage of the crop size that is used as extra padding around the cropped image
+            :type perc: float
+            detection: boolean that indicates whether any subject was detected
+            :type detection: boolean
+
+        Returns:
+            :crop_img: the cropped part of the original image
+            :type crop_img: numpy.array
+            :xmin, xmax, ymin, ymax: the coordinates of the cropped part in the original image coordinate system
+        """
+        upscale_factor = int(pool_img.shape[0] / heatmap.shape[0])
+        xmin = upscale_factor * xmin
+        xmax = upscale_factor * xmax
+        ymin = upscale_factor * ymin
+        ymax = upscale_factor * ymax
+
+        upscale_to_init_img = original_image.shape[0] / pool_img.shape[0]
+        xmin = upscale_to_init_img * xmin
+        xmax = upscale_to_init_img * xmax
+        ymin = upscale_to_init_img * ymin
+        ymax = upscale_to_init_img * ymax
+
+        extra_pad_x = int(perc * (xmax - xmin))  # Adding an extra pad around the cropped image
+        extra_pad_y = int(perc * (ymax - ymin))
+
+        if xmin - extra_pad_x > 0:
+            xmin = xmin - extra_pad_x
+        else:
+            xmin = xmin
+        if xmax + extra_pad_x < original_image.shape[1]:
+            xmax = xmax + extra_pad_x
+        else:
+            xmax = xmax
+
+        if ymin - extra_pad_y > 0:
+            ymin = ymin - extra_pad_y
+        else:
+            ymin = ymin
+        if ymax + extra_pad_y < original_image.shape[0]:
+            ymax = ymax + extra_pad_y
+        else:
+            ymax = ymax
+
+        if (xmax - xmin) > 40 and (ymax - ymin) > 40:
+            crop_img = original_image[int(ymin):int(ymax), int(xmin):int(xmax)]
+        else:
+            crop_img = original_image
+
+        return crop_img, int(xmin), int(xmax), int(ymin), int(ymax)
+
     def fit(self, dataset, val_dataset=None, logging_path='', logging_flush_secs=30, silent=False,
             verbose=True, epochs=None, use_val_subset=True, val_subset_size=250,
             images_folder_name="train2017", annotations_filename="person_keypoints_train2017.json",
@@ -295,12 +500,10 @@ def eval(self, dataset, silent=False, verbose=True, use_subset=True, subset_size
             kernel = int(h / self.first_pass_height)
             if kernel > 0:
                 pool_img = self.__pooling(img, kernel)
-
             else:
                 pool_img = img
 
-            # ------- Heatmap Generation -------
-            avg_pafs = self.__first_pass(pool_img)
+            avg_pafs = self.__first_pass(pool_img)  # Heatmap Generation
             avg_pafs = avg_pafs.astype(np.float32)
 
             pafs_map = cv2.blur(avg_pafs, (5, 5))
@@ -424,6 +627,347 @@ def eval(self, dataset, silent=False, verbose=True, use_subset=True, subset_size
                 print("Evaluation ended with no detections.")
             return {"average_precision": [0.0 for _ in range(5)], "average_recall": [0.0 for _ in range(5)]}
 
+    def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, subset_size=250, upsample_ratio=4,
+                      images_folder_name="val2017", annotations_filename="person_keypoints_val2017.json"):
+        """
+        This method is used to evaluate a trained model on an evaluation dataset.
+
+        :param dataset: object that holds the evaluation dataset.
+        :type dataset: ExternalDataset class object or DatasetIterator class object
+        :param silent: if set to True, disables all printing of evaluation progress reports and other
+            information to STDOUT, defaults to 'False'
+        :type silent: bool, optional
+        :param verbose: if set to True, enables the maximum verbosity, defaults to 'True'
+        :type verbose: bool, optional
+        :param use_subset: If set to True, a subset of the validation dataset is created and used in
+            evaluation, defaults to 'True'
+        :type use_subset: bool, optional
+        :param subset_size: Controls the size of the validation subset, defaults to '250'
+        :type subset_size: int, optional
+        :param upsample_ratio: Defines the amount of upsampling to be performed on the heatmaps and PAFs
+            when resizing, defaults to 4
+        :type upsample_ratio: int, optional
+        :param images_folder_name: Folder name that contains the dataset images. This folder should be contained
+            in the dataset path provided. Note that this is a folder name, not a path, defaults to 'val2017'
+        :type images_folder_name: str, optional
+        :param annotations_filename: Filename of the annotations json file. This file should be contained in the
+            dataset path provided, defaults to 'person_keypoints_val2017.json'
+        :type annotations_filename: str, optional
+
+        :returns: stats regarding evaluation
+        :rtype: dict
+        """
+
+        data = super(HighResolutionPoseEstimationLearner,  # NOQA
+                     self)._LightweightOpenPoseLearner__prepare_val_dataset(dataset, use_subset=use_subset,
+                                                                            subset_name="val_subset.json",
+                                                                            subset_size=subset_size,
+                                                                            images_folder_default_name=images_folder_name,
+                                                                            annotations_filename=annotations_filename,
+                                                                            verbose=verbose and not silent)
+        # Model initialization if needed
+        if self.model is None and self.checkpoint_load_iter != 0:
+            # No model loaded, initializing new
+            self.init_model()
+            # User set checkpoint_load_iter, so they want to load a checkpoint
+            # Try to find the checkpoint_load_iter checkpoint
+            checkpoint_name = "checkpoint_iter_" + str(self.checkpoint_load_iter) + ".pth"
+            checkpoints_folder = os.path.join(self.parent_dir, '{}_checkpoints'.format(self.experiment_name))
+            full_path = os.path.join(checkpoints_folder, checkpoint_name)
+            try:
+                checkpoint = torch.load(full_path, map_location=torch.device(self.device))
+            except FileNotFoundError as e:
+                e.strerror = "File " + checkpoint_name + " not found inside checkpoints_folder, " \
+                             "provided checkpoint_load_iter (" + \
+                             str(self.checkpoint_load_iter) + \
+                             ") doesn't correspond to a saved checkpoint.\nNo such file or directory."
+                raise e
+            if not silent and verbose:
+                print("Loading checkpoint:", full_path)
+
+            load_state(self.model, checkpoint)
+        elif self.model is None:
+            raise AttributeError("self.model is None. 
Please load a model or set checkpoint_load_iter.") + + self.model = self.model.eval() # Change model state to evaluation + self.model.to(self.device) + if "cuda" in self.device: + self.model = self.model.to(self.device) + if self.half: + self.model.half() + + if self.multiscale: + self.scales = [0.5, 1.0, 1.5, 2.0] + + coco_result = [] + num_keypoints = Pose.num_kpts + + pbar_eval = None + if not silent: + pbar_desc = "Evaluation progress" + pbar_eval = tqdm(desc=pbar_desc, total=len(data), bar_format="{l_bar}%s{bar}{r_bar}" % '\x1b[38;5;231m') + + for sample in data: + file_name = sample['file_name'] + img = sample['img'] + h, w, _ = img.shape + max_width = w + kernel = int(h / self.first_pass_height) + if kernel > 0: + pool_img = self.__pooling(img, kernel) + else: + pool_img = img + + avg_pafs = self.__first_pass(pool_img) # Heatmap Generation + avg_pafs = avg_pafs.astype(np.float32) + + pafs_map = cv2.blur(avg_pafs, (5, 5)) + pafs_map[pafs_map < self.threshold] = 0 + + heatmap = pafs_map.sum(axis=2) + heatmap = heatmap * 100 + heatmap = heatmap.astype(np.uint8) + heatmap = cv2.blur(heatmap, (5, 5)) + + self.prev_heatmap = heatmap + heatmap_dims, detection = self.__crop_heatmap(heatmap) + + if detection: + cropped_heatmap = heatmap[heatmap_dims[2]:heatmap_dims[3], heatmap_dims[0]:heatmap_dims[1]] + if self.__check_for_split(cropped_heatmap): + # Split horizontal or vertical + crops = self.__split_process(cropped_heatmap) + crop_part = 0 + for crop_params in crops: + crop = crop_params[0] + if crop.size > 0: + crop_part += 1 + + xmin, xmax, ymin, ymax = self.__crop_ecclosing_bbox(crop) + + xmin += heatmap_dims[0] + xmax += heatmap_dims[0] + ymin += heatmap_dims[2] + ymax += heatmap_dims[2] + + xmin += crop_params[1] + xmax += crop_params[1] + ymin += crop_params[3] + ymax += crop_params[3] + + crop_img, xmin, xmax, ymin, ymax = self.__crop_image_func(xmin, xmax, ymin, ymax, pool_img, img, + heatmap, self.perc, detection) + + if crop_part == 1: + if self.x1min is None: + self.x1min = xmin + self.y1min = ymin + self.x1max = xmax + self.y1max = ymax + else: + a = 0.2 + self.x1min = a * xmin + (1 - a) * self.x1min + self.y1min = a * ymin + (1 - a) * self.y1min + self.y1max = a * ymax + (1 - a) * self.y1max + self.x1max = a * xmax + (1 - a) * self.x1max + + elif crop_part == 2: + if self.x2min is None: + self.x2min = xmin + self.y2min = ymin + self.x2max = xmax + self.y2max = ymax + else: + a = 0.2 + self.x2min = a * xmin + (1 - a) * self.x2min + self.y2min = a * ymin + (1 - a) * self.y2min + self.y2max = a * ymax + (1 - a) * self.y2max + self.x2max = a * xmax + (1 - a) * self.x2max + + h, w, _, = crop_img.shape + if h > self.second_pass_height: + second_pass_height = self.second_pass_height + else: + second_pass_height = h + + # ------- Second pass of the image, inference for pose estimation ------- + avg_heatmaps, avg_pafs, scale, pad = self.__second_pass(crop_img, second_pass_height, + max_width, self.stride, upsample_ratio) + + total_keypoints_num = 0 + all_keypoints_by_type = [] + for kpt_idx in range(18): + total_keypoints_num += extract_keypoints(avg_heatmaps[:, :, kpt_idx], all_keypoints_by_type, + total_keypoints_num) + + pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, avg_pafs) + + for kpt_id in range(all_keypoints.shape[0]): + all_keypoints[kpt_id, 0] = ((all_keypoints[kpt_id, 0] * + self.stride / upsample_ratio - pad[1]) / scale) + all_keypoints[kpt_id, 1] = ((all_keypoints[kpt_id, 1] * + self.stride / upsample_ratio - pad[0]) / scale) + + for i in 
range(all_keypoints.shape[0]): + for j in range(all_keypoints.shape[1]): + if j == 0: + all_keypoints[i][j] = round((all_keypoints[i][j] + xmin)) + if j == 1: + all_keypoints[i][j] = round((all_keypoints[i][j] + ymin)) + + current_poses = [] + for n in range(len(pose_entries)): + if len(pose_entries[n]) == 0: + continue + pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1 + for kpt_id in range(num_keypoints): + if pose_entries[n][kpt_id] != -1.0: # keypoint was found + pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0]) + pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1]) + pose = Pose(pose_keypoints, pose_entries[n][18]) + current_poses.append(pose) + coco_keypoints, scores = convert_to_coco_format(pose_entries, all_keypoints) + + image_id = int(file_name[0:file_name.rfind('.')]) + + for idx in range(len(coco_keypoints)): + coco_result.append({ + 'image_id': image_id, + 'category_id': 1, # person + 'keypoints': coco_keypoints[idx], + 'score': scores[idx] + }) + + if self.visualize: + for keypoints in coco_keypoints: + for idx in range(len(keypoints) // 3): + cv2.circle(img, (int(keypoints[idx * 3]), int(keypoints[idx * 3 + 1])), + 3, (255, 0, 255), -1) + cv2.imshow('keypoints', img) + key = cv2.waitKey() + if key == 27: # esc + return + else: + xmin = heatmap_dims[0] + xmax = heatmap_dims[1] + ymin = heatmap_dims[2] + ymax = heatmap_dims[3] + + h, w, _ = pool_img.shape + xmin = xmin * int((w / heatmap.shape[1])) * kernel + xmax = xmax * int((w / heatmap.shape[1])) * kernel + ymin = ymin * int((h / heatmap.shape[0])) * kernel + ymax = ymax * int((h / heatmap.shape[0])) * kernel + + extra_pad_x = int(self.perc * (xmax - xmin)) + extra_pad_y = int(self.perc * (ymax - ymin)) + + if xmin - extra_pad_x > 0: + xmin = xmin - extra_pad_x + else: + xmin = xmin + + if xmax + extra_pad_x < img.shape[1]: + xmax = xmax + extra_pad_x + else: + xmax = xmax + + if ymin - extra_pad_y > 0: + ymin = ymin - extra_pad_y + else: + ymin = ymin + + if ymax + extra_pad_y < img.shape[0]: + ymax = ymax + extra_pad_y + else: + ymax = ymax + + if (xmax - xmin) > 40 and (ymax - ymin) > 40: + crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)] + else: + crop_img = img + + h, w, _, = crop_img.shape + if h > self.second_pass_height: + second_pass_height = self.second_pass_height + else: + second_pass_height = h + + # ------- Second pass of the image, inference for pose estimation ------- + avg_heatmaps, avg_pafs, scale, pad = self.__second_pass(crop_img, second_pass_height, + max_width, self.stride, upsample_ratio) + + total_keypoints_num = 0 + all_keypoints_by_type = [] + for kpt_idx in range(18): + total_keypoints_num += extract_keypoints(avg_heatmaps[:, :, kpt_idx], all_keypoints_by_type, + total_keypoints_num) + + pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, avg_pafs) + + for kpt_id in range(all_keypoints.shape[0]): + all_keypoints[kpt_id, 0] = (all_keypoints[kpt_id, 0] * self.stride / upsample_ratio - pad[1]) / scale + all_keypoints[kpt_id, 1] = (all_keypoints[kpt_id, 1] * self.stride / upsample_ratio - pad[0]) / scale + + for i in range(all_keypoints.shape[0]): + for j in range(all_keypoints.shape[1]): + if j == 0: + all_keypoints[i][j] = round((all_keypoints[i][j] + xmin)) + if j == 1: + all_keypoints[i][j] = round((all_keypoints[i][j] + ymin)) + + current_poses = [] + for n in range(len(pose_entries)): + if len(pose_entries[n]) == 0: + continue + pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1 + 
for kpt_id in range(num_keypoints): + if pose_entries[n][kpt_id] != -1.0: # keypoint was found + pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0]) + pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1]) + pose = Pose(pose_keypoints, pose_entries[n][18]) + current_poses.append(pose) + + coco_keypoints, scores = convert_to_coco_format(pose_entries, all_keypoints) + + image_id = int(file_name[0:file_name.rfind('.')]) + + for idx in range(len(coco_keypoints)): + coco_result.append({ + 'image_id': image_id, + 'category_id': 1, # person + 'keypoints': coco_keypoints[idx], + 'score': scores[idx] + }) + + if self.visualize: + for keypoints in coco_keypoints: + for idx in range(len(keypoints) // 3): + cv2.circle(img, (int(keypoints[idx * 3]), int(keypoints[idx * 3 + 1])), + 3, (255, 0, 255), -1) + cv2.imshow('keypoints', img) + key = cv2.waitKey() + if key == 27: # esc + return + + if not silent: + pbar_eval.update(1) + + with open(self.output_name, 'w') as f: + json.dump(coco_result, f, indent=4) + if len(coco_result) != 0: + if use_subset: + result = run_coco_eval(os.path.join(dataset.path, "val_subset.json"), + self.output_name, verbose=not silent) + else: + result = run_coco_eval(os.path.join(dataset.path, annotations_filename), + self.output_name, verbose=not silent) + return {"average_precision": result.stats[0:5], "average_recall": result.stats[5:]} + else: + if not silent and verbose: + print("Evaluation ended with no detections.") + return {"average_precision": [0.0 for _ in range(5)], "average_recall": [0.0 for _ in range(5)]} + def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multiscale=False): """ This method is used to perform pose estimation on an image. @@ -467,12 +1011,10 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis kernel = int(h / self.first_pass_height) if kernel > 0: pool_img = self.__pooling(img, kernel) - else: pool_img = img - # # ------- Heatmap Generation ------- - avg_pafs = self.__first_pass(pool_img) + avg_pafs = self.__first_pass(pool_img) # Heatmap Generation avg_pafs = avg_pafs.astype(np.float32) pafs_map = cv2.blur(avg_pafs, (5, 5)) @@ -589,7 +1131,6 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis if np.count_nonzero(pose_keypoints == -1) < 26: pose = Pose(pose_keypoints, pose_entries[n][18]) current_poses.append(pose) - else: if self.xmin is None: self.xmin = xmin @@ -602,7 +1143,6 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis self.ymin = a * ymin + (1 - a) * self.ymin self.ymax = a * ymax + (1 - a) * self.ymax self.xmax = a * xmax + (1 - a) * self.xmax - else: extra_pad_x = int(self.perc * (self.xmax - self.xmin)) # Adding an extra pad around cropped image @@ -612,6 +1152,7 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis xmin = self.xmin - extra_pad_x else: xmin = self.xmin + if self.xmax + extra_pad_x < img.shape[1]: xmax = self.xmax + extra_pad_x else: @@ -621,6 +1162,7 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis ymin = self.ymin - extra_pad_y else: ymin = self.ymin + if self.ymax + extra_pad_y < img.shape[0]: ymax = self.ymax + extra_pad_y else: @@ -681,8 +1223,364 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis else: heatmap = self.prev_heatmap self.counter += 1 + bounds = ([self.xmin, self.xmax, self.ymin, self.ymax],) + return current_poses, heatmap, bounds + 
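+    # The adaptive variant below keeps three smoothed boxes between frames: (xmin, xmax, ymin, ymax)
+    # for the single enclosing ROI, plus (x1min, x1max, y1min, y1max) and (x2min, x2max, y2min, y2max)
+    # for the two sub-crops produced by __split_process whenever the heatmap is split.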
+    def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multiscale=False):
+        """
+        This method is used to perform pose estimation on an image.
+
+        :param img: image to run inference on
+        :type img: engine.data.Image class object
+        :param upsample_ratio: Defines the amount of upsampling to be performed on the heatmaps and PAFs
+            when resizing, defaults to 4
+        :type upsample_ratio: int, optional
+        :param stride: Defines the stride value for creating a padded image
+        :type stride: int, optional
+        :param track: If True, infer propagates poses ids from previous frame results to track poses,
+            defaults to 'True'
+        :type track: bool, optional
+        :param smooth: If True, smoothing is performed on pose keypoints between frames, defaults to 'True'
+        :type smooth: bool, optional
+        :param multiscale: Specifies whether evaluation will run in the predefined multiple scales setup or not.
+        :type multiscale: bool, optional
+
+        :return: Returns a list of engine.target.Pose objects, where each holds a pose,
+            and a heatmap that contains the human silhouettes of the input image.
+            If no detections were made, it returns an empty list for poses and a black frame for the heatmap.
+
+        :rtype: poses -> list of engine.target.Pose objects
+                heatmap -> np.array()
+        """
+        current_poses = []
+        num_keypoints = Pose.num_kpts
+        if not isinstance(img, Image):
+            img = Image(img)
+
+        # Bring image into the appropriate format for the implementation
+        img = img.convert(format='channels_last', channel_order='bgr')
+        h, w, _ = img.shape
+        max_width = w
+        xmin, ymin = 0, 0
+        ymax, xmax, _ = img.shape
+
+        if self.counter % 5 == 0:
+            kernel = int(h / self.first_pass_height)
+            if kernel > 0:
+                pool_img = self.__pooling(img, kernel)
+            else:
+                pool_img = img
+
+            avg_pafs = self.__first_pass(pool_img)  # Heatmap Generation
+
+            avg_pafs = avg_pafs.astype(np.float32)
+            pafs_map = cv2.blur(avg_pafs, (5, 5))
+            pafs_map[pafs_map < self.threshold] = 0
+            heatmap = pafs_map.sum(axis=2)
+            heatmap = heatmap * 100
+            heatmap = heatmap.astype(np.uint8)
+            heatmap = cv2.blur(heatmap, (5, 5))
+            self.prev_heatmap = heatmap
+            heatmap_dims, detection = self.__crop_heatmap(heatmap)
+
+            # self.xmin = heatmap_dims[0] * (img.shape[1] / heatmap.shape[1])
+            # self.ymin = heatmap_dims[2] * (img.shape[0] / heatmap.shape[0])
+            # self.xmax = heatmap_dims[1] * (img.shape[1] / heatmap.shape[1])
+            # self.ymax = heatmap_dims[3] * (img.shape[0] / heatmap.shape[0])
+
+            if detection:
+                cropped_heatmap = heatmap[heatmap_dims[2]:heatmap_dims[3], heatmap_dims[0]:heatmap_dims[1]]
+                if self.__check_for_split(cropped_heatmap):
+                    # Split horizontal or vertical
+                    crops = self.__split_process(cropped_heatmap)
+                    crop_part = 0
+                    for crop_params in crops:
+                        crop = crop_params[0]
+                        if crop.size > 0:
+                            crop_part += 1
+
+                            xmin, xmax, ymin, ymax = self.__crop_ecclosing_bbox(crop)
+
+                            xmin += heatmap_dims[0]
+                            xmax += heatmap_dims[0]
+                            ymin += heatmap_dims[2]
+                            ymax += heatmap_dims[2]
+
+                            xmin += crop_params[1]
+                            xmax += crop_params[1]
+                            ymin += crop_params[3]
+                            ymax += crop_params[3]
+
+                            crop_img, xmin, xmax, ymin, ymax = self.__crop_image_func(xmin, xmax, ymin, ymax, pool_img, img,
+                                                                                      heatmap, self.perc, detection)
+
+                            if crop_part == 1:
+                                if self.x1min is None:
+                                    self.x1min = xmin
+                                    self.y1min = ymin
+                                    self.x1max = xmax
+                                    self.y1max = ymax
+                                else:
+                                    a = 0.2
+                                    self.x1min = a * xmin + (1 - a) * self.x1min
+                                    self.y1min = a * ymin + (1 - a) * self.y1min
+                                    self.y1max = a * ymax + (1 - a) * self.y1max
+                                    self.x1max = a * xmax + (1 - a) * self.x1max
+                            elif crop_part == 2:
+                                if 
self.x2min is None: + self.x2min = xmin + self.y2min = ymin + self.x2max = xmax + self.y2max = ymax + else: + a = 0.2 + self.x2min = a * xmin + (1 - a) * self.x2min + self.y2min = a * ymin + (1 - a) * self.y2min + self.y2max = a * ymax + (1 - a) * self.y2max + self.x2max = a * xmax + (1 - a) * self.x2max + + h, w, _, = crop_img.shape + if h > self.second_pass_height: + second_pass_height = self.second_pass_height + else: + second_pass_height = h + + # ------- Second pass of the image, inference for pose estimation ------- + avg_heatmaps, avg_pafs, scale, pad = self.__second_pass(crop_img, second_pass_height, + max_width, self.stride, upsample_ratio) + + total_keypoints_num = 0 + all_keypoints_by_type = [] + for kpt_idx in range(18): + total_keypoints_num += extract_keypoints(avg_heatmaps[:, :, kpt_idx], all_keypoints_by_type, + total_keypoints_num) + + pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, avg_pafs) + + for kpt_id in range(all_keypoints.shape[0]): + all_keypoints[kpt_id, 0] = ((all_keypoints[kpt_id, 0] * + self.stride / upsample_ratio - pad[1]) / scale) + all_keypoints[kpt_id, 1] = ((all_keypoints[kpt_id, 1] * + self.stride / upsample_ratio - pad[0]) / scale) + + for i in range(all_keypoints.shape[0]): + for j in range(all_keypoints.shape[1]): + if j == 0: + all_keypoints[i][j] = round((all_keypoints[i][j] + xmin)) + if j == 1: + all_keypoints[i][j] = round((all_keypoints[i][j] + ymin)) + + for n in range(len(pose_entries)): + if len(pose_entries[n]) == 0: + continue + pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1 + for kpt_id in range(num_keypoints): + if pose_entries[n][kpt_id] != -1.0: # keypoint was found + pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0]) + pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1]) + pose = Pose(pose_keypoints, pose_entries[n][18]) + current_poses.append(pose) + else: + xmin = heatmap_dims[0] + xmax = heatmap_dims[1] + ymin = heatmap_dims[2] + ymax = heatmap_dims[3] + + h, w, _ = pool_img.shape + xmin = xmin * int((w / heatmap.shape[1])) * kernel + xmax = xmax * int((w / heatmap.shape[1])) * kernel + ymin = ymin * int((h / heatmap.shape[0])) * kernel + ymax = ymax * int((h / heatmap.shape[0])) * kernel + + extra_pad_x = int(self.perc * (xmax - xmin)) + extra_pad_y = int(self.perc * (ymax - ymin)) + + if xmin - extra_pad_x > 0: + xmin = xmin - extra_pad_x + else: + xmin = xmin + if xmax + extra_pad_x < img.shape[1]: + xmax = xmax + extra_pad_x + else: + xmax = xmax + + if ymin - extra_pad_y > 0: + ymin = ymin - extra_pad_y + else: + ymin = ymin + if ymax + extra_pad_y < img.shape[0]: + ymax = ymax + extra_pad_y + else: + ymax = ymax + + if self.xmin is None: + self.xmin = xmin + self.ymin = ymin + self.xmax = xmax + self.ymax = ymax + else: + a = 0.2 + self.xmin = a * xmin + (1 - a) * self.xmin + self.ymin = a * ymin + (1 - a) * self.ymin + self.ymax = a * ymax + (1 - a) * self.ymax + self.xmax = a * xmax + (1 - a) * self.xmax + + if (xmax - xmin) > 40 and (ymax - ymin) > 40: + crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)] + else: + crop_img = img + + h, w, _, = crop_img.shape + if h > self.second_pass_height: + second_pass_height = self.second_pass_height + else: + second_pass_height = h + + # ------- Second pass of the image, inference for pose estimation ------- + avg_heatmaps, avg_pafs, scale, pad = self.__second_pass(crop_img, second_pass_height, + max_width, self.stride, upsample_ratio) + + total_keypoints_num = 0 + 
all_keypoints_by_type = []
+                            for kpt_idx in range(18):
+                                total_keypoints_num += extract_keypoints(avg_heatmaps[:, :, kpt_idx], all_keypoints_by_type,
+                                                                         total_keypoints_num)
+
+                            pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, avg_pafs)
+
+                            for kpt_id in range(all_keypoints.shape[0]):
+                                all_keypoints[kpt_id, 0] = (all_keypoints[kpt_id, 0] * stride / upsample_ratio - pad[1]) / scale
+                                all_keypoints[kpt_id, 1] = (all_keypoints[kpt_id, 1] * stride / upsample_ratio - pad[0]) / scale
+
+                            for i in range(all_keypoints.shape[0]):
+                                for j in range(all_keypoints.shape[1]):
+                                    if j == 0:
+                                        all_keypoints[i][j] = round((all_keypoints[i][j] + xmin))
+                                    if j == 1:
+                                        all_keypoints[i][j] = round((all_keypoints[i][j] + ymin))
+
+                            current_poses = []
+                            for n in range(len(pose_entries)):
+                                if len(pose_entries[n]) == 0:
+                                    continue
+                                pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1
+                                for kpt_id in range(num_keypoints):
+                                    if pose_entries[n][kpt_id] != -1.0:  # keypoint was found
+                                        pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0])
+                                        pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1])
+                                pose = Pose(pose_keypoints, pose_entries[n][18])
+                                current_poses.append(pose)
+            else:
+                if self.xmin is None:
+                    self.xmin = xmin
+                    self.ymin = ymin
+                    self.xmax = xmax
+                    self.ymax = ymax
+                else:
+                    a = 0.2
+                    self.xmin = a * xmin + (1 - a) * self.xmin
+                    self.ymin = a * ymin + (1 - a) * self.ymin
+                    self.ymax = a * ymax + (1 - a) * self.ymax
+                    self.xmax = a * xmax + (1 - a) * self.xmax
+
+                self.x1min, self.x1max, self.y1min, self.y1max = None, None, None, None
+                self.x2min, self.x2max, self.y2min, self.y2max = None, None, None, None
+        else:
+            extra_pad_x = int(self.perc * (self.xmax - self.xmin))  # Adding an extra pad around cropped image
+            extra_pad_y = int(self.perc * (self.ymax - self.ymin))
+
+            if self.xmin - extra_pad_x > 0:
+                xmin = self.xmin - extra_pad_x
+            else:
+                xmin = self.xmin
+
+            if self.xmax + extra_pad_x < img.shape[1]:
+                xmax = self.xmax + extra_pad_x
+            else:
+                xmax = self.xmax
+
+            if self.ymin - extra_pad_y > 0:
+                ymin = self.ymin - extra_pad_y
+            else:
+                ymin = self.ymin
+
+            if self.ymax + extra_pad_y < img.shape[0]:
+                ymax = self.ymax + extra_pad_y
+            else:
+                ymax = self.ymax
+
+            if self.xmin is None:
+                self.xmin = xmin
+                self.ymin = ymin
+                self.xmax = xmax
+                self.ymax = ymax
+            else:
+                a = 0.2
+                self.xmin = a * xmin + (1 - a) * self.xmin
+                self.ymin = a * ymin + (1 - a) * self.ymin
+                self.ymax = a * ymax + (1 - a) * self.ymax
+                self.xmax = a * xmax + (1 - a) * self.xmax
+
+            if (xmax - xmin) > 40 and (ymax - ymin) > 40:
+                crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)]
+            else:
+                crop_img = img[0:img.shape[0], 0:img.shape[1]]
+
+            h, w, _ = crop_img.shape
+            if crop_img.shape[0] < self.second_pass_height:
+                second_pass_height = crop_img.shape[0]
+            else:
+                second_pass_height = self.second_pass_height
+
+            # ------- Second pass of the image, inference for pose estimation -------
+            avg_heatmaps, avg_pafs, scale, pad = self.__second_pass(crop_img, second_pass_height,
+                                                                    max_width, self.stride, upsample_ratio)
+
+            total_keypoints_num = 0
+            all_keypoints_by_type = []
+            for kpt_idx in range(18):
+                total_keypoints_num += extract_keypoints(avg_heatmaps[:, :, kpt_idx], all_keypoints_by_type,
+                                                         total_keypoints_num)
+
+            pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, avg_pafs)
+
+            for kpt_id in range(all_keypoints.shape[0]):
+                all_keypoints[kpt_id, 0] = (all_keypoints[kpt_id, 0] * self.stride / upsample_ratio - pad[1]) / scale
+                all_keypoints[kpt_id, 1] = (all_keypoints[kpt_id, 1] * self.stride / upsample_ratio - pad[0]) / scale
+
+            for i in range(all_keypoints.shape[0]):
+                for j in range(all_keypoints.shape[1]):
+                    if j == 0:
+                        all_keypoints[i][j] = round((all_keypoints[i][j] + xmin))
+                    if j == 1:
+                        all_keypoints[i][j] = round((all_keypoints[i][j] + ymin))
+
+            current_poses = []
+            for n in range(len(pose_entries)):
+                if len(pose_entries[n]) == 0:
+                    continue
+                pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1
+                for kpt_id in range(num_keypoints):
+                    if pose_entries[n][kpt_id] != -1.0:  # keypoint was found
+                        pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0])
+                        pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1])
+
+                if np.count_nonzero(pose_keypoints == -1) < 26:
+                    pose = Pose(pose_keypoints, pose_entries[n][18])
+                    current_poses.append(pose)
+
+        if not np.any(self.prev_heatmap):
+            heatmap = np.zeros((int(img.shape[0] / (int((img.shape[0] / self.first_pass_height))) / 8),
+                                int(img.shape[1] / (int((img.shape[0] / self.first_pass_height))) / 8)),
+                               dtype=np.uint8)
+        else:
+            heatmap = self.prev_heatmap
+        self.counter += 1
+
+        bounds = [(self.xmin, self.xmax, self.ymin, self.ymax), (self.x1min, self.x1max, self.y1min, self.y1max),
+                  (self.x2min, self.x2max, self.y2min, self.y2max)]
-        return current_poses, heatmap
+        return current_poses, heatmap, bounds
 
     def download(self, path=None, mode="pretrained", verbose=False,
                  url=OPENDR_SERVER_URL + "perception/pose_estimation/lightweight_open_pose/",

From e2b24f09df0f4b5cac292b37b55f7be25073e875 Mon Sep 17 00:00:00 2001
From: mthodoris
Date: Mon, 13 Nov 2023 09:58:03 +0200
Subject: [PATCH 02/15] added extra arguments for adaptive pose estimation

---
 .../high_resolution_pose_estimation/README.md |  3 ++-
 .../demos/benchmarking_demo.py                | 16 +++++++-----
 .../demos/eval_demo.py                        | 18 ++++++++-----
 .../demos/inference_demo.py                   | 21 ++++++++++-----
 .../demos/webcam_demo.py                      | 26 ++++++++++++++-----
 5 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/README.md b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/README.md
index 62286ef976..e6fcee7fa4 100644
--- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/README.md
+++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/README.md
@@ -9,4 +9,5 @@ More specifically, the applications provided are:
 3. demos/benchmarking_demo.py: A simple benchmarking tool for measuring the performance of High Resolution Pose Estimation in various platforms.
 4. demos/webcam_demo.py: A tool that performs live pose estimation both with high resolution and simple lightweight OpenPose using a webcam.
 
-
+NOTE: All demos can run either in the "primary ROI selection" mode or in the "adaptive ROI selection" mode.
+Use `--method primary` or `--method adaptive`, respectively. An indicative usage sketch is shown below.
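+
+As a minimal sketch of the learner API (assuming the OpenDR import path and an `img` loaded as in the demos):
+
+```python
+from opendr.perception.pose_estimation import HighResolutionPoseEstimationLearner
+
+pose_estimator = HighResolutionPoseEstimationLearner(device='cuda', method='adaptive')
+pose_estimator.download(path=".", verbose=True)  # fetch the pretrained OpenPose weights
+pose_estimator.load("openpose_default")
+poses, heatmap, bounds = pose_estimator.infer_adaptive(img)  # img: engine.data.Image
+```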
diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py index 65f3278c35..d5a84cb896 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py @@ -28,11 +28,12 @@ action="store_true") parser.add_argument("--height1", help="Base height of resizing in heatmap generation", default=360) parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) - + parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to primary", + default="primary") args = parser.parse_args() - device, accelerate, base_height1, base_height2 = args.device, args.accelerate,\ - args.height1, args.height2 + device, accelerate, base_height1, base_height2, method = args.device, args.accelerate, \ + args.height1, args.height2, args.method if device == 'cpu': import torch @@ -51,7 +52,8 @@ pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, mobilenet_use_stride=stride, half_precision=half_precision, first_pass_height=int(base_height1), - second_pass_height=int(base_height2)) + second_pass_height=int(base_height2), + method=method) pose_estimator.download(path=".", verbose=True) pose_estimator.load("openpose_default") @@ -65,8 +67,10 @@ for i in tqdm(range(50)): start_time = time.perf_counter() # Perform inference - poses = pose_estimator.infer(img) - + if method == 'primary': + poses, _, _ = pose_estimator.infer(img) + if method == 'adaptive': + poses, _, _ = pose_estimator.infer_adaptive(img) end_time = time.perf_counter() fps_list.append(1.0 / (end_time - start_time)) print("Average FPS: %.2f" % (np.mean(fps_list))) diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py index 1d093a99fd..d689472183 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py @@ -26,11 +26,12 @@ action="store_true") parser.add_argument("--height1", help="Base height of resizing in first inference", default=360) parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) - + parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to primary", + default="primary") args = parser.parse_args() - device, accelerate, base_height1, base_height2 = args.device, args.accelerate,\ - args.height1, args.height2 + device, accelerate, base_height1, base_height2, method = args.device, args.accelerate, \ + args.height1, args.height2, args.method if accelerate: stride = True @@ -45,7 +46,8 @@ mobilenet_use_stride=stride, half_precision=half_precision, first_pass_height=int(base_height1), - second_pass_height=int(base_height2)) + second_pass_height=int(base_height2), + method=method) pose_estimator.download(path=".", verbose=True) pose_estimator.load("openpose_default") @@ -55,8 +57,12 @@ eval_dataset = ExternalDataset(path=join("temp", "dataset"), dataset_type="COCO") t0 = time.time() - results_dict = 
pose_estimator.eval(eval_dataset, use_subset=False, verbose=True, silent=True, - images_folder_name="image", annotations_filename="annotation.json") + if method == "primary": + results_dict = pose_estimator.eval(eval_dataset, use_subset=False, verbose=True, silent=True, + images_folder_name="image", annotations_filename="annotation.json") + if method == "adaptive": + results_dict = pose_estimator.eval_adaptive(eval_dataset, use_subset=False, verbose=True, silent=True, + images_folder_name="image", annotations_filename="annotation.json") t1 = time.time() print("\n Evaluation time: ", t1 - t0, "seconds") print("Evaluation results = ", results_dict) diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py index 754fe31c99..c1aac9cb33 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py @@ -27,11 +27,12 @@ action="store_true") parser.add_argument("--height1", help="Base height of resizing in first inference", default=360) parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) - + parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to primary", + default="adaptive") args = parser.parse_args() - device, accelerate, base_height1, base_height2 = args.device, args.accelerate,\ - args.height1, args.height2 + device, accelerate, base_height1, base_height2, method = args.device, args.accelerate, \ + args.height1, args.height2, args.method if accelerate: stride = True @@ -45,7 +46,8 @@ pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, mobilenet_use_stride=stride, half_precision=half_precision, first_pass_height=int(base_height1), - second_pass_height=int(base_height2)) + second_pass_height=int(base_height2), + method=method) pose_estimator.download(path=".", verbose=True) pose_estimator.load("openpose_default") @@ -56,10 +58,17 @@ img = Image.open(image_path) - poses, _ = pose_estimator.infer(img) - + if method == 'primary': + poses, _, bounds = pose_estimator.infer(img) + if method == 'adaptive': + poses, _, bounds = pose_estimator.infer_adaptive(img) img_cv = img.opencv() for pose in poses: draw(img_cv, pose) + + for i in range(len(bounds)): + if bounds[i][0] is not None: + cv2.rectangle(img_cv, (int(bounds[i][0]), int(bounds[i][2])), (int(bounds[i][1]), int(bounds[i][3])), (0, 0, 255), + thickness=2) cv2.imshow('Results', img_cv) cv2.waitKey(0) diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py index 4f57dd1657..368b00fd02 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py @@ -50,13 +50,15 @@ def __next__(self): parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cuda") parser.add_argument("--accelerate", help="Enables acceleration flags (e.g., stride)", default=True, action="store_true") - parser.add_argument("--height1", help="Base height of resizing in first inference, defaults to 420", 
default=420)
-    parser.add_argument("--height2", help="Base height of resizing in second inference, defaults to 360", default=360)
-
+    parser.add_argument("--height1", help="Base height of resizing in first inference, defaults to 360", default=360)
+    parser.add_argument("--height2", help="Base height of resizing in second inference, defaults to 540", default=540)
+    parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology, defaults to adaptive",
+                        default="adaptive")
     args = parser.parse_args()
 
     onnx, device, accelerate = args.onnx, args.device, args.accelerate
-    base_height1, base_height2 = args.height1, args.height2
+    base_height1, base_height2, method = args.height1, args.height2, args.method
+
     if accelerate:
         stride = True
         stages = 1
@@ -73,7 +75,8 @@ def __next__(self):
                                                            mobilenet_use_stride=stride, half_precision=half_precision,
                                                            first_pass_height=base_height1,
                                                            second_pass_height=base_height2,
-                                                           percentage_around_crop=0.1)
+                                                           percentage_around_crop=0.1,
+                                                           method=method)
     hr_pose_estimator.download(path=".", verbose=True)
     hr_pose_estimator.load("openpose_default")
@@ -109,7 +112,7 @@ def __next__(self):
         # Perform inference
         start_time = time.perf_counter()
-        hr_poses, heatmap = hr_pose_estimator.infer(img)
+        hr_poses, heatmap, _ = hr_pose_estimator.infer_adaptive(img)
         hr_time = time.perf_counter() - start_time
 
         # Perform inference
@@ -144,6 +147,17 @@ def __next__(self):
                     fontScale=int(np.ceil(height / 600)), color=(200, 0, 0),
                     thickness=int(np.ceil(height / 600)))
 
+        cv2.putText(img=img_copy, text='FPS: ' + str(int(lw_avg_fps)), org=(20, int(height / 4)),
+                    fontFace=cv2.FONT_HERSHEY_TRIPLEX,
+                    fontScale=int(np.ceil(height / 600)), color=(200, 0, 0),
+                    thickness=int(np.ceil(height / 600)))
+
+        cv2.putText(img=img, text='FPS: ' + str(int(hr_avg_fps)), org=(20, int(height / 4)),
+                    fontFace=cv2.FONT_HERSHEY_TRIPLEX,
+                    fontScale=int(np.ceil(height / 600)), color=(200, 0, 0),
+                    thickness=int(np.ceil(height / 600)))
+
+
         heatmap = heatmap * 5
         heatmap = cv2.cvtColor(heatmap, cv2.COLOR_GRAY2BGR)
         heatmap = cv2.resize(heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4)))

From f187c977a23b9d07ad18f789b275c41d2b701c55 Mon Sep 17 00:00:00 2001
From: mthodoris
Date: Wed, 15 Nov 2023 13:38:21 +0200
Subject: [PATCH 03/15] Demos: changed the image used for inference and
 evaluation; fixed typos

Learner: removed unnecessary variables (e.g. the offset value), which became
obsolete with the new sample image and annotation; changed the path from which
the _download function fetches the images on the FTP server; fixed typos.
Tests: added the new evaluation and inference functions that perform the
"adaptive ROI selection" pose estimation.
---
 .../demos/benchmarking_demo.py                |  2 +-
 .../demos/inference_demo.py                   |  8 +--
 .../demos/webcam_demo.py                      |  1 -
 .../high_resolution_learner.py                | 57 ++++++++-----------
 .../test_high_resolution_pose_estimation.py   | 30 +++++++++-
 5 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py
index d5a84cb896..c31a788488 100644
--- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py
+++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py
@@ -59,7 +59,7 @@
     # Download one sample image
     pose_estimator.download(path=".", mode="test_data")
-    image_path = join("temp", "dataset", "image", "000000000785_1080.jpg")
+    image_path = join("temp", "dataset", "image", "000000052591_1080.jpg")
     img = cv2.imread(image_path)
 
     fps_list = []
diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py
index c1aac9cb33..11909a46fa 100644
--- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py
+++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py
@@ -54,8 +54,7 @@
     # Download one sample image
     pose_estimator.download(path=".", mode="test_data")
-    image_path = join("temp", "dataset", "image", "000000000785_1080.jpg")
-
+    image_path = join("temp", "dataset", "image", "000000052591_1080.jpg")
     img = Image.open(image_path)
 
     if method == 'primary':
@@ -68,7 +67,8 @@
     for i in range(len(bounds)):
         if bounds[i][0] is not None:
-            cv2.rectangle(img_cv, (int(bounds[i][0]), int(bounds[i][2])), (int(bounds[i][1]), int(bounds[i][3])), (0, 0, 255),
-                          thickness=2)
+            cv2.rectangle(img_cv, (int(bounds[i][0]), int(bounds[i][2])),
+                          (int(bounds[i][1]), int(bounds[i][3])), (0, 0, 255), thickness=2)
+    img_cv = cv2.resize(img_cv, (1280, 720), interpolation=cv2.INTER_CUBIC)
     cv2.imshow('Results', img_cv)
     cv2.waitKey(0)
diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
index 368b00fd02..d5b5ce395a 100644
--- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
+++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
@@ -157,7 +157,6 @@ def __next__(self):
                     fontScale=int(np.ceil(height / 600)), color=(200, 0, 0),
                     thickness=int(np.ceil(height / 600)))
 
-
         heatmap = heatmap * 5
         heatmap = cv2.cvtColor(heatmap, cv2.COLOR_GRAY2BGR)
         heatmap = cv2.resize(heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4)))
diff --git a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
index ed2d25beab..44fa325255 100644
--- a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
+++ 
b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py @@ -483,15 +483,6 @@ def eval(self, dataset, silent=False, verbose=True, use_subset=True, subset_size pbar_desc = "Evaluation progress" pbar_eval = tqdm(desc=pbar_desc, total=len(data), bar_format="{l_bar}%s{bar}{r_bar}" % '\x1b[38;5;231m') - img_height = data[0]['img'].shape[0] - - if img_height in (1080, 1440): - offset = 200 - elif img_height == 720: - offset = 50 - else: - offset = 0 - for sample in data: file_name = sample['file_name'] img = sample['img'] @@ -550,7 +541,7 @@ def eval(self, dataset, silent=False, verbose=True, use_subset=True, subset_size if (xmax - xmin) > 40 and (ymax - ymin) > 40: crop_img = img[ymin:ymax, xmin:xmax] else: - crop_img = img[offset:img.shape[0], offset:img.shape[1]] + crop_img = img[0:img.shape[0], 0:img.shape[1]] h, w, _ = crop_img.shape @@ -571,10 +562,10 @@ def eval(self, dataset, silent=False, verbose=True, use_subset=True, subset_size for i in range(all_keypoints.shape[0]): for j in range(all_keypoints.shape[1]): - if j == 0: # Adjust offset if needed for evaluation on our HR datasets - all_keypoints[i][j] = round((all_keypoints[i][j] + xmin) - offset) - if j == 1: # Adjust offset if needed for evaluation on our HR datasets - all_keypoints[i][j] = round((all_keypoints[i][j] + ymin) - offset) + if j == 0: + all_keypoints[i][j] = round((all_keypoints[i][j] + xmin)) + if j == 1: + all_keypoints[i][j] = round((all_keypoints[i][j] + ymin)) current_poses = [] for n in range(len(pose_entries)): @@ -603,7 +594,7 @@ def eval(self, dataset, silent=False, verbose=True, use_subset=True, subset_size if self.visualize: for keypoints in coco_keypoints: for idx in range(len(keypoints) // 3): - cv2.circle(img, (int(keypoints[idx * 3] + offset), int(keypoints[idx * 3 + 1]) + offset), + cv2.circle(img, (int(keypoints[idx * 3]), int(keypoints[idx * 3 + 1])), 3, (255, 0, 255), -1) cv2.imshow('keypoints', img) key = cv2.waitKey() @@ -995,7 +986,6 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis heatmap -> np.array() """ current_poses = [] - offset = 0 num_keypoints = Pose.num_kpts if not isinstance(img, Image): img = Image(img) @@ -1085,7 +1075,7 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis if (xmax - xmin) > 40 and (ymax - ymin) > 40: crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)] else: - crop_img = img[offset:img.shape[0], offset:img.shape[1]] + crop_img = img[0:img.shape[0], 0:img.shape[1]] h, w, _ = crop_img.shape if crop_img.shape[0] < self.second_pass_height: @@ -1113,10 +1103,10 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis for i in range(all_keypoints.shape[0]): for j in range(all_keypoints.shape[1]): - if j == 0: # Adjust offset if needed for evaluation on our HR datasets - all_keypoints[i][j] = round((all_keypoints[i][j] + xmin) - offset) - if j == 1: # Adjust offset if needed for evaluation on our HR datasets - all_keypoints[i][j] = round((all_keypoints[i][j] + ymin) - offset) + if j == 0: + all_keypoints[i][j] = round((all_keypoints[i][j] + xmin)) + if j == 1: + all_keypoints[i][j] = round((all_keypoints[i][j] + ymin)) current_poses = [] for n in range(len(pose_entries)): @@ -1171,7 +1161,7 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis if (xmax - xmin) > 40 and (ymax - ymin) > 40: crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)] else: - crop_img = img[offset:img.shape[0], offset:img.shape[1]] + crop_img = 
img[0:img.shape[0], 0:img.shape[1]] h, w, _ = crop_img.shape if crop_img.shape[0] < self.second_pass_height: @@ -1197,10 +1187,10 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis for i in range(all_keypoints.shape[0]): for j in range(all_keypoints.shape[1]): - if j == 0: # Adjust offset if needed for evaluation on our HR datasets - all_keypoints[i][j] = round((all_keypoints[i][j] + xmin) - offset) - if j == 1: # Adjust offset if needed for evaluation on our HR datasets - all_keypoints[i][j] = round((all_keypoints[i][j] + ymin) - offset) + if j == 0: + all_keypoints[i][j] = round((all_keypoints[i][j] + xmin)) + if j == 1: + all_keypoints[i][j] = round((all_keypoints[i][j] + ymin)) current_poses = [] for n in range(len(pose_entries)): @@ -1283,12 +1273,11 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru self.prev_heatmap = heatmap heatmap_dims, detection = self.__crop_heatmap(heatmap) - # self.xmin = heatmap_dims[0] * (img.shape[1] / heatmap.shape[1]) - # self.ymin = heatmap_dims[2] * (img.shape[0] / heatmap.shape[0]) - # self.xmax = heatmap_dims[1] * (img.shape[1] / heatmap.shape[1]) - # self.ymax = heatmap_dims[3] * (img.shape[0] / heatmap.shape[0]) - if detection: + self.xmin = heatmap_dims[0] * (img.shape[1] / heatmap.shape[1]) + self.ymin = heatmap_dims[2] * (img.shape[0] / heatmap.shape[0]) + self.xmax = heatmap_dims[1] * (img.shape[1] / heatmap.shape[1]) + self.ymax = heatmap_dims[3] * (img.shape[0] / heatmap.shape[0]) cropped_heatmap = heatmap[heatmap_dims[2]:heatmap_dims[3], heatmap_dims[0]:heatmap_dims[1]] if self.__check_for_split(cropped_heatmap): # Split horizontal or vertical @@ -1399,6 +1388,7 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru xmin = xmin - extra_pad_x else: xmin = xmin + if xmax + extra_pad_x < img.shape[1]: xmax = xmax + extra_pad_x else: @@ -1408,6 +1398,7 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru ymin = ymin - extra_pad_y else: ymin = ymin + if ymax + extra_pad_y < img.shape[0]: ymax = ymax + extra_pad_y else: @@ -1628,8 +1619,8 @@ def download(self, path=None, mode="pretrained", verbose=False, urlretrieve(file_url, os.path.join(self.temp_path, "dataset", "annotation.json")) # Download test image if image_resolution in (1080, 1440): - file_url = os.path.join(url, "dataset", "image", "000000000785_" + str(image_resolution) + ".jpg") - urlretrieve(file_url, os.path.join(self.temp_path, "dataset", "image", "000000000785_1080.jpg")) + file_url = os.path.join(url, "dataset", "image", "000000052591_" + str(image_resolution) + ".jpg") + urlretrieve(file_url, os.path.join(self.temp_path, "dataset", "image", "000000052591_1080.jpg")) else: raise UserWarning("There are no data for this image resolution (only 1080 and 1440 are supported).") diff --git a/tests/sources/tools/perception/pose_estimation/high_resolution_pose_estimation/test_high_resolution_pose_estimation.py b/tests/sources/tools/perception/pose_estimation/high_resolution_pose_estimation/test_high_resolution_pose_estimation.py index 230b6da5fc..a6a461ccf7 100644 --- a/tests/sources/tools/perception/pose_estimation/high_resolution_pose_estimation/test_high_resolution_pose_estimation.py +++ b/tests/sources/tools/perception/pose_estimation/high_resolution_pose_estimation/test_high_resolution_pose_estimation.py @@ -81,15 +81,41 @@ def test_eval(self): warnings.simplefilter("default", ResourceWarning) warnings.simplefilter("default", DeprecationWarning) + def 
test_eval_adaptive(self): + # Test eval will issue resource warnings due to some files left open in pycoco tools, + # as well as a deprecation warning due to a cast of a float to integer (hopefully they will be fixed in a future + # version) + warnings.simplefilter("ignore", ResourceWarning) + warnings.simplefilter("ignore", DeprecationWarning) + + eval_dataset = ExternalDataset(path=os.path.join(self.temp_dir, "dataset"), dataset_type="COCO") + results_dict = self.pose_estimator.eval_adaptive(eval_dataset, use_subset=False, verbose=True, silent=True, + images_folder_name="image", annotations_filename="annotation.json") + self.assertNotEqual(len(results_dict['average_precision']), 0, + msg="Eval results dictionary contains empty list.") + self.assertNotEqual(len(results_dict['average_recall']), 0, + msg="Eval results dictionary contains empty list.") + # Cleanup + rmfile(os.path.join(self.temp_dir, "detections.json")) + warnings.simplefilter("default", ResourceWarning) + warnings.simplefilter("default", DeprecationWarning) + def test_infer(self): self.pose_estimator.model = None self.pose_estimator.load(os.path.join(self.temp_dir, "openpose_default")) - - img = Image.open(os.path.join(self.temp_dir, "dataset", "image", "000000000785_1080.jpg")) + img = Image.open(os.path.join(self.temp_dir, "dataset", "image", "000000052591_1080.jpg")) # Default pretrained mobilenet model detects 18 keypoints on img with id 785 self.assertGreater(len(self.pose_estimator.infer(img)[0][0].data), 0, msg="Returned pose must have non-zero number of keypoints.") + def test_infer_adaptive(self): + self.pose_estimator.model = None + self.pose_estimator.load(os.path.join(self.temp_dir, "openpose_default")) + img = Image.open(os.path.join(self.temp_dir, "dataset", "image", "000000052591_1080.jpg")) + # Default pretrained mobilenet model detects 18 keypoints on img with id 785 + self.assertGreater(len(self.pose_estimator.infer_adaptive(img)[0][0].data), 0, + msg="Returned pose must have non-zero number of keypoints.") + if __name__ == "__main__": unittest.main() From a903e2a910176e83b2c9812eb53ce69361f3d986 Mon Sep 17 00:00:00 2001 From: mthodoris Date: Mon, 20 Nov 2023 09:21:21 +0200 Subject: [PATCH 04/15] fix bugs --- .../demos/eval_demo.py | 15 +++++++------- .../demos/inference_demo.py | 2 +- .../high_resolution_learner.py | 20 ++++++++++--------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py index d689472183..22d3c03157 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py @@ -26,8 +26,9 @@ action="store_true") parser.add_argument("--height1", help="Base height of resizing in first inference", default=360) parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) - parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to primary", - default="primary") + parser.add_argument("--method", help="Choose between primary or adaptive ROI selection " + "methodology defaults to adaptive", + default="adaptive") args = parser.parse_args() device, accelerate, base_height1, base_height2, method = args.device, args.accelerate, \ @@ -38,9 +39,9 @@ stages = 0 half_precision = True 
else: - stride = True + stride = False stages = 2 - half_precision = True + half_precision = False pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, mobilenet_use_stride=stride, @@ -54,14 +55,14 @@ # Download a sample dataset pose_estimator.download(path=".", mode="test_data") - eval_dataset = ExternalDataset(path=join("temp", "dataset"), dataset_type="COCO") + eval_dataset = ExternalDataset(path=join("temp", "dataset", 'hr'), dataset_type="COCO") t0 = time.time() if method == "primary": - results_dict = pose_estimator.eval(eval_dataset, use_subset=False, verbose=True, silent=True, + results_dict = pose_estimator.eval(eval_dataset, use_subset=True, verbose=True, silent=False, images_folder_name="image", annotations_filename="annotation.json") if method == "adaptive": - results_dict = pose_estimator.eval_adaptive(eval_dataset, use_subset=False, verbose=True, silent=True, + results_dict = pose_estimator.eval_adaptive(eval_dataset, use_subset=False, verbose=True, silent=False, images_folder_name="image", annotations_filename="annotation.json") t1 = time.time() print("\n Evaluation time: ", t1 - t0, "seconds") diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py index 11909a46fa..3f9cc0c0b7 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py @@ -27,7 +27,7 @@ action="store_true") parser.add_argument("--height1", help="Base height of resizing in first inference", default=360) parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) - parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to primary", + parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to adaptive", default="adaptive") args = parser.parse_args() diff --git a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py index 44fa325255..95f028be4f 100644 --- a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py +++ b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py @@ -817,6 +817,7 @@ def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, su pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1]) pose = Pose(pose_keypoints, pose_entries[n][18]) current_poses.append(pose) + coco_keypoints, scores = convert_to_coco_format(pose_entries, all_keypoints) image_id = int(file_name[0:file_name.rfind('.')]) @@ -919,17 +920,17 @@ def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, su pose = Pose(pose_keypoints, pose_entries[n][18]) current_poses.append(pose) - coco_keypoints, scores = convert_to_coco_format(pose_entries, all_keypoints) + coco_keypoints, scores = convert_to_coco_format(pose_entries, all_keypoints) - image_id = int(file_name[0:file_name.rfind('.')]) + image_id = int(file_name[0:file_name.rfind('.')]) - for idx in range(len(coco_keypoints)): - coco_result.append({ - 'image_id': image_id, - 'category_id': 1, # person - 'keypoints': coco_keypoints[idx], - 'score': 
scores[idx] - }) + for idx in range(len(coco_keypoints)): + coco_result.append({ + 'image_id': image_id, + 'category_id': 1, # person + 'keypoints': coco_keypoints[idx], + 'score': scores[idx] + }) if self.visualize: for keypoints in coco_keypoints: @@ -1369,6 +1370,7 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1]) pose = Pose(pose_keypoints, pose_entries[n][18]) current_poses.append(pose) + else: xmin = heatmap_dims[0] xmax = heatmap_dims[1] From 30c89177fec17f53fb2ad92acf470873b3752f2b Mon Sep 17 00:00:00 2001 From: mthodoris Date: Tue, 21 Nov 2023 12:09:53 +0200 Subject: [PATCH 05/15] Learner fix bugs optimization webcam_demo --- .../demos/benchmarking_demo.py | 4 +- .../demos/eval_demo.py | 4 +- .../demos/webcam_demo.py | 80 +++++-- .../high_resolution_learner.py | 202 ++++++++++-------- 4 files changed, 169 insertions(+), 121 deletions(-) diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py index c31a788488..f9651ce197 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py @@ -28,8 +28,8 @@ action="store_true") parser.add_argument("--height1", help="Base height of resizing in heatmap generation", default=360) parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) - parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to primary", - default="primary") + parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to adaptive", + default="adaptive") args = parser.parse_args() device, accelerate, base_height1, base_height2, method = args.device, args.accelerate, \ diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py index 22d3c03157..8f133bcebd 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py @@ -55,11 +55,11 @@ # Download a sample dataset pose_estimator.download(path=".", mode="test_data") - eval_dataset = ExternalDataset(path=join("temp", "dataset", 'hr'), dataset_type="COCO") + eval_dataset = ExternalDataset(path=join("temp", "dataset"), dataset_type="COCO") t0 = time.time() if method == "primary": - results_dict = pose_estimator.eval(eval_dataset, use_subset=True, verbose=True, silent=False, + results_dict = pose_estimator.eval(eval_dataset, use_subset=False, verbose=True, silent=False, images_folder_name="image", annotations_filename="annotation.json") if method == "adaptive": results_dict = pose_estimator.eval_adaptive(eval_dataset, use_subset=False, verbose=True, silent=False, diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py index d5b5ce395a..b8e0c55060 100644 --- 
a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py @@ -48,11 +48,11 @@ def __next__(self): parser = argparse.ArgumentParser() parser.add_argument("--onnx", help="Use ONNX", default=False, action="store_true") parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cuda") - parser.add_argument("--accelerate", help="Enables acceleration flags (e.g., stride)", default=True, + parser.add_argument("--accelerate", help="Enables acceleration flags (e.g., stride)", default=False, action="store_true") - parser.add_argument("--height1", help="Base height of resizing in first inference, defaults to 420", default=360) - parser.add_argument("--height2", help="Base height of resizing in second inference, defaults to 360", default=540) - parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to primary", + parser.add_argument("--height1", help="Base height of resizing in first inference, defaults to 420", default=420) + parser.add_argument("--height2", help="Base height of resizing in second inference, defaults to 360", default=360) + parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to adaptive", default="adaptive") args = parser.parse_args() @@ -76,12 +76,24 @@ def __next__(self): first_pass_height=base_height1, second_pass_height=base_height2, percentage_around_crop=0.1, - method=method) + method="primary") + + adapt_hr_pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, + mobilenet_use_stride=stride, half_precision=half_precision, + first_pass_height=base_height1, + second_pass_height=base_height2, + percentage_around_crop=0.1, + method="adaptive") + hr_pose_estimator.download(path=".", verbose=True) hr_pose_estimator.load("openpose_default") + adapt_hr_pose_estimator.download(path=".", verbose=True) + adapt_hr_pose_estimator.load("openpose_default") + if onnx: hr_pose_estimator.optimize() + adapt_hr_pose_estimator.optimize() lw_pose_estimator.download(path=".", verbose=True) lw_pose_estimator.load("openpose_default") @@ -89,8 +101,9 @@ def __next__(self): if onnx: lw_pose_estimator.optimize() - hr_avg_fps = 0 + prim_hr_avg_fps = 0 lw_avg_fps = 0 + adapt_hr_avg_fps = 0 # Use the first camera available on the system image_provider = VideoReader(0) image_provider = iter(image_provider) @@ -98,21 +111,21 @@ def __next__(self): height = image_provider.cap.get(4) width = image_provider.cap.get(3) if width / height == 16 / 9: - size = (1280, int(720 / 2)) + size = (2 * 1280, 2 * int(720 / 3)) elif width / height == 4 / 3: - size = (1024, int(768 / 2)) + size = (2 * 1024, 2 * int(768 / 3)) else: - size = (width, int(height / 2)) + size = (width, int(height / 3)) while True: img = next(image_provider) total_time0 = time.time() img_copy = np.copy(img) - + adapt_img = np.copy(img) # Perform inference start_time = time.perf_counter() - hr_poses, heatmap, _ = hr_pose_estimator.infer_adaptive(img) + hr_poses, heatmap, _ = hr_pose_estimator.infer(img) hr_time = time.perf_counter() - start_time # Perform inference @@ -120,41 +133,59 @@ def __next__(self): lw_poses = lw_pose_estimator.infer(img_copy) lw_time = time.perf_counter() - start_time + # Perform inference + start_time = time.perf_counter() + adapt_hr_poses, adapt_heatmap, _ = adapt_hr_pose_estimator.infer_adaptive(img) + 
adapt_hr_time = time.perf_counter() - start_time + total_time = time.time() - total_time0 for hr_pose in hr_poses: draw(img, hr_pose) for lw_pose in lw_poses: draw(img_copy, lw_pose) + for adapt_hr_pose in adapt_hr_poses: + draw(adapt_img, adapt_hr_pose) + + lw_fps = 1 / (total_time - (hr_time + adapt_hr_time)) + prim_hr_fps = 1 / (total_time - (lw_time + adapt_hr_time)) + adapt_hr_fps = 1 / (total_time - (lw_time + hr_time)) - lw_fps = 1 / (total_time - hr_time) - hr_fps = 1 / (total_time - lw_time) # Calculate a running average on FPS - hr_avg_fps = 0.95 * hr_avg_fps + 0.05 * hr_fps + prim_hr_avg_fps = 0.95 * prim_hr_avg_fps + 0.05 * prim_hr_fps lw_avg_fps = 0.95 * lw_avg_fps + 0.05 * lw_fps + adapt_hr_avg_fps = 0.95 * adapt_hr_avg_fps + 0.05 * adapt_hr_fps cv2.putText(img=img, text="OpenDR High Resolution", org=(20, int(height / 10)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(200, 0, 0), + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + cv2.putText(img=img, text="Pose Estimation Primary ROI selection", org=(20, int(height / 10) + 50), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) - cv2.putText(img=img, text="Pose Estimation", org=(20, int(height / 10) + 50), + cv2.putText(img=img, text='FPS:' + str(int(prim_hr_avg_fps)), org=(20, int(height / 4)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(200, 0, 0), + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) cv2.putText(img=img_copy, text='Lightweight OpenPose ', org=(20, int(height / 10)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(200, 0, 0), + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) cv2.putText(img=img_copy, text='FPS: ' + str(int(lw_avg_fps)), org=(20, int(height / 4)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(200, 0, 0), + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) - cv2.putText(img=img, text='FPS:' + str(int(hr_avg_fps)), org=(20, int(height / 4)), + cv2.putText(img=adapt_img, text="Pose Estimation Adaptive ROI selection", org=(20, int(height / 10) + 50), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(200, 0, 0), + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + cv2.putText(img=adapt_img, text='FPS:' + str(int(adapt_hr_avg_fps)), org=(20, int(height / 4)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) heatmap = heatmap * 5 @@ -162,7 +193,12 @@ def __next__(self): heatmap = cv2.resize(heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4))) img[(img.shape[0] - heatmap.shape[0]):img.shape[0], 0:heatmap.shape[1]] = heatmap - output_image = cv2.hconcat([img_copy, img]) + adapt_heatmap = adapt_heatmap * 5 + adapt_heatmap = cv2.cvtColor(adapt_heatmap, cv2.COLOR_GRAY2BGR) + adapt_heatmap = cv2.resize(adapt_heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4))) + adapt_img[(adapt_img.shape[0] - adapt_heatmap.shape[0]):adapt_img.shape[0], 0:adapt_heatmap.shape[1]] = adapt_heatmap + + output_image = cv2.hconcat([img_copy, img, adapt_img]) output_image = cv2.resize(output_image, size) cv2.imshow('Result', output_image) diff --git 
a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py index 7a49b94db5..6b4c88b88e 100644 --- a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py +++ b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py @@ -250,7 +250,7 @@ def __split_process(self, cropped_heatmap): Returns: crops: A list with the new dimensions of the split parts - :type crops: list + :type: list """ max_diff_c, max_diff_r = 0, 0 y_crop_l = cropped_heatmap.shape[1] @@ -339,14 +339,15 @@ def __crop_image_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, he Returns: :crop_img: the cropped image part from the original image - :type crop_img: numpy.array + :type :numpy.array :xmin, xmax, ymin, ymax: the dimensions of the cropped part in the original image coordinate system """ - upscale_factor = int(pool_img.shape[0] / heatmap.shape[0]) - xmin = upscale_factor * xmin - xmax = upscale_factor * xmax - ymin = upscale_factor * ymin - ymax = upscale_factor * ymax + upscale_factor_x = pool_img.shape[0] / heatmap.shape[0] + upscale_factor_y = pool_img.shape[1] / heatmap.shape[1] + xmin = upscale_factor_x * xmin + xmax = upscale_factor_x * xmax + ymin = upscale_factor_y * ymin + ymax = upscale_factor_y * ymax upscale_to_init_img = original_image.shape[0] / pool_img.shape[0] xmin = upscale_to_init_img * xmin @@ -877,7 +878,7 @@ def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, su if (xmax - xmin) > 40 and (ymax - ymin) > 40: crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)] else: - crop_img = img + crop_img = img[0:img.shape[0], 0:img.shape[1]] h, w, _, = crop_img.shape if h > self.second_pass_height: @@ -1255,7 +1256,7 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru xmin, ymin = 0, 0 ymax, xmax, _ = img.shape - if self.counter % 5 == 0: + if self.counter % 2 == 0: kernel = int(h / self.first_pass_height) if kernel > 0: pool_img = self.__pooling(img, kernel) @@ -1291,16 +1292,16 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru xmin, xmax, ymin, ymax = self.__crop_ecclosing_bbox(crop) - xmin += heatmap_dims[0] - xmax += heatmap_dims[0] - ymin += heatmap_dims[2] - ymax += heatmap_dims[2] - xmin += crop_params[1] xmax += crop_params[1] ymin += crop_params[3] ymax += crop_params[3] + xmin += heatmap_dims[0] + xmax += heatmap_dims[0] + ymin += heatmap_dims[2] + ymax += heatmap_dims[2] + crop_img, xmin, xmax, ymin, ymax = self.__crop_image_func(xmin, xmax, ymin, ymax, pool_img, img, heatmap, self.perc, detection) @@ -1411,17 +1412,21 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru self.ymin = ymin self.xmax = xmax self.ymax = ymax + self.x1min, self.x1max, self.y1min, self.y1max = xmin, xmax, ymin, ymax + self.x2min, self.x2max, self.y2min, self.y2max = xmin, xmax, ymin, ymax else: a = 0.2 self.xmin = a * xmin + (1 - a) * self.xmin self.ymin = a * ymin + (1 - a) * self.ymin self.ymax = a * ymax + (1 - a) * self.ymax self.xmax = a * xmax + (1 - a) * self.xmax + self.x1min, self.x1max, self.y1min, self.y1max = self.xmin, self.xmax, self.ymin, self.ymax + self.x2min, self.x2max, self.y2min, self.y2max = self.xmin, self.xmax, self.ymin, self.ymax if (xmax - xmin) > 40 and (ymax - ymin) > 40: crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)] else: - crop_img = img + crop_img = img[0:img.shape[0], 
0:img.shape[1]] h, w, _, = crop_img.shape if h > self.second_pass_height: @@ -1469,110 +1474,117 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru self.ymin = ymin self.xmax = xmax self.ymax = ymax + self.x1min, self.x1max, self.y1min, self.y1max = 0, 0, 0, 0 + self.x2min, self.x2max, self.y2min, self.y2max = 0, 0, 0, 0 else: - a = 0.2 + a = 0.8 self.xmin = a * xmin + (1 - a) * self.xmin self.ymin = a * ymin + (1 - a) * self.ymin self.ymax = a * ymax + (1 - a) * self.ymax self.xmax = a * xmax + (1 - a) * self.xmax - self.x1min, self.x1max, self.y1min, self.y1max = None, None, None, None - self.x2min, self.x2max, self.y2min, self.y2max = None, None, None, None + self.x1min, self.x1max, self.y1min, self.y1max = 0, 0, 0, 0 + self.x2min, self.x2max, self.y2min, self.y2max = 0, 0, 0, 0 else: - extra_pad_x = int(self.perc * (self.xmax - self.xmin)) # Adding an extra pad around cropped image - extra_pad_y = int(self.perc * (self.ymax - self.ymin)) - - if self.xmin - extra_pad_x > 0: - xmin = self.xmin - extra_pad_x - else: - xmin = self.xmin + if self.x1min is None: + self.x1min = xmin + self.y1min = ymin + self.x1max = xmax + self.y1max = ymax + if self.x2min is None: + self.x2min = xmin + self.y2min = ymin + self.x2max = xmax + self.y2max = ymax + + boxes = ([self.x1min, self.x1max, self.y1min, self.y1max], [self.x2min, self.x2max, self.y2min, self.y2max]) + for box in boxes: + xmin = box[0] + xmax = box[1] + ymin = box[2] + ymax = box[3] + + extra_pad_x = int(self.perc * (xmax - xmin)) + extra_pad_y = int(self.perc * (ymax - ymin)) - if self.xmax + extra_pad_x < img.shape[1]: - xmax = self.xmax + extra_pad_x - else: - xmax = self.xmax + if (xmin - extra_pad_x > 0) and (xmin > 0): + xmin = xmin - extra_pad_x + else: + xmin = xmin - if self.ymin - extra_pad_y > 0: - ymin = self.ymin - extra_pad_y - else: - ymin = self.ymin + if (xmax + extra_pad_x < img.shape[1]) and (xmax < img.shape[1]): + xmax = xmax + extra_pad_x + else: + xmax = xmax - if self.ymax + extra_pad_y < img.shape[0]: - ymax = self.ymax + extra_pad_y - else: - ymax = self.ymax + if (ymin - extra_pad_y > 0) and (ymin > 0): + ymin = ymin - extra_pad_y + else: + ymin = ymin - if self.xmin is None: - self.xmin = xmin - self.ymin = ymin - self.xmax = xmax - self.ymax = ymax - else: - a = 0.2 - self.xmin = a * xmin + (1 - a) * self.xmin - self.ymin = a * ymin + (1 - a) * self.ymin - self.ymax = a * ymax + (1 - a) * self.ymax - self.xmax = a * xmax + (1 - a) * self.xmax + if (ymax + extra_pad_y < img.shape[0]) and (ymax < img.shape[0]): + ymax = ymax + extra_pad_y + else: + ymax = ymax - if (xmax - xmin) > 40 and (ymax - ymin) > 40: - crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)] - else: - crop_img = img[img.shape[0], img.shape[1]] + if (xmax - xmin) > 40 and (ymax - ymin) > 40: + crop_img = img[int(ymin):int(ymax), int(xmin):int(xmax)] + else: + crop_img = img[0:img.shape[0], 0:img.shape[1]] - h, w, _ = crop_img.shape - if crop_img.shape[0] < self.second_pass_height: - second_pass_height = crop_img.shape[0] - else: - second_pass_height = self.second_pass_height + h, w, _ = crop_img.shape + if crop_img.shape[0] < self.second_pass_height: + second_pass_height = crop_img.shape[0] + else: + second_pass_height = self.second_pass_height - # ------- Second pass of the image, inference for pose estimation ------- - avg_heatmaps, avg_pafs, scale, pad = self.__second_pass(crop_img, second_pass_height, - max_width, self.stride, upsample_ratio) + # ------- Second pass of the image, inference for pose 
estimation ------- + avg_heatmaps, avg_pafs, scale, pad = self.__second_pass(crop_img, second_pass_height, + max_width, self.stride, upsample_ratio) - total_keypoints_num = 0 - all_keypoints_by_type = [] - for kpt_idx in range(18): - total_keypoints_num += extract_keypoints(avg_heatmaps[:, :, kpt_idx], all_keypoints_by_type, - total_keypoints_num) + total_keypoints_num = 0 + all_keypoints_by_type = [] + for kpt_idx in range(18): + total_keypoints_num += extract_keypoints(avg_heatmaps[:, :, kpt_idx], all_keypoints_by_type, + total_keypoints_num) - pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, avg_pafs) + pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, avg_pafs) - for kpt_id in range(all_keypoints.shape[0]): - all_keypoints[kpt_id, 0] = (all_keypoints[kpt_id, 0] * self.stride / upsample_ratio - pad[1]) / scale - all_keypoints[kpt_id, 1] = (all_keypoints[kpt_id, 1] * self.stride / upsample_ratio - pad[0]) / scale + for kpt_id in range(all_keypoints.shape[0]): + all_keypoints[kpt_id, 0] = (all_keypoints[kpt_id, 0] * self.stride / upsample_ratio - pad[1]) / scale + all_keypoints[kpt_id, 1] = (all_keypoints[kpt_id, 1] * self.stride / upsample_ratio - pad[0]) / scale - for i in range(all_keypoints.shape[0]): - for j in range(all_keypoints.shape[1]): - if j == 0: - all_keypoints[i][j] = round((all_keypoints[i][j] + xmin)) - if j == 1: - all_keypoints[i][j] = round((all_keypoints[i][j] + ymin)) + for i in range(all_keypoints.shape[0]): + for j in range(all_keypoints.shape[1]): + if j == 0: + all_keypoints[i][j] = round((all_keypoints[i][j] + xmin)) + if j == 1: + all_keypoints[i][j] = round((all_keypoints[i][j] + ymin)) - current_poses = [] - for n in range(len(pose_entries)): - if len(pose_entries[n]) == 0: - continue - pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1 - for kpt_id in range(num_keypoints): - if pose_entries[n][kpt_id] != -1.0: # keypoint was found - pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0]) - pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1]) + current_poses = [] + for n in range(len(pose_entries)): + if len(pose_entries[n]) == 0: + continue + pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1 + for kpt_id in range(num_keypoints): + if pose_entries[n][kpt_id] != -1.0: # keypoint was found + pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0]) + pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1]) - if np.count_nonzero(pose_keypoints == -1) < 26: - pose = Pose(pose_keypoints, pose_entries[n][18]) - current_poses.append(pose) + if np.count_nonzero(pose_keypoints == -1) < 26: + pose = Pose(pose_keypoints, pose_entries[n][18]) + current_poses.append(pose) - if np.any(self.prev_heatmap) is False: - heatmap = np.zeros((int(img.shape[0] / (int((img.shape[0] / self.first_pass_height))) / 8), - int(img.shape[1] / (int((img.shape[0] / self.first_pass_height))) / 8)), - dtype=np.uint8) - else: - heatmap = self.prev_heatmap + if np.any(self.prev_heatmap) is False: + heatmap = np.zeros((int(img.shape[0] / (int((img.shape[0] / self.first_pass_height))) / 8), + int(img.shape[1] / (int((img.shape[0] / self.first_pass_height))) / 8)), + dtype=np.uint8) + else: + heatmap = self.prev_heatmap self.counter += 1 - bounds = [(self.xmin, self.xmax, self.ymin, self.ymax), (self.x1min, self.x1max, self.y1min, self.y1max), + bounds = [(self.x1min, self.x1max, self.y1min, self.y1max), (self.x2min, self.x2max, 
self.y2min, self.y2max)] - return current_poses, heatmap, bounds def download(self, path=None, mode="pretrained", verbose=False, From b1c46ef3e785e8cb7c7062cc7a9263711b20e528 Mon Sep 17 00:00:00 2001 From: mthodoris Date: Tue, 21 Nov 2023 13:04:12 +0200 Subject: [PATCH 06/15] Learner fix bugs Update docs file --- .../high-resolution-pose-estimation.md | 151 +++++++++++++++++- .../high_resolution_learner.py | 6 +- 2 files changed, 152 insertions(+), 5 deletions(-) diff --git a/docs/reference/high-resolution-pose-estimation.md b/docs/reference/high-resolution-pose-estimation.md index 63f0cd7566..55d6410470 100644 --- a/docs/reference/high-resolution-pose-estimation.md +++ b/docs/reference/high-resolution-pose-estimation.md @@ -45,6 +45,8 @@ Constructor parameters: Specifies the height that the input image will be resized during the heatmap generation procedure. - **second_pass_height**: *int, default=540*\ Specifies the height of the image on the second inference for pose estimation procedure. +- **method**: *str, default='adaptive' +- Determines which method (*adaptive* or *primary*) is used for ROI extraction. - **percentage_arround_crop**: *float, default=0.3*\ Specifies the percentage of an extra pad arround the cropped image - **heatmap_threshold**: *float, default=0.1*\ @@ -74,7 +76,63 @@ Constructor parameters: - **half_precision**: *bool, default=False*\ Enables inference using half (fp16) precision instead of single (fp32) precision. Valid only for GPU-based inference. +#### High Resolution Pose estimation using Adaptive ROI selection method +#### `HighResolutionPoseEstimationLearner.eval_adaptive` +```python +HighResolutionPoseEstimationLearner.eval_adaptive(self, dataset, silent, verbose, use_subset, subset_size, images_folder_name, annotations_filename) +``` + +This method is used to evaluate a trained model on an evaluation dataset. +Returns a dictionary containing statistics regarding evaluation. + +Parameters: + +- **dataset**: *object*\ + Object that holds the evaluation dataset. + Can be of type `ExternalDataset` or a custom dataset inheriting from `DatasetIterator`. +- **silent**: *bool, default=False*\ + If set to True, disables all printing of evaluation progress reports and other information to STDOUT. +- **verbose**: *bool, default=True*\ + If set to True, enables the maximum verbosity. +- **use_subset**: *bool, default=True*\ + If set to True, a subset of the validation dataset is created and used in evaluation. +- **subset_size**: *int, default=250*\ + Controls the size of the validation subset. +- **images_folder_name**: *str, default='val2017'*\ + Folder name that contains the dataset images. + This folder should be contained in the dataset path provided. + Note that this is a folder name, not a path. +- **annotations_filename**: *str, default='person_keypoints_val2017.json'*\ + Filename of the annotations JSON file. + This file should be contained in the dataset path provided. + +#### `HighResolutionPoseEstimation.infer_adaptive` +```python +HighResolutionPoseEstimation.infer_adaptive(self, img, upsample_ratio, stride, track, smooth, multiscale, visualize) +``` + +This method is used to perform pose estimation on an image. +Returns a list of engine.target.Pose objects, where each holds a pose +and a heatmap that contains human silhouettes of the input image. +If no detections were made it returns an empty list for poses and a black frame for heatmap. +Parameters: + +- **img**: *object***\ + Object of type engine.data.Image. 
+- **upsample_ratio**: *int, default=4*\ + Defines the amount of upsampling to be performed on the heatmaps and PAFs when resizing. +- **stride**: *int, default=8*\ + Defines the stride value for creating a padded image. +- **track**: *bool, default=True*\ + If True, infer propagates poses ids from previous frame results to track poses. +- **smooth**: *bool, default=True*\ + If True, smoothing is performed on pose keypoints between frames. +- **multiscale**: *bool, default=False*\ + Specifies whether evaluation will run in the predefined multiple scales setup or not. + + +#### High Resolution Pose estimation using Primary ROI selection method #### `HighResolutionPoseEstimationLearner.eval` ```python HighResolutionPoseEstimationLearner.eval(self, dataset, silent, verbose, use_subset, subset_size, images_folder_name, annotations_filename) @@ -170,6 +228,57 @@ Parameters: - **img_scale**: *float, default=1/256*\ Specifies the scale based on which the images are normalized. +#### `HighResolutionPoseEstimationLearner.__crop_heatmap` +```python +HighResolutionPoseEstimationLearner.__crop_heatmap(self, heatmap ) +``` +This method takes the generated heatmap and crops it arround the desirable ROI using its nonzero values. + +Parameters: + +- **heatmap**: *numpy.array* Describes the extracted heatmap from the `HighResolutionPoseEstimationLearner.__first_pass` function. + +#### `HighResolutionPoseEstimationLearner.__check_for_split` +```python +HighResolutionPoseEstimationLearner.__check_for_split(self, cropped_heatmap ) +``` +This function checks weather or not the cropped heatmap needs further proccessing for extra cropping. +More specifically, returns a boolean for the decision for further crop, the decision depends on the distance between the target subjects. + +Parameters: +- **cropped_heatmap**: *numpy.array* Describes the cropped heatmap from the `HighResolutionPoseEstimationLearner.__crop_heatmap` function. + +#### `HighResolutionPoseEstimationLearner.__split_process` +```python +HighResolutionPoseEstimationLearner.__split_process(self, cropped_heatmap ) +``` +This function uses the cropped heatmap that crated from `__crop_heatmap` function and splits it in parts. + +Parameters: +- **cropped_heatmap**: *numpy.array* Describes the cropped heatmap from the `HighResolutionPoseEstimationLearner.__crop_heatmap` function. + +#### `HighResolutionPoseEstimationLearner.__crop_ecnclosing_bbox` +```python +HighResolutionPoseEstimationLearner.__crop_enclosing_bbox(self, crop ) +``` +This function creates the bounding box for each split part +Parameters: +- **crop**: *numpy.array* Describes the cropped part of the heatmap. + +#### `HighResolutionPoseEstimationLearner.__crop_img_func` +```python +HighResolutionPoseEstimationLearner.__crop_img_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, heatmap, perc ) +``` +This function crops the region of interst(ROI) from the original image to use it in next steps. + +Parameters: + + +- **xmin, xmax, ymin, ymax**: *int* Describe the boundaries of the bounding box. +- **pool_img**: *numpy.array* The pooled image that resized to be used in `__first_pass` +- **original_img**: *numpy.array* The input High-Resolution image +- **heatmap**: *numpy.array* The extracted heatmap +- **perc**: *float* The percentage is used for adding an extra pad on the cropped image. #### `HighResolutionPoseEstimation.download` ```python @@ -258,7 +367,7 @@ The experiments are conducted on a 1080p image. 
| OpenDR - Full | 0.2 | 10.8 | 1.4 | 3.1 | -#### High-Resolution Pose Estimation +#### High-Resolution Pose Estimation (Primary ROI Selection) | Method | CPU i7-9700K (FPS) | RTX 2070 (FPS) | Jetson TX2 (FPS) | Xavier NX (FPS) | |------------------------|--------------------|----------------|------------------|-----------------| | HRPoseEstim - Baseline | 2.3 | 13.6 | 1.4 | 1.8 | @@ -283,6 +392,29 @@ was used as input to the models. The average precision and average recall on the COCO evaluation split is also reported in the tables below: +#### High-Resolution Pose Estimation (Adaptive ROI Selection) +| Method | CPU i7-9700K (FPS) | RTX 2070 (FPS) | Jetson TX2 (FPS) | Xavier NX (FPS) | +|------------------------|--------------------|----------------|------------------|-----------------| +| HRPoseEstim - Baseline | 2.4 | 10.5 | 2.1 | 1.5 | +| HRPoseEstim - Half | 2.5 | 11.3 | 2.6 | 1.9 | +| HRPoseEstim - Stride | 11.3 | 38.1 | 6.8 | 5.2 | +| HRPoseEstim - Stages | 2.8 | 10 | 2.3 | 1.9 | +| HRPoseEstim - H+S | 11.4 | 38 | 6.5 | 4.5 | +| HRPoseEstim - Full | 11.6 | 48.3 | 7.7 | 6.4 | + +As it is shown in the previous tables, OpenDR Lightweight OpenPose achieves higher FPS when it is resizing the input image into 256 pixels. +It is easier to process that image, but as it is shown in the next tables the method falls apart when it comes to accuracy and there are no detections. + +We have evaluated the effect of using different inference settings, namely: +- *HRPoseEstim - Baseline*, which refers to directly using the High Resolution Pose Estimation method,which is based in Lightweight OpenPose, +- *HRPoseEstim - Half*, which refers to enabling inference in half (FP) precision, +- *HRPoseEstim - Stride*, which refers to increasing stride by two in the input layer of the model, +- *HRPoseEstim - Stages*, which refers to removing the refinement stages, +- *HRPoseEstim - H+S*, which uses both half precision and increased stride, and +- *HRPoseEstim - Full*, which refers to combining all three available optimization. +was used as input to the models. + + #### Lightweight OpenPose with resizing | Method | Average Precision (IoU=0.50) | Average Recall (IoU=0.50) | |-------------------|------------------------------|---------------------------| @@ -300,7 +432,7 @@ The average precision and average recall on the COCO evaluation split is also re -#### High Resolution Pose Estimation +#### High Resolution Pose Estimation (Primary ROI Selection) | Method | Average Precision (IoU=0.50) | Average Recall (IoU=0.50) | |------------------------|------------------------------|---------------------------| | HRPoseEstim - Baseline | 0.615 | 0.637 | @@ -323,6 +455,21 @@ The average precision and the average recall have been calculated on a 1080p ver For measuring the precision and recall we used the standard approach proposed for COCO, using an Intersection of Union (IoU) metric at 0.5. 
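The AP/AR rows above can be reproduced with the evaluation entry points added in this patch set. Below is a minimal, hedged sketch following `eval_demo.py`; the download and load calls mirror the demos, and the `average_precision`/`average_recall` keys are the ones asserted in the unit tests. Treat the exact flag combination behind each table row as an assumption (the settings are spelled out in the list later in this section).

```python
# Minimal evaluation sketch (assumptions: demo-style download paths and the
# COCO-style sample dataset shipped with mode="test_data").
import time
from os.path import join

from opendr.engine.datasets import ExternalDataset
from opendr.perception.pose_estimation import HighResolutionPoseEstimationLearner

pose_estimator = HighResolutionPoseEstimationLearner(
    device="cuda", num_refinement_stages=2, mobilenet_use_stride=False,
    half_precision=False, first_pass_height=360, second_pass_height=540,
    method="adaptive", percentage_around_crop=0.1)
pose_estimator.download(path=".", verbose=True)       # pretrained weights
pose_estimator.load("openpose_default")
pose_estimator.download(path=".", mode="test_data")   # sample dataset

eval_dataset = ExternalDataset(path=join("temp", "dataset"), dataset_type="COCO")

t0 = time.time()
results = pose_estimator.eval_adaptive(eval_dataset, use_subset=False, verbose=True,
                                       silent=False, images_folder_name="image",
                                       annotations_filename="annotation.json")
print("AP@0.5:", results["average_precision"])
print("AR@0.5:", results["average_recall"])
print("Evaluation time:", time.time() - t0, "seconds")
```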
+#### High Resolution Pose Estimation (Adaptive ROI Selection) + +The average precision and the average recall have been calculated on a 1080p version of COCO2017 validation dataset and the results are reported in the table below: + +| Method | Average Precision (IoU=0.50) | Average Recall (IoU=0.50) | +|-------------------|------------------------------|---------------------------| +| HRPoseEstim - Baseline | 0.594 | 0.617 | +| HRPoseEstim - Half | 0.586 | 0.606 | +| HRPoseEstim - Stride | 0.251 | 0.271 | +| HRPoseEstim - Stages | 0.511 | 0.534 | +| HRPoseEstim - H+S | 0.251 | 0.263 | +| HRPoseEstim - Full | 0.229 | 0.247 | + +For measuring the precision and recall we used the standard approach proposed for COCO, using an Intersection of Union (IoU) metric at 0.5. + #### Notes diff --git a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py index 6b4c88b88e..1b741c2c1c 100644 --- a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py +++ b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py @@ -324,7 +324,7 @@ def __crop_ecclosing_bbox(self, crop): xmin, xmax, ymin, ymax = 0, 0, 0, 0 return xmin, xmax, ymin, ymax - def __crop_image_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, heatmap, perc, detection): + def __crop_image_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, heatmap, perc): """ This function crops the region of interst(ROI) from the original image to use it in next steps Args: xmin, ymin: top left corner dimensions of the split part in the original heatmap @@ -748,7 +748,7 @@ def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, su ymax += crop_params[3] crop_img, xmin, xmax, ymin, ymax = self.__crop_image_func(xmin, xmax, ymin, ymax, pool_img, img, - heatmap, self.perc, detection) + heatmap, self.perc) if crop_part == 1: if self.x1min is None: @@ -1303,7 +1303,7 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=Tru ymax += heatmap_dims[2] crop_img, xmin, xmax, ymin, ymax = self.__crop_image_func(xmin, xmax, ymin, ymax, pool_img, img, - heatmap, self.perc, detection) + heatmap, self.perc) if crop_part == 1: if self.x1min is None: From 80a7e61a0a6381621c1f6fce7a611fee536a9b34 Mon Sep 17 00:00:00 2001 From: mthodoris <45919203+mthodoris@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:03:37 +0200 Subject: [PATCH 07/15] Apply suggestions from code review Co-authored-by: Kostas Tsampazis <27914645+tsampazk@users.noreply.github.com> --- docs/reference/high-resolution-pose-estimation.md | 9 ++++----- .../demos/benchmarking_demo.py | 2 +- .../high_resolution_pose_estimation/demos/eval_demo.py | 2 +- .../demos/inference_demo.py | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/reference/high-resolution-pose-estimation.md b/docs/reference/high-resolution-pose-estimation.md index 55d6410470..a2ea859b8d 100644 --- a/docs/reference/high-resolution-pose-estimation.md +++ b/docs/reference/high-resolution-pose-estimation.md @@ -47,7 +47,7 @@ Constructor parameters: Specifies the height of the image on the second inference for pose estimation procedure. - **method**: *str, default='adaptive' - Determines which method (*adaptive* or *primary*) is used for ROI extraction. 
-- **percentage_arround_crop**: *float, default=0.3*\
+- **percentage_around_crop**: *float, default=0.3*\
   Specifies the percentage of an extra pad around the cropped image
 - **heatmap_threshold**: *float, default=0.1*\
   Specifies the threshold value that the heatmap elements should have during the first pass in order to trigger the second pass
@@ -79,7 +79,7 @@ Constructor parameters:

#### High Resolution Pose estimation using Adaptive ROI selection method
#### `HighResolutionPoseEstimationLearner.eval_adaptive`
```python
-HighResolutionPoseEstimationLearner.eval_adaptive(self, dataset, silent, verbose, use_subset, subset_size, images_folder_name, annotations_filename)
+HighResolutionPoseEstimationLearner.eval_adaptive(self, dataset, silent, verbose, use_subset, subset_size, upsample_ratio, images_folder_name, annotations_filename)
```
@@ -406,13 +406,12 @@ As it is shown in the previous tables, OpenDR Lightweight OpenPose achieves high
 It is easier to process that image, but as it is shown in the next tables the method falls apart when it comes to accuracy and there are no detections.

 We have evaluated the effect of using different inference settings, namely:
-- *HRPoseEstim - Baseline*, which refers to directly using the High Resolution Pose Estimation method,which is based in Lightweight OpenPose,
+- *HRPoseEstim - Baseline*, which refers to directly using the High Resolution Pose Estimation method, which is based on Lightweight OpenPose,
 - *HRPoseEstim - Half*, which refers to enabling inference in half (FP) precision,
 - *HRPoseEstim - Stride*, which refers to increasing stride by two in the input layer of the model,
 - *HRPoseEstim - Stages*, which refers to removing the refinement stages,
 - *HRPoseEstim - H+S*, which uses both half precision and increased stride, and
-- *HRPoseEstim - Full*, which refers to combining all three available optimization.
-was used as input to the models.
+- *HRPoseEstim - Full*, which refers to combining all three available optimizations.
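As a rough companion to this list, the sketch below shows one plausible mapping from each setting to the learner's constructor flags, inferred from the `--accelerate` handling in the demos (stride on, refinement stages off, half precision on). The exact combinations behind the reported numbers are an assumption, and half precision is only valid for GPU inference.

```python
# Hypothetical mapping of the benchmark settings above to constructor flags.
from opendr.perception.pose_estimation import HighResolutionPoseEstimationLearner

CONFIGS = {
    "Baseline": dict(mobilenet_use_stride=False, half_precision=False, num_refinement_stages=2),
    "Half":     dict(mobilenet_use_stride=False, half_precision=True,  num_refinement_stages=2),
    "Stride":   dict(mobilenet_use_stride=True,  half_precision=False, num_refinement_stages=2),
    "Stages":   dict(mobilenet_use_stride=False, half_precision=False, num_refinement_stages=0),
    "H+S":      dict(mobilenet_use_stride=True,  half_precision=True,  num_refinement_stages=2),
    "Full":     dict(mobilenet_use_stride=True,  half_precision=True,  num_refinement_stages=0),
}

def make_learner(setting, method="adaptive", device="cuda"):
    """Build a learner configured for one row of the benchmark tables."""
    return HighResolutionPoseEstimationLearner(device=device, method=method,
                                               first_pass_height=360,
                                               second_pass_height=540,
                                               **CONFIGS[setting])
```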
#### Lightweight OpenPose with resizing diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py index f9651ce197..d13c44ac51 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/benchmarking_demo.py @@ -29,7 +29,7 @@ parser.add_argument("--height1", help="Base height of resizing in heatmap generation", default=360) parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to adaptive", - default="adaptive") + default="adaptive", choices=["primary", "adaptive"]) args = parser.parse_args() device, accelerate, base_height1, base_height2, method = args.device, args.accelerate, \ diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py index 8f133bcebd..ecad92d333 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/eval_demo.py @@ -28,7 +28,7 @@ parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) parser.add_argument("--method", help="Choose between primary or adaptive ROI selection " "methodology defaults to adaptive", - default="adaptive") + default="adaptive", choices=["primary", "adaptive"]) args = parser.parse_args() device, accelerate, base_height1, base_height2, method = args.device, args.accelerate, \ diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py index 3f9cc0c0b7..50877712e0 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/inference_demo.py @@ -28,7 +28,7 @@ parser.add_argument("--height1", help="Base height of resizing in first inference", default=360) parser.add_argument("--height2", help="Base height of resizing in second inference", default=540) parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to adaptive", - default="adaptive") + default="adaptive", choices=["primary", "adaptive"]) args = parser.parse_args() device, accelerate, base_height1, base_height2, method = args.device, args.accelerate, \ From 3b95ca1fca26549e64b9900558bb460cfc879a68 Mon Sep 17 00:00:00 2001 From: mthodoris Date: Thu, 23 Nov 2023 10:43:19 +0200 Subject: [PATCH 08/15] Update docs file --- .../high-resolution-pose-estimation.md | 68 ++----------------- 1 file changed, 7 insertions(+), 61 deletions(-) diff --git a/docs/reference/high-resolution-pose-estimation.md b/docs/reference/high-resolution-pose-estimation.md index a2ea859b8d..e219cf9193 100644 --- a/docs/reference/high-resolution-pose-estimation.md +++ b/docs/reference/high-resolution-pose-estimation.md @@ -98,6 +98,8 @@ Parameters: If set to True, a subset of the validation dataset is created and used in 
evaluation. - **subset_size**: *int, default=250*\ Controls the size of the validation subset. +- **upsample_ratio**: *int, default=4*\ + Defines the amount of upsampling to be performed on the heatmaps and PAFs when resizing,defaults to 4 - **images_folder_name**: *str, default='val2017'*\ Folder name that contains the dataset images. This folder should be contained in the dataset path provided. @@ -108,11 +110,13 @@ Parameters: #### `HighResolutionPoseEstimation.infer_adaptive` ```python -HighResolutionPoseEstimation.infer_adaptive(self, img, upsample_ratio, stride, track, smooth, multiscale, visualize) +HighResolutionPoseEstimation.infer_adaptive(self, img, upsample_ratio, stride) ``` - This method is used to perform pose estimation on an image. -Returns a list of engine.target.Pose objects, where each holds a pose +The predicted poses are estimated through an adaptive ROI selection method that is applied on the high-resolution images. +The difference between the `HighResolutionPoseEstimation.infer` method is that the adaptive technique tries to separate the +detected ROI's instead of using the minimum enclosing bounding box of them as the `infer` does. +Returns a list of engine.target. Pose objects, where each holds a pose and a heatmap that contains human silhouettes of the input image. If no detections were made it returns an empty list for poses and a black frame for heatmap. Parameters: @@ -123,12 +127,6 @@ Parameters: Defines the amount of upsampling to be performed on the heatmaps and PAFs when resizing. - **stride**: *int, default=8*\ Defines the stride value for creating a padded image. -- **track**: *bool, default=True*\ - If True, infer propagates poses ids from previous frame results to track poses. -- **smooth**: *bool, default=True*\ - If True, smoothing is performed on pose keypoints between frames. -- **multiscale**: *bool, default=False*\ - Specifies whether evaluation will run in the predefined multiple scales setup or not. @@ -228,58 +226,6 @@ Parameters: - **img_scale**: *float, default=1/256*\ Specifies the scale based on which the images are normalized. -#### `HighResolutionPoseEstimationLearner.__crop_heatmap` -```python -HighResolutionPoseEstimationLearner.__crop_heatmap(self, heatmap ) -``` -This method takes the generated heatmap and crops it arround the desirable ROI using its nonzero values. - -Parameters: - -- **heatmap**: *numpy.array* Describes the extracted heatmap from the `HighResolutionPoseEstimationLearner.__first_pass` function. - -#### `HighResolutionPoseEstimationLearner.__check_for_split` -```python -HighResolutionPoseEstimationLearner.__check_for_split(self, cropped_heatmap ) -``` -This function checks weather or not the cropped heatmap needs further proccessing for extra cropping. -More specifically, returns a boolean for the decision for further crop, the decision depends on the distance between the target subjects. - -Parameters: -- **cropped_heatmap**: *numpy.array* Describes the cropped heatmap from the `HighResolutionPoseEstimationLearner.__crop_heatmap` function. - -#### `HighResolutionPoseEstimationLearner.__split_process` -```python -HighResolutionPoseEstimationLearner.__split_process(self, cropped_heatmap ) -``` -This function uses the cropped heatmap that crated from `__crop_heatmap` function and splits it in parts. - -Parameters: -- **cropped_heatmap**: *numpy.array* Describes the cropped heatmap from the `HighResolutionPoseEstimationLearner.__crop_heatmap` function. 
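The split helpers referenced above are removed from the public docs here because they are private methods. For readers of this patch, a standalone toy re-implementation of the heuristic they carry out (a sketch, not the library API) follows; the 5 px minimum gap and the 5-80% activation band come from the learner code.

```python
# Toy illustration of the split heuristic behind __check_for_split and
# __split_process (standalone sketch, not the library API).
import numpy as np

heatmap = np.zeros((40, 80), dtype=np.uint8)
heatmap[10:30, 5:20] = 1     # first subject blob
heatmap[12:28, 60:75] = 1    # second subject blob

rows = heatmap.sum(axis=1).nonzero()[0]   # rows with any activation
cols = heatmap.sum(axis=0).nonzero()[0]   # columns with any activation

# The activation-area/crop-area ratio drives the split decision (5%-80% band).
activation_ratio = 100.0 * (len(rows) * len(cols)) / heatmap.size
needs_split = 5 < activation_ratio < 80
print("activation ratio: %.1f%%, split: %s" % (activation_ratio, needs_split))

# The largest gap (> 5 px) between consecutive nonzero columns gives a vertical cut.
col_gaps = np.diff(cols)
if needs_split and col_gaps.max() > 5:
    cut_left, cut_right = cols[col_gaps.argmax()], cols[col_gaps.argmax() + 1]
    crop1 = heatmap[:, :cut_left]    # left part
    crop2 = heatmap[:, cut_right:]   # right part
    print("vertical cut between columns", cut_left, "and", cut_right)
```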
- -#### `HighResolutionPoseEstimationLearner.__crop_ecnclosing_bbox` -```python -HighResolutionPoseEstimationLearner.__crop_enclosing_bbox(self, crop ) -``` -This function creates the bounding box for each split part -Parameters: -- **crop**: *numpy.array* Describes the cropped part of the heatmap. - -#### `HighResolutionPoseEstimationLearner.__crop_img_func` -```python -HighResolutionPoseEstimationLearner.__crop_img_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, heatmap, perc ) -``` -This function crops the region of interst(ROI) from the original image to use it in next steps. - -Parameters: - - -- **xmin, xmax, ymin, ymax**: *int* Describe the boundaries of the bounding box. -- **pool_img**: *numpy.array* The pooled image that resized to be used in `__first_pass` -- **original_img**: *numpy.array* The input High-Resolution image -- **heatmap**: *numpy.array* The extracted heatmap -- **perc**: *float* The percentage is used for adding an extra pad on the cropped image. - #### `HighResolutionPoseEstimation.download` ```python HighResolutionPoseEstimation.download(self, path, mode, verbose, url) From 9bd74b669fa00d3dcec1eb9d326e8ce0d1038a46 Mon Sep 17 00:00:00 2001 From: mthodoris <45919203+mthodoris@users.noreply.github.com> Date: Thu, 23 Nov 2023 10:57:52 +0200 Subject: [PATCH 09/15] Apply suggestions from code review Co-authored-by: Kostas Tsampazis <27914645+tsampazk@users.noreply.github.com> --- .../high_resolution_learner.py | 61 ++++++++++--------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py index 1b741c2c1c..66880bf235 100644 --- a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py +++ b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py @@ -195,8 +195,9 @@ def __pooling(self, img, kernel): # Pooling on input image for dimension reduct pool_img = pool_img.squeeze(0).permute(1, 2, 0).cpu().float().numpy() return pool_img - def __crop_heatmap(self, heatmap): - """ This method takes the generated heatmap and crops it arround the desirable ROI using its nonzero values. + @staticmethod + def __crop_heatmap(heatmap): + """ This method takes the generated heatmap and crops it around the desirable ROI using its nonzero values. Parameters ---------- :param heatmap: the heatmap that generated from __first_pass function @@ -622,33 +623,33 @@ def eval(self, dataset, silent=False, verbose=True, use_subset=True, subset_size def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, subset_size=250, upsample_ratio=4, images_folder_name="val2017", annotations_filename="person_keypoints_val2017.json"): """ - This method is used to evaluate a trained model on an evaluation dataset. - - :param dataset: object that holds the evaluation dataset. 
-            :type dataset: ExternalDataset class object or DatasetIterator class object
-            :param silent: if set to True, disables all printing of evaluation progress reports and other
-            information to STDOUT, defaults to 'False'
-            :type silent: bool, optional
-            :param verbose: if set to True, enables the maximum verbosity, defaults to 'True'
-            :type verbose: bool, optional
-            :param use_subset: If set to True, a subset of the validation dataset is created and used in
-            evaluation, defaults to 'True'
-            :type use_subset: bool, optional
-            :param subset_size: Controls the size of the validation subset, defaults to '250'
-            :type subset_size: int, optional
-            param upsample_ratio: Defines the amount of upsampling to be performed on the heatmaps and PAFs
-            when resizing,defaults to 4
-            :type upsample_ratio: int, optional
-            :param images_folder_name: Folder name that contains the dataset images. This folder should be contained
-            in the dataset path provided. Note that this is a folder name, not a path, defaults to 'val2017'
-            :type images_folder_name: str, optional
-            :param annotations_filename: Filename of the annotations json file. This file should be contained in the
-            dataset path provided, defaults to 'person_keypoints_val2017.json'
-            :type annotations_filename: str, optional
-
-            :returns: returns stats regarding evaluation
-            :rtype: dict
-            """
+        This method is used to evaluate a trained model on an evaluation dataset.
+
+        :param dataset: object that holds the evaluation dataset.
+        :type dataset: ExternalDataset class object or DatasetIterator class object
+        :param silent: if set to True, disables all printing of evaluation progress reports and other
+            information to STDOUT, defaults to 'False'
+        :type silent: bool, optional
+        :param verbose: if set to True, enables the maximum verbosity, defaults to 'True'
+        :type verbose: bool, optional
+        :param use_subset: If set to True, a subset of the validation dataset is created and used in
+            evaluation, defaults to 'True'
+        :type use_subset: bool, optional
+        :param subset_size: Controls the size of the validation subset, defaults to '250'
+        :type subset_size: int, optional
+        :param upsample_ratio: Defines the amount of upsampling to be performed on the heatmaps and PAFs
+            when resizing, defaults to 4
+        :type upsample_ratio: int, optional
+        :param images_folder_name: Folder name that contains the dataset images. This folder should be contained
+            in the dataset path provided. Note that this is a folder name, not a path, defaults to 'val2017'
+        :type images_folder_name: str, optional
+        :param annotations_filename: Filename of the annotations json file. This file should be contained in the
+            dataset path provided, defaults to 'person_keypoints_val2017.json'
+        :type annotations_filename: str, optional
+
+        :returns: returns stats regarding evaluation
+        :rtype: dict
+        """
         data = super(HighResolutionPoseEstimationLearner,  # NOQA
                      self)._LightweightOpenPoseLearner__prepare_val_dataset(dataset,
                                                                             use_subset=use_subset,
@@ -1218,7 +1219,7 @@ def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multis
         bounds = ([self.xmin, self.xmax, self.ymin, self.ymax],)
         return current_poses, heatmap, bounds

-    def infer_adaptive(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multiscale=False):
+    def infer_adaptive(self, img, upsample_ratio=4, stride=8):
         """
         This method is used to perform pose estimation on an image.
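With the simplified signature above (the `track`, `smooth` and `multiscale` arguments are dropped from `infer_adaptive`), a minimal usage sketch looks as follows. It assumes the sample image fetched by `download(mode="test_data")`, the `draw` helper exported by `opendr.perception.pose_estimation`, and that entries of the returned `bounds` can still be `None` before the first detection.

```python
# Hedged usage sketch for the adaptive ROI method (not part of the patch).
import cv2
from opendr.engine.data import Image
from opendr.perception.pose_estimation import HighResolutionPoseEstimationLearner, draw

learner = HighResolutionPoseEstimationLearner(device="cuda", method="adaptive",
                                              first_pass_height=360,
                                              second_pass_height=540)
learner.download(path=".", verbose=True)
learner.load("openpose_default")
learner.download(path=".", mode="test_data")

img = Image.open("temp/dataset/image/000000052591_1080.jpg")
poses, heatmap, bounds = learner.infer_adaptive(img)

frame = img.opencv()
for pose in poses:
    draw(frame, pose)              # skeleton overlay, as in the demos
for box in bounds:                 # up to two ROI boxes (xmin, xmax, ymin, ymax)
    if box is not None and None not in box:
        xmin, xmax, ymin, ymax = box
        cv2.rectangle(frame, (int(xmin), int(ymin)), (int(xmax), int(ymax)),
                      (0, 200, 0), 2)
cv2.imwrite("adaptive_result.jpg", frame)
```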
From 3413d7d805620520175e91f07019fc653d580356 Mon Sep 17 00:00:00 2001
From: mthodoris
Date: Thu, 23 Nov 2023 11:28:33 +0200
Subject: [PATCH 10/15] Update docstrings

---
 .../high_resolution_learner.py                | 135 ++++++++++--------
 1 file changed, 75 insertions(+), 60 deletions(-)

diff --git a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
index 66880bf235..a13aac0392 100644
--- a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
+++ b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
@@ -197,14 +197,14 @@ def __pooling(self, img, kernel):  # Pooling on input image for dimension reduct

     @staticmethod
     def __crop_heatmap(heatmap):
-        """ This method takes the generated heatmap and crops it around the desirable ROI using its nonzero values.
-        Parameters
-        ----------
+        """
+        This method takes the generated heatmap and crops it around the desirable ROI using its nonzero values.
+
         :param heatmap: the heatmap that generated from __first_pass function
-        :type heatmap numpy.array
-        Returns
-        -------
-        heatmap_dims: It is an array that contains the boundaries of the cropped image
+        :type heatmap: numpy.array
+
+        :returns: An array that contains the boundaries of the cropped image
+        :rtype: np.array
         """
         detection = False

@@ -220,14 +220,18 @@ def __crop_heatmap(heatmap):
         heatmap_dims = (int(xmin), int(xmax), int(ymin), int(ymax))
         return heatmap_dims, detection

+    @staticmethod
     def __check_for_split(self, cropped_heatmap):
-        """ This function checks weather or not the cropped heatmap needs further proccessing for extra cropping.
+        """
+        This function checks whether or not the cropped heatmap needs further processing for extra cropping.
         More specifically, returns a boolean for the decision for further crop, the decision depends on the distance between the
         target subjects.
-        :param cropped_heatmap: the cropped area from the original heatmap

-        Returns: boolean
+        :param cropped_heatmap: the cropped area from the original heatmap
+        :type cropped_heatmap: np.array

+        :returns: A boolean that describes whether further cropping is needed
+        :rtype: bool
         """
         sum_rows = cropped_heatmap.sum(axis=1)
         sum_col = cropped_heatmap.sum(axis=0)

@@ -245,13 +249,16 @@ def __check_for_split(self, cropped_heatmap):
         else:
             return False

+    @staticmethod
     def __split_process(self, cropped_heatmap):
-        """ This function uses the cropped heatmap that crated from __crop_heatmap function and splits it in parts.
+        """
+        This function uses the cropped heatmap created by the __crop_heatmap function and splits it into parts.
+        :param cropped_heatmap: the cropped area from the original heatmap
+        :type cropped_heatmap: np.array

-        Returns:
-            crops: A list with the new dimensions of the split parts
-        :type crops: list
+        :returns: Returns a list with the new dimensions of the split parts
+        :rtype: list
         """
         max_diff_c, max_diff_r = 0, 0
         y_crop_l = cropped_heatmap.shape[1]
@@ -307,14 +314,16 @@ def __split_process(self, cropped_heatmap):
         crops = [[crop1, x1_i, x1_f, y1_i, y1_f], [crop2, x2_i, x2_f, y2_i, y2_f]]
         return crops

-    def __crop_ecclosing_bbox(self, crop):
-        """ This function creates the bounding box for each split part
+    @staticmethod
+    def __crop_enclosing_bbox(self, crop):
+        """
+        This function creates the bounding box for each split part.

-        Args:
-            crop: A split part from the original heatmap
+        :param crop: A split part from the original heatmap
+        :type crop: np.array

-        Returns:
-            xmin, xmax, ymin, ymax : Dimensions for enclosing bounding box
+        :returns: the dimensions (xmin, xmax, ymin, ymax) for the enclosing bounding box
+        :rtype: int, int, int, int
         """
         if crop.nonzero()[0].size > 0 and crop.nonzero()[1].size > 0:
             xmin = min(np.unique(crop.nonzero()[1]))
@@ -325,23 +334,29 @@ def __crop_ecclosing_bbox(self, crop):
             xmin, xmax, ymin, ymax = 0, 0, 0, 0
         return xmin, xmax, ymin, ymax

+    @staticmethod
     def __crop_image_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, heatmap, perc):
-        """ This function crops the region of interst(ROI) from the original image to use it in next steps
-        Args:
-            xmin, ymin: top left corner dimensions of the split part in the original heatmap
-            xamx, ymax: bottom right dimensions of the split part in the original heatmap
-            pool_img: the resized pooled input image
-            original_image: the original input image
-            heatmap: the heatmap generated from __first_pass function
-            perc: percentage of the image that is needed for adding extra pad
-        :type perc: float
-            detection: boolean that describes if any subject detection is made
-        :type detection: boolean
-
-        Returns:
-            :crop_img: the cropped image part from the original image
-            :type :numpy.array
-            :xmin, xmax, ymin, ymax: the dimensions of the cropped part in the original image coordinate system
+        """
+        This function crops the region of interest (ROI) from the original image to use it in the next steps
+
+        :param xmin, ymin: top left corner coordinates of the split part in the original heatmap
+        :type xmin,ymin: int
+        :param xmax, ymax: bottom right coordinates of the split part in the original heatmap
+        :type xmax,ymax: int
+        :param pool_img: the resized pooled input image
+        :type pool_img: np.array
+        :param original_image: the original input image
+        :type original_image: np.array
+        :param heatmap: the heatmap generated from __first_pass function
+        :type heatmap: np.array
+        :param perc: percentage of the image that is needed for adding extra pad
+        :type perc: float
+        :param detection: boolean that describes if any subject detection is made
+        :type detection: bool
+
+        :returns: Returns the cropped image part from the original image and the dimensions of the cropped part in the
+        original image coordinate system
+        :rtype: numpy.array, int, int, int, int
         """
         upscale_factor_x = pool_img.shape[0] / heatmap.shape[0]
         upscale_factor_y = pool_img.shape[1] / heatmap.shape[1]
@@ -736,7 +751,7 @@ def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, su

                         if crop.size > 0:
                             crop_part += 1
-                            xmin, xmax, ymin, ymax = self.__crop_ecclosing_bbox(crop)
+                            xmin, xmax, ymin, ymax = self.__crop_enclosing_bbox(crop)
                             xmin += heatmap_dims[0]
                             xmax += heatmap_dims[0]
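The geometry implemented by `__crop_image_func` above can be summarized in a short standalone sketch: the ROI bounds found on the low-resolution heatmap are scaled up to image coordinates and then padded by `perc` of the crop size on every side before slicing. This is only an illustrative approximation under assumed names (the helper is invented here, and the real method routes the bounds through the pooled image rather than straight from heatmap to image):

```python
import numpy as np

def crop_roi_with_pad(bounds, heatmap_shape, image, perc):
    # bounds are (xmin, xmax, ymin, ymax) in heatmap coordinates.
    xmin, xmax, ymin, ymax = bounds
    fy = image.shape[0] / heatmap_shape[0]  # vertical upscale factor
    fx = image.shape[1] / heatmap_shape[1]  # horizontal upscale factor
    x0, x1 = int(xmin * fx), int(xmax * fx)
    y0, y1 = int(ymin * fy), int(ymax * fy)
    # Pad by `perc` of the crop size, clamped to the image borders.
    pad_x, pad_y = int(perc * (x1 - x0)), int(perc * (y1 - y0))
    x0, y0 = max(x0 - pad_x, 0), max(y0 - pad_y, 0)
    x1, y1 = min(x1 + pad_x, image.shape[1]), min(y1 + pad_y, image.shape[0])
    return image[y0:y1, x0:x1], (x0, x1, y0, y1)

# Example: heatmap-space box scaled to a 640x360 frame with 10% padding.
crop, crop_bounds = crop_roi_with_pad((3, 17, 2, 11), (32, 56), np.zeros((360, 640, 3)), perc=0.1)
```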
@@ -964,29 +979,29 @@ def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, su

     def infer(self, img, upsample_ratio=4, stride=8, track=True, smooth=True, multiscale=False):
         """
-        This method is used to perform pose estimation on an image.
-
-        :param img: image to run inference on
-        :rtype img: engine.data.Image class object
-        :param upsample_ratio: Defines the amount of upsampling to be performed on the heatmaps and PAFs
-            when resizing,defaults to 4
-        :type upsample_ratio: int, optional
-        :param stride: Defines the stride value for creating a padded image
-        :type stride: int,optional
-        :param track: If True, infer propagates poses ids from previous frame results to track poses,
-            defaults to 'True'
-        :type track: bool, optional
-        :param smooth: If True, smoothing is performed on pose keypoints between frames, defaults to 'True'
-        :type smooth: bool, optional
-        :param multiscale: Specifies whether evaluation will run in the predefined multiple scales setup or not.
-        :type multiscale: bool,optional
-
-        :return: Returns a list of engine.target.Pose objects, where each holds a pose
-            and a heatmap that contains human silhouettes of the input image.
-            If no detections were made returns an empty list for poses and a black frame for heatmap.
+        This method is used to perform pose estimation on an image.
+
+        :param img: image to run inference on
+        :type img: engine.data.Image class object
+        :param upsample_ratio: Defines the amount of upsampling to be performed on the heatmaps and PAFs
+            when resizing, defaults to 4
+        :type upsample_ratio: int, optional
+        :param stride: Defines the stride value for creating a padded image
+        :type stride: int, optional
+        :param track: If True, infer propagates poses ids from previous frame results to track poses,
+            defaults to 'True'
+        :type track: bool, optional
+        :param smooth: If True, smoothing is performed on pose keypoints between frames, defaults to 'True'
+        :type smooth: bool, optional
+        :param multiscale: Specifies whether evaluation will run in the predefined multiple scales setup or not.
+        :type multiscale: bool, optional
+
+        :return: Returns a list of engine.target.Pose objects, where each holds a pose
+            and a heatmap that contains human silhouettes of the input image.
+            If no detections were made, returns an empty list for poses and a black frame for the heatmap.
+
+        :rtype: poses -> list of engine.target.Pose objects
+            heatmap -> np.array()
         """
         current_poses = []
         num_keypoints = Pose.num_kpts
@@ -1291,7 +1306,7 @@ def infer_adaptive(self, img, upsample_ratio=4, stride=8):

                     if crop.size > 0:
                         crop_part += 1
-                        xmin, xmax, ymin, ymax = self.__crop_ecclosing_bbox(crop)
+                        xmin, xmax, ymin, ymax = self.__crop_enclosing_bbox(crop)
                         xmin += crop_params[1]
                         xmax += crop_params[1]

From bf9b9a721a6434906779c8fd87aaa2bcee1be729 Mon Sep 17 00:00:00 2001
From: mthodoris
Date: Thu, 23 Nov 2023 12:49:24 +0200
Subject: [PATCH 11/15] fixed webcam demo issues
 edit learner

---
 .../demos/webcam_demo.py                      | 359 +++++++++++-------
 .../high_resolution_learner.py                |  32 +-
 2 files changed, 244 insertions(+), 147 deletions(-)

diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
index b8e0c55060..2b5a6d2dbf 100644
--- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
+++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
@@ -50,13 +50,19 @@ def __next__(self):
     parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cuda")
     parser.add_argument("--accelerate", help="Enables acceleration flags (e.g., stride)", default=False,
                         action="store_true")
-    parser.add_argument("--height1", help="Base height of resizing in first inference, defaults to 420", default=420)
-    parser.add_argument("--height2", help="Base height of resizing in second inference, defaults to 360", default=360)
-    parser.add_argument("--method", help="Choose between primary or adaptive ROI selection methodology defaults to adaptive",
-                        default="adaptive")
+    parser.add_argument("--height1",
+                        help="Base height of resizing in first inference, defaults to 420", default=420)
+    parser.add_argument("--height2",
+                        help="Base height of resizing in second inference, defaults to 360", default=360)
+    parser.add_argument("--method",
+                        help="Choose between primary or adaptive ROI selection methodology, defaults to adaptive",
+                        default="adaptive")
+    parser.add_argument("--run-comparison",
+                        help="Enables comparison with all HR-pose-estimation methods and Lw-OpenPose",
+                        action="store_true")

     args = parser.parse_args()
-    onnx, device, accelerate = args.onnx, args.device, args.accelerate
+    onnx, device, accelerate, run_comparison = args.onnx, args.device, args.accelerate, args.run_comparison
     base_height1, base_height2, method = args.height1, args.height2, args.method

     if accelerate:
@@ -68,140 +74,233 @@ def __next__(self):
         stages = 2
         half_precision = False

-    lw_pose_estimator = LightweightOpenPoseLearner(device=device, num_refinement_stages=stages,
-                                                   mobilenet_use_stride=stride, half_precision=half_precision)
+    image_provider = VideoReader(0)  # Use the first camera available on the system
+    image_provider = iter(image_provider)

-    hr_pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages,
-                                                            mobilenet_use_stride=stride, half_precision=half_precision,
-                                                            first_pass_height=base_height1,
-                                                            second_pass_height=base_height2,
-                                                            percentage_around_crop=0.1,
-                                                            method="primary")
+    height = image_provider.cap.get(4)
+    width = image_provider.cap.get(3)

+    if run_comparison:
+        prim_hr_avg_fps = 0
+        lw_avg_fps = 0
+        adapt_hr_avg_fps = 0
+
+        lw_pose_estimator = LightweightOpenPoseLearner(device=device, num_refinement_stages=stages,
+                                                       mobilenet_use_stride=stride,
half_precision=half_precision) + + hr_pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, + mobilenet_use_stride=stride, half_precision=half_precision, + first_pass_height=base_height1, + second_pass_height=base_height2, + percentage_around_crop=0.1, + method="primary") + + adapt_hr_pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, + mobilenet_use_stride=stride, + half_precision=half_precision, + first_pass_height=base_height1, + second_pass_height=base_height2, + percentage_around_crop=0.1, + method="adaptive") + + lw_pose_estimator.download(path=".", verbose=True) + + hr_pose_estimator.load("openpose_default") + adapt_hr_pose_estimator.load("openpose_default") + lw_pose_estimator.load("openpose_default") + + if onnx: + hr_pose_estimator.optimize() + adapt_hr_pose_estimator.optimize() + lw_pose_estimator.optimize() + + if width / height == 16 / 9: + size = (2 * 1280, 2 * int(720 / 3)) + elif width / height == 4 / 3: + size = (2 * 1024, 2 * int(768 / 3)) + else: + size = (width, int(height / 3)) - adapt_hr_pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, - mobilenet_use_stride=stride, half_precision=half_precision, - first_pass_height=base_height1, - second_pass_height=base_height2, - percentage_around_crop=0.1, - method="adaptive") + else: + hr_avg_fps = 0 + lw_avg_fps = 0 - hr_pose_estimator.download(path=".", verbose=True) - hr_pose_estimator.load("openpose_default") + lw_pose_estimator = LightweightOpenPoseLearner(device=device, num_refinement_stages=stages, + mobilenet_use_stride=stride, half_precision=half_precision) - adapt_hr_pose_estimator.download(path=".", verbose=True) - adapt_hr_pose_estimator.load("openpose_default") + hr_pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, + mobilenet_use_stride=stride, half_precision=half_precision, + first_pass_height=base_height1, + second_pass_height=base_height2, + percentage_around_crop=0.1, + method=method) - if onnx: - hr_pose_estimator.optimize() - adapt_hr_pose_estimator.optimize() + lw_pose_estimator.download(path=".", verbose=True) - lw_pose_estimator.download(path=".", verbose=True) - lw_pose_estimator.load("openpose_default") + hr_pose_estimator.load("openpose_default") + lw_pose_estimator.load("openpose_default") - if onnx: - lw_pose_estimator.optimize() + if onnx: + hr_pose_estimator.optimize() + lw_pose_estimator.optimize() - prim_hr_avg_fps = 0 - lw_avg_fps = 0 - adapt_hr_avg_fps = 0 - # Use the first camera available on the system - image_provider = VideoReader(0) - image_provider = iter(image_provider) - - height = image_provider.cap.get(4) - width = image_provider.cap.get(3) - if width / height == 16 / 9: - size = (2 * 1280, 2 * int(720 / 3)) - elif width / height == 4 / 3: - size = (2 * 1024, 2 * int(768 / 3)) - else: - size = (width, int(height / 3)) + if width / height == 16 / 9: + size = (1280, int(720 / 2)) + elif width / height == 4 / 3: + size = (1024, int(768 / 2)) + else: + size = (width, int(height / 2)) while True: img = next(image_provider) - - total_time0 = time.time() - img_copy = np.copy(img) - adapt_img = np.copy(img) - # Perform inference - start_time = time.perf_counter() - hr_poses, heatmap, _ = hr_pose_estimator.infer(img) - hr_time = time.perf_counter() - start_time - - # Perform inference - start_time = time.perf_counter() - lw_poses = lw_pose_estimator.infer(img_copy) - lw_time = time.perf_counter() - start_time - 
- # Perform inference - start_time = time.perf_counter() - adapt_hr_poses, adapt_heatmap, _ = adapt_hr_pose_estimator.infer_adaptive(img) - adapt_hr_time = time.perf_counter() - start_time - - total_time = time.time() - total_time0 - - for hr_pose in hr_poses: - draw(img, hr_pose) - for lw_pose in lw_poses: - draw(img_copy, lw_pose) - for adapt_hr_pose in adapt_hr_poses: - draw(adapt_img, adapt_hr_pose) - - lw_fps = 1 / (total_time - (hr_time + adapt_hr_time)) - prim_hr_fps = 1 / (total_time - (lw_time + adapt_hr_time)) - adapt_hr_fps = 1 / (total_time - (lw_time + hr_time)) - - # Calculate a running average on FPS - prim_hr_avg_fps = 0.95 * prim_hr_avg_fps + 0.05 * prim_hr_fps - lw_avg_fps = 0.95 * lw_avg_fps + 0.05 * lw_fps - adapt_hr_avg_fps = 0.95 * adapt_hr_avg_fps + 0.05 * adapt_hr_fps - - cv2.putText(img=img, text="OpenDR High Resolution", org=(20, int(height / 10)), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - cv2.putText(img=img, text="Pose Estimation Primary ROI selection", org=(20, int(height / 10) + 50), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - cv2.putText(img=img, text='FPS:' + str(int(prim_hr_avg_fps)), org=(20, int(height / 4)), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - - cv2.putText(img=img_copy, text='Lightweight OpenPose ', org=(20, int(height / 10)), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - - cv2.putText(img=img_copy, text='FPS: ' + str(int(lw_avg_fps)), org=(20, int(height / 4)), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - - cv2.putText(img=adapt_img, text="Pose Estimation Adaptive ROI selection", org=(20, int(height / 10) + 50), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - cv2.putText(img=adapt_img, text='FPS:' + str(int(adapt_hr_avg_fps)), org=(20, int(height / 4)), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - - heatmap = heatmap * 5 - heatmap = cv2.cvtColor(heatmap, cv2.COLOR_GRAY2BGR) - heatmap = cv2.resize(heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4))) - img[(img.shape[0] - heatmap.shape[0]):img.shape[0], 0:heatmap.shape[1]] = heatmap - - adapt_heatmap = adapt_heatmap * 5 - adapt_heatmap = cv2.cvtColor(adapt_heatmap, cv2.COLOR_GRAY2BGR) - adapt_heatmap = cv2.resize(adapt_heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4))) - adapt_img[(adapt_img.shape[0] - adapt_heatmap.shape[0]):adapt_img.shape[0], 0:adapt_heatmap.shape[1]] = adapt_heatmap - - output_image = cv2.hconcat([img_copy, img, adapt_img]) - output_image = cv2.resize(output_image, size) - cv2.imshow('Result', output_image) - - key = cv2.waitKey(1) - if key == 27: - exit(0) + if run_comparison: + total_time0 = time.time() + img_copy = np.copy(img) + adapt_img = np.copy(img) + # Perform inference + start_time = time.perf_counter() + hr_poses, heatmap, _ = hr_pose_estimator.infer(img) + hr_time = time.perf_counter() - start_time + + # Perform inference + start_time = time.perf_counter() + lw_poses = lw_pose_estimator.infer(img_copy) + lw_time = 
time.perf_counter() - start_time + + # Perform inference + start_time = time.perf_counter() + adapt_hr_poses, adapt_heatmap, _ = adapt_hr_pose_estimator.infer_adaptive(img) + adapt_hr_time = time.perf_counter() - start_time + + total_time = time.time() - total_time0 + + for hr_pose in hr_poses: + draw(img, hr_pose) + for lw_pose in lw_poses: + draw(img_copy, lw_pose) + for adapt_hr_pose in adapt_hr_poses: + draw(adapt_img, adapt_hr_pose) + + lw_fps = 1 / (total_time - (hr_time + adapt_hr_time)) + prim_hr_fps = 1 / (total_time - (lw_time + adapt_hr_time)) + adapt_hr_fps = 1 / (total_time - (lw_time + hr_time)) + + # Calculate a running average on FPS + prim_hr_avg_fps = 0.95 * prim_hr_avg_fps + 0.05 * prim_hr_fps + lw_avg_fps = 0.95 * lw_avg_fps + 0.05 * lw_fps + adapt_hr_avg_fps = 0.95 * adapt_hr_avg_fps + 0.05 * adapt_hr_fps + + cv2.putText(img=img, text="OpenDR High Resolution", org=(20, int(height / 10)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + cv2.putText(img=img, text="Pose Estimation Primary ROI selection", org=(20, int(height / 10) + 50), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + cv2.putText(img=img, text='FPS:' + str(int(prim_hr_avg_fps)), org=(20, int(height / 4)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + + cv2.putText(img=img_copy, text='Lightweight OpenPose ', org=(20, int(height / 10)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + + cv2.putText(img=img_copy, text='FPS: ' + str(int(lw_avg_fps)), org=(20, int(height / 4)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + + cv2.putText(img=adapt_img, text="OpenDR High Resolution", org=(20, int(height / 10)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + cv2.putText(img=adapt_img, text="Pose Estimation Adaptive ROI selection", org=(20, int(height / 10) + 50), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + cv2.putText(img=adapt_img, text='FPS:' + str(int(adapt_hr_avg_fps)), org=(20, int(height / 4)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + thickness=int(np.ceil(height / 600))) + + heatmap = heatmap * 5 + heatmap = cv2.cvtColor(heatmap, cv2.COLOR_GRAY2BGR) + heatmap = cv2.resize(heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4))) + img[(img.shape[0] - heatmap.shape[0]):img.shape[0], 0:heatmap.shape[1]] = heatmap + + adapt_heatmap = adapt_heatmap * 5 + adapt_heatmap = cv2.cvtColor(adapt_heatmap, cv2.COLOR_GRAY2BGR) + adapt_heatmap = cv2.resize(adapt_heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4))) + adapt_img[(adapt_img.shape[0] - adapt_heatmap.shape[0]):adapt_img.shape[0], 0:adapt_heatmap.shape[1]] = adapt_heatmap + + output_image = cv2.hconcat([img_copy, img, adapt_img]) + output_image = cv2.resize(output_image, size) + cv2.imshow('Result', output_image) + + key = cv2.waitKey(1) + if key == 27: + exit(0) + else: + total_time0 = time.time() + img_copy = np.copy(img) + # Perform inference + start_time = 
time.perf_counter()
+            hr_poses, heatmap, _ = hr_pose_estimator.infer(img)
+            hr_time = time.perf_counter() - start_time
+
+            # Perform inference
+            start_time = time.perf_counter()
+            lw_poses = lw_pose_estimator.infer(img_copy)
+            lw_time = time.perf_counter() - start_time
+
+            total_time = time.time() - total_time0
+
+            for hr_pose in hr_poses:
+                draw(img, hr_pose)
+            for lw_pose in lw_poses:
+                draw(img_copy, lw_pose)
+
+            lw_fps = 1 / (total_time - hr_time)
+            hr_fps = 1 / (total_time - lw_time)
+
+            # Calculate a running average on FPS
+            hr_avg_fps = 0.95 * hr_avg_fps + 0.05 * hr_fps
+            lw_avg_fps = 0.95 * lw_avg_fps + 0.05 * lw_fps
+
+            cv2.putText(img=img, text="OpenDR High Resolution", org=(20, int(height / 10)),
+                        fontFace=cv2.FONT_HERSHEY_TRIPLEX,
+                        fontScale=int(np.ceil(height / 600)), color=(0, 0, 200),
+                        thickness=int(np.ceil(height / 600)))
+            cv2.putText(img=img, text="Pose Estimation", org=(20, int(height / 10) + 50),
+                        fontFace=cv2.FONT_HERSHEY_TRIPLEX,
+                        fontScale=int(np.ceil(height / 600)), color=(0, 0, 200),
+                        thickness=int(np.ceil(height / 600)))
+            cv2.putText(img=img, text='FPS:' + str(int(hr_avg_fps)), org=(20, int(height / 4)),
+                        fontFace=cv2.FONT_HERSHEY_TRIPLEX,
+                        fontScale=int(np.ceil(height / 600)), color=(0, 0, 200),
+                        thickness=int(np.ceil(height / 600)))
+
+            cv2.putText(img=img_copy, text='Lightweight OpenPose ', org=(20, int(height / 10)),
+                        fontFace=cv2.FONT_HERSHEY_TRIPLEX,
+                        fontScale=int(np.ceil(height / 600)), color=(0, 0, 200),
+                        thickness=int(np.ceil(height / 600)))
+
+            cv2.putText(img=img_copy, text='FPS: ' + str(int(lw_avg_fps)), org=(20, int(height / 4)),
+                        fontFace=cv2.FONT_HERSHEY_TRIPLEX,
+                        fontScale=int(np.ceil(height / 600)), color=(0, 0, 200),
+                        thickness=int(np.ceil(height / 600)))
+
+            heatmap = heatmap * 5
+            heatmap = cv2.cvtColor(heatmap, cv2.COLOR_GRAY2BGR)
+            heatmap = cv2.resize(heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4)))
+            img[(img.shape[0] - heatmap.shape[0]):img.shape[0], 0:heatmap.shape[1]] = heatmap
+
+            output_image = cv2.hconcat([img_copy, img])
+            output_image = cv2.resize(output_image, size)
+            cv2.imshow('Result', output_image)
+
+            key = cv2.waitKey(1)
+            if key == 27:
+                exit(0)
diff --git a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
index a13aac0392..8fb88cf0b2 100644
--- a/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
+++ b/src/opendr/perception/pose_estimation/hr_pose_estimation/high_resolution_learner.py
@@ -221,7 +221,7 @@ def __crop_heatmap(heatmap):
         return heatmap_dims, detection

     @staticmethod
-    def __check_for_split(self, cropped_heatmap):
+    def __check_for_split(cropped_heatmap):
         """
         This function checks whether or not the cropped heatmap needs further processing for extra cropping.
         More specifically, returns a boolean for the decision for further crop, the decision depends on the distance between the
@@ -250,7 +250,7 @@ def __split_process(self, cropped_heatmap):
         return False

     @staticmethod
-    def __split_process(self, cropped_heatmap):
+    def __split_process(cropped_heatmap):
         """
         This function uses the cropped heatmap created by the __crop_heatmap function and splits it into parts.
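The split decision behind these two methods can be restated compactly: the cropped heatmap is summed along each axis, and the crop is divided at the widest empty band between nonzero entries, but only if that band is wider than 5 px. Below is a minimal sketch of that gap test, assuming a 1-D activation profile obtained by summing the crop along one axis (the helper name is illustrative, not the library API):

```python
import numpy as np

def widest_gap(profile, min_gap=5):
    # Widest run of zeros between nonzero entries of a 1-D profile.
    nz = np.flatnonzero(profile)
    gaps = np.diff(nz) if nz.size > 1 else np.array([])
    big = gaps[gaps > min_gap] if gaps.size else gaps
    return int(big.max()) if big.size else 0

crop = np.zeros((40, 60))
crop[5:15, 5:20] = 1.0    # first subject
crop[8:18, 45:55] = 1.0   # second subject, well to the right
col_gap = widest_gap(crop.sum(axis=0))  # 26 -> wide empty band: cut vertically
row_gap = widest_gap(crop.sum(axis=1))  # 0  -> no horizontal cut
```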
@@ -315,7 +315,7 @@ def __split_process(cropped_heatmap):
         return crops

     @staticmethod
-    def __crop_enclosing_bbox(self, crop):
+    def __crop_enclosing_bbox(crop):
         """
         This function creates the bounding box for each split part.

@@ -335,7 +335,7 @@ def __split_process(cropped_heatmap):
         return xmin, xmax, ymin, ymax

     @staticmethod
-    def __crop_image_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, heatmap, perc):
+    def __crop_image_func(xmin, xmax, ymin, ymax, pool_img, original_image, heatmap, perc):
         """
         This function crops the region of interest (ROI) from the original image to use it in the next steps

@@ -351,8 +351,6 @@ def __crop_image_func(self, xmin, xmax, ymin, ymax, pool_img, original_image, he
         :type heatmap: np.array
         :param perc: percentage of the image that is needed for adding extra pad
         :type perc: float
-        :param detection: boolean that describes if any subject detection is made
-        :type detection: bool

         :returns: Returns the cropped image part from the original image and the dimensions of the cropped part in the
         original image coordinate system
@@ -743,8 +741,8 @@ def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, su
             if detection:
                 cropped_heatmap = heatmap[heatmap_dims[2]:heatmap_dims[3], heatmap_dims[0]:heatmap_dims[1]]
                 if self.__check_for_split(cropped_heatmap):
-                    # Split horizontal or vertical
-                    crops = self.__split_process(cropped_heatmap)
+                    crops = self.__split_process(cropped_heatmap)  # Split horizontal or vertical
+
                     crop_part = 0
                     for crop_params in crops:
                         crop = crop_params[0]
@@ -847,15 +845,15 @@ def eval_adaptive(self, dataset, silent=False, verbose=True, use_subset=True, su
                             'score': scores[idx]
                         })

-                if self.visualize:
-                    for keypoints in coco_keypoints:
-                        for idx in range(len(keypoints) // 3):
-                            cv2.circle(img, (int(keypoints[idx * 3]), int(keypoints[idx * 3 + 1])),
-                                       3, (255, 0, 255), -1)
-                    cv2.imshow('keypoints', img)
-                    key = cv2.waitKey()
-                    if key == 27:  # esc
-                        return
+            if self.visualize:
+                for keypoints in coco_keypoints:
+                    for idx in range(len(keypoints) // 3):
+                        cv2.circle(img, (int(keypoints[idx * 3]), int(keypoints[idx * 3 + 1])),
+                                   3, (255, 0, 255), -1)
+                cv2.imshow('keypoints', img)
+                key = cv2.waitKey()
+                if key == 27:  # esc
+                    return
             else:
                 xmin = heatmap_dims[0]
                 xmax = heatmap_dims[1]

From 9f74dc1948dc8c73cb2d97d09985825a190ab4d5 Mon Sep 17 00:00:00 2001
From: mthodoris
Date: Thu, 23 Nov 2023 12:52:01 +0200
Subject: [PATCH 12/15] pep8 fix

---
 .../high_resolution_pose_estimation/demos/webcam_demo.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
index 2b5a6d2dbf..5170af0aff 100644
--- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
+++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
@@ -233,7 +233,8 @@ def __next__(self):
             adapt_heatmap = adapt_heatmap * 5
             adapt_heatmap = cv2.cvtColor(adapt_heatmap, cv2.COLOR_GRAY2BGR)
             adapt_heatmap = cv2.resize(adapt_heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4)))
-            adapt_img[(adapt_img.shape[0] - adapt_heatmap.shape[0]):adapt_img.shape[0], 0:adapt_heatmap.shape[1]] = adapt_heatmap
+            adapt_img[(adapt_img.shape[0] - adapt_heatmap.shape[0]):adapt_img.shape[0], 0:adapt_heatmap.shape[1]]\
+                = adapt_heatmap

             output_image = cv2.hconcat([img_copy, img, adapt_img])
             output_image =
cv2.resize(output_image, size) From 9477c262f25580a3011f273d1e80a80b10d42985 Mon Sep 17 00:00:00 2001 From: mthodoris Date: Thu, 23 Nov 2023 14:32:38 +0200 Subject: [PATCH 13/15] webcam demo runs hr pose estimation if --run-comparison is enabled then it shows the comparison between lightweight openpose, hr-pose estimation primary method and adaptive method --- .../demos/webcam_demo.py | 62 +++++-------------- 1 file changed, 15 insertions(+), 47 deletions(-) diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py index 5170af0aff..f2839c4b1a 100644 --- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py +++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py @@ -122,10 +122,6 @@ def __next__(self): else: hr_avg_fps = 0 - lw_avg_fps = 0 - - lw_pose_estimator = LightweightOpenPoseLearner(device=device, num_refinement_stages=stages, - mobilenet_use_stride=stride, half_precision=half_precision) hr_pose_estimator = HighResolutionPoseEstimationLearner(device=device, num_refinement_stages=stages, mobilenet_use_stride=stride, half_precision=half_precision, @@ -134,19 +130,15 @@ def __next__(self): percentage_around_crop=0.1, method=method) - lw_pose_estimator.download(path=".", verbose=True) - hr_pose_estimator.load("openpose_default") - lw_pose_estimator.load("openpose_default") if onnx: hr_pose_estimator.optimize() - lw_pose_estimator.optimize() if width / height == 16 / 9: - size = (1280, int(720 / 2)) + size = (1280, int(720)) elif width / height == 4 / 3: - size = (1024, int(768 / 2)) + size = (1024, int(768)) else: size = (width, int(height / 2)) @@ -191,7 +183,7 @@ def __next__(self): cv2.putText(img=img, text="OpenDR High Resolution", org=(20, int(height / 10)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) cv2.putText(img=img, text="Pose Estimation Primary ROI selection", org=(20, int(height / 10) + 50), fontFace=cv2.FONT_HERSHEY_TRIPLEX, @@ -199,22 +191,22 @@ def __next__(self): thickness=int(np.ceil(height / 600))) cv2.putText(img=img, text='FPS:' + str(int(prim_hr_avg_fps)), org=(20, int(height / 4)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) cv2.putText(img=img_copy, text='Lightweight OpenPose ', org=(20, int(height / 10)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) cv2.putText(img=img_copy, text='FPS: ' + str(int(lw_avg_fps)), org=(20, int(height / 4)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) cv2.putText(img=adapt_img, text="OpenDR High Resolution", org=(20, int(height / 10)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) cv2.putText(img=adapt_img, text="Pose Estimation Adaptive ROI selection", org=(20, int(height / 10) + 
50), fontFace=cv2.FONT_HERSHEY_TRIPLEX, @@ -222,7 +214,7 @@ def __next__(self): thickness=int(np.ceil(height / 600))) cv2.putText(img=adapt_img, text='FPS:' + str(int(adapt_hr_avg_fps)), org=(20, int(height / 4)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) heatmap = heatmap * 5 @@ -250,47 +242,24 @@ def __next__(self): start_time = time.perf_counter() hr_poses, heatmap, _ = hr_pose_estimator.infer(img) hr_time = time.perf_counter() - start_time - - # Perform inference - start_time = time.perf_counter() - lw_poses = lw_pose_estimator.infer(img_copy) - lw_time = time.perf_counter() - start_time - total_time = time.time() - total_time0 for hr_pose in hr_poses: draw(img, hr_pose) - for lw_pose in lw_poses: - draw(img_copy, lw_pose) - lw_fps = 1 / (total_time - hr_time) - hr_fps = 1 / (total_time - lw_time) + hr_fps = 1 / total_time # Calculate a running average on FPS hr_avg_fps = 0.95 * hr_avg_fps + 0.05 * hr_fps - lw_avg_fps = 0.95 * lw_avg_fps + 0.05 * lw_fps - cv2.putText(img=img, text="OpenDR High Resolution", org=(20, int(height / 10)), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - cv2.putText(img=img, text="Pose Estimation", org=(20, int(height / 10) + 50), + cv2.putText(img=img, text="OpenDR High Resolution Pose Estimation", org=(20, int(height / 10)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - cv2.putText(img=img, text='FPS:' + str(int(hr_avg_fps)), org=(20, int(height / 4)), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), - thickness=int(np.ceil(height / 600))) - - cv2.putText(img=img_copy, text='Lightweight OpenPose ', org=(20, int(height / 10)), - fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) - cv2.putText(img=img_copy, text='FPS: ' + str(int(lw_avg_fps)), org=(20, int(height / 4)), + cv2.putText(img=img, text='FPS:' + str(int(hr_avg_fps)), org=(20, int(height / 4)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, - fontScale=int(np.ceil(height / 600)), color=(0, 0, 200), + fontScale=int(np.ceil(height / 800)), color=(0, 0, 200), thickness=int(np.ceil(height / 600))) heatmap = heatmap * 5 @@ -298,9 +267,8 @@ def __next__(self): heatmap = cv2.resize(heatmap, (int(img.shape[1] / 4), int(img.shape[0] / 4))) img[(img.shape[0] - heatmap.shape[0]):img.shape[0], 0:heatmap.shape[1]] = heatmap - output_image = cv2.hconcat([img_copy, img]) - output_image = cv2.resize(output_image, size) - cv2.imshow('Result', output_image) + img = cv2.resize(img, size) + cv2.imshow('Result', img) key = cv2.waitKey(1) if key == 27: From 8d41b1356c882e030c10f9eaab7b553084d2573b Mon Sep 17 00:00:00 2001 From: mthodoris Date: Thu, 23 Nov 2023 14:38:01 +0200 Subject: [PATCH 14/15] Readme edit for webcam demo --- .../pose_estimation/high_resolution_pose_estimation/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/README.md b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/README.md index e6fcee7fa4..96749e4f07 100644 --- 
a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/README.md
+++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/README.md
@@ -7,7 +7,9 @@ More specifically, the applications provided are:

 1. demos/inference_demo.py: A tool that demonstrates how to perform inference on a single high resolution image and then draw the detected poses.
 2. demos/eval_demo.py: A tool that demonstrates how to perform evaluation using the High Resolution Pose Estimation algorithm on 720p, 1080p and 1440p datasets.
 3. demos/benchmarking_demo.py: A simple benchmarking tool for measuring the performance of High Resolution Pose Estimation in various platforms.
-4. demos/webcam_demo.py: A tool that performs live pose estimation both with high resolution and simple lightweight OpenPose using a webcam.
+4. demos/webcam_demo.py: A tool that performs live pose estimation with the high-resolution pose estimation method using a webcam.
+   If `--run-comparison` is enabled, it shows the differences between Lightweight OpenPose and both the primary and adaptive high-resolution pose estimation methods.

 NOTE: All demos can run either with "primary ROI selection" mode or with "adaptive ROI selection". Use `--method primary` or `--method adaptive` for each case.
+

From 0858f46b673333bb26742182c4d6a0caaa0211db Mon Sep 17 00:00:00 2001
From: mthodoris <45919203+mthodoris@users.noreply.github.com>
Date: Thu, 23 Nov 2023 15:04:53 +0200
Subject: [PATCH 15/15] Update
 projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py

Co-authored-by: Kostas Tsampazis <27914645+tsampazk@users.noreply.github.com>
---
 .../high_resolution_pose_estimation/demos/webcam_demo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
index f2839c4b1a..a9260c2d34 100644
--- a/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
+++ b/projects/python/perception/pose_estimation/high_resolution_pose_estimation/demos/webcam_demo.py
@@ -56,7 +56,7 @@ def __next__(self):
                         help="Base height of resizing in second inference, defaults to 360", default=360)
     parser.add_argument("--method",
                         help="Choose between primary or adaptive ROI selection methodology, defaults to adaptive",
-                        default="adaptive")
+                        default="adaptive", choices=["primary", "adaptive"])
     parser.add_argument("--run-comparison",
                         help="Enables comparison with all HR-pose-estimation methods and Lw-OpenPose",
                         action="store_true")
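With the final state of the series, typical programmatic use of the adaptive method reduces to a few lines. A minimal sketch follows, assuming the learner is exported from `opendr.perception.pose_estimation` (the import path and the input file name are assumptions; the constructor arguments and calls mirror the webcam demo above):

```python
import cv2
# Assumed export; the learner itself lives under
# src/opendr/perception/pose_estimation/hr_pose_estimation/ in this PR.
from opendr.perception.pose_estimation import HighResolutionPoseEstimationLearner

learner = HighResolutionPoseEstimationLearner(device="cuda", num_refinement_stages=2,
                                              first_pass_height=420, second_pass_height=360,
                                              percentage_around_crop=0.1, method="adaptive")
learner.download(path=".", verbose=True)  # fetch the pretrained weights
learner.load("openpose_default")

img = cv2.imread("input.jpg")  # any high-resolution frame
poses, heatmap, bounds = learner.infer_adaptive(img, upsample_ratio=4, stride=8)
for pose in poses:
    print(pose)  # engine.target.Pose objects
```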