From de1d69fde0e395a6ceb3bdf6ea37fb38489fafab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Kozlovsk=C3=BD?=
Date: Fri, 4 Oct 2024 09:45:34 +0200
Subject: [PATCH] Improved Duplicate Warnings (#184)

---
 luxonis_ml/data/datasets/luxonis_dataset.py   | 71 ++++++++++++++-----
 luxonis_ml/data/datasets/utils.py             |  2 +-
 luxonis_ml/data/loaders/luxonis_loader.py     |  6 +-
 .../classification_directory_parser.py        |  2 +-
 luxonis_ml/data/parsers/coco_parser.py        |  2 +-
 luxonis_ml/data/parsers/create_ml_parser.py   |  2 +-
 .../segmentation_mask_directory_parser.py     |  2 +-
 luxonis_ml/data/parsers/voc_parser.py         |  4 +-
 luxonis_ml/data/parsers/yolov4_parser.py      |  2 +-
 9 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/luxonis_ml/data/datasets/luxonis_dataset.py b/luxonis_ml/data/datasets/luxonis_dataset.py
index 20ce8cb0..c92c70c4 100644
--- a/luxonis_ml/data/datasets/luxonis_dataset.py
+++ b/luxonis_ml/data/datasets/luxonis_dataset.py
@@ -199,14 +199,30 @@ def _load_df_offline(
         dfs = [pl.read_parquet(file) for file in path.glob("*.parquet")]
         return pl.concat(dfs) if dfs else None
 
-    def _get_file_index(self) -> Optional[pl.DataFrame]:
+    @overload
+    def _get_file_index(
+        self, lazy: Literal[False] = ...
+    ) -> Optional[pl.DataFrame]: ...
+
+    @overload
+    def _get_file_index(
+        self, lazy: Literal[True] = ...
+    ) -> Optional[pl.LazyFrame]: ...
+
+    def _get_file_index(
+        self, lazy: bool = False
+    ) -> Optional[Union[pl.DataFrame, pl.LazyFrame]]:
         path = get_file(
             self.fs, "metadata/file_index.parquet", self.media_path
         )
         if path is not None and path.exists():
-            return pl.read_parquet(path).select(
-                pl.all().exclude("^__index_level_.*$")
-            )
+            if not lazy:
+                df = pl.read_parquet(path)
+            else:
+                df = pl.scan_parquet(path)
+
+            return df.select(pl.all().exclude("^__index_level_.*$"))
+
         return None
 
     def _write_index(
@@ -438,7 +454,7 @@ def _process_arrays(self, batch_data: List[DatasetRecord]) -> None:
                     uuid_dict[str(ann.path)] = uuid
                     ann.path = Path(uuid).with_suffix(ann.path.suffix)
                 else:
-                    ann.path = ann.path.absolute()
+                    ann.path = ann.path.absolute().resolve()
         self.progress.stop()
         self.progress.remove_task(task)
         if self.is_remote:
@@ -496,7 +512,7 @@ def _add_process_batch(
                 new_index["uuid"].append(uuid)
                 new_index["file"].append(file)
                 new_index["original_filepath"].append(
-                    str(filepath.absolute())
+                    str(filepath.absolute().resolve())
                 )
                 processed_uuids.add(uuid)
 
@@ -514,7 +530,9 @@ def add(
 
         batch_data: list[DatasetRecord] = []
 
-        classes_per_task: Dict[str, OrderedSet[str]] = defaultdict(OrderedSet)
+        classes_per_task: Dict[str, OrderedSet[str]] = defaultdict(
+            lambda: OrderedSet([])
+        )
         num_kpts_per_task: Dict[str, int] = {}
 
         annotations_path = get_dir(
@@ -584,36 +602,55 @@ def add(
 
     def _warn_on_duplicates(self) -> None:
         df = self._load_df_offline(lazy=True)
-        if df is None:
+        index_df = self._get_file_index(lazy=True)
+        if df is None or index_df is None:
             return
+        df = df.join(index_df, on="uuid").drop("file_right")
 
         # Warn on duplicate UUIDs
         duplicates_paired = (
             df.group_by("uuid")
             .agg(pl.col("file").n_unique().alias("file_count"))
             .filter(pl.col("file_count") > 1)
             .join(df, on="uuid")
-            .select(["uuid", "file"])
+            .select("uuid", "file")
             .unique()
             .group_by("uuid")
-            .agg([pl.col("file").alias("files")])
+            .agg(pl.col("file").alias("files"))
             .filter(pl.col("files").len() > 1)
+            .collect()
         )
-        duplicates_paired_df = duplicates_paired.collect()
-        for uuid, files in duplicates_paired_df.iter_rows():
+        for uuid, files in duplicates_paired.iter_rows():
             self.logger.warning(
                 f"UUID: {uuid} has multiple file "
                 f"names: {files}"
             )
 
         # Warn on duplicate annotations
         duplicate_annotation = (
-            df.group_by(["file", "annotation"])
+            df.group_by(
+                "original_filepath",
+                "task",
+                "type",
+                "annotation",
+                "instance_id",
+            )
             .agg(pl.len().alias("count"))
             .filter(pl.col("count") > 1)
-        )
-        duplicate_annotation_df = duplicate_annotation.collect()
-        for file_name, annotation, _ in duplicate_annotation_df.iter_rows():
+            .filter(pl.col("annotation") != "{}")
+            .drop("instance_id")
+        ).collect()
+
+        for (
+            file_name,
+            task,
+            type_,
+            annotation,
+            count,
+        ) in duplicate_annotation.iter_rows():
+            if "RLE" in type_ or "Mask" in type_:
+                annotation = ""
             self.logger.warning(
-                f"File '{file_name}' has the same annotation '{annotation}' added multiple times."
+                f"File '{file_name}' has the same '{type_}' annotation "
+                f"'{annotation}' ({task=}) added {count} times."
             )
 
     def get_splits(self) -> Optional[Dict[str, List[str]]]:
diff --git a/luxonis_ml/data/datasets/utils.py b/luxonis_ml/data/datasets/utils.py
index 6df42d47..e87cee8c 100644
--- a/luxonis_ml/data/datasets/utils.py
+++ b/luxonis_ml/data/datasets/utils.py
@@ -86,7 +86,7 @@ def find_filepath_uuid(
     if index is None:
         return None
 
-    abs_path = str(Path(filepath).absolute())
+    abs_path = str(Path(filepath).absolute().resolve())
     matched = index.filter(pl.col("original_filepath") == abs_path)
 
     if len(matched):
diff --git a/luxonis_ml/data/loaders/luxonis_loader.py b/luxonis_ml/data/loaders/luxonis_loader.py
index 29781cf8..740d2e93 100644
--- a/luxonis_ml/data/loaders/luxonis_loader.py
+++ b/luxonis_ml/data/loaders/luxonis_loader.py
@@ -84,7 +84,7 @@ def __init__(
         for view in self.view:
             self.instances.extend(splits[view])
 
-        self.idx_to_df_row = []
+        self.idx_to_df_row: list[list[int]] = []
         for uuid in self.instances:
             boolean_mask = df["uuid"] == uuid
             row_indexes = boolean_mask.arg_true().to_list()
@@ -139,7 +139,9 @@ def __getitem__(self, idx: int) -> LuxonisLoaderOutput:
         else:
             picked_indices = set()
             max_val = len(self)
-            while len(picked_indices) < self.augmentations.aug_batch_size - 1:
+            while (
+                len(picked_indices) < self.augmentations.aug_batch_size - 1
+            ):
                 rand_idx = random.randint(0, max_val - 1)
                 if rand_idx != idx and rand_idx not in picked_indices:
                     picked_indices.add(rand_idx)
diff --git a/luxonis_ml/data/parsers/classification_directory_parser.py b/luxonis_ml/data/parsers/classification_directory_parser.py
index 0beb1cf7..8976459f 100644
--- a/luxonis_ml/data/parsers/classification_directory_parser.py
+++ b/luxonis_ml/data/parsers/classification_directory_parser.py
@@ -77,7 +77,7 @@ def generator() -> DatasetIterator:
         for class_name in class_names:
             for img_path in (class_dir / class_name).iterdir():
                 yield {
-                    "file": str(img_path.absolute()),
+                    "file": str(img_path.absolute().resolve()),
                     "annotation": {
                         "type": "classification",
                         "class": class_name,
diff --git a/luxonis_ml/data/parsers/coco_parser.py b/luxonis_ml/data/parsers/coco_parser.py
index ffe8e15d..9408e0e0 100644
--- a/luxonis_ml/data/parsers/coco_parser.py
+++ b/luxonis_ml/data/parsers/coco_parser.py
@@ -226,7 +226,7 @@ def generator() -> DatasetIterator:
                 ann_dict[img_id].append(ann)
 
             for img_id, img in img_dict.items():
-                path = image_dir.absolute() / img["file_name"]
+                path = image_dir.absolute().resolve() / img["file_name"]
                 if not path.exists():
                     continue
                 path = str(path)
diff --git a/luxonis_ml/data/parsers/create_ml_parser.py b/luxonis_ml/data/parsers/create_ml_parser.py
index 0677250a..b57be0b9 100644
--- a/luxonis_ml/data/parsers/create_ml_parser.py
+++ b/luxonis_ml/data/parsers/create_ml_parser.py
@@ -92,7 +92,7 @@ def from_split(
         class_names = set()
         images_annotations = []
         for annotations in annotations_data:
-            path = image_dir.absolute() / annotations["image"]
+            path = image_dir.absolute().resolve() / annotations["image"]
             if not path.exists():
                 continue
             file = str(path)
diff --git a/luxonis_ml/data/parsers/segmentation_mask_directory_parser.py b/luxonis_ml/data/parsers/segmentation_mask_directory_parser.py
index 8bac4ad7..ea3088fb 100644
--- a/luxonis_ml/data/parsers/segmentation_mask_directory_parser.py
+++ b/luxonis_ml/data/parsers/segmentation_mask_directory_parser.py
@@ -104,7 +104,7 @@ def from_split(
     def generator() -> DatasetIterator:
         for mask_path in seg_dir.glob("*_mask.*"):
             image_path = next(image_dir.glob(f"{mask_path.stem[:-5]}.*"))
-            file = str(image_path.absolute())
+            file = str(image_path.absolute().resolve())
             mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
 
             ids = np.unique(mask)
diff --git a/luxonis_ml/data/parsers/voc_parser.py b/luxonis_ml/data/parsers/voc_parser.py
index 19c6ed77..453134e2 100644
--- a/luxonis_ml/data/parsers/voc_parser.py
+++ b/luxonis_ml/data/parsers/voc_parser.py
@@ -86,7 +86,9 @@ def from_split(
             annotation_data = ET.parse(anno_xml)
             root = annotation_data.getroot()
 
-            path = image_dir.absolute() / self._xml_find(root, "filename")
+            path = image_dir.absolute().resolve() / self._xml_find(
+                root, "filename"
+            )
             if not path.exists():
                 continue
 
diff --git a/luxonis_ml/data/parsers/yolov4_parser.py b/luxonis_ml/data/parsers/yolov4_parser.py
index c295900f..37cc4092 100644
--- a/luxonis_ml/data/parsers/yolov4_parser.py
+++ b/luxonis_ml/data/parsers/yolov4_parser.py
@@ -98,7 +98,7 @@ def generator() -> DatasetIterator:
                 data = ann_line.split(" ")
                 img_path = data[0]
 
-                path = image_dir.absolute() / img_path
+                path = image_dir.absolute().resolve() / img_path
                 if not path.exists():
                     continue
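-- 
Note (illustration only, not part of the commit): the first hunk uses
typing.overload with Literal so that type checkers infer whether
_get_file_index returns a pl.DataFrame or a pl.LazyFrame from the lazy
flag. A minimal standalone sketch of the same pattern; the function name
and the parquet path here are placeholders, not the dataset's API:

    from typing import Literal, Optional, Union, overload

    import polars as pl

    @overload
    def get_file_index(lazy: Literal[False] = ...) -> Optional[pl.DataFrame]: ...
    @overload
    def get_file_index(lazy: Literal[True] = ...) -> Optional[pl.LazyFrame]: ...

    def get_file_index(
        lazy: bool = False,
    ) -> Optional[Union[pl.DataFrame, pl.LazyFrame]]:
        path = "metadata/file_index.parquet"  # placeholder path for the sketch
        # scan_parquet defers reading (lazy); read_parquet loads eagerly
        df = pl.scan_parquet(path) if lazy else pl.read_parquet(path)
        # drop pandas-style "__index_level_*" columns, as in the patch
        return df.select(pl.all().exclude("^__index_level_.*$"))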
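Note (illustration only, not part of the commit): the reworked duplicate
check groups by (original_filepath, task, type, annotation, instance_id)
and filters out empty "{}" annotations, so only genuinely repeated
annotations are reported, together with how many times they were added.
A toy reproduction of that query on a hand-made frame; the sample rows
and values are invented:

    import polars as pl

    df = pl.LazyFrame(
        {
            "original_filepath": ["/data/a.jpg", "/data/a.jpg", "/data/b.jpg"],
            "task": ["detection"] * 3,
            "type": ["BBoxAnnotation"] * 3,
            "annotation": ['{"x": 0.1}', '{"x": 0.1}', "{}"],
            "instance_id": [-1, -1, -1],
        }
    )

    duplicate_annotation = (
        df.group_by(
            "original_filepath", "task", "type", "annotation", "instance_id"
        )
        .agg(pl.len().alias("count"))
        .filter(pl.col("count") > 1)
        .filter(pl.col("annotation") != "{}")  # "{}" rows are never duplicates
        .drop("instance_id")
    ).collect()

    print(duplicate_annotation)
    # one row: /data/a.jpg, detection, BBoxAnnotation, {"x": 0.1}, count=2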