Improved Duplicate Warnings (#184)
kozlov721 authored Oct 4, 2024
1 parent 0183a45 commit de1d69f
Showing 9 changed files with 67 additions and 26 deletions.
71 changes: 54 additions & 17 deletions luxonis_ml/data/datasets/luxonis_dataset.py
@@ -199,14 +199,30 @@ def _load_df_offline(
         dfs = [pl.read_parquet(file) for file in path.glob("*.parquet")]
         return pl.concat(dfs) if dfs else None
 
-    def _get_file_index(self) -> Optional[pl.DataFrame]:
+    @overload
+    def _get_file_index(
+        self, lazy: Literal[False] = ...
+    ) -> Optional[pl.DataFrame]: ...
+
+    @overload
+    def _get_file_index(
+        self, lazy: Literal[True] = ...
+    ) -> Optional[pl.LazyFrame]: ...
+
+    def _get_file_index(
+        self, lazy: bool = False
+    ) -> Optional[Union[pl.DataFrame, pl.LazyFrame]]:
         path = get_file(
             self.fs, "metadata/file_index.parquet", self.media_path
         )
         if path is not None and path.exists():
-            return pl.read_parquet(path).select(
-                pl.all().exclude("^__index_level_.*$")
-            )
+            if not lazy:
+                df = pl.read_parquet(path)
+            else:
+                df = pl.scan_parquet(path)
+
+            return df.select(pl.all().exclude("^__index_level_.*$"))
 
         return None
 
     def _write_index(
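
Note: the @overload/Literal pair above lets a type checker infer the precise return type from the lazy flag: _get_file_index() is typed as Optional[pl.DataFrame], while _get_file_index(lazy=True) is typed as Optional[pl.LazyFrame]. A minimal standalone sketch of the same idiom (the load_table helper is hypothetical, not part of this repo):

from pathlib import Path
from typing import Literal, Optional, Union, overload

import polars as pl


@overload
def load_table(path: Path, lazy: Literal[False] = ...) -> Optional[pl.DataFrame]: ...


@overload
def load_table(path: Path, lazy: Literal[True] = ...) -> Optional[pl.LazyFrame]: ...


def load_table(
    path: Path, lazy: bool = False
) -> Optional[Union[pl.DataFrame, pl.LazyFrame]]:
    if not path.exists():
        return None
    # scan_parquet defers all I/O until .collect() is called on the LazyFrame.
    df = pl.scan_parquet(path) if lazy else pl.read_parquet(path)
    # Works on both frame types: drop pandas-style "__index_level_*" columns.
    return df.select(pl.all().exclude("^__index_level_.*$"))

At runtime the overloads are no-ops; only the final definition executes.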
@@ -438,7 +454,7 @@ def _process_arrays(self, batch_data: List[DatasetRecord]) -> None:
                 uuid_dict[str(ann.path)] = uuid
                 ann.path = Path(uuid).with_suffix(ann.path.suffix)
             else:
-                ann.path = ann.path.absolute()
+                ann.path = ann.path.absolute().resolve()
         self.progress.stop()
         self.progress.remove_task(task)
         if self.is_remote:
@@ -496,7 +512,7 @@ def _add_process_batch(
                 new_index["uuid"].append(uuid)
                 new_index["file"].append(file)
                 new_index["original_filepath"].append(
-                    str(filepath.absolute())
+                    str(filepath.absolute().resolve())
                 )
                 processed_uuids.add(uuid)
 
@@ -514,7 +530,9 @@ def add(
 
         batch_data: list[DatasetRecord] = []
 
-        classes_per_task: Dict[str, OrderedSet[str]] = defaultdict(OrderedSet)
+        classes_per_task: Dict[str, OrderedSet[str]] = defaultdict(
+            lambda: OrderedSet([])
+        )
         num_kpts_per_task: Dict[str, int] = {}
 
         annotations_path = get_dir(
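
Note: replacing defaultdict(OrderedSet) with defaultdict(lambda: OrderedSet([])) makes the default factory an explicit zero-argument callable that returns a fresh, empty set. A quick sketch (assuming the ordered-set package's OrderedSet; the task name is made up):

from collections import defaultdict

from ordered_set import OrderedSet

classes_per_task: defaultdict[str, OrderedSet[str]] = defaultdict(
    lambda: OrderedSet([])  # always yields a new, empty OrderedSet
)
classes_per_task["detection"].add("car")
print(classes_per_task["detection"])  # OrderedSet(['car'])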
Expand Down Expand Up @@ -584,36 +602,55 @@ def add(

def _warn_on_duplicates(self) -> None:
df = self._load_df_offline(lazy=True)
if df is None:
index_df = self._get_file_index(lazy=True)
if df is None or index_df is None:
return
df = df.join(index_df, on="uuid").drop("file_right")
# Warn on duplicate UUIDs
duplicates_paired = (
df.group_by("uuid")
.agg(pl.col("file").n_unique().alias("file_count"))
.filter(pl.col("file_count") > 1)
.join(df, on="uuid")
.select(["uuid", "file"])
.select("uuid", "file")
.unique()
.group_by("uuid")
.agg([pl.col("file").alias("files")])
.agg(pl.col("file").alias("files"))
.filter(pl.col("files").len() > 1)
.collect()
)
duplicates_paired_df = duplicates_paired.collect()
for uuid, files in duplicates_paired_df.iter_rows():
for uuid, files in duplicates_paired.iter_rows():
self.logger.warning(
f"UUID: {uuid} has multiple file names: {files}"
)

# Warn on duplicate annotations
duplicate_annotation = (
df.group_by(["file", "annotation"])
df.group_by(
"original_filepath",
"task",
"type",
"annotation",
"instance_id",
)
.agg(pl.len().alias("count"))
.filter(pl.col("count") > 1)
)
duplicate_annotation_df = duplicate_annotation.collect()
for file_name, annotation, _ in duplicate_annotation_df.iter_rows():
.filter(pl.col("annotation") != "{}")
.drop("instance_id")
).collect()

for (
file_name,
task,
type_,
annotation,
count,
) in duplicate_annotation.iter_rows():
if "RLE" in type_ or "Mask" in type_:
annotation = "<binary mask>"
self.logger.warning(
f"File '{file_name}' has the same annotation '{annotation}' added multiple times."
f"File '{file_name}' has the same '{type_}' annotation "
f"'{annotation}' ({task=}) added {count} times."
)

def get_splits(self) -> Optional[Dict[str, List[str]]]:
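
Note: the reworked duplicate-annotation check boils down to a group-by count over the annotation key columns. A self-contained sketch with toy data (column names follow the diff; the rows and type string are illustrative only):

import polars as pl

# Toy stand-in for the annotations frame joined with the file index above.
df = pl.LazyFrame(
    {
        "original_filepath": ["/data/a.jpg", "/data/a.jpg", "/data/b.jpg"],
        "task": ["detection"] * 3,
        "type": ["BoundingBoxAnnotation"] * 3,
        "annotation": ['{"x": 0.1}', '{"x": 0.1}', '{"x": 0.5}'],
        "instance_id": [-1, -1, -1],
    }
)

duplicates = (
    df.group_by(
        "original_filepath", "task", "type", "annotation", "instance_id"
    )
    .agg(pl.len().alias("count"))
    .filter(pl.col("count") > 1)
    .filter(pl.col("annotation") != "{}")  # empty annotations are not duplicates
    .drop("instance_id")
).collect()

for file_name, task, type_, annotation, count in duplicates.iter_rows():
    print(
        f"File '{file_name}' has the same '{type_}' annotation "
        f"'{annotation}' ({task=}) added {count} times."
    )

This prints one warning for /data/a.jpg (count 2) and nothing for /data/b.jpg.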
2 changes: 1 addition & 1 deletion luxonis_ml/data/datasets/utils.py
@@ -86,7 +86,7 @@ def find_filepath_uuid(
     if index is None:
         return None
 
-    abs_path = str(Path(filepath).absolute())
+    abs_path = str(Path(filepath).absolute().resolve())
     matched = index.filter(pl.col("original_filepath") == abs_path)
 
     if len(matched):
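
Note on the recurring .absolute() → .absolute().resolve() change (here and in the parsers below): Path.absolute() only prepends the working directory, leaving ".." segments and symlinks intact, so the same file could be recorded under two different original_filepath strings and then miss the index lookup above. resolve() normalizes the path to its canonical form. A quick illustration (paths are hypothetical):

from pathlib import Path

p = Path("data/../data/img.png")
print(p.absolute())  # e.g. /home/user/data/../data/img.png  ("..", symlinks kept)
print(p.resolve())   # e.g. /home/user/data/img.png          (canonical form)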
6 changes: 4 additions & 2 deletions luxonis_ml/data/loaders/luxonis_loader.py
@@ -84,7 +84,7 @@ def __init__(
         for view in self.view:
             self.instances.extend(splits[view])
 
-        self.idx_to_df_row = []
+        self.idx_to_df_row: list[list[int]] = []
         for uuid in self.instances:
             boolean_mask = df["uuid"] == uuid
             row_indexes = boolean_mask.arg_true().to_list()
@@ -139,7 +139,9 @@ def __getitem__(self, idx: int) -> LuxonisLoaderOutput:
         else:
             picked_indices = set()
             max_val = len(self)
-            while len(picked_indices) < self.augmentations.aug_batch_size - 1:
+            while (
+                len(picked_indices) < self.augmentations.aug_batch_size - 1
+            ):
                 rand_idx = random.randint(0, max_val - 1)
                 if rand_idx != idx and rand_idx not in picked_indices:
                     picked_indices.add(rand_idx)
2 changes: 1 addition & 1 deletion luxonis_ml/data/parsers/classification_directory_parser.py
@@ -77,7 +77,7 @@ def generator() -> DatasetIterator:
            for class_name in class_names:
                for img_path in (class_dir / class_name).iterdir():
                    yield {
-                        "file": str(img_path.absolute()),
+                        "file": str(img_path.absolute().resolve()),
                        "annotation": {
                            "type": "classification",
                            "class": class_name,
2 changes: 1 addition & 1 deletion luxonis_ml/data/parsers/coco_parser.py
@@ -226,7 +226,7 @@ def generator() -> DatasetIterator:
                    ann_dict[img_id].append(ann)
 
            for img_id, img in img_dict.items():
-                path = image_dir.absolute() / img["file_name"]
+                path = image_dir.absolute().resolve() / img["file_name"]
                if not path.exists():
                    continue
                path = str(path)
2 changes: 1 addition & 1 deletion luxonis_ml/data/parsers/create_ml_parser.py
@@ -92,7 +92,7 @@ def from_split(
        class_names = set()
        images_annotations = []
        for annotations in annotations_data:
-            path = image_dir.absolute() / annotations["image"]
+            path = image_dir.absolute().resolve() / annotations["image"]
            if not path.exists():
                continue
            file = str(path)
2 changes: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ def from_split(
        def generator() -> DatasetIterator:
            for mask_path in seg_dir.glob("*_mask.*"):
                image_path = next(image_dir.glob(f"{mask_path.stem[:-5]}.*"))
-                file = str(image_path.absolute())
+                file = str(image_path.absolute().resolve())
                mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
 
                ids = np.unique(mask)
4 changes: 3 additions & 1 deletion luxonis_ml/data/parsers/voc_parser.py
@@ -86,7 +86,9 @@ def from_split(
            annotation_data = ET.parse(anno_xml)
            root = annotation_data.getroot()
 
-            path = image_dir.absolute() / self._xml_find(root, "filename")
+            path = image_dir.absolute().resolve() / self._xml_find(
+                root, "filename"
+            )
            if not path.exists():
                continue
 
2 changes: 1 addition & 1 deletion luxonis_ml/data/parsers/yolov4_parser.py
@@ -98,7 +98,7 @@ def generator() -> DatasetIterator:
                data = ann_line.split(" ")
                img_path = data[0]
 
-                path = image_dir.absolute() / img_path
+                path = image_dir.absolute().resolve() / img_path
                if not path.exists():
                    continue
 
