Merge pull request #11 from siesto1elemento/my-feature-branch

added common voice versions and some fixes in other formats
midas-research · Oct 30, 2024 · f767ced · f767ced
2 parents 9e73148 + c005331
commit f767ced
Show file tree

Hide file tree

Showing 7 changed files with 503 additions and 59 deletions.
diff --git a/cvat/apps/dataset_manager/formats/LibriVox.py b/cvat/apps/dataset_manager/formats/LibriVox.py
@@ -2,6 +2,7 @@
 import os.path as osp
 import zipfile
 import csv
+import math
 from django.db import transaction
 from glob import glob
 from pydub import AudioSegment
@@ -21,11 +22,24 @@ def calculate_duration(row):
     return end_time - start_time
 
 
-def split_rows_by_time(all_rows, time_threshold=600):
+def split_rows_by_time(all_rows, clips_folder, time_threshold=600):
     result = []
 
     total_duration = 0
 
+    for row in all_rows:
+        if not row.get("start") or not row.get("end"):
+            audio_file_name = row["file"]
+            audio_file_path = os.path.join(clips_folder, audio_file_name)
+
+            if os.path.isfile(audio_file_path):
+                audio_segment = AudioSegment.from_file(audio_file_path)
+                audio_duration = audio_segment.duration_seconds
+
+                # Set start to 0 if missing, and end to the audio duration
+                row["start"] = row.get("start", "0")
+                row["end"] = row.get("end", str(audio_duration))
+
     for row in all_rows:
         start_time = float(row["start"])
         end_time = float(row["end"])
@@ -198,9 +212,9 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                 num_tsv_rows = len(tsv_rows)
                 num_clips = len(os.listdir(clips_folder))
 
-                if num_tsv_rows != num_clips:
+                if num_tsv_rows > num_clips:
                     raise ValueError(
-                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. Clips must be equal or more."
                     )
 
             # Combined audio that will be the final output
@@ -221,6 +235,8 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                         combined_audio += (
                             audio_segment  # Append the audio in the order from TSV
                         )
+                    else:
+                        raise FileNotFoundError(f"File not found: {file_path}")
 
             # Create raw folder to store combined audio
             raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
@@ -254,7 +270,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                 reader = csv.DictReader(tsvfile, delimiter="\t")
                 all_rows = list(reader)
 
-                new_rows = split_rows_by_time(all_rows)
+            new_rows = split_rows_by_time(all_rows, clips_folder)
 
             jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")
 
@@ -361,7 +377,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
 
                     record_index += 1
                     total_duration = round(end_time, 2)
-                    if 599.9 <= total_duration <= 600:
+                    if math.isclose(total_duration, 600, abs_tol=1e-6):
                         break
 
         else:

diff --git a/cvat/apps/dataset_manager/formats/VCTK_Corpus.py b/cvat/apps/dataset_manager/formats/VCTK_Corpus.py
@@ -2,6 +2,7 @@
 import os.path as osp
 import zipfile
 import csv
+import math
 from django.db import transaction
 from glob import glob
 from pydub import AudioSegment
@@ -21,11 +22,24 @@ def calculate_duration(row):
     return end_time - start_time
 
 
-def split_rows_by_time(all_rows, time_threshold=600):
+def split_rows_by_time(all_rows, clips_folder, time_threshold=600):
     result = []
 
     total_duration = 0
 
+    for row in all_rows:
+        if not row.get("start") or not row.get("end"):
+            audio_file_name = row["file"]
+            audio_file_path = os.path.join(clips_folder, audio_file_name)
+
+            if os.path.isfile(audio_file_path):
+                audio_segment = AudioSegment.from_file(audio_file_path)
+                audio_duration = audio_segment.duration_seconds
+
+                # Set start to 0 if missing, and end to the audio duration
+                row["start"] = row.get("start", "0")
+                row["end"] = row.get("end", str(audio_duration))
+
     for row in all_rows:
         start_time = float(row["start"])
         end_time = float(row["end"])
@@ -198,9 +212,9 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                 num_tsv_rows = len(tsv_rows)
                 num_clips = len(os.listdir(clips_folder))
 
-                if num_tsv_rows != num_clips:
+                if num_tsv_rows > num_clips:
                     raise ValueError(
-                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. Clips must be equal or more."
                     )
 
             # Combined audio that will be the final output
@@ -221,6 +235,8 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                         combined_audio += (
                             audio_segment  # Append the audio in the order from TSV
                         )
+                    else:
+                        raise FileNotFoundError(f"File not found: {file_path}")
 
             # Create raw folder to store combined audio
             raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
@@ -254,7 +270,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                 reader = csv.DictReader(tsvfile, delimiter="\t")
                 all_rows = list(reader)
 
-                new_rows = split_rows_by_time(all_rows)
+            new_rows = split_rows_by_time(all_rows, clips_folder)
 
             jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")
 
@@ -361,7 +377,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
 
                     record_index += 1
                     total_duration = round(end_time, 2)
-                    if 599.9 <= total_duration <= 600:
+                    if math.isclose(total_duration, 600, abs_tol=1e-6):
                         break
 
         else:

diff --git a/cvat/apps/dataset_manager/formats/VoxCeleb.py b/cvat/apps/dataset_manager/formats/VoxCeleb.py
@@ -2,6 +2,7 @@
 import os.path as osp
 import zipfile
 import csv
+import math
 from django.db import transaction
 from glob import glob
 from pydub import AudioSegment
@@ -21,11 +22,24 @@ def calculate_duration(row):
     return end_time - start_time
 
 
-def split_rows_by_time(all_rows, time_threshold=600):
+def split_rows_by_time(all_rows, clips_folder, time_threshold=600):
     result = []
 
     total_duration = 0
 
+    for row in all_rows:
+        if not row.get("start") or not row.get("end"):
+            audio_file_name = row["file"]
+            audio_file_path = os.path.join(clips_folder, audio_file_name)
+
+            if os.path.isfile(audio_file_path):
+                audio_segment = AudioSegment.from_file(audio_file_path)
+                audio_duration = audio_segment.duration_seconds
+
+                # Set start to 0 if missing, and end to the audio duration
+                row["start"] = row.get("start", "0")
+                row["end"] = row.get("end", str(audio_duration))
+
     for row in all_rows:
         start_time = float(row["start"])
         end_time = float(row["end"])
@@ -198,9 +212,9 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                 num_tsv_rows = len(tsv_rows)
                 num_clips = len(os.listdir(clips_folder))
 
-                if num_tsv_rows != num_clips:
+                if num_tsv_rows > num_clips:
                     raise ValueError(
-                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. Clips must be equal or more."
                     )
 
             # Combined audio that will be the final output
@@ -221,6 +235,8 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                         combined_audio += (
                             audio_segment  # Append the audio in the order from TSV
                         )
+                    else:
+                        raise FileNotFoundError(f"File not found: {file_path}")
 
             # Create raw folder to store combined audio
             raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
@@ -254,7 +270,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                 reader = csv.DictReader(tsvfile, delimiter="\t")
                 all_rows = list(reader)
 
-                new_rows = split_rows_by_time(all_rows)
+            new_rows = split_rows_by_time(all_rows, clips_folder)
 
             jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")
 
@@ -361,7 +377,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
 
                     record_index += 1
                     total_duration = round(end_time, 2)
-                    if 599.9 <= total_duration <= 600:
+                    if math.isclose(total_duration, 600, abs_tol=1e-6):
                         break
 
         else:

diff --git a/cvat/apps/dataset_manager/formats/Voxpopuli.py b/cvat/apps/dataset_manager/formats/Voxpopuli.py
@@ -2,6 +2,7 @@
 import os.path as osp
 import zipfile
 import csv
+import math
 from django.db import transaction
 from glob import glob
 from pydub import AudioSegment
@@ -21,11 +22,24 @@ def calculate_duration(row):
     return end_time - start_time
 
 
-def split_rows_by_time(all_rows, time_threshold=600):
+def split_rows_by_time(all_rows, clips_folder, time_threshold=600):
     result = []
 
     total_duration = 0
 
+    for row in all_rows:
+        if not row.get("start") or not row.get("end"):
+            audio_file_name = row["audio_path"]
+            audio_file_path = os.path.join(clips_folder, audio_file_name)
+
+            if os.path.isfile(audio_file_path):
+                audio_segment = AudioSegment.from_file(audio_file_path)
+                audio_duration = audio_segment.duration_seconds
+
+                # Set start to 0 if missing, and end to the audio duration
+                row["start"] = row.get("start", "0")
+                row["end"] = row.get("end", str(audio_duration))
+
     for row in all_rows:
         start_time = float(row["start"])
         end_time = float(row["end"])
@@ -198,9 +212,9 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                 num_tsv_rows = len(tsv_rows)
                 num_clips = len(os.listdir(clips_folder))
 
-                if num_tsv_rows != num_clips:
+                if num_tsv_rows > num_clips:
                     raise ValueError(
-                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. Clips must be equal or more."
                     )
 
             # Combined audio that will be the final output
@@ -221,6 +235,8 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                         combined_audio += (
                             audio_segment  # Append the audio in the order from TSV
                         )
+                    else:
+                        raise FileNotFoundError(f"File not found: {file_path}")
 
             # Create raw folder to store combined audio
             raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
@@ -254,7 +270,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
                 reader = csv.DictReader(tsvfile, delimiter="\t")
                 all_rows = list(reader)
 
-                new_rows = split_rows_by_time(all_rows)
+            new_rows = split_rows_by_time(all_rows, clips_folder)
 
             jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")
 
@@ -361,7 +377,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
 
                     record_index += 1
                     total_duration = round(end_time, 2)
-                    if 599.9 <= total_duration <= 600:
+                    if math.isclose(total_duration, 600, abs_tol=1e-6):
                         break
 
         else: