Skip to content

Commit

Permalink
v1.0.3 fixed issues with encoding, multiple channels
Browse files Browse the repository at this point in the history
  • Loading branch information
NavodPeiris committed Jan 19, 2024
1 parent 58146e0 commit ec8f23b
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 18 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="speechlib",
version="1.0.1",
version="1.0.3",
description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names",
packages=find_packages(),
long_description=long_description,
Expand Down
31 changes: 31 additions & 0 deletions speechlib/convert_to_mono.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import wave
import numpy as np

def convert_to_mono(input_wav):
    """Convert a multi-channel 16-bit PCM WAV file to mono, in place.

    All channels are averaged sample-by-sample and the result is written
    back to *input_wav*. A file that is already mono is left untouched.

    Args:
        input_wav: path to the 16-bit PCM WAV file to convert.

    Raises:
        ValueError: if the file's sample width is not 16-bit. (The original
            code silently misread non-16-bit buffers as int16, corrupting
            the audio.)
    """
    with wave.open(input_wav, 'rb') as input_file:
        params = input_file.getparams()

        if params.nchannels <= 1:
            print(f'{input_wav} is already a mono audio file.')
            return

        # np.int16 below only matches 2-byte samples; anything else would
        # be reinterpreted as garbage, so refuse it explicitly.
        if params.sampwidth != 2:
            raise ValueError(
                f'{input_wav}: unsupported sample width {params.sampwidth}; '
                'only 16-bit PCM is supported'
            )

        frames = input_file.readframes(params.nframes)

    # One row per frame, one column per channel; averaging the columns
    # mixes the channels down to a single mono channel. Round instead of
    # truncating so the downmix doesn't bias toward zero.
    audio_data = np.frombuffer(frames, dtype='<i2')
    mono = np.round(
        np.mean(audio_data.reshape(-1, params.nchannels), axis=1)
    ).astype('<i2')

    # Re-open for writing only after the read handle has been closed, so
    # overwriting the same path works on all platforms (Windows included).
    with wave.open(input_wav, 'wb') as output_file:
        output_file.setparams(
            (1, params.sampwidth, params.framerate, len(mono),
             params.comptype, params.compname)
        )
        output_file.writeframes(mono.tobytes())

    print(f'{input_wav} converted to mono')


12 changes: 11 additions & 1 deletion speechlib/core_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,22 @@
from .speaker_recognition import (speaker_recognition)
from .write_log_file import (write_log_file)

from .re_encode import (re_encode)
from .convert_to_mono import (convert_to_mono)

# by default use google speech-to-text API
# if False, then use whisper finetuned version for sinhala
def core_analysis(file_name, voices_folder, log_folder, language):

# <-------------------Processing file-------------------------->
# <-------------------PreProcessing file-------------------------->

# convert file to mono
convert_to_mono(file_name)

# re-encode file to 16-bit PCM encoding
re_encode(file_name)

# <--------------------running analysis--------------------------->

speaker_tags = []

Expand Down
37 changes: 37 additions & 0 deletions speechlib/re_encode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import wave
import struct

def re_encode(file_name):
    """Re-encode an 8-bit unsigned PCM WAV file as 16-bit signed PCM.

    The converted audio is written to ``file_name + '_16bit.wav'``; the
    original file is left untouched. A file that already has 16-bit
    samples is reported as such, and any other sample width is reported
    as unsupported.

    Args:
        file_name: path to the WAV file to re-encode.
    """
    with wave.open(file_name, 'rb') as original_file:
        params = original_file.getparams()

        if params.sampwidth == 2:
            print("The file already has 16-bit samples.")
        elif params.sampwidth == 1:
            # Read everything in one call: the previous per-frame
            # readframes(1)/unpack loop was very slow and crashed on
            # multi-channel files (a frame is nchannels bytes, but it
            # unpacked exactly one byte per frame).
            raw = original_file.readframes(params.nframes)

            # 8-bit WAV samples are unsigned (0..255) centred on 128,
            # while 16-bit samples are signed: shift to signed, then
            # scale by 256 to use the full 16-bit range.
            converted = struct.pack(
                "<%dh" % len(raw), *((b - 128) * 256 for b in raw)
            )

            file_name = file_name + '_16bit.wav'
            with wave.open(file_name, 'wb') as new_file:
                # Keep channel count and frame rate; only the sample
                # width changes (the old code forced mono, silently
                # corrupting stereo input).
                new_file.setparams(params)
                new_file.setsampwidth(2)
                new_file.writeframes(converted)

            print("Conversion completed. Saved as " + file_name)
        else:
            print("Unsupported sample width.")
25 changes: 14 additions & 11 deletions speechlib/speaker_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,20 @@ def speaker_recognition(file_name, voices_folder, segments, wildcards):
for voice in voices:
voice_file = voices_folder + "/" + speaker + "/" + voice

# compare voice file with audio file
score, prediction = verification.verify_files(voice_file, file)
prediction = prediction[0].item()
score = score[0].item()

if prediction == True:
if score >= max_score:
max_score = score
speakerId = speaker.split(".")[0]
if speakerId not in wildcards: # speaker_00 cannot be speaker_01
person = speakerId
try:
# compare voice file with audio file
score, prediction = verification.verify_files(voice_file, file)
prediction = prediction[0].item()
score = score[0].item()

if prediction == True:
if score >= max_score:
max_score = score
speakerId = speaker.split(".")[0]
if speakerId not in wildcards: # speaker_00 cannot be speaker_01
person = speakerId
except:
pass

Id_count[person] += 1

Expand Down
12 changes: 7 additions & 5 deletions speechlib/wav_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ def wav_file_segmentation(file_name, segments, language):
file = folder_name + "/" + "segment"+ str(i) + ".wav"
clip.export(file, format="wav")

trans = transcribe(file, language)

# return -> [[start time, end time, transcript], [start time, end time, transcript], ..]
texts.append([segment[0], segment[1], trans])

try:
trans = transcribe(file, language)

# return -> [[start time, end time, transcript], [start time, end time, transcript], ..]
texts.append([segment[0], segment[1], trans])
except:
pass
# Delete the WAV file after processing
os.remove(file)

Expand Down

0 comments on commit ec8f23b

Please sign in to comment.