Skip to content

Commit

Permalink
v1.0.3 fixed issues with encoding, multiple channels
Browse files Browse the repository at this point in the history
  • Loading branch information
NavodPeiris committed Jan 19, 2024
1 parent 58146e0 commit ec8f23b
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 18 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="speechlib",
version="1.0.1",
version="1.0.3",
description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names",
packages=find_packages(),
long_description=long_description,
Expand Down
31 changes: 31 additions & 0 deletions speechlib/convert_to_mono.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import wave
import numpy as np

def convert_to_mono(input_wav):
    """Convert a multi-channel 16-bit PCM WAV file to mono, in place.

    All channels are averaged sample-by-sample and the result is written
    back to *input_wav*. A file that is already mono is left untouched.

    Args:
        input_wav: path to the 16-bit PCM WAV file to convert.

    Raises:
        ValueError: if the file's sample width is not 16-bit. (The original
            code silently misread non-16-bit buffers as int16, corrupting
            the audio.)
    """
    with wave.open(input_wav, 'rb') as input_file:
        params = input_file.getparams()

        if params.nchannels <= 1:
            print(f'{input_wav} is already a mono audio file.')
            return

        # np.int16 below only matches 2-byte samples; anything else would
        # be reinterpreted as garbage, so refuse it explicitly.
        if params.sampwidth != 2:
            raise ValueError(
                f'{input_wav}: unsupported sample width {params.sampwidth}; '
                'only 16-bit PCM is supported'
            )

        frames = input_file.readframes(params.nframes)

    # One row per frame, one column per channel; averaging the columns
    # mixes the channels down to a single mono channel. Round instead of
    # truncating so the downmix doesn't bias toward zero.
    audio_data = np.frombuffer(frames, dtype='<i2')
    mono = np.round(
        np.mean(audio_data.reshape(-1, params.nchannels), axis=1)
    ).astype('<i2')

    # Re-open for writing only after the read handle has been closed, so
    # overwriting the same path works on all platforms (Windows included).
    with wave.open(input_wav, 'wb') as output_file:
        output_file.setparams(
            (1, params.sampwidth, params.framerate, len(mono),
             params.comptype, params.compname)
        )
        output_file.writeframes(mono.tobytes())

    print(f'{input_wav} converted to mono')


12 changes: 11 additions & 1 deletion speechlib/core_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,22 @@
from .speaker_recognition import (speaker_recognition)
from .write_log_file import (write_log_file)

from .re_encode import (re_encode)
from .convert_to_mono import (convert_to_mono)

# by default use google speech-to-text API
# if False, then use whisper finetuned version for sinhala
def core_analysis(file_name, voices_folder, log_folder, language):

# <-------------------Processing file-------------------------->
# <-------------------PreProcessing file-------------------------->

# convert file to mono
convert_to_mono(file_name)

# re-encode file to 16-bit PCM encoding
re_encode(file_name)

# <--------------------running analysis--------------------------->

speaker_tags = []

Expand Down
37 changes: 37 additions & 0 deletions speechlib/re_encode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import wave
import struct

def re_encode(file_name):
    """Re-encode an 8-bit unsigned PCM WAV file as 16-bit signed PCM.

    The converted audio is written to ``file_name + '_16bit.wav'``; the
    original file is left untouched. A file that already has 16-bit
    samples is reported as such, and any other sample width is reported
    as unsupported.

    Args:
        file_name: path to the WAV file to re-encode.
    """
    with wave.open(file_name, 'rb') as original_file:
        params = original_file.getparams()

        if params.sampwidth == 2:
            print("The file already has 16-bit samples.")
        elif params.sampwidth == 1:
            # Read everything in one call: the previous per-frame
            # readframes(1)/unpack loop was very slow and crashed on
            # multi-channel files (a frame is nchannels bytes, but it
            # unpacked exactly one byte per frame).
            raw = original_file.readframes(params.nframes)

            # 8-bit WAV samples are unsigned (0..255) centred on 128,
            # while 16-bit samples are signed: shift to signed, then
            # scale by 256 to use the full 16-bit range.
            converted = struct.pack(
                "<%dh" % len(raw), *((b - 128) * 256 for b in raw)
            )

            file_name = file_name + '_16bit.wav'
            with wave.open(file_name, 'wb') as new_file:
                # Keep channel count and frame rate; only the sample
                # width changes (the old code forced mono, silently
                # corrupting stereo input).
                new_file.setparams(params)
                new_file.setsampwidth(2)
                new_file.writeframes(converted)

            print("Conversion completed. Saved as " + file_name)
        else:
            print("Unsupported sample width.")
25 changes: 14 additions & 11 deletions speechlib/speaker_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,20 @@ def speaker_recognition(file_name, voices_folder, segments, wildcards):
for voice in voices:
voice_file = voices_folder + "/" + speaker + "/" + voice

# compare voice file with audio file
score, prediction = verification.verify_files(voice_file, file)
prediction = prediction[0].item()
score = score[0].item()

if prediction == True:
if score >= max_score:
max_score = score
speakerId = speaker.split(".")[0]
if speakerId not in wildcards: # speaker_00 cannot be speaker_01
person = speakerId
try:
# compare voice file with audio file
score, prediction = verification.verify_files(voice_file, file)
prediction = prediction[0].item()
score = score[0].item()

if prediction == True:
if score >= max_score:
max_score = score
speakerId = speaker.split(".")[0]
if speakerId not in wildcards: # speaker_00 cannot be speaker_01
person = speakerId
except:
pass

Id_count[person] += 1

Expand Down
12 changes: 7 additions & 5 deletions speechlib/wav_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ def wav_file_segmentation(file_name, segments, language):
file = folder_name + "/" + "segment"+ str(i) + ".wav"
clip.export(file, format="wav")

trans = transcribe(file, language)

# return -> [[start time, end time, transcript], [start time, end time, transcript], ..]
texts.append([segment[0], segment[1], trans])

try:
trans = transcribe(file, language)

# return -> [[start time, end time, transcript], [start time, end time, transcript], ..]
texts.append([segment[0], segment[1], trans])
except:
pass
# Delete the WAV file after processing
os.remove(file)

Expand Down

0 comments on commit ec8f23b

Please sign in to comment.