Skip to content

Commit

Permalink
added out directory
Browse files (browse the repository at this point in the history)
  • Loading branch information
skewballfox committed Jan 3, 2022
1 parent a07ce16 commit 5cb7bfc
Show file tree
Hide file tree
Showing 4 changed files with 306 additions and 125 deletions.
2 changes: 1 addition & 1 deletion data_generation_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def data_generation_flow(
)
)
noise_destination_directory = (
wakeword_model_name + "/random/non-utterances/pdsounds_march2009/"
"out/" + wakeword_model_name + "/random/non-utterances/pdsounds_march2009/"
)
basic_file_operations_instance.convert_mp3s_in_directory_to_wavs(
noise_directory, noise_destination_directory
Expand Down
150 changes: 106 additions & 44 deletions further_data_generation_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,77 +10,139 @@
# TODO: input for flow to add one or multiple directories (make list of datasets and links for IDE)
# TODO: refactor the code to use the basic_file_operations_instance more (the difficulty is that files are either copied as-is when they are already wav, or converted from mp3 to wav and then copied)

def convert_mp3s_to_wavs_split_into_multiple_directories(source_directory, destination_directory, max_files_from_source_directory, max_files_per_destination_directory):
files = basic_file_operations_instance.get_limited_number_of_files(source_directory, max_files_from_source_directory)
print(f'{len(files)} files found in {source_directory}')

def convert_mp3s_to_wavs_split_into_multiple_directories(
source_directory,
destination_directory,
max_files_from_source_directory,
max_files_per_destination_directory,
):
files = basic_file_operations_instance.get_limited_number_of_files(
source_directory, max_files_from_source_directory
)
print(f"{len(files)} files found in {source_directory}")
directory_number = 1
file_count = 1
number_of_directories = int(len(files) / max_files_per_destination_directory) + 1

def check_for_wav_files(files, source_directory):
if all(file.endswith('.wav') for file in files):
print(f'All files in {source_directory} are already in wav format')
if all(file.endswith(".wav") for file in files):
print(f"All files in {source_directory} are already in wav format")
return True
else:
print(f'All files in {source_directory} are probably in mp3 format and will be converted to wav')
print(
f"All files in {source_directory} are probably in mp3 format and will be converted to wav"
)
return False

check_for_wav_files = check_for_wav_files(files, source_directory)

def split_files_into_multiple_directories(files, directory_number, file_count, number_of_directories):

def split_files_into_multiple_directories(
files, directory_number, file_count, number_of_directories
):
# Will copy if files are already in wav format, otherwise will convert to wav and then copy
for file in files:
if number_of_directories > 1:
#print(f'Number of directories: {number_of_directories}')
destination_slug = '0' + str(directory_number) + '/'
basic_file_operations_instance.make_directory(destination_directory + destination_slug)
# print(f'Number of directories: {number_of_directories}')
destination_slug = "0" + str(directory_number) + "/"
basic_file_operations_instance.make_directory(
destination_directory + destination_slug
)
else:
destination_slug = ''
destination_slug = ""
if file_count < max_files_per_destination_directory:
if check_for_wav_files:
basic_file_operations_instance.copy_file(file, source_directory, destination_directory + destination_slug)
basic_file_operations_instance.copy_file(
file, source_directory, destination_directory + destination_slug
)
else:
basic_file_operations_instance.convert_mp3_to_wav(file, source_directory, destination_directory + destination_slug)
basic_file_operations_instance.convert_mp3_to_wav(
file, source_directory, destination_directory + destination_slug
)
file_count += 1
elif file_count == max_files_per_destination_directory:
if check_for_wav_files:
basic_file_operations_instance.copy_file(file, source_directory, destination_directory + destination_slug)
basic_file_operations_instance.copy_file(
file, source_directory, destination_directory + destination_slug
)
else:
basic_file_operations_instance.convert_mp3_to_wav(file, source_directory, destination_directory + destination_slug)
basic_file_operations_instance.convert_mp3_to_wav(
file, source_directory, destination_directory + destination_slug
)
directory_number += 1
file_count = 1
return number_of_directories
return split_files_into_multiple_directories(files, directory_number, file_count, number_of_directories)

def incremental_train_over_number_of_directories(number_of_directories, wakeword_model_name, destination_directory):

return split_files_into_multiple_directories(
files, directory_number, file_count, number_of_directories
)


def incremental_train_over_number_of_directories(
number_of_directories, wakeword_model_name, destination_directory
):
if number_of_directories > 1:
for directory_number in range(1, number_of_directories + 1):
current_model_info = incremental_training_flow(destination_directory + '0' + str(directory_number) + '/', wakeword_model_name, epochs='10')
current_model_info = incremental_training_flow(
destination_directory + "0" + str(directory_number) + "/",
wakeword_model_name,
epochs="10",
)
else:
current_model_info = incremental_training_flow(destination_directory, wakeword_model_name, epochs='20')
print(f'Current model info: {current_model_info}')
current_model_info = incremental_training_flow(
destination_directory, wakeword_model_name, epochs="20"
)
print(f"Current model info: {current_model_info}")
return current_model_info

def further_data_generation_flow(directories_to_process, extra_audio_directories_labels, max_files_from_source_directory, max_files_per_destination_directory, wakeword_model_name):
for source_directory, extra_audio_directories_label in zip(directories_to_process, extra_audio_directories_labels):
destination_directory = wakeword_model_name + '/' + 'random/' + extra_audio_directories_label + '/' + source_directory.split('/')[-2] + '/'


def further_data_generation_flow(
directories_to_process,
extra_audio_directories_labels,
max_files_from_source_directory,
max_files_per_destination_directory,
wakeword_model_name,
):
for source_directory, extra_audio_directories_label in zip(
directories_to_process, extra_audio_directories_labels
):
destination_directory = (
"out/"
+ wakeword_model_name
+ "/"
+ "random/"
+ extra_audio_directories_label
+ "/"
+ source_directory.split("/")[-2]
+ "/"
)

basic_file_operations_instance.make_directory(destination_directory)
number_of_directories = convert_mp3s_to_wavs_split_into_multiple_directories(source_directory, destination_directory, max_files_from_source_directory, max_files_per_destination_directory)
print(f'All files from {source_directory} have been copied to {destination_directory}')

current_model_info = incremental_train_over_number_of_directories(number_of_directories, wakeword_model_name, destination_directory)

print(f'Training complete for {source_directory}')
print(f'Current model info: {current_model_info}')

final_model_info = train_model_flow(wakeword_model_name, epochs='50')

print('Final training complete')
#TODO: get number of epochs from the model info
#TODO: train a model on the whole dataset with the same number of epochs as the last model
#TODO: compare the results (done by hand: about 10% increase in accuracy on test set, not bad!)
print(f'Final model info: {final_model_info}')
print(f'Make sure to test your model in precise-listen {wakeword_model_name}.net')
number_of_directories = convert_mp3s_to_wavs_split_into_multiple_directories(
source_directory,
destination_directory,
max_files_from_source_directory,
max_files_per_destination_directory,
)
print(
f"All files from {source_directory} have been copied to {destination_directory}"
)

current_model_info = incremental_train_over_number_of_directories(
number_of_directories, wakeword_model_name, destination_directory
)

print(f"Training complete for {source_directory}")
print(f"Current model info: {current_model_info}")

final_model_info = train_model_flow(wakeword_model_name, epochs="50")

print("Final training complete")
# TODO: get number of epochs from the model info
# TODO: train a model on the whole dataset with the same number of epochs as the last model
# TODO: compare the results (done by hand: about 10% increase in accuracy on test set, not bad!)
print(f"Final model info: {final_model_info}")
print(f"Make sure to test your model in precise-listen {wakeword_model_name}.net")


# TESTING:
Expand All @@ -106,4 +168,4 @@ def further_data_generation_flow(directories_to_process, extra_audio_directories
directory_number = 1
file_count = 1
number_of_directories = int(len(files) / max_files_per_destination_directory) + 1
print(f'{number_of_directories} directories will be created')"""
print(f'{number_of_directories} directories will be created')"""
Loading

0 comments on commit 5cb7bfc

Please sign in to comment.