Commit 2b0dc6c

Update to TF 1.15
Cleanup configs
Include one sample from ENST for demo purposes

f90 committed May 4, 2020
1 parent 54cd803 commit 2b0dc6c
Showing 37 changed files with 82 additions and 203 deletions.
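Most of the TensorFlow changes below are mechanical renames for the 1.15 API. As a quick reference, a sketch of the mapping applied in this commit, collected from the diffs that follow (the librosa entry is a separate dependency change, not a TF one):

# TF 1.15 moves I/O, random and signal helpers to tf.io / tf.random / tf.signal,
# and keeps the remaining graph-mode symbols under tf.compat.v1.
RENAMES = {
    "tf.random_uniform": "tf.random.uniform",
    "tf.python_io.TFRecordWriter": "tf.io.TFRecordWriter",
    "tf.FixedLenFeature": "tf.io.FixedLenFeature",
    "tf.FixedLenSequenceFeature": "tf.io.FixedLenSequenceFeature",
    "tf.parse_single_example": "tf.io.parse_single_example",
    "tf.placeholder": "tf.compat.v1.placeholder",
    "tf.global_variables_initializer": "tf.compat.v1.global_variables_initializer",
    "tf.train.Saver": "tf.compat.v1.train.Saver",
    "tf.reset_default_graph": "tf.compat.v1.reset_default_graph",
    "tf.get_variable": "tf.compat.v1.get_variable",
    "tf.variable_scope": "tf.compat.v1.variable_scope",
    "tf.image.resize_bilinear": "tf.compat.v1.image.resize_bilinear",
    "window_ops.hann_window": "tf.signal.hann_window",
    "librosa.output.write_wav": "soundfile.write",  # librosa.output was removed in librosa 0.8
}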
129 changes: 9 additions & 120 deletions Config.py
@@ -6,7 +6,7 @@
 @config_ingredient.config
 def cfg():
     # Base configuration
-    model_config = {"enst_path" : "/import/c4dm-04/davem/ENST-drums/", # SET MUSDB PATH HERE, AND SET CCMIXTER PATH IN CCMixter.xml
+    model_config = {"enst_path" : "/mnt/windaten/Datasets/ENST_Drums", # SET MUSDB PATH HERE, AND SET CCMIXTER PATH IN CCMixter.xml
                     "estimates_path" : "/mnt/windaten/Source_Estimates", # SET THIS PATH TO WHERE YOU WANT SOURCE ESTIMATES PRODUCED BY THE TRAINED MODEL TO BE SAVED. Folder itself must exist!
                     "data_path" : "data", # Set this to where the preprocessed dataset should be saved

@@ -44,129 +44,18 @@ def cfg():
     model_config["num_outputs"] = 1 if model_config["mono_downmix"] else 2

 @config_ingredient.named_config
-def baseline():
-    print("Training baseline model")
-
-@config_ingredient.named_config
-def baseline_wet():
-    print("Training wet model")
-    model_config = {
-        "task" : "wet"
-    }
-
-@config_ingredient.named_config
-def norm_context_wet():
+def context_wet():
     print("Training wet model")
     model_config = {
-        "task" : "wet",
-        "data_path" : "data_norm",
-        "num_frames" : 88200,
-        "context" : True
-    }
-
-@config_ingredient.named_config
-def baseline_diff():
-    print("Training baseline model with difference output")
-    model_config = {
-        "output_type" : "difference"
-    }
-
-@config_ingredient.named_config
-def baseline_context():
-    print("Training baseline model with difference output and input context (valid convolutions)")
-    model_config = {
-        "output_type" : "difference",
-        "context" : True
-    }
-
-@config_ingredient.named_config
-def baseline_stereo():
-    print("Training baseline model with difference output and input context (valid convolutions) and stereo input/output")
-    model_config = {
-        "output_type" : "difference",
-        "context" : True,
-        "mono_downmix" : False
+        "task": "wet",
+        "num_frames": 88200,
+        "context": True
     }

 @config_ingredient.named_config
-def full():
-    print("Training full singing voice separation model, with difference output and input context (valid convolutions) and stereo input/output, and learned upsampling layer")
+def context_dry():
+    print("Training dry model")
     model_config = {
-        "output_type" : "difference",
-        "context" : True,
-        "upsampling": "learned",
-        "mono_downmix" : False
+        "num_frames": 88200,
+        "context": True
     }

-@config_ingredient.named_config
-def full_44KHz():
-    print("Training full singing voice separation model, with difference output and input context (valid convolutions) and stereo input/output, and learned upsampling layer, and 44.1 KHz sampling rate")
-    model_config = {
-        "output_type" : "difference",
-        "context" : True,
-        "upsampling": "learned",
-        "mono_downmix" : False,
-        "expected_sr" : 44100
-    }
-
-@config_ingredient.named_config
-def baseline_context_smallfilter_deep():
-    model_config = {
-        "output_type": "difference",
-        "context": True,
-        "num_layers" : 14,
-        "duration" : 7,
-        "filter_size" : 5,
-        "merge_filter_size" : 1
-    }
-
-@config_ingredient.named_config
-def full_multi_instrument():
-    print("Training multi-instrument separation with best model")
-    model_config = {
-        "output_type": "difference",
-        "context": True,
-        "upsampling": "linear",
-        "mono_downmix": False,
-        "task" : "multi_instrument"
-    }
-
-@config_ingredient.named_config
-def baseline_comparison():
-    model_config = {
-        "batch_size": 4, # Less output since model is so big. Doesn't matter since the model's output is not dependent on its output or input size (only convolutions)
-
-        "output_type": "difference",
-        "context": True,
-        "num_frames" : 768*127 + 1024,
-        "duration" : 13,
-        "expected_sr" : 8192,
-        "num_initial_filters" : 34
-    }
-
-@config_ingredient.named_config
-def unet_spectrogram():
-    model_config = {
-        "batch_size": 4, # Less output since model is so big.
-
-        "network" : "unet_spectrogram",
-        "num_layers" : 6,
-        "expected_sr" : 8192,
-        "num_frames" : 768 * 127 + 1024, # hop_size * (time_frames_of_spectrogram_input - 1) + fft_length
-        "duration" : 13,
-        "num_initial_filters" : 16
-    }
-
-@config_ingredient.named_config
-def unet_spectrogram_l1():
-    model_config = {
-        "batch_size": 4, # Less output since model is so big.
-
-        "network" : "unet_spectrogram",
-        "num_layers" : 6,
-        "expected_sr" : 8192,
-        "num_frames" : 768 * 127 + 1024, # hop_size * (time_frames_of_spectrogram_input - 1) + fft_length
-        "duration" : 13,
-        "num_initial_filters" : 16,
-        "raw_audio_loss" : False
-    }
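After the cleanup, context_wet and context_dry are the only named configs left. A sketch of what selecting one does, with sacred's command-line syntax in the comment (the Training.py entry-point name follows the upstream Wave-U-Net convention and is an assumption here; the base values below are illustrative, not the repo's defaults):

# Invocation sketch:  python Training.py with cfg.context_wet
# A named config simply overlays its model_config entries onto the base cfg():
base = {"task": "dry", "num_frames": 16384, "context": False}         # illustrative
context_wet = {"task": "wet", "num_frames": 88200, "context": True}   # from the diff above
model_config = {**base, **context_wet}
assert model_config["num_frames"] == 88200 and model_config["context"]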
14 changes: 7 additions & 7 deletions Datasets.py
@@ -15,7 +15,7 @@

 def take_random_snippets(sample, keys, input_shape, output_shape, num_samples):
     # Take a sample (collection of audio files) and extract snippets from it at a number of random positions
-    start_pos = tf.random_uniform([num_samples], 0, maxval=sample["length"] - input_shape[0], dtype=tf.int64)
+    start_pos = tf.random.uniform([num_samples], 0, maxval=sample["length"] - input_shape[0], dtype=tf.int64)
     return take_snippets_at_pos(sample, keys, start_pos, input_shape[0], output_shape[1], num_samples)

 def take_all_snippets(sample, keys, input_shape, output_shape):
@@ -58,7 +58,7 @@ def write_records(sample_list, model_config, input_shape, output_shape, records_

     # Set up writers
     num_writers = 1
-    writers = [tf.python_io.TFRecordWriter(records_path + str(i) + ".tfrecords") for i in range(num_writers)]
+    writers = [tf.io.TFRecordWriter(records_path + str(i) + ".tfrecords") for i in range(num_writers)]

     # Go through songs and write them to TFRecords
     all_keys = ["mix"] + model_config["source_names"]
@@ -122,11 +122,11 @@ def parse_record(example_proto, source_names, output_channels):

     all_names = source_names + ["mix"]

-    features = {key : tf.FixedLenSequenceFeature([], allow_missing=True, dtype=tf.float32) for key in all_names}
-    features["length"] = tf.FixedLenFeature([], tf.int64)
-    features["channels"] = tf.FixedLenFeature([], tf.int64)
+    features = {key : tf.io.FixedLenSequenceFeature([], allow_missing=True, dtype=tf.float32) for key in all_names}
+    features["length"] = tf.io.FixedLenFeature([], tf.int64)
+    features["channels"] = tf.io.FixedLenFeature([], tf.int64)

-    parsed_features = tf.parse_single_example(example_proto, features)
+    parsed_features = tf.io.parse_single_example(example_proto, features)

     # Reshape
     length = tf.cast(parsed_features["length"], tf.int64)
@@ -173,7 +173,7 @@ def get_dataset(model_config, input_shape, output_shape, partition):
     # The dataset structure is a dictionary with "train", "valid", "test" keys, whose entries are lists, where each element represents a song.
     # Each song is represented as a dictionary containing elements mix, acc, vocal or mix, bass, drums, other, vocal depending on the task.

-    num_cores = 8
+    num_cores = 1

     for curr_partition in ["train", "val", "test"]:
         print("Writing " + curr_partition + " partition...")
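The Datasets.py edits swap the symbols TF 1.15 removed (tf.python_io, tf.FixedLen*, tf.parse_single_example) for their tf.io equivalents. A self-contained round trip with the renamed API, mirroring write_records() and parse_record() above (feature names from the diff, values illustrative):

import tensorflow as tf  # TF 1.15

# Write one record with a float32 audio feature plus length/channels metadata.
with tf.io.TFRecordWriter("demo0.tfrecords") as writer:
    example = tf.train.Example(features=tf.train.Features(feature={
        "mix": tf.train.Feature(float_list=tf.train.FloatList(value=[0.0, 0.1, -0.1, 0.2])),
        "length": tf.train.Feature(int64_list=tf.train.Int64List(value=[2])),
        "channels": tf.train.Feature(int64_list=tf.train.Int64List(value=[2])),
    }))
    writer.write(example.SerializeToString())

# Parse it back, as parse_record() does after this commit.
features = {"mix": tf.io.FixedLenSequenceFeature([], allow_missing=True, dtype=tf.float32),
            "length": tf.io.FixedLenFeature([], tf.int64),
            "channels": tf.io.FixedLenFeature([], tf.int64)}
dataset = tf.data.TFRecordDataset("demo0.tfrecords").map(
    lambda proto: tf.io.parse_single_example(proto, features))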
18 changes: 9 additions & 9 deletions Evaluate.py
@@ -1,4 +1,5 @@
 import numpy as np
+import soundfile
 import tensorflow as tf
 import librosa

@@ -35,7 +36,7 @@ def predict(audio, model_config, load_model):
     sep_input_shape[0] = 1
     sep_output_shape[0] = 1

-    tracks_ph = tf.placeholder(tf.float32, sep_input_shape)
+    tracks_ph = tf.compat.v1.placeholder(tf.float32, sep_input_shape)

     print("Testing...")

@@ -45,12 +46,12 @@ def predict(audio, model_config, load_model):

     # Start session and queue input threads
     sess = tf.Session()
-    sess.run(tf.global_variables_initializer())
+    sess.run(tf.compat.v1.global_variables_initializer())

     # Load model
     # Load pretrained model to continue training, if we are supposed to
-    restorer = tf.train.Saver(None, write_version=tf.train.SaverDef.V2)
-    print("Num of variables" + str(len(tf.global_variables())))
+    restorer = tf.compat.v1.train.Saver(None, write_version=tf.compat.v1.train.SaverDef.V2)
+    print("Num of variables" + str(len(tf.compat.v1.global_variables())))
     restorer.restore(sess, load_model)
     print('Pre-trained model restored for song prediction')

@@ -62,7 +63,7 @@ def predict(audio, model_config, load_model):

     # Close session, clear computational graph
     sess.close()
-    tf.reset_default_graph()
+    tf.compat.v1.reset_default_graph()

     return mix_pred

@@ -93,14 +94,13 @@ def predict_track(model_config, sess, audio, sep_input_shape, sep_output_shape,

     # Preallocate source predictions (same shape as input mixture)
     mix_time_frames = audio[key].shape[0]
-    mix_preds = np.zeros((mix_time_frames, sep_output_shape[2]), np.float32)
-    # mix_preds = {name : np.zeros(mix_audio.shape, np.float32) for name in model_config["source_names"]}
+    mix_preds = np.asfortranarray(np.zeros((mix_time_frames, sep_output_shape[2]), np.float32))

     input_time_frames = sep_input_shape[1]
     output_time_frames = sep_output_shape[1]

     # Pad mixture across time at beginning and end so that neural network can make prediction at the beginning and end of signal
-    pad_time_frames = (input_time_frames - output_time_frames) / 2
+    pad_time_frames = (input_time_frames - output_time_frames) // 2

     for key in audio.keys():
@@ -178,5 +178,5 @@ def produce_source_estimates(model_config, load_model, tracksdict, output_path):
         os.makedirs(directory)
     assert(os.path.exists(directory))
     print(output_path)
-    librosa.output.write_wav(output_path, mix_pred, sr)
+    soundfile.write(output_path, mix_pred, sr)
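The "/" to "//" change in predict_track is a Python 3 correctness fix rather than a TF one: true division yields a float, and a float padding amount later breaks np.pad and slicing. A minimal illustration (frame counts are made up, not the model's):

import numpy as np

input_time_frames = 16384    # what the network consumes (illustrative)
output_time_frames = 8192    # what it predicts; valid convolutions shrink the output

pad_time_frames = (input_time_frames - output_time_frames) // 2   # 4096, an int
mix = np.zeros(100000, np.float32)
padded = np.pad(mix, pad_time_frames, mode="constant")            # pads both ends
assert padded.shape[0] == 100000 + 2 * pad_time_frames
# With "/", pad_time_frames would be 4096.0 and np.pad would raise a TypeError.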

2 changes: 1 addition & 1 deletion Models/InterpolationLayer.py
@@ -16,7 +16,7 @@ def learned_interpolation_layer(input, padding, level):

     # Construct 2FxF weight matrix, where F is the number of feature channels in the feature map.
     # Matrix is constrained, made up out of two diagonal FxF matrices with diagonal weights w and 1-w. w is constrained to be in [0,1] # sigmoid
-    weights = tf.get_variable("interp_" + str(level), shape=[features], dtype=tf.float32)
+    weights = tf.compat.v1.get_variable("interp_" + str(level), shape=[features], dtype=tf.float32)
     weights_scaled = tf.nn.sigmoid(weights) # Constrain weights to [0,1]
     counter_weights = 1.0 - weights_scaled # Mirrored weights for the features from the other time step
     conv_weights = tf.expand_dims(tf.concat([tf.expand_dims(tf.diag(weights_scaled), axis=0), tf.expand_dims(tf.diag(counter_weights), axis=0)], axis=0), axis=0)
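For context on the tf.compat.v1.get_variable change: the learned interpolation layer forms a per-channel convex combination of neighbouring time steps, as the comments above describe. A NumPy sketch of that arithmetic (sizes and weights illustrative):

import numpy as np

F, T = 4, 8                                   # channels, time steps (illustrative)
x = np.random.randn(T, F).astype(np.float32)  # one feature map
w = 1.0 / (1.0 + np.exp(-np.random.randn(F))) # sigmoid keeps weights in [0, 1]

interp = w * x[:-1] + (1.0 - w) * x[1:]       # w * x[t] + (1 - w) * x[t + 1]

# Interleave original and interpolated frames: 2T - 1 outputs, matching the
# "out = in + in - 1" comment in UnetAudioSeparator.py.
out = np.empty((2 * T - 1, F), np.float32)
out[0::2] = x
out[1::2] = interp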
6 changes: 3 additions & 3 deletions Models/UnetAudioSeparator.py
@@ -92,7 +92,7 @@ def get_output(self, input, training, return_spectrogram=False, reuse=True):
         '''

-        with tf.variable_scope("separator", reuse=reuse):
+        with tf.compat.v1.variable_scope("separator", reuse=reuse):
             enc_outputs = list()

             current_layer = input #tf.concat([input[key] for key in input.keys() if key != 'mix'], 2)
@@ -116,9 +116,9 @@ def get_output(self, input, training, return_spectrogram=False, reuse=True):
                     current_layer = Models.InterpolationLayer.learned_interpolation_layer(current_layer, self.padding, i)
                 else:
                     if self.context:
-                        current_layer = tf.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2] * 2 - 1], align_corners=True)
+                        current_layer = tf.compat.v1.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2] * 2 - 1], align_corners=True)
                     else:
-                        current_layer = tf.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2]*2]) # out = in + in - 1
+                        current_layer = tf.compat.v1.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2]*2]) # out = in + in - 1
                 current_layer = tf.squeeze(current_layer, axis=1)
                 # UPSAMPLING FINISHED
5 changes: 2 additions & 3 deletions Models/UnetSpectrogramSeparator.py
@@ -4,7 +4,6 @@
 from Utils import LeakyReLU
 import numpy as np
 import functools
-from tensorflow.contrib.signal.python.ops import window_ops

 class UnetSpectrogramSeparator:
     '''
@@ -48,9 +47,9 @@ def get_output(self, input, training, return_spectrogram=False, reuse=True):
         :return: U-Net output: If return_spectrogram: Accompaniment and voice magnitudes as length-two list with two 4D tensors. Otherwise Two 3D tensors containing the raw audio estimates
         '''
         # Setup STFT computation
-        window = functools.partial(window_ops.hann_window, periodic=True)
+        window = functools.partial(tf.signal.hann_window, periodic=True)
         inv_window = tf.contrib.signal.inverse_stft_window_fn(self.hop, forward_window_fn=window)
-        with tf.variable_scope("separator", reuse=reuse):
+        with tf.compat.v1.variable_scope("separator", reuse=reuse):
             # Compute spectrogram
             assert(input.get_shape().as_list()[2] == 1) # Model works ONLY on mono
             stfts = tf.contrib.signal.stft(tf.squeeze(input, 2), frame_length=self.frame_len, frame_step=self.hop, fft_length=self.frame_len, window_fn=window)
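Only the window import moves to the public tf.signal namespace here; the stft and inverse_stft_window_fn calls still go through tf.contrib.signal, which TF 1.15 ships for the last time. For reference, a sketch of the same setup written entirely against tf.signal, which also exists in 1.15 (frame_len/hop values are illustrative, not the model's config):

import functools
import tensorflow as tf  # TF 1.15

frame_len, hop = 1024, 768
window = functools.partial(tf.signal.hann_window, periodic=True)
inv_window = tf.signal.inverse_stft_window_fn(hop, forward_window_fn=window)

audio = tf.zeros([1, 768 * 127 + 1024])  # batch x samples; the num_frames formula from Config.py
stfts = tf.signal.stft(audio, frame_length=frame_len, frame_step=hop,
                       fft_length=frame_len, window_fn=window)
recon = tf.signal.inverse_stft(stfts, frame_length=frame_len, frame_step=hop,
                               fft_length=frame_len, window_fn=inv_window)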
36 changes: 12 additions & 24 deletions Predict.py
@@ -7,34 +7,22 @@

 @ex.config
 def cfg():
-    model_path = os.path.join("checkpoints", "baseline_wet", "940358-130000") # Load stereo vocal model by default
-    # input_path = {'hi-hat': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/hi-hat/001_hits_snare-drum_sticks_x5.wav',
-    #               'kick': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/kick/001_hits_snare-drum_sticks_x5.wav',
-    #               'mix': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/wet_mix/001_hits_snare-drum_sticks_x5_norm.wav',
-    #               'overhead_L': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/overhead_L/001_hits_snare-drum_sticks_x5.wav',
-    #               'overhead_R': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/overhead_R/001_hits_snare-drum_sticks_x5.wav',
-    #               'snare': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/snare/001_hits_snare-drum_sticks_x5.wav',
-    #               'tom_1': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/tom_1/001_hits_snare-drum_sticks_x5.wav',
-    #               'tom_2': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/tom_2/001_hits_snare-drum_sticks_x5.wav',
-    #               'tom_3': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/hi-hat/001_hits_snare-drum_sticks_x5.wav'}
-
-    input_path = {'hi-hat': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/hi-hat/078_phrase_reggae_simple_slow_sticks.wav',
-                  'kick': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/kick/078_phrase_reggae_simple_slow_sticks.wav',
-                  'mix': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/wet_mix/078_phrase_reggae_simple_slow_sticks_norm.wav',
-                  'overhead_L': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/overhead_L/078_phrase_reggae_simple_slow_sticks.wav',
-                  'overhead_R': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/overhead_R/078_phrase_reggae_simple_slow_sticks.wav',
-                  'snare': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/snare/078_phrase_reggae_simple_slow_sticks.wav',
-                  'tom_1': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/tom_1/078_phrase_reggae_simple_slow_sticks.wav',
-                  'tom_2': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/tom_2/078_phrase_reggae_simple_slow_sticks.wav',
+    model_path = os.path.join("checkpoints", "wet", "wet-1108000") # Load wet pretrained model by default
+
+    input_path = {'hi-hat': 'audio_examples/inputs/hihat.wav',
+                  'kick': 'audio_examples/inputs/kick.wav',
+                  'mix': 'audio_examples/inputs/wet_mix.wav',
+                  'overhead_L': 'audio_examples/inputs/overheadL.wav',
+                  'overhead_R': 'audio_examples/inputs/overheadR.wav',
+                  'snare': 'audio_examples/inputs/snare.wav',
+                  'tom_1': 'audio_examples/inputs/tom1.wav',
+                  'tom_2': 'audio_examples/inputs/tom2.wav',
                   'tom_3': None}
-    output_path = 'audio_examples/mix.wav'

+    output_path = 'audio_examples/outputs/wet_mix.wav'


 @ex.automain
 def main(cfg, model_path, input_path, output_path):

     model_config = cfg["model_config"]

     Evaluate.produce_source_estimates(model_config, model_path, input_path, output_path)
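The new defaults point at the checked-in ENST demo sample, so prediction runs out of the box; paths can still be overridden with sacred's "with key=value" syntax. A None entry (here 'tom_3') presumably marks a channel absent from the demo session, so a loader would skip it. A sketch of that pattern (illustrative, not the repo's loading code):

# Invocation sketch:
#   python Predict.py
#   python Predict.py with output_path=audio_examples/outputs/my_mix.wav
input_path = {'snare': 'audio_examples/inputs/snare.wav', 'tom_3': None}
available = {name: path for name, path in input_path.items() if path is not None}
assert 'tom_3' not in available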
3 changes: 2 additions & 1 deletion PredictDataset.py
@@ -1,3 +1,4 @@
+import soundfile
 from sacred import Experiment
 from Config import config_ingredient
 import Evaluate, Datasets, Utils
@@ -56,7 +57,7 @@ def main(cfg, model_path, output_path):
         target, sr = Utils.load(track['mix'], sr=None, mono=False)
         target = target/np.max(np.abs(target))

-        librosa.output.write_wav(output_track+'_target.wav', target, sr=sr)
+        soundfile.write(output_track+'_target.wav', target, sr)

         output, _ = Utils.load(output_track, sr=None, mono=False)
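Both write calls move from librosa.output.write_wav, which was deprecated in librosa 0.7 and removed in 0.8, to soundfile.write. One caveat to keep in mind: soundfile expects (frames, channels) arrays, whereas librosa-style multichannel loading returns (channels, frames), so a transpose may be needed depending on what Utils.load returns here. A minimal sketch:

import numpy as np
import soundfile

sr = 44100
audio = np.random.uniform(-1, 1, (2, sr)).astype(np.float32)  # (channels, frames), librosa-style

# soundfile.write wants (frames, channels); transpose channel-major input.
if audio.ndim == 2 and audio.shape[0] < audio.shape[1]:
    audio = audio.T
soundfile.write("demo_out.wav", audio, sr)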
(Diffs for the remaining changed files are not shown.)