Commit 2b0dc6c

Update to TF 1.15
Cleanup configs
Include one sample from ENST for demo purposes

f90 committed May 4, 2020
1 parent 54cd803 commit 2b0dc6c
Showing 37 changed files with 82 additions and 203 deletions.
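Most of the TensorFlow changes below are mechanical renames for the 1.15 API. As a quick reference, a sketch of the mapping applied in this commit, collected from the diffs that follow (the librosa entry is a separate dependency change, not a TF one):

# TF 1.15 moves I/O, random and signal helpers to tf.io / tf.random / tf.signal,
# and keeps the remaining graph-mode symbols under tf.compat.v1.
RENAMES = {
    "tf.random_uniform": "tf.random.uniform",
    "tf.python_io.TFRecordWriter": "tf.io.TFRecordWriter",
    "tf.FixedLenFeature": "tf.io.FixedLenFeature",
    "tf.FixedLenSequenceFeature": "tf.io.FixedLenSequenceFeature",
    "tf.parse_single_example": "tf.io.parse_single_example",
    "tf.placeholder": "tf.compat.v1.placeholder",
    "tf.global_variables_initializer": "tf.compat.v1.global_variables_initializer",
    "tf.train.Saver": "tf.compat.v1.train.Saver",
    "tf.reset_default_graph": "tf.compat.v1.reset_default_graph",
    "tf.get_variable": "tf.compat.v1.get_variable",
    "tf.variable_scope": "tf.compat.v1.variable_scope",
    "tf.image.resize_bilinear": "tf.compat.v1.image.resize_bilinear",
    "window_ops.hann_window": "tf.signal.hann_window",
    "librosa.output.write_wav": "soundfile.write",  # librosa.output was removed in librosa 0.8
}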
129 changes: 9 additions & 120 deletions Config.py
@@ -6,7 +6,7 @@
 @config_ingredient.config
 def cfg():
     # Base configuration
-    model_config = {"enst_path" : "/import/c4dm-04/davem/ENST-drums/", # SET MUSDB PATH HERE, AND SET CCMIXTER PATH IN CCMixter.xml
+    model_config = {"enst_path" : "/mnt/windaten/Datasets/ENST_Drums", # SET MUSDB PATH HERE, AND SET CCMIXTER PATH IN CCMixter.xml
                     "estimates_path" : "/mnt/windaten/Source_Estimates", # SET THIS PATH TO WHERE YOU WANT SOURCE ESTIMATES PRODUCED BY THE TRAINED MODEL TO BE SAVED. Folder itself must exist!
                     "data_path" : "data", # Set this to where the preprocessed dataset should be saved

@@ -44,129 +44,18 @@ def cfg():
     model_config["num_outputs"] = 1 if model_config["mono_downmix"] else 2

 @config_ingredient.named_config
-def baseline():
-    print("Training baseline model")
-
-@config_ingredient.named_config
-def baseline_wet():
-    print("Training wet model")
-    model_config = {
-        "task" : "wet"
-    }
-
-@config_ingredient.named_config
-def norm_context_wet():
+def context_wet():
     print("Training wet model")
     model_config = {
-        "task" : "wet",
-        "data_path" : "data_norm",
-        "num_frames" : 88200,
-        "context" : True
-    }
-
-@config_ingredient.named_config
-def baseline_diff():
-    print("Training baseline model with difference output")
-    model_config = {
-        "output_type" : "difference"
-    }
-
-@config_ingredient.named_config
-def baseline_context():
-    print("Training baseline model with difference output and input context (valid convolutions)")
-    model_config = {
-        "output_type" : "difference",
-        "context" : True
-    }
-
-@config_ingredient.named_config
-def baseline_stereo():
-    print("Training baseline model with difference output and input context (valid convolutions) and stereo input/output")
-    model_config = {
-        "output_type" : "difference",
-        "context" : True,
-        "mono_downmix" : False
+        "task": "wet",
+        "num_frames": 88200,
+        "context": True
     }

 @config_ingredient.named_config
-def full():
-    print("Training full singing voice separation model, with difference output and input context (valid convolutions) and stereo input/output, and learned upsampling layer")
+def context_dry():
+    print("Training dry model")
     model_config = {
-        "output_type" : "difference",
-        "context" : True,
-        "upsampling": "learned",
-        "mono_downmix" : False
+        "num_frames": 88200,
+        "context": True
     }

-@config_ingredient.named_config
-def full_44KHz():
-    print("Training full singing voice separation model, with difference output and input context (valid convolutions) and stereo input/output, and learned upsampling layer, and 44.1 KHz sampling rate")
-    model_config = {
-        "output_type" : "difference",
-        "context" : True,
-        "upsampling": "learned",
-        "mono_downmix" : False,
-        "expected_sr" : 44100
-    }
-
-@config_ingredient.named_config
-def baseline_context_smallfilter_deep():
-    model_config = {
-        "output_type": "difference",
-        "context": True,
-        "num_layers" : 14,
-        "duration" : 7,
-        "filter_size" : 5,
-        "merge_filter_size" : 1
-    }
-
-@config_ingredient.named_config
-def full_multi_instrument():
-    print("Training multi-instrument separation with best model")
-    model_config = {
-        "output_type": "difference",
-        "context": True,
-        "upsampling": "linear",
-        "mono_downmix": False,
-        "task" : "multi_instrument"
-    }
-
-@config_ingredient.named_config
-def baseline_comparison():
-    model_config = {
-        "batch_size": 4, # Less output since model is so big. Doesn't matter since the model's output is not dependent on its output or input size (only convolutions)
-
-        "output_type": "difference",
-        "context": True,
-        "num_frames" : 768*127 + 1024,
-        "duration" : 13,
-        "expected_sr" : 8192,
-        "num_initial_filters" : 34
-    }
-
-@config_ingredient.named_config
-def unet_spectrogram():
-    model_config = {
-        "batch_size": 4, # Less output since model is so big.
-
-        "network" : "unet_spectrogram",
-        "num_layers" : 6,
-        "expected_sr" : 8192,
-        "num_frames" : 768 * 127 + 1024, # hop_size * (time_frames_of_spectrogram_input - 1) + fft_length
-        "duration" : 13,
-        "num_initial_filters" : 16
-    }
-
-@config_ingredient.named_config
-def unet_spectrogram_l1():
-    model_config = {
-        "batch_size": 4, # Less output since model is so big.
-
-        "network" : "unet_spectrogram",
-        "num_layers" : 6,
-        "expected_sr" : 8192,
-        "num_frames" : 768 * 127 + 1024, # hop_size * (time_frames_of_spectrogram_input - 1) + fft_length
-        "duration" : 13,
-        "num_initial_filters" : 16,
-        "raw_audio_loss" : False
-    }
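After the cleanup, context_wet and context_dry are the only named configs left. A sketch of what selecting one does, with sacred's command-line syntax in the comment (the Training.py entry-point name follows the upstream Wave-U-Net convention and is an assumption here; the base values below are illustrative, not the repo's defaults):

# Invocation sketch:  python Training.py with cfg.context_wet
# A named config simply overlays its model_config entries onto the base cfg():
base = {"task": "dry", "num_frames": 16384, "context": False}         # illustrative
context_wet = {"task": "wet", "num_frames": 88200, "context": True}   # from the diff above
model_config = {**base, **context_wet}
assert model_config["num_frames"] == 88200 and model_config["context"]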
14 changes: 7 additions & 7 deletions Datasets.py
@@ -15,7 +15,7 @@

 def take_random_snippets(sample, keys, input_shape, output_shape, num_samples):
     # Take a sample (collection of audio files) and extract snippets from it at a number of random positions
-    start_pos = tf.random_uniform([num_samples], 0, maxval=sample["length"] - input_shape[0], dtype=tf.int64)
+    start_pos = tf.random.uniform([num_samples], 0, maxval=sample["length"] - input_shape[0], dtype=tf.int64)
     return take_snippets_at_pos(sample, keys, start_pos, input_shape[0], output_shape[1], num_samples)

 def take_all_snippets(sample, keys, input_shape, output_shape):
@@ -58,7 +58,7 @@ def write_records(sample_list, model_config, input_shape, output_shape, records_

     # Set up writers
     num_writers = 1
-    writers = [tf.python_io.TFRecordWriter(records_path + str(i) + ".tfrecords") for i in range(num_writers)]
+    writers = [tf.io.TFRecordWriter(records_path + str(i) + ".tfrecords") for i in range(num_writers)]

     # Go through songs and write them to TFRecords
     all_keys = ["mix"] + model_config["source_names"]
@@ -122,11 +122,11 @@ def parse_record(example_proto, source_names, output_channels):

     all_names = source_names + ["mix"]

-    features = {key : tf.FixedLenSequenceFeature([], allow_missing=True, dtype=tf.float32) for key in all_names}
-    features["length"] = tf.FixedLenFeature([], tf.int64)
-    features["channels"] = tf.FixedLenFeature([], tf.int64)
+    features = {key : tf.io.FixedLenSequenceFeature([], allow_missing=True, dtype=tf.float32) for key in all_names}
+    features["length"] = tf.io.FixedLenFeature([], tf.int64)
+    features["channels"] = tf.io.FixedLenFeature([], tf.int64)

-    parsed_features = tf.parse_single_example(example_proto, features)
+    parsed_features = tf.io.parse_single_example(example_proto, features)

     # Reshape
     length = tf.cast(parsed_features["length"], tf.int64)
@@ -173,7 +173,7 @@ def get_dataset(model_config, input_shape, output_shape, partition):
     # The dataset structure is a dictionary with "train", "valid", "test" keys, whose entries are lists, where each element represents a song.
     # Each song is represented as a dictionary containing elements mix, acc, vocal or mix, bass, drums, other, vocal depending on the task.

-    num_cores = 8
+    num_cores = 1

     for curr_partition in ["train", "val", "test"]:
         print("Writing " + curr_partition + " partition...")
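The Datasets.py edits swap the symbols TF 1.15 removed (tf.python_io, tf.FixedLen*, tf.parse_single_example) for their tf.io equivalents. A self-contained round trip with the renamed API, mirroring write_records() and parse_record() above (feature names from the diff, values illustrative):

import tensorflow as tf  # TF 1.15

# Write one record with a float32 audio feature plus length/channels metadata.
with tf.io.TFRecordWriter("demo0.tfrecords") as writer:
    example = tf.train.Example(features=tf.train.Features(feature={
        "mix": tf.train.Feature(float_list=tf.train.FloatList(value=[0.0, 0.1, -0.1, 0.2])),
        "length": tf.train.Feature(int64_list=tf.train.Int64List(value=[2])),
        "channels": tf.train.Feature(int64_list=tf.train.Int64List(value=[2])),
    }))
    writer.write(example.SerializeToString())

# Parse it back, as parse_record() does after this commit.
features = {"mix": tf.io.FixedLenSequenceFeature([], allow_missing=True, dtype=tf.float32),
            "length": tf.io.FixedLenFeature([], tf.int64),
            "channels": tf.io.FixedLenFeature([], tf.int64)}
dataset = tf.data.TFRecordDataset("demo0.tfrecords").map(
    lambda proto: tf.io.parse_single_example(proto, features))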
18 changes: 9 additions & 9 deletions Evaluate.py
@@ -1,4 +1,5 @@
 import numpy as np
+import soundfile
 import tensorflow as tf
 import librosa

@@ -35,7 +36,7 @@ def predict(audio, model_config, load_model):
     sep_input_shape[0] = 1
     sep_output_shape[0] = 1

-    tracks_ph = tf.placeholder(tf.float32, sep_input_shape)
+    tracks_ph = tf.compat.v1.placeholder(tf.float32, sep_input_shape)

     print("Testing...")

@@ -45,12 +46,12 @@ def predict(audio, model_config, load_model):

     # Start session and queue input threads
     sess = tf.Session()
-    sess.run(tf.global_variables_initializer())
+    sess.run(tf.compat.v1.global_variables_initializer())

     # Load model
     # Load pretrained model to continue training, if we are supposed to
-    restorer = tf.train.Saver(None, write_version=tf.train.SaverDef.V2)
-    print("Num of variables" + str(len(tf.global_variables())))
+    restorer = tf.compat.v1.train.Saver(None, write_version=tf.compat.v1.train.SaverDef.V2)
+    print("Num of variables" + str(len(tf.compat.v1.global_variables())))
     restorer.restore(sess, load_model)
     print('Pre-trained model restored for song prediction')

@@ -62,7 +63,7 @@ def predict(audio, model_config, load_model):

     # Close session, clear computational graph
     sess.close()
-    tf.reset_default_graph()
+    tf.compat.v1.reset_default_graph()

     return mix_pred

@@ -93,14 +94,13 @@ def predict_track(model_config, sess, audio, sep_input_shape, sep_output_shape,

     # Preallocate source predictions (same shape as input mixture)
     mix_time_frames = audio[key].shape[0]
-    mix_preds = np.zeros((mix_time_frames, sep_output_shape[2]), np.float32)
-    # mix_preds = {name : np.zeros(mix_audio.shape, np.float32) for name in model_config["source_names"]}
+    mix_preds = np.asfortranarray(np.zeros((mix_time_frames, sep_output_shape[2]), np.float32))

     input_time_frames = sep_input_shape[1]
     output_time_frames = sep_output_shape[1]

     # Pad mixture across time at beginning and end so that neural network can make prediction at the beginning and end of signal
-    pad_time_frames = (input_time_frames - output_time_frames) / 2
+    pad_time_frames = (input_time_frames - output_time_frames) // 2

     for key in audio.keys():
@@ -178,5 +178,5 @@ def produce_source_estimates(model_config, load_model, tracksdict, output_path):
         os.makedirs(directory)
     assert(os.path.exists(directory))
     print(output_path)
-    librosa.output.write_wav(output_path, mix_pred, sr)
+    soundfile.write(output_path, mix_pred, sr)
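The "/" to "//" change in predict_track is a Python 3 correctness fix rather than a TF one: true division yields a float, and a float padding amount later breaks np.pad and slicing. A minimal illustration (frame counts are made up, not the model's):

import numpy as np

input_time_frames = 16384    # what the network consumes (illustrative)
output_time_frames = 8192    # what it predicts; valid convolutions shrink the output

pad_time_frames = (input_time_frames - output_time_frames) // 2   # 4096, an int
mix = np.zeros(100000, np.float32)
padded = np.pad(mix, pad_time_frames, mode="constant")            # pads both ends
assert padded.shape[0] == 100000 + 2 * pad_time_frames
# With "/", pad_time_frames would be 4096.0 and np.pad would raise a TypeError.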

2 changes: 1 addition & 1 deletion Models/InterpolationLayer.py
@@ -16,7 +16,7 @@ def learned_interpolation_layer(input, padding, level):

     # Construct 2FxF weight matrix, where F is the number of feature channels in the feature map.
     # Matrix is constrained, made up out of two diagonal FxF matrices with diagonal weights w and 1-w. w is constrained to be in [0,1] # sigmoid
-    weights = tf.get_variable("interp_" + str(level), shape=[features], dtype=tf.float32)
+    weights = tf.compat.v1.get_variable("interp_" + str(level), shape=[features], dtype=tf.float32)
     weights_scaled = tf.nn.sigmoid(weights) # Constrain weights to [0,1]
     counter_weights = 1.0 - weights_scaled # Mirrored weights for the features from the other time step
     conv_weights = tf.expand_dims(tf.concat([tf.expand_dims(tf.diag(weights_scaled), axis=0), tf.expand_dims(tf.diag(counter_weights), axis=0)], axis=0), axis=0)
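For context on the tf.compat.v1.get_variable change: the learned interpolation layer forms a per-channel convex combination of neighbouring time steps, as the comments above describe. A NumPy sketch of that arithmetic (sizes and weights illustrative):

import numpy as np

F, T = 4, 8                                   # channels, time steps (illustrative)
x = np.random.randn(T, F).astype(np.float32)  # one feature map
w = 1.0 / (1.0 + np.exp(-np.random.randn(F))) # sigmoid keeps weights in [0, 1]

interp = w * x[:-1] + (1.0 - w) * x[1:]       # w * x[t] + (1 - w) * x[t + 1]

# Interleave original and interpolated frames: 2T - 1 outputs, matching the
# "out = in + in - 1" comment in UnetAudioSeparator.py.
out = np.empty((2 * T - 1, F), np.float32)
out[0::2] = x
out[1::2] = interp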
6 changes: 3 additions & 3 deletions Models/UnetAudioSeparator.py
@@ -92,7 +92,7 @@ def get_output(self, input, training, return_spectrogram=False, reuse=True):
         '''

-        with tf.variable_scope("separator", reuse=reuse):
+        with tf.compat.v1.variable_scope("separator", reuse=reuse):
             enc_outputs = list()

             current_layer = input #tf.concat([input[key] for key in input.keys() if key != 'mix'], 2)
@@ -116,9 +116,9 @@ def get_output(self, input, training, return_spectrogram=False, reuse=True):
                     current_layer = Models.InterpolationLayer.learned_interpolation_layer(current_layer, self.padding, i)
                 else:
                     if self.context:
-                        current_layer = tf.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2] * 2 - 1], align_corners=True)
+                        current_layer = tf.compat.v1.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2] * 2 - 1], align_corners=True)
                     else:
-                        current_layer = tf.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2]*2]) # out = in + in - 1
+                        current_layer = tf.compat.v1.image.resize_bilinear(current_layer, [1, current_layer.get_shape().as_list()[2]*2]) # out = in + in - 1
                 current_layer = tf.squeeze(current_layer, axis=1)
                 # UPSAMPLING FINISHED
5 changes: 2 additions & 3 deletions Models/UnetSpectrogramSeparator.py
@@ -4,7 +4,6 @@
 from Utils import LeakyReLU
 import numpy as np
 import functools
-from tensorflow.contrib.signal.python.ops import window_ops

 class UnetSpectrogramSeparator:
     '''
@@ -48,9 +47,9 @@ def get_output(self, input, training, return_spectrogram=False, reuse=True):
         :return: U-Net output: If return_spectrogram: Accompaniment and voice magnitudes as length-two list with two 4D tensors. Otherwise Two 3D tensors containing the raw audio estimates
         '''
         # Setup STFT computation
-        window = functools.partial(window_ops.hann_window, periodic=True)
+        window = functools.partial(tf.signal.hann_window, periodic=True)
         inv_window = tf.contrib.signal.inverse_stft_window_fn(self.hop, forward_window_fn=window)
-        with tf.variable_scope("separator", reuse=reuse):
+        with tf.compat.v1.variable_scope("separator", reuse=reuse):
             # Compute spectrogram
             assert(input.get_shape().as_list()[2] == 1) # Model works ONLY on mono
             stfts = tf.contrib.signal.stft(tf.squeeze(input, 2), frame_length=self.frame_len, frame_step=self.hop, fft_length=self.frame_len, window_fn=window)
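Only the window import moves to the public tf.signal namespace here; the stft and inverse_stft_window_fn calls still go through tf.contrib.signal, which TF 1.15 ships for the last time. For reference, a sketch of the same setup written entirely against tf.signal, which also exists in 1.15 (frame_len/hop values are illustrative, not the model's config):

import functools
import tensorflow as tf  # TF 1.15

frame_len, hop = 1024, 768
window = functools.partial(tf.signal.hann_window, periodic=True)
inv_window = tf.signal.inverse_stft_window_fn(hop, forward_window_fn=window)

audio = tf.zeros([1, 768 * 127 + 1024])  # batch x samples; the num_frames formula from Config.py
stfts = tf.signal.stft(audio, frame_length=frame_len, frame_step=hop,
                       fft_length=frame_len, window_fn=window)
recon = tf.signal.inverse_stft(stfts, frame_length=frame_len, frame_step=hop,
                               fft_length=frame_len, window_fn=inv_window)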
36 changes: 12 additions & 24 deletions Predict.py
@@ -7,34 +7,22 @@

 @ex.config
 def cfg():
-    model_path = os.path.join("checkpoints", "baseline_wet", "940358-130000") # Load stereo vocal model by default
-    # input_path = {'hi-hat': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/hi-hat/001_hits_snare-drum_sticks_x5.wav',
-    #               'kick': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/kick/001_hits_snare-drum_sticks_x5.wav',
-    #               'mix': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/wet_mix/001_hits_snare-drum_sticks_x5_norm.wav',
-    #               'overhead_L': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/overhead_L/001_hits_snare-drum_sticks_x5.wav',
-    #               'overhead_R': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/overhead_R/001_hits_snare-drum_sticks_x5.wav',
-    #               'snare': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/snare/001_hits_snare-drum_sticks_x5.wav',
-    #               'tom_1': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/tom_1/001_hits_snare-drum_sticks_x5.wav',
-    #               'tom_2': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/tom_2/001_hits_snare-drum_sticks_x5.wav',
-    #               'tom_3': '/import/c4dm-04/davem/ENST-drums/drummer_3/audio/hi-hat/001_hits_snare-drum_sticks_x5.wav'}
-
-    input_path = {'hi-hat': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/hi-hat/078_phrase_reggae_simple_slow_sticks.wav',
-                  'kick': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/kick/078_phrase_reggae_simple_slow_sticks.wav',
-                  'mix': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/wet_mix/078_phrase_reggae_simple_slow_sticks_norm.wav',
-                  'overhead_L': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/overhead_L/078_phrase_reggae_simple_slow_sticks.wav',
-                  'overhead_R': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/overhead_R/078_phrase_reggae_simple_slow_sticks.wav',
-                  'snare': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/snare/078_phrase_reggae_simple_slow_sticks.wav',
-                  'tom_1': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/tom_1/078_phrase_reggae_simple_slow_sticks.wav',
-                  'tom_2': '/import/c4dm-04/davem/ENST-drums/drummer_1/audio/tom_2/078_phrase_reggae_simple_slow_sticks.wav',
+    model_path = os.path.join("checkpoints", "wet", "wet-1108000") # Load wet pretrained model by default
+
+    input_path = {'hi-hat': 'audio_examples/inputs/hihat.wav',
+                  'kick': 'audio_examples/inputs/kick.wav',
+                  'mix': 'audio_examples/inputs/wet_mix.wav',
+                  'overhead_L': 'audio_examples/inputs/overheadL.wav',
+                  'overhead_R': 'audio_examples/inputs/overheadR.wav',
+                  'snare': 'audio_examples/inputs/snare.wav',
+                  'tom_1': 'audio_examples/inputs/tom1.wav',
+                  'tom_2': 'audio_examples/inputs/tom2.wav',
                   'tom_3': None}
-    output_path = 'audio_examples/mix.wav'

+    output_path = 'audio_examples/outputs/wet_mix.wav'


 @ex.automain
 def main(cfg, model_path, input_path, output_path):

     model_config = cfg["model_config"]

     Evaluate.produce_source_estimates(model_config, model_path, input_path, output_path)
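The new defaults point at the checked-in ENST demo sample, so prediction runs out of the box; paths can still be overridden with sacred's "with key=value" syntax. A None entry (here 'tom_3') presumably marks a channel absent from the demo session, so a loader would skip it. A sketch of that pattern (illustrative, not the repo's loading code):

# Invocation sketch:
#   python Predict.py
#   python Predict.py with output_path=audio_examples/outputs/my_mix.wav
input_path = {'snare': 'audio_examples/inputs/snare.wav', 'tom_3': None}
available = {name: path for name, path in input_path.items() if path is not None}
assert 'tom_3' not in available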
3 changes: 2 additions & 1 deletion PredictDataset.py
@@ -1,3 +1,4 @@
+import soundfile
 from sacred import Experiment
 from Config import config_ingredient
 import Evaluate, Datasets, Utils
@@ -56,7 +57,7 @@ def main(cfg, model_path, output_path):
         target, sr = Utils.load(track['mix'], sr=None, mono=False)
         target = target/np.max(np.abs(target))

-        librosa.output.write_wav(output_track+'_target.wav', target, sr=sr)
+        soundfile.write(output_track+'_target.wav', target, sr)

         output, _ = Utils.load(output_track, sr=None, mono=False)
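Both write calls move from librosa.output.write_wav, which was deprecated in librosa 0.7 and removed in 0.8, to soundfile.write. One caveat to keep in mind: soundfile expects (frames, channels) arrays, whereas librosa-style multichannel loading returns (channels, frames), so a transpose may be needed depending on what Utils.load returns here. A minimal sketch:

import numpy as np
import soundfile

sr = 44100
audio = np.random.uniform(-1, 1, (2, sr)).astype(np.float32)  # (channels, frames), librosa-style

# soundfile.write wants (frames, channels); transpose channel-major input.
if audio.ndim == 2 and audio.shape[0] < audio.shape[1]:
    audio = audio.T
soundfile.write("demo_out.wav", audio, sr)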
(Diffs for the remaining changed files are not shown.)