From 4a65d0b579b97e1ae64b8864402aee79617f5a6b Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Thu, 24 Mar 2022 00:10:09 +0100 Subject: [PATCH 01/20] feat: start implementing initial dumb audio parsing --- src/pytti/AudioParse.py | 29 +++++++++++++++++++++++++++ src/pytti/config/structured_config.py | 6 ++++++ 2 files changed, 35 insertions(+) create mode 100644 src/pytti/AudioParse.py diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py new file mode 100644 index 0000000..aab3531 --- /dev/null +++ b/src/pytti/AudioParse.py @@ -0,0 +1,29 @@ +import numpy as np +import audiofile +import typing + +class SpectralAudioParser: + """ + Audio Parser reads a given input file, scans along it and parses its spectrum using FFT. + The FFT output is split into three bands (low,mid,high), the (average) amplitude of which is then returned for use in animation functions. + """ + def __init__( + self, + params=None + ): + if params.input_audio: + self.audio_samples, self.sample_rate = audiofile.read(params.input_audio, offset=params.input_audio_offset, always_2d=True) + + def get_params(self, t) -> typing.Tuple[float, float, float]: + """ + Return the amplitude parameters at the given point in time t within the audio track, or 0 if the track has ended. + """ + # Get the point in time (sample-offset) in the track in seconds based on sample-rate + sample_offset = int(t * self.sample_rate) + if sample_offset < self.audio_samples.shape[0]: + # TODO: read back up on numpy array slicing, read [sample_offset, sample_offset+window_size] here + # read back up on fft window size parameters etc. + # read up on whether to use fft2 here or to sum the audio file on initialization into a mono signal first maybe + np.fft.fft2(self.audio_samples[:sample_offset]) + else: + return (0, 0, 0) \ No newline at end of file diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 6ef7ea3..f77a5eb 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -100,6 +100,12 @@ def check(self, attribute, value): ### Induced Motion ### ###################### + input_audio: str = "" + input_audio_offset: float = 0 + input_audio_window_size: int = 1024 + input_audio_band_split_low_medium: int = 500 + input_audio_band_split_medium_high: int = 3500 + # _2d and _3d only apply to those animation modes translate_x: str = "0" From 68545fc0c7930d179b61b1c6827cd1778350206d Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Thu, 24 Mar 2022 22:55:07 +0100 Subject: [PATCH 02/20] feat: initial rough audio parsing logic --- src/pytti/AudioParse.py | 86 ++++++++++++++++++-- src/pytti/ImageGuide.py | 169 ++++++++++++++++++++++++++++++++++++++++ src/pytti/eval_tools.py | 13 +++- 3 files changed, 258 insertions(+), 10 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index aab3531..d9d2ca3 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -1,6 +1,9 @@ import numpy as np -import audiofile import typing +import subprocess +from loguru import logger + +SAMPLERATE=44100 class SpectralAudioParser: """ @@ -12,18 +15,85 @@ def __init__( params=None ): if params.input_audio: - self.audio_samples, self.sample_rate = audiofile.read(params.input_audio, offset=params.input_audio_offset, always_2d=True) + pipe = subprocess.Popen(['ffmpeg', '-i', params.input_audio, + '-f', 's16le', + '-acodec', 'pcm_s16le', + '-ar', str(SAMPLERATE), + '-ac', '1', + '-'], stdout=subprocess.PIPE, bufsize=10**8) + + self.audio_samples = np.array([], 
dtype=np.int16) + + # read the audio file from the pipe in 0.5s blocks (2 bytes per sample) + while True: + buf = pipe.stdout.read(SAMPLERATE) + self.audio_samples = np.append(self.audio_samples, np.frombuffer(buf, dtype=np.int16)) + if len(buf) < SAMPLERATE: + break + + logger.debug(f"initialized audio file {params.input_audio}") + self.input_audio_offset = params.input_audio_offset + self.window_size = params.input_audio_window_size + self.low_cutoff = params.input_audio_band_split_low_medium + self.mid_cutoff = params.input_audio_band_split_medium_high + # pink noise normalization blatantly stolen from https://github.com/aiXander/Realtime_PyAudio_FFT/blob/275c8b1fc268ac946470b0d7a80de56eb2212b58/src/stream_analyzer.py#L107 + self.fftx = np.arange(int(self.window_size/2), dtype=float) * SAMPLERATE / self.window_size + self.power_normalization_coefficients = np.logspace(np.log2(1), np.log2(np.log2(SAMPLERATE/2)), len(self.fftx), endpoint=True, base=2, dtype=None) + + def get_params(self, t) -> typing.Tuple[float, float, float]: """ Return the amplitude parameters at the given point in time t within the audio track, or 0 if the track has ended. + Amplitude/energy parameters are normalized into the [0,1] range. """ # Get the point in time (sample-offset) in the track in seconds based on sample-rate - sample_offset = int(t * self.sample_rate) - if sample_offset < self.audio_samples.shape[0]: - # TODO: read back up on numpy array slicing, read [sample_offset, sample_offset+window_size] here - # read back up on fft window size parameters etc. - # read up on whether to use fft2 here or to sum the audio file on initialization into a mono signal first maybe - np.fft.fft2(self.audio_samples[:sample_offset]) + sample_offset = int(t * SAMPLERATE + self.input_audio_offset * SAMPLERATE) + if sample_offset < len(self.audio_samples): + window_samples = self.audio_samples[sample_offset:sample_offset+self.window_size] + if len(window_samples) < self.window_size: + # audio input file has likely ended + # TODO could round down to the next pow2 then do it anyway. not a critical case though IMO. + return (0, 0, 0) + # fade-in / fade-out window + window_samples = window_samples * np.hamming(len(window_samples)) + fft = np.fft.fft(window_samples) + # summing together the real and imaginary components, i think(??) + left, right = np.split(np.abs(fft), 2) + fft = np.add(left, right[::-1]) + + # pink noise adjust + fft = fft * self.power_normalization_coefficients + + freq_buckets = np.fft.fftfreq(self.window_size, 1 / SAMPLERATE) + # collect energy for each frequency band + # TODO: this could probably be done in a much nicer way with bandpass filters somehow... 
not sure on the correct arithmetic though + low_bucket = 0 + low_count = 0 + mid_bucket = 0 + mid_count = 0 + high_bucket = 0 + high_count = 0 + for i in range(len(fft)): + freq = self.fftx[i] + if freq < self.low_cutoff: + low_bucket += fft[i] + low_count += 1 + elif freq < self.mid_cutoff: + mid_bucket += fft[i] + mid_count += 1 + else: + high_bucket += fft[i] + high_count += 1 + # mean energy per bucket + low_bucket = low_bucket / low_count + mid_bucket = mid_bucket / mid_count + high_bucket = high_bucket / high_count + # normalize to [0,1] range + max_val = np.max(fft) + low_bucket = low_bucket / max_val + mid_bucket = mid_bucket / max_val + high_bucket = high_bucket / max_val + return (low_bucket, mid_bucket, high_bucket) else: return (0, 0, 0) \ No newline at end of file diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 3534ce0..d52a3f9 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -19,6 +19,7 @@ freeze_vram_usage, vram_usage_mode, ) +from pytti.AudioParse import SpectralAudioParser from pytti.Image.differentiable_image import DifferentiableImage from pytti.Image.PixelImage import PixelImage from pytti.Notebook import tqdm, make_hbox @@ -108,6 +109,11 @@ def __init__( self.optimizer = optimizer self.dataframe = [] + if params.input_audio: + self.audio_parser = SpectralAudioParser(params) + else: + self.audio_parser = None + # self.null_update = null_update self.params = params self.writer = writer @@ -365,4 +371,167 @@ def update(self, model, img, i, stage_i, *args, **kwargs): """ update hook called ever step """ +<<<<<<< HEAD pass +======= + # logger.debug("model.update called") + + # ... I have regrets. + params = self.params + writer = self.writer + OUTPATH = self.OUTPATH + base_name = self.base_name + fig = self.fig + axs = self.axs + video_frames = self.video_frames + optical_flows = self.optical_flows + stabilization_augs = self.stabilization_augs + last_frame_semantic = self.last_frame_semantic + semantic_init_prompt = self.semantic_init_prompt + init_augs = self.init_augs + + model = self + img = self.image_rep + embedder = self.embedder + + model.report_out( + i=i, + stage_i=stage_i, + # model=model, + writer=writer, + fig=fig, # default to None... + axs=axs, # default to None... 
+ clear_every=params.clear_every, + display_every=params.display_every, + approximate_vram_usage=params.approximate_vram_usage, + display_scale=params.display_scale, + show_graphs=params.show_graphs, + show_palette=params.show_palette, + ) + + model.save_out( + i=i, + # img=img, + writer=writer, + OUTPATH=OUTPATH, + base_name=base_name, + save_every=params.save_every, + file_namespace=params.file_namespace, + backups=params.backups, + ) + + # animate + ################ + ## TO DO: attach T as a class attribute + t = (i - params.pre_animation_steps) / ( + params.steps_per_frame * params.frames_per_second + ) + if self.audio_parser is None: + set_t(t, 0, 0, 0) + # set_t(t) # this won't need to be a thing with `t`` attached to the class + if i >= params.pre_animation_steps: + if self.audio_parser is not None: + lo, mid, hi = self.audio_parser.get_params(t) + logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") + set_t(t, lo, mid, hi) + # next_step_pil = None + if (i - params.pre_animation_steps) % params.steps_per_frame == 0: + logger.debug(f"Time: {t:.4f} seconds") + # update_rotoscopers( + ROTOSCOPERS.update_rotoscopers( + ((i - params.pre_animation_steps) // params.steps_per_frame + 1) + * params.frame_stride + ) + if params.reset_lr_each_frame: + model.set_optim(None) + + if params.animation_mode == "2D": + + next_step_pil = animate_2d( + translate_y=params.translate_y, + translate_x=params.translate_x, + rotate_2d=params.rotate_2d, + zoom_x_2d=params.zoom_x_2d, + zoom_y_2d=params.zoom_y_2d, + infill_mode=params.infill_mode, + sampling_mode=params.sampling_mode, + writer=writer, + i=i, + img=img, + t=t, # just here for logging + ) + + ########################### + elif params.animation_mode == "3D": + try: + im + except NameError: + im = img.decode_image() + with vram_usage_mode("Optical Flow Loss"): + # zoom_3d -> rename to animate_3d or transform_3d + flow, next_step_pil = zoom_3d( + img, + ( + params.translate_x, + params.translate_y, + params.translate_z_3d, + ), + params.rotate_3d, + params.field_of_view, + params.near_plane, + params.far_plane, + border_mode=params.infill_mode, + sampling_mode=params.sampling_mode, + stabilize=params.lock_camera, + ) + freeze_vram_usage() + + for optical_flow in optical_flows: + optical_flow.set_last_step(im) + optical_flow.set_target_flow(flow) + optical_flow.set_enabled(True) + + elif params.animation_mode == "Video Source": + + flow_im, next_step_pil = animate_video_source( + i=i, + img=img, + video_frames=video_frames, + optical_flows=optical_flows, + base_name=base_name, + pre_animation_steps=params.pre_animation_steps, + frame_stride=params.frame_stride, + steps_per_frame=params.steps_per_frame, + file_namespace=params.file_namespace, + reencode_each_frame=params.reencode_each_frame, + lock_palette=params.lock_palette, + save_every=params.save_every, + infill_mode=params.infill_mode, + sampling_mode=params.sampling_mode, + ) + + if params.animation_mode != "off": + try: + for aug in stabilization_augs: + aug.set_comp(next_step_pil) + aug.set_enabled(True) + if last_frame_semantic is not None: + last_frame_semantic.set_image(embedder, next_step_pil) + last_frame_semantic.set_enabled(True) + for aug in init_augs: + aug.set_enabled(False) + if semantic_init_prompt is not None: + semantic_init_prompt.set_enabled(False) + except UnboundLocalError: + logger.critical( + "\n\n-----< PYTTI-TOOLS > ------" + "If you are seeing this error, it might mean " + "you are using an option that expects you have " + 
"provided an init_image or video_file.\n\nIf you " + "think you are seeing this message in error, please " + "file an issue here: " + "https://github.com/pytti-tools/pytti-core/issues/new" + "-----< PYTTI-TOOLS > ------\n\n" + ) + raise +>>>>>>> 57553a3 (feat: initial rough audio parsing logic) diff --git a/src/pytti/eval_tools.py b/src/pytti/eval_tools.py index 85cfac1..a0741f8 100644 --- a/src/pytti/eval_tools.py +++ b/src/pytti/eval_tools.py @@ -5,6 +5,9 @@ math_env = None global_t = 0 +global_fLo = 0 +global_fMid = 0 +global_fHi = 0 eval_memo = {} @@ -27,6 +30,9 @@ def parametric_eval(string, **vals): ) math_env.update(vals) math_env["t"] = global_t + math_env["fLo"] = global_fLo + math_env["fMid"] = global_fMid + math_env["fHi"] = global_fHi try: output = eval(string, math_env) except SyntaxError as e: @@ -37,9 +43,12 @@ def parametric_eval(string, **vals): return string -def set_t(t): - global global_t, eval_memo +def set_t(t, fLo, fMid, fHi): + global global_t, global_fLo, global_fMid, global_fHi, eval_memo global_t = t + global_fLo = fLo + global_fMid = fMid + global_fHi = fHi eval_memo = {} From ee20b1ef70ede750c548da51f99f2cdf0a6d9377 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 25 Mar 2022 08:29:31 +0100 Subject: [PATCH 03/20] fix: some math edge cases --- src/pytti/AudioParse.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index d9d2ca3..2a33d0f 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -86,14 +86,20 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: high_bucket += fft[i] high_count += 1 # mean energy per bucket - low_bucket = low_bucket / low_count - mid_bucket = mid_bucket / mid_count - high_bucket = high_bucket / high_count + if low_count > 0 and mid_count > 0 and high_count > 0: + low_bucket = low_bucket / low_count + mid_bucket = mid_bucket / mid_count + high_bucket = high_bucket / high_count + else: + return (0,0,0) # normalize to [0,1] range max_val = np.max(fft) - low_bucket = low_bucket / max_val - mid_bucket = mid_bucket / max_val - high_bucket = high_bucket / max_val - return (low_bucket, mid_bucket, high_bucket) + if max_val > 0: + low_bucket = low_bucket / max_val + mid_bucket = mid_bucket / max_val + high_bucket = high_bucket / max_val + else: + return (0,0,0) + return (float(low_bucket), float(mid_bucket), float(high_bucket)) else: return (0, 0, 0) \ No newline at end of file From 8a1ff32bd00784ca8a6f15d67a7584272ccb3b6b Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 25 Mar 2022 17:24:12 +0100 Subject: [PATCH 04/20] fix: add debug log for audio 0 vectors --- src/pytti/AudioParse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index 2a33d0f..f79ebf3 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -54,6 +54,7 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: if len(window_samples) < self.window_size: # audio input file has likely ended # TODO could round down to the next pow2 then do it anyway. not a critical case though IMO. + logger.debug(f"Warning: sample offset is out of range at time offset {t+self.input_audio_offset}s. 
Returning 0 vector") return (0, 0, 0) # fade-in / fade-out window window_samples = window_samples * np.hamming(len(window_samples)) From 2c5c7c25dc14ad2870a07833cb2f39de75201ff1 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 25 Mar 2022 17:33:48 +0100 Subject: [PATCH 05/20] fix: add missing warmup config params --- src/pytti/assets/default.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pytti/assets/default.yaml b/src/pytti/assets/default.yaml index a4f9630..b1cc16a 100644 --- a/src/pytti/assets/default.yaml +++ b/src/pytti/assets/default.yaml @@ -81,6 +81,11 @@ far_plane: 10000 ###################### ### Induced Motion ### ###################### +input_audio: "" +input_audio_offset: 0 +input_audio_window_size: 8192 +input_audio_band_split_low_medium: 150 +input_audio_band_split_medium_high: 300 pre_animation_steps: 100 lock_camera: true From affdaaa44be4b008258860cb0a118b6e7929bae4 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 26 Mar 2022 13:17:28 +0100 Subject: [PATCH 06/20] fix: run fft analysis only when images are actually saved --- src/pytti/ImageGuide.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index d52a3f9..f506e10 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -430,13 +430,14 @@ def update(self, model, img, i, stage_i, *args, **kwargs): set_t(t, 0, 0, 0) # set_t(t) # this won't need to be a thing with `t`` attached to the class if i >= params.pre_animation_steps: - if self.audio_parser is not None: - lo, mid, hi = self.audio_parser.get_params(t) - logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") - set_t(t, lo, mid, hi) # next_step_pil = None if (i - params.pre_animation_steps) % params.steps_per_frame == 0: - logger.debug(f"Time: {t:.4f} seconds") + if self.audio_parser is not None: + lo, mid, hi = self.audio_parser.get_params(t) + logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") + set_t(t, lo, mid, hi) + else: + logger.debug(f"Time: {t:.4f} seconds") # update_rotoscopers( ROTOSCOPERS.update_rotoscopers( ((i - params.pre_animation_steps) // params.steps_per_frame + 1) From 3eea43122dae2f98d60d494e5b36cdc3ae883a9e Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 26 Mar 2022 14:04:06 +0100 Subject: [PATCH 07/20] fix: try adding some rudimentary error handling --- src/pytti/AudioParse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index f79ebf3..96b19f0 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -30,7 +30,8 @@ def __init__( self.audio_samples = np.append(self.audio_samples, np.frombuffer(buf, dtype=np.int16)) if len(buf) < SAMPLERATE: break - + if len(self.audio_samples) < 0: + raise RuntimeError("Audio samples are empty, assuming load failed") logger.debug(f"initialized audio file {params.input_audio}") self.input_audio_offset = params.input_audio_offset self.window_size = params.input_audio_window_size From 97aaa06748ea2d3d83bc02ae4124960b8bd5b85e Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 26 Mar 2022 14:05:13 +0100 Subject: [PATCH 08/20] fix: add sample count to log statement as well --- src/pytti/AudioParse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index 96b19f0..c123c16 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -32,7 +32,7 @@ 
def __init__( break if len(self.audio_samples) < 0: raise RuntimeError("Audio samples are empty, assuming load failed") - logger.debug(f"initialized audio file {params.input_audio}") + logger.debug(f"initialized audio file {params.input_audio}, samples read: {len(self.audio_samples)}") self.input_audio_offset = params.input_audio_offset self.window_size = params.input_audio_window_size self.low_cutoff = params.input_audio_band_split_low_medium From d0edc43f9b2ade1aae06775b5d75493174ee4607 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 26 Mar 2022 14:19:19 +0100 Subject: [PATCH 09/20] fix: more warning logs for debugging --- src/pytti/AudioParse.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index c123c16..bf48869 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -50,6 +50,7 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: """ # Get the point in time (sample-offset) in the track in seconds based on sample-rate sample_offset = int(t * SAMPLERATE + self.input_audio_offset * SAMPLERATE) + logger.debug(f"Analyzing audio at {self.input_audio_offset+t}s") if sample_offset < len(self.audio_samples): window_samples = self.audio_samples[sample_offset:sample_offset+self.window_size] if len(window_samples) < self.window_size: @@ -93,6 +94,7 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: mid_bucket = mid_bucket / mid_count high_bucket = high_bucket / high_count else: + logger.debug(f"Warning: There were empty buckets in the audio frequency analysis. Returning 0 vector") return (0,0,0) # normalize to [0,1] range max_val = np.max(fft) @@ -101,7 +103,10 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: mid_bucket = mid_bucket / max_val high_bucket = high_bucket / max_val else: + logger.debug(f"Warning: Max val was 0 in the audio frequency analysis. Returning 0 vector") return (0,0,0) return (float(low_bucket), float(mid_bucket), float(high_bucket)) else: + + logger.debug(f"Warning: Audio input has ended. 
Returning 0 vector") return (0, 0, 0) \ No newline at end of file From 5bc02f4b07ef51e04be17523296d54e0633086a4 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 2 Apr 2022 01:57:25 +0200 Subject: [PATCH 10/20] feat: start implementing bandpass filters instead of fft+ frequency split --- src/pytti/AudioParse.py | 184 +++++++++++++++----------- src/pytti/ImageGuide.py | 2 +- src/pytti/assets/default.yaml | 7 +- src/pytti/config/structured_config.py | 10 +- 4 files changed, 120 insertions(+), 83 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index bf48869..68319ac 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -2,6 +2,7 @@ import typing import subprocess from loguru import logger +from scipy.signal import butter, sosfilt, sosfreqz SAMPLERATE=44100 @@ -12,34 +13,35 @@ class SpectralAudioParser: """ def __init__( self, - params=None + input_audio, + offset, + window_size, + filters ): - if params.input_audio: - pipe = subprocess.Popen(['ffmpeg', '-i', params.input_audio, - '-f', 's16le', - '-acodec', 'pcm_s16le', - '-ar', str(SAMPLERATE), - '-ac', '1', - '-'], stdout=subprocess.PIPE, bufsize=10**8) + pipe = subprocess.Popen(['ffmpeg', '-i', input_audio, + '-f', 's16le', + '-acodec', 'pcm_s16le', + '-ar', str(SAMPLERATE), + '-ac', '1', + '-'], stdout=subprocess.PIPE, bufsize=10**8) - self.audio_samples = np.array([], dtype=np.int16) - - # read the audio file from the pipe in 0.5s blocks (2 bytes per sample) - while True: - buf = pipe.stdout.read(SAMPLERATE) - self.audio_samples = np.append(self.audio_samples, np.frombuffer(buf, dtype=np.int16)) - if len(buf) < SAMPLERATE: - break - if len(self.audio_samples) < 0: - raise RuntimeError("Audio samples are empty, assuming load failed") - logger.debug(f"initialized audio file {params.input_audio}, samples read: {len(self.audio_samples)}") - self.input_audio_offset = params.input_audio_offset - self.window_size = params.input_audio_window_size - self.low_cutoff = params.input_audio_band_split_low_medium - self.mid_cutoff = params.input_audio_band_split_medium_high - # pink noise normalization blatantly stolen from https://github.com/aiXander/Realtime_PyAudio_FFT/blob/275c8b1fc268ac946470b0d7a80de56eb2212b58/src/stream_analyzer.py#L107 - self.fftx = np.arange(int(self.window_size/2), dtype=float) * SAMPLERATE / self.window_size - self.power_normalization_coefficients = np.logspace(np.log2(1), np.log2(np.log2(SAMPLERATE/2)), len(self.fftx), endpoint=True, base=2, dtype=None) + self.audio_samples = np.array([], dtype=np.int16) + + # read the audio file from the pipe in 0.5s blocks (2 bytes per sample) + while True: + buf = pipe.stdout.read(SAMPLERATE) + self.audio_samples = np.append(self.audio_samples, np.frombuffer(buf, dtype=np.int16)) + if len(buf) < SAMPLERATE: + break + if len(self.audio_samples) < 0: + raise RuntimeError("Audio samples are empty, assuming load failed") + logger.debug(f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}") + self.offset = offset + self.window_size = window_size + self.filters = filters + # pink noise normalization blatantly stolen from https://github.com/aiXander/Realtime_PyAudio_FFT/blob/275c8b1fc268ac946470b0d7a80de56eb2212b58/src/stream_analyzer.py#L107 + self.fftx = np.arange(int(self.window_size/2), dtype=float) * SAMPLERATE / self.window_size + self.power_normalization_coefficients = np.logspace(np.log2(1), np.log2(np.log2(SAMPLERATE/2)), len(self.fftx), endpoint=True, base=2, dtype=None) @@ -49,64 +51,90 @@ def 
get_params(self, t) -> typing.Tuple[float, float, float]: Amplitude/energy parameters are normalized into the [0,1] range. """ # Get the point in time (sample-offset) in the track in seconds based on sample-rate - sample_offset = int(t * SAMPLERATE + self.input_audio_offset * SAMPLERATE) - logger.debug(f"Analyzing audio at {self.input_audio_offset+t}s") + sample_offset = int(t * SAMPLERATE + self.offset * SAMPLERATE) + logger.debug(f"Analyzing audio at {self.offset+t}s") if sample_offset < len(self.audio_samples): window_samples = self.audio_samples[sample_offset:sample_offset+self.window_size] if len(window_samples) < self.window_size: # audio input file has likely ended - # TODO could round down to the next pow2 then do it anyway. not a critical case though IMO. - logger.debug(f"Warning: sample offset is out of range at time offset {t+self.input_audio_offset}s. Returning 0 vector") + # TODO could round down to the next lower pow2 then do it anyway. not a critical case though IMO. + logger.debug(f"Warning: sample offset is out of range at time offset {t+self.offset}s. Returning 0 vector") return (0, 0, 0) - # fade-in / fade-out window + + # fade-in / fade-out window to taper off the signal window_samples = window_samples * np.hamming(len(window_samples)) - fft = np.fft.fft(window_samples) - # summing together the real and imaginary components, i think(??) - left, right = np.split(np.abs(fft), 2) - fft = np.add(left, right[::-1]) + return bp_tuple(t, window_samples, self.filters) + #return fft_tuple(t) + else: + logger.debug(f"Warning: Audio input has ended. Returning 0 vector") + return (0, 0, 0) - # pink noise adjust - fft = fft * self.power_normalization_coefficients +def butter_bandpass(lowcut, highcut, fs, order=5): + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + sos = butter(order, [low, high], analog=False, btype='band', output='sos') + return sos - freq_buckets = np.fft.fftfreq(self.window_size, 1 / SAMPLERATE) - # collect energy for each frequency band - # TODO: this could probably be done in a much nicer way with bandpass filters somehow... not sure on the correct arithmetic though - low_bucket = 0 - low_count = 0 - mid_bucket = 0 - mid_count = 0 - high_bucket = 0 - high_count = 0 - for i in range(len(fft)): - freq = self.fftx[i] - if freq < self.low_cutoff: - low_bucket += fft[i] - low_count += 1 - elif freq < self.mid_cutoff: - mid_bucket += fft[i] - mid_count += 1 - else: - high_bucket += fft[i] - high_count += 1 - # mean energy per bucket - if low_count > 0 and mid_count > 0 and high_count > 0: - low_bucket = low_bucket / low_count - mid_bucket = mid_bucket / mid_count - high_bucket = high_bucket / high_count - else: - logger.debug(f"Warning: There were empty buckets in the audio frequency analysis. Returning 0 vector") - return (0,0,0) - # normalize to [0,1] range - max_val = np.max(fft) - if max_val > 0: - low_bucket = low_bucket / max_val - mid_bucket = mid_bucket / max_val - high_bucket = high_bucket / max_val - else: - logger.debug(f"Warning: Max val was 0 in the audio frequency analysis. Returning 0 vector") - return (0,0,0) - return (float(low_bucket), float(mid_bucket), float(high_bucket)) - else: +def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): + sos = butter_bandpass(lowcut, highcut, fs, order=order) + y = sosfilt(sos, data) + return y - logger.debug(f"Warning: Audio input has ended. 
Returning 0 vector") - return (0, 0, 0) \ No newline at end of file +def bp_tuple(t, window_samples, filters) -> typing.Dict[str, float]: + for filter in filters: + offset = filter.f_width/2 + lower = filter.f_center - offset + upper = filter.f_center + offset + filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=filter.order) + # Normalize from signed 16-bit max value to 0..1 range + val = np.max(np.abs(filtered)) / 32768 + return (val, 0, 0) + +def fft_tuple(t, window_samples) -> typing.Tuple[float, float, float]: + fft = np.fft.fft(window_samples) + # summing together the real and imaginary components, i think(??) + left, right = np.split(np.abs(fft), 2) + fft = np.add(left, right[::-1]) + + # pink noise adjust + fft = fft * self.power_normalization_coefficients + + freq_buckets = np.fft.fftfreq(self.window_size, 1 / SAMPLERATE) + # collect energy for each frequency band + # TODO: this could probably be done in a much nicer way with bandpass filters somehow... not sure on the correct arithmetic though + low_bucket = 0 + low_count = 0 + mid_bucket = 0 + mid_count = 0 + high_bucket = 0 + high_count = 0 + for i in range(len(fft)): + freq = self.fftx[i] + if freq < self.low_cutoff: + low_bucket += fft[i] + low_count += 1 + elif freq < self.mid_cutoff: + mid_bucket += fft[i] + mid_count += 1 + else: + high_bucket += fft[i] + high_count += 1 + # mean energy per bucket + if low_count > 0 and mid_count > 0 and high_count > 0: + low_bucket = low_bucket / low_count + mid_bucket = mid_bucket / mid_count + high_bucket = high_bucket / high_count + else: + logger.debug(f"Warning: There were empty buckets in the audio frequency analysis. Returning 0 vector") + return (0,0,0) + # normalize to [0,1] range + max_val = np.max(fft) + if max_val > 0: + low_bucket = low_bucket / max_val + mid_bucket = mid_bucket / max_val + high_bucket = high_bucket / max_val + else: + logger.debug(f"Warning: Max val was 0 in the audio frequency analysis. Returning 0 vector") + return (0,0,0) + return (float(low_bucket), float(mid_bucket), float(high_bucket)) \ No newline at end of file diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index f506e10..796fba3 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -110,7 +110,7 @@ def __init__( self.dataframe = [] if params.input_audio: - self.audio_parser = SpectralAudioParser(params) + self.audio_parser = SpectralAudioParser(params.input_audio, params.offset, params.window_size, params.filters) else: self.audio_parser = None diff --git a/src/pytti/assets/default.yaml b/src/pytti/assets/default.yaml index b1cc16a..f2a801e 100644 --- a/src/pytti/assets/default.yaml +++ b/src/pytti/assets/default.yaml @@ -84,8 +84,11 @@ far_plane: 10000 input_audio: "" input_audio_offset: 0 input_audio_window_size: 8192 -input_audio_band_split_low_medium: 150 -input_audio_band_split_medium_high: 300 +input_audio_filters: [] +# - variable_name: fLo +# f_center: 60 +# f_width: 30 +# order: 5 pre_animation_steps: 100 lock_camera: true diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index f77a5eb..2e0c8f2 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -16,6 +16,13 @@ def check_input_against_list(attribute, value, valid_values): ) +@define(auto_attribs=True) +class AudioFilterConfig: + variable_name: str = "???" + f_center: int = "???" + f_width: int = "???" 
+ order: int = 5 + @define(auto_attribs=True) class ConfigSchema: ############# @@ -103,8 +110,7 @@ def check(self, attribute, value): input_audio: str = "" input_audio_offset: float = 0 input_audio_window_size: int = 1024 - input_audio_band_split_low_medium: int = 500 - input_audio_band_split_medium_high: int = 3500 + input_audio_filters: AudioFilterConfig = None # _2d and _3d only apply to those animation modes From c2323866a89ab619121fd1be110665c172d8f3be Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sun, 3 Apr 2022 13:48:10 +0200 Subject: [PATCH 11/20] feat: completely refactor fft / window_size /band-splitting based impl with band-pass filters, window size based on FPS --- src/pytti/AudioParse.py | 121 ++++++++++++++-------------------- src/pytti/ImageGuide.py | 6 +- src/pytti/assets/default.yaml | 1 - 3 files changed, 54 insertions(+), 74 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index 68319ac..7fc623a 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -8,16 +8,19 @@ class SpectralAudioParser: """ - Audio Parser reads a given input file, scans along it and parses its spectrum using FFT. - The FFT output is split into three bands (low,mid,high), the (average) amplitude of which is then returned for use in animation functions. + reads a given input file, scans along it and parses the amplitude in selected bands using butterworth bandpass filters. + the amplitude is normalized into the 0..1 range for easier use in transformation functions. """ def __init__( self, input_audio, offset, - window_size, + frames_per_second, filters ): + if len(filters) < 1: + raise RuntimeError("When using input_audio, at least 1 filter must be specified") + pipe = subprocess.Popen(['ffmpeg', '-i', input_audio, '-f', 's16le', '-acodec', 'pcm_s16le', @@ -35,17 +38,34 @@ def __init__( break if len(self.audio_samples) < 0: raise RuntimeError("Audio samples are empty, assuming load failed") - logger.debug(f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}") + self.duration = len(self.audio_samples) / SAMPLERATE + logger.debug(f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}, total duration: {self.duration}s") self.offset = offset - self.window_size = window_size + if offset > self.duration: + raise RuntimeError(f"Audio offset set at {offset}s but input audio is only {duration}s long") + # analyze all samples for the current frame + self.window_size = int(1/frames_per_second * SAMPLERATE) self.filters = filters - # pink noise normalization blatantly stolen from https://github.com/aiXander/Realtime_PyAudio_FFT/blob/275c8b1fc268ac946470b0d7a80de56eb2212b58/src/stream_analyzer.py#L107 - self.fftx = np.arange(int(self.window_size/2), dtype=float) * SAMPLERATE / self.window_size - self.power_normalization_coefficients = np.logspace(np.log2(1), np.log2(np.log2(SAMPLERATE/2)), len(self.fftx), endpoint=True, base=2, dtype=None) - + # parse band maxima first for normalizing the filtered signal to 0..1 at arbitrary points in the file later + # this initialization is a bit compute intensive, especially for higher fps numbers, but i couldn't find a cleaner way + # (band-passing the entire track instead of windows creates maxima that are way off, some filtering anomaly i don't understand...) 
+ steps = int((self.duration - self.offset) * frames_per_second) + interval = 1/frames_per_second + maxima = {} + time_steps = np.linspace(0, steps, num=steps) * interval + for t in time_steps: + sample_offset = int(t * SAMPLERATE) + cur_maxima = bp_filtered(self.audio_samples[sample_offset:sample_offset+self.window_size], filters) + for key in cur_maxima: + if key in maxima: + maxima[key] = max(maxima[key], cur_maxima[key]) + else: + maxima[key] = cur_maxima[key] + self.band_maxima = maxima + logger.debug(f"initialized band maxima for {len(filters)} filters: {self.band_maxima}") - def get_params(self, t) -> typing.Tuple[float, float, float]: + def get_params(self, t) -> typing.Dict[str, float]: """ Return the amplitude parameters at the given point in time t within the audio track, or 0 if the track has ended. Amplitude/energy parameters are normalized into the [0,1] range. @@ -58,22 +78,23 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: if len(window_samples) < self.window_size: # audio input file has likely ended # TODO could round down to the next lower pow2 then do it anyway. not a critical case though IMO. - logger.debug(f"Warning: sample offset is out of range at time offset {t+self.offset}s. Returning 0 vector") - return (0, 0, 0) - + logger.debug(f"Warning: sample offset is out of range at time offset {t+self.offset}s. Returning null result") + return {} # fade-in / fade-out window to taper off the signal - window_samples = window_samples * np.hamming(len(window_samples)) - return bp_tuple(t, window_samples, self.filters) - #return fft_tuple(t) + #window_samples = window_samples * np.hamming(len(window_samples)) + return bp_filtered_norm(window_samples, self.filters, self.band_maxima) else: - logger.debug(f"Warning: Audio input has ended. Returning 0 vector") - return (0, 0, 0) + logger.debug(f"Warning: Audio input has ended. Returning null result") + return {} + + def get_duration(self): + return self.duration def butter_bandpass(lowcut, highcut, fs, order=5): nyq = 0.5 * fs low = lowcut / nyq high = highcut / nyq - sos = butter(order, [low, high], analog=False, btype='band', output='sos') + sos = butter(order, [low, high], analog=False, btype='bandpass', output='sos') return sos def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): @@ -81,60 +102,20 @@ def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): y = sosfilt(sos, data) return y -def bp_tuple(t, window_samples, filters) -> typing.Dict[str, float]: + +def bp_filtered(window_samples, filters) -> typing.Dict[str, float]: + results = {} for filter in filters: offset = filter.f_width/2 lower = filter.f_center - offset upper = filter.f_center + offset filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=filter.order) - # Normalize from signed 16-bit max value to 0..1 range - val = np.max(np.abs(filtered)) / 32768 - return (val, 0, 0) + results[filter.variable_name] = np.max(np.abs(filtered)) + return results -def fft_tuple(t, window_samples) -> typing.Tuple[float, float, float]: - fft = np.fft.fft(window_samples) - # summing together the real and imaginary components, i think(??) - left, right = np.split(np.abs(fft), 2) - fft = np.add(left, right[::-1]) - - # pink noise adjust - fft = fft * self.power_normalization_coefficients - - freq_buckets = np.fft.fftfreq(self.window_size, 1 / SAMPLERATE) - # collect energy for each frequency band - # TODO: this could probably be done in a much nicer way with bandpass filters somehow... 
not sure on the correct arithmetic though - low_bucket = 0 - low_count = 0 - mid_bucket = 0 - mid_count = 0 - high_bucket = 0 - high_count = 0 - for i in range(len(fft)): - freq = self.fftx[i] - if freq < self.low_cutoff: - low_bucket += fft[i] - low_count += 1 - elif freq < self.mid_cutoff: - mid_bucket += fft[i] - mid_count += 1 - else: - high_bucket += fft[i] - high_count += 1 - # mean energy per bucket - if low_count > 0 and mid_count > 0 and high_count > 0: - low_bucket = low_bucket / low_count - mid_bucket = mid_bucket / mid_count - high_bucket = high_bucket / high_count - else: - logger.debug(f"Warning: There were empty buckets in the audio frequency analysis. Returning 0 vector") - return (0,0,0) - # normalize to [0,1] range - max_val = np.max(fft) - if max_val > 0: - low_bucket = low_bucket / max_val - mid_bucket = mid_bucket / max_val - high_bucket = high_bucket / max_val - else: - logger.debug(f"Warning: Max val was 0 in the audio frequency analysis. Returning 0 vector") - return (0,0,0) - return (float(low_bucket), float(mid_bucket), float(high_bucket)) \ No newline at end of file +def bp_filtered_norm(window_samples, filters, norm_factors) -> typing.Dict[str, float]: + results = bp_filtered(window_samples, filters) + for key in results: + # normalize + results[key] = results[key] / norm_factors[key] + return results \ No newline at end of file diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 796fba3..352a47a 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -110,7 +110,7 @@ def __init__( self.dataframe = [] if params.input_audio: - self.audio_parser = SpectralAudioParser(params.input_audio, params.offset, params.window_size, params.filters) + self.audio_parser = SpectralAudioParser(params.input_audio, params.offset, params.frames_per_second, params.filters) else: self.audio_parser = None @@ -433,9 +433,9 @@ def update(self, model, img, i, stage_i, *args, **kwargs): # next_step_pil = None if (i - params.pre_animation_steps) % params.steps_per_frame == 0: if self.audio_parser is not None: - lo, mid, hi = self.audio_parser.get_params(t) + band_dict = self.audio_parser.get_params(t) logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") - set_t(t, lo, mid, hi) + set_t(t, band_dict) else: logger.debug(f"Time: {t:.4f} seconds") # update_rotoscopers( diff --git a/src/pytti/assets/default.yaml b/src/pytti/assets/default.yaml index f2a801e..4556bb5 100644 --- a/src/pytti/assets/default.yaml +++ b/src/pytti/assets/default.yaml @@ -83,7 +83,6 @@ far_plane: 10000 ###################### input_audio: "" input_audio_offset: 0 -input_audio_window_size: 8192 input_audio_filters: [] # - variable_name: fLo # f_center: 60 From 21e5b3fe589f6e3811f058f96185abdb6d3ae087 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sun, 3 Apr 2022 16:46:56 +0200 Subject: [PATCH 12/20] refactor: remove window size config param, fix eval tooling for dict math env --- src/pytti/config/structured_config.py | 1 - src/pytti/eval_tools.py | 18 +++++++----------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 2e0c8f2..0290860 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -109,7 +109,6 @@ def check(self, attribute, value): input_audio: str = "" input_audio_offset: float = 0 - input_audio_window_size: int = 1024 input_audio_filters: AudioFilterConfig = None # _2d and _3d only apply to those 
animation modes diff --git a/src/pytti/eval_tools.py b/src/pytti/eval_tools.py index a0741f8..64b936e 100644 --- a/src/pytti/eval_tools.py +++ b/src/pytti/eval_tools.py @@ -5,9 +5,7 @@ math_env = None global_t = 0 -global_fLo = 0 -global_fMid = 0 -global_fHi = 0 +global_bands = {} eval_memo = {} @@ -30,9 +28,9 @@ def parametric_eval(string, **vals): ) math_env.update(vals) math_env["t"] = global_t - math_env["fLo"] = global_fLo - math_env["fMid"] = global_fMid - math_env["fHi"] = global_fHi + # TODO set envs from global bandpass dict values + for band in global_bands: + math_env[band] = global_bands[band] try: output = eval(string, math_env) except SyntaxError as e: @@ -43,12 +41,10 @@ def parametric_eval(string, **vals): return string -def set_t(t, fLo, fMid, fHi): - global global_t, global_fLo, global_fMid, global_fHi, eval_memo +def set_t(t, band_dict): + global global_t, global_bands, eval_memo global_t = t - global_fLo = fLo - global_fMid = fMid - global_fHi = fHi + global_bands = band_dict eval_memo = {} From 139eaaafb5606cecd6d9effcc8549e505caea4b0 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 8 Apr 2022 23:35:20 +0200 Subject: [PATCH 13/20] style: cleanup, remove useless todos, formatting --- src/pytti/AudioParse.py | 70 ++++++++++++++++++++++------------------- src/pytti/eval_tools.py | 11 +++++-- 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index 7fc623a..fe94e58 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -4,32 +4,34 @@ from loguru import logger from scipy.signal import butter, sosfilt, sosfreqz -SAMPLERATE=44100 +SAMPLERATE = 44100 + class SpectralAudioParser: """ reads a given input file, scans along it and parses the amplitude in selected bands using butterworth bandpass filters. the amplitude is normalized into the 0..1 range for easier use in transformation functions. 
""" + def __init__( - self, - input_audio, - offset, - frames_per_second, - filters - ): + self, + input_audio, + offset, + frames_per_second, + filters + ): if len(filters) < 1: raise RuntimeError("When using input_audio, at least 1 filter must be specified") pipe = subprocess.Popen(['ffmpeg', '-i', input_audio, - '-f', 's16le', - '-acodec', 'pcm_s16le', - '-ar', str(SAMPLERATE), - '-ac', '1', - '-'], stdout=subprocess.PIPE, bufsize=10**8) + '-f', 's16le', + '-acodec', 'pcm_s16le', + '-ar', str(SAMPLERATE), + '-ac', '1', + '-'], stdout=subprocess.PIPE, bufsize=10 ** 8) self.audio_samples = np.array([], dtype=np.int16) - + # read the audio file from the pipe in 0.5s blocks (2 bytes per sample) while True: buf = pipe.stdout.read(SAMPLERATE) @@ -39,24 +41,25 @@ def __init__( if len(self.audio_samples) < 0: raise RuntimeError("Audio samples are empty, assuming load failed") self.duration = len(self.audio_samples) / SAMPLERATE - logger.debug(f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}, total duration: {self.duration}s") + logger.debug( + f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}, total duration: {self.duration}s") self.offset = offset if offset > self.duration: raise RuntimeError(f"Audio offset set at {offset}s but input audio is only {duration}s long") # analyze all samples for the current frame - self.window_size = int(1/frames_per_second * SAMPLERATE) + self.window_size = int(1 / frames_per_second * SAMPLERATE) self.filters = filters # parse band maxima first for normalizing the filtered signal to 0..1 at arbitrary points in the file later # this initialization is a bit compute intensive, especially for higher fps numbers, but i couldn't find a cleaner way # (band-passing the entire track instead of windows creates maxima that are way off, some filtering anomaly i don't understand...) steps = int((self.duration - self.offset) * frames_per_second) - interval = 1/frames_per_second + interval = 1 / frames_per_second maxima = {} time_steps = np.linspace(0, steps, num=steps) * interval for t in time_steps: sample_offset = int(t * SAMPLERATE) - cur_maxima = bp_filtered(self.audio_samples[sample_offset:sample_offset+self.window_size], filters) + cur_maxima = bp_filtered(self.audio_samples[sample_offset:sample_offset + self.window_size], filters) for key in cur_maxima: if key in maxima: maxima[key] = max(maxima[key], cur_maxima[key]) @@ -72,16 +75,14 @@ def get_params(self, t) -> typing.Dict[str, float]: """ # Get the point in time (sample-offset) in the track in seconds based on sample-rate sample_offset = int(t * SAMPLERATE + self.offset * SAMPLERATE) - logger.debug(f"Analyzing audio at {self.offset+t}s") + logger.debug(f"Analyzing audio at {self.offset + t}s") if sample_offset < len(self.audio_samples): - window_samples = self.audio_samples[sample_offset:sample_offset+self.window_size] + window_samples = self.audio_samples[sample_offset:sample_offset + self.window_size] if len(window_samples) < self.window_size: # audio input file has likely ended - # TODO could round down to the next lower pow2 then do it anyway. not a critical case though IMO. - logger.debug(f"Warning: sample offset is out of range at time offset {t+self.offset}s. Returning null result") + logger.debug( + f"Warning: sample offset is out of range at time offset {t + self.offset}s. 
Returning null result") return {} - # fade-in / fade-out window to taper off the signal - #window_samples = window_samples * np.hamming(len(window_samples)) return bp_filtered_norm(window_samples, self.filters, self.band_maxima) else: logger.debug(f"Warning: Audio input has ended. Returning null result") @@ -90,32 +91,35 @@ def get_params(self, t) -> typing.Dict[str, float]: def get_duration(self): return self.duration + def butter_bandpass(lowcut, highcut, fs, order=5): - nyq = 0.5 * fs - low = lowcut / nyq - high = highcut / nyq - sos = butter(order, [low, high], analog=False, btype='bandpass', output='sos') - return sos + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + sos = butter(order, [low, high], analog=False, btype='bandpass', output='sos') + return sos + def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): - sos = butter_bandpass(lowcut, highcut, fs, order=order) - y = sosfilt(sos, data) - return y + sos = butter_bandpass(lowcut, highcut, fs, order=order) + y = sosfilt(sos, data) + return y def bp_filtered(window_samples, filters) -> typing.Dict[str, float]: results = {} for filter in filters: - offset = filter.f_width/2 + offset = filter.f_width / 2 lower = filter.f_center - offset upper = filter.f_center + offset filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=filter.order) results[filter.variable_name] = np.max(np.abs(filtered)) return results + def bp_filtered_norm(window_samples, filters, norm_factors) -> typing.Dict[str, float]: results = bp_filtered(window_samples, filters) for key in results: # normalize results[key] = results[key] / norm_factors[key] - return results \ No newline at end of file + return results diff --git a/src/pytti/eval_tools.py b/src/pytti/eval_tools.py index 64b936e..e637d06 100644 --- a/src/pytti/eval_tools.py +++ b/src/pytti/eval_tools.py @@ -6,6 +6,7 @@ math_env = None global_t = 0 global_bands = {} +global_bands_prev = {} eval_memo = {} @@ -28,9 +29,11 @@ def parametric_eval(string, **vals): ) math_env.update(vals) math_env["t"] = global_t - # TODO set envs from global bandpass dict values for band in global_bands: math_env[band] = global_bands[band] + if global_bands_prev: + for band in global_bands_prev: + math_env[f'{band}_prev'] = global_bands_prev[band] try: output = eval(string, math_env) except SyntaxError as e: @@ -42,8 +45,12 @@ def parametric_eval(string, **vals): def set_t(t, band_dict): - global global_t, global_bands, eval_memo + global global_t, global_bands, global_bands_prev, eval_memo global_t = t + if global_bands: + global_bands_prev = global_bands + else: + global_bands_prev = band_dict global_bands = band_dict eval_memo = {} From 52c58b172547d6ae6d364fddc568de5d8bf42f11 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 8 Apr 2022 23:43:15 +0200 Subject: [PATCH 14/20] fix: use correct parameter naming, properly print bands --- src/pytti/ImageGuide.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 352a47a..88d086d 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -109,8 +109,9 @@ def __init__( self.optimizer = optimizer self.dataframe = [] - if params.input_audio: - self.audio_parser = SpectralAudioParser(params.input_audio, params.offset, params.frames_per_second, params.filters) + if params.input_audio and params.input_audio_filters: + self.audio_parser = SpectralAudioParser(params.input_audio, params.input_audio_offset, + params.frames_per_second, 
params.input_audio_filters) else: self.audio_parser = None @@ -434,7 +435,7 @@ def update(self, model, img, i, stage_i, *args, **kwargs): if (i - params.pre_animation_steps) % params.steps_per_frame == 0: if self.audio_parser is not None: band_dict = self.audio_parser.get_params(t) - logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") + logger.debug(f"Time: {t:.4f} seconds, audio params: {band_dict}") set_t(t, band_dict) else: logger.debug(f"Time: {t:.4f} seconds") From 1688ab4d6950356fc816fc1bf915a1dfb6f66330 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Mon, 18 Apr 2022 19:59:39 +0200 Subject: [PATCH 15/20] fix: param must be dict instead of tuple now --- src/pytti/ImageGuide.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 88d086d..25dfcbe 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -428,7 +428,7 @@ def update(self, model, img, i, stage_i, *args, **kwargs): params.steps_per_frame * params.frames_per_second ) if self.audio_parser is None: - set_t(t, 0, 0, 0) + set_t(t, {}) # set_t(t) # this won't need to be a thing with `t`` attached to the class if i >= params.pre_animation_steps: # next_step_pil = None From 548edba1bf318cdf3e03c245b9bb0a3954850bec Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:03:17 -0700 Subject: [PATCH 16/20] fixed dangling merge conflict --- src/pytti/ImageGuide.py | 172 ++------------------------------------- src/pytti/update_func.py | 20 ++++- 2 files changed, 25 insertions(+), 167 deletions(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 25dfcbe..2266223 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -110,8 +110,12 @@ def __init__( self.dataframe = [] if params.input_audio and params.input_audio_filters: - self.audio_parser = SpectralAudioParser(params.input_audio, params.input_audio_offset, - params.frames_per_second, params.input_audio_filters) + self.audio_parser = SpectralAudioParser( + params.input_audio, + params.input_audio_offset, + params.frames_per_second, + params.input_audio_filters, + ) else: self.audio_parser = None @@ -372,168 +376,4 @@ def update(self, model, img, i, stage_i, *args, **kwargs): """ update hook called ever step """ -<<<<<<< HEAD pass -======= - # logger.debug("model.update called") - - # ... I have regrets. - params = self.params - writer = self.writer - OUTPATH = self.OUTPATH - base_name = self.base_name - fig = self.fig - axs = self.axs - video_frames = self.video_frames - optical_flows = self.optical_flows - stabilization_augs = self.stabilization_augs - last_frame_semantic = self.last_frame_semantic - semantic_init_prompt = self.semantic_init_prompt - init_augs = self.init_augs - - model = self - img = self.image_rep - embedder = self.embedder - - model.report_out( - i=i, - stage_i=stage_i, - # model=model, - writer=writer, - fig=fig, # default to None... - axs=axs, # default to None... 
- clear_every=params.clear_every, - display_every=params.display_every, - approximate_vram_usage=params.approximate_vram_usage, - display_scale=params.display_scale, - show_graphs=params.show_graphs, - show_palette=params.show_palette, - ) - - model.save_out( - i=i, - # img=img, - writer=writer, - OUTPATH=OUTPATH, - base_name=base_name, - save_every=params.save_every, - file_namespace=params.file_namespace, - backups=params.backups, - ) - - # animate - ################ - ## TO DO: attach T as a class attribute - t = (i - params.pre_animation_steps) / ( - params.steps_per_frame * params.frames_per_second - ) - if self.audio_parser is None: - set_t(t, {}) - # set_t(t) # this won't need to be a thing with `t`` attached to the class - if i >= params.pre_animation_steps: - # next_step_pil = None - if (i - params.pre_animation_steps) % params.steps_per_frame == 0: - if self.audio_parser is not None: - band_dict = self.audio_parser.get_params(t) - logger.debug(f"Time: {t:.4f} seconds, audio params: {band_dict}") - set_t(t, band_dict) - else: - logger.debug(f"Time: {t:.4f} seconds") - # update_rotoscopers( - ROTOSCOPERS.update_rotoscopers( - ((i - params.pre_animation_steps) // params.steps_per_frame + 1) - * params.frame_stride - ) - if params.reset_lr_each_frame: - model.set_optim(None) - - if params.animation_mode == "2D": - - next_step_pil = animate_2d( - translate_y=params.translate_y, - translate_x=params.translate_x, - rotate_2d=params.rotate_2d, - zoom_x_2d=params.zoom_x_2d, - zoom_y_2d=params.zoom_y_2d, - infill_mode=params.infill_mode, - sampling_mode=params.sampling_mode, - writer=writer, - i=i, - img=img, - t=t, # just here for logging - ) - - ########################### - elif params.animation_mode == "3D": - try: - im - except NameError: - im = img.decode_image() - with vram_usage_mode("Optical Flow Loss"): - # zoom_3d -> rename to animate_3d or transform_3d - flow, next_step_pil = zoom_3d( - img, - ( - params.translate_x, - params.translate_y, - params.translate_z_3d, - ), - params.rotate_3d, - params.field_of_view, - params.near_plane, - params.far_plane, - border_mode=params.infill_mode, - sampling_mode=params.sampling_mode, - stabilize=params.lock_camera, - ) - freeze_vram_usage() - - for optical_flow in optical_flows: - optical_flow.set_last_step(im) - optical_flow.set_target_flow(flow) - optical_flow.set_enabled(True) - - elif params.animation_mode == "Video Source": - - flow_im, next_step_pil = animate_video_source( - i=i, - img=img, - video_frames=video_frames, - optical_flows=optical_flows, - base_name=base_name, - pre_animation_steps=params.pre_animation_steps, - frame_stride=params.frame_stride, - steps_per_frame=params.steps_per_frame, - file_namespace=params.file_namespace, - reencode_each_frame=params.reencode_each_frame, - lock_palette=params.lock_palette, - save_every=params.save_every, - infill_mode=params.infill_mode, - sampling_mode=params.sampling_mode, - ) - - if params.animation_mode != "off": - try: - for aug in stabilization_augs: - aug.set_comp(next_step_pil) - aug.set_enabled(True) - if last_frame_semantic is not None: - last_frame_semantic.set_image(embedder, next_step_pil) - last_frame_semantic.set_enabled(True) - for aug in init_augs: - aug.set_enabled(False) - if semantic_init_prompt is not None: - semantic_init_prompt.set_enabled(False) - except UnboundLocalError: - logger.critical( - "\n\n-----< PYTTI-TOOLS > ------" - "If you are seeing this error, it might mean " - "you are using an option that expects you have " - "provided an init_image or 
video_file.\n\nIf you " - "think you are seeing this message in error, please " - "file an issue here: " - "https://github.com/pytti-tools/pytti-core/issues/new" - "-----< PYTTI-TOOLS > ------\n\n" - ) - raise ->>>>>>> 57553a3 (feat: initial rough audio parsing logic) diff --git a/src/pytti/update_func.py b/src/pytti/update_func.py index 1f25736..bb7fe62 100644 --- a/src/pytti/update_func.py +++ b/src/pytti/update_func.py @@ -190,10 +190,28 @@ def save_out( t = (i - params.pre_animation_steps) / ( params.steps_per_frame * params.frames_per_second ) - set_t(t) + set_t(t, {}) if i >= params.pre_animation_steps: if (i - params.pre_animation_steps) % params.steps_per_frame == 0: logger.debug(f"Time: {t:.4f} seconds") + + # Audio Reactivity ############ + if model.audio_parser is None: + set_t(t, {}) + # set_t(t) # this won't need to be a thing with `t`` attached to the class + if i >= params.pre_animation_steps: + # next_step_pil = None + if (i - params.pre_animation_steps) % params.steps_per_frame == 0: + if model.audio_parser is not None: + band_dict = model.audio_parser.get_params(t) + logger.debug( + f"Time: {t:.4f} seconds, audio params: {band_dict}" + ) + set_t(t, band_dict) + else: + logger.debug(f"Time: {t:.4f} seconds") + ############################### + update_rotoscopers( ((i - params.pre_animation_steps) // params.steps_per_frame + 1) * params.frame_stride From ba63ea97d0865c7cfb73e974640a57e3b0ca5f79 Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:07:15 -0700 Subject: [PATCH 17/20] audio filter config needs to be optional. --- src/pytti/config/structured_config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 0290860..54662b1 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -18,11 +18,12 @@ def check_input_against_list(attribute, value, valid_values): @define(auto_attribs=True) class AudioFilterConfig: - variable_name: str = "???" - f_center: int = "???" - f_width: int = "???" 
+ variable_name: str = "" + f_center: int = -1 + f_width: int = -1 order: int = 5 + @define(auto_attribs=True) class ConfigSchema: ############# From 7f59f1e6af94ba9a4ad6544c2a4ad0a6af59936f Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:20:14 -0700 Subject: [PATCH 18/20] typing.Optional for backwards compatibility --- src/pytti/config/structured_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 54662b1..9db6c43 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -108,9 +108,9 @@ def check(self, attribute, value): ### Induced Motion ### ###################### - input_audio: str = "" - input_audio_offset: float = 0 - input_audio_filters: AudioFilterConfig = None + input_audio: Optional[str] = "" + input_audio_offset: Optional[float] = 0 + input_audio_filters: Optional[AudioFilterConfig] = None # _2d and _3d only apply to those animation modes @@ -207,7 +207,7 @@ def check(self, attribute, value): backups: int = 0 show_graphs: bool = False approximate_vram_usage: bool = False - use_tensorboard: bool = False + use_tensorboard: Optional[bool] = False ##################################### From 92c5f4acf1f901f552e4d30f536c7bc8af5a1e5e Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:39:34 -0700 Subject: [PATCH 19/20] sadly, optional doesn't permit absent... --- tests/config/default.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/config/default.yaml b/tests/config/default.yaml index 79f9e4d..4aa9c6c 100644 --- a/tests/config/default.yaml +++ b/tests/config/default.yaml @@ -154,3 +154,11 @@ models_parent_dir: ${user_cache:} ########################## gradient_accumulation_steps: 1 + +################## + +# This shouldn't be necessary, but let's see if maybe it squashes test errors? 
+ +input_audio: "" +input_audio_offset: 0 +input_audio_filters: null From 80faea26ecd8dffc4d1a57dd4f49c352497ff96c Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:53:22 -0700 Subject: [PATCH 20/20] fixed minor integration errors --- src/pytti/ImageGuide.py | 20 +++++++++++--------- src/pytti/config/structured_config.py | 4 ++-- tests/test_animation_broken.py | 4 ++++ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 2266223..1855915 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -109,15 +109,17 @@ def __init__( self.optimizer = optimizer self.dataframe = [] - if params.input_audio and params.input_audio_filters: - self.audio_parser = SpectralAudioParser( - params.input_audio, - params.input_audio_offset, - params.frames_per_second, - params.input_audio_filters, - ) - else: - self.audio_parser = None + self.audio_parser = None + if params is not None: + if params.input_audio and params.input_audio_filters: + self.audio_parser = SpectralAudioParser( + params.input_audio, + params.input_audio_offset, + params.frames_per_second, + params.input_audio_filters, + ) + # else: + # self.audio_parser = None # self.null_update = null_update self.params = params diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 9db6c43..a574d5f 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -108,8 +108,8 @@ def check(self, attribute, value): ### Induced Motion ### ###################### - input_audio: Optional[str] = "" - input_audio_offset: Optional[float] = 0 + input_audio: str = "" + input_audio_offset: float = 0 input_audio_filters: Optional[AudioFilterConfig] = None # _2d and _3d only apply to those animation modes diff --git a/tests/test_animation_broken.py b/tests/test_animation_broken.py index 0bfbbec..8d36f1a 100644 --- a/tests/test_animation_broken.py +++ b/tests/test_animation_broken.py @@ -90,6 +90,10 @@ ########################## # adding new config items for backwards compatibility "use_tensorboard": True, # This should actually default to False. Prior to April2022, tb was non-optional + # Default null audio input parameters + "input_audio": "", + "input_audio_offset": 0, + "input_audio_filters": [], }
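
Usage sketch for the feature this patch series introduces (illustrative only — the audio file path, filter names, and animation expressions below are assumptions, not taken from the patches): each entry in input_audio_filters defines a Butterworth band-pass filter whose per-frame amplitude, normalized to [0, 1] against that band's maximum over the whole track, is exposed to the parametric animation expressions under its variable_name; the previous frame's value is also available as <variable_name>_prev. A config along these lines should drive 2D motion from the audio:

    input_audio: "audio/track.wav"      # hypothetical input file, decoded via ffmpeg
    input_audio_offset: 0
    input_audio_filters:
      - variable_name: fLo              # bass band (45-75 Hz), exposed as fLo / fLo_prev
        f_center: 60
        f_width: 30
        order: 5
      - variable_name: fHi              # upper band (3-5 kHz), exposed as fHi / fHi_prev
        f_center: 4000
        f_width: 2000
        order: 5

    animation_mode: "2D"
    zoom_x_2d: "1.001 + 0.02 * fLo"     # pulse the zoom with the bass amplitude
    zoom_y_2d: "1.001 + 0.02 * fLo"
    rotate_2d: "2 * fHi"                # rotate with the high band

If input_audio or input_audio_filters is left empty, no SpectralAudioParser is created and the animation expressions remain plain functions of t, as before.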