From 4a65d0b579b97e1ae64b8864402aee79617f5a6b Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Thu, 24 Mar 2022 00:10:09 +0100 Subject: [PATCH 01/20] feat: start implementing initial dumb audio parsing --- src/pytti/AudioParse.py | 29 +++++++++++++++++++++++++++ src/pytti/config/structured_config.py | 6 ++++++ 2 files changed, 35 insertions(+) create mode 100644 src/pytti/AudioParse.py diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py new file mode 100644 index 0000000..aab3531 --- /dev/null +++ b/src/pytti/AudioParse.py @@ -0,0 +1,29 @@ +import numpy as np +import audiofile +import typing + +class SpectralAudioParser: + """ + Audio Parser reads a given input file, scans along it and parses its spectrum using FFT. + The FFT output is split into three bands (low,mid,high), the (average) amplitude of which is then returned for use in animation functions. + """ + def __init__( + self, + params=None + ): + if params.input_audio: + self.audio_samples, self.sample_rate = audiofile.read(params.input_audio, offset=params.input_audio_offset, always_2d=True) + + def get_params(self, t) -> typing.Tuple[float, float, float]: + """ + Return the amplitude parameters at the given point in time t within the audio track, or 0 if the track has ended. + """ + # Get the point in time (sample-offset) in the track in seconds based on sample-rate + sample_offset = int(t * self.sample_rate) + if sample_offset < self.audio_samples.shape[0]: + # TODO: read back up on numpy array slicing, read [sample_offset, sample_offset+window_size] here + # read back up on fft window size parameters etc. + # read up on whether to use fft2 here or to sum the audio file on initialization into a mono signal first maybe + np.fft.fft2(self.audio_samples[:sample_offset]) + else: + return (0, 0, 0) \ No newline at end of file diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 6ef7ea3..f77a5eb 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -100,6 +100,12 @@ def check(self, attribute, value): ### Induced Motion ### ###################### + input_audio: str = "" + input_audio_offset: float = 0 + input_audio_window_size: int = 1024 + input_audio_band_split_low_medium: int = 500 + input_audio_band_split_medium_high: int = 3500 + # _2d and _3d only apply to those animation modes translate_x: str = "0" From 68545fc0c7930d179b61b1c6827cd1778350206d Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Thu, 24 Mar 2022 22:55:07 +0100 Subject: [PATCH 02/20] feat: initial rough audio parsing logic --- src/pytti/AudioParse.py | 86 ++++++++++++++++++-- src/pytti/ImageGuide.py | 169 ++++++++++++++++++++++++++++++++++++++++ src/pytti/eval_tools.py | 13 +++- 3 files changed, 258 insertions(+), 10 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index aab3531..d9d2ca3 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -1,6 +1,9 @@ import numpy as np -import audiofile import typing +import subprocess +from loguru import logger + +SAMPLERATE=44100 class SpectralAudioParser: """ @@ -12,18 +15,85 @@ def __init__( params=None ): if params.input_audio: - self.audio_samples, self.sample_rate = audiofile.read(params.input_audio, offset=params.input_audio_offset, always_2d=True) + pipe = subprocess.Popen(['ffmpeg', '-i', params.input_audio, + '-f', 's16le', + '-acodec', 'pcm_s16le', + '-ar', str(SAMPLERATE), + '-ac', '1', + '-'], stdout=subprocess.PIPE, bufsize=10**8) + + self.audio_samples = np.array([], 
dtype=np.int16) + + # read the audio file from the pipe in 0.5s blocks (2 bytes per sample) + while True: + buf = pipe.stdout.read(SAMPLERATE) + self.audio_samples = np.append(self.audio_samples, np.frombuffer(buf, dtype=np.int16)) + if len(buf) < SAMPLERATE: + break + + logger.debug(f"initialized audio file {params.input_audio}") + self.input_audio_offset = params.input_audio_offset + self.window_size = params.input_audio_window_size + self.low_cutoff = params.input_audio_band_split_low_medium + self.mid_cutoff = params.input_audio_band_split_medium_high + # pink noise normalization blatantly stolen from https://github.com/aiXander/Realtime_PyAudio_FFT/blob/275c8b1fc268ac946470b0d7a80de56eb2212b58/src/stream_analyzer.py#L107 + self.fftx = np.arange(int(self.window_size/2), dtype=float) * SAMPLERATE / self.window_size + self.power_normalization_coefficients = np.logspace(np.log2(1), np.log2(np.log2(SAMPLERATE/2)), len(self.fftx), endpoint=True, base=2, dtype=None) + + def get_params(self, t) -> typing.Tuple[float, float, float]: """ Return the amplitude parameters at the given point in time t within the audio track, or 0 if the track has ended. + Amplitude/energy parameters are normalized into the [0,1] range. """ # Get the point in time (sample-offset) in the track in seconds based on sample-rate - sample_offset = int(t * self.sample_rate) - if sample_offset < self.audio_samples.shape[0]: - # TODO: read back up on numpy array slicing, read [sample_offset, sample_offset+window_size] here - # read back up on fft window size parameters etc. - # read up on whether to use fft2 here or to sum the audio file on initialization into a mono signal first maybe - np.fft.fft2(self.audio_samples[:sample_offset]) + sample_offset = int(t * SAMPLERATE + self.input_audio_offset * SAMPLERATE) + if sample_offset < len(self.audio_samples): + window_samples = self.audio_samples[sample_offset:sample_offset+self.window_size] + if len(window_samples) < self.window_size: + # audio input file has likely ended + # TODO could round down to the next pow2 then do it anyway. not a critical case though IMO. + return (0, 0, 0) + # fade-in / fade-out window + window_samples = window_samples * np.hamming(len(window_samples)) + fft = np.fft.fft(window_samples) + # summing together the real and imaginary components, i think(??) + left, right = np.split(np.abs(fft), 2) + fft = np.add(left, right[::-1]) + + # pink noise adjust + fft = fft * self.power_normalization_coefficients + + freq_buckets = np.fft.fftfreq(self.window_size, 1 / SAMPLERATE) + # collect energy for each frequency band + # TODO: this could probably be done in a much nicer way with bandpass filters somehow... 
not sure on the correct arithmetic though + low_bucket = 0 + low_count = 0 + mid_bucket = 0 + mid_count = 0 + high_bucket = 0 + high_count = 0 + for i in range(len(fft)): + freq = self.fftx[i] + if freq < self.low_cutoff: + low_bucket += fft[i] + low_count += 1 + elif freq < self.mid_cutoff: + mid_bucket += fft[i] + mid_count += 1 + else: + high_bucket += fft[i] + high_count += 1 + # mean energy per bucket + low_bucket = low_bucket / low_count + mid_bucket = mid_bucket / mid_count + high_bucket = high_bucket / high_count + # normalize to [0,1] range + max_val = np.max(fft) + low_bucket = low_bucket / max_val + mid_bucket = mid_bucket / max_val + high_bucket = high_bucket / max_val + return (low_bucket, mid_bucket, high_bucket) else: return (0, 0, 0) \ No newline at end of file diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 3534ce0..d52a3f9 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -19,6 +19,7 @@ freeze_vram_usage, vram_usage_mode, ) +from pytti.AudioParse import SpectralAudioParser from pytti.Image.differentiable_image import DifferentiableImage from pytti.Image.PixelImage import PixelImage from pytti.Notebook import tqdm, make_hbox @@ -108,6 +109,11 @@ def __init__( self.optimizer = optimizer self.dataframe = [] + if params.input_audio: + self.audio_parser = SpectralAudioParser(params) + else: + self.audio_parser = None + # self.null_update = null_update self.params = params self.writer = writer @@ -365,4 +371,167 @@ def update(self, model, img, i, stage_i, *args, **kwargs): """ update hook called ever step """ +<<<<<<< HEAD pass +======= + # logger.debug("model.update called") + + # ... I have regrets. + params = self.params + writer = self.writer + OUTPATH = self.OUTPATH + base_name = self.base_name + fig = self.fig + axs = self.axs + video_frames = self.video_frames + optical_flows = self.optical_flows + stabilization_augs = self.stabilization_augs + last_frame_semantic = self.last_frame_semantic + semantic_init_prompt = self.semantic_init_prompt + init_augs = self.init_augs + + model = self + img = self.image_rep + embedder = self.embedder + + model.report_out( + i=i, + stage_i=stage_i, + # model=model, + writer=writer, + fig=fig, # default to None... + axs=axs, # default to None... 
+ clear_every=params.clear_every, + display_every=params.display_every, + approximate_vram_usage=params.approximate_vram_usage, + display_scale=params.display_scale, + show_graphs=params.show_graphs, + show_palette=params.show_palette, + ) + + model.save_out( + i=i, + # img=img, + writer=writer, + OUTPATH=OUTPATH, + base_name=base_name, + save_every=params.save_every, + file_namespace=params.file_namespace, + backups=params.backups, + ) + + # animate + ################ + ## TO DO: attach T as a class attribute + t = (i - params.pre_animation_steps) / ( + params.steps_per_frame * params.frames_per_second + ) + if self.audio_parser is None: + set_t(t, 0, 0, 0) + # set_t(t) # this won't need to be a thing with `t`` attached to the class + if i >= params.pre_animation_steps: + if self.audio_parser is not None: + lo, mid, hi = self.audio_parser.get_params(t) + logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") + set_t(t, lo, mid, hi) + # next_step_pil = None + if (i - params.pre_animation_steps) % params.steps_per_frame == 0: + logger.debug(f"Time: {t:.4f} seconds") + # update_rotoscopers( + ROTOSCOPERS.update_rotoscopers( + ((i - params.pre_animation_steps) // params.steps_per_frame + 1) + * params.frame_stride + ) + if params.reset_lr_each_frame: + model.set_optim(None) + + if params.animation_mode == "2D": + + next_step_pil = animate_2d( + translate_y=params.translate_y, + translate_x=params.translate_x, + rotate_2d=params.rotate_2d, + zoom_x_2d=params.zoom_x_2d, + zoom_y_2d=params.zoom_y_2d, + infill_mode=params.infill_mode, + sampling_mode=params.sampling_mode, + writer=writer, + i=i, + img=img, + t=t, # just here for logging + ) + + ########################### + elif params.animation_mode == "3D": + try: + im + except NameError: + im = img.decode_image() + with vram_usage_mode("Optical Flow Loss"): + # zoom_3d -> rename to animate_3d or transform_3d + flow, next_step_pil = zoom_3d( + img, + ( + params.translate_x, + params.translate_y, + params.translate_z_3d, + ), + params.rotate_3d, + params.field_of_view, + params.near_plane, + params.far_plane, + border_mode=params.infill_mode, + sampling_mode=params.sampling_mode, + stabilize=params.lock_camera, + ) + freeze_vram_usage() + + for optical_flow in optical_flows: + optical_flow.set_last_step(im) + optical_flow.set_target_flow(flow) + optical_flow.set_enabled(True) + + elif params.animation_mode == "Video Source": + + flow_im, next_step_pil = animate_video_source( + i=i, + img=img, + video_frames=video_frames, + optical_flows=optical_flows, + base_name=base_name, + pre_animation_steps=params.pre_animation_steps, + frame_stride=params.frame_stride, + steps_per_frame=params.steps_per_frame, + file_namespace=params.file_namespace, + reencode_each_frame=params.reencode_each_frame, + lock_palette=params.lock_palette, + save_every=params.save_every, + infill_mode=params.infill_mode, + sampling_mode=params.sampling_mode, + ) + + if params.animation_mode != "off": + try: + for aug in stabilization_augs: + aug.set_comp(next_step_pil) + aug.set_enabled(True) + if last_frame_semantic is not None: + last_frame_semantic.set_image(embedder, next_step_pil) + last_frame_semantic.set_enabled(True) + for aug in init_augs: + aug.set_enabled(False) + if semantic_init_prompt is not None: + semantic_init_prompt.set_enabled(False) + except UnboundLocalError: + logger.critical( + "\n\n-----< PYTTI-TOOLS > ------" + "If you are seeing this error, it might mean " + "you are using an option that expects you have " + 
"provided an init_image or video_file.\n\nIf you " + "think you are seeing this message in error, please " + "file an issue here: " + "https://github.com/pytti-tools/pytti-core/issues/new" + "-----< PYTTI-TOOLS > ------\n\n" + ) + raise +>>>>>>> 57553a3 (feat: initial rough audio parsing logic) diff --git a/src/pytti/eval_tools.py b/src/pytti/eval_tools.py index 85cfac1..a0741f8 100644 --- a/src/pytti/eval_tools.py +++ b/src/pytti/eval_tools.py @@ -5,6 +5,9 @@ math_env = None global_t = 0 +global_fLo = 0 +global_fMid = 0 +global_fHi = 0 eval_memo = {} @@ -27,6 +30,9 @@ def parametric_eval(string, **vals): ) math_env.update(vals) math_env["t"] = global_t + math_env["fLo"] = global_fLo + math_env["fMid"] = global_fMid + math_env["fHi"] = global_fHi try: output = eval(string, math_env) except SyntaxError as e: @@ -37,9 +43,12 @@ def parametric_eval(string, **vals): return string -def set_t(t): - global global_t, eval_memo +def set_t(t, fLo, fMid, fHi): + global global_t, global_fLo, global_fMid, global_fHi, eval_memo global_t = t + global_fLo = fLo + global_fMid = fMid + global_fHi = fHi eval_memo = {} From ee20b1ef70ede750c548da51f99f2cdf0a6d9377 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 25 Mar 2022 08:29:31 +0100 Subject: [PATCH 03/20] fix: some math edge cases --- src/pytti/AudioParse.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index d9d2ca3..2a33d0f 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -86,14 +86,20 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: high_bucket += fft[i] high_count += 1 # mean energy per bucket - low_bucket = low_bucket / low_count - mid_bucket = mid_bucket / mid_count - high_bucket = high_bucket / high_count + if low_count > 0 and mid_count > 0 and high_count > 0: + low_bucket = low_bucket / low_count + mid_bucket = mid_bucket / mid_count + high_bucket = high_bucket / high_count + else: + return (0,0,0) # normalize to [0,1] range max_val = np.max(fft) - low_bucket = low_bucket / max_val - mid_bucket = mid_bucket / max_val - high_bucket = high_bucket / max_val - return (low_bucket, mid_bucket, high_bucket) + if max_val > 0: + low_bucket = low_bucket / max_val + mid_bucket = mid_bucket / max_val + high_bucket = high_bucket / max_val + else: + return (0,0,0) + return (float(low_bucket), float(mid_bucket), float(high_bucket)) else: return (0, 0, 0) \ No newline at end of file From 8a1ff32bd00784ca8a6f15d67a7584272ccb3b6b Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 25 Mar 2022 17:24:12 +0100 Subject: [PATCH 04/20] fix: add debug log for audio 0 vectors --- src/pytti/AudioParse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index 2a33d0f..f79ebf3 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -54,6 +54,7 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: if len(window_samples) < self.window_size: # audio input file has likely ended # TODO could round down to the next pow2 then do it anyway. not a critical case though IMO. + logger.debug(f"Warning: sample offset is out of range at time offset {t+self.input_audio_offset}s. 
Returning 0 vector") return (0, 0, 0) # fade-in / fade-out window window_samples = window_samples * np.hamming(len(window_samples)) From 2c5c7c25dc14ad2870a07833cb2f39de75201ff1 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 25 Mar 2022 17:33:48 +0100 Subject: [PATCH 05/20] fix: add missing warmup config params --- src/pytti/assets/default.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pytti/assets/default.yaml b/src/pytti/assets/default.yaml index a4f9630..b1cc16a 100644 --- a/src/pytti/assets/default.yaml +++ b/src/pytti/assets/default.yaml @@ -81,6 +81,11 @@ far_plane: 10000 ###################### ### Induced Motion ### ###################### +input_audio: "" +input_audio_offset: 0 +input_audio_window_size: 8192 +input_audio_band_split_low_medium: 150 +input_audio_band_split_medium_high: 300 pre_animation_steps: 100 lock_camera: true From affdaaa44be4b008258860cb0a118b6e7929bae4 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 26 Mar 2022 13:17:28 +0100 Subject: [PATCH 06/20] fix: run fft analysis only when images are actually saved --- src/pytti/ImageGuide.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index d52a3f9..f506e10 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -430,13 +430,14 @@ def update(self, model, img, i, stage_i, *args, **kwargs): set_t(t, 0, 0, 0) # set_t(t) # this won't need to be a thing with `t`` attached to the class if i >= params.pre_animation_steps: - if self.audio_parser is not None: - lo, mid, hi = self.audio_parser.get_params(t) - logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") - set_t(t, lo, mid, hi) # next_step_pil = None if (i - params.pre_animation_steps) % params.steps_per_frame == 0: - logger.debug(f"Time: {t:.4f} seconds") + if self.audio_parser is not None: + lo, mid, hi = self.audio_parser.get_params(t) + logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") + set_t(t, lo, mid, hi) + else: + logger.debug(f"Time: {t:.4f} seconds") # update_rotoscopers( ROTOSCOPERS.update_rotoscopers( ((i - params.pre_animation_steps) // params.steps_per_frame + 1) From 3eea43122dae2f98d60d494e5b36cdc3ae883a9e Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 26 Mar 2022 14:04:06 +0100 Subject: [PATCH 07/20] fix: try adding some rudimentary error handling --- src/pytti/AudioParse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index f79ebf3..96b19f0 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -30,7 +30,8 @@ def __init__( self.audio_samples = np.append(self.audio_samples, np.frombuffer(buf, dtype=np.int16)) if len(buf) < SAMPLERATE: break - + if len(self.audio_samples) < 0: + raise RuntimeError("Audio samples are empty, assuming load failed") logger.debug(f"initialized audio file {params.input_audio}") self.input_audio_offset = params.input_audio_offset self.window_size = params.input_audio_window_size From 97aaa06748ea2d3d83bc02ae4124960b8bd5b85e Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 26 Mar 2022 14:05:13 +0100 Subject: [PATCH 08/20] fix: add sample count to log statement as well --- src/pytti/AudioParse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index 96b19f0..c123c16 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -32,7 +32,7 @@ 
def __init__( break if len(self.audio_samples) < 0: raise RuntimeError("Audio samples are empty, assuming load failed") - logger.debug(f"initialized audio file {params.input_audio}") + logger.debug(f"initialized audio file {params.input_audio}, samples read: {len(self.audio_samples)}") self.input_audio_offset = params.input_audio_offset self.window_size = params.input_audio_window_size self.low_cutoff = params.input_audio_band_split_low_medium From d0edc43f9b2ade1aae06775b5d75493174ee4607 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 26 Mar 2022 14:19:19 +0100 Subject: [PATCH 09/20] fix: more warning logs for debugging --- src/pytti/AudioParse.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index c123c16..bf48869 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -50,6 +50,7 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: """ # Get the point in time (sample-offset) in the track in seconds based on sample-rate sample_offset = int(t * SAMPLERATE + self.input_audio_offset * SAMPLERATE) + logger.debug(f"Analyzing audio at {self.input_audio_offset+t}s") if sample_offset < len(self.audio_samples): window_samples = self.audio_samples[sample_offset:sample_offset+self.window_size] if len(window_samples) < self.window_size: @@ -93,6 +94,7 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: mid_bucket = mid_bucket / mid_count high_bucket = high_bucket / high_count else: + logger.debug(f"Warning: There were empty buckets in the audio frequency analysis. Returning 0 vector") return (0,0,0) # normalize to [0,1] range max_val = np.max(fft) @@ -101,7 +103,10 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: mid_bucket = mid_bucket / max_val high_bucket = high_bucket / max_val else: + logger.debug(f"Warning: Max val was 0 in the audio frequency analysis. Returning 0 vector") return (0,0,0) return (float(low_bucket), float(mid_bucket), float(high_bucket)) else: + + logger.debug(f"Warning: Audio input has ended. 
Returning 0 vector") return (0, 0, 0) \ No newline at end of file From 5bc02f4b07ef51e04be17523296d54e0633086a4 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sat, 2 Apr 2022 01:57:25 +0200 Subject: [PATCH 10/20] feat: start implementing bandpass filters instead of fft+ frequency split --- src/pytti/AudioParse.py | 184 +++++++++++++++----------- src/pytti/ImageGuide.py | 2 +- src/pytti/assets/default.yaml | 7 +- src/pytti/config/structured_config.py | 10 +- 4 files changed, 120 insertions(+), 83 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index bf48869..68319ac 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -2,6 +2,7 @@ import typing import subprocess from loguru import logger +from scipy.signal import butter, sosfilt, sosfreqz SAMPLERATE=44100 @@ -12,34 +13,35 @@ class SpectralAudioParser: """ def __init__( self, - params=None + input_audio, + offset, + window_size, + filters ): - if params.input_audio: - pipe = subprocess.Popen(['ffmpeg', '-i', params.input_audio, - '-f', 's16le', - '-acodec', 'pcm_s16le', - '-ar', str(SAMPLERATE), - '-ac', '1', - '-'], stdout=subprocess.PIPE, bufsize=10**8) + pipe = subprocess.Popen(['ffmpeg', '-i', input_audio, + '-f', 's16le', + '-acodec', 'pcm_s16le', + '-ar', str(SAMPLERATE), + '-ac', '1', + '-'], stdout=subprocess.PIPE, bufsize=10**8) - self.audio_samples = np.array([], dtype=np.int16) - - # read the audio file from the pipe in 0.5s blocks (2 bytes per sample) - while True: - buf = pipe.stdout.read(SAMPLERATE) - self.audio_samples = np.append(self.audio_samples, np.frombuffer(buf, dtype=np.int16)) - if len(buf) < SAMPLERATE: - break - if len(self.audio_samples) < 0: - raise RuntimeError("Audio samples are empty, assuming load failed") - logger.debug(f"initialized audio file {params.input_audio}, samples read: {len(self.audio_samples)}") - self.input_audio_offset = params.input_audio_offset - self.window_size = params.input_audio_window_size - self.low_cutoff = params.input_audio_band_split_low_medium - self.mid_cutoff = params.input_audio_band_split_medium_high - # pink noise normalization blatantly stolen from https://github.com/aiXander/Realtime_PyAudio_FFT/blob/275c8b1fc268ac946470b0d7a80de56eb2212b58/src/stream_analyzer.py#L107 - self.fftx = np.arange(int(self.window_size/2), dtype=float) * SAMPLERATE / self.window_size - self.power_normalization_coefficients = np.logspace(np.log2(1), np.log2(np.log2(SAMPLERATE/2)), len(self.fftx), endpoint=True, base=2, dtype=None) + self.audio_samples = np.array([], dtype=np.int16) + + # read the audio file from the pipe in 0.5s blocks (2 bytes per sample) + while True: + buf = pipe.stdout.read(SAMPLERATE) + self.audio_samples = np.append(self.audio_samples, np.frombuffer(buf, dtype=np.int16)) + if len(buf) < SAMPLERATE: + break + if len(self.audio_samples) < 0: + raise RuntimeError("Audio samples are empty, assuming load failed") + logger.debug(f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}") + self.offset = offset + self.window_size = window_size + self.filters = filters + # pink noise normalization blatantly stolen from https://github.com/aiXander/Realtime_PyAudio_FFT/blob/275c8b1fc268ac946470b0d7a80de56eb2212b58/src/stream_analyzer.py#L107 + self.fftx = np.arange(int(self.window_size/2), dtype=float) * SAMPLERATE / self.window_size + self.power_normalization_coefficients = np.logspace(np.log2(1), np.log2(np.log2(SAMPLERATE/2)), len(self.fftx), endpoint=True, base=2, dtype=None) @@ -49,64 +51,90 @@ def 
get_params(self, t) -> typing.Tuple[float, float, float]: Amplitude/energy parameters are normalized into the [0,1] range. """ # Get the point in time (sample-offset) in the track in seconds based on sample-rate - sample_offset = int(t * SAMPLERATE + self.input_audio_offset * SAMPLERATE) - logger.debug(f"Analyzing audio at {self.input_audio_offset+t}s") + sample_offset = int(t * SAMPLERATE + self.offset * SAMPLERATE) + logger.debug(f"Analyzing audio at {self.offset+t}s") if sample_offset < len(self.audio_samples): window_samples = self.audio_samples[sample_offset:sample_offset+self.window_size] if len(window_samples) < self.window_size: # audio input file has likely ended - # TODO could round down to the next pow2 then do it anyway. not a critical case though IMO. - logger.debug(f"Warning: sample offset is out of range at time offset {t+self.input_audio_offset}s. Returning 0 vector") + # TODO could round down to the next lower pow2 then do it anyway. not a critical case though IMO. + logger.debug(f"Warning: sample offset is out of range at time offset {t+self.offset}s. Returning 0 vector") return (0, 0, 0) - # fade-in / fade-out window + + # fade-in / fade-out window to taper off the signal window_samples = window_samples * np.hamming(len(window_samples)) - fft = np.fft.fft(window_samples) - # summing together the real and imaginary components, i think(??) - left, right = np.split(np.abs(fft), 2) - fft = np.add(left, right[::-1]) + return bp_tuple(t, window_samples, self.filters) + #return fft_tuple(t) + else: + logger.debug(f"Warning: Audio input has ended. Returning 0 vector") + return (0, 0, 0) - # pink noise adjust - fft = fft * self.power_normalization_coefficients +def butter_bandpass(lowcut, highcut, fs, order=5): + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + sos = butter(order, [low, high], analog=False, btype='band', output='sos') + return sos - freq_buckets = np.fft.fftfreq(self.window_size, 1 / SAMPLERATE) - # collect energy for each frequency band - # TODO: this could probably be done in a much nicer way with bandpass filters somehow... not sure on the correct arithmetic though - low_bucket = 0 - low_count = 0 - mid_bucket = 0 - mid_count = 0 - high_bucket = 0 - high_count = 0 - for i in range(len(fft)): - freq = self.fftx[i] - if freq < self.low_cutoff: - low_bucket += fft[i] - low_count += 1 - elif freq < self.mid_cutoff: - mid_bucket += fft[i] - mid_count += 1 - else: - high_bucket += fft[i] - high_count += 1 - # mean energy per bucket - if low_count > 0 and mid_count > 0 and high_count > 0: - low_bucket = low_bucket / low_count - mid_bucket = mid_bucket / mid_count - high_bucket = high_bucket / high_count - else: - logger.debug(f"Warning: There were empty buckets in the audio frequency analysis. Returning 0 vector") - return (0,0,0) - # normalize to [0,1] range - max_val = np.max(fft) - if max_val > 0: - low_bucket = low_bucket / max_val - mid_bucket = mid_bucket / max_val - high_bucket = high_bucket / max_val - else: - logger.debug(f"Warning: Max val was 0 in the audio frequency analysis. Returning 0 vector") - return (0,0,0) - return (float(low_bucket), float(mid_bucket), float(high_bucket)) - else: +def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): + sos = butter_bandpass(lowcut, highcut, fs, order=order) + y = sosfilt(sos, data) + return y - logger.debug(f"Warning: Audio input has ended. 
Returning 0 vector") - return (0, 0, 0) \ No newline at end of file +def bp_tuple(t, window_samples, filters) -> typing.Dict[str, float]: + for filter in filters: + offset = filter.f_width/2 + lower = filter.f_center - offset + upper = filter.f_center + offset + filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=filter.order) + # Normalize from signed 16-bit max value to 0..1 range + val = np.max(np.abs(filtered)) / 32768 + return (val, 0, 0) + +def fft_tuple(t, window_samples) -> typing.Tuple[float, float, float]: + fft = np.fft.fft(window_samples) + # summing together the real and imaginary components, i think(??) + left, right = np.split(np.abs(fft), 2) + fft = np.add(left, right[::-1]) + + # pink noise adjust + fft = fft * self.power_normalization_coefficients + + freq_buckets = np.fft.fftfreq(self.window_size, 1 / SAMPLERATE) + # collect energy for each frequency band + # TODO: this could probably be done in a much nicer way with bandpass filters somehow... not sure on the correct arithmetic though + low_bucket = 0 + low_count = 0 + mid_bucket = 0 + mid_count = 0 + high_bucket = 0 + high_count = 0 + for i in range(len(fft)): + freq = self.fftx[i] + if freq < self.low_cutoff: + low_bucket += fft[i] + low_count += 1 + elif freq < self.mid_cutoff: + mid_bucket += fft[i] + mid_count += 1 + else: + high_bucket += fft[i] + high_count += 1 + # mean energy per bucket + if low_count > 0 and mid_count > 0 and high_count > 0: + low_bucket = low_bucket / low_count + mid_bucket = mid_bucket / mid_count + high_bucket = high_bucket / high_count + else: + logger.debug(f"Warning: There were empty buckets in the audio frequency analysis. Returning 0 vector") + return (0,0,0) + # normalize to [0,1] range + max_val = np.max(fft) + if max_val > 0: + low_bucket = low_bucket / max_val + mid_bucket = mid_bucket / max_val + high_bucket = high_bucket / max_val + else: + logger.debug(f"Warning: Max val was 0 in the audio frequency analysis. Returning 0 vector") + return (0,0,0) + return (float(low_bucket), float(mid_bucket), float(high_bucket)) \ No newline at end of file diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index f506e10..796fba3 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -110,7 +110,7 @@ def __init__( self.dataframe = [] if params.input_audio: - self.audio_parser = SpectralAudioParser(params) + self.audio_parser = SpectralAudioParser(params.input_audio, params.offset, params.window_size, params.filters) else: self.audio_parser = None diff --git a/src/pytti/assets/default.yaml b/src/pytti/assets/default.yaml index b1cc16a..f2a801e 100644 --- a/src/pytti/assets/default.yaml +++ b/src/pytti/assets/default.yaml @@ -84,8 +84,11 @@ far_plane: 10000 input_audio: "" input_audio_offset: 0 input_audio_window_size: 8192 -input_audio_band_split_low_medium: 150 -input_audio_band_split_medium_high: 300 +input_audio_filters: [] +# - variable_name: fLo +# f_center: 60 +# f_width: 30 +# order: 5 pre_animation_steps: 100 lock_camera: true diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index f77a5eb..2e0c8f2 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -16,6 +16,13 @@ def check_input_against_list(attribute, value, valid_values): ) +@define(auto_attribs=True) +class AudioFilterConfig: + variable_name: str = "???" + f_center: int = "???" + f_width: int = "???" 
+ order: int = 5 + @define(auto_attribs=True) class ConfigSchema: ############# @@ -103,8 +110,7 @@ def check(self, attribute, value): input_audio: str = "" input_audio_offset: float = 0 input_audio_window_size: int = 1024 - input_audio_band_split_low_medium: int = 500 - input_audio_band_split_medium_high: int = 3500 + input_audio_filters: AudioFilterConfig = None # _2d and _3d only apply to those animation modes From c2323866a89ab619121fd1be110665c172d8f3be Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sun, 3 Apr 2022 13:48:10 +0200 Subject: [PATCH 11/20] feat: completely refactor fft / window_size /band-splitting based impl with band-pass filters, window size based on FPS --- src/pytti/AudioParse.py | 121 ++++++++++++++-------------------- src/pytti/ImageGuide.py | 6 +- src/pytti/assets/default.yaml | 1 - 3 files changed, 54 insertions(+), 74 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index 68319ac..7fc623a 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -8,16 +8,19 @@ class SpectralAudioParser: """ - Audio Parser reads a given input file, scans along it and parses its spectrum using FFT. - The FFT output is split into three bands (low,mid,high), the (average) amplitude of which is then returned for use in animation functions. + reads a given input file, scans along it and parses the amplitude in selected bands using butterworth bandpass filters. + the amplitude is normalized into the 0..1 range for easier use in transformation functions. """ def __init__( self, input_audio, offset, - window_size, + frames_per_second, filters ): + if len(filters) < 1: + raise RuntimeError("When using input_audio, at least 1 filter must be specified") + pipe = subprocess.Popen(['ffmpeg', '-i', input_audio, '-f', 's16le', '-acodec', 'pcm_s16le', @@ -35,17 +38,34 @@ def __init__( break if len(self.audio_samples) < 0: raise RuntimeError("Audio samples are empty, assuming load failed") - logger.debug(f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}") + self.duration = len(self.audio_samples) / SAMPLERATE + logger.debug(f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}, total duration: {self.duration}s") self.offset = offset - self.window_size = window_size + if offset > self.duration: + raise RuntimeError(f"Audio offset set at {offset}s but input audio is only {duration}s long") + # analyze all samples for the current frame + self.window_size = int(1/frames_per_second * SAMPLERATE) self.filters = filters - # pink noise normalization blatantly stolen from https://github.com/aiXander/Realtime_PyAudio_FFT/blob/275c8b1fc268ac946470b0d7a80de56eb2212b58/src/stream_analyzer.py#L107 - self.fftx = np.arange(int(self.window_size/2), dtype=float) * SAMPLERATE / self.window_size - self.power_normalization_coefficients = np.logspace(np.log2(1), np.log2(np.log2(SAMPLERATE/2)), len(self.fftx), endpoint=True, base=2, dtype=None) - + # parse band maxima first for normalizing the filtered signal to 0..1 at arbitrary points in the file later + # this initialization is a bit compute intensive, especially for higher fps numbers, but i couldn't find a cleaner way + # (band-passing the entire track instead of windows creates maxima that are way off, some filtering anomaly i don't understand...) 
+ steps = int((self.duration - self.offset) * frames_per_second) + interval = 1/frames_per_second + maxima = {} + time_steps = np.linspace(0, steps, num=steps) * interval + for t in time_steps: + sample_offset = int(t * SAMPLERATE) + cur_maxima = bp_filtered(self.audio_samples[sample_offset:sample_offset+self.window_size], filters) + for key in cur_maxima: + if key in maxima: + maxima[key] = max(maxima[key], cur_maxima[key]) + else: + maxima[key] = cur_maxima[key] + self.band_maxima = maxima + logger.debug(f"initialized band maxima for {len(filters)} filters: {self.band_maxima}") - def get_params(self, t) -> typing.Tuple[float, float, float]: + def get_params(self, t) -> typing.Dict[str, float]: """ Return the amplitude parameters at the given point in time t within the audio track, or 0 if the track has ended. Amplitude/energy parameters are normalized into the [0,1] range. @@ -58,22 +78,23 @@ def get_params(self, t) -> typing.Tuple[float, float, float]: if len(window_samples) < self.window_size: # audio input file has likely ended # TODO could round down to the next lower pow2 then do it anyway. not a critical case though IMO. - logger.debug(f"Warning: sample offset is out of range at time offset {t+self.offset}s. Returning 0 vector") - return (0, 0, 0) - + logger.debug(f"Warning: sample offset is out of range at time offset {t+self.offset}s. Returning null result") + return {} # fade-in / fade-out window to taper off the signal - window_samples = window_samples * np.hamming(len(window_samples)) - return bp_tuple(t, window_samples, self.filters) - #return fft_tuple(t) + #window_samples = window_samples * np.hamming(len(window_samples)) + return bp_filtered_norm(window_samples, self.filters, self.band_maxima) else: - logger.debug(f"Warning: Audio input has ended. Returning 0 vector") - return (0, 0, 0) + logger.debug(f"Warning: Audio input has ended. Returning null result") + return {} + + def get_duration(self): + return self.duration def butter_bandpass(lowcut, highcut, fs, order=5): nyq = 0.5 * fs low = lowcut / nyq high = highcut / nyq - sos = butter(order, [low, high], analog=False, btype='band', output='sos') + sos = butter(order, [low, high], analog=False, btype='bandpass', output='sos') return sos def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): @@ -81,60 +102,20 @@ def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): y = sosfilt(sos, data) return y -def bp_tuple(t, window_samples, filters) -> typing.Dict[str, float]: + +def bp_filtered(window_samples, filters) -> typing.Dict[str, float]: + results = {} for filter in filters: offset = filter.f_width/2 lower = filter.f_center - offset upper = filter.f_center + offset filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=filter.order) - # Normalize from signed 16-bit max value to 0..1 range - val = np.max(np.abs(filtered)) / 32768 - return (val, 0, 0) + results[filter.variable_name] = np.max(np.abs(filtered)) + return results -def fft_tuple(t, window_samples) -> typing.Tuple[float, float, float]: - fft = np.fft.fft(window_samples) - # summing together the real and imaginary components, i think(??) - left, right = np.split(np.abs(fft), 2) - fft = np.add(left, right[::-1]) - - # pink noise adjust - fft = fft * self.power_normalization_coefficients - - freq_buckets = np.fft.fftfreq(self.window_size, 1 / SAMPLERATE) - # collect energy for each frequency band - # TODO: this could probably be done in a much nicer way with bandpass filters somehow... 
not sure on the correct arithmetic though - low_bucket = 0 - low_count = 0 - mid_bucket = 0 - mid_count = 0 - high_bucket = 0 - high_count = 0 - for i in range(len(fft)): - freq = self.fftx[i] - if freq < self.low_cutoff: - low_bucket += fft[i] - low_count += 1 - elif freq < self.mid_cutoff: - mid_bucket += fft[i] - mid_count += 1 - else: - high_bucket += fft[i] - high_count += 1 - # mean energy per bucket - if low_count > 0 and mid_count > 0 and high_count > 0: - low_bucket = low_bucket / low_count - mid_bucket = mid_bucket / mid_count - high_bucket = high_bucket / high_count - else: - logger.debug(f"Warning: There were empty buckets in the audio frequency analysis. Returning 0 vector") - return (0,0,0) - # normalize to [0,1] range - max_val = np.max(fft) - if max_val > 0: - low_bucket = low_bucket / max_val - mid_bucket = mid_bucket / max_val - high_bucket = high_bucket / max_val - else: - logger.debug(f"Warning: Max val was 0 in the audio frequency analysis. Returning 0 vector") - return (0,0,0) - return (float(low_bucket), float(mid_bucket), float(high_bucket)) \ No newline at end of file +def bp_filtered_norm(window_samples, filters, norm_factors) -> typing.Dict[str, float]: + results = bp_filtered(window_samples, filters) + for key in results: + # normalize + results[key] = results[key] / norm_factors[key] + return results \ No newline at end of file diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 796fba3..352a47a 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -110,7 +110,7 @@ def __init__( self.dataframe = [] if params.input_audio: - self.audio_parser = SpectralAudioParser(params.input_audio, params.offset, params.window_size, params.filters) + self.audio_parser = SpectralAudioParser(params.input_audio, params.offset, params.frames_per_second, params.filters) else: self.audio_parser = None @@ -433,9 +433,9 @@ def update(self, model, img, i, stage_i, *args, **kwargs): # next_step_pil = None if (i - params.pre_animation_steps) % params.steps_per_frame == 0: if self.audio_parser is not None: - lo, mid, hi = self.audio_parser.get_params(t) + band_dict = self.audio_parser.get_params(t) logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") - set_t(t, lo, mid, hi) + set_t(t, band_dict) else: logger.debug(f"Time: {t:.4f} seconds") # update_rotoscopers( diff --git a/src/pytti/assets/default.yaml b/src/pytti/assets/default.yaml index f2a801e..4556bb5 100644 --- a/src/pytti/assets/default.yaml +++ b/src/pytti/assets/default.yaml @@ -83,7 +83,6 @@ far_plane: 10000 ###################### input_audio: "" input_audio_offset: 0 -input_audio_window_size: 8192 input_audio_filters: [] # - variable_name: fLo # f_center: 60 From 21e5b3fe589f6e3811f058f96185abdb6d3ae087 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Sun, 3 Apr 2022 16:46:56 +0200 Subject: [PATCH 12/20] refactor: remove window size config param, fix eval tooling for dict math env --- src/pytti/config/structured_config.py | 1 - src/pytti/eval_tools.py | 18 +++++++----------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 2e0c8f2..0290860 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -109,7 +109,6 @@ def check(self, attribute, value): input_audio: str = "" input_audio_offset: float = 0 - input_audio_window_size: int = 1024 input_audio_filters: AudioFilterConfig = None # _2d and _3d only apply to those 
animation modes diff --git a/src/pytti/eval_tools.py b/src/pytti/eval_tools.py index a0741f8..64b936e 100644 --- a/src/pytti/eval_tools.py +++ b/src/pytti/eval_tools.py @@ -5,9 +5,7 @@ math_env = None global_t = 0 -global_fLo = 0 -global_fMid = 0 -global_fHi = 0 +global_bands = {} eval_memo = {} @@ -30,9 +28,9 @@ def parametric_eval(string, **vals): ) math_env.update(vals) math_env["t"] = global_t - math_env["fLo"] = global_fLo - math_env["fMid"] = global_fMid - math_env["fHi"] = global_fHi + # TODO set envs from global bandpass dict values + for band in global_bands: + math_env[band] = global_bands[band] try: output = eval(string, math_env) except SyntaxError as e: @@ -43,12 +41,10 @@ def parametric_eval(string, **vals): return string -def set_t(t, fLo, fMid, fHi): - global global_t, global_fLo, global_fMid, global_fHi, eval_memo +def set_t(t, band_dict): + global global_t, global_bands, eval_memo global_t = t - global_fLo = fLo - global_fMid = fMid - global_fHi = fHi + global_bands = band_dict eval_memo = {} From 139eaaafb5606cecd6d9effcc8549e505caea4b0 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 8 Apr 2022 23:35:20 +0200 Subject: [PATCH 13/20] style: cleanup, remove useless todos, formatting --- src/pytti/AudioParse.py | 70 ++++++++++++++++++++++------------------- src/pytti/eval_tools.py | 11 +++++-- 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py index 7fc623a..fe94e58 100644 --- a/src/pytti/AudioParse.py +++ b/src/pytti/AudioParse.py @@ -4,32 +4,34 @@ from loguru import logger from scipy.signal import butter, sosfilt, sosfreqz -SAMPLERATE=44100 +SAMPLERATE = 44100 + class SpectralAudioParser: """ reads a given input file, scans along it and parses the amplitude in selected bands using butterworth bandpass filters. the amplitude is normalized into the 0..1 range for easier use in transformation functions. 
""" + def __init__( - self, - input_audio, - offset, - frames_per_second, - filters - ): + self, + input_audio, + offset, + frames_per_second, + filters + ): if len(filters) < 1: raise RuntimeError("When using input_audio, at least 1 filter must be specified") pipe = subprocess.Popen(['ffmpeg', '-i', input_audio, - '-f', 's16le', - '-acodec', 'pcm_s16le', - '-ar', str(SAMPLERATE), - '-ac', '1', - '-'], stdout=subprocess.PIPE, bufsize=10**8) + '-f', 's16le', + '-acodec', 'pcm_s16le', + '-ar', str(SAMPLERATE), + '-ac', '1', + '-'], stdout=subprocess.PIPE, bufsize=10 ** 8) self.audio_samples = np.array([], dtype=np.int16) - + # read the audio file from the pipe in 0.5s blocks (2 bytes per sample) while True: buf = pipe.stdout.read(SAMPLERATE) @@ -39,24 +41,25 @@ def __init__( if len(self.audio_samples) < 0: raise RuntimeError("Audio samples are empty, assuming load failed") self.duration = len(self.audio_samples) / SAMPLERATE - logger.debug(f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}, total duration: {self.duration}s") + logger.debug( + f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}, total duration: {self.duration}s") self.offset = offset if offset > self.duration: raise RuntimeError(f"Audio offset set at {offset}s but input audio is only {duration}s long") # analyze all samples for the current frame - self.window_size = int(1/frames_per_second * SAMPLERATE) + self.window_size = int(1 / frames_per_second * SAMPLERATE) self.filters = filters # parse band maxima first for normalizing the filtered signal to 0..1 at arbitrary points in the file later # this initialization is a bit compute intensive, especially for higher fps numbers, but i couldn't find a cleaner way # (band-passing the entire track instead of windows creates maxima that are way off, some filtering anomaly i don't understand...) steps = int((self.duration - self.offset) * frames_per_second) - interval = 1/frames_per_second + interval = 1 / frames_per_second maxima = {} time_steps = np.linspace(0, steps, num=steps) * interval for t in time_steps: sample_offset = int(t * SAMPLERATE) - cur_maxima = bp_filtered(self.audio_samples[sample_offset:sample_offset+self.window_size], filters) + cur_maxima = bp_filtered(self.audio_samples[sample_offset:sample_offset + self.window_size], filters) for key in cur_maxima: if key in maxima: maxima[key] = max(maxima[key], cur_maxima[key]) @@ -72,16 +75,14 @@ def get_params(self, t) -> typing.Dict[str, float]: """ # Get the point in time (sample-offset) in the track in seconds based on sample-rate sample_offset = int(t * SAMPLERATE + self.offset * SAMPLERATE) - logger.debug(f"Analyzing audio at {self.offset+t}s") + logger.debug(f"Analyzing audio at {self.offset + t}s") if sample_offset < len(self.audio_samples): - window_samples = self.audio_samples[sample_offset:sample_offset+self.window_size] + window_samples = self.audio_samples[sample_offset:sample_offset + self.window_size] if len(window_samples) < self.window_size: # audio input file has likely ended - # TODO could round down to the next lower pow2 then do it anyway. not a critical case though IMO. - logger.debug(f"Warning: sample offset is out of range at time offset {t+self.offset}s. Returning null result") + logger.debug( + f"Warning: sample offset is out of range at time offset {t + self.offset}s. 
Returning null result") return {} - # fade-in / fade-out window to taper off the signal - #window_samples = window_samples * np.hamming(len(window_samples)) return bp_filtered_norm(window_samples, self.filters, self.band_maxima) else: logger.debug(f"Warning: Audio input has ended. Returning null result") @@ -90,32 +91,35 @@ def get_params(self, t) -> typing.Dict[str, float]: def get_duration(self): return self.duration + def butter_bandpass(lowcut, highcut, fs, order=5): - nyq = 0.5 * fs - low = lowcut / nyq - high = highcut / nyq - sos = butter(order, [low, high], analog=False, btype='bandpass', output='sos') - return sos + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + sos = butter(order, [low, high], analog=False, btype='bandpass', output='sos') + return sos + def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): - sos = butter_bandpass(lowcut, highcut, fs, order=order) - y = sosfilt(sos, data) - return y + sos = butter_bandpass(lowcut, highcut, fs, order=order) + y = sosfilt(sos, data) + return y def bp_filtered(window_samples, filters) -> typing.Dict[str, float]: results = {} for filter in filters: - offset = filter.f_width/2 + offset = filter.f_width / 2 lower = filter.f_center - offset upper = filter.f_center + offset filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=filter.order) results[filter.variable_name] = np.max(np.abs(filtered)) return results + def bp_filtered_norm(window_samples, filters, norm_factors) -> typing.Dict[str, float]: results = bp_filtered(window_samples, filters) for key in results: # normalize results[key] = results[key] / norm_factors[key] - return results \ No newline at end of file + return results diff --git a/src/pytti/eval_tools.py b/src/pytti/eval_tools.py index 64b936e..e637d06 100644 --- a/src/pytti/eval_tools.py +++ b/src/pytti/eval_tools.py @@ -6,6 +6,7 @@ math_env = None global_t = 0 global_bands = {} +global_bands_prev = {} eval_memo = {} @@ -28,9 +29,11 @@ def parametric_eval(string, **vals): ) math_env.update(vals) math_env["t"] = global_t - # TODO set envs from global bandpass dict values for band in global_bands: math_env[band] = global_bands[band] + if global_bands_prev: + for band in global_bands_prev: + math_env[f'{band}_prev'] = global_bands_prev[band] try: output = eval(string, math_env) except SyntaxError as e: @@ -42,8 +45,12 @@ def parametric_eval(string, **vals): def set_t(t, band_dict): - global global_t, global_bands, eval_memo + global global_t, global_bands, global_bands_prev, eval_memo global_t = t + if global_bands: + global_bands_prev = global_bands + else: + global_bands_prev = band_dict global_bands = band_dict eval_memo = {} From 52c58b172547d6ae6d364fddc568de5d8bf42f11 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Fri, 8 Apr 2022 23:43:15 +0200 Subject: [PATCH 14/20] fix: use correct parameter naming, properly print bands --- src/pytti/ImageGuide.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 352a47a..88d086d 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -109,8 +109,9 @@ def __init__( self.optimizer = optimizer self.dataframe = [] - if params.input_audio: - self.audio_parser = SpectralAudioParser(params.input_audio, params.offset, params.frames_per_second, params.filters) + if params.input_audio and params.input_audio_filters: + self.audio_parser = SpectralAudioParser(params.input_audio, params.input_audio_offset, + params.frames_per_second, 
params.input_audio_filters) else: self.audio_parser = None @@ -434,7 +435,7 @@ def update(self, model, img, i, stage_i, *args, **kwargs): if (i - params.pre_animation_steps) % params.steps_per_frame == 0: if self.audio_parser is not None: band_dict = self.audio_parser.get_params(t) - logger.debug(f"Time: {t:.4f} seconds, audio params: lo: {lo:.4f}, mid: {mid:.4f}, hi: {hi:.4f}") + logger.debug(f"Time: {t:.4f} seconds, audio params: {band_dict}") set_t(t, band_dict) else: logger.debug(f"Time: {t:.4f} seconds") From 1688ab4d6950356fc816fc1bf915a1dfb6f66330 Mon Sep 17 00:00:00 2001 From: Simon Baier Date: Mon, 18 Apr 2022 19:59:39 +0200 Subject: [PATCH 15/20] fix: param must be dict instead of tuple now --- src/pytti/ImageGuide.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 88d086d..25dfcbe 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -428,7 +428,7 @@ def update(self, model, img, i, stage_i, *args, **kwargs): params.steps_per_frame * params.frames_per_second ) if self.audio_parser is None: - set_t(t, 0, 0, 0) + set_t(t, {}) # set_t(t) # this won't need to be a thing with `t`` attached to the class if i >= params.pre_animation_steps: # next_step_pil = None From 548edba1bf318cdf3e03c245b9bb0a3954850bec Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:03:17 -0700 Subject: [PATCH 16/20] fixed dangling merge conflict --- src/pytti/ImageGuide.py | 172 ++------------------------------------- src/pytti/update_func.py | 20 ++++- 2 files changed, 25 insertions(+), 167 deletions(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 25dfcbe..2266223 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -110,8 +110,12 @@ def __init__( self.dataframe = [] if params.input_audio and params.input_audio_filters: - self.audio_parser = SpectralAudioParser(params.input_audio, params.input_audio_offset, - params.frames_per_second, params.input_audio_filters) + self.audio_parser = SpectralAudioParser( + params.input_audio, + params.input_audio_offset, + params.frames_per_second, + params.input_audio_filters, + ) else: self.audio_parser = None @@ -372,168 +376,4 @@ def update(self, model, img, i, stage_i, *args, **kwargs): """ update hook called ever step """ -<<<<<<< HEAD pass -======= - # logger.debug("model.update called") - - # ... I have regrets. - params = self.params - writer = self.writer - OUTPATH = self.OUTPATH - base_name = self.base_name - fig = self.fig - axs = self.axs - video_frames = self.video_frames - optical_flows = self.optical_flows - stabilization_augs = self.stabilization_augs - last_frame_semantic = self.last_frame_semantic - semantic_init_prompt = self.semantic_init_prompt - init_augs = self.init_augs - - model = self - img = self.image_rep - embedder = self.embedder - - model.report_out( - i=i, - stage_i=stage_i, - # model=model, - writer=writer, - fig=fig, # default to None... - axs=axs, # default to None... 
- clear_every=params.clear_every, - display_every=params.display_every, - approximate_vram_usage=params.approximate_vram_usage, - display_scale=params.display_scale, - show_graphs=params.show_graphs, - show_palette=params.show_palette, - ) - - model.save_out( - i=i, - # img=img, - writer=writer, - OUTPATH=OUTPATH, - base_name=base_name, - save_every=params.save_every, - file_namespace=params.file_namespace, - backups=params.backups, - ) - - # animate - ################ - ## TO DO: attach T as a class attribute - t = (i - params.pre_animation_steps) / ( - params.steps_per_frame * params.frames_per_second - ) - if self.audio_parser is None: - set_t(t, {}) - # set_t(t) # this won't need to be a thing with `t`` attached to the class - if i >= params.pre_animation_steps: - # next_step_pil = None - if (i - params.pre_animation_steps) % params.steps_per_frame == 0: - if self.audio_parser is not None: - band_dict = self.audio_parser.get_params(t) - logger.debug(f"Time: {t:.4f} seconds, audio params: {band_dict}") - set_t(t, band_dict) - else: - logger.debug(f"Time: {t:.4f} seconds") - # update_rotoscopers( - ROTOSCOPERS.update_rotoscopers( - ((i - params.pre_animation_steps) // params.steps_per_frame + 1) - * params.frame_stride - ) - if params.reset_lr_each_frame: - model.set_optim(None) - - if params.animation_mode == "2D": - - next_step_pil = animate_2d( - translate_y=params.translate_y, - translate_x=params.translate_x, - rotate_2d=params.rotate_2d, - zoom_x_2d=params.zoom_x_2d, - zoom_y_2d=params.zoom_y_2d, - infill_mode=params.infill_mode, - sampling_mode=params.sampling_mode, - writer=writer, - i=i, - img=img, - t=t, # just here for logging - ) - - ########################### - elif params.animation_mode == "3D": - try: - im - except NameError: - im = img.decode_image() - with vram_usage_mode("Optical Flow Loss"): - # zoom_3d -> rename to animate_3d or transform_3d - flow, next_step_pil = zoom_3d( - img, - ( - params.translate_x, - params.translate_y, - params.translate_z_3d, - ), - params.rotate_3d, - params.field_of_view, - params.near_plane, - params.far_plane, - border_mode=params.infill_mode, - sampling_mode=params.sampling_mode, - stabilize=params.lock_camera, - ) - freeze_vram_usage() - - for optical_flow in optical_flows: - optical_flow.set_last_step(im) - optical_flow.set_target_flow(flow) - optical_flow.set_enabled(True) - - elif params.animation_mode == "Video Source": - - flow_im, next_step_pil = animate_video_source( - i=i, - img=img, - video_frames=video_frames, - optical_flows=optical_flows, - base_name=base_name, - pre_animation_steps=params.pre_animation_steps, - frame_stride=params.frame_stride, - steps_per_frame=params.steps_per_frame, - file_namespace=params.file_namespace, - reencode_each_frame=params.reencode_each_frame, - lock_palette=params.lock_palette, - save_every=params.save_every, - infill_mode=params.infill_mode, - sampling_mode=params.sampling_mode, - ) - - if params.animation_mode != "off": - try: - for aug in stabilization_augs: - aug.set_comp(next_step_pil) - aug.set_enabled(True) - if last_frame_semantic is not None: - last_frame_semantic.set_image(embedder, next_step_pil) - last_frame_semantic.set_enabled(True) - for aug in init_augs: - aug.set_enabled(False) - if semantic_init_prompt is not None: - semantic_init_prompt.set_enabled(False) - except UnboundLocalError: - logger.critical( - "\n\n-----< PYTTI-TOOLS > ------" - "If you are seeing this error, it might mean " - "you are using an option that expects you have " - "provided an init_image or 
video_file.\n\nIf you " - "think you are seeing this message in error, please " - "file an issue here: " - "https://github.com/pytti-tools/pytti-core/issues/new" - "-----< PYTTI-TOOLS > ------\n\n" - ) - raise ->>>>>>> 57553a3 (feat: initial rough audio parsing logic) diff --git a/src/pytti/update_func.py b/src/pytti/update_func.py index 1f25736..bb7fe62 100644 --- a/src/pytti/update_func.py +++ b/src/pytti/update_func.py @@ -190,10 +190,28 @@ def save_out( t = (i - params.pre_animation_steps) / ( params.steps_per_frame * params.frames_per_second ) - set_t(t) + set_t(t, {}) if i >= params.pre_animation_steps: if (i - params.pre_animation_steps) % params.steps_per_frame == 0: logger.debug(f"Time: {t:.4f} seconds") + + # Audio Reactivity ############ + if model.audio_parser is None: + set_t(t, {}) + # set_t(t) # this won't need to be a thing with `t`` attached to the class + if i >= params.pre_animation_steps: + # next_step_pil = None + if (i - params.pre_animation_steps) % params.steps_per_frame == 0: + if model.audio_parser is not None: + band_dict = model.audio_parser.get_params(t) + logger.debug( + f"Time: {t:.4f} seconds, audio params: {band_dict}" + ) + set_t(t, band_dict) + else: + logger.debug(f"Time: {t:.4f} seconds") + ############################### + update_rotoscopers( ((i - params.pre_animation_steps) // params.steps_per_frame + 1) * params.frame_stride From ba63ea97d0865c7cfb73e974640a57e3b0ca5f79 Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:07:15 -0700 Subject: [PATCH 17/20] audio filter config needs to be optional. --- src/pytti/config/structured_config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 0290860..54662b1 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -18,11 +18,12 @@ def check_input_against_list(attribute, value, valid_values): @define(auto_attribs=True) class AudioFilterConfig: - variable_name: str = "???" - f_center: int = "???" - f_width: int = "???" 
+ variable_name: str = "" + f_center: int = -1 + f_width: int = -1 order: int = 5 + @define(auto_attribs=True) class ConfigSchema: ############# From 7f59f1e6af94ba9a4ad6544c2a4ad0a6af59936f Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:20:14 -0700 Subject: [PATCH 18/20] typing.Optional for backwards compatibility --- src/pytti/config/structured_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 54662b1..9db6c43 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -108,9 +108,9 @@ def check(self, attribute, value): ### Induced Motion ### ###################### - input_audio: str = "" - input_audio_offset: float = 0 - input_audio_filters: AudioFilterConfig = None + input_audio: Optional[str] = "" + input_audio_offset: Optional[float] = 0 + input_audio_filters: Optional[AudioFilterConfig] = None # _2d and _3d only apply to those animation modes @@ -207,7 +207,7 @@ def check(self, attribute, value): backups: int = 0 show_graphs: bool = False approximate_vram_usage: bool = False - use_tensorboard: bool = False + use_tensorboard: Optional[bool] = False ##################################### From 92c5f4acf1f901f552e4d30f536c7bc8af5a1e5e Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:39:34 -0700 Subject: [PATCH 19/20] sadly, optional doesn't permit absent... --- tests/config/default.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/config/default.yaml b/tests/config/default.yaml index 79f9e4d..4aa9c6c 100644 --- a/tests/config/default.yaml +++ b/tests/config/default.yaml @@ -154,3 +154,11 @@ models_parent_dir: ${user_cache:} ########################## gradient_accumulation_steps: 1 + +################## + +# This shouldn't be necessary, but let's see if maybe it squashes test errors? 
+ +input_audio: "" +input_audio_offset: 0 +input_audio_filters: null From 80faea26ecd8dffc4d1a57dd4f49c352497ff96c Mon Sep 17 00:00:00 2001 From: David Marx Date: Wed, 20 Apr 2022 14:53:22 -0700 Subject: [PATCH 20/20] fixed minor integration errors --- src/pytti/ImageGuide.py | 20 +++++++++++--------- src/pytti/config/structured_config.py | 4 ++-- tests/test_animation_broken.py | 4 ++++ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 2266223..1855915 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -109,15 +109,17 @@ def __init__( self.optimizer = optimizer self.dataframe = [] - if params.input_audio and params.input_audio_filters: - self.audio_parser = SpectralAudioParser( - params.input_audio, - params.input_audio_offset, - params.frames_per_second, - params.input_audio_filters, - ) - else: - self.audio_parser = None + self.audio_parser = None + if params is not None: + if params.input_audio and params.input_audio_filters: + self.audio_parser = SpectralAudioParser( + params.input_audio, + params.input_audio_offset, + params.frames_per_second, + params.input_audio_filters, + ) + # else: + # self.audio_parser = None # self.null_update = null_update self.params = params diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 9db6c43..a574d5f 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -108,8 +108,8 @@ def check(self, attribute, value): ### Induced Motion ### ###################### - input_audio: Optional[str] = "" - input_audio_offset: Optional[float] = 0 + input_audio: str = "" + input_audio_offset: float = 0 input_audio_filters: Optional[AudioFilterConfig] = None # _2d and _3d only apply to those animation modes diff --git a/tests/test_animation_broken.py b/tests/test_animation_broken.py index 0bfbbec..8d36f1a 100644 --- a/tests/test_animation_broken.py +++ b/tests/test_animation_broken.py @@ -90,6 +90,10 @@ ########################## # adding new config items for backwards compatibility "use_tensorboard": True, # This should actually default to False. Prior to April2022, tb was non-optional + # Default null audio input parameters + "input_audio": "", + "input_audio_offset": 0, + "input_audio_filters": [], }
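
Usage sketch for the feature this patch series introduces (illustrative only — the audio file path, filter names, and animation expressions below are assumptions, not taken from the patches): each entry in input_audio_filters defines a Butterworth band-pass filter whose per-frame amplitude, normalized to [0, 1] against that band's maximum over the whole track, is exposed to the parametric animation expressions under its variable_name; the previous frame's value is also available as <variable_name>_prev. A config along these lines should drive 2D motion from the audio:

    input_audio: "audio/track.wav"      # hypothetical input file, decoded via ffmpeg
    input_audio_offset: 0
    input_audio_filters:
      - variable_name: fLo              # bass band (45-75 Hz), exposed as fLo / fLo_prev
        f_center: 60
        f_width: 30
        order: 5
      - variable_name: fHi              # upper band (3-5 kHz), exposed as fHi / fHi_prev
        f_center: 4000
        f_width: 2000
        order: 5

    animation_mode: "2D"
    zoom_x_2d: "1.001 + 0.02 * fLo"     # pulse the zoom with the bass amplitude
    zoom_y_2d: "1.001 + 0.02 * fLo"
    rotate_2d: "2 * fHi"                # rotate with the high band

If input_audio or input_audio_filters is left empty, no SpectralAudioParser is created and the animation expressions remain plain functions of t, as before.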