Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/audio input (rebased) #140

Merged
merged 20 commits into from
Apr 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
4a65d0b
feat: start implementing initial dumb audio parsing
sbaier1 Mar 23, 2022
68545fc
feat: initial rough audio parsing logic
sbaier1 Mar 24, 2022
ee20b1e
fix: some math edge cases
sbaier1 Mar 25, 2022
8a1ff32
fix: add debug log for audio 0 vectors
sbaier1 Mar 25, 2022
2c5c7c2
fix: add missing warmup config params
sbaier1 Mar 25, 2022
affdaaa
fix: run fft analysis only when images are actually saved
sbaier1 Mar 26, 2022
3eea431
fix: try adding some rudimentary error handling
sbaier1 Mar 26, 2022
97aaa06
fix: add sample count to log statement as well
sbaier1 Mar 26, 2022
d0edc43
fix: more warning logs for debugging
sbaier1 Mar 26, 2022
5bc02f4
feat: start implementing bandpass filters instead of fft+ frequency s…
sbaier1 Apr 1, 2022
c232386
feat: completely refactor fft / window_size /band-splitting based imp…
sbaier1 Apr 3, 2022
21e5b3f
refactor: remove window size config param, fix eval tooling for dict …
sbaier1 Apr 3, 2022
139eaaa
style: cleanup, remove useless todos, formatting
sbaier1 Apr 8, 2022
52c58b1
fix: use correct parameter naming, properly print bands
sbaier1 Apr 8, 2022
1688ab4
fix: param must be dict instead of tuple now
sbaier1 Apr 18, 2022
548edba
fixed dangling merge conflict
dmarx Apr 20, 2022
ba63ea9
audio filter config needs to be optional.
dmarx Apr 20, 2022
7f59f1e
typing.Optional for backwards compatibility
dmarx Apr 20, 2022
92c5f4a
sadly, optional doesn't permit absent...
dmarx Apr 20, 2022
80faea2
fixed minor integration errors
dmarx Apr 20, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions src/pytti/AudioParse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import numpy as np
import typing
import subprocess
from loguru import logger
from scipy.signal import butter, sosfilt, sosfreqz

SAMPLERATE = 44100


class SpectralAudioParser:
    """
    Reads a given input file, scans along it and parses the amplitude in selected bands
    using Butterworth bandpass filters.

    Band amplitudes are normalized into the 0..1 range (relative to each band's maximum
    over the whole track) for easier use in transformation functions.
    """

    def __init__(
        self,
        input_audio,
        offset,
        frames_per_second,
        filters
    ):
        """
        :param input_audio: path to an audio file readable by ffmpeg
        :param offset: seconds to skip at the start of the track
        :param frames_per_second: animation frame rate; one analysis window per frame
        :param filters: filter configs exposing f_center, f_width, order and
            variable_name attributes (see AudioFilterConfig)
        :raises RuntimeError: if no filters are given, decoding yielded no samples,
            or offset lies beyond the end of the track
        """
        if len(filters) < 1:
            raise RuntimeError("When using input_audio, at least 1 filter must be specified")

        # Decode to mono signed 16-bit PCM at SAMPLERATE; ffmpeg streams raw samples on stdout.
        pipe = subprocess.Popen(['ffmpeg', '-i', input_audio,
                                 '-f', 's16le',
                                 '-acodec', 'pcm_s16le',
                                 '-ar', str(SAMPLERATE),
                                 '-ac', '1',
                                 '-'], stdout=subprocess.PIPE, bufsize=10 ** 8)

        # read the audio file from the pipe in 0.5s blocks (2 bytes per sample).
        # Collect chunks and concatenate once at the end: the original np.append-per-block
        # re-allocated the whole array every iteration (quadratic in track length).
        chunks = []
        while True:
            buf = pipe.stdout.read(SAMPLERATE)
            chunks.append(np.frombuffer(buf, dtype=np.int16))
            if len(buf) < SAMPLERATE:
                break
        # EOF reached: release the pipe so ffmpeg can exit cleanly.
        pipe.stdout.close()
        pipe.wait()
        self.audio_samples = np.concatenate(chunks)
        # Bug fix: the original tested `len(...) < 0`, which is never true, so an
        # empty/failed decode was silently accepted.
        if len(self.audio_samples) == 0:
            raise RuntimeError("Audio samples are empty, assuming load failed")
        self.duration = len(self.audio_samples) / SAMPLERATE
        logger.debug(
            f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}, total duration: {self.duration}s")
        self.offset = offset
        if offset > self.duration:
            # Bug fix: the original interpolated a bare `duration` here, which would
            # raise NameError instead of the intended RuntimeError message.
            raise RuntimeError(f"Audio offset set at {offset}s but input audio is only {self.duration}s long")
        # analyze all samples for the current frame
        self.window_size = int(1 / frames_per_second * SAMPLERATE)
        self.filters = filters

        # parse band maxima first for normalizing the filtered signal to 0..1 at arbitrary points in the file later
        # this initialization is a bit compute intensive, especially for higher fps numbers, but i couldn't find a cleaner way
        # (band-passing the entire track instead of windows creates maxima that are way off, some filtering anomaly i don't understand...)
        steps = int((self.duration - self.offset) * frames_per_second)
        interval = 1 / frames_per_second
        maxima = {}
        time_steps = np.linspace(0, steps, num=steps) * interval
        for t in time_steps:
            sample_offset = int(t * SAMPLERATE)
            window = self.audio_samples[sample_offset:sample_offset + self.window_size]
            if len(window) == 0:
                # Guard: the final linspace step can land exactly at the end of the
                # track; an empty window would make np.max inside bp_filtered raise.
                continue
            cur_maxima = bp_filtered(window, filters)
            for key in cur_maxima:
                if key in maxima:
                    maxima[key] = max(maxima[key], cur_maxima[key])
                else:
                    maxima[key] = cur_maxima[key]
        self.band_maxima = maxima
        logger.debug(f"initialized band maxima for {len(filters)} filters: {self.band_maxima}")

    def get_params(self, t) -> typing.Dict[str, float]:
        """
        Return the amplitude parameters at the given point in time t within the audio track,
        or an empty dict if the track has ended.
        Amplitude/energy parameters are normalized into the [0,1] range.
        """
        # Get the point in time (sample-offset) in the track in seconds based on sample-rate
        sample_offset = int(t * SAMPLERATE + self.offset * SAMPLERATE)
        logger.debug(f"Analyzing audio at {self.offset + t}s")
        if sample_offset < len(self.audio_samples):
            window_samples = self.audio_samples[sample_offset:sample_offset + self.window_size]
            if len(window_samples) < self.window_size:
                # audio input file has likely ended
                logger.debug(
                    f"Warning: sample offset is out of range at time offset {t + self.offset}s. Returning null result")
                return {}
            return bp_filtered_norm(window_samples, self.filters, self.band_maxima)
        else:
            logger.debug(f"Warning: Audio input has ended. Returning null result")
            return {}

    def get_duration(self):
        """Total length of the decoded track in seconds."""
        return self.duration


def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth bandpass filter.

    Cutoffs are given in Hz and normalized against the Nyquist frequency of the
    sampling rate *fs*. Returns the filter as second-order sections (sos), which
    is numerically more stable than (b, a) coefficients for higher orders.
    """
    nyquist = 0.5 * fs
    normalized_band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, normalized_band, analog=False, btype='bandpass', output='sos')


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Bandpass-filter *data* between lowcut and highcut (Hz) at sampling rate *fs*.

    Designs a Butterworth filter in second-order-section form (cutoffs normalized
    to the Nyquist frequency) and applies it, returning the filtered signal.
    """
    nyq = 0.5 * fs
    sos = butter(order, [lowcut / nyq, highcut / nyq], analog=False, btype='bandpass', output='sos')
    return sosfilt(sos, data)


def bp_filtered(window_samples, filters) -> typing.Dict[str, float]:
    """
    Apply each configured bandpass filter to the sample window and return the peak
    absolute amplitude per band, keyed by the filter's variable_name.

    The passband of each filter spans f_center +/- f_width/2 (Hz).
    """
    results = {}
    # Renamed the loop variable: the original shadowed the `filter` builtin.
    for band_filter in filters:
        half_width = band_filter.f_width / 2
        lower = band_filter.f_center - half_width
        upper = band_filter.f_center + half_width
        filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=band_filter.order)
        results[band_filter.variable_name] = np.max(np.abs(filtered))
    return results


def bp_filtered_norm(window_samples, filters, norm_factors) -> typing.Dict[str, float]:
    """
    Apply the bandpass filters to the window and normalize each band's peak
    amplitude by its per-band factor (the track-wide maximum), yielding [0,1] values.
    """
    raw_amplitudes = bp_filtered(window_samples, filters)
    return {band: amplitude / norm_factors[band] for band, amplitude in raw_amplitudes.items()}
13 changes: 13 additions & 0 deletions src/pytti/ImageGuide.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
freeze_vram_usage,
vram_usage_mode,
)
from pytti.AudioParse import SpectralAudioParser
from pytti.Image.differentiable_image import DifferentiableImage
from pytti.Image.PixelImage import PixelImage
from pytti.Notebook import tqdm, make_hbox
Expand Down Expand Up @@ -108,6 +109,18 @@ def __init__(
self.optimizer = optimizer
self.dataframe = []

self.audio_parser = None
if params is not None:
if params.input_audio and params.input_audio_filters:
self.audio_parser = SpectralAudioParser(
params.input_audio,
params.input_audio_offset,
params.frames_per_second,
params.input_audio_filters,
)
# else:
# self.audio_parser = None

# self.null_update = null_update
self.params = params
self.writer = writer
Expand Down
7 changes: 7 additions & 0 deletions src/pytti/assets/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ far_plane: 10000
######################
### Induced Motion ###
######################
input_audio: ""
input_audio_offset: 0
input_audio_filters: []
# - variable_name: fLo
# f_center: 60
# f_width: 30
# order: 5

pre_animation_steps: 100
lock_camera: true
Expand Down
14 changes: 13 additions & 1 deletion src/pytti/config/structured_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ def check_input_against_list(attribute, value, valid_values):
)


@define(auto_attribs=True)
class AudioFilterConfig:
    """Configuration for one audio bandpass filter band."""

    # Name under which this band's amplitude is exposed to parametric expressions.
    variable_name: str = ""
    # Center frequency of the passband in Hz. Default -1 presumably means
    # "not configured" — TODO confirm; no validation is visible here.
    f_center: int = -1
    # Total width of the passband in Hz (band spans f_center +/- f_width/2).
    f_width: int = -1
    # Butterworth filter order.
    order: int = 5


@define(auto_attribs=True)
class ConfigSchema:
#############
Expand Down Expand Up @@ -100,6 +108,10 @@ def check(self, attribute, value):
### Induced Motion ###
######################

input_audio: str = ""
input_audio_offset: float = 0
input_audio_filters: Optional[AudioFilterConfig] = None

# _2d and _3d only apply to those animation modes

translate_x: str = "0"
Expand Down Expand Up @@ -195,7 +207,7 @@ def check(self, attribute, value):
backups: int = 0
show_graphs: bool = False
approximate_vram_usage: bool = False
use_tensorboard: bool = False
use_tensorboard: Optional[bool] = False

#####################################

Expand Down
16 changes: 14 additions & 2 deletions src/pytti/eval_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

math_env = None
global_t = 0
global_bands = {}
global_bands_prev = {}
eval_memo = {}


Expand All @@ -27,6 +29,11 @@ def parametric_eval(string, **vals):
)
math_env.update(vals)
math_env["t"] = global_t
for band in global_bands:
math_env[band] = global_bands[band]
if global_bands_prev:
for band in global_bands_prev:
math_env[f'{band}_prev'] = global_bands_prev[band]
try:
output = eval(string, math_env)
except SyntaxError as e:
Expand All @@ -37,9 +44,14 @@ def parametric_eval(string, **vals):
return string


def set_t(t):
global global_t, eval_memo
def set_t(t, band_dict):
    """Publish the current animation time and audio band values to the eval environment.

    The previous frame's band values are kept so expressions can reference them
    (exposed with a ``_prev`` suffix by parametric_eval); on the very first call
    the current values double as the previous ones. The eval memo is cleared
    because the environment changed.
    """
    global global_t, global_bands, global_bands_prev, eval_memo
    global_t = t
    global_bands_prev = global_bands if global_bands else band_dict
    global_bands = band_dict
    eval_memo = {}


Expand Down
20 changes: 19 additions & 1 deletion src/pytti/update_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,28 @@ def save_out(
t = (i - params.pre_animation_steps) / (
params.steps_per_frame * params.frames_per_second
)
set_t(t)
set_t(t, {})
if i >= params.pre_animation_steps:
if (i - params.pre_animation_steps) % params.steps_per_frame == 0:
logger.debug(f"Time: {t:.4f} seconds")

# Audio Reactivity ############
if model.audio_parser is None:
set_t(t, {})
# set_t(t) # this won't need to be a thing with `t`` attached to the class
if i >= params.pre_animation_steps:
# next_step_pil = None
if (i - params.pre_animation_steps) % params.steps_per_frame == 0:
if model.audio_parser is not None:
band_dict = model.audio_parser.get_params(t)
logger.debug(
f"Time: {t:.4f} seconds, audio params: {band_dict}"
)
set_t(t, band_dict)
else:
logger.debug(f"Time: {t:.4f} seconds")
###############################

update_rotoscopers(
((i - params.pre_animation_steps) // params.steps_per_frame + 1)
* params.frame_stride
Expand Down
8 changes: 8 additions & 0 deletions tests/config/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,11 @@ models_parent_dir: ${user_cache:}
##########################

gradient_accumulation_steps: 1

##################

# This shouldn't be necessary, but let's see if maybe it squashes test errors?

input_audio: ""
input_audio_offset: 0
input_audio_filters: null
4 changes: 4 additions & 0 deletions tests/test_animation_broken.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@
##########################
# adding new config items for backwards compatibility
"use_tensorboard": True, # This should actually default to False. Prior to April2022, tb was non-optional
# Default null audio input parameters
"input_audio": "",
"input_audio_offset": 0,
"input_audio_filters": [],
}


Expand Down