diff --git a/src/pytti/AudioParse.py b/src/pytti/AudioParse.py
new file mode 100644
index 0000000..fe94e58
--- /dev/null
+++ b/src/pytti/AudioParse.py
@@ -0,0 +1,125 @@
+import numpy as np
+import typing
+import subprocess
+from loguru import logger
+from scipy.signal import butter, sosfilt
+
+SAMPLERATE = 44100
+
+
+class SpectralAudioParser:
+    """
+    Reads the given input file, scans along it, and parses the amplitude in the
+    selected bands using Butterworth band-pass filters.
+    The amplitude is normalized into the [0, 1] range for easier use in
+    transformation functions.
+    """
+
+    def __init__(
+        self,
+        input_audio,
+        offset,
+        frames_per_second,
+        filters
+    ):
+        if len(filters) < 1:
+            raise RuntimeError("When using input_audio, at least 1 filter must be specified")
+
+        pipe = subprocess.Popen(['ffmpeg', '-i', input_audio,
+                                 '-f', 's16le',
+                                 '-acodec', 'pcm_s16le',
+                                 '-ar', str(SAMPLERATE),
+                                 '-ac', '1',
+                                 '-'], stdout=subprocess.PIPE, bufsize=10 ** 8)
+
+        # read the decoded mono PCM stream from the pipe in 0.5s blocks
+        # (2 bytes per sample, so SAMPLERATE bytes = 0.5s of audio)
+        buffers = []
+        while True:
+            buf = pipe.stdout.read(SAMPLERATE)
+            buffers.append(np.frombuffer(buf, dtype=np.int16))
+            if len(buf) < SAMPLERATE:
+                break
+        self.audio_samples = np.concatenate(buffers)
+        if len(self.audio_samples) == 0:
+            raise RuntimeError("Audio samples are empty, assuming load failed")
+        self.duration = len(self.audio_samples) / SAMPLERATE
+        logger.debug(
+            f"initialized audio file {input_audio}, samples read: {len(self.audio_samples)}, total duration: {self.duration}s")
+        self.offset = offset
+        if offset > self.duration:
+            raise RuntimeError(f"Audio offset set at {offset}s but input audio is only {self.duration}s long")
+        # analyze one frame's worth of samples per window
+        self.window_size = int(1 / frames_per_second * SAMPLERATE)
+        self.filters = filters
+
+        # Parse the per-band maxima first, so the filtered signal can be
+        # normalized to [0, 1] at arbitrary points in the file later.
+        # This initialization is somewhat compute-intensive, especially at
+        # higher fps, but band-passing the entire track instead of per-frame
+        # windows produces maxima that are way off (a filtering anomaly).
+        steps = int((self.duration - self.offset) * frames_per_second)
+        interval = 1 / frames_per_second
+        maxima = {}
+        time_steps = np.arange(steps) * interval
+        for t in time_steps:
+            # scan the same windows that get_params() will later query,
+            # i.e. starting at the configured offset
+            sample_offset = int((t + self.offset) * SAMPLERATE)
+            cur_maxima = bp_filtered(self.audio_samples[sample_offset:sample_offset + self.window_size], filters)
+            for key in cur_maxima:
+                if key in maxima:
+                    maxima[key] = max(maxima[key], cur_maxima[key])
+                else:
+                    maxima[key] = cur_maxima[key]
+        self.band_maxima = maxima
+        logger.debug(f"initialized band maxima for {len(filters)} filters: {self.band_maxima}")
+
+    def get_params(self, t) -> typing.Dict[str, float]:
+        """
+        Return the amplitude parameters at the given point in time t within the
+        audio track, or an empty dict if the track has ended.
+        Amplitude/energy parameters are normalized into the [0, 1] range.
+        """
+        # get the sample offset into the track based on time offset and sample rate
+        sample_offset = int((t + self.offset) * SAMPLERATE)
+        logger.debug(f"Analyzing audio at {self.offset + t}s")
+        if sample_offset < len(self.audio_samples):
+            window_samples = self.audio_samples[sample_offset:sample_offset + self.window_size]
+            if len(window_samples) < self.window_size:
+                # the audio input file has likely ended
+                logger.warning(
+                    f"Sample window at time offset {t + self.offset}s is out of range. Returning null result")
+                return {}
+            return bp_filtered_norm(window_samples, self.filters, self.band_maxima)
+        else:
+            logger.warning("Audio input has ended. Returning null result")
+            return {}
+
+    def get_duration(self):
+        return self.duration
+
+
+def butter_bandpass(lowcut, highcut, fs, order=5):
+    # design a Butterworth band-pass filter as second-order sections
+    nyq = 0.5 * fs
+    low = lowcut / nyq
+    high = highcut / nyq
+    sos = butter(order, [low, high], analog=False, btype='bandpass', output='sos')
+    return sos
+
+
+def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
+    sos = butter_bandpass(lowcut, highcut, fs, order=order)
+    y = sosfilt(sos, data)
+    return y
+
+
+def bp_filtered(window_samples, filters) -> typing.Dict[str, float]:
+    # peak amplitude of the window within each filter's frequency band
+    results = {}
+    for band_filter in filters:
+        offset = band_filter.f_width / 2
+        lower = band_filter.f_center - offset
+        upper = band_filter.f_center + offset
+        filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=band_filter.order)
+        results[band_filter.variable_name] = np.max(np.abs(filtered))
+    return results
+
+
+def bp_filtered_norm(window_samples, filters, norm_factors) -> typing.Dict[str, float]:
+    results = bp_filtered(window_samples, filters)
+    for key in results:
+        # normalize against the precomputed per-band maximum
+        results[key] = results[key] / norm_factors[key]
+    return results
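+
+
+# A minimal usage sketch (not part of the module proper). It assumes an
+# AudioFilterConfig-style object with the fields referenced above; the file
+# name and filter values are placeholders:
+#
+#   from pytti.config.structured_config import AudioFilterConfig
+#   bass = AudioFilterConfig(variable_name="fLo", f_center=60, f_width=30, order=5)
+#   parser = SpectralAudioParser("track.mp3", 0, 30, [bass])
+#   parser.get_params(1.5)  # e.g. {"fLo": 0.42}: normalized bass amplitude at 1.5s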
+ """ + # Get the point in time (sample-offset) in the track in seconds based on sample-rate + sample_offset = int(t * SAMPLERATE + self.offset * SAMPLERATE) + logger.debug(f"Analyzing audio at {self.offset + t}s") + if sample_offset < len(self.audio_samples): + window_samples = self.audio_samples[sample_offset:sample_offset + self.window_size] + if len(window_samples) < self.window_size: + # audio input file has likely ended + logger.debug( + f"Warning: sample offset is out of range at time offset {t + self.offset}s. Returning null result") + return {} + return bp_filtered_norm(window_samples, self.filters, self.band_maxima) + else: + logger.debug(f"Warning: Audio input has ended. Returning null result") + return {} + + def get_duration(self): + return self.duration + + +def butter_bandpass(lowcut, highcut, fs, order=5): + nyq = 0.5 * fs + low = lowcut / nyq + high = highcut / nyq + sos = butter(order, [low, high], analog=False, btype='bandpass', output='sos') + return sos + + +def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): + sos = butter_bandpass(lowcut, highcut, fs, order=order) + y = sosfilt(sos, data) + return y + + +def bp_filtered(window_samples, filters) -> typing.Dict[str, float]: + results = {} + for filter in filters: + offset = filter.f_width / 2 + lower = filter.f_center - offset + upper = filter.f_center + offset + filtered = butter_bandpass_filter(window_samples, lower, upper, SAMPLERATE, order=filter.order) + results[filter.variable_name] = np.max(np.abs(filtered)) + return results + + +def bp_filtered_norm(window_samples, filters, norm_factors) -> typing.Dict[str, float]: + results = bp_filtered(window_samples, filters) + for key in results: + # normalize + results[key] = results[key] / norm_factors[key] + return results diff --git a/src/pytti/ImageGuide.py b/src/pytti/ImageGuide.py index 3534ce0..1855915 100644 --- a/src/pytti/ImageGuide.py +++ b/src/pytti/ImageGuide.py @@ -19,6 +19,7 @@ freeze_vram_usage, vram_usage_mode, ) +from pytti.AudioParse import SpectralAudioParser from pytti.Image.differentiable_image import DifferentiableImage from pytti.Image.PixelImage import PixelImage from pytti.Notebook import tqdm, make_hbox @@ -108,6 +109,18 @@ def __init__( self.optimizer = optimizer self.dataframe = [] + self.audio_parser = None + if params is not None: + if params.input_audio and params.input_audio_filters: + self.audio_parser = SpectralAudioParser( + params.input_audio, + params.input_audio_offset, + params.frames_per_second, + params.input_audio_filters, + ) + # else: + # self.audio_parser = None + # self.null_update = null_update self.params = params self.writer = writer diff --git a/src/pytti/assets/default.yaml b/src/pytti/assets/default.yaml index a4f9630..4556bb5 100644 --- a/src/pytti/assets/default.yaml +++ b/src/pytti/assets/default.yaml @@ -81,6 +81,13 @@ far_plane: 10000 ###################### ### Induced Motion ### ###################### +input_audio: "" +input_audio_offset: 0 +input_audio_filters: [] +# - variable_name: fLo +# f_center: 60 +# f_width: 30 +# order: 5 pre_animation_steps: 100 lock_camera: true diff --git a/src/pytti/config/structured_config.py b/src/pytti/config/structured_config.py index 6ef7ea3..a574d5f 100644 --- a/src/pytti/config/structured_config.py +++ b/src/pytti/config/structured_config.py @@ -16,6 +16,14 @@ def check_input_against_list(attribute, value, valid_values): ) +@define(auto_attribs=True) +class AudioFilterConfig: + variable_name: str = "" + f_center: int = -1 + f_width: int = -1 + order: int = 5 + + 
 @define(auto_attribs=True)
 class ConfigSchema:
     #############
@@ -100,6 +108,10 @@ def check(self, attribute, value):
     ######################
     ### Induced Motion ###
     ######################
 
+    input_audio: str = ""
+    input_audio_offset: float = 0
+    input_audio_filters: Optional[List[AudioFilterConfig]] = None
+
     # _2d and _3d only apply to those animation modes
     translate_x: str = "0"
@@ -195,7 +207,7 @@ def check(self, attribute, value):
     backups: int = 0
     show_graphs: bool = False
     approximate_vram_usage: bool = False
-    use_tensorboard: bool = False
+    use_tensorboard: Optional[bool] = False
 
 
     #####################################
diff --git a/src/pytti/eval_tools.py b/src/pytti/eval_tools.py
index 85cfac1..e637d06 100644
--- a/src/pytti/eval_tools.py
+++ b/src/pytti/eval_tools.py
@@ -5,6 +5,8 @@
 
 math_env = None
 global_t = 0
+global_bands = {}
+global_bands_prev = {}
 eval_memo = {}
 
 
@@ -27,6 +29,11 @@ def parametric_eval(string, **vals):
             )
         math_env.update(vals)
         math_env["t"] = global_t
+        # expose the current band amplitudes to parametric expressions, and
+        # the previous frame's values under <name>_prev
+        for band in global_bands:
+            math_env[band] = global_bands[band]
+        for band in global_bands_prev:
+            math_env[f'{band}_prev'] = global_bands_prev[band]
         try:
             output = eval(string, math_env)
         except SyntaxError as e:
@@ -37,9 +44,14 @@ def parametric_eval(string, **vals):
     return string
 
 
-def set_t(t):
-    global global_t, eval_memo
+def set_t(t, band_dict):
+    global global_t, global_bands, global_bands_prev, eval_memo
     global_t = t
+    if band_dict:
+        # rotate the band history so expressions can reference e.g. fLo_prev;
+        # on the first frame there is no history yet, so fall back to the
+        # current values
+        global_bands_prev = global_bands if global_bands else band_dict
+        global_bands = band_dict
     eval_memo = {}
diff --git a/src/pytti/update_func.py b/src/pytti/update_func.py
index 1f25736..bb7fe62 100644
--- a/src/pytti/update_func.py
+++ b/src/pytti/update_func.py
@@ -190,10 +190,28 @@ def save_out(
         t = (i - params.pre_animation_steps) / (
             params.steps_per_frame * params.frames_per_second
         )
-        set_t(t)
+        # no audio bands yet at this point; they are filled in once per
+        # frame below when an audio parser is configured
+        set_t(t, {})
     if i >= params.pre_animation_steps:
         if (i - params.pre_animation_steps) % params.steps_per_frame == 0:
-            logger.debug(f"Time: {t:.4f} seconds")
+            # Audio Reactivity ############
+            if model.audio_parser is not None:
+                band_dict = model.audio_parser.get_params(t)
+                logger.debug(f"Time: {t:.4f} seconds, audio params: {band_dict}")
+                set_t(t, band_dict)
+            else:
+                logger.debug(f"Time: {t:.4f} seconds")
+            ###############################
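+            # The band values passed to set_t() become variables in parametric
+            # expressions alongside t, each with a *_prev twin holding the
+            # previous frame's value. A hypothetical scene setting using the
+            # fLo filter from the default.yaml example could be:
+            #   translate_x: "50 * fLo"
+            #   translate_y: "10 * (fLo - fLo_prev)"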
+
         update_rotoscopers(
             ((i - params.pre_animation_steps) // params.steps_per_frame + 1)
             * params.frame_stride
diff --git a/tests/config/default.yaml b/tests/config/default.yaml
index 79f9e4d..4aa9c6c 100644
--- a/tests/config/default.yaml
+++ b/tests/config/default.yaml
@@ -154,3 +154,11 @@ models_parent_dir: ${user_cache:}
 ##########################
 
 gradient_accumulation_steps: 1
+
+##################
+
+# These should be inherited from the main schema defaults, but are set
+# explicitly so the test configs cannot trip on the new fields.
+
+input_audio: ""
+input_audio_offset: 0
+input_audio_filters: null
diff --git a/tests/test_animation_broken.py b/tests/test_animation_broken.py
index 0bfbbec..8d36f1a 100644
--- a/tests/test_animation_broken.py
+++ b/tests/test_animation_broken.py
@@ -90,6 +90,10 @@
     ##########################
     # adding new config items for backwards compatibility
     "use_tensorboard": True,  # This should actually default to False; prior to April 2022, tensorboard was non-optional.
+    # Default null audio input parameters
+    "input_audio": "",
+    "input_audio_offset": 0,
+    "input_audio_filters": [],
 }