From 492efa1c6a47eca6a7f5b3ecc18765f421e94baf Mon Sep 17 00:00:00 2001 From: zjduan Date: Wed, 6 Sep 2023 22:36:57 +0800 Subject: [PATCH] improve efficiency --- .../DiffSynth/smoother/PySynthSmoother.py | 315 ++++++++---------- diffusion/DiffSynth/README.md | 19 +- .../1_text_guided_video_stylization.json | 11 +- .../config/2_fashion_video_synthesis.json | 1 - .../3_image_guided_video_stylization.json | 4 +- .../DiffSynth/config/4_video_restoring.json | 4 +- .../DiffSynth/config/5_3d_rendering.json | 4 +- 7 files changed, 155 insertions(+), 203 deletions(-) diff --git a/diffusion/DiffSynth/DiffSynth/smoother/PySynthSmoother.py b/diffusion/DiffSynth/DiffSynth/smoother/PySynthSmoother.py index e40bd34..91938ff 100644 --- a/diffusion/DiffSynth/DiffSynth/smoother/PySynthSmoother.py +++ b/diffusion/DiffSynth/DiffSynth/smoother/PySynthSmoother.py @@ -1,10 +1,8 @@ -import torch, os, cv2 +import cv2 from PIL import Image, ImageEnhance import numpy as np -from einops import rearrange, repeat from tqdm import tqdm import cupy as cp -from DiffSynth.utils import save_video, save_images remapping_kernel = cp.RawKernel(r''' @@ -269,107 +267,111 @@ def estimate_nnf(self, source_guide, target_guide, source_style, nnf=None): return nnf, target_style -class LeftVideoGraph: - def __init__(self, n): - self.n = n - self.edges = {} - for i in range(n): - father = self.father(i) - if father>1)>(x^y): - return None - return x^y - - def cousin_leaves(self, x): - y = 1 - while x&y: - y <<= 1 - x -= x & (y - 1) - return range(x+y, x+(y<<1)) - - def query_middle_node(self, x, y): - for z in range(x+1, y): - if (x, z) in self.edges and (z, y) in self.edges: - return z - return None - - def query(self, x): - z_list = [] - z = -1 - for i in range(10): - y = 1 - while z + (y<<1)<=x: - y <<= 1 - z += y - z_list.append(z) - if z==x: - break - return z_list - - def query_edge(self, level): - edge_list = [] - step = 1<=leftbound: + node_level = 0 + while (1<=leftbound: + node_level += 1 + node_list.append((node_index, node_level)) + node_index -= 1<=len(frames_style) or i==frame_id: + if i<0 or i>=len(data) or i==frame_id: continue - _, remapped_frame = self.patch_match_engine.estimate_nnf(frames_guide[i], frames_guide[frame_id], frames_style[i]) - remapped_frames.append(remapped_frame.get()) - blended_frame = self.blending_operator(remapped_frames) / len(remapped_frames) + remapped_frame = data.remap(data(i), i, frame_id) + remapped_frames.append(remapped_frame) + blended_frame, _ = data.blend(remapped_frames) frames_output.append(blended_frame) return frames_output - def remap_and_blend_left(self, frames_guide, frames_style): - n = len(frames_guide) - graph = LeftVideoGraph(n) - # Estimate NNF - nnf_dict = {} - for u, v in tqdm(graph.edges, desc="Estimating NNF"): - nnf, _ = self.patch_match_engine.estimate_nnf( - source_guide=frames_guide[u], - target_guide=frames_guide[v], - source_style=frames_style[u] - ) - nnf_dict[(u, v)] = nnf.get() - # remap_table and blend_table - remap_table = [[frames_style[i]] for i in range(n)] - blend_table = [[frames_style[i]] for i in range(n)] - level = 0 - while True: - edges = graph.query_edge(level) - level += 1 - if len(edges)==0: - break - for u, v in edges: - nnf = nnf_dict[(u, v)] - remaping_result = self.remapping_operator(nnf, blend_table[u][-1]) - remap_table[v].append(remaping_result) - blending_result = self.blending_operator(remap_table[v]) - blend_table[v].append(blending_result) - # calculate remapping prefix sum - blending_inputs = [] - for i in tqdm(range(n), 
desc="Remapping frames"): - blending_input = [] - # sum of 0...i-1 - nodes = graph.query(i) - for u in nodes: - if u==i: - if len(remap_table[u])==1: - continue - else: - remaping_result = self.blending_operator(remap_table[u][1:]) - else: - nnf = nnf_dict[(u, i)] - remaping_result = self.remapping_operator(nnf, blend_table[u][-1]) - blending_input.append(remaping_result) - blending_inputs.append(blending_input) - return blending_inputs - def smooth_fastest(self, frames_guide, frames_style): - n = len(frames_guide) - prefix_sum = self.remap_and_blend_left(frames_guide, frames_style) - suffix_sum = self.remap_and_blend_left(frames_guide[::-1], frames_style[::-1])[::-1] + # left + data = VideoWithOperator(frames_guide, frames_style, **self.ebsynth_config) + algo = FastBlendingAlgorithm(data) + remapped_frames_l = [] + for frame_id in tqdm(range(len(data)), desc="Remapping and blending (left part)"): + bound = max(frame_id - self.window_size, 0) + remapped_frames_l.append(algo.query(bound, frame_id)) + # right + data = VideoWithOperator(frames_guide[::-1], frames_style[::-1], **self.ebsynth_config) + algo = FastBlendingAlgorithm(data) + remapped_frames_r = [] + for frame_id in tqdm(range(len(data)), desc="Remapping and blending (right part)"): + bound = max(frame_id - self.window_size, 0) + remapped_frames_r.append(algo.query(bound, frame_id)) + remapped_frames_r = remapped_frames_r[::-1] + # merge frames_output = [] - for i, l, m, r in zip(range(n), prefix_sum, frames_style, suffix_sum): - window_size = min(i + self.window_size, n - 1) - max(i - self.window_size, 0) + 1 - frame = self.blending_operator(l + [m] + r) / n - frames_output.append(frame) + data = VideoWithOperator(frames_guide, frames_style, **self.ebsynth_config) + for frame_id in range(len(data)): + frame, _ = data(frame_id) + frame_output, _ = data.blend([ + remapped_frames_l[frame_id], + (frame, -1), + remapped_frames_r[frame_id] + ]) + frames_output.append(frame_output) return frames_output - def postprocessing_contrast(self, style, rate): - style = [ImageEnhance.Contrast(i).enhance(rate) for i in style] - return style - - def postprocessing_sharpness(self, style, rate): - style = [ImageEnhance.Sharpness(i).enhance(rate) for i in style] - return style - - def image_postprocessing(self, images): - for name in self.postprocessing: - rate = self.postprocessing[name] - if name == "contrast": - images = self.postprocessing_contrast(images, rate) - elif name == "sharpness": - images = self.postprocessing_sharpness(images, rate) - return images - def smooth(self, frames_style): frames_guide = self.PIL_to_numpy(self.frames_guide) frames_style = self.PIL_to_numpy(frames_style) @@ -486,5 +432,6 @@ def smooth(self, frames_style): else: raise NotImplementedError() frames_output = self.numpy_to_PIL(frames_output) - frames_output = self.image_postprocessing(frames_output) + frames_output = self.postprocessor(frames_output) return frames_output + diff --git a/diffusion/DiffSynth/README.md b/diffusion/DiffSynth/README.md index 468c7a2..58dad57 100644 --- a/diffusion/DiffSynth/README.md +++ b/diffusion/DiffSynth/README.md @@ -4,6 +4,8 @@ DiffSynth is an open-source project that aims to apply diffusion models to video synthesis. You can use DiffSynth to synthesize coherent and realistic videos. +**Now an extention of stable-diffusion-webui is available! See [here](https://github.com/Artiprocher/sd-webui-fastblend).** This extension is an implementation of the fast blending algorithm in DiffSynth. 
We have found this algorithm to be very effective, so we developed the extension independently to make it easy to use.
+
 
 ## Installation
 
 environment.yml:
@@ -125,23 +127,24 @@ DiffSynth is still under development. Now we recommend you to only use `PySynthS
 ```json
 "smoother": "PySynthSmoother",
 "smoother_config": {
-    "gpu_id": 0,
-    "speed": "fastest"
-},
+    "speed": "fastest",
+    "window_size": 30
+}
 ```
 
 or
 
 ```json
 "smoother": "PySynthSmoother",
 "smoother_config": {
-    "gpu_id": 0,
     "speed": "slowest",
-    "window_size": 3
-},
+    "window_size": 30
+}
 ```
 
-If `speed` is `fastest`, this algorithm will blend all frames together. The time complexity is O(nlogn), where n is the number of frames. This algorithm may make the video foggy when the number of frames is large.
+This algorithm blends the frames within a sliding window. It may make the video foggy when the window size is large.
+
+If `speed` is `fastest`, the time complexity is O(n log n), where n is the number of frames.
 
-If `speed` is `slowest`, this algorithm will blend the frames in a sliding window. The time complexity is O(nk), where k is the size of sliding window.
+If `speed` is `slowest`, the time complexity is O(nk), where k is the size of the sliding window.
 
 Additionally, you can adjust the contrast and sharpness in the smoother. You only need to add the following parameters in the `smoother_config`.
diff --git a/diffusion/DiffSynth/config/1_text_guided_video_stylization.json b/diffusion/DiffSynth/config/1_text_guided_video_stylization.json
index 7edb56b..4e1932c 100644
--- a/diffusion/DiffSynth/config/1_text_guided_video_stylization.json
+++ b/diffusion/DiffSynth/config/1_text_guided_video_stylization.json
@@ -13,8 +13,8 @@
 ],
 "smoother": "PySynthSmoother",
 "smoother_config": {
-    "gpu_id": 0,
     "speed": "fastest",
+    "window_size": 30,
     "postprocessing": {
         "contrast": 1.5,
         "sharpness": 5.0
@@ -38,8 +38,11 @@
 "output_path": "output_video",
 "post_smoother": "PySynthSmoother",
 "post_smoother_config": {
-    "gpu_id": 0,
-    "speed": "slowest",
-    "window_size": 60
+    "speed": "fastest",
+    "window_size": 30,
+    "postprocessing": {
+        "contrast": 1.5,
+        "sharpness": 3.0
+    }
 }
 }
\ No newline at end of file
diff --git a/diffusion/DiffSynth/config/2_fashion_video_synthesis.json b/diffusion/DiffSynth/config/2_fashion_video_synthesis.json
index c82f11c..1cfaeb1 100644
--- a/diffusion/DiffSynth/config/2_fashion_video_synthesis.json
+++ b/diffusion/DiffSynth/config/2_fashion_video_synthesis.json
@@ -13,7 +13,6 @@
 ],
 "smoother": "PySynthSmoother",
 "smoother_config": {
-    "gpu_id": 0,
     "speed": "slowest",
     "window_size": 3
 },
diff --git a/diffusion/DiffSynth/config/3_image_guided_video_stylization.json b/diffusion/DiffSynth/config/3_image_guided_video_stylization.json
index 1c62cea..d1234b9 100644
--- a/diffusion/DiffSynth/config/3_image_guided_video_stylization.json
+++ b/diffusion/DiffSynth/config/3_image_guided_video_stylization.json
@@ -16,8 +16,8 @@
 ],
 "smoother": "PySynthSmoother",
 "smoother_config": {
-    "gpu_id": 0,
-    "speed": "fastest"
+    "speed": "fastest",
+    "window_size": 10000
 },
 "model_id": "../models/stable-diffusion-v1-5",
 "input_video": "../data/app_2_image_guided_style_transfer/clip1.mp4",
diff --git a/diffusion/DiffSynth/config/4_video_restoring.json b/diffusion/DiffSynth/config/4_video_restoring.json
index 2b24fc4..d2fac11 100644
--- a/diffusion/DiffSynth/config/4_video_restoring.json
+++ b/diffusion/DiffSynth/config/4_video_restoring.json
@@ -10,8 +10,8 @@
 ],
 "smoother": "PySynthSmoother",
 "smoother_config": {
-    "gpu_id": 0,
-    "speed": "fastest"
+    "speed": "fastest",
+    "window_size": 10000
 },
 "model_id": "../models/stable-diffusion-v1-5",
 "input_video": "../data/app_3_video_restoring/clip1.mp4",
diff --git a/diffusion/DiffSynth/config/5_3d_rendering.json b/diffusion/DiffSynth/config/5_3d_rendering.json
index a8ef90e..04d10ac 100644
--- a/diffusion/DiffSynth/config/5_3d_rendering.json
+++ b/diffusion/DiffSynth/config/5_3d_rendering.json
@@ -13,8 +13,8 @@
 ],
 "smoother": "PySynthSmoother",
 "smoother_config": {
-    "gpu_id": 0,
-    "speed": "fastest"
+    "speed": "fastest",
+    "window_size": 10000
 },
 "model_id": "../models/stable-diffusion-v1-5",
 "input_video": "../data/app_1_render/raw.webm",