Skip to content

Commit

Permalink
Fixed bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
JusperLee committed Nov 22, 2024
1 parent f309c79 commit 33b525e
Show file tree
Hide file tree
Showing 55 changed files with 121 additions and 11 deletions.
Empty file modified .github/workflows/static.yml
100644 → 100755
Empty file.
Empty file modified .gitignore
100644 → 100755
Empty file.
Empty file modified 20words_mean_face.npy
100644 → 100755
Empty file.
Empty file modified DataPreProcess/process_lrs23.py
100644 → 100755
Empty file.
Empty file modified DataPreProcess/process_vox2.py
100644 → 100755
Empty file.
Empty file modified LICENSE
100644 → 100755
Empty file.
Empty file modified README.md
100644 → 100755
Empty file.
Empty file modified README_zh-CN.md
100644 → 100755
Empty file.
Empty file modified configs/LRS2-IIANet.yml
100644 → 100755
Empty file.
Empty file modified configs/LRS3-IIANet.yml
100644 → 100755
Empty file.
Empty file modified configs/Vox2-IIANet.yml
100644 → 100755
Empty file.
Empty file modified figures/IIANet-Figure1B.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/audio-waves.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/overall.gif
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/overall.mp4
100644 → 100755
Empty file.
Empty file modified figures/results.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/separation.gif
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/separation.mp4
100644 → 100755
Empty file.
Empty file modified figures/spec.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
93 changes: 93 additions & 0 deletions generate_data_100mix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""Build a synthetic multi-speaker mixture from VoxCeleb2 wavs and separate
the target speaker with a pretrained IIANet audio-visual model.

Pipeline:
  1. Load a precomputed {speaker_id: [wav paths]} index from
     ``wav_files_dict.json``.
  2. Pick one target speaker plus interfering speakers, attenuate the
     interferers by a random SIR, and sum everything into ``mix.wav``.
  3. Copy the target's mouth-ROI ``.npz`` next to the script, run the video
     model on it, and separate the target from the mixture into ``est.wav``.
"""
import os
import json
import random
import shutil

import numpy as np
import torch
import torchaudio
import yaml

from look2hear.models import IIANet
from look2hear.datas.transform import get_preprocessing_pipelines
from look2hear.videomodels import ResNetVideoModel


def get_wav_files_dict(directory):
    """Index every ``.wav`` under *directory*, keyed by speaker id.

    The speaker id is taken as the part of the file name before the first
    underscore. Used once, offline, to produce ``wav_files_dict.json``:

        wav_files_dict = get_wav_files_dict(".../wav16k/min/tr/s1")
        with open("wav_files_dict.json", "w") as f:
            json.dump(wav_files_dict, f)

    Args:
        directory: root folder to walk recursively.

    Returns:
        dict mapping speaker id -> list of absolute wav paths.
    """
    files_dict = {}
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".wav"):
                files_dict.setdefault(file.split("_")[0], []).append(
                    os.path.join(root, file)
                )
    return files_dict


# Load the precomputed speaker -> wav-paths index (see get_wav_files_dict).
with open('wav_files_dict.json', 'r') as f:
    wav_files_dict = json.load(f)

datas = []
# random.sample() requires a sequence; passing dict.keys() directly raises
# TypeError on Python >= 3.11 (deprecated since 3.9), hence the list() copy.
select_keys = random.sample(list(wav_files_dict.keys()), k=3)
datapath = random.sample(wav_files_dict[select_keys[0]], k=1)
# Derive the mouth-ROI file name from the wav name; assumes wav files are
# named <id>_<clip>_<segment>_... — TODO confirm against the dataset layout.
mouthpath = datapath[0].split('/')[-1].split("_")
mouthpath = f"{mouthpath[0]}_{mouthpath[1]}_{mouthpath[2]}.npz"
# torchaudio.load returns (waveform, sample_rate); keep only the waveform.
audio_gt = torchaudio.load(datapath[0])[0]
datas.append(audio_gt)

# Add interfering speakers, each attenuated by a random SIR in [-30, -10] dB.
for key in select_keys[1:]:
    datapath = random.sample(wav_files_dict[key], k=1)
    audio = torchaudio.load(datapath[0])[0]
    sirs = torch.Tensor(1).uniform_(-30, -10).numpy()
    audio *= 10. ** (sirs / 20.)  # dB -> linear gain
    datas.append(audio)

# NOTE(review): torch.stack requires all clips to have identical length —
# presumably guaranteed by the "min" subset of the corpus; confirm.
mix = torch.stack(datas).sum(0)
torchaudio.save("mix.wav", mix, 16000)
torchaudio.save("audio_gt.wav", audio_gt, 16000)
shutil.copy(f"/home/likai/ssd/vox2/vox2/mouths/{mouthpath}", "mouth.npz")

# Load the training configuration used for the pretrained checkpoint.
with open("checkpoints/vox2/conf.yml", "rb") as f:
    train_conf = yaml.safe_load(f)

# Load pretrained separation (audio) and lip-reading (video) models.
checkpoint_path = os.path.join(train_conf["main_args"]["exp_dir"], "best_model.pth")
audiomodel = IIANet.from_pretrain(
    checkpoint_path,
    sample_rate=train_conf["datamodule"]["data_config"]["sample_rate"],
    **train_conf["audionet"]["audionet_config"],
)
videomodel = ResNetVideoModel(**train_conf["videonet"]["videonet_config"])
audiomodel.cuda()
audiomodel.eval()
videomodel.cuda()
videomodel.eval()

with torch.no_grad():
    mouth_roi = np.load("mouth.npz")["data"]
    mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)

    mix = torchaudio.load("mix.wav")[0].cuda()

    # [None, None] adds batch and channel axes before the video frontend.
    mouth_emb = videomodel(torch.from_numpy(mouth_roi[None, None]).float().cuda())
    est_sources = audiomodel(mix[None], mouth_emb)

    torchaudio.save("est.wav", est_sources[0].cpu(), 16000)
39 changes: 28 additions & 11 deletions inference.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -351,25 +351,27 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g

if __name__ == '__main__':

os.environ["CUDA_VISIBLE_DEVICES"] = "8"
os.environ["CUDA_VISIBLE_DEVICES"] = "9"

input_file = './test_videos/video.mp4'
temp_output_file = './test_videos/video25fps.mp4'
final_output_file = './test_videos/video.mp4'
output_path = "./test_videos/video/"
subprocess.run(['ffmpeg', '-i', input_file, '-filter:v', 'fps=fps=25', temp_output_file])
number_of_speakers = 2

# subprocess.run(['ffmpeg', '-i', input_file, '-filter:v', 'fps=fps=25', temp_output_file])

os.rename(temp_output_file, final_output_file)
# os.rename(temp_output_file, final_output_file)

print(f'File has been converted and saved to {final_output_file}')
# print(f'File has been converted and saved to {final_output_file}')

filename_path = detectface(video_input_path=final_output_file, output_path=output_path, detect_every_N_frame=8, scalar_face_detection=1.5, number_of_speakers=2)
# filename_path = detectface(video_input_path=final_output_file, output_path=output_path, detect_every_N_frame=8, scalar_face_detection=1.5, number_of_speakers=number_of_speakers)

# extract audio
subprocess.run(['ffmpeg', '-i', final_output_file, '-vn', '-ar', '16000', '-ac', '1', '-ab', '192k', '-f', 'wav', os.path.join(output_path, 'audio.wav')])
# # extract audio
# subprocess.run(['ffmpeg', '-i', final_output_file, '-vn', '-ar', '16000', '-ac', '1', '-ab', '192k', '-f', 'wav', os.path.join(output_path, 'audio.wav')])

# crop mouth
crop_mouth(video_direc=output_path+"faces/", landmark_direc=output_path+"landmark/", filename_path=filename_path, save_direc=output_path+"mouthroi/", convert_gray=True, testset_only=False)
# # crop mouth
# crop_mouth(video_direc=output_path+"faces/", landmark_direc=output_path+"landmark/", filename_path=filename_path, save_direc=output_path+"mouthroi/", convert_gray=True, testset_only=False)

# Load training config
with open("checkpoints/vox2/conf.yml", "rb") as f:
Expand All @@ -386,7 +388,7 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g
videomodel.eval()

with torch.no_grad():
for i in range(2):
for i in range(number_of_speakers):
mouth_roi = np.load(output_path+"mouthroi/speaker"+str(i+1)+".npz")["data"]
mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)

Expand All @@ -395,4 +397,19 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g
mouth_emb = videomodel(torch.from_numpy(mouth_roi[None, None]).float().cuda())
est_sources = audiomodel(mix[None], mouth_emb)

torchaudio.save(output_path+"speaker"+str(i+1)+"_est.wav", est_sources[0].cpu(), 16000)
torchaudio.save(output_path+"speaker"+str(i+1)+"_est.wav", est_sources[0].cpu(), 16000)

# FFmpeg命令
for i in range(number_of_speakers):
command = [
'ffmpeg',
'-i', output_path+f"video_tracked{i+1}.mp4",
'-i', output_path+"speaker"+str(i+1)+"_est.wav",
'-c:v', 'copy',
'-c:a', 'aac',
'-strict', 'experimental',
'-map', '0:v:0',
'-map', '1:a:0',
output_path+f"s{i+1}.mp4"
]
subprocess.run(command)
Empty file modified look2hear/datas/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/datas/avspeech_dataset.py
100644 → 100755
Empty file.
Empty file modified look2hear/datas/avspeech_dymanic_dataset.py
100644 → 100755
Empty file.
Empty file modified look2hear/datas/transform.py
100644 → 100755
Empty file.
Empty file modified look2hear/losses/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/losses/matrix.py
100644 → 100755
Empty file.
Empty file modified look2hear/losses/mixit.py
100644 → 100755
Empty file.
Empty file modified look2hear/losses/pit_wrapper.py
100644 → 100755
Empty file.
Empty file modified look2hear/metrics/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/metrics/splitwrapper.py
100644 → 100755
Empty file.
Empty file modified look2hear/metrics/wrapper.py
100644 → 100755
Empty file.
Empty file modified look2hear/models/IIANet.py
100644 → 100755
Empty file.
Empty file modified look2hear/models/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/models/base_av_model.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/av_litmodule.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/comet.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/core.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/optimizers.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/tensorboard.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/lightning_utils.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/parser_utils.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/separator.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/stft.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/torch_utils.py
100644 → 100755
Empty file.
Empty file modified look2hear/videomodels/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/videomodels/resnet.py
100644 → 100755
Empty file.
Empty file modified look2hear/videomodels/resnet_videomodel.py
100644 → 100755
Empty file.
Empty file modified replace_weight.py
100644 → 100755
Empty file.
Empty file modified requirements.txt
100644 → 100755
Empty file.
Empty file modified test.py
100644 → 100755
Empty file.
Binary file modified test_videos/video.mp4
Binary file not shown.
Empty file modified train.py
100644 → 100755
Empty file.

0 comments on commit 33b525e

Please sign in to comment.