diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml
old mode 100644
new mode 100755
diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
diff --git a/20words_mean_face.npy b/20words_mean_face.npy
old mode 100644
new mode 100755
diff --git a/DataPreProcess/process_lrs23.py b/DataPreProcess/process_lrs23.py
old mode 100644
new mode 100755
diff --git a/DataPreProcess/process_vox2.py b/DataPreProcess/process_vox2.py
old mode 100644
new mode 100755
diff --git a/LICENSE b/LICENSE
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
diff --git a/README_zh-CN.md b/README_zh-CN.md
old mode 100644
new mode 100755
diff --git a/configs/LRS2-IIANet.yml b/configs/LRS2-IIANet.yml
old mode 100644
new mode 100755
diff --git a/configs/LRS3-IIANet.yml b/configs/LRS3-IIANet.yml
old mode 100644
new mode 100755
diff --git a/configs/Vox2-IIANet.yml b/configs/Vox2-IIANet.yml
old mode 100644
new mode 100755
diff --git a/figures/IIANet-Figure1B.png b/figures/IIANet-Figure1B.png
old mode 100644
new mode 100755
diff --git a/figures/audio-waves.png b/figures/audio-waves.png
old mode 100644
new mode 100755
diff --git a/figures/overall.gif b/figures/overall.gif
old mode 100644
new mode 100755
diff --git a/figures/overall.mp4 b/figures/overall.mp4
old mode 100644
new mode 100755
diff --git a/figures/results.png b/figures/results.png
old mode 100644
new mode 100755
diff --git a/figures/separation.gif b/figures/separation.gif
old mode 100644
new mode 100755
diff --git a/figures/separation.mp4 b/figures/separation.mp4
old mode 100644
new mode 100755
diff --git a/figures/spec.png b/figures/spec.png
old mode 100644
new mode 100755
diff --git a/generate_data_100mix.py b/generate_data_100mix.py
new file mode 100755
index 0000000..0531a4b
--- /dev/null
+++ b/generate_data_100mix.py
@@ -0,0 +1,93 @@
+import os
+
+# def get_wav_files_dict(directory):
+#     # Initialize an empty dictionary
+#     files_dict = {}
+
+#     # Walk all files under the given directory
+#     for root, dirs, files in os.walk(directory):
+#         for file in files:
+#             # Only handle .wav files
+#             if file.endswith(".wav"):
+#                 # Get the absolute path of the file
+#                 file_path = os.path.join(root, file)
+#                 # Split the filename on "_"
+#                 key = file.split('_')[0]
+#                 # If the key is not in the dict yet, initialize an empty list
+#                 if key not in files_dict:
+#                     files_dict[key] = []
+#                 # Append the file path to the list for this key
+#                 files_dict[key].append(file_path)
+
+#     return files_dict
+
+# # Example usage
+# directory = "/home/likai/ssd/vox2/vox2/audio_10w/wav16k/min/tr/s1"  # Replace with your own folder path
+# wav_files_dict = get_wav_files_dict(directory)
+
+# # Save the dictionary to json
+# import json
+# with open('wav_files_dict.json', 'w') as f:
+#     json.dump(wav_files_dict, f)
+
+# Load the dictionary from json
+import json
+import random
+import torchaudio
+import torch
+import numpy as np
+import shutil
+import yaml
+
+from look2hear.models import IIANet
+from look2hear.datas.transform import get_preprocessing_pipelines
+from look2hear.videomodels import ResNetVideoModel
+
+with open('wav_files_dict.json', 'r') as f:
+    wav_files_dict = json.load(f)
+
+# print(wav_files_dict)
+datas = []
+select_keys = random.sample(wav_files_dict.keys(), k=3)
+datapath = random.sample(wav_files_dict[select_keys[0]], k=1)
+mouthpath = datapath[0].split('/')[-1].split("_")
+mouthpath = f"{mouthpath[0]}_{mouthpath[1]}_{mouthpath[2]}.npz"
+audio_gt = torchaudio.load(datapath[0])[0]
+# mouth = torch.from_numpy(np.load(mouthpath)['data'])
+datas.append(audio_gt)
+
+for key in select_keys[1:]:
+    datapath = random.sample(wav_files_dict[key], k=1)
+    audio = torchaudio.load(datapath[0])[0]
+    sirs = torch.Tensor(1).uniform_(-30,-10).numpy()
+    audio *= 10.**(sirs/20.)
+    datas.append(audio)
+
+mix = torch.stack(datas).sum(0)
+torchaudio.save("mix.wav", mix, 16000)
+torchaudio.save("audio_gt.wav", audio_gt, 16000)
+shutil.copy(f"/home/likai/ssd/vox2/vox2/mouths/{mouthpath}", "mouth.npz")
+# Load training config
+with open("checkpoints/vox2/conf.yml", "rb") as f:
+    train_conf = yaml.safe_load(f)
+
+# Load model
+# print(["main_args"]["exp_dir"])
+checkpoint_path = os.path.join(train_conf["main_args"]["exp_dir"], "best_model.pth")
+audiomodel = IIANet.from_pretrain(checkpoint_path, sample_rate=train_conf["datamodule"]["data_config"]["sample_rate"], **train_conf["audionet"]["audionet_config"])
+videomodel = ResNetVideoModel(**train_conf["videonet"]["videonet_config"])
+audiomodel.cuda()
+audiomodel.eval()
+videomodel.cuda()
+videomodel.eval()
+
+with torch.no_grad():
+    mouth_roi = np.load("mouth.npz")["data"]
+    mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)
+
+    mix = torchaudio.load("mix.wav")[0].cuda()
+
+    mouth_emb = videomodel(torch.from_numpy(mouth_roi[None, None]).float().cuda())
+    est_sources = audiomodel(mix[None], mouth_emb)
+
+    torchaudio.save("est.wav", est_sources[0].cpu(), 16000)
\ No newline at end of file
diff --git a/inference.py b/inference.py
old mode 100644
new mode 100755
index 5001cee..98d7073
--- a/inference.py
+++ b/inference.py
@@ -351,25 +351,27 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g
 
 
 if __name__ == '__main__':
-    os.environ["CUDA_VISIBLE_DEVICES"] = "8"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "9"
     input_file = './test_videos/video.mp4'
     temp_output_file = './test_videos/video25fps.mp4'
     final_output_file = './test_videos/video.mp4'
     output_path = "./test_videos/video/"
 
-    subprocess.run(['ffmpeg', '-i', input_file, '-filter:v', 'fps=fps=25', temp_output_file])
+    number_of_speakers = 2
+
+    # subprocess.run(['ffmpeg', '-i', input_file, '-filter:v', 'fps=fps=25', temp_output_file])
 
-    os.rename(temp_output_file, final_output_file)
+    # os.rename(temp_output_file, final_output_file)
 
-    print(f'File has been converted and saved to {final_output_file}')
+    # print(f'File has been converted and saved to {final_output_file}')
 
-    filename_path = detectface(video_input_path=final_output_file, output_path=output_path, detect_every_N_frame=8, scalar_face_detection=1.5, number_of_speakers=2)
+    # filename_path = detectface(video_input_path=final_output_file, output_path=output_path, detect_every_N_frame=8, scalar_face_detection=1.5, number_of_speakers=number_of_speakers)
 
-    # extract audio
-    subprocess.run(['ffmpeg', '-i', final_output_file, '-vn', '-ar', '16000', '-ac', '1', '-ab', '192k', '-f', 'wav', os.path.join(output_path, 'audio.wav')])
+    # # extract audio
+    # subprocess.run(['ffmpeg', '-i', final_output_file, '-vn', '-ar', '16000', '-ac', '1', '-ab', '192k', '-f', 'wav', os.path.join(output_path, 'audio.wav')])
 
-    # crop mouth
-    crop_mouth(video_direc=output_path+"faces/", landmark_direc=output_path+"landmark/", filename_path=filename_path, save_direc=output_path+"mouthroi/", convert_gray=True, testset_only=False)
+    # # crop mouth
+    # crop_mouth(video_direc=output_path+"faces/", landmark_direc=output_path+"landmark/", filename_path=filename_path, save_direc=output_path+"mouthroi/", convert_gray=True, testset_only=False)
 
     # Load training config
     with open("checkpoints/vox2/conf.yml", "rb") as f:
@@ -386,7 +388,7 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g
     videomodel.eval()
 
     with torch.no_grad():
-        for i in range(2):
+        for i in range(number_of_speakers):
             mouth_roi = np.load(output_path+"mouthroi/speaker"+str(i+1)+".npz")["data"]
             mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)
 
@@ -395,4 +397,19 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g
             mouth_emb = videomodel(torch.from_numpy(mouth_roi[None, None]).float().cuda())
             est_sources = audiomodel(mix[None], mouth_emb)
 
-            torchaudio.save(output_path+"speaker"+str(i+1)+"_est.wav", est_sources[0].cpu(), 16000)
\ No newline at end of file
+            torchaudio.save(output_path+"speaker"+str(i+1)+"_est.wav", est_sources[0].cpu(), 16000)
+
+    # FFmpeg command
+    for i in range(number_of_speakers):
+        command = [
+            'ffmpeg',
+            '-i', output_path+f"video_tracked{i+1}.mp4",
+            '-i', output_path+"speaker"+str(i+1)+"_est.wav",
+            '-c:v', 'copy',
+            '-c:a', 'aac',
+            '-strict', 'experimental',
+            '-map', '0:v:0',
+            '-map', '1:a:0',
+            output_path+f"s{i+1}.mp4"
+        ]
+        subprocess.run(command)
\ No newline at end of file
diff --git a/look2hear/datas/__init__.py b/look2hear/datas/__init__.py
old mode 100644
new mode 100755
diff --git a/look2hear/datas/avspeech_dataset.py b/look2hear/datas/avspeech_dataset.py
old mode 100644
new mode 100755
diff --git a/look2hear/datas/avspeech_dymanic_dataset.py b/look2hear/datas/avspeech_dymanic_dataset.py
old mode 100644
new mode 100755
diff --git a/look2hear/datas/transform.py b/look2hear/datas/transform.py
old mode 100644
new mode 100755
diff --git a/look2hear/losses/__init__.py b/look2hear/losses/__init__.py
old mode 100644
new mode 100755
diff --git a/look2hear/losses/matrix.py b/look2hear/losses/matrix.py
old mode 100644
new mode 100755
diff --git a/look2hear/losses/mixit.py b/look2hear/losses/mixit.py
old mode 100644
new mode 100755
diff --git a/look2hear/losses/pit_wrapper.py b/look2hear/losses/pit_wrapper.py
old mode 100644
new mode 100755
diff --git a/look2hear/metrics/__init__.py b/look2hear/metrics/__init__.py
old mode 100644
new mode 100755
diff --git a/look2hear/metrics/splitwrapper.py b/look2hear/metrics/splitwrapper.py
old mode 100644
new mode 100755
diff --git a/look2hear/metrics/wrapper.py b/look2hear/metrics/wrapper.py
old mode 100644
new mode 100755
diff --git a/look2hear/models/IIANet.py b/look2hear/models/IIANet.py
old mode 100644
new mode 100755
diff --git a/look2hear/models/__init__.py b/look2hear/models/__init__.py
old mode 100644
new mode 100755
diff --git a/look2hear/models/base_av_model.py b/look2hear/models/base_av_model.py
old mode 100644
new mode 100755
diff --git a/look2hear/system/__init__.py b/look2hear/system/__init__.py
old mode 100644
new mode 100755
diff --git a/look2hear/system/av_litmodule.py b/look2hear/system/av_litmodule.py
old mode 100644
new mode 100755
diff --git a/look2hear/system/comet.py b/look2hear/system/comet.py
old mode 100644
new mode 100755
diff --git a/look2hear/system/core.py b/look2hear/system/core.py
old mode 100644
new mode 100755
diff --git a/look2hear/system/optimizers.py b/look2hear/system/optimizers.py
old mode 100644
new mode 100755
diff --git a/look2hear/system/tensorboard.py b/look2hear/system/tensorboard.py
old mode 100644
new mode 100755
diff --git a/look2hear/utils/__init__.py b/look2hear/utils/__init__.py
old mode 100644
new mode 100755
diff --git a/look2hear/utils/lightning_utils.py b/look2hear/utils/lightning_utils.py
old mode 100644
new mode 100755
diff --git a/look2hear/utils/parser_utils.py b/look2hear/utils/parser_utils.py
old mode 100644
new mode 100755
diff --git a/look2hear/utils/separator.py b/look2hear/utils/separator.py
old mode 100644
new mode 100755
diff --git a/look2hear/utils/stft.py b/look2hear/utils/stft.py
old mode 100644
new mode 100755
diff --git a/look2hear/utils/torch_utils.py b/look2hear/utils/torch_utils.py
old mode 100644
new mode 100755
diff --git a/look2hear/videomodels/__init__.py b/look2hear/videomodels/__init__.py
old mode 100644
new mode 100755
diff --git a/look2hear/videomodels/resnet.py b/look2hear/videomodels/resnet.py
old mode 100644
new mode 100755
diff --git a/look2hear/videomodels/resnet_videomodel.py b/look2hear/videomodels/resnet_videomodel.py
old mode 100644
new mode 100755
diff --git a/replace_weight.py b/replace_weight.py
old mode 100644
new mode 100755
diff --git a/requirements.txt b/requirements.txt
old mode 100644
new mode 100755
diff --git a/test.py b/test.py
old mode 100644
new mode 100755
diff --git a/test_videos/video.mp4 b/test_videos/video.mp4
index cabb29a..b686ebc 100644
Binary files a/test_videos/video.mp4 and b/test_videos/video.mp4 differ
diff --git a/train.py b/train.py
old mode 100644
new mode 100755