Skip to content

Commit

Permalink
Fixed bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
JusperLee committed Nov 22, 2024
1 parent f309c79 commit 33b525e
Show file tree
Hide file tree
Showing 55 changed files with 121 additions and 11 deletions.
Empty file modified .github/workflows/static.yml
100644 → 100755
Empty file.
Empty file modified .gitignore
100644 → 100755
Empty file.
Empty file modified 20words_mean_face.npy
100644 → 100755
Empty file.
Empty file modified DataPreProcess/process_lrs23.py
100644 → 100755
Empty file.
Empty file modified DataPreProcess/process_vox2.py
100644 → 100755
Empty file.
Empty file modified LICENSE
100644 → 100755
Empty file.
Empty file modified README.md
100644 → 100755
Empty file.
Empty file modified README_zh-CN.md
100644 → 100755
Empty file.
Empty file modified configs/LRS2-IIANet.yml
100644 → 100755
Empty file.
Empty file modified configs/LRS3-IIANet.yml
100644 → 100755
Empty file.
Empty file modified configs/Vox2-IIANet.yml
100644 → 100755
Empty file.
Empty file modified figures/IIANet-Figure1B.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/audio-waves.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/overall.gif
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/overall.mp4
100644 → 100755
Empty file.
Empty file modified figures/results.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/separation.gif
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/separation.mp4
100644 → 100755
Empty file.
Empty file modified figures/spec.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
93 changes: 93 additions & 0 deletions generate_data_100mix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""Build a synthetic multi-speaker mixture from VoxCeleb2 wavs and separate
the target speaker with a pretrained IIANet audio-visual model.

Pipeline:
  1. Load a precomputed {speaker_id: [wav paths]} index from
     ``wav_files_dict.json``.
  2. Pick one target speaker plus interfering speakers, attenuate the
     interferers by a random SIR, and sum everything into ``mix.wav``.
  3. Copy the target's mouth-ROI ``.npz`` next to the script, run the video
     model on it, and separate the target from the mixture into ``est.wav``.
"""
import os
import json
import random
import shutil

import numpy as np
import torch
import torchaudio
import yaml

from look2hear.models import IIANet
from look2hear.datas.transform import get_preprocessing_pipelines
from look2hear.videomodels import ResNetVideoModel


def get_wav_files_dict(directory):
    """Index every ``.wav`` under *directory*, keyed by speaker id.

    The speaker id is taken as the part of the file name before the first
    underscore. Used once, offline, to produce ``wav_files_dict.json``:

        wav_files_dict = get_wav_files_dict(".../wav16k/min/tr/s1")
        with open("wav_files_dict.json", "w") as f:
            json.dump(wav_files_dict, f)

    Args:
        directory: root folder to walk recursively.

    Returns:
        dict mapping speaker id -> list of absolute wav paths.
    """
    files_dict = {}
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".wav"):
                files_dict.setdefault(file.split("_")[0], []).append(
                    os.path.join(root, file)
                )
    return files_dict


# Load the precomputed speaker -> wav-paths index (see get_wav_files_dict).
with open('wav_files_dict.json', 'r') as f:
    wav_files_dict = json.load(f)

datas = []
# random.sample() requires a sequence; passing dict.keys() directly raises
# TypeError on Python >= 3.11 (deprecated since 3.9), hence the list() copy.
select_keys = random.sample(list(wav_files_dict.keys()), k=3)
datapath = random.sample(wav_files_dict[select_keys[0]], k=1)
# Derive the mouth-ROI file name from the wav name; assumes wav files are
# named <id>_<clip>_<segment>_... — TODO confirm against the dataset layout.
mouthpath = datapath[0].split('/')[-1].split("_")
mouthpath = f"{mouthpath[0]}_{mouthpath[1]}_{mouthpath[2]}.npz"
# torchaudio.load returns (waveform, sample_rate); keep only the waveform.
audio_gt = torchaudio.load(datapath[0])[0]
datas.append(audio_gt)

# Add interfering speakers, each attenuated by a random SIR in [-30, -10] dB.
for key in select_keys[1:]:
    datapath = random.sample(wav_files_dict[key], k=1)
    audio = torchaudio.load(datapath[0])[0]
    sirs = torch.Tensor(1).uniform_(-30, -10).numpy()
    audio *= 10. ** (sirs / 20.)  # dB -> linear gain
    datas.append(audio)

# NOTE(review): torch.stack requires all clips to have identical length —
# presumably guaranteed by the "min" subset of the corpus; confirm.
mix = torch.stack(datas).sum(0)
torchaudio.save("mix.wav", mix, 16000)
torchaudio.save("audio_gt.wav", audio_gt, 16000)
shutil.copy(f"/home/likai/ssd/vox2/vox2/mouths/{mouthpath}", "mouth.npz")

# Load the training configuration used for the pretrained checkpoint.
with open("checkpoints/vox2/conf.yml", "rb") as f:
    train_conf = yaml.safe_load(f)

# Load pretrained separation (audio) and lip-reading (video) models.
checkpoint_path = os.path.join(train_conf["main_args"]["exp_dir"], "best_model.pth")
audiomodel = IIANet.from_pretrain(
    checkpoint_path,
    sample_rate=train_conf["datamodule"]["data_config"]["sample_rate"],
    **train_conf["audionet"]["audionet_config"],
)
videomodel = ResNetVideoModel(**train_conf["videonet"]["videonet_config"])
audiomodel.cuda()
audiomodel.eval()
videomodel.cuda()
videomodel.eval()

with torch.no_grad():
    mouth_roi = np.load("mouth.npz")["data"]
    mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)

    mix = torchaudio.load("mix.wav")[0].cuda()

    # [None, None] adds batch and channel axes before the video frontend.
    mouth_emb = videomodel(torch.from_numpy(mouth_roi[None, None]).float().cuda())
    est_sources = audiomodel(mix[None], mouth_emb)

    torchaudio.save("est.wav", est_sources[0].cpu(), 16000)
39 changes: 28 additions & 11 deletions inference.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -351,25 +351,27 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g

if __name__ == '__main__':

os.environ["CUDA_VISIBLE_DEVICES"] = "8"
os.environ["CUDA_VISIBLE_DEVICES"] = "9"

input_file = './test_videos/video.mp4'
temp_output_file = './test_videos/video25fps.mp4'
final_output_file = './test_videos/video.mp4'
output_path = "./test_videos/video/"
subprocess.run(['ffmpeg', '-i', input_file, '-filter:v', 'fps=fps=25', temp_output_file])
number_of_speakers = 2

# subprocess.run(['ffmpeg', '-i', input_file, '-filter:v', 'fps=fps=25', temp_output_file])

os.rename(temp_output_file, final_output_file)
# os.rename(temp_output_file, final_output_file)

print(f'File has been converted and saved to {final_output_file}')
# print(f'File has been converted and saved to {final_output_file}')

filename_path = detectface(video_input_path=final_output_file, output_path=output_path, detect_every_N_frame=8, scalar_face_detection=1.5, number_of_speakers=2)
# filename_path = detectface(video_input_path=final_output_file, output_path=output_path, detect_every_N_frame=8, scalar_face_detection=1.5, number_of_speakers=number_of_speakers)

# extract audio
subprocess.run(['ffmpeg', '-i', final_output_file, '-vn', '-ar', '16000', '-ac', '1', '-ab', '192k', '-f', 'wav', os.path.join(output_path, 'audio.wav')])
# # extract audio
# subprocess.run(['ffmpeg', '-i', final_output_file, '-vn', '-ar', '16000', '-ac', '1', '-ab', '192k', '-f', 'wav', os.path.join(output_path, 'audio.wav')])

# crop mouth
crop_mouth(video_direc=output_path+"faces/", landmark_direc=output_path+"landmark/", filename_path=filename_path, save_direc=output_path+"mouthroi/", convert_gray=True, testset_only=False)
# # crop mouth
# crop_mouth(video_direc=output_path+"faces/", landmark_direc=output_path+"landmark/", filename_path=filename_path, save_direc=output_path+"mouthroi/", convert_gray=True, testset_only=False)

# Load training config
with open("checkpoints/vox2/conf.yml", "rb") as f:
Expand All @@ -386,7 +388,7 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g
videomodel.eval()

with torch.no_grad():
for i in range(2):
for i in range(number_of_speakers):
mouth_roi = np.load(output_path+"mouthroi/speaker"+str(i+1)+".npz")["data"]
mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)

Expand All @@ -395,4 +397,19 @@ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_g
mouth_emb = videomodel(torch.from_numpy(mouth_roi[None, None]).float().cuda())
est_sources = audiomodel(mix[None], mouth_emb)

torchaudio.save(output_path+"speaker"+str(i+1)+"_est.wav", est_sources[0].cpu(), 16000)
torchaudio.save(output_path+"speaker"+str(i+1)+"_est.wav", est_sources[0].cpu(), 16000)

# FFmpeg命令
for i in range(number_of_speakers):
command = [
'ffmpeg',
'-i', output_path+f"video_tracked{i+1}.mp4",
'-i', output_path+"speaker"+str(i+1)+"_est.wav",
'-c:v', 'copy',
'-c:a', 'aac',
'-strict', 'experimental',
'-map', '0:v:0',
'-map', '1:a:0',
output_path+f"s{i+1}.mp4"
]
subprocess.run(command)
Empty file modified look2hear/datas/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/datas/avspeech_dataset.py
100644 → 100755
Empty file.
Empty file modified look2hear/datas/avspeech_dymanic_dataset.py
100644 → 100755
Empty file.
Empty file modified look2hear/datas/transform.py
100644 → 100755
Empty file.
Empty file modified look2hear/losses/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/losses/matrix.py
100644 → 100755
Empty file.
Empty file modified look2hear/losses/mixit.py
100644 → 100755
Empty file.
Empty file modified look2hear/losses/pit_wrapper.py
100644 → 100755
Empty file.
Empty file modified look2hear/metrics/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/metrics/splitwrapper.py
100644 → 100755
Empty file.
Empty file modified look2hear/metrics/wrapper.py
100644 → 100755
Empty file.
Empty file modified look2hear/models/IIANet.py
100644 → 100755
Empty file.
Empty file modified look2hear/models/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/models/base_av_model.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/av_litmodule.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/comet.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/core.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/optimizers.py
100644 → 100755
Empty file.
Empty file modified look2hear/system/tensorboard.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/lightning_utils.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/parser_utils.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/separator.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/stft.py
100644 → 100755
Empty file.
Empty file modified look2hear/utils/torch_utils.py
100644 → 100755
Empty file.
Empty file modified look2hear/videomodels/__init__.py
100644 → 100755
Empty file.
Empty file modified look2hear/videomodels/resnet.py
100644 → 100755
Empty file.
Empty file modified look2hear/videomodels/resnet_videomodel.py
100644 → 100755
Empty file.
Empty file modified replace_weight.py
100644 → 100755
Empty file.
Empty file modified requirements.txt
100644 → 100755
Empty file.
Empty file modified test.py
100644 → 100755
Empty file.
Binary file modified test_videos/video.mp4
Binary file not shown.
Empty file modified train.py
100644 → 100755
Empty file.

0 comments on commit 33b525e

Please sign in to comment.