Skip to content

Commit

Permalink
feat: add more f0 calculation methods (#39)
Browse files Browse the repository at this point in the history
  • Loading branch information
34j authored Mar 20, 2023
1 parent 0060491 commit 6b3b20d
Show file tree
Hide file tree
Showing 9 changed files with 217 additions and 173 deletions.
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@ pip install so-vits-svc-fork
## Features not available in the original repo

- **Realtime voice conversion**
- More accurate pitch estimation using CREPE
- GUI available
- Unified command-line interface (no need to run Python scripts)
- Ready to use just by installing with `pip`.
- Automatically download pretrained base model and HuBERT model
- Code completely formatted with black, isort, autoflake etc.
- Volume normalization in preprocessing
- Other minor differences

## Usage
Expand Down Expand Up @@ -79,6 +81,10 @@ svc vc --model-path <model-path>
svc --model-path <model-path> source.wav
```

#### Notes

- In real-time inference, if there is noise in the input, the HuBERT model will react to it as well. In this case, consider using a real-time noise reduction application such as [RTX Voice](https://www.nvidia.com/en-us/geforce/guides/nvidia-rtx-voice-setup-guide/).

### Training

#### Google Colab
Expand All @@ -96,7 +102,10 @@ svc pre-hubert
svc train
```

It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB.
#### Notes

- Each audio file in the dataset should be shorter than about 10 seconds; otherwise, VRAM will run out.
- It is recommended to change `batch_size` in `config.json` before running the `train` command so that it matches your VRAM capacity. As tested, the default requires about 14 GB.

### Further help

Expand Down
171 changes: 35 additions & 136 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ fairseq = "*"
flask = "*"
flask_cors = "*"
gradio = "*"
numpy = ">=1.23"
numpy = "^1.23"
pydub = "*"
pyworld = "*"
requests = "*"
Expand All @@ -51,16 +51,17 @@ praat-parselmouth = "*"
onnx = "*"
onnxsim = "*"
onnxoptimizer = "*"
torch = "*"
torchaudio = "*"
torch = ">=1.12"
torchaudio = ">=0.12"
tensorboard = "*"
rich = "*"
tqdm-joblib = "*"
tensorboardx = "*"
pyinputplus = "*"
cm-time = "^0.1.2"
cm-time = ">=0.1.2"
pysimplegui = ">=4.6"
pebble = "^5.0.3"
pebble = ">=5.0"
torchcrepe = ">=0.0.17"

[tool.poetry.group.dev.dependencies]
pre-commit = ">=3"
Expand Down
34 changes: 29 additions & 5 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,13 @@ def infer(
@click.option(
"-db", "--db-thresh", type=int, default=-30, help="threshold (DB) (ABSOLUTE)"
)
@click.option(
"-fm",
"--f0-method",
type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
default="crepe",
help="f0 prediction method",
)
@click.option("-p", "--pad-seconds", type=float, default=0.02, help="pad seconds")
@click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
@click.option(
Expand Down Expand Up @@ -300,6 +307,7 @@ def vc(
auto_predict_f0: bool,
cluster_infer_ratio: float,
noise_scale: float,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
# slice config
db_thresh: int,
pad_seconds: float,
Expand Down Expand Up @@ -333,19 +341,24 @@ def vc(
LOG.info(f"Since model_path is a directory, use {model_path}")

realtime(
# paths
model_path=model_path,
config_path=config_path,
# svc config
speaker=speaker,
cluster_model_path=cluster_model_path,
transpose=transpose,
auto_predict_f0=auto_predict_f0,
cluster_infer_ratio=cluster_infer_ratio,
noise_scale=noise_scale,
crossfade_seconds=crossfade_seconds,
block_seconds=block_seconds,
chunk_seconds=chunk_seconds,
f0_method=f0_method,
# slice config
db_thresh=db_thresh,
pad_seconds=pad_seconds,
chunk_seconds=chunk_seconds,
# realtime config
crossfade_seconds=crossfade_seconds,
block_seconds=block_seconds,
version=version,
input_device=input_device,
output_device=output_device,
Expand Down Expand Up @@ -446,13 +459,23 @@ def pre_config(
)
@click.option(
"-f",
"--force_rebuild",
"--force-rebuild",
type=bool,
default=True,
help="force rebuild existing preprocessed files",
)
@click.option(
"-fm",
"--f0-method",
type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
default="crepe",
)
def pre_hubert(
input_dir: Path, config_path: Path, n_jobs: bool, force_rebuild: bool
input_dir: Path,
config_path: Path,
n_jobs: bool,
force_rebuild: bool,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
) -> None:
"""Preprocessing part 3: hubert
If the HuBERT model is not found, it will be downloaded automatically."""
Expand All @@ -465,6 +488,7 @@ def pre_hubert(
config_path=config_path,
n_jobs=n_jobs,
force_rebuild=force_rebuild,
f0_method=f0_method,
)


Expand Down
9 changes: 9 additions & 0 deletions src/so_vits_svc_fork/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ def main():
text="Auto predict F0 (Pitch may become unstable when turned on in real-time inference.)",
)
],
[
sg.Text("F0 prediction method"),
sg.Combo(
["crepe", "parselmouth", "dio", "harvest"],
key="f0_method",
default_value="crepe",
),
],
[
sg.Text("Cluster infer ratio"),
sg.Push(),
Expand Down Expand Up @@ -350,6 +358,7 @@ def update_combo() -> None:
auto_predict_f0=values["auto_predict_f0"],
cluster_infer_ratio=values["cluster_infer_ratio"],
noise_scale=values["noise_scale"],
f0_method=values["f0_method"],
crossfade_seconds=values["crossfade_seconds"],
db_thresh=values["silence_threshold"],
pad_seconds=values["pad_seconds"],
Expand Down
24 changes: 19 additions & 5 deletions src/so_vits_svc_fork/inference/infer_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from copy import deepcopy
from logging import getLogger
from pathlib import Path
from typing import Any, Callable, Iterable
from typing import Any, Callable, Iterable, Literal

import attrs
import librosa
Expand Down Expand Up @@ -121,13 +121,17 @@ def load_model(self):

def get_unit_f0(
self,
audio: np.ndarray[Any, np.dtype[np.float64]],
audio: ndarray[Any, dtype[float32]],
tran: int,
cluster_infer_ratio: float,
speaker: int | str,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
):
f0 = utils.compute_f0_parselmouth(
audio, sampling_rate=self.target_sample, hop_length=self.hop_size
f0 = utils.compute_f0(
audio,
sampling_rate=self.target_sample,
hop_length=self.hop_size,
method=f0_method,
)
f0, uv = utils.interpolate_f0(f0)
f0 = torch.FloatTensor(f0)
Expand Down Expand Up @@ -161,6 +165,7 @@ def infer(
cluster_infer_ratio: float = 0,
auto_predict_f0: bool = False,
noise_scale: float = 0.4,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
) -> tuple[torch.Tensor, int]:
audio = audio.astype(np.float32)
# get speaker id
Expand All @@ -180,7 +185,9 @@ def infer(
sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)

# get unit f0
c, f0, uv = self.get_unit_f0(audio, transpose, cluster_infer_ratio, speaker)
c, f0, uv = self.get_unit_f0(
audio, transpose, cluster_infer_ratio, speaker, f0_method
)
if "half" in self.net_g_path and torch.cuda.is_available():
c = c.half()

Expand Down Expand Up @@ -215,6 +222,7 @@ def infer_silence(
auto_predict_f0: bool = False,
cluster_infer_ratio: float = 0,
noise_scale: float = 0.4,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
# slice config
db_thresh: int = -40,
pad_seconds: float = 0.5,
Expand Down Expand Up @@ -260,6 +268,7 @@ def infer_silence(
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noise_scale=noise_scale,
f0_method=f0_method,
)
audio_chunk_pad_infer = audio_chunk_pad_infer_tensor.cpu().numpy()
pad_len = int(self.target_sample * pad_seconds)
Expand Down Expand Up @@ -359,6 +368,7 @@ def infer(
cluster_infer_ratio: float = 0,
auto_predict_f0: bool = False,
noise_scale: float = 0.4,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
# slice config
db_thresh: int = -40,
pad_seconds: float = 0.5,
Expand All @@ -373,6 +383,7 @@ def infer(
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noise_scale=noise_scale,
f0_method=f0_method,
db_thresh=db_thresh,
pad_seconds=pad_seconds,
chunk_seconds=chunk_seconds,
Expand All @@ -393,6 +404,7 @@ def infer(
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noise_scale=noise_scale,
f0_method=f0_method,
)
return infered_audio_c.cpu().numpy()

Expand All @@ -414,6 +426,7 @@ def process(
cluster_infer_ratio: float = 0,
auto_predict_f0: bool = False,
noise_scale: float = 0.4,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
# slice config
db_thresh: int = -40,
chunk_seconds: float = 0.5,
Expand All @@ -426,6 +439,7 @@ def infer(audio: ndarray[Any, dtype[float32]]) -> ndarray[Any, dtype[float32]]:
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noise_scale=noise_scale,
f0_method=f0_method,
)
return infered_audio_c.cpu().numpy()

Expand Down
19 changes: 13 additions & 6 deletions src/so_vits_svc_fork/inference_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ def infer(
model_path: Path | str,
config_path: Path | str,
# svc config
speaker: str,
speaker: int | str,
cluster_model_path: Path | str | None = None,
transpose: int = 0,
auto_predict_f0: bool = False,
cluster_infer_ratio: float = 0,
noise_scale: float = 0.4,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
# slice config
db_thresh: int = -40,
pad_seconds: float = 0.5,
Expand All @@ -51,14 +52,15 @@ def infer(

audio, _ = librosa.load(input_path, sr=svc_model.target_sample)
audio = svc_model.infer_silence(
audio,
audio.astype(np.float32),
speaker=speaker,
db_thresh=db_thresh,
pad_seconds=pad_seconds,
transpose=transpose,
auto_predict_f0=auto_predict_f0,
cluster_infer_ratio=cluster_infer_ratio,
noise_scale=noise_scale,
f0_method=f0_method,
db_thresh=db_thresh,
pad_seconds=pad_seconds,
chunk_seconds=chunk_seconds,
absolute_thresh=absolute_thresh,
)
Expand All @@ -78,6 +80,7 @@ def realtime(
auto_predict_f0: bool = False,
cluster_infer_ratio: float = 0,
noise_scale: float = 0.4,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
# slice config
db_thresh: int = -40,
pad_seconds: float = 0.5,
Expand Down Expand Up @@ -154,13 +157,17 @@ def callback(
)

kwargs = dict(
input_audio=indata.mean(axis=1),
input_audio=indata.mean(axis=1).astype(np.float32),
# svc config
speaker=speaker,
transpose=transpose,
auto_predict_f0=auto_predict_f0,
noise_scale=noise_scale,
cluster_infer_ratio=cluster_infer_ratio,
noise_scale=noise_scale,
f0_method=f0_method,
# slice config
db_thresh=db_thresh,
# pad_seconds=pad_seconds,
chunk_seconds=chunk_seconds,
)
if version == 1:
Expand Down
Loading

0 comments on commit 6b3b20d

Please sign in to comment.