feat: add more f0 calculation methods (#39)

voicepaw · Mar 20, 2023 · 6b3b20d · 6b3b20d
1 parent 0060491
commit 6b3b20d
Show file tree

Hide file tree

Showing 9 changed files with 217 additions and 173 deletions.
diff --git a/README.md b/README.md
@@ -44,11 +44,13 @@ pip install so-vits-svc-fork
 ## Features not available in the original repo
 
 - **Realtime voice conversion**
+- More accurate pitch estimation using CREPE
 - GUI available
 - Unified command-line interface (no need to run Python scripts)
 - Ready to use just by installing with `pip`.
 - Automatically download pretrained base model and HuBERT model
 - Code completely formatted with black, isort, autoflake etc.
+- Volume normalization in preprocessing
 - Other minor differences
 
 ## Usage
@@ -79,6 +81,10 @@ svc vc --model-path <model-path>
 svc --model-path <model-path> source.wav
 ```
 
+#### Notes
+
+- In real-time inference, if the input contains noise, the HuBERT model will react to it as well. In that case, consider using a real-time noise reduction application such as [RTX Voice](https://www.nvidia.com/en-us/geforce/guides/nvidia-rtx-voice-setup-guide/).
+
 ### Training
 
 #### Google Colab
@@ -96,7 +102,10 @@ svc pre-hubert
 svc train
 ```
 
-It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB.
+#### Notes
+
+- Each audio file in the dataset should be shorter than about 10 seconds; otherwise VRAM will run out.
+- It is recommended to change `batch_size` in `config.json` to match the VRAM capacity before running the `train` command. As tested, the default requires about 14 GB.
 
 ### Further help
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,7 +37,7 @@ fairseq = "*"
 flask = "*"
 flask_cors = "*"
 gradio = "*"
-numpy = ">=1.23"
+numpy = "^1.23"
 pydub = "*"
 pyworld = "*"
 requests = "*"
@@ -51,16 +51,17 @@ praat-parselmouth = "*"
 onnx = "*"
 onnxsim = "*"
 onnxoptimizer = "*"
-torch = "*"
-torchaudio = "*"
+torch = ">=1.12"
+torchaudio = ">=0.12"
 tensorboard = "*"
 rich = "*"
 tqdm-joblib = "*"
 tensorboardx = "*"
 pyinputplus = "*"
-cm-time = "^0.1.2"
+cm-time = ">=0.1.2"
 pysimplegui = ">=4.6"
-pebble = "^5.0.3"
+pebble = ">=5.0"
+torchcrepe = ">=0.0.17"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=3"

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -268,6 +268,13 @@ def infer(
 @click.option(
     "-db", "--db-thresh", type=int, default=-30, help="threshold (DB) (ABSOLUTE)"
 )
+@click.option(
+    "-fm",
+    "--f0-method",
+    type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
+    default="crepe",
+    help="f0 prediction method",
+)
 @click.option("-p", "--pad-seconds", type=float, default=0.02, help="pad seconds")
 @click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
 @click.option(
@@ -300,6 +307,7 @@ def vc(
     auto_predict_f0: bool,
     cluster_infer_ratio: float,
     noise_scale: float,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
     # slice config
     db_thresh: int,
     pad_seconds: float,
@@ -333,19 +341,24 @@ def vc(
         LOG.info(f"Since model_path is a directory, use {model_path}")
 
     realtime(
+        # paths
         model_path=model_path,
         config_path=config_path,
+        # svc config
         speaker=speaker,
         cluster_model_path=cluster_model_path,
         transpose=transpose,
         auto_predict_f0=auto_predict_f0,
         cluster_infer_ratio=cluster_infer_ratio,
         noise_scale=noise_scale,
-        crossfade_seconds=crossfade_seconds,
-        block_seconds=block_seconds,
-        chunk_seconds=chunk_seconds,
+        f0_method=f0_method,
+        # slice config
         db_thresh=db_thresh,
         pad_seconds=pad_seconds,
+        chunk_seconds=chunk_seconds,
+        # realtime config
+        crossfade_seconds=crossfade_seconds,
+        block_seconds=block_seconds,
         version=version,
         input_device=input_device,
         output_device=output_device,
@@ -446,13 +459,23 @@ def pre_config(
 )
 @click.option(
     "-f",
-    "--force_rebuild",
+    "--force-rebuild",
     type=bool,
     default=True,
     help="force rebuild existing preprocessed files",
 )
+@click.option(
+    "-fm",
+    "--f0-method",
+    type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
+    default="crepe",
+)
 def pre_hubert(
-    input_dir: Path, config_path: Path, n_jobs: bool, force_rebuild: bool
+    input_dir: Path,
+    config_path: Path,
+    n_jobs: bool,
+    force_rebuild: bool,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
 ) -> None:
     """Preprocessing part 3: hubert
     If the HuBERT model is not found, it will be downloaded automatically."""
@@ -465,6 +488,7 @@ def pre_hubert(
         config_path=config_path,
         n_jobs=n_jobs,
         force_rebuild=force_rebuild,
+        f0_method=f0_method,
     )
 
 

diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
@@ -125,6 +125,14 @@ def main():
                             text="Auto predict F0 (Pitch may become unstable when turned on in real-time inference.)",
                         )
                     ],
+                    [
+                        sg.Text("F0 prediction method"),
+                        sg.Combo(
+                            ["crepe", "parselmouth", "dio", "harvest"],
+                            key="f0_method",
+                            default_value="crepe",
+                        ),
+                    ],
                     [
                         sg.Text("Cluster infer ratio"),
                         sg.Push(),
@@ -350,6 +358,7 @@ def update_combo() -> None:
                         auto_predict_f0=values["auto_predict_f0"],
                         cluster_infer_ratio=values["cluster_infer_ratio"],
                         noise_scale=values["noise_scale"],
+                        f0_method=values["f0_method"],
                         crossfade_seconds=values["crossfade_seconds"],
                         db_thresh=values["silence_threshold"],
                         pad_seconds=values["pad_seconds"],

diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -3,7 +3,7 @@
 from copy import deepcopy
 from logging import getLogger
 from pathlib import Path
-from typing import Any, Callable, Iterable
+from typing import Any, Callable, Iterable, Literal
 
 import attrs
 import librosa
@@ -121,13 +121,17 @@ def load_model(self):
 
     def get_unit_f0(
         self,
-        audio: np.ndarray[Any, np.dtype[np.float64]],
+        audio: ndarray[Any, dtype[float32]],
         tran: int,
         cluster_infer_ratio: float,
         speaker: int | str,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     ):
-        f0 = utils.compute_f0_parselmouth(
-            audio, sampling_rate=self.target_sample, hop_length=self.hop_size
+        f0 = utils.compute_f0(
+            audio,
+            sampling_rate=self.target_sample,
+            hop_length=self.hop_size,
+            method=f0_method,
         )
         f0, uv = utils.interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
@@ -161,6 +165,7 @@ def infer(
         cluster_infer_ratio: float = 0,
         auto_predict_f0: bool = False,
         noise_scale: float = 0.4,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     ) -> tuple[torch.Tensor, int]:
         audio = audio.astype(np.float32)
         # get speaker id
@@ -180,7 +185,9 @@ def infer(
         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
 
         # get unit f0
-        c, f0, uv = self.get_unit_f0(audio, transpose, cluster_infer_ratio, speaker)
+        c, f0, uv = self.get_unit_f0(
+            audio, transpose, cluster_infer_ratio, speaker, f0_method
+        )
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
 
@@ -215,6 +222,7 @@ def infer_silence(
         auto_predict_f0: bool = False,
         cluster_infer_ratio: float = 0,
         noise_scale: float = 0.4,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
         # slice config
         db_thresh: int = -40,
         pad_seconds: float = 0.5,
@@ -260,6 +268,7 @@ def infer_silence(
                     cluster_infer_ratio=cluster_infer_ratio,
                     auto_predict_f0=auto_predict_f0,
                     noise_scale=noise_scale,
+                    f0_method=f0_method,
                 )
                 audio_chunk_pad_infer = audio_chunk_pad_infer_tensor.cpu().numpy()
                 pad_len = int(self.target_sample * pad_seconds)
@@ -359,6 +368,7 @@ def infer(
         cluster_infer_ratio: float = 0,
         auto_predict_f0: bool = False,
         noise_scale: float = 0.4,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
         # slice config
         db_thresh: int = -40,
         pad_seconds: float = 0.5,
@@ -373,6 +383,7 @@ def infer(
                 cluster_infer_ratio=cluster_infer_ratio,
                 auto_predict_f0=auto_predict_f0,
                 noise_scale=noise_scale,
+                f0_method=f0_method,
                 db_thresh=db_thresh,
                 pad_seconds=pad_seconds,
                 chunk_seconds=chunk_seconds,
@@ -393,6 +404,7 @@ def infer(
                     cluster_infer_ratio=cluster_infer_ratio,
                     auto_predict_f0=auto_predict_f0,
                     noise_scale=noise_scale,
+                    f0_method=f0_method,
                 )
                 return infered_audio_c.cpu().numpy()
 
@@ -414,6 +426,7 @@ def process(
         cluster_infer_ratio: float = 0,
         auto_predict_f0: bool = False,
         noise_scale: float = 0.4,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
         # slice config
         db_thresh: int = -40,
         chunk_seconds: float = 0.5,
@@ -426,6 +439,7 @@ def infer(audio: ndarray[Any, dtype[float32]]) -> ndarray[Any, dtype[float32]]:
                 cluster_infer_ratio=cluster_infer_ratio,
                 auto_predict_f0=auto_predict_f0,
                 noise_scale=noise_scale,
+                f0_method=f0_method,
             )
             return infered_audio_c.cpu().numpy()
 

diff --git a/src/so_vits_svc_fork/inference_main.py b/src/so_vits_svc_fork/inference_main.py
@@ -22,12 +22,13 @@ def infer(
     model_path: Path | str,
     config_path: Path | str,
     # svc config
-    speaker: str,
+    speaker: int | str,
     cluster_model_path: Path | str | None = None,
     transpose: int = 0,
     auto_predict_f0: bool = False,
     cluster_infer_ratio: float = 0,
     noise_scale: float = 0.4,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     # slice config
     db_thresh: int = -40,
     pad_seconds: float = 0.5,
@@ -51,14 +52,15 @@ def infer(
 
     audio, _ = librosa.load(input_path, sr=svc_model.target_sample)
     audio = svc_model.infer_silence(
-        audio,
+        audio.astype(np.float32),
         speaker=speaker,
-        db_thresh=db_thresh,
-        pad_seconds=pad_seconds,
         transpose=transpose,
         auto_predict_f0=auto_predict_f0,
         cluster_infer_ratio=cluster_infer_ratio,
         noise_scale=noise_scale,
+        f0_method=f0_method,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
         chunk_seconds=chunk_seconds,
         absolute_thresh=absolute_thresh,
     )
@@ -78,6 +80,7 @@ def realtime(
     auto_predict_f0: bool = False,
     cluster_infer_ratio: float = 0,
     noise_scale: float = 0.4,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     # slice config
     db_thresh: int = -40,
     pad_seconds: float = 0.5,
@@ -154,13 +157,17 @@ def callback(
         )
 
         kwargs = dict(
-            input_audio=indata.mean(axis=1),
+            input_audio=indata.mean(axis=1).astype(np.float32),
+            # svc config
             speaker=speaker,
             transpose=transpose,
             auto_predict_f0=auto_predict_f0,
-            noise_scale=noise_scale,
             cluster_infer_ratio=cluster_infer_ratio,
+            noise_scale=noise_scale,
+            f0_method=f0_method,
+            # slice config
             db_thresh=db_thresh,
+            # pad_seconds=pad_seconds,
             chunk_seconds=chunk_seconds,
         )
         if version == 1: