feat: enhance RealtimeVC (#52)

voicepaw committed on Mar 21, 2023
commit 81551ce (1 parent: 1c8305a)

Showing 7 changed files with 261 additions and 53 deletions.
diff --git a/README.md b/README.md
@@ -43,7 +43,7 @@ pip install so-vits-svc-fork
 
 ## Features not available in the original repo
 
-- **Realtime voice conversion**
+- **Realtime voice conversion** (enhanced in v1.1.0)
 - More accurate pitch estimation using CREPE
 - GUI available
 - Unified command-line interface (no need to run Python scripts)

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -271,7 +271,7 @@ def infer(
 @click.option(
     "-fm",
     "--f0-method",
-    type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
+    type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
     default="crepe",
     help="f0 prediction method",
 )
@@ -284,7 +284,21 @@ def infer(
     default=0.01,
     help="crossfade seconds",
 )
-@click.option("-b", "--block-seconds", type=float, default=1, help="block seconds")
+@click.option(
+    "-ab",
+    "--additional-infer-before-seconds",
+    type=float,
+    default=0.2,
+    help="additional infer before seconds",
+)
+@click.option(
+    "-aa",
+    "--additional-infer-after-seconds",
+    type=float,
+    default=0.1,
+    help="additional infer after seconds",
+)
+@click.option("-b", "--block-seconds", type=float, default=0.5, help="block seconds")
 @click.option(
     "-d",
     "--device",
@@ -296,6 +310,14 @@ def infer(
 @click.option("-v", "--version", type=int, default=2, help="version")
 @click.option("-i", "--input-device", type=int, default=None, help="input device")
 @click.option("-o", "--output-device", type=int, default=None, help="output device")
+@click.option(
+    "-po",
+    "--passthrough-original",
+    type=bool,
+    default=False,
+    is_flag=True,
+    help="passthrough original (for latency check)",
+)
 def vc(
     # paths
     model_path: Path,
@@ -307,18 +329,21 @@ def vc(
     auto_predict_f0: bool,
     cluster_infer_ratio: float,
     noise_scale: float,
-    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
+    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
     # slice config
     db_thresh: int,
     pad_seconds: float,
     chunk_seconds: float,
     # realtime config
     crossfade_seconds: float,
+    additional_infer_before_seconds: float,
+    additional_infer_after_seconds: float,
     block_seconds: float,
     version: int,
     input_device: int | str | None,
     output_device: int | str | None,
     device: Literal["cpu", "cuda"],
+    passthrough_original: bool = False,
 ) -> None:
     """Realtime inference from microphone"""
     from .inference_main import realtime
@@ -358,11 +383,14 @@ def vc(
         chunk_seconds=chunk_seconds,
         # realtime config
         crossfade_seconds=crossfade_seconds,
+        additional_infer_before_seconds=additional_infer_before_seconds,
+        additional_infer_after_seconds=additional_infer_after_seconds,
         block_seconds=block_seconds,
         version=version,
         input_device=input_device,
         output_device=output_device,
         device=device,
+        passthrough_original=passthrough_original,
     )
 
 
@@ -467,15 +495,15 @@ def pre_config(
 @click.option(
     "-fm",
     "--f0-method",
-    type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
+    type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
     default="crepe",
 )
 def pre_hubert(
     input_dir: Path,
     config_path: Path,
     n_jobs: bool,
     force_rebuild: bool,
-    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
+    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
 ) -> None:
     """Preprocessing part 3: hubert
     If the HuBERT model is not found, it will be downloaded automatically."""

diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import textwrap
 from logging import getLogger
 from pathlib import Path
 
@@ -45,6 +46,7 @@ def main():
                             default_text=model_candidates[-1].absolute().as_posix()
                             if model_candidates
                             else "",
+                            enable_events=True,
                         ),
                         sg.FileBrowse(
                             initial_folder=Path("./logs/44k/").absolute
@@ -77,7 +79,7 @@ def main():
                     [
                         sg.Text("Cluster model path"),
                         sg.Push(),
-                        sg.InputText(key="cluster_model_path"),
+                        sg.InputText(key="cluster_model_path", enable_events=True),
                         sg.FileBrowse(
                             initial_folder="./logs/44k/"
                             if Path("./logs/44k/").exists()
@@ -128,7 +130,7 @@ def main():
                     [
                         sg.Text("F0 prediction method"),
                         sg.Combo(
-                            ["crepe", "parselmouth", "dio", "harvest"],
+                            ["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                             key="f0_method",
                             default_value="crepe",
                         ),
@@ -206,33 +208,68 @@ def main():
             sg.Frame(
                 "Realtime",
                 [
+                    [
+                        sg.Text(
+                            "In Realtime Inference:\n"
+                            "    Setting F0 prediction method to 'crepe` may cause performance degradation.\n"
+                            "    Auto Predict F0 must be turned off.\n"
+                            + textwrap.fill(
+                                "If the audio sounds mumbly and choppy, the inference has not been made in time "
+                                "and the below parameters should be adjusted or the microphone input is too low and the "
+                                "silence threshold should be increased.",
+                                80,
+                            )
+                        )
+                    ],
                     [
                         sg.Text("Crossfade seconds"),
                         sg.Push(),
                         sg.Slider(
                             range=(0, 0.6),
                             orientation="h",
                             key="crossfade_seconds",
-                            default_value=0.1,
+                            default_value=0.08,
                             resolution=0.001,
                         ),
                     ],
                     [
                         sg.Text("Block seconds"),
                         sg.Push(),
                         sg.Slider(
-                            range=(0, 3.0),
+                            range=(0, 1.0),
                             orientation="h",
                             key="block_seconds",
-                            default_value=1,
-                            resolution=0.01,
+                            default_value=0.35,
+                            resolution=0.001,
+                        ),
+                    ],
+                    [
+                        sg.Text("Additional Infer seconds (before)"),
+                        sg.Push(),
+                        sg.Slider(
+                            range=(0, 1.0),
+                            orientation="h",
+                            key="additional_infer_before_seconds",
+                            default_value=0.2,
+                            resolution=0.001,
+                        ),
+                    ],
+                    [
+                        sg.Text("Additional Infer seconds (after)"),
+                        sg.Push(),
+                        sg.Slider(
+                            range=(0, 1.0),
+                            orientation="h",
+                            key="additional_infer_after_seconds",
+                            default_value=0.08,
+                            resolution=0.001,
                         ),
                     ],
                     [
                         sg.Text("Realtime algorithm"),
                         sg.Combo(
                             ["2 (Divide by speech)", "1 (Divide constantly)"],
-                            default_value="2 (Divide by speech)",
+                            default_value="1 (Divide constantly)",
                             key="realtime_algorithm",
                         ),
                     ],
@@ -254,6 +291,13 @@ def main():
                             default_value=output_devices[0],
                         ),
                     ],
+                    [
+                        sg.Checkbox(
+                            "Passthrough original audio (for latency check)",
+                            key="passthrough_original",
+                            default=False,
+                        ),
+                    ],
                 ],
             )
         ],
@@ -294,15 +338,26 @@ def update_combo() -> None:
             if values["speaker"] == "":
                 update_combo()
             if event.endswith("_path"):
-                browser = window[f"{event}_browse"]
+                for name in window.AllKeysDict:
+                    if str(name).endswith("_browse"):
+                        browser = window[name]
+                        if isinstance(browser, sg.Button):
+                            LOG.info(
+                                f"Updating browser {browser} to {Path(values[event]).parent}"
+                            )
+                            browser.InitialFolder = Path(values[event]).parent
+                            browser.update()
+                        else:
+                            LOG.warning(f"Browser {browser} is not a FileBrowse")
+                """browser = window[f"{event}_browse"]
                 if isinstance(browser, sg.Button):
                     LOG.info(
                         f"Updating browser {browser} to {Path(values[event]).parent}"
                     )
                     browser.InitialFolder = Path(values[event]).parent
                     browser.update()
                 else:
-                    LOG.warning(f"Browser {browser} is not a FileBrowse")
+                    LOG.warning(f"Browser {browser} is not a FileBrowse")"""
             if event == "config_path":
                 update_combo()
             elif event == "infer":
@@ -360,6 +415,12 @@ def update_combo() -> None:
                         noise_scale=values["noise_scale"],
                         f0_method=values["f0_method"],
                         crossfade_seconds=values["crossfade_seconds"],
+                        additional_infer_before_seconds=values[
+                            "additional_infer_before_seconds"
+                        ],
+                        additional_infer_after_seconds=values[
+                            "additional_infer_after_seconds"
+                        ],
                         db_thresh=values["silence_threshold"],
                         pad_seconds=values["pad_seconds"],
                         chunk_seconds=values["chunk_seconds"],
@@ -368,6 +429,7 @@ def update_combo() -> None:
                         block_seconds=values["block_seconds"],
                         input_device=values["input_device"],
                         output_device=values["output_device"],
+                        passthrough_original=values["passthrough_original"],
                     ),
                 )
             elif event == "stop_vc":