Skip to content

Commit

Permalink
feat: enhance RealtimeVC (#52)
Browse files (browse the repository at this point in the history)
  • Loading branch information
34j authored Mar 21, 2023
1 parent 1c8305a commit 81551ce
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 53 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pip install so-vits-svc-fork

## Features not available in the original repo

- **Realtime voice conversion**
- **Realtime voice conversion** (enhanced in v1.1.0)
- More accurate pitch estimation using CREPE
- GUI available
- Unified command-line interface (no need to run Python scripts)
Expand Down
38 changes: 33 additions & 5 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def infer(
@click.option(
"-fm",
"--f0-method",
type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
default="crepe",
help="f0 prediction method",
)
Expand All @@ -284,7 +284,21 @@ def infer(
default=0.01,
help="crossfade seconds",
)
@click.option("-b", "--block-seconds", type=float, default=1, help="block seconds")
@click.option(
"-ab",
"--additional-infer-before-seconds",
type=float,
default=0.2,
help="additional infer before seconds",
)
@click.option(
"-aa",
"--additional-infer-after-seconds",
type=float,
default=0.1,
help="additional infer after seconds",
)
@click.option("-b", "--block-seconds", type=float, default=0.5, help="block seconds")
@click.option(
"-d",
"--device",
Expand All @@ -296,6 +310,14 @@ def infer(
@click.option("-v", "--version", type=int, default=2, help="version")
@click.option("-i", "--input-device", type=int, default=None, help="input device")
@click.option("-o", "--output-device", type=int, default=None, help="output device")
@click.option(
"-po",
"--passthrough-original",
type=bool,
default=False,
is_flag=True,
help="passthrough original (for latency check)",
)
def vc(
# paths
model_path: Path,
Expand All @@ -307,18 +329,21 @@ def vc(
auto_predict_f0: bool,
cluster_infer_ratio: float,
noise_scale: float,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
# slice config
db_thresh: int,
pad_seconds: float,
chunk_seconds: float,
# realtime config
crossfade_seconds: float,
additional_infer_before_seconds: float,
additional_infer_after_seconds: float,
block_seconds: float,
version: int,
input_device: int | str | None,
output_device: int | str | None,
device: Literal["cpu", "cuda"],
passthrough_original: bool = False,
) -> None:
"""Realtime inference from microphone"""
from .inference_main import realtime
Expand Down Expand Up @@ -358,11 +383,14 @@ def vc(
chunk_seconds=chunk_seconds,
# realtime config
crossfade_seconds=crossfade_seconds,
additional_infer_before_seconds=additional_infer_before_seconds,
additional_infer_after_seconds=additional_infer_after_seconds,
block_seconds=block_seconds,
version=version,
input_device=input_device,
output_device=output_device,
device=device,
passthrough_original=passthrough_original,
)


Expand Down Expand Up @@ -467,15 +495,15 @@ def pre_config(
@click.option(
"-fm",
"--f0-method",
type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
default="crepe",
)
def pre_hubert(
input_dir: Path,
config_path: Path,
n_jobs: bool,
force_rebuild: bool,
f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
) -> None:
"""Preprocessing part 3: hubert
If the HuBERT model is not found, it will be downloaded automatically."""
Expand Down
80 changes: 71 additions & 9 deletions src/so_vits_svc_fork/gui.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import textwrap
from logging import getLogger
from pathlib import Path

Expand Down Expand Up @@ -45,6 +46,7 @@ def main():
default_text=model_candidates[-1].absolute().as_posix()
if model_candidates
else "",
enable_events=True,
),
sg.FileBrowse(
initial_folder=Path("./logs/44k/").absolute
Expand Down Expand Up @@ -77,7 +79,7 @@ def main():
[
sg.Text("Cluster model path"),
sg.Push(),
sg.InputText(key="cluster_model_path"),
sg.InputText(key="cluster_model_path", enable_events=True),
sg.FileBrowse(
initial_folder="./logs/44k/"
if Path("./logs/44k/").exists()
Expand Down Expand Up @@ -128,7 +130,7 @@ def main():
[
sg.Text("F0 prediction method"),
sg.Combo(
["crepe", "parselmouth", "dio", "harvest"],
["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
key="f0_method",
default_value="crepe",
),
Expand Down Expand Up @@ -206,33 +208,68 @@ def main():
sg.Frame(
"Realtime",
[
[
sg.Text(
"In Realtime Inference:\n"
" Setting F0 prediction method to 'crepe` may cause performance degradation.\n"
" Auto Predict F0 must be turned off.\n"
+ textwrap.fill(
"If the audio sounds mumbly and choppy, the inference has not been made in time "
"and the below parameters should be adjusted or the microphone input is too low and the "
"silence threshold should be increased.",
80,
)
)
],
[
sg.Text("Crossfade seconds"),
sg.Push(),
sg.Slider(
range=(0, 0.6),
orientation="h",
key="crossfade_seconds",
default_value=0.1,
default_value=0.08,
resolution=0.001,
),
],
[
sg.Text("Block seconds"),
sg.Push(),
sg.Slider(
range=(0, 3.0),
range=(0, 1.0),
orientation="h",
key="block_seconds",
default_value=1,
resolution=0.01,
default_value=0.35,
resolution=0.001,
),
],
[
sg.Text("Additional Infer seconds (before)"),
sg.Push(),
sg.Slider(
range=(0, 1.0),
orientation="h",
key="additional_infer_before_seconds",
default_value=0.2,
resolution=0.001,
),
],
[
sg.Text("Additional Infer seconds (after)"),
sg.Push(),
sg.Slider(
range=(0, 1.0),
orientation="h",
key="additional_infer_after_seconds",
default_value=0.08,
resolution=0.001,
),
],
[
sg.Text("Realtime algorithm"),
sg.Combo(
["2 (Divide by speech)", "1 (Divide constantly)"],
default_value="2 (Divide by speech)",
default_value="1 (Divide constantly)",
key="realtime_algorithm",
),
],
Expand All @@ -254,6 +291,13 @@ def main():
default_value=output_devices[0],
),
],
[
sg.Checkbox(
"Passthrough original audio (for latency check)",
key="passthrough_original",
default=False,
),
],
],
)
],
Expand Down Expand Up @@ -294,15 +338,26 @@ def update_combo() -> None:
if values["speaker"] == "":
update_combo()
if event.endswith("_path"):
browser = window[f"{event}_browse"]
for name in window.AllKeysDict:
if str(name).endswith("_browse"):
browser = window[name]
if isinstance(browser, sg.Button):
LOG.info(
f"Updating browser {browser} to {Path(values[event]).parent}"
)
browser.InitialFolder = Path(values[event]).parent
browser.update()
else:
LOG.warning(f"Browser {browser} is not a FileBrowse")
"""browser = window[f"{event}_browse"]
if isinstance(browser, sg.Button):
LOG.info(
f"Updating browser {browser} to {Path(values[event]).parent}"
)
browser.InitialFolder = Path(values[event]).parent
browser.update()
else:
LOG.warning(f"Browser {browser} is not a FileBrowse")
LOG.warning(f"Browser {browser} is not a FileBrowse")"""
if event == "config_path":
update_combo()
elif event == "infer":
Expand Down Expand Up @@ -360,6 +415,12 @@ def update_combo() -> None:
noise_scale=values["noise_scale"],
f0_method=values["f0_method"],
crossfade_seconds=values["crossfade_seconds"],
additional_infer_before_seconds=values[
"additional_infer_before_seconds"
],
additional_infer_after_seconds=values[
"additional_infer_after_seconds"
],
db_thresh=values["silence_threshold"],
pad_seconds=values["pad_seconds"],
chunk_seconds=values["chunk_seconds"],
Expand All @@ -368,6 +429,7 @@ def update_combo() -> None:
block_seconds=values["block_seconds"],
input_device=values["input_device"],
output_device=values["output_device"],
passthrough_original=values["passthrough_original"],
),
)
elif event == "stop_vc":
Expand Down
Loading

0 comments on commit 81551ce

Please sign in to comment.