diff --git a/README.md b/README.md
index b5b489e8..241e93bb 100644
--- a/README.md
+++ b/README.md
@@ -59,10 +59,15 @@ A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with
 Install this via pip (or your favourite package manager that uses pip):
 
 ```shell
+python -m pip install -U pip setuptools wheel
 pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117
 pip install -U so-vits-svc-fork
 ```
 
+- If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu117` with `--index-url https://download.pytorch.org/whl/rocm5.4.2`.
+- If no GPU is available, simply omit the `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu117` line.
+- If `fairseq` raises an error that [`Microsoft C++ Build Tools`](https://visualstudio.microsoft.com/visual-cpp-build-tools/) is not installed or that a DLL is missing, please (re)install it.
+
 ### Update
 
 Please update this package regularly to get the latest features and bug fixes.
@@ -108,6 +113,14 @@ svc --model-path source.wav
 
 ### Training
 
+#### Before training
+
+- If your dataset has BGM, please remove the BGM using software such as [Ultimate Vocal Remover](https://ultimatevocalremover.com/). `3_HP-Vocal-UVR.pth` or `UVR-MDX-NET Main` is recommended. [^1]
+- If your dataset is a long audio file with multiple speakers, use `svc pre-sd` to split the dataset into multiple files (using `pyannote.audio`). Further manual classification may be necessary due to accuracy issues. If speakers speak with a variety of speech styles, set `--min-speakers` larger than the actual number of speakers. Due to unresolved dependencies, please install `pyannote.audio` manually: `pip install pyannote-audio`.
+- If your dataset is a long audio file with a single speaker, use `svc pre-split` to split the dataset into multiple files (using `librosa`).
+
+[^1]: https://ytpmv.info/how-to-use-uvr/
+
 #### Google Colab
 
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb)
@@ -119,14 +132,14 @@ Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (s
 ```shell
 svc pre-resample
 svc pre-config
-svc pre-hubert -fm dio
+svc pre-hubert
 svc train
 ```
 
 #### Notes
 
 - Dataset audio duration per file should be <~ 10s or VRAM will run out.
-- To change the f0 inference method to CREPE, replace `svc pre-hubert -fm dio` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
+- To change the f0 inference method to CREPE, replace `svc pre-hubert` with `svc pre-hubert -fm crepe`. You may need to reduce `--n-jobs` due to performance issues.
 - It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB.
 
 ### Further help
@@ -139,7 +152,7 @@ Usage: svc [OPTIONS] COMMAND [ARGS]...
 
   so-vits-svc allows any folder structure for training data. However, the
   following folder structure is recommended.
-      When training: dataset_raw/{speaker_name}/{wav_name}.wav
+      When training: dataset_raw/{speaker_name}/**/{wav_name}.{any_format}
       When inference: configs/44k/config.json, logs/44k/G_XXXX.pth
   If the folder structure is followed, you DO NOT NEED TO SPECIFY model path,
   config path, etc. (The latest model will be automatically loaded.)
@@ -156,6 +169,8 @@ Commands:
   pre-config     Preprocessing part 2: config
   pre-hubert     Preprocessing part 3: hubert If the HuBERT model is not found, it will be...
   pre-resample   Preprocessing part 1: resample
+  pre-sd         Speech diarization using pyannote.audio
+  pre-split      Split audio files into multiple files
   train          Train model If D_0.pth or G_0.pth not found, automatically download from hub.
   train-cluster  Train k-means clustering
   vc             Realtime inference from microphone
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index 4086b529..e88fd070 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -530,6 +530,115 @@ def pre_hubert(
     )
 
 
+@cli.command()
+@click.option(
+    "-i",
+    "--input-dir",
+    type=click.Path(exists=True),
+    default=Path("./dataset_raw_raw/"),
+    help="path to source dir",
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    type=click.Path(),
+    default=Path("./dataset_raw/"),
+    help="path to output dir",
+)
+@click.option(
+    "-n",
+    "--n-jobs",
+    type=int,
+    default=-1,
+    help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)",
+)
+@click.option("-min", "--min-speakers", type=int, default=2, help="min speakers")
+@click.option("-max", "--max-speakers", type=int, default=2, help="max speakers")
+@click.option(
+    "-t", "--huggingface-token", type=str, default=None, help="huggingface token"
+)
+def pre_sd(
+    input_dir: Path | str,
+    output_dir: Path | str,
+    min_speakers: int,
+    max_speakers: int,
+    huggingface_token: str | None,
+    n_jobs: int,
+):
+    """Speech diarization using pyannote.audio"""
+    if huggingface_token is None:
+        huggingface_token = os.environ.get("HUGGINGFACE_TOKEN", None)
+    if huggingface_token is None:
+        huggingface_token = click.prompt(
+            "Please enter your HuggingFace token", hide_input=True
+        )
+        if os.environ.get("HUGGINGFACE_TOKEN", None) is None:
+            LOG.info("You can also set the HUGGINGFACE_TOKEN environment variable.")
+    assert huggingface_token is not None
+    huggingface_token = huggingface_token.rstrip(" \n\r\t\0")
+    if len(huggingface_token) <= 1:
+        raise ValueError("HuggingFace token is empty: " + huggingface_token)
+
+    if max_speakers == 1:
+        LOG.warning("Consider using pre-split if max_speakers == 1")
+    from .preprocess_speaker_diarization import preprocess_speaker_diarization
+
+    preprocess_speaker_diarization(
+        input_dir=input_dir,
+        output_dir=output_dir,
+        min_speakers=min_speakers,
+        max_speakers=max_speakers,
+        huggingface_token=huggingface_token,
+        n_jobs=n_jobs,
+    )
+
+
+@cli.command()
+@click.option(
+    "-i",
+    "--input-dir",
+    type=click.Path(exists=True),
+    default=Path("./dataset_raw_raw/"),
+    help="path to source dir",
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    type=click.Path(),
+    default=Path("./dataset_raw/"),
+    help="path to output dir",
+)
+@click.option(
+    "-n",
+    "--n-jobs",
+    type=int,
+    default=-1,
+    help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)",
+)
+@click.option("-d", "--top-db", type=float, default=30, help="top db")
+@click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds")
+@click.option("-h", "--hop-seconds", type=float, default=0.3, help="hop seconds")
+def pre_split(
+    input_dir: Path | str,
+    output_dir: Path | str,
+    top_db: int,
+    frame_seconds: float,
+    hop_seconds: float,
+    n_jobs: int,
+):
+    """Split audio files into multiple files"""
+    from .preprocess_split import preprocess_split
+
+    preprocess_split(
+        input_dir=input_dir,
+        output_dir=output_dir,
+        top_db=top_db,
+        frame_seconds=frame_seconds,
+        hop_seconds=hop_seconds,
+        n_jobs=n_jobs,
+    )
+
+
 @cli.command
 def clean():
     """Clean up files, only useful if you are using the default file structure"""
diff --git a/src/so_vits_svc_fork/preprocess_speaker_diarization.py b/src/so_vits_svc_fork/preprocess_speaker_diarization.py
new file mode 100644
index 00000000..360cf5f8
--- /dev/null
+++ b/src/so_vits_svc_fork/preprocess_speaker_diarization.py
@@ -0,0 +1,87 @@
+from collections import defaultdict
+from logging import getLogger
+from pathlib import Path
+
+import soundfile as sf
+import torch
+from joblib import Parallel, delayed
+from pyannote.audio import Pipeline
+from tqdm import tqdm
+from tqdm_joblib import tqdm_joblib
+
+LOG = getLogger(__name__)
+
+
+def _process_one(
+    input_path: Path,
+    output_dir: Path,
+    *,
+    min_speakers: int = 1,
+    max_speakers: int = 1,
+    huggingface_token: str | None = None,
+) -> None:
+    try:
+        audio, sr = sf.read(input_path)
+    except Exception as e:
+        LOG.warning(f"Failed to read {input_path}: {e}")
+        return
+    pipeline = Pipeline.from_pretrained(
+        "pyannote/speaker-diarization", use_auth_token=huggingface_token
+    )
+    if pipeline is None:
+        raise ValueError("Failed to load pipeline")
+
+    LOG.info(f"Processing {input_path}. This may take a while...")
+    diarization = pipeline(
+        input_path, min_speakers=min_speakers, max_speakers=max_speakers
+    )
+
+    LOG.info(f"Found {len(diarization)} tracks, writing to {output_dir}")
+    speaker_count = defaultdict(int)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for segment, track, speaker in tqdm(
+        list(diarization.itertracks(yield_label=True)), desc=f"Writing {input_path}"
+    ):
+        if segment.end - segment.start < 1:
+            continue
+        speaker_count[speaker] += 1
+        audio_cut = audio[int(segment.start * sr) : int(segment.end * sr)]
+        sf.write(
+            (output_dir / f"{speaker}_{speaker_count[speaker]}.wav"),
+            audio_cut,
+            sr,
+        )
+
+    LOG.info(f"Speaker count: {speaker_count}")
+
+
+def preprocess_speaker_diarization(
+    input_dir: Path | str,
+    output_dir: Path | str,
+    *,
+    min_speakers: int = 1,
+    max_speakers: int = 1,
+    huggingface_token: str | None = None,
+    n_jobs: int = -1,
+) -> None:
+    if huggingface_token is not None and not huggingface_token.startswith("hf_"):
+        LOG.warning("Huggingface token probably should start with hf_")
+    if not torch.cuda.is_available():
+        LOG.warning("CUDA is not available. This will be extremely slow.")
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    input_dir.mkdir(parents=True, exist_ok=True)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    input_paths = list(input_dir.rglob("*.*"))
+    with tqdm_joblib(desc="Preprocessing speaker diarization", total=len(input_paths)):
+        Parallel(n_jobs=n_jobs)(
+            delayed(_process_one)(
+                input_path,
+                output_dir / input_path.relative_to(input_dir).parent / input_path.stem,
+                max_speakers=max_speakers,
+                min_speakers=min_speakers,
+                huggingface_token=huggingface_token,
+            )
+            for input_path in input_paths
+        )
diff --git a/src/so_vits_svc_fork/preprocess_split.py b/src/so_vits_svc_fork/preprocess_split.py
new file mode 100644
index 00000000..6f6fd0b0
--- /dev/null
+++ b/src/so_vits_svc_fork/preprocess_split.py
@@ -0,0 +1,65 @@
+from logging import getLogger
+from pathlib import Path
+
+import librosa
+import soundfile as sf
+from joblib import Parallel, delayed
+from tqdm import tqdm
+from tqdm_joblib import tqdm_joblib
+
+LOG = getLogger(__name__)
+
+
+def _process_one(
+    input_path: Path,
+    output_dir: Path,
+    *,
+    top_db: int = 30,
+    frame_seconds: float = 0.5,
+    hop_seconds: float = 0.1,
+):
+    try:
+        audio, sr = librosa.load(input_path)
+    except Exception as e:
+        LOG.warning(f"Failed to read {input_path}: {e}")
+        return
+    intervals = librosa.effects.split(
+        audio,
+        top_db=top_db,
+        frame_length=int(sr * frame_seconds),
+        hop_length=int(sr * hop_seconds),
+    )
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for start, end in tqdm(intervals, desc=f"Writing {input_path}"):
+        audio_cut = audio[start:end]
+        sf.write(
+            (output_dir / f"{input_path.stem}_{start / sr:.3f}_{end / sr:.3f}.wav"),
+            audio_cut,
+            sr,
+        )
+
+
+def preprocess_split(
+    input_dir: Path | str,
+    output_dir: Path | str,
+    *,
+    top_db: int = 30,
+    frame_seconds: float = 0.5,
+    hop_seconds: float = 0.1,
+    n_jobs: int = -1,
+):
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    input_paths = list(input_dir.rglob("*.*"))
+    with tqdm_joblib(desc="Splitting", total=len(input_paths)):
+        Parallel(n_jobs=n_jobs)(
+            delayed(_process_one)(
+                input_path,
+                output_dir / input_path.relative_to(input_dir).parent,
+                top_db=top_db,
+                frame_seconds=frame_seconds,
+                hop_seconds=hop_seconds,
+            )
+            for input_path in input_paths
+        )
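
For reference, the new splitting step can also be exercised directly from Python rather than through `svc pre-split`. The snippet below is a minimal sketch that only relies on the `preprocess_split` signature introduced in this diff; the dataset paths are illustrative.

    # Minimal sketch: run the new silence-based splitter from Python.
    # Mirrors what `svc pre-split` does internally; paths are placeholders.
    from so_vits_svc_fork.preprocess_split import preprocess_split

    preprocess_split(
        input_dir="dataset_raw_raw",  # long single-speaker recordings
        output_dir="dataset_raw",     # per-segment wav files are written here
        top_db=30,                    # silence threshold for librosa.effects.split
        frame_seconds=0.5,            # analysis frame length in seconds
        hop_seconds=0.1,              # hop between analysis frames in seconds
        n_jobs=-1,                    # joblib: use all available CPU cores
    )

With these values, regions more than 30 dB below the reference level are treated as silence, and each remaining segment is written as `{stem}_{start}_{end}.wav` (times in seconds) under the corresponding subfolder of `dataset_raw/`.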