From 6b3b20dfd609d81cb1184b7c8e8865a58f8d45f9 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Mon, 20 Mar 2023 13:59:47 +0900 Subject: [PATCH] feat: add more f0 calculation methods (#39) --- README.md | 11 +- poetry.lock | 171 ++++--------------- pyproject.toml | 11 +- src/so_vits_svc_fork/__main__.py | 34 +++- src/so_vits_svc_fork/gui.py | 9 + src/so_vits_svc_fork/inference/infer_tool.py | 24 ++- src/so_vits_svc_fork/inference_main.py | 19 ++- src/so_vits_svc_fork/preprocess_hubert_f0.py | 21 ++- src/so_vits_svc_fork/utils.py | 90 ++++++++-- 9 files changed, 217 insertions(+), 173 deletions(-) diff --git a/README.md b/README.md index bb8b7812..a515ce62 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,13 @@ pip install so-vits-svc-fork ## Features not available in the original repo - **Realtime voice conversion** +- More accurate pitch estimation using CREPE - GUI available - Unified command-line interface (no need to run Python scripts) - Ready to use just by installing with `pip`. - Automatically download pretrained base model and HuBERT model - Code completely formatted with black, isort, autoflake etc. +- Volume normalization in preprocessing - Other minor differences ## Usage @@ -79,6 +81,10 @@ svc vc --model-path svc --model-path source.wav ``` +#### Notes + +- In real-time inference, if there is noise on the inputs, the HuBERT model will react to those as well. Consider using realtime noise reduction applications such as [RTX Voice](https://www.nvidia.com/en-us/geforce/guides/nvidia-rtx-voice-setup-guide/) in this case. + ### Training #### Google Colab @@ -96,7 +102,10 @@ svc pre-hubert svc train ``` -It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB. +#### Notes + +- Dataset audio duration per file should be <~ 10s or VRAM will run out. +- It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB. ### Further help diff --git a/poetry.lock b/poetry.lock index 1e0992d7..5bdbc8b2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand. [[package]] name = "absl-py" @@ -1697,35 +1697,33 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "librosa" -version = "0.10.0" +version = "0.9.1" description = "Python module for audio and music processing" category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.6" files = [ - {file = "librosa-0.10.0-py3-none-any.whl", hash = "sha256:6db29c1467168da21313203dcef405a73a678d3aad0fbc67607250b2f08a3f5a"}, - {file = "librosa-0.10.0.tar.gz", hash = "sha256:8e8669e5084002d1a87f6c82b732f370784a368d0e55c2dd7d7aef3fa02fd058"}, + {file = "librosa-0.9.1-py3-none-any.whl", hash = "sha256:c2bb61a8008367cca89a3f1dad352d8e55fe5ca5f7414fb5d5258eb52765db33"}, + {file = "librosa-0.9.1.tar.gz", hash = "sha256:7ed5d6e3f4546e5e3c2840691f9ddc56878f914a35a50060df5fca2b26d4b614"}, ] [package.dependencies] -audioread = ">=2.1.9" -decorator = ">=4.3.0" +audioread = ">=2.1.5" +decorator = ">=4.0.10" joblib = ">=0.14" -lazy-loader = ">=0.1" -msgpack = ">=1.0" -numba = ">=0.51.0" -numpy = ">=1.20.3" +numba = ">=0.45.1" +numpy = ">=1.17.0" +packaging = ">=20.0" pooch = ">=1.0" -scikit-learn = ">=0.20.0" +resampy = ">=0.2.2" +scikit-learn = ">=0.19.1" scipy = ">=1.2.0" -soundfile = ">=0.12.1" -soxr = ">=0.3.2" -typing-extensions = ">=4.1.1" +soundfile = ">=0.10.2" [package.extras] display = ["matplotlib (>=3.3.0)"] -docs = ["ipython (>=7.0)", "matplotlib (>=3.3.0)", "mir-eval (>=0.5)", "numba (>=0.51)", "numpydoc", "presets", "sphinx (!=1.3.1,<6)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (>=1.0.0,<2.0.0)", "sphinxcontrib-svg2pdfconverter"] -tests = ["matplotlib (>=3.3.0)", "packaging (>=20.0)", "pytest", "pytest-cov", "pytest-mpl", "resampy (>=0.2.2)", "samplerate", "types-decorator"] +docs = ["ipython (>=7.0)", "matplotlib (>=3.3.0)", "mir-eval (>=0.5)", "numba (<0.50)", "numpydoc", "presets", "sphinx (!=1.3.1)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (>=0.5.0,<0.6.0)", "sphinxcontrib-svg2pdfconverter"] +tests = ["contextlib2", "matplotlib (>=3.3.0)", "pytest", "pytest-cov", "pytest-mpl", "samplerate", "soxr"] [[package]] name = "linkify-it-py" @@ -2107,79 +2105,6 @@ docs = ["sphinx"] gmpy = ["gmpy2 (>=2.1.0a4)"] tests = ["pytest (>=4.6)"] -[[package]] -name = "msgpack" -version = "1.0.5" -description = "MessagePack serializer" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "msgpack-1.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:525228efd79bb831cf6830a732e2e80bc1b05436b086d4264814b4b2955b2fa9"}, - {file = "msgpack-1.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4f8d8b3bf1ff2672567d6b5c725a1b347fe838b912772aa8ae2bf70338d5a198"}, - {file = "msgpack-1.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdc793c50be3f01106245a61b739328f7dccc2c648b501e237f0699fe1395b81"}, - {file = "msgpack-1.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cb47c21a8a65b165ce29f2bec852790cbc04936f502966768e4aae9fa763cb7"}, - {file = "msgpack-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e42b9594cc3bf4d838d67d6ed62b9e59e201862a25e9a157019e171fbe672dd3"}, - {file = "msgpack-1.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55b56a24893105dc52c1253649b60f475f36b3aa0fc66115bffafb624d7cb30b"}, - {file = "msgpack-1.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1967f6129fc50a43bfe0951c35acbb729be89a55d849fab7686004da85103f1c"}, - {file = "msgpack-1.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20a97bf595a232c3ee6d57ddaadd5453d174a52594bf9c21d10407e2a2d9b3bd"}, - {file = "msgpack-1.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d25dd59bbbbb996eacf7be6b4ad082ed7eacc4e8f3d2df1ba43822da9bfa122a"}, - {file = "msgpack-1.0.5-cp310-cp310-win32.whl", hash = "sha256:382b2c77589331f2cb80b67cc058c00f225e19827dbc818d700f61513ab47bea"}, - {file = "msgpack-1.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:4867aa2df9e2a5fa5f76d7d5565d25ec76e84c106b55509e78c1ede0f152659a"}, - {file = "msgpack-1.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9f5ae84c5c8a857ec44dc180a8b0cc08238e021f57abdf51a8182e915e6299f0"}, - {file = "msgpack-1.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9e6ca5d5699bcd89ae605c150aee83b5321f2115695e741b99618f4856c50898"}, - {file = "msgpack-1.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5494ea30d517a3576749cad32fa27f7585c65f5f38309c88c6d137877fa28a5a"}, - {file = "msgpack-1.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ab2f3331cb1b54165976a9d976cb251a83183631c88076613c6c780f0d6e45a"}, - {file = "msgpack-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28592e20bbb1620848256ebc105fc420436af59515793ed27d5c77a217477705"}, - {file = "msgpack-1.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe5c63197c55bce6385d9aee16c4d0641684628f63ace85f73571e65ad1c1e8d"}, - {file = "msgpack-1.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ed40e926fa2f297e8a653c954b732f125ef97bdd4c889f243182299de27e2aa9"}, - {file = "msgpack-1.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b2de4c1c0538dcb7010902a2b97f4e00fc4ddf2c8cda9749af0e594d3b7fa3d7"}, - {file = "msgpack-1.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bf22a83f973b50f9d38e55c6aade04c41ddda19b00c4ebc558930d78eecc64ed"}, - {file = "msgpack-1.0.5-cp311-cp311-win32.whl", hash = "sha256:c396e2cc213d12ce017b686e0f53497f94f8ba2b24799c25d913d46c08ec422c"}, - {file = "msgpack-1.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:6c4c68d87497f66f96d50142a2b73b97972130d93677ce930718f68828b382e2"}, - {file = "msgpack-1.0.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a2b031c2e9b9af485d5e3c4520f4220d74f4d222a5b8dc8c1a3ab9448ca79c57"}, - {file = "msgpack-1.0.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f837b93669ce4336e24d08286c38761132bc7ab29782727f8557e1eb21b2080"}, - {file = "msgpack-1.0.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1d46dfe3832660f53b13b925d4e0fa1432b00f5f7210eb3ad3bb9a13c6204a6"}, - {file = "msgpack-1.0.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:366c9a7b9057e1547f4ad51d8facad8b406bab69c7d72c0eb6f529cf76d4b85f"}, - {file = "msgpack-1.0.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:4c075728a1095efd0634a7dccb06204919a2f67d1893b6aa8e00497258bf926c"}, - {file = "msgpack-1.0.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:f933bbda5a3ee63b8834179096923b094b76f0c7a73c1cfe8f07ad608c58844b"}, - {file = "msgpack-1.0.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:36961b0568c36027c76e2ae3ca1132e35123dcec0706c4b7992683cc26c1320c"}, - {file = "msgpack-1.0.5-cp36-cp36m-win32.whl", hash = "sha256:b5ef2f015b95f912c2fcab19c36814963b5463f1fb9049846994b007962743e9"}, - {file = "msgpack-1.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:288e32b47e67f7b171f86b030e527e302c91bd3f40fd9033483f2cacc37f327a"}, - {file = "msgpack-1.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:137850656634abddfb88236008339fdaba3178f4751b28f270d2ebe77a563b6c"}, - {file = "msgpack-1.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c05a4a96585525916b109bb85f8cb6511db1c6f5b9d9cbcbc940dc6b4be944b"}, - {file = "msgpack-1.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56a62ec00b636583e5cb6ad313bbed36bb7ead5fa3a3e38938503142c72cba4f"}, - {file = "msgpack-1.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef8108f8dedf204bb7b42994abf93882da1159728a2d4c5e82012edd92c9da9f"}, - {file = "msgpack-1.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1835c84d65f46900920b3708f5ba829fb19b1096c1800ad60bae8418652a951d"}, - {file = "msgpack-1.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:e57916ef1bd0fee4f21c4600e9d1da352d8816b52a599c46460e93a6e9f17086"}, - {file = "msgpack-1.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:17358523b85973e5f242ad74aa4712b7ee560715562554aa2134d96e7aa4cbbf"}, - {file = "msgpack-1.0.5-cp37-cp37m-win32.whl", hash = "sha256:cb5aaa8c17760909ec6cb15e744c3ebc2ca8918e727216e79607b7bbce9c8f77"}, - {file = "msgpack-1.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:ab31e908d8424d55601ad7075e471b7d0140d4d3dd3272daf39c5c19d936bd82"}, - {file = "msgpack-1.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b72d0698f86e8d9ddf9442bdedec15b71df3598199ba33322d9711a19f08145c"}, - {file = "msgpack-1.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:379026812e49258016dd84ad79ac8446922234d498058ae1d415f04b522d5b2d"}, - {file = "msgpack-1.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:332360ff25469c346a1c5e47cbe2a725517919892eda5cfaffe6046656f0b7bb"}, - {file = "msgpack-1.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:476a8fe8fae289fdf273d6d2a6cb6e35b5a58541693e8f9f019bfe990a51e4ba"}, - {file = "msgpack-1.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9985b214f33311df47e274eb788a5893a761d025e2b92c723ba4c63936b69b1"}, - {file = "msgpack-1.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48296af57cdb1d885843afd73c4656be5c76c0c6328db3440c9601a98f303d87"}, - {file = "msgpack-1.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:addab7e2e1fcc04bd08e4eb631c2a90960c340e40dfc4a5e24d2ff0d5a3b3edb"}, - {file = "msgpack-1.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:916723458c25dfb77ff07f4c66aed34e47503b2eb3188b3adbec8d8aa6e00f48"}, - {file = "msgpack-1.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:821c7e677cc6acf0fd3f7ac664c98803827ae6de594a9f99563e48c5a2f27eb0"}, - {file = "msgpack-1.0.5-cp38-cp38-win32.whl", hash = "sha256:1c0f7c47f0087ffda62961d425e4407961a7ffd2aa004c81b9c07d9269512f6e"}, - {file = "msgpack-1.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:bae7de2026cbfe3782c8b78b0db9cbfc5455e079f1937cb0ab8d133496ac55e1"}, - {file = "msgpack-1.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:20c784e66b613c7f16f632e7b5e8a1651aa5702463d61394671ba07b2fc9e025"}, - {file = "msgpack-1.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:266fa4202c0eb94d26822d9bfd7af25d1e2c088927fe8de9033d929dd5ba24c5"}, - {file = "msgpack-1.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18334484eafc2b1aa47a6d42427da7fa8f2ab3d60b674120bce7a895a0a85bdd"}, - {file = "msgpack-1.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57e1f3528bd95cc44684beda696f74d3aaa8a5e58c816214b9046512240ef437"}, - {file = "msgpack-1.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:586d0d636f9a628ddc6a17bfd45aa5b5efaf1606d2b60fa5d87b8986326e933f"}, - {file = "msgpack-1.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a740fa0e4087a734455f0fc3abf5e746004c9da72fbd541e9b113013c8dc3282"}, - {file = "msgpack-1.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:3055b0455e45810820db1f29d900bf39466df96ddca11dfa6d074fa47054376d"}, - {file = "msgpack-1.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a61215eac016f391129a013c9e46f3ab308db5f5ec9f25811e811f96962599a8"}, - {file = "msgpack-1.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:362d9655cd369b08fda06b6657a303eb7172d5279997abe094512e919cf74b11"}, - {file = "msgpack-1.0.5-cp39-cp39-win32.whl", hash = "sha256:ac9dd47af78cae935901a9a500104e2dea2e253207c924cc95de149606dc43cc"}, - {file = "msgpack-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:06f5174b5f8ed0ed919da0e62cbd4ffde676a374aba4020034da05fab67b9164"}, - {file = "msgpack-1.0.5.tar.gz", hash = "sha256:c075544284eadc5cddc70f4757331d99dcbc16b2bbd4849d15f8aae4cf36d31c"}, -] - [[package]] name = "multidict" version = "6.0.4" @@ -4130,51 +4055,6 @@ cffi = ">=1.0" [package.extras] numpy = ["numpy"] -[[package]] -name = "soxr" -version = "0.3.4" -description = "High quality, one-dimensional sample-rate conversion library" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "soxr-0.3.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b7b84126643c063d5daa203f7f9137e21734dabbd7e68c097607b2ef457e2f2e"}, - {file = "soxr-0.3.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:380d2d43871a68e8b1ef1702a0abe6f9e48ddb3933c7a303c45d67e121503e7c"}, - {file = "soxr-0.3.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4a1b4019c9972f57612482c4f85523d6e832e3d10935e2f070a9dcd334a4dcb"}, - {file = "soxr-0.3.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e613cee023b7c3f162b9da3f6b169cd7f58de345275be1fde9f19adc9cf144df"}, - {file = "soxr-0.3.4-cp310-cp310-win32.whl", hash = "sha256:182c02a7ba45a159a0dbb0a297335df2381ead03a65377b19663ea0ff720ecb7"}, - {file = "soxr-0.3.4-cp310-cp310-win_amd64.whl", hash = "sha256:1e95c96ce94524fae453b4331c9910d33f97506f99bae06d76a9c0649710619e"}, - {file = "soxr-0.3.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2678d2719e7496803983584d661eb5fddc7017154a8dda4a774407c56ff07973"}, - {file = "soxr-0.3.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11bd1396052049e6d389225a0e96a9df15f706da501c619b35d3c72ac6bc7257"}, - {file = "soxr-0.3.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e23de4dfe54ac30e583bbc9cc3feda1cd776fedce13206bc4b3115b75ecab82"}, - {file = "soxr-0.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e7396498a5f5b7d8f23b656f65c24517a6ff5bdc3ee0623ccd491036a43ea08"}, - {file = "soxr-0.3.4-cp311-cp311-win32.whl", hash = "sha256:e57e9703c2bff834cabc06800d3c11a259544891d2c24a78949f3cf2f5492cc5"}, - {file = "soxr-0.3.4-cp311-cp311-win_amd64.whl", hash = "sha256:7c8350acd7150f74261a0569b47ccb1bb4aa39b2d575860bc97cfa69aab8aead"}, - {file = "soxr-0.3.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:941f7355116fe77fe6a82938fa7799a0e466a494ebc093f676969ce32b2815b1"}, - {file = "soxr-0.3.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:00fdbf24f64d8c3fb800425c383048cb24c32defac80901cde4a57fb6ce5d431"}, - {file = "soxr-0.3.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bb6d4dc807d04c536674429e2b05ae08a1efac9815c4595e41ffd6b57c2c662"}, - {file = "soxr-0.3.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff15853895b54f1b627799c6600be1ce5f7286724e7a93e4b7b9d79e5d4166f5"}, - {file = "soxr-0.3.4-cp38-cp38-win32.whl", hash = "sha256:d858becbc1fcc7b38c3436d3276290fae09403cdcbdf1d5986a18dab7023a6c3"}, - {file = "soxr-0.3.4-cp38-cp38-win_amd64.whl", hash = "sha256:068ab4df549df5783cc1eb4eb6c94f53823b164dc27134fc621fc9f5097f38cd"}, - {file = "soxr-0.3.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20130329985f9767c8417bbd125fe138790a71802b000481c386a800e2ad2bca"}, - {file = "soxr-0.3.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:78090e97abfb326b7cf14ef37d08a17252b07d438388dcbbd82a6836a9d551b1"}, - {file = "soxr-0.3.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84e590e75b7e5dca12bf68bfb090276f34a88fbcd793781c62d47f5d7dbe525e"}, - {file = "soxr-0.3.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3479d265574b960e12bca0878baba0862c43278915e0319d84679bb4d4fcd33"}, - {file = "soxr-0.3.4-cp39-cp39-win32.whl", hash = "sha256:83de825d6a713c7b2e76d9ec3f229a58a9ed290237e7adc05d80e8b39be995a6"}, - {file = "soxr-0.3.4-cp39-cp39-win_amd64.whl", hash = "sha256:2082f88cae89de854c3e0d62f55d0cb31eb11764f5c2a28299121fb642a22472"}, - {file = "soxr-0.3.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fe8b5f92c802f1e7793c40344f5368dc6163718c9ffa82e79ee6ad779d318ac5"}, - {file = "soxr-0.3.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0063d5f9a4e1a367084f4705301e9da131cf4d2d32aa3fe0072a1245e18088f"}, - {file = "soxr-0.3.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:a680bab57adae462cdc86abcc7330beb5daa3ba5101165583eedcda88b7ba551"}, - {file = "soxr-0.3.4.tar.gz", hash = "sha256:fe68daf00e8f020977b187699903d219f9e39b9fb3d915f3f923eed8ba431449"}, -] - -[package.dependencies] -numpy = "*" - -[package.extras] -docs = ["linkify-it-py", "myst-parser", "sphinx", "sphinx-book-theme"] -test = ["pytest"] - [[package]] name = "sphinx" version = "5.3.0" @@ -4607,6 +4487,25 @@ files = [ [package.dependencies] torch = "2.0.0" +[[package]] +name = "torchcrepe" +version = "0.0.17" +description = "Pytorch implementation of CREPE pitch tracker" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "torchcrepe-0.0.17-py3-none-any.whl", hash = "sha256:c840366a0be98835107fe926d41266658689b27a8e235c30cab6e13202ec09da"}, + {file = "torchcrepe-0.0.17.tar.gz", hash = "sha256:03ad7907f384a581a2d1783cf77834bbf2cd77282c2de577df9f6f6a02bf58be"}, +] + +[package.dependencies] +librosa = "0.9.1" +resampy = "*" +scipy = "*" +torch = "*" +tqdm = "*" + [[package]] name = "tqdm" version = "4.65.0" @@ -4974,4 +4873,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "6fa2b7382f0358e9ae6d760edb98e23659568d86083263a9251c32248ba162f3" +content-hash = "fd616d54a65bd30fd875dd579a03a7f44119f9e5698823d7d3f074b15e2df62c" diff --git a/pyproject.toml b/pyproject.toml index c0b2e927..1b884f41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ fairseq = "*" flask = "*" flask_cors = "*" gradio = "*" -numpy = ">=1.23" +numpy = "^1.23" pydub = "*" pyworld = "*" requests = "*" @@ -51,16 +51,17 @@ praat-parselmouth = "*" onnx = "*" onnxsim = "*" onnxoptimizer = "*" -torch = "*" -torchaudio = "*" +torch = ">=1.12" +torchaudio = ">=0.12" tensorboard = "*" rich = "*" tqdm-joblib = "*" tensorboardx = "*" pyinputplus = "*" -cm-time = "^0.1.2" +cm-time = ">=0.1.2" pysimplegui = ">=4.6" -pebble = "^5.0.3" +pebble = ">=5.0" +torchcrepe = ">=0.0.17" [tool.poetry.group.dev.dependencies] pre-commit = ">=3" diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py index 0a38cfba..b247ccb6 100644 --- a/src/so_vits_svc_fork/__main__.py +++ b/src/so_vits_svc_fork/__main__.py @@ -268,6 +268,13 @@ def infer( @click.option( "-db", "--db-thresh", type=int, default=-30, help="threshold (DB) (ABSOLUTE)" ) +@click.option( + "-fm", + "--f0-method", + type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]), + default="crepe", + help="f0 prediction method", +) @click.option("-p", "--pad-seconds", type=float, default=0.02, help="pad seconds") @click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds") @click.option( @@ -300,6 +307,7 @@ def vc( auto_predict_f0: bool, cluster_infer_ratio: float, noise_scale: float, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"], # slice config db_thresh: int, pad_seconds: float, @@ -333,19 +341,24 @@ def vc( LOG.info(f"Since model_path is a directory, use {model_path}") realtime( + # paths model_path=model_path, config_path=config_path, + # svc config speaker=speaker, cluster_model_path=cluster_model_path, transpose=transpose, auto_predict_f0=auto_predict_f0, cluster_infer_ratio=cluster_infer_ratio, noise_scale=noise_scale, - crossfade_seconds=crossfade_seconds, - block_seconds=block_seconds, - chunk_seconds=chunk_seconds, + f0_method=f0_method, + # slice config db_thresh=db_thresh, pad_seconds=pad_seconds, + chunk_seconds=chunk_seconds, + # realtime config + crossfade_seconds=crossfade_seconds, + block_seconds=block_seconds, version=version, input_device=input_device, output_device=output_device, @@ -446,13 +459,23 @@ def pre_config( ) @click.option( "-f", - "--force_rebuild", + "--force-rebuild", type=bool, default=True, help="force rebuild existing preprocessed files", ) +@click.option( + "-fm", + "--f0-method", + type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]), + default="crepe", +) def pre_hubert( - input_dir: Path, config_path: Path, n_jobs: bool, force_rebuild: bool + input_dir: Path, + config_path: Path, + n_jobs: bool, + force_rebuild: bool, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"], ) -> None: """Preprocessing part 3: hubert If the HuBERT model is not found, it will be downloaded automatically.""" @@ -465,6 +488,7 @@ def pre_hubert( config_path=config_path, n_jobs=n_jobs, force_rebuild=force_rebuild, + f0_method=f0_method, ) diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py index 3586a3a7..3d529f9a 100644 --- a/src/so_vits_svc_fork/gui.py +++ b/src/so_vits_svc_fork/gui.py @@ -125,6 +125,14 @@ def main(): text="Auto predict F0 (Pitch may become unstable when turned on in real-time inference.)", ) ], + [ + sg.Text("F0 prediction method"), + sg.Combo( + ["crepe", "parselmouth", "dio", "harvest"], + key="f0_method", + default_value="crepe", + ), + ], [ sg.Text("Cluster infer ratio"), sg.Push(), @@ -350,6 +358,7 @@ def update_combo() -> None: auto_predict_f0=values["auto_predict_f0"], cluster_infer_ratio=values["cluster_infer_ratio"], noise_scale=values["noise_scale"], + f0_method=values["f0_method"], crossfade_seconds=values["crossfade_seconds"], db_thresh=values["silence_threshold"], pad_seconds=values["pad_seconds"], diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py index 808aa961..e2a2589e 100644 --- a/src/so_vits_svc_fork/inference/infer_tool.py +++ b/src/so_vits_svc_fork/inference/infer_tool.py @@ -3,7 +3,7 @@ from copy import deepcopy from logging import getLogger from pathlib import Path -from typing import Any, Callable, Iterable +from typing import Any, Callable, Iterable, Literal import attrs import librosa @@ -121,13 +121,17 @@ def load_model(self): def get_unit_f0( self, - audio: np.ndarray[Any, np.dtype[np.float64]], + audio: ndarray[Any, dtype[float32]], tran: int, cluster_infer_ratio: float, speaker: int | str, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", ): - f0 = utils.compute_f0_parselmouth( - audio, sampling_rate=self.target_sample, hop_length=self.hop_size + f0 = utils.compute_f0( + audio, + sampling_rate=self.target_sample, + hop_length=self.hop_size, + method=f0_method, ) f0, uv = utils.interpolate_f0(f0) f0 = torch.FloatTensor(f0) @@ -161,6 +165,7 @@ def infer( cluster_infer_ratio: float = 0, auto_predict_f0: bool = False, noise_scale: float = 0.4, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", ) -> tuple[torch.Tensor, int]: audio = audio.astype(np.float32) # get speaker id @@ -180,7 +185,9 @@ def infer( sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0) # get unit f0 - c, f0, uv = self.get_unit_f0(audio, transpose, cluster_infer_ratio, speaker) + c, f0, uv = self.get_unit_f0( + audio, transpose, cluster_infer_ratio, speaker, f0_method + ) if "half" in self.net_g_path and torch.cuda.is_available(): c = c.half() @@ -215,6 +222,7 @@ def infer_silence( auto_predict_f0: bool = False, cluster_infer_ratio: float = 0, noise_scale: float = 0.4, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", # slice config db_thresh: int = -40, pad_seconds: float = 0.5, @@ -260,6 +268,7 @@ def infer_silence( cluster_infer_ratio=cluster_infer_ratio, auto_predict_f0=auto_predict_f0, noise_scale=noise_scale, + f0_method=f0_method, ) audio_chunk_pad_infer = audio_chunk_pad_infer_tensor.cpu().numpy() pad_len = int(self.target_sample * pad_seconds) @@ -359,6 +368,7 @@ def infer( cluster_infer_ratio: float = 0, auto_predict_f0: bool = False, noise_scale: float = 0.4, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", # slice config db_thresh: int = -40, pad_seconds: float = 0.5, @@ -373,6 +383,7 @@ def infer( cluster_infer_ratio=cluster_infer_ratio, auto_predict_f0=auto_predict_f0, noise_scale=noise_scale, + f0_method=f0_method, db_thresh=db_thresh, pad_seconds=pad_seconds, chunk_seconds=chunk_seconds, @@ -393,6 +404,7 @@ def infer( cluster_infer_ratio=cluster_infer_ratio, auto_predict_f0=auto_predict_f0, noise_scale=noise_scale, + f0_method=f0_method, ) return infered_audio_c.cpu().numpy() @@ -414,6 +426,7 @@ def process( cluster_infer_ratio: float = 0, auto_predict_f0: bool = False, noise_scale: float = 0.4, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", # slice config db_thresh: int = -40, chunk_seconds: float = 0.5, @@ -426,6 +439,7 @@ def infer(audio: ndarray[Any, dtype[float32]]) -> ndarray[Any, dtype[float32]]: cluster_infer_ratio=cluster_infer_ratio, auto_predict_f0=auto_predict_f0, noise_scale=noise_scale, + f0_method=f0_method, ) return infered_audio_c.cpu().numpy() diff --git a/src/so_vits_svc_fork/inference_main.py b/src/so_vits_svc_fork/inference_main.py index 40a6f0b0..c7ad8eab 100644 --- a/src/so_vits_svc_fork/inference_main.py +++ b/src/so_vits_svc_fork/inference_main.py @@ -22,12 +22,13 @@ def infer( model_path: Path | str, config_path: Path | str, # svc config - speaker: str, + speaker: int | str, cluster_model_path: Path | str | None = None, transpose: int = 0, auto_predict_f0: bool = False, cluster_infer_ratio: float = 0, noise_scale: float = 0.4, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", # slice config db_thresh: int = -40, pad_seconds: float = 0.5, @@ -51,14 +52,15 @@ def infer( audio, _ = librosa.load(input_path, sr=svc_model.target_sample) audio = svc_model.infer_silence( - audio, + audio.astype(np.float32), speaker=speaker, - db_thresh=db_thresh, - pad_seconds=pad_seconds, transpose=transpose, auto_predict_f0=auto_predict_f0, cluster_infer_ratio=cluster_infer_ratio, noise_scale=noise_scale, + f0_method=f0_method, + db_thresh=db_thresh, + pad_seconds=pad_seconds, chunk_seconds=chunk_seconds, absolute_thresh=absolute_thresh, ) @@ -78,6 +80,7 @@ def realtime( auto_predict_f0: bool = False, cluster_infer_ratio: float = 0, noise_scale: float = 0.4, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", # slice config db_thresh: int = -40, pad_seconds: float = 0.5, @@ -154,13 +157,17 @@ def callback( ) kwargs = dict( - input_audio=indata.mean(axis=1), + input_audio=indata.mean(axis=1).astype(np.float32), + # svc config speaker=speaker, transpose=transpose, auto_predict_f0=auto_predict_f0, - noise_scale=noise_scale, cluster_infer_ratio=cluster_infer_ratio, + noise_scale=noise_scale, + f0_method=f0_method, + # slice config db_thresh=db_thresh, + # pad_seconds=pad_seconds, chunk_seconds=chunk_seconds, ) if version == 1: diff --git a/src/so_vits_svc_fork/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocess_hubert_f0.py index 885c91dc..f7b60ab4 100644 --- a/src/so_vits_svc_fork/preprocess_hubert_f0.py +++ b/src/so_vits_svc_fork/preprocess_hubert_f0.py @@ -23,9 +23,12 @@ def _process_one( sampling_rate: int, hop_length: int, device: Literal["cuda", "cpu"] = "cuda", + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", force_rebuild: bool = False, ): wav, sr = librosa.load(filepath, sr=sampling_rate) + + # Compute HuBERT content soft_path = filepath.parent / (filepath.name + ".soft.pt") if not soft_path.exists() or force_rebuild: wav16k = librosa.resample( @@ -36,10 +39,12 @@ def _process_one( torch.save(c.cpu(), soft_path) else: LOG.info(f"Skip {filepath} because {soft_path} exists.") + + # Compute f0 f0_path = filepath.parent / (filepath.name + ".f0.npy") if not f0_path.exists() or force_rebuild: - f0 = utils.compute_f0_dio( - wav, sampling_rate=sampling_rate, hop_length=hop_length + f0 = utils.compute_f0( + wav, sampling_rate=sampling_rate, hop_length=hop_length, method=f0_method ) np.save(f0_path, f0) else: @@ -52,6 +57,7 @@ def _process_batch( sampling_rate: int, hop_length: int, pbar_position: int, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", force_rebuild: bool = False, ): device = "cuda" if torch.cuda.is_available() else "cpu" @@ -59,7 +65,13 @@ def _process_batch( for filepath in tqdm(filepaths, position=pbar_position): _process_one( - filepath, hubert_model, sampling_rate, hop_length, device, force_rebuild + filepath, + hubert_model, + sampling_rate, + hop_length, + device, + f0_method, + force_rebuild, ) @@ -67,6 +79,7 @@ def preprocess_hubert_f0( input_dir: Path | str, config_path: Path | str, n_jobs: int = 4, + f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", force_rebuild: bool = False, ): input_dir = Path(input_dir) @@ -82,7 +95,7 @@ def preprocess_hubert_f0( filepath_chunks = np.array_split(filepaths, n_jobs) Parallel(n_jobs=n_jobs)( delayed(_process_batch)( - chunk, sampling_rate, hop_length, pbar_position, force_rebuild + chunk, sampling_rate, hop_length, pbar_position, f0_method, force_rebuild ) for (pbar_position, chunk) in enumerate(filepath_chunks) ) diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py index d91864eb..89e96504 100644 --- a/src/so_vits_svc_fork/utils.py +++ b/src/so_vits_svc_fork/utils.py @@ -5,11 +5,12 @@ from itertools import groupby from logging import getLogger from pathlib import Path -from typing import Any +from typing import Any, Literal import numpy as np import requests import torch +import torchcrepe from numpy import dtype, float32, ndarray from scipy.io.wavfile import read from torch import FloatTensor @@ -122,7 +123,12 @@ def interpolate_f0( return ip_data[:, 0], vuv_vector[:, 0] -def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512): +def compute_f0_parselmouth( + wav_numpy: ndarray[Any, dtype[float32]], + p_len: None | int = None, + sampling_rate: int = 44100, + hop_length: int = 512, +): import parselmouth x = wav_numpy @@ -150,7 +156,7 @@ def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_lengt return f0 -def resize_f0( +def _resize_f0( x: ndarray[Any, dtype[float32]], target_len: int ) -> ndarray[Any, dtype[float32]]: source = np.array(x) @@ -164,26 +170,88 @@ def resize_f0( return res -def compute_f0_dio( +def compute_f0_pyworld( wav_numpy: ndarray[Any, dtype[float32]], p_len: None | int = None, sampling_rate: int = 44100, hop_length: int = 512, + type_: Literal["dio", "harvest"] = "dio", ): import pyworld if p_len is None: p_len = wav_numpy.shape[0] // hop_length - f0, t = pyworld.dio( - wav_numpy.astype(np.double), - fs=sampling_rate, - f0_ceil=800, - frame_period=1000 * hop_length / sampling_rate, - ) + if type_ == "dio": + f0, t = pyworld.dio( + wav_numpy.astype(np.double), + fs=sampling_rate, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=1000 * hop_length / sampling_rate, + ) + elif type_ == "harvest": + f0, t = pyworld.harvest( + wav_numpy.astype(np.double), + fs=sampling_rate, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=1000 * hop_length / sampling_rate, + ) f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) - return resize_f0(f0, p_len) + return _resize_f0(f0, p_len) + + +def compute_f0_crepe( + wav_numpy: ndarray[Any, dtype[float32]], + p_len: None | int = None, + sampling_rate: int = 44100, + hop_length: int = 512, + device: str = "cuda" if torch.cuda.is_available() else "cpu", + model: Literal["full", "tiny"] = "full", +): + audio = torch.from_numpy(wav_numpy).to(device, copy=True) + audio = torch.unsqueeze(audio, dim=0) + + if audio.ndim == 2 and audio.shape[0] > 1: + audio = torch.mean(audio, dim=0, keepdim=True).detach() + # (T) -> (1, T) + audio = audio.detach() + + pitch = torchcrepe.predict( + audio, + sampling_rate, + hop_length, + f0_min, + f0_max, + model, + batch_size=hop_length * 2, + device=device, + pad=True, + ) + + return pitch.detach().cpu().numpy()[0] + + +def compute_f0( + wav_numpy: ndarray[Any, dtype[float32]], + p_len: None | int = None, + sampling_rate: int = 44100, + hop_length: int = 512, + method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe", + **kwargs, +): + wav_numpy = wav_numpy.astype(np.float32) + wav_numpy /= np.quantile(np.abs(wav_numpy), 0.999) + if method in ["dio", "harvest"]: + return compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method) + elif method == "crepe": + return compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs) + elif method == "parselmouth": + return compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length) + else: + raise ValueError("type must be dio, crepe, harvest or parselmouth") def f0_to_coarse(f0: torch.Tensor | float):