From 6b3b20dfd609d81cb1184b7c8e8865a58f8d45f9 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Mon, 20 Mar 2023 13:59:47 +0900
Subject: [PATCH] feat: add more f0 calculation methods (#39)

---
 README.md                                    |  11 +-
 poetry.lock                                  | 171 ++++---------------
 pyproject.toml                               |  11 +-
 src/so_vits_svc_fork/__main__.py             |  34 +++-
 src/so_vits_svc_fork/gui.py                  |   9 +
 src/so_vits_svc_fork/inference/infer_tool.py |  24 ++-
 src/so_vits_svc_fork/inference_main.py       |  19 ++-
 src/so_vits_svc_fork/preprocess_hubert_f0.py |  21 ++-
 src/so_vits_svc_fork/utils.py                |  90 ++++++++--
 9 files changed, 217 insertions(+), 173 deletions(-)
diff --git a/README.md b/README.md
index bb8b7812..a515ce62 100644
--- a/README.md
+++ b/README.md
@@ -44,11 +44,13 @@ pip install so-vits-svc-fork
 ## Features not available in the original repo
 
 - **Realtime voice conversion**
+- More accurate pitch estimation using CREPE
 - GUI available
 - Unified command-line interface (no need to run Python scripts)
 - Ready to use just by installing with `pip`.
 - Automatically download pretrained base model and HuBERT model
 - Code completely formatted with black, isort, autoflake etc.
+- Volume normalization in preprocessing
 - Other minor differences
 
 ## Usage
@@ -79,6 +81,10 @@ svc vc --model-path <model-path>
 svc --model-path <model-path> source.wav
 ```
 
+#### Notes
+
+- In real-time inference, if the input contains noise, the HuBERT model will react to it as well. In that case, consider using a real-time noise reduction application such as [RTX Voice](https://www.nvidia.com/en-us/geforce/guides/nvidia-rtx-voice-setup-guide/).
+
 ### Training
 
 #### Google Colab
@@ -96,7 +102,10 @@ svc pre-hubert
 svc train
 ```
 
-It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB.
+#### Notes
+
+- Each audio file in the dataset should be no longer than roughly 10 seconds; otherwise VRAM will run out.
+- It is recommended to change the batch_size in `config.json` before the `train` command to match the VRAM capacity. As tested, the default requires about 14 GB.
 
 ### Further help
 
diff --git a/poetry.lock b/poetry.lock
index 1e0992d7..5bdbc8b2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -1697,35 +1697,33 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"]
 
 [[package]]
 name = "librosa"
-version = "0.10.0"
+version = "0.9.1"
 description = "Python module for audio and music processing"
 category = "main"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.6"
 files = [
-    {file = "librosa-0.10.0-py3-none-any.whl", hash = "sha256:6db29c1467168da21313203dcef405a73a678d3aad0fbc67607250b2f08a3f5a"},
-    {file = "librosa-0.10.0.tar.gz", hash = "sha256:8e8669e5084002d1a87f6c82b732f370784a368d0e55c2dd7d7aef3fa02fd058"},
+    {file = "librosa-0.9.1-py3-none-any.whl", hash = "sha256:c2bb61a8008367cca89a3f1dad352d8e55fe5ca5f7414fb5d5258eb52765db33"},
+    {file = "librosa-0.9.1.tar.gz", hash = "sha256:7ed5d6e3f4546e5e3c2840691f9ddc56878f914a35a50060df5fca2b26d4b614"},
 ]
 
 [package.dependencies]
-audioread = ">=2.1.9"
-decorator = ">=4.3.0"
+audioread = ">=2.1.5"
+decorator = ">=4.0.10"
 joblib = ">=0.14"
-lazy-loader = ">=0.1"
-msgpack = ">=1.0"
-numba = ">=0.51.0"
-numpy = ">=1.20.3"
+numba = ">=0.45.1"
+numpy = ">=1.17.0"
+packaging = ">=20.0"
 pooch = ">=1.0"
-scikit-learn = ">=0.20.0"
+resampy = ">=0.2.2"
+scikit-learn = ">=0.19.1"
 scipy = ">=1.2.0"
-soundfile = ">=0.12.1"
-soxr = ">=0.3.2"
-typing-extensions = ">=4.1.1"
+soundfile = ">=0.10.2"
 
 [package.extras]
 display = ["matplotlib (>=3.3.0)"]
-docs = ["ipython (>=7.0)", "matplotlib (>=3.3.0)", "mir-eval (>=0.5)", "numba (>=0.51)", "numpydoc", "presets", "sphinx (!=1.3.1,<6)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (>=1.0.0,<2.0.0)", "sphinxcontrib-svg2pdfconverter"]
-tests = ["matplotlib (>=3.3.0)", "packaging (>=20.0)", "pytest", "pytest-cov", "pytest-mpl", "resampy (>=0.2.2)", "samplerate", "types-decorator"]
+docs = ["ipython (>=7.0)", "matplotlib (>=3.3.0)", "mir-eval (>=0.5)", "numba (<0.50)", "numpydoc", "presets", "sphinx (!=1.3.1)", "sphinx-gallery (>=0.7)", "sphinx-multiversion (>=0.2.3)", "sphinx-rtd-theme (>=0.5.0,<0.6.0)", "sphinxcontrib-svg2pdfconverter"]
+tests = ["contextlib2", "matplotlib (>=3.3.0)", "pytest", "pytest-cov", "pytest-mpl", "samplerate", "soxr"]
 
 [[package]]
 name = "linkify-it-py"
@@ -2107,79 +2105,6 @@ docs = ["sphinx"]
 gmpy = ["gmpy2 (>=2.1.0a4)"]
 tests = ["pytest (>=4.6)"]
 
-[[package]]
-name = "msgpack"
-version = "1.0.5"
-description = "MessagePack serializer"
-category = "main"
-optional = false
-python-versions = "*"
-files = [
-    {file = "msgpack-1.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:525228efd79bb831cf6830a732e2e80bc1b05436b086d4264814b4b2955b2fa9"},
-    {file = "msgpack-1.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4f8d8b3bf1ff2672567d6b5c725a1b347fe838b912772aa8ae2bf70338d5a198"},
-    {file = "msgpack-1.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdc793c50be3f01106245a61b739328f7dccc2c648b501e237f0699fe1395b81"},
-    {file = "msgpack-1.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cb47c21a8a65b165ce29f2bec852790cbc04936f502966768e4aae9fa763cb7"},
-    {file = "msgpack-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e42b9594cc3bf4d838d67d6ed62b9e59e201862a25e9a157019e171fbe672dd3"},
-    {file = "msgpack-1.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55b56a24893105dc52c1253649b60f475f36b3aa0fc66115bffafb624d7cb30b"},
-    {file = "msgpack-1.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1967f6129fc50a43bfe0951c35acbb729be89a55d849fab7686004da85103f1c"},
-    {file = "msgpack-1.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20a97bf595a232c3ee6d57ddaadd5453d174a52594bf9c21d10407e2a2d9b3bd"},
-    {file = "msgpack-1.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d25dd59bbbbb996eacf7be6b4ad082ed7eacc4e8f3d2df1ba43822da9bfa122a"},
-    {file = "msgpack-1.0.5-cp310-cp310-win32.whl", hash = "sha256:382b2c77589331f2cb80b67cc058c00f225e19827dbc818d700f61513ab47bea"},
-    {file = "msgpack-1.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:4867aa2df9e2a5fa5f76d7d5565d25ec76e84c106b55509e78c1ede0f152659a"},
-    {file = "msgpack-1.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9f5ae84c5c8a857ec44dc180a8b0cc08238e021f57abdf51a8182e915e6299f0"},
-    {file = "msgpack-1.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9e6ca5d5699bcd89ae605c150aee83b5321f2115695e741b99618f4856c50898"},
-    {file = "msgpack-1.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5494ea30d517a3576749cad32fa27f7585c65f5f38309c88c6d137877fa28a5a"},
-    {file = "msgpack-1.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ab2f3331cb1b54165976a9d976cb251a83183631c88076613c6c780f0d6e45a"},
-    {file = "msgpack-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28592e20bbb1620848256ebc105fc420436af59515793ed27d5c77a217477705"},
-    {file = "msgpack-1.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe5c63197c55bce6385d9aee16c4d0641684628f63ace85f73571e65ad1c1e8d"},
-    {file = "msgpack-1.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ed40e926fa2f297e8a653c954b732f125ef97bdd4c889f243182299de27e2aa9"},
-    {file = "msgpack-1.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b2de4c1c0538dcb7010902a2b97f4e00fc4ddf2c8cda9749af0e594d3b7fa3d7"},
-    {file = "msgpack-1.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bf22a83f973b50f9d38e55c6aade04c41ddda19b00c4ebc558930d78eecc64ed"},
-    {file = "msgpack-1.0.5-cp311-cp311-win32.whl", hash = "sha256:c396e2cc213d12ce017b686e0f53497f94f8ba2b24799c25d913d46c08ec422c"},
-    {file = "msgpack-1.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:6c4c68d87497f66f96d50142a2b73b97972130d93677ce930718f68828b382e2"},
-    {file = "msgpack-1.0.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a2b031c2e9b9af485d5e3c4520f4220d74f4d222a5b8dc8c1a3ab9448ca79c57"},
-    {file = "msgpack-1.0.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f837b93669ce4336e24d08286c38761132bc7ab29782727f8557e1eb21b2080"},
-    {file = "msgpack-1.0.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1d46dfe3832660f53b13b925d4e0fa1432b00f5f7210eb3ad3bb9a13c6204a6"},
-    {file = "msgpack-1.0.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:366c9a7b9057e1547f4ad51d8facad8b406bab69c7d72c0eb6f529cf76d4b85f"},
-    {file = "msgpack-1.0.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:4c075728a1095efd0634a7dccb06204919a2f67d1893b6aa8e00497258bf926c"},
-    {file = "msgpack-1.0.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:f933bbda5a3ee63b8834179096923b094b76f0c7a73c1cfe8f07ad608c58844b"},
-    {file = "msgpack-1.0.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:36961b0568c36027c76e2ae3ca1132e35123dcec0706c4b7992683cc26c1320c"},
-    {file = "msgpack-1.0.5-cp36-cp36m-win32.whl", hash = "sha256:b5ef2f015b95f912c2fcab19c36814963b5463f1fb9049846994b007962743e9"},
-    {file = "msgpack-1.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:288e32b47e67f7b171f86b030e527e302c91bd3f40fd9033483f2cacc37f327a"},
-    {file = "msgpack-1.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:137850656634abddfb88236008339fdaba3178f4751b28f270d2ebe77a563b6c"},
-    {file = "msgpack-1.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c05a4a96585525916b109bb85f8cb6511db1c6f5b9d9cbcbc940dc6b4be944b"},
-    {file = "msgpack-1.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56a62ec00b636583e5cb6ad313bbed36bb7ead5fa3a3e38938503142c72cba4f"},
-    {file = "msgpack-1.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef8108f8dedf204bb7b42994abf93882da1159728a2d4c5e82012edd92c9da9f"},
-    {file = "msgpack-1.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1835c84d65f46900920b3708f5ba829fb19b1096c1800ad60bae8418652a951d"},
-    {file = "msgpack-1.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:e57916ef1bd0fee4f21c4600e9d1da352d8816b52a599c46460e93a6e9f17086"},
-    {file = "msgpack-1.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:17358523b85973e5f242ad74aa4712b7ee560715562554aa2134d96e7aa4cbbf"},
-    {file = "msgpack-1.0.5-cp37-cp37m-win32.whl", hash = "sha256:cb5aaa8c17760909ec6cb15e744c3ebc2ca8918e727216e79607b7bbce9c8f77"},
-    {file = "msgpack-1.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:ab31e908d8424d55601ad7075e471b7d0140d4d3dd3272daf39c5c19d936bd82"},
-    {file = "msgpack-1.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b72d0698f86e8d9ddf9442bdedec15b71df3598199ba33322d9711a19f08145c"},
-    {file = "msgpack-1.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:379026812e49258016dd84ad79ac8446922234d498058ae1d415f04b522d5b2d"},
-    {file = "msgpack-1.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:332360ff25469c346a1c5e47cbe2a725517919892eda5cfaffe6046656f0b7bb"},
-    {file = "msgpack-1.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:476a8fe8fae289fdf273d6d2a6cb6e35b5a58541693e8f9f019bfe990a51e4ba"},
-    {file = "msgpack-1.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9985b214f33311df47e274eb788a5893a761d025e2b92c723ba4c63936b69b1"},
-    {file = "msgpack-1.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48296af57cdb1d885843afd73c4656be5c76c0c6328db3440c9601a98f303d87"},
-    {file = "msgpack-1.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:addab7e2e1fcc04bd08e4eb631c2a90960c340e40dfc4a5e24d2ff0d5a3b3edb"},
-    {file = "msgpack-1.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:916723458c25dfb77ff07f4c66aed34e47503b2eb3188b3adbec8d8aa6e00f48"},
-    {file = "msgpack-1.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:821c7e677cc6acf0fd3f7ac664c98803827ae6de594a9f99563e48c5a2f27eb0"},
-    {file = "msgpack-1.0.5-cp38-cp38-win32.whl", hash = "sha256:1c0f7c47f0087ffda62961d425e4407961a7ffd2aa004c81b9c07d9269512f6e"},
-    {file = "msgpack-1.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:bae7de2026cbfe3782c8b78b0db9cbfc5455e079f1937cb0ab8d133496ac55e1"},
-    {file = "msgpack-1.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:20c784e66b613c7f16f632e7b5e8a1651aa5702463d61394671ba07b2fc9e025"},
-    {file = "msgpack-1.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:266fa4202c0eb94d26822d9bfd7af25d1e2c088927fe8de9033d929dd5ba24c5"},
-    {file = "msgpack-1.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18334484eafc2b1aa47a6d42427da7fa8f2ab3d60b674120bce7a895a0a85bdd"},
-    {file = "msgpack-1.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57e1f3528bd95cc44684beda696f74d3aaa8a5e58c816214b9046512240ef437"},
-    {file = "msgpack-1.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:586d0d636f9a628ddc6a17bfd45aa5b5efaf1606d2b60fa5d87b8986326e933f"},
-    {file = "msgpack-1.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a740fa0e4087a734455f0fc3abf5e746004c9da72fbd541e9b113013c8dc3282"},
-    {file = "msgpack-1.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:3055b0455e45810820db1f29d900bf39466df96ddca11dfa6d074fa47054376d"},
-    {file = "msgpack-1.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a61215eac016f391129a013c9e46f3ab308db5f5ec9f25811e811f96962599a8"},
-    {file = "msgpack-1.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:362d9655cd369b08fda06b6657a303eb7172d5279997abe094512e919cf74b11"},
-    {file = "msgpack-1.0.5-cp39-cp39-win32.whl", hash = "sha256:ac9dd47af78cae935901a9a500104e2dea2e253207c924cc95de149606dc43cc"},
-    {file = "msgpack-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:06f5174b5f8ed0ed919da0e62cbd4ffde676a374aba4020034da05fab67b9164"},
-    {file = "msgpack-1.0.5.tar.gz", hash = "sha256:c075544284eadc5cddc70f4757331d99dcbc16b2bbd4849d15f8aae4cf36d31c"},
-]
-
 [[package]]
 name = "multidict"
 version = "6.0.4"
@@ -4130,51 +4055,6 @@ cffi = ">=1.0"
 [package.extras]
 numpy = ["numpy"]
 
-[[package]]
-name = "soxr"
-version = "0.3.4"
-description = "High quality, one-dimensional sample-rate conversion library"
-category = "main"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "soxr-0.3.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b7b84126643c063d5daa203f7f9137e21734dabbd7e68c097607b2ef457e2f2e"},
-    {file = "soxr-0.3.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:380d2d43871a68e8b1ef1702a0abe6f9e48ddb3933c7a303c45d67e121503e7c"},
-    {file = "soxr-0.3.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4a1b4019c9972f57612482c4f85523d6e832e3d10935e2f070a9dcd334a4dcb"},
-    {file = "soxr-0.3.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e613cee023b7c3f162b9da3f6b169cd7f58de345275be1fde9f19adc9cf144df"},
-    {file = "soxr-0.3.4-cp310-cp310-win32.whl", hash = "sha256:182c02a7ba45a159a0dbb0a297335df2381ead03a65377b19663ea0ff720ecb7"},
-    {file = "soxr-0.3.4-cp310-cp310-win_amd64.whl", hash = "sha256:1e95c96ce94524fae453b4331c9910d33f97506f99bae06d76a9c0649710619e"},
-    {file = "soxr-0.3.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2678d2719e7496803983584d661eb5fddc7017154a8dda4a774407c56ff07973"},
-    {file = "soxr-0.3.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11bd1396052049e6d389225a0e96a9df15f706da501c619b35d3c72ac6bc7257"},
-    {file = "soxr-0.3.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e23de4dfe54ac30e583bbc9cc3feda1cd776fedce13206bc4b3115b75ecab82"},
-    {file = "soxr-0.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e7396498a5f5b7d8f23b656f65c24517a6ff5bdc3ee0623ccd491036a43ea08"},
-    {file = "soxr-0.3.4-cp311-cp311-win32.whl", hash = "sha256:e57e9703c2bff834cabc06800d3c11a259544891d2c24a78949f3cf2f5492cc5"},
-    {file = "soxr-0.3.4-cp311-cp311-win_amd64.whl", hash = "sha256:7c8350acd7150f74261a0569b47ccb1bb4aa39b2d575860bc97cfa69aab8aead"},
-    {file = "soxr-0.3.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:941f7355116fe77fe6a82938fa7799a0e466a494ebc093f676969ce32b2815b1"},
-    {file = "soxr-0.3.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:00fdbf24f64d8c3fb800425c383048cb24c32defac80901cde4a57fb6ce5d431"},
-    {file = "soxr-0.3.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bb6d4dc807d04c536674429e2b05ae08a1efac9815c4595e41ffd6b57c2c662"},
-    {file = "soxr-0.3.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff15853895b54f1b627799c6600be1ce5f7286724e7a93e4b7b9d79e5d4166f5"},
-    {file = "soxr-0.3.4-cp38-cp38-win32.whl", hash = "sha256:d858becbc1fcc7b38c3436d3276290fae09403cdcbdf1d5986a18dab7023a6c3"},
-    {file = "soxr-0.3.4-cp38-cp38-win_amd64.whl", hash = "sha256:068ab4df549df5783cc1eb4eb6c94f53823b164dc27134fc621fc9f5097f38cd"},
-    {file = "soxr-0.3.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20130329985f9767c8417bbd125fe138790a71802b000481c386a800e2ad2bca"},
-    {file = "soxr-0.3.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:78090e97abfb326b7cf14ef37d08a17252b07d438388dcbbd82a6836a9d551b1"},
-    {file = "soxr-0.3.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84e590e75b7e5dca12bf68bfb090276f34a88fbcd793781c62d47f5d7dbe525e"},
-    {file = "soxr-0.3.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3479d265574b960e12bca0878baba0862c43278915e0319d84679bb4d4fcd33"},
-    {file = "soxr-0.3.4-cp39-cp39-win32.whl", hash = "sha256:83de825d6a713c7b2e76d9ec3f229a58a9ed290237e7adc05d80e8b39be995a6"},
-    {file = "soxr-0.3.4-cp39-cp39-win_amd64.whl", hash = "sha256:2082f88cae89de854c3e0d62f55d0cb31eb11764f5c2a28299121fb642a22472"},
-    {file = "soxr-0.3.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fe8b5f92c802f1e7793c40344f5368dc6163718c9ffa82e79ee6ad779d318ac5"},
-    {file = "soxr-0.3.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0063d5f9a4e1a367084f4705301e9da131cf4d2d32aa3fe0072a1245e18088f"},
-    {file = "soxr-0.3.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:a680bab57adae462cdc86abcc7330beb5daa3ba5101165583eedcda88b7ba551"},
-    {file = "soxr-0.3.4.tar.gz", hash = "sha256:fe68daf00e8f020977b187699903d219f9e39b9fb3d915f3f923eed8ba431449"},
-]
-
-[package.dependencies]
-numpy = "*"
-
-[package.extras]
-docs = ["linkify-it-py", "myst-parser", "sphinx", "sphinx-book-theme"]
-test = ["pytest"]
-
 [[package]]
 name = "sphinx"
 version = "5.3.0"
@@ -4607,6 +4487,25 @@ files = [
 [package.dependencies]
 torch = "2.0.0"
 
+[[package]]
+name = "torchcrepe"
+version = "0.0.17"
+description = "Pytorch implementation of CREPE pitch tracker"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "torchcrepe-0.0.17-py3-none-any.whl", hash = "sha256:c840366a0be98835107fe926d41266658689b27a8e235c30cab6e13202ec09da"},
+    {file = "torchcrepe-0.0.17.tar.gz", hash = "sha256:03ad7907f384a581a2d1783cf77834bbf2cd77282c2de577df9f6f6a02bf58be"},
+]
+
+[package.dependencies]
+librosa = "0.9.1"
+resampy = "*"
+scipy = "*"
+torch = "*"
+tqdm = "*"
+
 [[package]]
 name = "tqdm"
 version = "4.65.0"
@@ -4974,4 +4873,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8,<3.11"
-content-hash = "6fa2b7382f0358e9ae6d760edb98e23659568d86083263a9251c32248ba162f3"
+content-hash = "fd616d54a65bd30fd875dd579a03a7f44119f9e5698823d7d3f074b15e2df62c"
diff --git a/pyproject.toml b/pyproject.toml
index c0b2e927..1b884f41 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ fairseq = "*"
 flask = "*"
 flask_cors = "*"
 gradio = "*"
-numpy = ">=1.23"
+numpy = "^1.23"
 pydub = "*"
 pyworld = "*"
 requests = "*"
@@ -51,16 +51,17 @@ praat-parselmouth = "*"
 onnx = "*"
 onnxsim = "*"
 onnxoptimizer = "*"
-torch = "*"
-torchaudio = "*"
+torch = ">=1.12"
+torchaudio = ">=0.12"
 tensorboard = "*"
 rich = "*"
 tqdm-joblib = "*"
 tensorboardx = "*"
 pyinputplus = "*"
-cm-time = "^0.1.2"
+cm-time = ">=0.1.2"
 pysimplegui = ">=4.6"
-pebble = "^5.0.3"
+pebble = ">=5.0"
+torchcrepe = ">=0.0.17"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=3"
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
index 0a38cfba..b247ccb6 100644
--- a/src/so_vits_svc_fork/__main__.py
+++ b/src/so_vits_svc_fork/__main__.py
@@ -268,6 +268,13 @@ def infer(
 @click.option(
     "-db", "--db-thresh", type=int, default=-30, help="threshold (DB) (ABSOLUTE)"
 )
+@click.option(
+    "-fm",
+    "--f0-method",
+    type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
+    default="crepe",
+    help="f0 prediction method",
+)
 @click.option("-p", "--pad-seconds", type=float, default=0.02, help="pad seconds")
 @click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
 @click.option(
@@ -300,6 +307,7 @@ def vc(
     auto_predict_f0: bool,
     cluster_infer_ratio: float,
     noise_scale: float,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
     # slice config
     db_thresh: int,
     pad_seconds: float,
@@ -333,19 +341,24 @@ def vc(
         LOG.info(f"Since model_path is a directory, use {model_path}")
 
     realtime(
+        # paths
         model_path=model_path,
         config_path=config_path,
+        # svc config
         speaker=speaker,
         cluster_model_path=cluster_model_path,
         transpose=transpose,
         auto_predict_f0=auto_predict_f0,
         cluster_infer_ratio=cluster_infer_ratio,
         noise_scale=noise_scale,
-        crossfade_seconds=crossfade_seconds,
-        block_seconds=block_seconds,
-        chunk_seconds=chunk_seconds,
+        f0_method=f0_method,
+        # slice config
         db_thresh=db_thresh,
         pad_seconds=pad_seconds,
+        chunk_seconds=chunk_seconds,
+        # realtime config
+        crossfade_seconds=crossfade_seconds,
+        block_seconds=block_seconds,
         version=version,
         input_device=input_device,
         output_device=output_device,
@@ -446,13 +459,23 @@ def pre_config(
 )
 @click.option(
     "-f",
-    "--force_rebuild",
+    "--force-rebuild",
     type=bool,
     default=True,
     help="force rebuild existing preprocessed files",
 )
+@click.option(
+    "-fm",
+    "--f0-method",
+    type=click.Choice(["crepe", "parselmouth", "dio", "harvest"]),
+    default="crepe",
+)
 def pre_hubert(
-    input_dir: Path, config_path: Path, n_jobs: bool, force_rebuild: bool
+    input_dir: Path,
+    config_path: Path,
+    n_jobs: bool,
+    force_rebuild: bool,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"],
 ) -> None:
     """Preprocessing part 3: hubert
     If the HuBERT model is not found, it will be downloaded automatically."""
@@ -465,6 +488,7 @@ def pre_hubert(
         config_path=config_path,
         n_jobs=n_jobs,
         force_rebuild=force_rebuild,
+        f0_method=f0_method,
     )
 
 
diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
index 3586a3a7..3d529f9a 100644
--- a/src/so_vits_svc_fork/gui.py
+++ b/src/so_vits_svc_fork/gui.py
@@ -125,6 +125,14 @@ def main():
                             text="Auto predict F0 (Pitch may become unstable when turned on in real-time inference.)",
                         )
                     ],
+                    [
+                        sg.Text("F0 prediction method"),
+                        sg.Combo(
+                            ["crepe", "parselmouth", "dio", "harvest"],
+                            key="f0_method",
+                            default_value="crepe",
+                        ),
+                    ],
                     [
                         sg.Text("Cluster infer ratio"),
                         sg.Push(),
@@ -350,6 +358,7 @@ def update_combo() -> None:
                         auto_predict_f0=values["auto_predict_f0"],
                         cluster_infer_ratio=values["cluster_infer_ratio"],
                         noise_scale=values["noise_scale"],
+                        f0_method=values["f0_method"],
                         crossfade_seconds=values["crossfade_seconds"],
                         db_thresh=values["silence_threshold"],
                         pad_seconds=values["pad_seconds"],
diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
index 808aa961..e2a2589e 100644
--- a/src/so_vits_svc_fork/inference/infer_tool.py
+++ b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -3,7 +3,7 @@
 from copy import deepcopy
 from logging import getLogger
 from pathlib import Path
-from typing import Any, Callable, Iterable
+from typing import Any, Callable, Iterable, Literal
 
 import attrs
 import librosa
@@ -121,13 +121,17 @@ def load_model(self):
 
     def get_unit_f0(
         self,
-        audio: np.ndarray[Any, np.dtype[np.float64]],
+        audio: ndarray[Any, dtype[float32]],
         tran: int,
         cluster_infer_ratio: float,
         speaker: int | str,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     ):
-        f0 = utils.compute_f0_parselmouth(
-            audio, sampling_rate=self.target_sample, hop_length=self.hop_size
+        f0 = utils.compute_f0(
+            audio,
+            sampling_rate=self.target_sample,
+            hop_length=self.hop_size,
+            method=f0_method,
         )
         f0, uv = utils.interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
@@ -161,6 +165,7 @@ def infer(
         cluster_infer_ratio: float = 0,
         auto_predict_f0: bool = False,
         noise_scale: float = 0.4,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     ) -> tuple[torch.Tensor, int]:
         audio = audio.astype(np.float32)
         # get speaker id
@@ -180,7 +185,9 @@ def infer(
         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
 
         # get unit f0
-        c, f0, uv = self.get_unit_f0(audio, transpose, cluster_infer_ratio, speaker)
+        c, f0, uv = self.get_unit_f0(
+            audio, transpose, cluster_infer_ratio, speaker, f0_method
+        )
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
 
@@ -215,6 +222,7 @@ def infer_silence(
         auto_predict_f0: bool = False,
         cluster_infer_ratio: float = 0,
         noise_scale: float = 0.4,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
         # slice config
         db_thresh: int = -40,
         pad_seconds: float = 0.5,
@@ -260,6 +268,7 @@ def infer_silence(
                     cluster_infer_ratio=cluster_infer_ratio,
                     auto_predict_f0=auto_predict_f0,
                     noise_scale=noise_scale,
+                    f0_method=f0_method,
                 )
                 audio_chunk_pad_infer = audio_chunk_pad_infer_tensor.cpu().numpy()
                 pad_len = int(self.target_sample * pad_seconds)
@@ -359,6 +368,7 @@ def infer(
         cluster_infer_ratio: float = 0,
         auto_predict_f0: bool = False,
         noise_scale: float = 0.4,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
         # slice config
         db_thresh: int = -40,
         pad_seconds: float = 0.5,
@@ -373,6 +383,7 @@ def infer(
                 cluster_infer_ratio=cluster_infer_ratio,
                 auto_predict_f0=auto_predict_f0,
                 noise_scale=noise_scale,
+                f0_method=f0_method,
                 db_thresh=db_thresh,
                 pad_seconds=pad_seconds,
                 chunk_seconds=chunk_seconds,
@@ -393,6 +404,7 @@ def infer(
                     cluster_infer_ratio=cluster_infer_ratio,
                     auto_predict_f0=auto_predict_f0,
                     noise_scale=noise_scale,
+                    f0_method=f0_method,
                 )
                 return infered_audio_c.cpu().numpy()
 
@@ -414,6 +426,7 @@ def process(
         cluster_infer_ratio: float = 0,
         auto_predict_f0: bool = False,
         noise_scale: float = 0.4,
+        f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
         # slice config
         db_thresh: int = -40,
         chunk_seconds: float = 0.5,
@@ -426,6 +439,7 @@ def infer(audio: ndarray[Any, dtype[float32]]) -> ndarray[Any, dtype[float32]]:
                 cluster_infer_ratio=cluster_infer_ratio,
                 auto_predict_f0=auto_predict_f0,
                 noise_scale=noise_scale,
+                f0_method=f0_method,
             )
             return infered_audio_c.cpu().numpy()
 
diff --git a/src/so_vits_svc_fork/inference_main.py b/src/so_vits_svc_fork/inference_main.py
index 40a6f0b0..c7ad8eab 100644
--- a/src/so_vits_svc_fork/inference_main.py
+++ b/src/so_vits_svc_fork/inference_main.py
@@ -22,12 +22,13 @@ def infer(
     model_path: Path | str,
     config_path: Path | str,
     # svc config
-    speaker: str,
+    speaker: int | str,
     cluster_model_path: Path | str | None = None,
     transpose: int = 0,
     auto_predict_f0: bool = False,
     cluster_infer_ratio: float = 0,
     noise_scale: float = 0.4,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     # slice config
     db_thresh: int = -40,
     pad_seconds: float = 0.5,
@@ -51,14 +52,15 @@ def infer(
 
     audio, _ = librosa.load(input_path, sr=svc_model.target_sample)
     audio = svc_model.infer_silence(
-        audio,
+        audio.astype(np.float32),
         speaker=speaker,
-        db_thresh=db_thresh,
-        pad_seconds=pad_seconds,
         transpose=transpose,
         auto_predict_f0=auto_predict_f0,
         cluster_infer_ratio=cluster_infer_ratio,
         noise_scale=noise_scale,
+        f0_method=f0_method,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
         chunk_seconds=chunk_seconds,
         absolute_thresh=absolute_thresh,
     )
@@ -78,6 +80,7 @@ def realtime(
     auto_predict_f0: bool = False,
     cluster_infer_ratio: float = 0,
     noise_scale: float = 0.4,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     # slice config
     db_thresh: int = -40,
     pad_seconds: float = 0.5,
@@ -154,13 +157,17 @@ def callback(
         )
 
         kwargs = dict(
-            input_audio=indata.mean(axis=1),
+            input_audio=indata.mean(axis=1).astype(np.float32),
+            # svc config
             speaker=speaker,
             transpose=transpose,
             auto_predict_f0=auto_predict_f0,
-            noise_scale=noise_scale,
             cluster_infer_ratio=cluster_infer_ratio,
+            noise_scale=noise_scale,
+            f0_method=f0_method,
+            # slice config
             db_thresh=db_thresh,
+            # pad_seconds=pad_seconds,
             chunk_seconds=chunk_seconds,
         )
         if version == 1:
diff --git a/src/so_vits_svc_fork/preprocess_hubert_f0.py b/src/so_vits_svc_fork/preprocess_hubert_f0.py
index 885c91dc..f7b60ab4 100644
--- a/src/so_vits_svc_fork/preprocess_hubert_f0.py
+++ b/src/so_vits_svc_fork/preprocess_hubert_f0.py
@@ -23,9 +23,12 @@ def _process_one(
     sampling_rate: int,
     hop_length: int,
     device: Literal["cuda", "cpu"] = "cuda",
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     force_rebuild: bool = False,
 ):
     wav, sr = librosa.load(filepath, sr=sampling_rate)
+
+    # Compute HuBERT content
     soft_path = filepath.parent / (filepath.name + ".soft.pt")
     if not soft_path.exists() or force_rebuild:
         wav16k = librosa.resample(
@@ -36,10 +39,12 @@ def _process_one(
         torch.save(c.cpu(), soft_path)
     else:
         LOG.info(f"Skip {filepath} because {soft_path} exists.")
+
+    # Compute f0
     f0_path = filepath.parent / (filepath.name + ".f0.npy")
     if not f0_path.exists() or force_rebuild:
-        f0 = utils.compute_f0_dio(
-            wav, sampling_rate=sampling_rate, hop_length=hop_length
+        f0 = utils.compute_f0(
+            wav, sampling_rate=sampling_rate, hop_length=hop_length, method=f0_method
         )
         np.save(f0_path, f0)
     else:
@@ -52,6 +57,7 @@ def _process_batch(
     sampling_rate: int,
     hop_length: int,
     pbar_position: int,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     force_rebuild: bool = False,
 ):
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -59,7 +65,13 @@ def _process_batch(
 
     for filepath in tqdm(filepaths, position=pbar_position):
         _process_one(
-            filepath, hubert_model, sampling_rate, hop_length, device, force_rebuild
+            filepath,
+            hubert_model,
+            sampling_rate,
+            hop_length,
+            device,
+            f0_method,
+            force_rebuild,
         )
 
 
@@ -67,6 +79,7 @@ def preprocess_hubert_f0(
     input_dir: Path | str,
     config_path: Path | str,
     n_jobs: int = 4,
+    f0_method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
     force_rebuild: bool = False,
 ):
     input_dir = Path(input_dir)
@@ -82,7 +95,7 @@ def preprocess_hubert_f0(
     filepath_chunks = np.array_split(filepaths, n_jobs)
     Parallel(n_jobs=n_jobs)(
         delayed(_process_batch)(
-            chunk, sampling_rate, hop_length, pbar_position, force_rebuild
+            chunk, sampling_rate, hop_length, pbar_position, f0_method, force_rebuild
         )
         for (pbar_position, chunk) in enumerate(filepath_chunks)
     )
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index d91864eb..89e96504 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -5,11 +5,12 @@
 from itertools import groupby
 from logging import getLogger
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal
 
 import numpy as np
 import requests
 import torch
+import torchcrepe
 from numpy import dtype, float32, ndarray
 from scipy.io.wavfile import read
 from torch import FloatTensor
@@ -122,7 +123,12 @@ def interpolate_f0(
     return ip_data[:, 0], vuv_vector[:, 0]
 
 
-def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
+def compute_f0_parselmouth(
+    wav_numpy: ndarray[Any, dtype[float32]],
+    p_len: None | int = None,
+    sampling_rate: int = 44100,
+    hop_length: int = 512,
+):
     import parselmouth
 
     x = wav_numpy
@@ -150,7 +156,7 @@ def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_lengt
     return f0
 
 
-def resize_f0(
+def _resize_f0(
     x: ndarray[Any, dtype[float32]], target_len: int
 ) -> ndarray[Any, dtype[float32]]:
     source = np.array(x)
@@ -164,26 +170,119 @@ def resize_f0(
     return res
 
 
-def compute_f0_dio(
+def compute_f0_pyworld(
     wav_numpy: ndarray[Any, dtype[float32]],
     p_len: None | int = None,
     sampling_rate: int = 44100,
     hop_length: int = 512,
+    type_: Literal["dio", "harvest"] = "dio",
 ):
+    """Estimate f0 with pyworld ("dio" or "harvest") refined by StoneMask.
+
+    The contour is rounded to one decimal place and resized to ``p_len``
+    frames; ``p_len`` defaults to ``wav_numpy.shape[0] // hop_length``.
+
+    Raises:
+        ValueError: if ``type_`` is neither "dio" nor "harvest".
+    """
     import pyworld
 
     if p_len is None:
         p_len = wav_numpy.shape[0] // hop_length
-    f0, t = pyworld.dio(
-        wav_numpy.astype(np.double),
-        fs=sampling_rate,
-        f0_ceil=800,
-        frame_period=1000 * hop_length / sampling_rate,
-    )
+    if type_ == "dio":
+        f0, t = pyworld.dio(
+            wav_numpy.astype(np.double),
+            fs=sampling_rate,
+            f0_ceil=f0_max,
+            f0_floor=f0_min,
+            frame_period=1000 * hop_length / sampling_rate,
+        )
+    elif type_ == "harvest":
+        f0, t = pyworld.harvest(
+            wav_numpy.astype(np.double),
+            fs=sampling_rate,
+            f0_ceil=f0_max,
+            f0_floor=f0_min,
+            frame_period=1000 * hop_length / sampling_rate,
+        )
+    else:
+        raise ValueError("type_ must be dio or harvest")
     f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate)
     for index, pitch in enumerate(f0):
         f0[index] = round(pitch, 1)
-    return resize_f0(f0, p_len)
+    return _resize_f0(f0, p_len)
+
+
+def compute_f0_crepe(
+    wav_numpy: ndarray[Any, dtype[float32]],
+    p_len: None | int = None,
+    sampling_rate: int = 44100,
+    hop_length: int = 512,
+    device: str = "cuda" if torch.cuda.is_available() else "cpu",
+    model: Literal["full", "tiny"] = "full",
+):
+    """Estimate f0 with torchcrepe and resize it to ``p_len`` frames.
+
+    ``p_len`` defaults to ``wav_numpy.shape[0] // hop_length`` so the result
+    has the same length as the one returned by ``compute_f0_pyworld``.
+    """
+    if p_len is None:
+        p_len = wav_numpy.shape[0] // hop_length
+
+    audio = torch.from_numpy(wav_numpy).to(device, copy=True)
+    audio = torch.unsqueeze(audio, dim=0)
+
+    if audio.ndim == 2 and audio.shape[0] > 1:
+        audio = torch.mean(audio, dim=0, keepdim=True).detach()
+    # (T) -> (1, T)
+    audio = audio.detach()
+
+    pitch = torchcrepe.predict(
+        audio,
+        sampling_rate,
+        hop_length,
+        f0_min,
+        f0_max,
+        model,
+        batch_size=hop_length * 2,
+        device=device,
+        pad=True,
+    )
+
+    # torchcrepe's frame count need not equal p_len, so resample the contour.
+    f0 = pitch.detach().cpu().numpy()[0]
+    return _resize_f0(f0, p_len)
+
+
+def compute_f0(
+    wav_numpy: ndarray[Any, dtype[float32]],
+    p_len: None | int = None,
+    sampling_rate: int = 44100,
+    hop_length: int = 512,
+    method: Literal["crepe", "parselmouth", "dio", "harvest"] = "crepe",
+    **kwargs,
+):
+    """Normalize ``wav_numpy`` and estimate its f0 with the chosen ``method``.
+
+    Extra ``kwargs`` are forwarded to ``compute_f0_crepe`` only.
+
+    Raises:
+        ValueError: if ``method`` is not one of the supported names.
+    """
+    wav_numpy = wav_numpy.astype(np.float32)
+    # Scale by the 99.9th percentile of |x|; skip when it is not positive
+    # (e.g. silent input), where dividing would yield NaN/inf samples.
+    scale = np.quantile(np.abs(wav_numpy), 0.999)
+    if scale > 0:
+        wav_numpy /= scale
+    if method in ("dio", "harvest"):
+        return compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method)
+    elif method == "crepe":
+        return compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs)
+    elif method == "parselmouth":
+        return compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length)
+    else:
+        raise ValueError("method must be dio, crepe, harvest or parselmouth")
 
 
 def f0_to_coarse(f0: torch.Tensor | float):