ENH: Support ChatTTS 0.2 (#2449)
codingl2k1 authored Oct 19, 2024
1 parent 5f7dea4 commit b3cd77f
Showing 8 changed files with 40 additions and 27 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python.yaml
@@ -162,7 +162,7 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pip install -U WeTextProcessing<1.0.4
${{ env.SELF_HOST_PYTHON }} -m pip install -U librosa
${{ env.SELF_HOST_PYTHON }} -m pip install -U xxhash
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>0.1,<0.2"
+ ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2"
${{ env.SELF_HOST_PYTHON }} -m pip install -U HyperPyYAML
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y matcha-tts
${{ env.SELF_HOST_PYTHON }} -m pip install -U onnxruntime-gpu==1.16.0; sys_platform == 'linux'
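Note: the same pin change ("ChatTTS>0.1,<0.2" to "ChatTTS>=0.2") is applied to every dependency list in this commit. A quick, hypothetical sanity check (not part of this commit; uses the packaging library) that an environment picked up the new range:

# Hypothetical check, not part of this commit: verify the installed
# ChatTTS satisfies the relaxed ">=0.2" pin.
from importlib.metadata import version

from packaging.version import Version

assert Version(version("ChatTTS")) >= Version("0.2"), "ChatTTS 0.2+ required"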
4 changes: 2 additions & 2 deletions setup.cfg
@@ -110,7 +110,7 @@ all =
librosa # For ChatTTS
xxhash # For ChatTTS
torchaudio # For ChatTTS
- ChatTTS>0.1,<0.2
+ ChatTTS>=0.2
lightning>=2.0.0 # For CosyVoice, matcha
hydra-core>=1.3.2 # For CosyVoice, matcha
inflect # For CosyVoice, matcha
@@ -185,7 +185,7 @@ audio =
librosa
xxhash
torchaudio
- ChatTTS>0.1,<0.2
+ ChatTTS>=0.2
tiktoken # For CosyVoice, openai-whisper
torch>=2.0.0 # For CosyVoice, matcha
lightning>=2.0.0 # For CosyVoice, matcha
2 changes: 1 addition & 1 deletion xinference/deploy/docker/requirements.txt
@@ -49,7 +49,7 @@ nemo_text_processing<1.1.0 # 1.1.0 requires pynini==2.1.6.post1
WeTextProcessing<1.0.4 # 1.0.4 requires pynini==2.1.6
librosa # For ChatTTS
torchaudio # For ChatTTS
- ChatTTS>0.1,<0.2
+ ChatTTS>=0.2
xxhash # For ChatTTS
torch>=2.0.0 # For CosyVoice
lightning>=2.0.0 # For CosyVoice, matcha
2 changes: 1 addition & 1 deletion xinference/deploy/docker/requirements_cpu.txt
@@ -46,7 +46,7 @@ nemo_text_processing<1.1.0 # 1.1.0 requires pynini==2.1.6.post1
WeTextProcessing<1.0.4 # 1.0.4 requires pynini==2.1.6
librosa # For ChatTTS
torchaudio # For ChatTTS
- ChatTTS>0.1,<0.2
+ ChatTTS>=0.2
xxhash # For ChatTTS
torch>=2.0.0 # For CosyVoice
lightning>=2.0.0 # For CosyVoice, matcha
39 changes: 25 additions & 14 deletions xinference/model/audio/chattts.py
@@ -54,7 +54,11 @@ def load(self):
        torch.set_float32_matmul_precision("high")
        self._model = ChatTTS.Chat()
        logger.info("Load ChatTTS model with kwargs: %s", self._kwargs)
-        self._model.load(source="custom", custom_path=self._model_path, **self._kwargs)
+        ok = self._model.load(
+            source="custom", custom_path=self._model_path, **self._kwargs
+        )
+        if not ok:
+            raise Exception(f"The ChatTTS model is not correct: {self._model_path}")

    def speech(
        self,
@@ -114,24 +114,31 @@ def _generator():
                    last_pos = 0
                    with writer.open():
                        for it in iter:
-                            for itt in it:
-                                for chunk in itt:
-                                    chunk = np.array([chunk]).transpose()
-                                    writer.write_audio_chunk(i, torch.from_numpy(chunk))
-                                    new_last_pos = out.tell()
-                                    if new_last_pos != last_pos:
-                                        out.seek(last_pos)
-                                        encoded_bytes = out.read()
-                                        yield encoded_bytes
-                                        last_pos = new_last_pos
+                            for chunk in it:
+                                chunk = np.array([chunk]).transpose()
+                                writer.write_audio_chunk(i, torch.from_numpy(chunk))
+                                new_last_pos = out.tell()
+                                if new_last_pos != last_pos:
+                                    out.seek(last_pos)
+                                    encoded_bytes = out.read()
+                                    yield encoded_bytes
+                                    last_pos = new_last_pos

            return _generator()
        else:
            wavs = self._model.infer([input], params_infer_code=params_infer_code)

            # Save the generated audio
            with BytesIO() as out:
-                torchaudio.save(
-                    out, torch.from_numpy(wavs[0]), 24000, format=response_format
-                )
+                try:
+                    torchaudio.save(
+                        out,
+                        torch.from_numpy(wavs[0]).unsqueeze(0),
+                        24000,
+                        format=response_format,
+                    )
+                except:
+                    torchaudio.save(
+                        out, torch.from_numpy(wavs[0]), 24000, format=response_format
+                    )
                return out.getvalue()
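Note: the three edits above track behavior changes between ChatTTS 0.1.x and 0.2: Chat.load() now returns a success flag, infer(..., stream=True) yields one level of nesting less (as the loop change suggests), and each returned waveform is a 1-D array, so torchaudio.save needs an explicit channel dimension, with the old call kept as a fallback. A minimal sketch of the 0.2-style calls, with the model path as a placeholder:

# Minimal sketch of the ChatTTS 0.2 calls the code above adapts to;
# "/path/to/ChatTTS" is a placeholder, not a path from this repo.
import ChatTTS
import torch
import torchaudio

chat = ChatTTS.Chat()
ok = chat.load(source="custom", custom_path="/path/to/ChatTTS")  # bool in 0.2
if not ok:
    raise RuntimeError("ChatTTS model failed to load")

wavs = chat.infer(["Hello!"])                 # wavs[0] is a 1-D numpy array
wav = torch.from_numpy(wavs[0]).unsqueeze(0)  # -> (1, num_frames) for torchaudio
torchaudio.save("hello.mp3", wav, 24000, format="mp3")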
2 changes: 1 addition & 1 deletion xinference/model/audio/model_spec.json
@@ -127,7 +127,7 @@
"model_name": "ChatTTS",
"model_family": "ChatTTS",
"model_id": "2Noise/ChatTTS",
"model_revision": "ce5913842aebd78e4a01a02d47244b8d62ac4ee3",
"model_revision": "3b34118f6d25850440b8901cef3e71c6ef8619c8",
"model_ability": "text-to-audio",
"multilingual": true
},
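Note: the revision bump above pins the Hugging Face repo to the snapshot matching ChatTTS 0.2. A hypothetical check (not part of this commit; assumes huggingface_hub is installed) that the new revision resolves:

# Hypothetical check, not part of this commit: fetch the newly pinned
# revision of the Hugging Face ChatTTS repo.
from huggingface_hub import snapshot_download

path = snapshot_download(
    "2Noise/ChatTTS", revision="3b34118f6d25850440b8901cef3e71c6ef8619c8"
)
print(path)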
2 changes: 1 addition & 1 deletion xinference/model/audio/model_spec_modelscope.json
@@ -42,7 +42,7 @@
"model_name": "ChatTTS",
"model_family": "ChatTTS",
"model_hub": "modelscope",
"model_id": "pzc163/chatTTS",
"model_id": "AI-ModelScope/ChatTTS",
"model_revision": "master",
"model_ability": "text-to-audio",
"multilingual": true
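Note: the ModelScope entry now points at the AI-ModelScope/ChatTTS repo instead of the previous third-party upload. A hypothetical check (not part of this commit; assumes the modelscope package is installed) that the new id resolves:

# Hypothetical check, not part of this commit: resolve the new
# ModelScope repo id.
from modelscope.hub.snapshot_download import snapshot_download

path = snapshot_download("AI-ModelScope/ChatTTS", revision="master")
print(path)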
14 changes: 8 additions & 6 deletions xinference/model/audio/tests/test_chattts.py
@@ -46,12 +46,14 @@ def test_chattts(setup):

    response = model.speech(input_string, stream=True)
    assert inspect.isgenerator(response)
-    i = 0
-    for chunk in response:
-        i += 1
-        assert type(chunk) is bytes
-        assert len(chunk) > 0
-    assert i > 5
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=True) as f:
+        i = 0
+        for chunk in response:
+            f.write(chunk)
+            i += 1
+            assert type(chunk) is bytes
+            assert len(chunk) > 0
+        assert i > 5

    # Test openai API
    import openai
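Note: the reworked test now also writes the streamed chunks to a temporary file, exercising the case where the raw byte fragments are concatenated into a single encoded file. A minimal consumer sketch in the same spirit; the server address and model uid are placeholders:

# Minimal consumer sketch; assumes a running Xinference server and a
# launched ChatTTS model (uid "chattts-demo" is a placeholder).
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("chattts-demo")
with open("speech.mp3", "wb") as f:
    for chunk in model.speech("Hello, world!", stream=True):
        f.write(chunk)  # each chunk is a bytes fragment of the mp3 stream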
