ENH: Support ChatTTS 0.2 (#2449)
codingl2k1 authored Oct 19, 2024
1 parent 5f7dea4 commit b3cd77f
Showing 8 changed files with 40 additions and 27 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python.yaml
@@ -162,7 +162,7 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pip install -U WeTextProcessing<1.0.4
${{ env.SELF_HOST_PYTHON }} -m pip install -U librosa
${{ env.SELF_HOST_PYTHON }} -m pip install -U xxhash
- ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>0.1,<0.2"
+ ${{ env.SELF_HOST_PYTHON }} -m pip install -U "ChatTTS>=0.2"
${{ env.SELF_HOST_PYTHON }} -m pip install -U HyperPyYAML
${{ env.SELF_HOST_PYTHON }} -m pip uninstall -y matcha-tts
${{ env.SELF_HOST_PYTHON }} -m pip install -U onnxruntime-gpu==1.16.0; sys_platform == 'linux'
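Note: the same pin change ("ChatTTS>0.1,<0.2" to "ChatTTS>=0.2") is applied to every dependency list in this commit. A quick, hypothetical sanity check (not part of this commit; uses the packaging library) that an environment picked up the new range:

# Hypothetical check, not part of this commit: verify the installed
# ChatTTS satisfies the relaxed ">=0.2" pin.
from importlib.metadata import version

from packaging.version import Version

assert Version(version("ChatTTS")) >= Version("0.2"), "ChatTTS 0.2+ required"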
4 changes: 2 additions & 2 deletions setup.cfg
@@ -110,7 +110,7 @@ all =
librosa # For ChatTTS
xxhash # For ChatTTS
torchaudio # For ChatTTS
- ChatTTS>0.1,<0.2
+ ChatTTS>=0.2
lightning>=2.0.0 # For CosyVoice, matcha
hydra-core>=1.3.2 # For CosyVoice, matcha
inflect # For CosyVoice, matcha
@@ -185,7 +185,7 @@ audio =
librosa
xxhash
torchaudio
- ChatTTS>0.1,<0.2
+ ChatTTS>=0.2
tiktoken # For CosyVoice, openai-whisper
torch>=2.0.0 # For CosyVoice, matcha
lightning>=2.0.0 # For CosyVoice, matcha
2 changes: 1 addition & 1 deletion xinference/deploy/docker/requirements.txt
@@ -49,7 +49,7 @@ nemo_text_processing<1.1.0 # 1.1.0 requires pynini==2.1.6.post1
WeTextProcessing<1.0.4 # 1.0.4 requires pynini==2.1.6
librosa # For ChatTTS
torchaudio # For ChatTTS
- ChatTTS>0.1,<0.2
+ ChatTTS>=0.2
xxhash # For ChatTTS
torch>=2.0.0 # For CosyVoice
lightning>=2.0.0 # For CosyVoice, matcha
2 changes: 1 addition & 1 deletion xinference/deploy/docker/requirements_cpu.txt
@@ -46,7 +46,7 @@ nemo_text_processing<1.1.0 # 1.1.0 requires pynini==2.1.6.post1
WeTextProcessing<1.0.4 # 1.0.4 requires pynini==2.1.6
librosa # For ChatTTS
torchaudio # For ChatTTS
- ChatTTS>0.1,<0.2
+ ChatTTS>=0.2
xxhash # For ChatTTS
torch>=2.0.0 # For CosyVoice
lightning>=2.0.0 # For CosyVoice, matcha
39 changes: 25 additions & 14 deletions xinference/model/audio/chattts.py
@@ -54,7 +54,11 @@ def load(self):
        torch.set_float32_matmul_precision("high")
        self._model = ChatTTS.Chat()
        logger.info("Load ChatTTS model with kwargs: %s", self._kwargs)
-        self._model.load(source="custom", custom_path=self._model_path, **self._kwargs)
+        ok = self._model.load(
+            source="custom", custom_path=self._model_path, **self._kwargs
+        )
+        if not ok:
+            raise Exception(f"The ChatTTS model is not correct: {self._model_path}")

    def speech(
        self,
@@ -114,24 +114,31 @@ def _generator():
                    last_pos = 0
                    with writer.open():
                        for it in iter:
-                            for itt in it:
-                                for chunk in itt:
-                                    chunk = np.array([chunk]).transpose()
-                                    writer.write_audio_chunk(i, torch.from_numpy(chunk))
-                                    new_last_pos = out.tell()
-                                    if new_last_pos != last_pos:
-                                        out.seek(last_pos)
-                                        encoded_bytes = out.read()
-                                        yield encoded_bytes
-                                        last_pos = new_last_pos
+                            for chunk in it:
+                                chunk = np.array([chunk]).transpose()
+                                writer.write_audio_chunk(i, torch.from_numpy(chunk))
+                                new_last_pos = out.tell()
+                                if new_last_pos != last_pos:
+                                    out.seek(last_pos)
+                                    encoded_bytes = out.read()
+                                    yield encoded_bytes
+                                    last_pos = new_last_pos

            return _generator()
        else:
            wavs = self._model.infer([input], params_infer_code=params_infer_code)

            # Save the generated audio
            with BytesIO() as out:
-                torchaudio.save(
-                    out, torch.from_numpy(wavs[0]), 24000, format=response_format
-                )
+                try:
+                    torchaudio.save(
+                        out,
+                        torch.from_numpy(wavs[0]).unsqueeze(0),
+                        24000,
+                        format=response_format,
+                    )
+                except:
+                    torchaudio.save(
+                        out, torch.from_numpy(wavs[0]), 24000, format=response_format
+                    )
                return out.getvalue()
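Note: the three edits above track behavior changes between ChatTTS 0.1.x and 0.2: Chat.load() now returns a success flag, infer(..., stream=True) yields one level of nesting less (as the loop change suggests), and each returned waveform is a 1-D array, so torchaudio.save needs an explicit channel dimension, with the old call kept as a fallback. A minimal sketch of the 0.2-style calls, with the model path as a placeholder:

# Minimal sketch of the ChatTTS 0.2 calls the code above adapts to;
# "/path/to/ChatTTS" is a placeholder, not a path from this repo.
import ChatTTS
import torch
import torchaudio

chat = ChatTTS.Chat()
ok = chat.load(source="custom", custom_path="/path/to/ChatTTS")  # bool in 0.2
if not ok:
    raise RuntimeError("ChatTTS model failed to load")

wavs = chat.infer(["Hello!"])                 # wavs[0] is a 1-D numpy array
wav = torch.from_numpy(wavs[0]).unsqueeze(0)  # -> (1, num_frames) for torchaudio
torchaudio.save("hello.mp3", wav, 24000, format="mp3")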
2 changes: 1 addition & 1 deletion xinference/model/audio/model_spec.json
@@ -127,7 +127,7 @@
"model_name": "ChatTTS",
"model_family": "ChatTTS",
"model_id": "2Noise/ChatTTS",
"model_revision": "ce5913842aebd78e4a01a02d47244b8d62ac4ee3",
"model_revision": "3b34118f6d25850440b8901cef3e71c6ef8619c8",
"model_ability": "text-to-audio",
"multilingual": true
},
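Note: the revision bump above pins the Hugging Face repo to the snapshot matching ChatTTS 0.2. A hypothetical check (not part of this commit; assumes huggingface_hub is installed) that the new revision resolves:

# Hypothetical check, not part of this commit: fetch the newly pinned
# revision of the Hugging Face ChatTTS repo.
from huggingface_hub import snapshot_download

path = snapshot_download(
    "2Noise/ChatTTS", revision="3b34118f6d25850440b8901cef3e71c6ef8619c8"
)
print(path)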
2 changes: 1 addition & 1 deletion xinference/model/audio/model_spec_modelscope.json
@@ -42,7 +42,7 @@
"model_name": "ChatTTS",
"model_family": "ChatTTS",
"model_hub": "modelscope",
"model_id": "pzc163/chatTTS",
"model_id": "AI-ModelScope/ChatTTS",
"model_revision": "master",
"model_ability": "text-to-audio",
"multilingual": true
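Note: the ModelScope entry now points at the AI-ModelScope/ChatTTS repo instead of the previous third-party upload. A hypothetical check (not part of this commit; assumes the modelscope package is installed) that the new id resolves:

# Hypothetical check, not part of this commit: resolve the new
# ModelScope repo id.
from modelscope.hub.snapshot_download import snapshot_download

path = snapshot_download("AI-ModelScope/ChatTTS", revision="master")
print(path)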
14 changes: 8 additions & 6 deletions xinference/model/audio/tests/test_chattts.py
@@ -46,12 +46,14 @@ def test_chattts(setup):

    response = model.speech(input_string, stream=True)
    assert inspect.isgenerator(response)
-    i = 0
-    for chunk in response:
-        i += 1
-        assert type(chunk) is bytes
-        assert len(chunk) > 0
-    assert i > 5
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=True) as f:
+        i = 0
+        for chunk in response:
+            f.write(chunk)
+            i += 1
+            assert type(chunk) is bytes
+            assert len(chunk) > 0
+        assert i > 5

    # Test openai API
    import openai
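Note: the reworked test now also writes the streamed chunks to a temporary file, exercising the case where the raw byte fragments are concatenated into a single encoded file. A minimal consumer sketch in the same spirit; the server address and model uid are placeholders:

# Minimal consumer sketch; assumes a running Xinference server and a
# launched ChatTTS model (uid "chattts-demo" is a placeholder).
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("chattts-demo")
with open("speech.mp3", "wb") as f:
    for chunk in model.speech("Hello, world!", stream=True):
        f.write(chunk)  # each chunk is a bytes fragment of the mp3 stream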
