mozilla-ai · Kostis-S-Z · Jan 10, 2025 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -30,9 +30,6 @@ jobs:
       - name: Install test dependencies
         run: pip install -e '.[tests]'
 
-      - name: Install parler dependency
-        run: pip install git+https://github.com/huggingface/parler-tts.git
-
       - name: Run Unit Tests
         run: pytest -v tests/unit
 

diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ It is designed to work on most local setups or with [GitHub Codespaces](https://
 ### Built with
 - Python 3.10+ (use Python 3.12 for Apple M1/2/3 chips)
 - [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e script generation)
-- [OuteAI](https://github.com/edwko/OuteTTS) / [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
+- [OuteAI](https://github.com/edwko/OuteTTS) (text-to-speech, i.e audio generation)
 - [Streamlit](https://streamlit.io/) (UI demo)
 
 
@@ -106,10 +106,10 @@ For the complete list of models supported out-of-the-box, visit this [link](http
 
 ### text-to-speech
 
-We support models from the [OuteAI](https://github.com/edwko/OuteTTS) and [Parler_tts](https://github.com/huggingface/parler-tts) packages. The default text-to-speech model in this repo is [OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M). Note that the `0.1-350M` version has a `CC-By-4.0` (permissive) license, whereas the newer / better `0.2-500M` version has a `CC-By-NC-4.0` (non-commercial) license.
-For a complete list of models visit [Oute HF](https://huggingface.co/collections/OuteAI) (only the GGUF versions) and [Parler HF](https://huggingface.co/collections/parler-tts).
+We support models from the [OuteAI](https://github.com/edwko/OuteTTS) package. The default text-to-speech model in this repo is [OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M). Note that the `0.1-350M` version has a `CC-By-4.0` (permissive) license, whereas the newer / better `0.2-500M` version has a `CC-By-NC-4.0` (non-commercial) license.
+For a complete list of models visit [Oute HF](https://huggingface.co/collections/OuteAI) (only the GGUF versions).
 
-**Important note:** In order to keep the package dependencies as lightweight as possible, only the Oute interface is installed by default. If you want to use the parler models, please also follow the instructions at https://github.com/huggingface/parler-tts.
+In this [repo](https://github.com/Kostis-S-Z/document-to-podcast) you can see examples of using different TTS models with minimal code changes.
 
 ## Pre-requisites
 

diff --git a/demo/app.py b/demo/app.py
@@ -10,13 +10,13 @@
 from bs4 import BeautifulSoup
 from requests.exceptions import RequestException
 
+from document_to_podcast.inference.text_to_speech import text_to_speech
 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
-    load_outetts_model,
+    load_tts_model,
 )
 from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
-from document_to_podcast.inference.text_to_speech import text_to_speech
 from document_to_podcast.inference.text_to_text import text_to_text_stream
 
 
@@ -29,7 +29,7 @@ def load_text_to_text_model():
 
 @st.cache_resource
 def load_text_to_speech_model():
-    return load_outetts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
+    return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
 
 
 script = "script"
@@ -67,15 +67,15 @@ def gen_button_clicked():
 
     col1, col2 = st.columns(2)
 
-    raw_text = DATA_LOADERS[extension](uploaded_file)
+    original_text = DATA_LOADERS[extension](uploaded_file)
     with col1:
         st.subheader("Raw Text")
         st.text_area(
-            f"Number of characters before cleaning: {len(raw_text)}",
-            f"{raw_text[:500]} . . .",
+            f"Number of characters before cleaning: {len(original_text)}",
+            f"{original_text[:500]} . . .",
         )
 
-    clean_text = DATA_CLEANERS[extension](raw_text)
+    clean_text = DATA_CLEANERS[extension](original_text)
     with col2:
         st.subheader("Cleaned Text")
         st.text_area(
@@ -91,16 +91,16 @@ def gen_button_clicked():
 process_url = st.button("Clean URL Content")
 
 
-def process_url_content(url: str) -> tuple[str, str]:
+def process_url_content(url_to_process: str) -> tuple[str, str]:
     """Fetch and clean content from a URL.
 
     Args:
-        url: The URL to fetch content from
+        url_to_process: The URL to fetch content from
 
     Returns:
         tuple containing raw and cleaned text
     """
-    response = requests.get(url)
+    response = requests.get(url_to_process)
     response.raise_for_status()
     soup = BeautifulSoup(response.text, "html.parser")
     raw_text = soup.get_text()
@@ -110,21 +110,21 @@ def process_url_content(url: str) -> tuple[str, str]:
 if url and process_url:
     try:
         with st.spinner("Fetching and cleaning content..."):
-            raw_text, clean_text = process_url_content(url)
+            original_text, clean_text = process_url_content(url)
             st.session_state["clean_text"] = clean_text
 
             # Display results
             col1, col2 = st.columns(2)
             with col1:
                 st.subheader("Raw Text")
                 st.text_area(
-                    "Number of characters before cleaning: " f"{len(raw_text)}",
-                    f"{raw_text[:500]}...",
+                    f"Number of characters before cleaning: {len(original_text)}",
+                    f"{original_text[:500]}...",
                 )
             with col2:
                 st.subheader("Cleaned Text")
                 st.text_area(
-                    "Number of characters after cleaning: " f"{len(clean_text)}",
+                    f"Number of characters after cleaning: {len(clean_text)}",
                     f"{clean_text[:500]}...",
                 )
     except RequestException as e:
@@ -211,7 +211,8 @@ def process_url_content(url: str) -> tuple[str, str]:
                             speech_model,
                             voice_profile,
                         )
-                    st.audio(speech, sample_rate=speech_model.audio_codec.sr)
+                    st.audio(speech, sample_rate=speech_model.sample_rate)
+
                     st.session_state.audio.append(speech)
                     text = ""
 
@@ -221,7 +222,7 @@ def process_url_content(url: str) -> tuple[str, str]:
             sf.write(
                 "podcast.wav",
                 st.session_state.audio,
-                samplerate=speech_model.audio_codec.sr,
+                samplerate=speech_model.sample_rate,
             )
             st.markdown("Podcast saved to disk!")
 

diff --git a/demo/download_models.py b/demo/download_models.py
@@ -4,10 +4,10 @@
 
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
-    load_outetts_model,
+    load_tts_model,
 )
 
 load_llama_cpp_model(
     "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
 )
-load_outetts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
+load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
diff --git a/docs/customization.md b/docs/customization.md
@@ -74,6 +74,7 @@ Looking for inspiration? Check out these examples of how others have customized
 
 - **[Radio Drama Generator](https://github.com/stefanfrench/radio-drama-generator)**: A creative adaptation that generates radio dramas by customizing ng the Blueprint parameters.
 - **[Readme-to-Podcast](https://github.com/alexmeckes/readme-to-podcast)**: This project transforms GitHub README files into podcast-style audio, showcasing the Blueprint’s ability to handle diverse text inputs.
+- **[Multilingual Podcast](https://github.com/Kostis-S-Z/document-to-podcast/)**: A repo that showcases how to use this package in other languages, like Hindi, Polish, Korean and many more.
 
 ## 🤝 **Contributing to the Blueprint**
 

diff --git a/docs/future-features-contributions.md b/docs/future-features-contributions.md
@@ -15,7 +15,6 @@ The Document-to-Podcast Blueprint is an evolving project designed to grow with t
 This Blueprint is designed to be a foundation you can build upon. By extending its capabilities, you can open the door to new applications, improve user experience, and adapt the Blueprint to address other use cases. Here are a few ideas for how you can expand its potential:
 
 
-- **Multi-language podcast generation:** Add support for multi-language podcast generation to expand the reach of this Blueprint.
 - **New modalities input:** Add support to the Blueprint to be able to handle different input modalities, like audio or images, enabling more flexibility in podcast generation.
 - **Improved audio quality:** Explore and integrate more advanced open-source TTS frameworks to enhance the quality of generated audio, making podcasts sound more natural.
 

diff --git a/docs/getting-started.md b/docs/getting-started.md
@@ -37,10 +37,3 @@ pip install -e .
 python -m streamlit run demo/app.py
 ```
 
-
-### [Optional]: Use Parler models for text-to-speech
-
-If you want to use the [parler tts](https://github.com/huggingface/parler-tts) models, you will need to **additionally** install an optional dependency by running:
-```bash
-pip install -e '.[parler]'
-```
diff --git a/docs/index.md b/docs/index.md
@@ -11,7 +11,7 @@ These docs are your companion to mastering the **Document-to-Podcast Blueprint**
 ### Built with
 - Python 3.10+
 - [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e script generation)
-- [OuteAI](https://github.com/edwko/OuteTTS) / [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
+- [OuteAI](https://github.com/edwko/OuteTTS) (text-to-speech, i.e audio generation)
 - [Streamlit](https://streamlit.io/) (UI demo)
 
 

diff --git a/docs/step-by-step-guide.md b/docs/step-by-step-guide.md
@@ -160,11 +160,10 @@ In this final step, the generated podcast transcript is brought to life as an au
 
  **1 - Model Loading**
 
-   - The [`model_loader.py`](api.md/#document_to_podcast.inference.model_loaders) module is responsible for loading the `text-to-speech` models using the `outetts` and `parler_tts` libraries.
+   - The [`model_loaders.py`](api.md/#document_to_podcast.inference.model_loaders) module is responsible for loading the `text-to-text` and `text-to-speech` models.
 
    - The function `load_outetts_model` takes a model ID in the format `{org}/{repo}/{filename}` and loads the specified model, either on CPU or GPU, based on the `device` parameter. The parameter `language` also enables to swap between the languages the Oute package supports (as of Dec 2024: `en, zh, ja, ko`)
 
-   - The function `load_parler_tts_model_and_tokenizer` takes a model ID in the format `{repo}/{filename}` and loads the specified model and tokenizer, either on CPU or GPU, based on the `device` parameter.
 
 **2 - Text-to-Speech Audio Generation**
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,6 +18,7 @@ dependencies = [
   "pydantic",
   "PyPDF2[crypto]",
   "python-docx",
+  "transformers>4.31.0",
   "streamlit",
 ]
 

diff --git a/src/document_to_podcast/cli.py b/src/document_to_podcast/cli.py
@@ -12,15 +12,14 @@
     Speaker,
     DEFAULT_PROMPT,
     DEFAULT_SPEAKERS,
-    SUPPORTED_TTS_MODELS,
+    TTS_LOADERS,
 )
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
-    load_outetts_model,
-    load_parler_tts_model_and_tokenizer,
+    load_tts_model,
 )
-from document_to_podcast.inference.text_to_text import text_to_text_stream
 from document_to_podcast.inference.text_to_speech import text_to_speech
+from document_to_podcast.inference.text_to_text import text_to_text_stream
 from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS
 
 
@@ -30,8 +29,9 @@ def document_to_podcast(
     output_folder: str | None = None,
     text_to_text_model: str = "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
     text_to_text_prompt: str = DEFAULT_PROMPT,
-    text_to_speech_model: SUPPORTED_TTS_MODELS = "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
+    text_to_speech_model: TTS_LOADERS = "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf",
     speakers: list[Speaker] | None = None,
+    outetts_language: str = "en",  # Only applicable to OuteTTS models
     from_config: str | None = None,
 ):
     """
@@ -70,8 +70,10 @@ def document_to_podcast(
         speakers (list[Speaker] | None, optional): The speakers for the podcast.
             Defaults to DEFAULT_SPEAKERS.
 
-        from_config (str, optional): The path to the config file. Defaults to None.
+        outetts_language (str): For OuteTTS models we need to specify which language to use.
+            Supported languages in 0.2-500M: en, zh, ja, ko. More info: https://github.com/edwko/OuteTTS
 
+        from_config (str, optional): The path to the config file. Defaults to None.
 
             If provided, all other arguments will be ignored.
     """
@@ -86,6 +88,7 @@ def document_to_podcast(
             text_to_text_prompt=text_to_text_prompt,
             text_to_speech_model=text_to_speech_model,
             speakers=[Speaker.model_validate(speaker) for speaker in speakers],
+            outetts_language=outetts_language,
         )
 
     output_folder = Path(config.output_folder)
@@ -106,15 +109,9 @@ def document_to_podcast(
     text_model = load_llama_cpp_model(model_id=config.text_to_text_model)
 
     logger.info(f"Loading {config.text_to_speech_model}")
-    if "oute" in config.text_to_speech_model.lower():
-        speech_model = load_outetts_model(model_id=config.text_to_speech_model)
-        speech_tokenizer = None
-        sample_rate = speech_model.audio_codec.sr
-    else:
-        speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
-            model_id=config.text_to_speech_model
-        )
-        sample_rate = speech_model.config.sampling_rate
+    speech_model = load_tts_model(
+        model_id=config.text_to_speech_model, outetts_language=outetts_language
+    )
 
     # ~4 characters per token is considered a reasonable default.
     max_characters = text_model.n_ctx() * 4
@@ -151,18 +148,17 @@ def document_to_podcast(
                     text.split(f'"Speaker {speaker_id}":')[-1],
                     speech_model,
                     voice_profile,
-                    tokenizer=speech_tokenizer,  # Applicable only for parler models
                 )
                 podcast_audio.append(speech)
                 text = ""
+
     except KeyboardInterrupt:
         logger.warning("Podcast generation stopped by user.")
-
     logger.info("Saving Podcast...")
     sf.write(
         str(output_folder / "podcast.wav"),
         np.concatenate(podcast_audio),
-        samplerate=sample_rate,
+        samplerate=speech_model.sample_rate,
     )
     (output_folder / "podcast.txt").write_text(podcast_script)
     logger.success("Done!")

diff --git a/src/document_to_podcast/config.py b/src/document_to_podcast/config.py
@@ -1,10 +1,11 @@
 from pathlib import Path
-from typing import Literal
 from typing_extensions import Annotated
 
 from pydantic import BaseModel, FilePath
 from pydantic.functional_validators import AfterValidator
 
+from document_to_podcast.inference.model_loaders import TTS_LOADERS
+from document_to_podcast.inference.text_to_speech import TTS_INFERENCE
 from document_to_podcast.preprocessing import DATA_LOADERS
 
 
@@ -41,14 +42,6 @@
     },
 ]
 
-SUPPORTED_TTS_MODELS = Literal[
-    "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
-    "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf",
-    "parler-tts/parler-tts-large-v1",
-    "parler-tts/parler-tts-mini-v1",
-    "parler-tts/parler-tts-mini-v1.1",
-]
-
 
 def validate_input_file(value):
     if Path(value).suffix not in DATA_LOADERS:
@@ -73,6 +66,18 @@ def validate_text_to_text_prompt(value):
     return value
 
 
+def validate_text_to_speech_model(value):  # AfterValidator for Config.text_to_speech_model
+    if value not in TTS_LOADERS:  # the model id has no entry in TTS_LOADERS
+        raise ValueError(
+            f"Model {value} is missing a loading function. Please define it under model_loaders.py"
+        )
+    if value not in TTS_INFERENCE:  # the model id has no entry in TTS_INFERENCE
+        raise ValueError(
+            f"Model {value} is missing an inference function. Please define it under text_to_speech.py"
+        )
+    return value  # returned unchanged; this validator only checks membership
+
+
 class Speaker(BaseModel):
     id: int
     name: str
@@ -88,5 +93,6 @@ class Config(BaseModel):
     output_folder: str
     text_to_text_model: Annotated[str, AfterValidator(validate_text_to_text_model)]
     text_to_text_prompt: Annotated[str, AfterValidator(validate_text_to_text_prompt)]
-    text_to_speech_model: SUPPORTED_TTS_MODELS
+    text_to_speech_model: Annotated[str, AfterValidator(validate_text_to_speech_model)]
     speakers: list[Speaker]
+    outetts_language: str = "en"