Updates for URL support (#83)
* Add load_url

* Update app

* Add tests for load_url
daavoo authored Jan 9, 2025
1 parent 95e347b commit faf5f9a
Showing 4 changed files with 41 additions and 56 deletions.
demo/app.py (64 changes: 10 additions & 54 deletions)
@@ -6,9 +6,6 @@
 import numpy as np
 import soundfile as sf
 import streamlit as st
-import requests
-from bs4 import BeautifulSoup
-from requests.exceptions import RequestException
 
 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
@@ -55,19 +52,26 @@ def gen_button_clicked():
     "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
 )
 
-if uploaded_file is not None:
+st.header("Or Enter a Website URL")
+url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
+
+if uploaded_file is not None or url:
     st.divider()
     st.header("Loading and Cleaning Data")
     st.markdown(
         "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-1-document-pre-processing)"
     )
     st.divider()
 
-    extension = Path(uploaded_file.name).suffix
+    if uploaded_file:
+        extension = Path(uploaded_file.name).suffix
+        raw_text = DATA_LOADERS[extension](uploaded_file)
+    else:
+        extension = ".html"
+        raw_text = DATA_LOADERS["url"](url)
 
     col1, col2 = st.columns(2)
 
-    raw_text = DATA_LOADERS[extension](uploaded_file)
     with col1:
         st.subheader("Raw Text")
         st.text_area(
@@ -86,53 +90,6 @@ def gen_button_clicked():
 
     st.divider()
 
-st.header("Or Enter a Website URL")
-url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
-process_url = st.button("Clean URL Content")
-
-
-def process_url_content(url: str) -> tuple[str, str]:
-    """Fetch and clean content from a URL.
-    Args:
-        url: The URL to fetch content from
-    Returns:
-        tuple containing raw and cleaned text
-    """
-    response = requests.get(url)
-    response.raise_for_status()
-    soup = BeautifulSoup(response.text, "html.parser")
-    raw_text = soup.get_text()
-    return raw_text, DATA_CLEANERS[".html"](raw_text)
-
-
-if url and process_url:
-    try:
-        with st.spinner("Fetching and cleaning content..."):
-            raw_text, clean_text = process_url_content(url)
-        st.session_state["clean_text"] = clean_text
-
-        # Display results
-        col1, col2 = st.columns(2)
-        with col1:
-            st.subheader("Raw Text")
-            st.text_area(
-                "Number of characters before cleaning: " f"{len(raw_text)}",
-                f"{raw_text[:500]}...",
-            )
-        with col2:
-            st.subheader("Cleaned Text")
-            st.text_area(
-                "Number of characters after cleaning: " f"{len(clean_text)}",
-                f"{clean_text[:500]}...",
-            )
-    except RequestException as e:
-        st.error(f"Error fetching URL: {str(e)}")
-    except Exception as e:
-        st.error(f"Error processing content: {str(e)}")
-
-# Second part - Podcast generation
 if "clean_text" in st.session_state:
     clean_text = st.session_state["clean_text"]
 
@@ -143,7 +100,6 @@ def process_url_content(url: str) -> tuple[str, str]:
     )
     st.divider()
 
-    # Load models
     text_model = load_text_to_text_model()
     speech_model = load_text_to_speech_model()
 
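The app change consolidates the two input paths: the old `process_url_content` flow is gone, and both file uploads and URLs now feed the shared `DATA_LOADERS` registry. Below is a minimal sketch of that dispatch outside Streamlit; `load_input` is a hypothetical helper, and the final cleaning call mirrors the `DATA_CLEANERS[".html"](raw_text)` usage in the removed code:

from pathlib import Path

from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS


def load_input(uploaded_file=None, url: str = "") -> str | None:
    # Hypothetical helper mirroring the app's new branch: prefer the
    # uploaded file, otherwise fetch the URL via the "url" pseudo-loader.
    if uploaded_file:
        extension = Path(uploaded_file.name).suffix
        raw_text = DATA_LOADERS[extension](uploaded_file)
    else:
        extension = ".html"  # URL content is treated as HTML downstream
        raw_text = DATA_LOADERS["url"](url)
    if raw_text is None:  # loaders signal failure with None
        return None
    return DATA_CLEANERS[extension](raw_text)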
src/document_to_podcast/preprocessing/__init__.py (3 changes: 2 additions & 1 deletion)
@@ -1,4 +1,4 @@
-from .data_loaders import load_pdf, load_txt, load_docx
+from .data_loaders import load_pdf, load_txt, load_docx, load_url
 from .data_cleaners import clean_with_regex, clean_html, clean_markdown
 
 
@@ -8,6 +8,7 @@
     ".md": load_txt,
     ".pdf": load_pdf,
     ".txt": load_txt,
+    "url": load_url,
 }
 
 DATA_CLEANERS = {
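`DATA_LOADERS` is a plain dict used as a dispatch table: real file suffixes map to the matching loader, and the new "url" entry is a pseudo-extension key, since a URL has no reliable suffix. A short usage sketch, with placeholder inputs:

from document_to_podcast.preprocessing import DATA_LOADERS

# File inputs dispatch on their actual suffix...
pdf_loader = DATA_LOADERS[".pdf"]  # resolves to load_pdf

# ...while URLs use the literal "url" key added by this commit.
html_text = DATA_LOADERS["url"]("https://example.com")  # resolves to load_url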
src/document_to_podcast/preprocessing/data_loaders.py (11 changes: 11 additions & 0 deletions)
@@ -1,4 +1,5 @@
 import PyPDF2
+import requests
 
 from docx import Document
 from loguru import logger
@@ -33,3 +34,13 @@ def load_docx(docx_file: str | UploadedFile) -> str | None:
     except Exception as e:
         logger.exception(e)
         return None
+
+
+def load_url(url: str) -> str | None:
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        logger.exception(e)
+        return None
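`load_url` keeps the contract of the existing loaders: raw page text on success, `None` (after logging the exception) on any failure, with cleaning left to a separate step. A hedged usage sketch; `clean_html` comes from `data_cleaners` per the `__init__.py` import above, and the URL is a placeholder:

from document_to_podcast.preprocessing.data_cleaners import clean_html
from document_to_podcast.preprocessing.data_loaders import load_url

raw_html = load_url("https://example.com/post")
if raw_html is None:
    # Network errors and non-2xx responses are logged inside load_url and
    # surface here as None, so callers branch rather than catch exceptions.
    print("Could not fetch the page")
else:
    cleaned = clean_html(raw_html)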
tests/unit/preprocessing/test_data_loaders.py (19 changes: 18 additions & 1 deletion)
@@ -1,4 +1,9 @@
-from document_to_podcast.preprocessing.data_loaders import load_pdf, load_txt, load_docx
+from document_to_podcast.preprocessing.data_loaders import (
+    load_pdf,
+    load_txt,
+    load_docx,
+    load_url,
+)
 
 
 def test_load_pdf(example_data):
@@ -50,3 +55,15 @@ def test_load_markdown(example_data):
 def test_load_invalid_markdown():
     result = load_txt("invalid.md")
     assert result is None
+
+
+def test_load_url():
+    result = load_url(
+        "https://blog.mozilla.ai/introducing-mozilla-ai-investing-in-trustworthy-ai/"
+    )
+    assert "Introducing Mozilla.ai: Investing in trustworthy AI" in result
+
+
+def test_load_invalid_url():
+    result = load_url("invalid")
+    assert result is None
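Note that `test_load_url` reaches out to the live Mozilla blog, so it fails without network access or if that post moves. A possible hermetic variant using `unittest.mock`, not part of this commit:

from unittest.mock import Mock, patch

from document_to_podcast.preprocessing.data_loaders import load_url


def test_load_url_mocked():
    fake_response = Mock()
    fake_response.text = "<html>Introducing Mozilla.ai</html>"
    # raise_for_status is a no-op on the mock, simulating a 200 response
    with patch(
        "document_to_podcast.preprocessing.data_loaders.requests.get",
        return_value=fake_response,
    ):
        assert "Mozilla.ai" in load_url("https://example.com")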
