Updates for URL support (#83)
* Add load_url

* Update app

* Add tests for load_url
daavoo authored Jan 9, 2025
1 parent 95e347b commit faf5f9a
Showing 4 changed files with 41 additions and 56 deletions.
demo/app.py (64 changes: 10 additions & 54 deletions)
@@ -6,9 +6,6 @@
 import numpy as np
 import soundfile as sf
 import streamlit as st
-import requests
-from bs4 import BeautifulSoup
-from requests.exceptions import RequestException
 
 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
@@ -55,19 +52,26 @@ def gen_button_clicked():
     "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
 )
 
-if uploaded_file is not None:
+st.header("Or Enter a Website URL")
+url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
+
+if uploaded_file is not None or url:
     st.divider()
     st.header("Loading and Cleaning Data")
     st.markdown(
         "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-1-document-pre-processing)"
     )
     st.divider()
 
-    extension = Path(uploaded_file.name).suffix
+    if uploaded_file:
+        extension = Path(uploaded_file.name).suffix
+        raw_text = DATA_LOADERS[extension](uploaded_file)
+    else:
+        extension = ".html"
+        raw_text = DATA_LOADERS["url"](url)
 
     col1, col2 = st.columns(2)
 
-    raw_text = DATA_LOADERS[extension](uploaded_file)
     with col1:
         st.subheader("Raw Text")
         st.text_area(
@@ -86,53 +90,6 @@ def gen_button_clicked():
 
     st.divider()
 
-st.header("Or Enter a Website URL")
-url = st.text_input("URL", placeholder="https://blog.mozilla.ai/...")
-process_url = st.button("Clean URL Content")
-
-
-def process_url_content(url: str) -> tuple[str, str]:
-    """Fetch and clean content from a URL.
-    Args:
-        url: The URL to fetch content from
-    Returns:
-        tuple containing raw and cleaned text
-    """
-    response = requests.get(url)
-    response.raise_for_status()
-    soup = BeautifulSoup(response.text, "html.parser")
-    raw_text = soup.get_text()
-    return raw_text, DATA_CLEANERS[".html"](raw_text)
-
-
-if url and process_url:
-    try:
-        with st.spinner("Fetching and cleaning content..."):
-            raw_text, clean_text = process_url_content(url)
-        st.session_state["clean_text"] = clean_text
-
-        # Display results
-        col1, col2 = st.columns(2)
-        with col1:
-            st.subheader("Raw Text")
-            st.text_area(
-                "Number of characters before cleaning: " f"{len(raw_text)}",
-                f"{raw_text[:500]}...",
-            )
-        with col2:
-            st.subheader("Cleaned Text")
-            st.text_area(
-                "Number of characters after cleaning: " f"{len(clean_text)}",
-                f"{clean_text[:500]}...",
-            )
-    except RequestException as e:
-        st.error(f"Error fetching URL: {str(e)}")
-    except Exception as e:
-        st.error(f"Error processing content: {str(e)}")
-
-# Second part - Podcast generation
 if "clean_text" in st.session_state:
     clean_text = st.session_state["clean_text"]
 
@@ -143,7 +100,6 @@ def process_url_content(url: str) -> tuple[str, str]:
     )
     st.divider()
 
-    # Load models
     text_model = load_text_to_text_model()
     speech_model = load_text_to_speech_model()
 
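The app change consolidates the two input paths: the old `process_url_content` flow is gone, and both file uploads and URLs now feed the shared `DATA_LOADERS` registry. Below is a minimal sketch of that dispatch outside Streamlit; `load_input` is a hypothetical helper, and the final cleaning call mirrors the `DATA_CLEANERS[".html"](raw_text)` usage in the removed code:

from pathlib import Path

from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS


def load_input(uploaded_file=None, url: str = "") -> str | None:
    # Hypothetical helper mirroring the app's new branch: prefer the
    # uploaded file, otherwise fetch the URL via the "url" pseudo-loader.
    if uploaded_file:
        extension = Path(uploaded_file.name).suffix
        raw_text = DATA_LOADERS[extension](uploaded_file)
    else:
        extension = ".html"  # URL content is treated as HTML downstream
        raw_text = DATA_LOADERS["url"](url)
    if raw_text is None:  # loaders signal failure with None
        return None
    return DATA_CLEANERS[extension](raw_text)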
src/document_to_podcast/preprocessing/__init__.py (3 changes: 2 additions & 1 deletion)
@@ -1,4 +1,4 @@
-from .data_loaders import load_pdf, load_txt, load_docx
+from .data_loaders import load_pdf, load_txt, load_docx, load_url
 from .data_cleaners import clean_with_regex, clean_html, clean_markdown
 
 
@@ -8,6 +8,7 @@
     ".md": load_txt,
     ".pdf": load_pdf,
     ".txt": load_txt,
+    "url": load_url,
 }
 
 DATA_CLEANERS = {
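`DATA_LOADERS` is a plain dict used as a dispatch table: real file suffixes map to the matching loader, and the new "url" entry is a pseudo-extension key, since a URL has no reliable suffix. A short usage sketch, with placeholder inputs:

from document_to_podcast.preprocessing import DATA_LOADERS

# File inputs dispatch on their actual suffix...
pdf_loader = DATA_LOADERS[".pdf"]  # resolves to load_pdf

# ...while URLs use the literal "url" key added by this commit.
html_text = DATA_LOADERS["url"]("https://example.com")  # resolves to load_url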
src/document_to_podcast/preprocessing/data_loaders.py (11 changes: 11 additions & 0 deletions)
@@ -1,4 +1,5 @@
 import PyPDF2
+import requests
 
 from docx import Document
 from loguru import logger
@@ -33,3 +34,13 @@ def load_docx(docx_file: str | UploadedFile) -> str | None:
     except Exception as e:
         logger.exception(e)
         return None
+
+
+def load_url(url: str) -> str | None:
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        logger.exception(e)
+        return None
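`load_url` keeps the contract of the existing loaders: raw page text on success, `None` (after logging the exception) on any failure, with cleaning left to a separate step. A hedged usage sketch; `clean_html` comes from `data_cleaners` per the `__init__.py` import above, and the URL is a placeholder:

from document_to_podcast.preprocessing.data_cleaners import clean_html
from document_to_podcast.preprocessing.data_loaders import load_url

raw_html = load_url("https://example.com/post")
if raw_html is None:
    # Network errors and non-2xx responses are logged inside load_url and
    # surface here as None, so callers branch rather than catch exceptions.
    print("Could not fetch the page")
else:
    cleaned = clean_html(raw_html)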
tests/unit/preprocessing/test_data_loaders.py (19 changes: 18 additions & 1 deletion)
@@ -1,4 +1,9 @@
-from document_to_podcast.preprocessing.data_loaders import load_pdf, load_txt, load_docx
+from document_to_podcast.preprocessing.data_loaders import (
+    load_pdf,
+    load_txt,
+    load_docx,
+    load_url,
+)
 
 
 def test_load_pdf(example_data):
@@ -50,3 +55,15 @@ def test_load_markdown(example_data):
 def test_load_invalid_markdown():
     result = load_txt("invalid.md")
     assert result is None
+
+
+def test_load_url():
+    result = load_url(
+        "https://blog.mozilla.ai/introducing-mozilla-ai-investing-in-trustworthy-ai/"
+    )
+    assert "Introducing Mozilla.ai: Investing in trustworthy AI" in result
+
+
+def test_load_invalid_url():
+    result = load_url("invalid")
+    assert result is None
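Note that `test_load_url` reaches out to the live Mozilla blog, so it fails without network access or if that post moves. A possible hermetic variant using `unittest.mock`, not part of this commit:

from unittest.mock import Mock, patch

from document_to_podcast.preprocessing.data_loaders import load_url


def test_load_url_mocked():
    fake_response = Mock()
    fake_response.text = "<html>Introducing Mozilla.ai</html>"
    # raise_for_status is a no-op on the mock, simulating a 200 response
    with patch(
        "document_to_podcast.preprocessing.data_loaders.requests.get",
        return_value=fake_response,
    ):
        assert "Mozilla.ai" in load_url("https://example.com")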
