feat: Improve Recipe Imports with Cleaner #4517

Merged
5 changes: 2 additions & 3 deletions mealie/services/migrations/_migration_base.py
@@ -268,6 +268,5 @@ def clean_recipe_dictionary(self, recipe_dict: dict) -> Recipe:
         with contextlib.suppress(KeyError):
             del recipe_dict["id"]
 
-        recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
-
-        return Recipe(**recipe_dict)
+        recipe = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
+        return recipe
2 changes: 2 additions & 0 deletions mealie/services/recipe/recipe_service.py
@@ -32,6 +32,7 @@
 from mealie.services._base_service import BaseService
 from mealie.services.openai import OpenAIDataInjection, OpenAILocalImage, OpenAIService
 from mealie.services.recipe.recipe_data_service import RecipeDataService
+from mealie.services.scraper import cleaner
 
 from .template_service import TemplateService
 
@@ -297,6 +298,7 @@ async def create_from_images(self, images: list[UploadFile], translate_language:
         recipe_data = await openai_recipe_service.build_recipe_from_images(
             local_images, translate_language=translate_language
         )
+        recipe_data = cleaner.clean(recipe_data, self.translator)
 
         recipe = self.create_one(recipe_data)
         data_service = RecipeDataService(recipe.id)
14 changes: 11 additions & 3 deletions mealie/services/scraper/cleaner.py
@@ -11,6 +11,7 @@
 
 from mealie.core.root_logger import get_logger
 from mealie.lang.providers import Translator
+from mealie.schema.recipe.recipe import Recipe
 
 logger = get_logger("recipe-scraper")
 
@@ -33,16 +34,23 @@
 """ Matches multiple new lines and removes erroneous white space """
 
 
-def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
+def clean(recipe_data: Recipe | dict, translator: Translator, url=None) -> Recipe:
     """Main entrypoint to clean a recipe extracted from the web
     and format the data into an accectable format for the database
 
     Args:
-        recipe_data (dict): raw recipe dicitonary
+        recipe_data (dict): raw recipe or recipe dictionary
 
     Returns:
         dict: cleaned recipe dictionary
     """
+    if not isinstance(recipe_data, dict):
+        # format the recipe like a scraped dictionary
+        recipe_data_dict = recipe_data.model_dump(by_alias=True)
+        recipe_data_dict["recipeIngredient"] = [ing.display for ing in recipe_data.recipe_ingredient]
+
+        recipe_data = recipe_data_dict
+
     recipe_data["description"] = clean_string(recipe_data.get("description", ""))
 
     # Times
@@ -59,7 +67,7 @@ def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
     recipe_data["notes"] = clean_notes(recipe_data.get("notes"))
     recipe_data["rating"] = clean_int(recipe_data.get("rating"))
 
-    return recipe_data
+    return Recipe(**recipe_data)
 
 
 def clean_string(text: str | list | int) -> str:
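
With this change, clean() becomes the single normalization point: scraped dictionaries and already-built Recipe models both come out the other side as a validated Recipe. Below is a minimal standalone sketch of that pattern, not Mealie's actual code; it uses a hypothetical stripped-down pydantic model in place of the real Recipe schema.

from pydantic import BaseModel, ConfigDict, Field


class FakeRecipe(BaseModel):
    # hypothetical stand-in for mealie.schema.recipe.recipe.Recipe
    model_config = ConfigDict(populate_by_name=True)

    name: str = ""
    description: str = ""
    recipe_instructions: list[str] = Field(default_factory=list, alias="recipeInstructions")


def clean(recipe_data: FakeRecipe | dict) -> FakeRecipe:
    if not isinstance(recipe_data, dict):
        # a model is dumped back to its camelCase (alias) form so the same
        # field-by-field cleanup below treats both input shapes identically
        recipe_data = recipe_data.model_dump(by_alias=True)
    recipe_data["description"] = (recipe_data.get("description") or "").strip()
    return FakeRecipe(**recipe_data)


cleaned = clean({"name": "Pancakes", "description": " Fluffy. ", "recipeInstructions": ["Mix", "Fry"]})
assert cleaned.description == "Fluffy."

The real clean() additionally flattens structured ingredients to their display strings (the recipeIngredient line in the hunk above) before re-validating, since scraped ingredient data is plain text while the model stores parsed ingredients.
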
23 changes: 20 additions & 3 deletions mealie/services/scraper/recipe_scraper.py
@@ -1,5 +1,7 @@
 from mealie.core.root_logger import get_logger
 from mealie.lang.providers import Translator
+from mealie.schema.recipe.recipe import Recipe
+from mealie.services.scraper import cleaner
 from mealie.services.scraper.scraped_extras import ScrapedExtras
 
 from .scraper_strategies import (
@@ -31,6 +33,7 @@ def __init__(self, translator: Translator, scrapers: list[type[ABCScraperStrateg
 
         self.scrapers = scrapers
         self.translator = translator
+        self.logger = get_logger()
 
     async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
         """
@@ -41,9 +44,23 @@ async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, Scrap
         raw_html = html or await safe_scrape_html(url)
         for scraper_type in self.scrapers:
             scraper = scraper_type(url, self.translator, raw_html=raw_html)
-            result = await scraper.parse()
 
-            if result is not None:
-                return result
+            try:
+                result = await scraper.parse()
+            except Exception:
+                self.logger.exception(f"Failed to scrape HTML with {scraper.__class__.__name__}")
+                result = None
+
+            if result is None or result[0] is None:
+                continue
+
+            recipe_result, extras = result
+            try:
+                recipe = cleaner.clean(recipe_result, self.translator)
+            except Exception:
+                self.logger.exception(f"Failed to clean recipe data from {scraper.__class__.__name__}")
+                continue
+
+            return recipe, extras
 
         return None, None
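
With the per-strategy try/except above, one scraper blowing up during parsing, or the cleaner rejecting its output, no longer aborts the whole import; the loop logs the failure and moves on, returning (None, None) only after every strategy has been tried. A hedged sketch of how a caller consumes that contract follows; it assumes the class is named RecipeScraper and that the scrapers argument has a default, neither of which is visible in this diff.

# illustrative caller, not taken from this PR
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe
from mealie.services.scraper.recipe_scraper import RecipeScraper


async def import_recipe(url: str, translator: Translator) -> Recipe:
    recipe, _extras = await RecipeScraper(translator).scrape(url)
    if recipe is None:
        # every strategy either failed to parse or failed during cleaning
        raise ValueError(f"could not extract a recipe from {url}")
    return recipe
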
16 changes: 15 additions & 1 deletion mealie/services/scraper/scraper_strategies.py
@@ -253,6 +253,18 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
     rather than trying to scrape it directly.
     """
 
+    def extract_json_ld_data_from_html(self, soup: bs4.BeautifulSoup) -> str:
+        data_parts: list[str] = []
+        for script in soup.find_all("script", type="application/ld+json"):
+            try:
+                script_data = script.string
+                if script_data:
+                    data_parts.append(str(script_data))
+            except AttributeError:
+                pass
+
+        return "\n\n".join(data_parts)
+
     def find_image(self, soup: bs4.BeautifulSoup) -> str | None:
         # find the open graph image tag
         og_image = soup.find("meta", property="og:image")
@@ -285,8 +297,10 @@ def format_html_to_text(self, html: str) -> str:
         soup = bs4.BeautifulSoup(html, "lxml")
 
         text = soup.get_text(separator="\n", strip=True)
+        text += self.extract_json_ld_data_from_html(soup)
         if not text:
-            raise Exception("No text found in HTML")
+            raise Exception("No text or ld+json data found in HTML")
 
         try:
             image = self.find_image(soup)
         except Exception:
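
The new extract_json_ld_data_from_html helper gives the OpenAI strategy usable input on pages whose recipe lives entirely in structured data rather than in visible text. Below is a self-contained sketch of the same technique outside Mealie; the sample HTML and function name are made up, and only bs4 is required.

import bs4

SAMPLE_HTML = """
<html><body>
  <p>Recipe card rendered client-side, so get_text() finds little here.</p>
  <script type="application/ld+json">{"@type": "Recipe", "name": "Pancakes"}</script>
  <script type="application/ld+json">{"@type": "WebSite", "name": "Example"}</script>
</body></html>
"""


def extract_json_ld(html: str) -> str:
    soup = bs4.BeautifulSoup(html, "html.parser")  # Mealie's strategy uses the "lxml" parser
    parts: list[str] = []
    for script in soup.find_all("script", type="application/ld+json"):
        if script.string:  # .string is None for empty or multi-child tags
            parts.append(str(script.string))
    return "\n\n".join(parts)


print(extract_json_ld(SAMPLE_HTML))  # prints both JSON blobs separated by a blank line

Appending this to soup.get_text(), as format_html_to_text now does, keeps the prompt non-empty even when the visible DOM carries no recipe text.
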
@@ -40,7 +40,7 @@
 def test_cleaner_clean(json_file: Path, num_steps):
     translator = local_provider()
     recipe_data = cleaner.clean(json.loads(json_file.read_text()), translator)
-    assert len(recipe_data["recipeInstructions"]) == num_steps
+    assert len(recipe_data.recipe_instructions or []) == num_steps
 
 
 def test_html_with_recipe_data():