diff --git a/mealie/services/migrations/_migration_base.py b/mealie/services/migrations/_migration_base.py index f209b4ebd41..1f7cf574db0 100644 --- a/mealie/services/migrations/_migration_base.py +++ b/mealie/services/migrations/_migration_base.py @@ -268,6 +268,5 @@ def clean_recipe_dictionary(self, recipe_dict: dict) -> Recipe: with contextlib.suppress(KeyError): del recipe_dict["id"] - recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None)) - - return Recipe(**recipe_dict) + recipe = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None)) + return recipe diff --git a/mealie/services/recipe/recipe_service.py b/mealie/services/recipe/recipe_service.py index b420216c73f..e07f5f3666b 100644 --- a/mealie/services/recipe/recipe_service.py +++ b/mealie/services/recipe/recipe_service.py @@ -32,6 +32,7 @@ from mealie.services._base_service import BaseService from mealie.services.openai import OpenAIDataInjection, OpenAILocalImage, OpenAIService from mealie.services.recipe.recipe_data_service import RecipeDataService +from mealie.services.scraper import cleaner from .template_service import TemplateService @@ -297,6 +298,7 @@ async def create_from_images(self, images: list[UploadFile], translate_language: recipe_data = await openai_recipe_service.build_recipe_from_images( local_images, translate_language=translate_language ) + recipe_data = cleaner.clean(recipe_data, self.translator) recipe = self.create_one(recipe_data) data_service = RecipeDataService(recipe.id) diff --git a/mealie/services/scraper/cleaner.py b/mealie/services/scraper/cleaner.py index d685c54d861..bfbc971d40d 100644 --- a/mealie/services/scraper/cleaner.py +++ b/mealie/services/scraper/cleaner.py @@ -11,6 +11,7 @@ from mealie.core.root_logger import get_logger from mealie.lang.providers import Translator +from mealie.schema.recipe.recipe import Recipe logger = get_logger("recipe-scraper") @@ -33,16 +34,23 @@ """ Matches multiple new lines and removes erroneous white space """ -def clean(recipe_data: dict, translator: Translator, url=None) -> dict: +def clean(recipe_data: Recipe | dict, translator: Translator, url=None) -> Recipe: """Main entrypoint to clean a recipe extracted from the web and format the data into an accectable format for the database Args: - recipe_data (dict): raw recipe dicitonary + recipe_data (dict): raw recipe or recipe dictionary Returns: dict: cleaned recipe dictionary """ + if not isinstance(recipe_data, dict): + # format the recipe like a scraped dictionary + recipe_data_dict = recipe_data.model_dump(by_alias=True) + recipe_data_dict["recipeIngredient"] = [ing.display for ing in recipe_data.recipe_ingredient] + + recipe_data = recipe_data_dict + recipe_data["description"] = clean_string(recipe_data.get("description", "")) # Times @@ -59,7 +67,7 @@ def clean(recipe_data: dict, translator: Translator, url=None) -> dict: recipe_data["notes"] = clean_notes(recipe_data.get("notes")) recipe_data["rating"] = clean_int(recipe_data.get("rating")) - return recipe_data + return Recipe(**recipe_data) def clean_string(text: str | list | int) -> str: diff --git a/mealie/services/scraper/recipe_scraper.py b/mealie/services/scraper/recipe_scraper.py index ad3bb1632db..9f17f58a728 100644 --- a/mealie/services/scraper/recipe_scraper.py +++ b/mealie/services/scraper/recipe_scraper.py @@ -1,5 +1,7 @@ +from mealie.core.root_logger import get_logger from mealie.lang.providers import Translator from mealie.schema.recipe.recipe import Recipe +from mealie.services.scraper import cleaner from mealie.services.scraper.scraped_extras import ScrapedExtras from .scraper_strategies import ( @@ -31,6 +33,7 @@ def __init__(self, translator: Translator, scrapers: list[type[ABCScraperStrateg self.scrapers = scrapers self.translator = translator + self.logger = get_logger() async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]: """ @@ -41,9 +44,23 @@ async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, Scrap raw_html = html or await safe_scrape_html(url) for scraper_type in self.scrapers: scraper = scraper_type(url, self.translator, raw_html=raw_html) - result = await scraper.parse() - if result is not None: - return result + try: + result = await scraper.parse() + except Exception: + self.logger.exception(f"Failed to scrape HTML with {scraper.__class__.__name__}") + result = None + + if result is None or result[0] is None: + continue + + recipe_result, extras = result + try: + recipe = cleaner.clean(recipe_result, self.translator) + except Exception: + self.logger.exception(f"Failed to clean recipe data from {scraper.__class__.__name__}") + continue + + return recipe, extras return None, None diff --git a/mealie/services/scraper/scraper_strategies.py b/mealie/services/scraper/scraper_strategies.py index 77c326d3ead..2fecd0dd85d 100644 --- a/mealie/services/scraper/scraper_strategies.py +++ b/mealie/services/scraper/scraper_strategies.py @@ -253,6 +253,18 @@ class RecipeScraperOpenAI(RecipeScraperPackage): rather than trying to scrape it directly. """ + def extract_json_ld_data_from_html(self, soup: bs4.BeautifulSoup) -> str: + data_parts: list[str] = [] + for script in soup.find_all("script", type="application/ld+json"): + try: + script_data = script.string + if script_data: + data_parts.append(str(script_data)) + except AttributeError: + pass + + return "\n\n".join(data_parts) + def find_image(self, soup: bs4.BeautifulSoup) -> str | None: # find the open graph image tag og_image = soup.find("meta", property="og:image") @@ -285,8 +297,10 @@ def format_html_to_text(self, html: str) -> str: soup = bs4.BeautifulSoup(html, "lxml") text = soup.get_text(separator="\n", strip=True) + text += self.extract_json_ld_data_from_html(soup) if not text: - raise Exception("No text found in HTML") + raise Exception("No text or ld+json data found in HTML") + try: image = self.find_image(soup) except Exception: diff --git a/tests/unit_tests/services_tests/scraper_tests/test_cleaner.py b/tests/unit_tests/services_tests/scraper_tests/test_cleaner.py index 7e7bf047fa6..8574fc7071f 100644 --- a/tests/unit_tests/services_tests/scraper_tests/test_cleaner.py +++ b/tests/unit_tests/services_tests/scraper_tests/test_cleaner.py @@ -40,7 +40,7 @@ def test_cleaner_clean(json_file: Path, num_steps): translator = local_provider() recipe_data = cleaner.clean(json.loads(json_file.read_text()), translator) - assert len(recipe_data["recipeInstructions"]) == num_steps + assert len(recipe_data.recipe_instructions or []) == num_steps def test_html_with_recipe_data():