Merge pull request #2810 from michael-genson/fix/translation-issues-when-scraping

fix: Translate ISO 8601 Datetime Durations During Scraping/Parsing/Migrating
boc-the-git authored Feb 8, 2024
2 parents 4c60feb + 9434208 commit 3174216
Showing 14 changed files with 137 additions and 54 deletions.
9 changes: 9 additions & 0 deletions mealie/lang/messages/en-US.json
@@ -31,5 +31,14 @@
"generic-updated-with-url": "{name} has been updated, {url}",
"generic-duplicated": "{name} has been duplicated",
"generic-deleted": "{name} has been deleted"
},
"datetime": {
"year": "year|years",
"day": "day|days",
"hour": "hour|hours",
"minute": "minute|minutes",
"second": "second|seconds",
"millisecond": "millisecond|milliseconds",
"microsecond": "microsecond|microseconds"
}
}
26 changes: 23 additions & 3 deletions mealie/pkgs/i18n/json_provider.py
@@ -1,6 +1,7 @@
import json
from dataclasses import dataclass
from pathlib import Path
from typing import cast


@dataclass(slots=True)
@@ -13,6 +14,22 @@ def __init__(self, path: Path | dict):
else:
self.translations = path

def _parse_plurals(self, value: str, count: float):
# based off of: https://kazupon.github.io/vue-i18n/guide/pluralization.html

values = [v.strip() for v in value.split("|")]
if len(values) == 1:
return value
elif len(values) == 2:
return values[0] if count == 1 else values[1]
elif len(values) == 3:
if count == 0:
return values[0]
else:
return values[1] if count == 1 else values[2]
else:
return values[0]

def t(self, key: str, default=None, **kwargs) -> str:
keys = key.split(".")

@@ -30,9 +47,12 @@ def t(self, key: str, default=None, **kwargs) -> str:

if i == last:
for key, value in kwargs.items():
if not value:
translation_value = cast(str, translation_value)
if value is None:
value = ""
translation_value = translation_value.replace("{" + key + "}", value)
return translation_value
if key == "count":
translation_value = self._parse_plurals(translation_value, float(value))
translation_value = translation_value.replace("{" + key + "}", str(value)) # type: ignore
return translation_value # type: ignore

return default or key
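
The pipe-delimited strings added to en-US.json feed straight into this count handling. A minimal sketch of the behavior, outside this commit, assuming the provider class in json_provider.py is JsonProvider and accepts a translations dict directly:

    from mealie.pkgs.i18n.json_provider import JsonProvider

    # Translations can be supplied as a dict instead of a path to a JSON file.
    provider = JsonProvider({"datetime": {"minute": "minute|minutes"}})

    # When a "count" kwarg is passed, t() first picks the singular or plural form
    # from the pipe-delimited string, then substitutes any {count} placeholder.
    print(provider.t("datetime.minute", count=1))   # -> minute
    print(provider.t("datetime.minute", count=30))  # -> minutes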
1 change: 1 addition & 0 deletions mealie/routes/groups/controller_migrations.py
@@ -44,6 +44,7 @@ def start_data_migration(
"user_id": self.user.id,
"group_id": self.group_id,
"add_migration_tag": add_migration_tag,
"translator": self.translator,
}

table: dict[SupportedMigrations, type[BaseMigrator]] = {
6 changes: 3 additions & 3 deletions mealie/routes/recipe/recipe_crud_routes.py
@@ -164,7 +164,7 @@ def handle_exceptions(self, ex: Exception) -> None:
async def parse_recipe_url(self, req: ScrapeRecipe):
"""Takes in a URL and attempts to scrape data and load it into the database"""
try:
recipe, extras = await create_from_url(req.url)
recipe, extras = await create_from_url(req.url, self.translator)
except ForceTimeoutException as e:
raise HTTPException(
status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
@@ -193,7 +193,7 @@ async def parse_recipe_url(self, req: ScrapeRecipe):
@router.post("/create-url/bulk", status_code=202)
def parse_recipe_url_bulk(self, bulk: CreateRecipeByUrlBulk, bg_tasks: BackgroundTasks):
"""Takes in a URL and attempts to scrape data and load it into the database"""
bulk_scraper = RecipeBulkScraperService(self.service, self.repos, self.group)
bulk_scraper = RecipeBulkScraperService(self.service, self.repos, self.group, self.translator)
report_id = bulk_scraper.get_report_id()
bg_tasks.add_task(bulk_scraper.scrape, bulk)

@@ -208,7 +208,7 @@ def parse_recipe_url_bulk(self, bulk: CreateRecipeByUrlBulk, bg_tasks: Backgroun
async def test_parse_recipe_url(self, url: ScrapeRecipeTest):
# Debugger should produce the same result as the scraper sees before cleaning
try:
if scraped_data := await RecipeScraperPackage(url.url).scrape_url():
if scraped_data := await RecipeScraperPackage(url.url, self.translator).scrape_url():
return scraped_data.schema.data
except ForceTimeoutException as e:
raise HTTPException(
13 changes: 11 additions & 2 deletions mealie/services/migrations/_migration_base.py
@@ -6,6 +6,7 @@

from mealie.core import root_logger
from mealie.core.exceptions import UnexpectedNone
from mealie.lang.providers import Translator
from mealie.repos.all_repositories import AllRepositories
from mealie.schema.recipe import Recipe
from mealie.schema.recipe.recipe_settings import RecipeSettings
@@ -35,12 +36,20 @@ class BaseMigrator(BaseService):
helpers: DatabaseMigrationHelpers

def __init__(
self, archive: Path, db: AllRepositories, session, user_id: UUID4, group_id: UUID, add_migration_tag: bool
self,
archive: Path,
db: AllRepositories,
session,
user_id: UUID4,
group_id: UUID,
add_migration_tag: bool,
translator: Translator,
):
self.archive = archive
self.db = db
self.session = session
self.add_migration_tag = add_migration_tag
self.translator = translator

user = db.users.get_one(user_id)
if not user:
@@ -229,6 +238,6 @@ def clean_recipe_dictionary(self, recipe_dict: dict) -> Recipe:
with contextlib.suppress(KeyError):
del recipe_dict["id"]

recipe_dict = cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))

return Recipe(**recipe_dict)
40 changes: 21 additions & 19 deletions mealie/services/scraper/cleaner.py
@@ -10,6 +10,7 @@
from slugify import slugify

from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator

logger = get_logger("recipe-scraper")

@@ -32,7 +33,7 @@
""" Matches multiple new lines and removes erroneous white space """


def clean(recipe_data: dict, url=None) -> dict:
def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
"""Main entrypoint to clean a recipe extracted from the web
and format the data into an acceptable format for the database
@@ -45,9 +46,9 @@ def clean(recipe_data: dict, url=None) -> dict:
recipe_data["description"] = clean_string(recipe_data.get("description", ""))

# Times
recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"), translator)
recipe_data["performTime"] = clean_time(recipe_data.get("performTime"), translator)
recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"), translator)
recipe_data["recipeCategory"] = clean_categories(recipe_data.get("recipeCategory", []))
recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
@@ -335,7 +336,7 @@ def clean_yield(yld: str | list[str] | None) -> str:
return yld


def clean_time(time_entry: str | timedelta | None) -> None | str:
def clean_time(time_entry: str | timedelta | None, translator: Translator) -> None | str:
"""_summary_
Supported Structures:
@@ -361,11 +362,11 @@ def clean_time(time_entry: str | timedelta | None) -> None | str:

try:
time_delta_instructionsect = parse_duration(time_entry)
return pretty_print_timedelta(time_delta_instructionsect)
return pretty_print_timedelta(time_delta_instructionsect, translator)
except ValueError:
return str(time_entry)
case timedelta():
return pretty_print_timedelta(time_entry)
return pretty_print_timedelta(time_entry, translator)
case {"minValue": str(value)}:
return clean_time(value)
case [str(), *_]:
@@ -374,7 +375,7 @@
# TODO: Not sure what to do here
return str(time_entry)
case _:
logger.warning("[SCRAPER] Unexpected type or structure for time_entrys")
logger.warning("[SCRAPER] Unexpected type or structure for variable time_entry")
return None


@@ -408,25 +409,25 @@ def parse_duration(iso_duration: str) -> timedelta:
return timedelta(**times)


def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places=2):
def pretty_print_timedelta(t: timedelta, translator: Translator, max_components=None, max_decimal_places=2):
"""
Print a pretty string for a timedelta.
For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days 4 Hours 48 Minutes'.
Setting max_components to e.g. 1 will change this to '2.2 days', where the number of decimal
points can also be set.
"""
time_scale_names_dict = {
timedelta(days=365): "year",
timedelta(days=1): "day",
timedelta(hours=1): "Hour",
timedelta(minutes=1): "Minute",
timedelta(seconds=1): "Second",
timedelta(microseconds=1000): "millisecond",
timedelta(microseconds=1): "microsecond",
time_scale_translation_keys_dict = {
timedelta(days=365): "datetime.year",
timedelta(days=1): "datetime.day",
timedelta(hours=1): "datetime.hour",
timedelta(minutes=1): "datetime.minute",
timedelta(seconds=1): "datetime.second",
timedelta(microseconds=1000): "datetime.millisecond",
timedelta(microseconds=1): "datetime.microsecond",
}
count = 0
out_list = []
for scale, scale_name in time_scale_names_dict.items():
for scale, scale_translation_key in time_scale_translation_keys_dict.items():
if t >= scale:
count += 1
n = t / scale if count == max_components else int(t / scale)
@@ -436,7 +437,8 @@ def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places
if n_txt[-2:] == ".0":
n_txt = n_txt[:-2]

out_list.append(f"{n_txt} {scale_name}{'s' if n > 1 else ''}")
scale_value = translator.t(scale_translation_key, count=n)
out_list.append(f"{n_txt} {scale_value}")

if out_list == []:
return "none"
8 changes: 6 additions & 2 deletions mealie/services/scraper/recipe_bulk_scraper.py
@@ -2,6 +2,7 @@

from pydantic import UUID4

from mealie.lang.providers import Translator
from mealie.repos.repository_factory import AllRepositories
from mealie.schema.recipe.recipe import CreateRecipeByUrlBulk, Recipe
from mealie.schema.reports.reports import (
@@ -20,11 +21,14 @@
class RecipeBulkScraperService(BaseService):
report_entries: list[ReportEntryCreate]

def __init__(self, service: RecipeService, repos: AllRepositories, group: GroupInDB) -> None:
def __init__(
self, service: RecipeService, repos: AllRepositories, group: GroupInDB, translator: Translator
) -> None:
self.service = service
self.repos = repos
self.group = group
self.report_entries = []
self.translator = translator

super().__init__()

@@ -81,7 +85,7 @@ async def scrape(self, urls: CreateRecipeByUrlBulk) -> None:
async def _do(url: str) -> Recipe | None:
async with sem:
try:
recipe, _ = await create_from_url(url)
recipe, _ = await create_from_url(url, self.translator)
return recipe
except Exception as e:
self.service.logger.error(f"failed to scrape url during bulk url import {url}")
6 changes: 4 additions & 2 deletions mealie/services/scraper/recipe_scraper.py
@@ -1,3 +1,4 @@
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe
from mealie.services.scraper.scraped_extras import ScrapedExtras

@@ -14,19 +15,20 @@ class RecipeScraper:
# List of recipe scrapers. Note that order matters
scrapers: list[type[ABCScraperStrategy]]

def __init__(self, scrapers: list[type[ABCScraperStrategy]] | None = None) -> None:
def __init__(self, translator: Translator, scrapers: list[type[ABCScraperStrategy]] | None = None) -> None:
if scrapers is None:
scrapers = DEFAULT_SCRAPER_STRATEGIES

self.scrapers = scrapers
self.translator = translator

async def scrape(self, url: str) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
"""
Scrapes a recipe from the web.
"""

for scraper_type in self.scrapers:
scraper = scraper_type(url)
scraper = scraper_type(url, self.translator)
result = await scraper.parse()

if result is not None:
5 changes: 3 additions & 2 deletions mealie/services/scraper/scraper.py
@@ -5,6 +5,7 @@
from slugify import slugify

from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.pkgs import cache
from mealie.schema.recipe import Recipe
from mealie.services.recipe.recipe_data_service import RecipeDataService
@@ -19,7 +20,7 @@ class ParserErrors(str, Enum):
CONNECTION_ERROR = "CONNECTION_ERROR"


async def create_from_url(url: str) -> tuple[Recipe, ScrapedExtras | None]:
async def create_from_url(url: str, translator: Translator) -> tuple[Recipe, ScrapedExtras | None]:
"""Main entry point for generating a recipe from a URL. Pass in a URL and
a Recipe object will be returned if successful.
@@ -29,7 +30,7 @@ async def create_from_url(url: str) -> tuple[Recipe, ScrapedExtras | None]:
Returns:
Recipe: Recipe Object
"""
scraper = RecipeScraper()
scraper = RecipeScraper(translator)
new_recipe, extras = await scraper.scrape(url)

if not new_recipe:
20 changes: 12 additions & 8 deletions mealie/services/scraper/scraper_strategies.py
@@ -11,6 +11,7 @@
from w3lib.html import get_base_url

from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe, RecipeStep
from mealie.services.scraper.scraped_extras import ScrapedExtras

@@ -77,9 +78,10 @@ class ABCScraperStrategy(ABC):

url: str

def __init__(self, url: str) -> None:
def __init__(self, url: str, translator: Translator) -> None:
self.logger = get_logger()
self.url = url
self.translator = translator

@abstractmethod
async def get_html(self, url: str) -> str: ...
@@ -102,7 +104,9 @@ async def get_html(self, url: str) -> str:
return await safe_scrape_html(url)

def clean_scraper(self, scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> tuple[Recipe, ScrapedExtras]:
def try_get_default(func_call: Callable | None, get_attr: str, default: Any, clean_func=None):
def try_get_default(
func_call: Callable | None, get_attr: str, default: Any, clean_func=None, **clean_func_kwargs
):
value = default

if func_call:
@@ -118,7 +122,7 @@ def try_get_default(func_call: Callable | None, get_attr: str, default: Any, cle
self.logger.error(f"Error parsing recipe attribute '{get_attr}'")

if clean_func:
value = clean_func(value)
value = clean_func(value, **clean_func_kwargs)

return value

@@ -138,9 +142,9 @@ def get_instructions() -> list[RecipeStep]:
except TypeError:
return []

cook_time = try_get_default(None, "performTime", None, cleaner.clean_time) or try_get_default(
None, "cookTime", None, cleaner.clean_time
)
cook_time = try_get_default(
None, "performTime", None, cleaner.clean_time, translator=self.translator
) or try_get_default(None, "cookTime", None, cleaner.clean_time, translator=self.translator)

extras = ScrapedExtras()

Expand All @@ -157,8 +161,8 @@ def get_instructions() -> list[RecipeStep]:
scraped_data.ingredients, "recipeIngredient", [""], cleaner.clean_ingredients
),
recipe_instructions=get_instructions(),
total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
total_time=try_get_default(None, "totalTime", None, cleaner.clean_time, translator=self.translator),
prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time, translator=self.translator),
perform_time=cook_time,
org_url=url,
)
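
The new **clean_func_kwargs pass-through in try_get_default is what carries the translator from the scraper strategy down into cleaner.clean_time. A simplified illustration of the forwarding pattern, with hypothetical names rather than the actual Mealie helper:

    from typing import Any, Callable

    def apply_clean(raw: Any, default: Any, clean_func: Callable | None = None, **clean_func_kwargs) -> Any:
        # Fall back to the default when nothing was scraped; otherwise run the
        # cleaner, forwarding any extra keyword arguments from the caller.
        if raw is None:
            return default
        if clean_func:
            return clean_func(raw, **clean_func_kwargs)
        return raw

    # e.g. apply_clean(scraped_time, None, cleaner.clean_time, translator=self.translator)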