fix: Translate ISO 8601 Datetime Durations During Scraping/Parsing/Migrating #2810

Merged
Changes from 13 commits
9 changes: 9 additions & 0 deletions mealie/lang/messages/en-US.json
@@ -31,5 +31,14 @@
"generic-updated-with-url": "{name} has been updated, {url}",
"generic-duplicated": "{name} has been duplicated",
"generic-deleted": "{name} has been deleted"
},
"datetime": {
"year": "year|years",
"day": "day|days",
"hour": "hour|hours",
"minute": "minute|minutes",
"second": "second|seconds",
"millisecond": "millisecond|milliseconds",
"microsecond": "microsecond|microseconds"
}
}
26 changes: 23 additions & 3 deletions mealie/pkgs/i18n/json_provider.py
@@ -1,6 +1,7 @@
import json
from dataclasses import dataclass
from pathlib import Path
from typing import cast


@dataclass(slots=True)
@@ -13,6 +14,22 @@ def __init__(self, path: Path | dict):
else:
self.translations = path

def _parse_plurals(self, value: str, count: float):
# based off of: https://kazupon.github.io/vue-i18n/guide/pluralization.html

values = [v.strip() for v in value.split("|")]
if len(values) == 1:
return value
elif len(values) == 2:
return values[0] if count == 1 else values[1]
elif len(values) == 3:
if count == 0:
return values[0]
else:
return values[1] if count == 1 else values[2]
else:
return values[0]

def t(self, key: str, default=None, **kwargs) -> str:
keys = key.split(".")

@@ -30,9 +47,12 @@ def t(self, key: str, default=None, **kwargs) -> str:

if i == last:
for key, value in kwargs.items():
if not value:
translation_value = cast(str, translation_value)
if value is None:
value = ""
translation_value = translation_value.replace("{" + key + "}", value)
return translation_value
if key == "count":
translation_value = self._parse_plurals(translation_value, float(value))
translation_value = translation_value.replace("{" + key + "}", str(value)) # type: ignore
return translation_value # type: ignore

return default or key
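
The pipe-separated strings added to en-US.json follow the vue-i18n scheme: two segments mean "singular|plural", three mean "zero|singular|plural". A minimal sketch of how the new count handling could be exercised, assuming the provider class in this module is JsonProvider and that it accepts a plain dict as the __init__ above suggests:

    from mealie.pkgs.i18n.json_provider import JsonProvider

    provider = JsonProvider({"datetime": {"hour": "hour|hours"}})
    provider.t("datetime.hour", count=1)  # -> "hour"
    provider.t("datetime.hour", count=2)  # -> "hours" (any count other than 1 selects the plural form)
    provider.t("datetime.hour", count=0)  # -> "hours" (two-segment strings have no separate zero form)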
1 change: 1 addition & 0 deletions mealie/routes/groups/controller_migrations.py
@@ -44,6 +44,7 @@ def start_data_migration(
"user_id": self.user.id,
"group_id": self.group_id,
"add_migration_tag": add_migration_tag,
"translator": self.translator,
}

table: dict[SupportedMigrations, type[BaseMigrator]] = {
6 changes: 3 additions & 3 deletions mealie/routes/recipe/recipe_crud_routes.py
@@ -164,7 +164,7 @@ def handle_exceptions(self, ex: Exception) -> None:
async def parse_recipe_url(self, req: ScrapeRecipe):
"""Takes in a URL and attempts to scrape data and load it into the database"""
try:
recipe, extras = await create_from_url(req.url)
recipe, extras = await create_from_url(req.url, self.translator)
except ForceTimeoutException as e:
raise HTTPException(
status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
@@ -193,7 +193,7 @@ async def parse_recipe_url(self, req: ScrapeRecipe):
@router.post("/create-url/bulk", status_code=202)
def parse_recipe_url_bulk(self, bulk: CreateRecipeByUrlBulk, bg_tasks: BackgroundTasks):
"""Takes in a URL and attempts to scrape data and load it into the database"""
bulk_scraper = RecipeBulkScraperService(self.service, self.repos, self.group)
bulk_scraper = RecipeBulkScraperService(self.service, self.repos, self.group, self.translator)
report_id = bulk_scraper.get_report_id()
bg_tasks.add_task(bulk_scraper.scrape, bulk)

@@ -208,7 +208,7 @@ def parse_recipe_url_bulk(self, bulk: CreateRecipeByUrlBulk, bg_tasks: BackgroundTasks):
async def test_parse_recipe_url(self, url: ScrapeRecipeTest):
# Debugger should produce the same result as the scraper sees before cleaning
try:
if scraped_data := await RecipeScraperPackage(url.url).scrape_url():
if scraped_data := await RecipeScraperPackage(url.url, self.translator).scrape_url():
return scraped_data.schema.data
except ForceTimeoutException as e:
raise HTTPException(
13 changes: 11 additions & 2 deletions mealie/services/migrations/_migration_base.py
@@ -6,6 +6,7 @@

from mealie.core import root_logger
from mealie.core.exceptions import UnexpectedNone
from mealie.lang.providers import Translator
from mealie.repos.all_repositories import AllRepositories
from mealie.schema.recipe import Recipe
from mealie.schema.recipe.recipe_settings import RecipeSettings
@@ -35,12 +36,20 @@ class BaseMigrator(BaseService):
helpers: DatabaseMigrationHelpers

def __init__(
self, archive: Path, db: AllRepositories, session, user_id: UUID4, group_id: UUID, add_migration_tag: bool
self,
archive: Path,
db: AllRepositories,
session,
user_id: UUID4,
group_id: UUID,
add_migration_tag: bool,
translator: Translator,
):
self.archive = archive
self.db = db
self.session = session
self.add_migration_tag = add_migration_tag
self.translator = translator

user = db.users.get_one(user_id)
if not user:
@@ -229,6 +238,6 @@ def clean_recipe_dictionary(self, recipe_dict: dict) -> Recipe:
with contextlib.suppress(KeyError):
del recipe_dict["id"]

recipe_dict = cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))

return Recipe(**recipe_dict)
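
Because translator is now a required constructor argument for every migrator, any caller other than the route above needs the same addition. A hedged construction sketch: the keyword names mirror the args dict in controller_migrations.py, and the values are placeholders for whatever the caller already has in scope:

    migrator_cls = table[migration_type]  # any BaseMigrator subclass from the SupportedMigrations table
    migrator = migrator_cls(
        archive=archive_path,
        db=repos,
        session=session,
        user_id=user.id,
        group_id=group_id,
        add_migration_tag=True,
        translator=translator,  # new in this PR; omitting it now raises a TypeError
    )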
40 changes: 21 additions & 19 deletions mealie/services/scraper/cleaner.py
@@ -10,6 +10,7 @@
from slugify import slugify

from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator

logger = get_logger("recipe-scraper")

@@ -32,7 +33,7 @@
""" Matches multiple new lines and removes erroneous white space """


def clean(recipe_data: dict, url=None) -> dict:
def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
"""Main entrypoint to clean a recipe extracted from the web
and format the data into an acceptable format for the database

@@ -45,9 +46,9 @@ def clean(recipe_data: dict, url=None) -> dict:
recipe_data["description"] = clean_string(recipe_data.get("description", ""))

# Times
recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"), translator)
recipe_data["performTime"] = clean_time(recipe_data.get("performTime"), translator)
recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"), translator)
recipe_data["recipeCategory"] = clean_categories(recipe_data.get("recipeCategory", []))
recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
@@ -335,7 +336,7 @@ def clean_yield(yld: str | list[str] | None) -> str:
return yld


def clean_time(time_entry: str | timedelta | None) -> None | str:
def clean_time(time_entry: str | timedelta | None, translator: Translator) -> None | str:
"""_summary_

Supported Structures:
@@ -361,11 +362,11 @@

try:
time_delta_instructionsect = parse_duration(time_entry)
return pretty_print_timedelta(time_delta_instructionsect)
return pretty_print_timedelta(time_delta_instructionsect, translator)
except ValueError:
return str(time_entry)
case timedelta():
return pretty_print_timedelta(time_entry)
return pretty_print_timedelta(time_entry, translator)
case {"minValue": str(value)}:
return clean_time(value, translator)
case [str(), *_]:
@@ -374,7 +375,7 @@ def clean_time(time_entry: str | timedelta | None) -> None | str:
# TODO: Not sure what to do here
return str(time_entry)
case _:
logger.warning("[SCRAPER] Unexpected type or structure for time_entrys")
logger.warning("[SCRAPER] Unexpected type or structure for time_entries")
return None


@@ -408,25 +409,25 @@ def parse_duration(iso_duration: str) -> timedelta:
return timedelta(**times)


def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places=2):
def pretty_print_timedelta(t: timedelta, translator: Translator, max_components=None, max_decimal_places=2):
"""
Print a pretty string for a timedelta.
For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days 4 Hours 48 Minutes'.
Setting max_components to e.g. 1 will change this to '2.2 days', where the number of decimal
points can also be set.
"""
time_scale_names_dict = {
timedelta(days=365): "year",
timedelta(days=1): "day",
timedelta(hours=1): "Hour",
timedelta(minutes=1): "Minute",
timedelta(seconds=1): "Second",
timedelta(microseconds=1000): "millisecond",
timedelta(microseconds=1): "microsecond",
time_scale_translation_keys_dict = {
timedelta(days=365): "datetime.year",
timedelta(days=1): "datetime.day",
timedelta(hours=1): "datetime.hour",
timedelta(minutes=1): "datetime.minute",
timedelta(seconds=1): "datetime.second",
timedelta(microseconds=1000): "datetime.millisecond",
timedelta(microseconds=1): "datetime.microsecond",
}
count = 0
out_list = []
for scale, scale_name in time_scale_names_dict.items():
for scale, scale_translation_key in time_scale_translation_keys_dict.items():
if t >= scale:
count += 1
n = t / scale if count == max_components else int(t / scale)
@@ -436,7 +437,8 @@ def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places=2):
if n_txt[-2:] == ".0":
n_txt = n_txt[:-2]

out_list.append(f"{n_txt} {scale_name}{'s' if n > 1 else ''}")
scale_value = translator.t(scale_translation_key, count=n)
out_list.append(f"{n_txt} {scale_value}")

if out_list == []:
return "none"
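
Taken together, the cleaner now renders ISO 8601 durations through the translator instead of hard-coded English names. A rough end-to-end sketch; JsonProvider is assumed to satisfy the Translator interface the cleaner expects, and the exact joining of the parts lives in the collapsed portion of pretty_print_timedelta, so the outputs shown are approximate:

    from datetime import timedelta
    from pathlib import Path

    from mealie.pkgs.i18n.json_provider import JsonProvider
    from mealie.services.scraper import cleaner

    translator = JsonProvider(Path("mealie/lang/messages/en-US.json"))

    cleaner.clean_time("PT1H30M", translator)           # ISO 8601 duration -> roughly "1 hour 30 minutes"
    cleaner.clean_time(timedelta(days=1), translator)   # -> "1 day"
    cleaner.clean_time("45 min", translator)            # not ISO 8601; parse_duration raises ValueError and the string is returned as-is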
8 changes: 6 additions & 2 deletions mealie/services/scraper/recipe_bulk_scraper.py
@@ -2,6 +2,7 @@

from pydantic import UUID4

from mealie.lang.providers import Translator
from mealie.repos.repository_factory import AllRepositories
from mealie.schema.recipe.recipe import CreateRecipeByUrlBulk, Recipe
from mealie.schema.reports.reports import (
@@ -20,11 +21,14 @@
class RecipeBulkScraperService(BaseService):
report_entries: list[ReportEntryCreate]

def __init__(self, service: RecipeService, repos: AllRepositories, group: GroupInDB) -> None:
def __init__(
self, service: RecipeService, repos: AllRepositories, group: GroupInDB, translator: Translator
) -> None:
self.service = service
self.repos = repos
self.group = group
self.report_entries = []
self.translator = translator

super().__init__()

@@ -81,7 +85,7 @@ async def scrape(self, urls: CreateRecipeByUrlBulk) -> None:
async def _do(url: str) -> Recipe | None:
async with sem:
try:
recipe, _ = await create_from_url(url)
recipe, _ = await create_from_url(url, self.translator)
return recipe
except Exception as e:
self.service.logger.error(f"failed to scrape url during bulk url import {url}")
6 changes: 4 additions & 2 deletions mealie/services/scraper/recipe_scraper.py
@@ -1,3 +1,4 @@
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe
from mealie.services.scraper.scraped_extras import ScrapedExtras

@@ -14,19 +15,20 @@ class RecipeScraper:
# List of recipe scrapers. Note that order matters
scrapers: list[type[ABCScraperStrategy]]

def __init__(self, scrapers: list[type[ABCScraperStrategy]] | None = None) -> None:
def __init__(self, translator: Translator, scrapers: list[type[ABCScraperStrategy]] | None = None) -> None:
if scrapers is None:
scrapers = DEFAULT_SCRAPER_STRATEGIES

self.scrapers = scrapers
self.translator = translator

async def scrape(self, url: str) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
"""
Scrapes a recipe from the web.
"""

for scraper_type in self.scrapers:
scraper = scraper_type(url)
scraper = scraper_type(url, self.translator)
result = await scraper.parse()

if result is not None:
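
A short usage sketch of the reordered constructor (translator first, the strategy list still optional); url and translator stand in for values from the surrounding context:

    scraper = RecipeScraper(translator)          # falls back to DEFAULT_SCRAPER_STRATEGIES
    recipe, extras = await scraper.scrape(url)   # each strategy is now built as scraper_type(url, translator)
    if recipe is None:
        ...  # no strategy produced a usable result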
5 changes: 3 additions & 2 deletions mealie/services/scraper/scraper.py
@@ -5,6 +5,7 @@
from slugify import slugify

from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.pkgs import cache
from mealie.schema.recipe import Recipe
from mealie.services.recipe.recipe_data_service import RecipeDataService
@@ -19,7 +20,7 @@ class ParserErrors(str, Enum):
CONNECTION_ERROR = "CONNECTION_ERROR"


async def create_from_url(url: str) -> tuple[Recipe, ScrapedExtras | None]:
async def create_from_url(url: str, translator: Translator) -> tuple[Recipe, ScrapedExtras | None]:
"""Main entry point for generating a recipe from a URL. Pass in a URL and
a Recipe object will be returned if successful.

@@ -29,7 +30,7 @@ async def create_from_url(url: str) -> tuple[Recipe, ScrapedExtras | None]:
Returns:
Recipe: Recipe Object
"""
scraper = RecipeScraper()
scraper = RecipeScraper(translator)
new_recipe, extras = await scraper.scrape(url)

if not new_recipe:
20 changes: 12 additions & 8 deletions mealie/services/scraper/scraper_strategies.py
@@ -11,6 +11,7 @@
from w3lib.html import get_base_url

from mealie.core.root_logger import get_logger
from mealie.lang.providers import Translator
from mealie.schema.recipe.recipe import Recipe, RecipeStep
from mealie.services.scraper.scraped_extras import ScrapedExtras

@@ -77,9 +78,10 @@ class ABCScraperStrategy(ABC):

url: str

def __init__(self, url: str) -> None:
def __init__(self, url: str, translator: Translator) -> None:
self.logger = get_logger()
self.url = url
self.translator = translator

@abstractmethod
async def get_html(self, url: str) -> str: ...
@@ -102,7 +104,9 @@ async def get_html(self, url: str) -> str:
return await safe_scrape_html(url)

def clean_scraper(self, scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> tuple[Recipe, ScrapedExtras]:
def try_get_default(func_call: Callable | None, get_attr: str, default: Any, clean_func=None):
def try_get_default(
func_call: Callable | None, get_attr: str, default: Any, clean_func=None, **clean_func_kwargs
):
value = default

if func_call:
@@ -118,7 +122,7 @@ def try_get_default(func_call: Callable | None, get_attr: str, default: Any, clean_func=None):
self.logger.error(f"Error parsing recipe attribute '{get_attr}'")

if clean_func:
value = clean_func(value)
value = clean_func(value, **clean_func_kwargs)

return value

Expand All @@ -138,9 +142,9 @@ def get_instructions() -> list[RecipeStep]:
except TypeError:
return []

cook_time = try_get_default(None, "performTime", None, cleaner.clean_time) or try_get_default(
None, "cookTime", None, cleaner.clean_time
)
cook_time = try_get_default(
None, "performTime", None, cleaner.clean_time, translator=self.translator
) or try_get_default(None, "cookTime", None, cleaner.clean_time, translator=self.translator)

extras = ScrapedExtras()

@@ -157,8 +161,8 @@ def get_instructions() -> list[RecipeStep]:
scraped_data.ingredients, "recipeIngredient", [""], cleaner.clean_ingredients
),
recipe_instructions=get_instructions(),
total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
total_time=try_get_default(None, "totalTime", None, cleaner.clean_time, translator=self.translator),
prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time, translator=self.translator),
perform_time=cook_time,
org_url=url,
)
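
The **clean_func_kwargs passthrough is what carries the per-request translator into cleaner.clean_time while leaving the other clean_* helpers untouched. A reduced sketch of the resulting call pattern inside clean_scraper (the default values shown are illustrative):

    # time fields forward the translator as a keyword argument
    prep_time = try_get_default(None, "prepTime", None, cleaner.clean_time, translator=self.translator)
    # non-time cleaners take no extra kwargs and are called exactly as before
    name = try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string)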