diff --git a/argilla-server/src/argilla_server/jobs/hub_jobs.py b/argilla-server/src/argilla_server/jobs/hub_jobs.py index 60c915f524..0315435b24 100644 --- a/argilla-server/src/argilla_server/jobs/hub_jobs.py +++ b/argilla-server/src/argilla_server/jobs/hub_jobs.py @@ -29,6 +29,8 @@ # TODO: Move this to be defined on jobs queues as a shared constant JOB_TIMEOUT_DISABLED = -1 +HUB_DATASET_TAKE_ROWS = 10_000 + # TODO: Once we merge webhooks we should change the queue to use a different one (default queue is deleted there) @job(DEFAULT_QUEUE, timeout=JOB_TIMEOUT_DISABLED, retry=Retry(max=3)) @@ -47,4 +49,8 @@ async def import_dataset_from_hub_job(name: str, subset: str, split: str, datase async with SearchEngine.get_by_name(settings.search_engine) as search_engine: parsed_mapping = HubDatasetMapping.parse_obj(mapping) - await HubDataset(name, subset, split, parsed_mapping).import_to(db, search_engine, dataset) + await ( + HubDataset(name, subset, split, parsed_mapping) + .take(HUB_DATASET_TAKE_ROWS) + .import_to(db, search_engine, dataset) + )