From ee00432cd72fa55ddb3e67c8c7d7b72b2520ccd1 Mon Sep 17 00:00:00 2001 From: Robbe Sneyders Date: Thu, 15 Feb 2024 09:53:14 +0100 Subject: [PATCH] Add docs --- docs/components/components.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/components/components.md b/docs/components/components.md index 914c75f3..dfa8c231 100644 --- a/docs/components/components.md +++ b/docs/components/components.md @@ -1,3 +1,5 @@ +from distributed import Client + # Components Fondant makes it easy to build data preparation pipelines leveraging reusable components. Fondant @@ -65,6 +67,32 @@ this data can be accessed using `dataframe["image"]`. The `transform` method should return a single dataframe, with the columns complying to the schema defined by the `produces` section of the component specification. +### Configuring Dask + +You can configure the Dask client based on the needs of your component by overriding the +`dask_client` method: + +```python +import os + +from dask.distributed import Client, LocalCluster +from fondant.component import PandasTransformComponent + +class Component(PandasTransformComponent): + + def dask_client(self) -> Client: + """Initialize the dask client to use for this component.""" + cluster = LocalCluster( + processes=True, + n_workers=os.cpu_count(), + threads_per_worker=1, + ) + return Client(cluster) +``` + +The default Dask client is configured to work with processes, the same number of workers as +logical CPUs available, and one thread per worker. + ## Component types We can distinguish two different types of components: