From dee704426f8b9be2666aa1e3b9b988d800b5dbce Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Mon, 3 Jul 2023 14:12:54 +0200 Subject: [PATCH 01/11] add clip url as an argument --- .../prompt_based_laion_retrieval/fondant_component.yaml | 4 ++++ components/prompt_based_laion_retrieval/src/main.py | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/components/prompt_based_laion_retrieval/fondant_component.yaml b/components/prompt_based_laion_retrieval/fondant_component.yaml index 61e50d6fe..3fdea226e 100644 --- a/components/prompt_based_laion_retrieval/fondant_component.yaml +++ b/components/prompt_based_laion_retrieval/fondant_component.yaml @@ -25,3 +25,7 @@ args: aesthetic_weight: description: Weight of the aesthetic embedding when added to the query, between 0 and 1 type: float + url: + description: The url of the backend clip retrieval service, defaults to the public service + type: str + default: https://knn.laion.ai/knn-service \ No newline at end of file diff --git a/components/prompt_based_laion_retrieval/src/main.py b/components/prompt_based_laion_retrieval/src/main.py index 5109e94e5..6dbc39a57 100644 --- a/components/prompt_based_laion_retrieval/src/main.py +++ b/components/prompt_based_laion_retrieval/src/main.py @@ -21,6 +21,7 @@ def setup( num_images: int, aesthetic_score: int, aesthetic_weight: float, + url: str, ) -> None: """ @@ -30,10 +31,11 @@ def setup( between 0 and 9. aesthetic_weight: weight of the aesthetic embedding to add to the query, between 0 and 1. + url: The url of the backend clip retrieval service, defaults to the public clip url. """ self.client = ClipClient( - url="https://knn.laion.ai/knn-service", - indice_name="laion5B-L-14", + url=url, + indice_name="laion5B", #TODO:revert back to laion5b-L-14 after backend correction num_images=num_images, aesthetic_score=aesthetic_score, aesthetic_weight=aesthetic_weight, From 17f9d97d790ed4b9209626252bb3ebb7e12c1392 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Mon, 3 Jul 2023 14:13:49 +0200 Subject: [PATCH 02/11] handle nested data type in HF writer --- components/write_to_hf_hub/src/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/write_to_hf_hub/src/main.py b/components/write_to_hf_hub/src/main.py index a81bcb5c9..c3022b234 100644 --- a/components/write_to_hf_hub/src/main.py +++ b/components/write_to_hf_hub/src/main.py @@ -8,6 +8,7 @@ # Define the schema for the struct using PyArrow import huggingface_hub +from datasets.features.features import generate_from_arrow_type from PIL import Image from fondant.component import WriteComponent @@ -71,7 +72,7 @@ def write( if image_column_names and column_name in image_column_names: schema_dict[column_name] = datasets.Image() else: - schema_dict[column_name] = datasets.Value(str(field.type.value)) + schema_dict[column_name] = generate_from_arrow_type(field.type.value) schema = datasets.Features(schema_dict).arrow_schema dataframe = dataframe[write_columns] From dbb0506c58a5f1e6e57e0757a1f5bfb6e8d24c18 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Mon, 3 Jul 2023 14:13:49 +0200 Subject: [PATCH 03/11] handle nested data type in HF writer --- components/write_to_hf_hub/src/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/write_to_hf_hub/src/main.py b/components/write_to_hf_hub/src/main.py index a81bcb5c9..c3022b234 100644 --- a/components/write_to_hf_hub/src/main.py +++ b/components/write_to_hf_hub/src/main.py @@ -8,6 +8,7 @@ # Define the schema for the struct using PyArrow import huggingface_hub +from datasets.features.features import generate_from_arrow_type from PIL import Image from fondant.component import WriteComponent @@ -71,7 +72,7 @@ def write( if image_column_names and column_name in image_column_names: schema_dict[column_name] = datasets.Image() else: - schema_dict[column_name] = datasets.Value(str(field.type.value)) + schema_dict[column_name] = generate_from_arrow_type(field.type.value) schema = datasets.Features(schema_dict).arrow_schema dataframe = dataframe[write_columns] From fd080554f8f5db3fb2aba5e519666b5641201f63 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Mon, 3 Jul 2023 14:12:54 +0200 Subject: [PATCH 04/11] add clip url as an argument --- .../prompt_based_laion_retrieval/fondant_component.yaml | 4 ++++ components/prompt_based_laion_retrieval/src/main.py | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/components/prompt_based_laion_retrieval/fondant_component.yaml b/components/prompt_based_laion_retrieval/fondant_component.yaml index 61e50d6fe..3fdea226e 100644 --- a/components/prompt_based_laion_retrieval/fondant_component.yaml +++ b/components/prompt_based_laion_retrieval/fondant_component.yaml @@ -25,3 +25,7 @@ args: aesthetic_weight: description: Weight of the aesthetic embedding when added to the query, between 0 and 1 type: float + url: + description: The url of the backend clip retrieval service, defaults to the public service + type: str + default: https://knn.laion.ai/knn-service \ No newline at end of file diff --git a/components/prompt_based_laion_retrieval/src/main.py b/components/prompt_based_laion_retrieval/src/main.py index 5109e94e5..6dbc39a57 100644 --- a/components/prompt_based_laion_retrieval/src/main.py +++ b/components/prompt_based_laion_retrieval/src/main.py @@ -21,6 +21,7 @@ def setup( num_images: int, aesthetic_score: int, aesthetic_weight: float, + url: str, ) -> None: """ @@ -30,10 +31,11 @@ def setup( between 0 and 9. aesthetic_weight: weight of the aesthetic embedding to add to the query, between 0 and 1. + url: The url of the backend clip retrieval service, defaults to the public clip url. """ self.client = ClipClient( - url="https://knn.laion.ai/knn-service", - indice_name="laion5B-L-14", + url=url, + indice_name="laion5B", #TODO:revert back to laion5b-L-14 after backend correction num_images=num_images, aesthetic_score=aesthetic_score, aesthetic_weight=aesthetic_weight, From 6640adb478e289892c33a6a75886824ea010e48e Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Mon, 3 Jul 2023 17:42:51 +0200 Subject: [PATCH 05/11] save segmentation as an images --- components/segment_images/src/main.py | 13 +++++++++---- .../write_to_hub_controlnet/fondant_component.yaml | 4 +--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/components/segment_images/src/main.py b/components/segment_images/src/main.py index b666127b3..89e9193ef 100644 --- a/components/segment_images/src/main.py +++ b/components/segment_images/src/main.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) -def convert_to_rgb(seg: np.array): +def convert_to_rgb(seg: np.array) -> bytes: """ Converts a 2D segmentation to a RGB one which makes it possible to visualize it. @@ -23,7 +23,7 @@ def convert_to_rgb(seg: np.array): seg: 2D segmentation map as a NumPy array. Returns: - color_seg: 3D segmentation map contain RGB values for each pixel. + color_seg: the RGB segmentation map as a binary string """ color_seg = np.zeros( (seg.shape[0], seg.shape[1], 3), dtype=np.uint8, @@ -32,9 +32,13 @@ def convert_to_rgb(seg: np.array): for label, color in enumerate(palette): color_seg[seg == label, :] = color - color_seg = color_seg.astype(np.uint8).tobytes() + color_seg = color_seg.astype(np.uint8) + image = Image.fromarray(color_seg).convert('RGB') - return color_seg + crop_bytes = io.BytesIO() + image.save(crop_bytes, format="JPEG") + + return crop_bytes.getvalue() def process_image(image: bytes, *, processor: SegformerImageProcessor, device: str) -> torch.Tensor: @@ -46,6 +50,7 @@ def process_image(image: bytes, *, processor: SegformerImageProcessor, device: s processor: The processor object for transforming the image. device: The device to move the transformed image to. """ + def load(img: bytes) -> Image: """Load the bytestring as an image.""" bytes_ = io.BytesIO(img) diff --git a/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml b/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml index efb253159..4915810f0 100644 --- a/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml +++ b/examples/pipelines/controlnet-interior-design/components/write_to_hub_controlnet/fondant_component.yaml @@ -16,9 +16,7 @@ consumes: segmentations: fields: data: - type: array - items: - type: binary + type: binary args: hf_token: From 87178fc5d6a46b3a2e405ee0bfa87b7bd63e5532 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Mon, 3 Jul 2023 17:45:01 +0200 Subject: [PATCH 06/11] change pipeline code --- .../pipelines/controlnet-interior-design/pipeline.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/pipelines/controlnet-interior-design/pipeline.py b/examples/pipelines/controlnet-interior-design/pipeline.py index 33b4d9054..2a89e021b 100644 --- a/examples/pipelines/controlnet-interior-design/pipeline.py +++ b/examples/pipelines/controlnet-interior-design/pipeline.py @@ -20,7 +20,12 @@ ) laion_retrieval_op = ComponentOp.from_registry( name="prompt_based_laion_retrieval", - arguments={"num_images": 2, "aesthetic_score": 9, "aesthetic_weight": 0.5}, + arguments={ + "num_images": 2, + "aesthetic_score": 9, + "aesthetic_weight": 0.5, + "url": None, + }, ) download_images_op = ComponentOp.from_registry( name="download_images", @@ -63,8 +68,6 @@ "hf_token": "hf_token", "image_column_names": ["images_data"], }, - number_of_gpus=1, - node_pool_name="model-inference-pool", ) pipeline = Pipeline(pipeline_name=pipeline_name, base_path=PipelineConfigs.BASE_PATH) From 1644fc3cf7448d2d1ef6964a3d9e22337674831f Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 5 Jul 2023 17:10:41 +0200 Subject: [PATCH 07/11] bugfix resizer returning varying array lengths --- components/download_images/src/resizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/components/download_images/src/resizer.py b/components/download_images/src/resizer.py index 386a71d3f..f545a0bf1 100644 --- a/components/download_images/src/resizer.py +++ b/components/download_images/src/resizer.py @@ -174,20 +174,20 @@ def __call__(self, img_stream, blurring_bbox_list=None): original_height, original_width = img.shape[:2] # check if image is too small if min(original_height, original_width) < self.min_image_size: - return None, None, None, None, None, "image too small" + return None, None, None if original_height * original_width > self.max_image_area: - return None, None, None, None, None, "image area too large" + return None, None, None # check if wrong aspect ratio if ( max(original_height, original_width) / min(original_height, original_width) > self.max_aspect_ratio ): - return None, None, None, None, None, "aspect ratio too large" + return None, None, None # check if resizer was defined during init if needed if blurring_bbox_list is not None and self.blurrer is None: - return None, None, None, None, None, "blurrer not defined" + return None, None, None # Flag to check if blurring is still needed. maybe_blur_still_needed = True From 4e81f55f9c24c270c2e86ccaa55202ee8720607c Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 5 Jul 2023 17:11:29 +0200 Subject: [PATCH 08/11] add default downloader component arguments to fondant component --- components/download_images/fondant_component.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index 6f4262e29..104caad2e 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:dev +image: ghcr.io/ml6team/download_images:latest consumes: images: @@ -23,21 +23,28 @@ args: timeout: description: Maximum time (in seconds) to wait when trying to download an image type: int + default: 10 retries: description: Number of times to retry downloading an image if it fails. type: int + default: 0 image_size: description: Size of the images after resizing. type: int + default: 256 resize_mode: description: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". type: str + default: 'border' resize_only_if_bigger: description: If True, resize only if image is bigger than image_size. type: bool + default: 'False' min_image_size: description: Minimum size of the images. type: int + default: 0 max_aspect_ratio: description: Maximum aspect ratio of the images. - type: float \ No newline at end of file + type: float + default: 'inf' \ No newline at end of file From 7ead951f51db32a07f42d5b56e29bad35b1b6e41 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 5 Jul 2023 17:16:15 +0200 Subject: [PATCH 09/11] switch downloader component from pandas to dask --- components/download_images/src/main.py | 100 +++++++++++++++---------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/components/download_images/src/main.py b/components/download_images/src/main.py index 9b222e3b0..633d78f5f 100644 --- a/components/download_images/src/main.py +++ b/components/download_images/src/main.py @@ -10,10 +10,10 @@ import traceback import urllib -import pandas as pd +import dask.dataframe as dd from resizer import Resizer -from fondant.component import PandasTransformComponent +from fondant.component import DaskTransformComponent logger = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def is_disallowed(headers, user_agent_token, disallowed_header_directives): else None ) if (ua_token is None or ua_token == user_agent_token) and any( - x in disallowed_header_directives for x in directives + x in disallowed_header_directives for x in directives ): return True except Exception as err: @@ -53,9 +53,9 @@ def download_image(url, timeout, user_agent_token, disallowed_header_directives) ) with urllib.request.urlopen(request, timeout=timeout) as r: if disallowed_header_directives and is_disallowed( - r.headers, - user_agent_token, - disallowed_header_directives, + r.headers, + user_agent_token, + disallowed_header_directives, ): return None img_stream = io.BytesIO(r.read()) @@ -67,13 +67,13 @@ def download_image(url, timeout, user_agent_token, disallowed_header_directives) def download_image_with_retry( - url, - *, - timeout, - retries, - resizer, - user_agent_token=None, - disallowed_header_directives=None, + url, + *, + timeout, + retries, + resizer, + user_agent_token=None, + disallowed_header_directives=None, ): for _ in range(retries + 1): img_stream = download_image( @@ -81,50 +81,70 @@ def download_image_with_retry( ) if img_stream is not None: # resize the image - return resizer(img_stream) + img_str, width, height = resizer(img_stream) + return img_str, width, height return None, None, None -class DownloadImagesComponent(PandasTransformComponent): +class DownloadImagesComponent(DaskTransformComponent): """Component that downloads images based on URLs.""" - def setup( - self, - *, - timeout: int = 10, - retries: int = 0, - image_size: int = 256, - resize_mode: str = "border", - resize_only_if_bigger: bool = False, - min_image_size: int = 0, - max_aspect_ratio: float = float("inf"), - ): + def transform( + self, + dataframe: dd.DataFrame, + *, + timeout: int, + retries: int, + image_size: int, + resize_mode: str, + resize_only_if_bigger: bool, + min_image_size: int, + max_aspect_ratio: float, + ) -> dd.DataFrame: + """Function that downloads images from a list of URLs and executes filtering and resizing + Args: + dataframe: Dask dataframe + timeout: Maximum time (in seconds) to wait when trying to download an image. + retries: Number of times to retry downloading an image if it fails. + image_size: Size of the images after resizing. + resize_mode: Resize mode to use. One of "no", "keep_ratio", "center_crop", "border". + resize_only_if_bigger: If True, resize only if image is bigger than image_size. + min_image_size: Minimum size of the images. + max_aspect_ratio: Maximum aspect ratio of the images. + + Returns: + Dask dataframe + """ logger.info("Instantiating resizer...") - self.resizer = Resizer( + resizer = Resizer( image_size=image_size, resize_mode=resize_mode, resize_only_if_bigger=resize_only_if_bigger, min_image_size=min_image_size, max_aspect_ratio=max_aspect_ratio, ) - self.timeout = timeout - self.retries = retries - - def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: - dataframe[[ - ("images", "data"), - ("images", "width"), - ("images", "height"), - ]] = dataframe.apply( + + dataframe = dataframe.drop_duplicates() + + dataframe = dataframe.apply( lambda example: download_image_with_retry( - url=example["images"]["url"], - timeout=self.timeout, - retries=self.retries, - resizer=self.resizer, + url=example.images_url, + timeout=timeout, + retries=retries, + resizer=resizer, ), axis=1, result_type="expand", + meta={0: bytes, 1: int, 2: int}, ) + dataframe.columns = [ + "images_data", + "images_width", + "images_height", + ] + + dataframe = dataframe.dropna() + dataframe = dataframe.repartition(partition_size='10MB') return dataframe From 400d1b70f1602de4f061f27204d10f402005adce Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Thu, 6 Jul 2023 15:07:53 +0200 Subject: [PATCH 10/11] small fixes --- components/download_images/src/main.py | 3 +- .../controlnet-interior-design/pipeline.py | 2 +- scripts/build_components.sh | 54 ++++++++++++++----- .../example_2/docker-compose.yml | 2 +- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/components/download_images/src/main.py b/components/download_images/src/main.py index 633d78f5f..017001e0d 100644 --- a/components/download_images/src/main.py +++ b/components/download_images/src/main.py @@ -124,6 +124,7 @@ def transform( max_aspect_ratio=max_aspect_ratio, ) + # Remove duplicates from laion retrieval dataframe = dataframe.drop_duplicates() dataframe = dataframe.apply( @@ -143,8 +144,8 @@ def transform( "images_height", ] + # Remove images that could not be fetched dataframe = dataframe.dropna() - dataframe = dataframe.repartition(partition_size='10MB') return dataframe diff --git a/examples/pipelines/controlnet-interior-design/pipeline.py b/examples/pipelines/controlnet-interior-design/pipeline.py index 2a89e021b..ecb1c2e06 100644 --- a/examples/pipelines/controlnet-interior-design/pipeline.py +++ b/examples/pipelines/controlnet-interior-design/pipeline.py @@ -30,7 +30,7 @@ download_images_op = ComponentOp.from_registry( name="download_images", arguments={ - "timeout": 10, + "timeout": 1, "retries": 0, "image_size": 512, "resize_mode": "center_crop", diff --git a/scripts/build_components.sh b/scripts/build_components.sh index f09c72613..857964437 100755 --- a/scripts/build_components.sh +++ b/scripts/build_components.sh @@ -4,16 +4,28 @@ set -e function usage { echo "Usage: $0 [options]" echo "Options:" - echo " -t, --tag= Tag to add to image, repeatable - The first tag is set in the component specifications" - echo " -h, --help Display this help message" + echo " -t, --tag Tag to add to image, repeatable + The first tag is set in the component specifications" + echo " -c, --cache Use registry caching when building the components (default:false)" + echo " -d, --component-dirs Directory containing components to build as subdirectories. + The path should be relative to the root directory (default:components)" + echo " -n, --namespace The namespace for the built images, should match the github organization (default: ml6team)" + echo " -co, --component Specific component to build. Pass the component subdirectory name(s) to build + certain component(s) or 'all' to build all components in the components + directory (default: all)" + echo " -r, --repo Set the repo (default: fondant)" + echo " -h, --help Display this help message" } # Parse the arguments while [[ "$#" -gt 0 ]]; do case $1 in - -t|--tag) tags+=("$2"); shift;; - -c|--cache) caching=true;; - -h|--help) usage; exit;; + -n |--namespace) namespace="$2"; shift;; + -d |--components-dir ) components_dir="$2"; shift;; + -r |--repo) repo="$2"; shift;; + -t |--tag) tags+=("$2"); shift;; + -co|--component) components+=("$2"); shift;; + -c |--cache) caching=true;; + -h |--help) usage; exit;; *) echo "Unknown parameter passed: $1"; exit 1;; esac; shift; done @@ -24,28 +36,40 @@ if [ -z "${tags}" ]; then exit 1 fi -# Set github repo information -namespace="ml6team" -repo="fondant" +# Set default values for optional arguments if not passed +component="${components:-all}" +components_dir="${components_dir:-components}" +namespace="${namespace:-ml6team}" +repo="${repo:-fondant}" # Get the component directory scripts_dir=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) root_dir=$(dirname "$scripts_dir") -component_dir=$root_dir/"components" +components_dir=$root_dir/${components_dir} + +# Determine the components to build +if [[ "${component}" == "all" ]]; then + components_to_build=("$components_dir"/*/); +else + for component in "${components[@]}"; do + components_to_build+=("$components_dir/${component}/") + done +fi # Loop through all subdirectories -for dir in "$component_dir"/*/; do +for dir in "${components_to_build[@]}"; do pushd "$dir" - BASENAME=${dir%/} BASENAME=${BASENAME##*/} + full_image_names=() echo "Tagging image with following tags:" for tag in "${tags[@]}"; do full_image_name=ghcr.io/${namespace}/${BASENAME}:${tag} echo "$full_image_name" full_image_names+=("$full_image_name") done + # Prevent this from mistakenly being used below unset full_image_name @@ -62,8 +86,9 @@ for dir in "$component_dir"/*/; do # Add cache arguments if caching is enabled if [ "$caching" = true ] ; then - echo "Caching from/to ${cache_name}" + cache_name=ghcr.io/${namespace}/${BASENAME}:build-cache + echo "Caching from/to ${cache_name}" args+=(--cache-to "type=registry,ref=${cache_name}") args+=(--cache-from "type=registry,ref=${cache_name}") fi @@ -75,4 +100,5 @@ for dir in "$component_dir"/*/; do . popd -done + +done \ No newline at end of file diff --git a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml index c94e28402..c4528454b 100644 --- a/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml +++ b/tests/example_pipelines/compiled_pipeline/example_2/docker-compose.yml @@ -40,6 +40,6 @@ services: depends_on: first_component: condition: service_completed_successfully - image: ghcr.io/ml6team/image_cropping:dev + image: ghcr.io/ml6team/image_cropping:latest volumes: [] version: '3.8' From b3856c1f9ad9d7ce99918e0549b995ac83124eca Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Thu, 6 Jul 2023 15:47:33 +0200 Subject: [PATCH 11/11] revert back latest to dev for download_images component --- components/download_images/fondant_component.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/download_images/fondant_component.yaml b/components/download_images/fondant_component.yaml index 104caad2e..1efaa48d4 100644 --- a/components/download_images/fondant_component.yaml +++ b/components/download_images/fondant_component.yaml @@ -1,6 +1,6 @@ name: Download images description: Component that downloads images based on URLs -image: ghcr.io/ml6team/download_images:latest +image: ghcr.io/ml6team/download_images:dev consumes: images: