huggingface · qgallouedec · Nov 6, 2023 · Aug 21, 2023 · Aug 21, 2023 · Aug 30, 2023
diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 .PHONY: quality style test
 
 # Define directories variable
-DIRS = data examples gia scripts tests
+DIRS = data examples gia gia2 scripts tests
 
 # Check that source code meets quality standards
 quality:

diff --git a/data/conceptual_captions/generate_conceptual_caption.py b/data/conceptual_captions/generate_conceptual_caption.py
@@ -1,36 +1,43 @@
-import concurrent.futures
 import io
-import os
-import urllib
+import multiprocessing
+from typing import Dict, List, Union
+from urllib.request import Request, urlopen
 
 import PIL.Image
-from datasets import load_dataset
 from datasets.utils.file_utils import get_datasets_user_agent
 
 
 USER_AGENT = get_datasets_user_agent()
-PATH = "data/test"  # or "data/train"
 
-MAX_WORKERS = 10  # adjust to your needs
-MAX_QUEUE_SIZE = 2 * MAX_WORKERS  # adjust to your needs
 
+def fetch_image(image_url: str, timeout: float = 0.5) -> PIL.Image.Image:
+    """
+    Fetches a single image from a given URL and returns it as a PIL Image object.
 
-def fetch_single_image(image_url, timeout=1):
-    print(image_url)
-    try:
-        request = urllib.request.Request(
-            image_url,
-            data=None,
-            headers={"user-agent": USER_AGENT},
-        )
-        with urllib.request.urlopen(request, timeout=timeout) as req:
-            image = PIL.Image.open(io.BytesIO(req.read()))
-    except Exception:
-        image = None
+    Args:
+        image_url (str): The URL of the image to fetch.
+        timeout (float): The timeout value for the request (in seconds).
+
+    Returns:
+        A PIL Image object representing the fetched image. Fetch or decode errors are not caught here and propagate.
+    """
+    request = Request(image_url, data=None, headers={"user-agent": USER_AGENT})
+    with urlopen(request, timeout=timeout) as req:
+        image = PIL.Image.open(io.BytesIO(req.read()))
     return image
 
 
-def resize_single_image(image: PIL.Image):
+def resize_image(image: PIL.Image) -> PIL.Image:
+    """
+    Resize a single image so that its longer side is at most 352 pixels while maintaining aspect ratio.
+    Remove metadata from the image.
+
+    Args:
+        image (PIL.Image): The image to be resized.
+
+    Returns:
+        PIL.Image: The resized image without metadata.
+    """
     # Resize so that the bigger size is at most 352
     width, height = image.size
     if width > height:
@@ -40,43 +47,68 @@ def resize_single_image(image: PIL.Image):
         new_height = 352
         new_width = int(width * 352 / height)
     image = image.resize((new_width, new_height), PIL.Image.BILINEAR)
-    image = image.convert("RGB")
+    image = image.convert("RGB")  # Make sure the image is RGB
+    data = list(image.getdata())  # Get only the image data, and place it in a new image to remove metadata
+    image_without_exif = PIL.Image.new(image.mode, image.size)
+    image_without_exif.putdata(data)
+    return image_without_exif
+
+
+def fetch_and_resize(img_url: str) -> Union[PIL.Image.Image, None]:
+    """
+    Fetches an image from a given URL and resizes it.
+
+    Args:
+        img_url (str): The URL of the image to fetch.
+
+    Returns:
+        PIL.Image.Image: The resized image, or None if an error occurred.
+    """
+    try:
+        image = fetch_image(img_url)
+        image = resize_image(image)
+    except Exception:
+        image = None
     return image
 
 
-dataset = load_dataset("conceptual_captions", split="validation")  # or "train"
-if not os.path.exists(f"{PATH}/metadata.csv"):
-    with open(f"{PATH}/metadata.csv", "w") as f:
-        f.write("file_name,caption,idx\n")
-    dataset_idx = 0
-    image_idx = 0
-else:  # get the lastest index
-    with open(f"{PATH}/metadata.csv", "r") as f:
-        lines = f.readlines()
-        image_idx = len(lines) - 1
-        dataset_idx = int(lines[-1].split(",")[-1]) + 1
-        print(image_idx, dataset_idx)
-
-with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-    future_to_idx = {executor.submit(fetch_single_image, dataset[dataset_idx]["image_url"]): dataset_idx}
-    dataset_idx += 1
-    while dataset_idx < len(dataset):
-        done, _ = concurrent.futures.wait(future_to_idx, return_when=concurrent.futures.FIRST_COMPLETED)
-        for future in done:
-            idx = future_to_idx.pop(future)
+def process(example: Dict[str, List[str]]) -> Dict[str, List[Union[str, PIL.Image.Image]]]:
+    output = {"images": [], "text": []}
+
+    with multiprocessing.Pool() as pool:
+        images = pool.starmap(fetch_and_resize, [(url,) for url in example["image_url"]])
+
+    for idx, image in enumerate(images):
+        if image is not None:
+            output["images"].append(image)
+            output["text"].append(example["caption"][idx])
+
+    return output
+
+
+if __name__ == "__main__":
+    from datasets import Dataset, features, load_dataset
+
+    for split in ["train", "test"]:
+        dataset = load_dataset("conceptual_captions", split="train" if split == "train" else "validation")
+        num_cpu = multiprocessing.cpu_count() // 2
+        dataset = dataset.map(
+            process,
+            batched=True,
+            batch_size=200,
+            remove_columns=["caption", "image_url"],
+            num_proc=num_cpu,
+            load_from_cache_file=True,
+            features=features.Features({"images": features.Image(decode=True), "text": features.Value("string")}),
+        )
+        dataset.save_to_disk(f"conceptual-captions-{split}")
+        dataset = Dataset.load_from_disk(f"conceptual-captions-{split}")
+
+        retry = 500
+
+        for i in range(retry):
             try:
-                image = future.result()
-                if image is not None:
-                    image = resize_single_image(image)
-                    sample = dataset[idx]
-                    caption = sample["caption"].replace(",", "").replace(";", "").replace("\n", "").replace("\t", "")
-                    image.save(f"{PATH}/{image_idx:07d}.png", "PNG")
-                    with open(f"{PATH}/metadata.csv", "a") as f:
-                        f.write(f"{image_idx:07d}.png,{caption},{idx}\n")
-                    image_idx += 1
-            except Exception as exc:
-                print(f"Generated an exception: {exc}")
-
-        while len(future_to_idx) < MAX_QUEUE_SIZE and dataset_idx < len(dataset):
-            future_to_idx[executor.submit(fetch_single_image, dataset[dataset_idx]["image_url"])] = dataset_idx
-            dataset_idx += 1
+                dataset.push_to_hub("gia-project/gia-dataset-parquet", "conceptual-captions", split=split)
+                break
+            except Exception:
+                print(f"Retry {i+1}/{retry}")