Merge pull request deepchem#3695 from aaronrockmenezes/image_loader

Modifications to Image Loader
GreatRSingh · Dec 17, 2023 · 5493df6 · 5493df6
2 parents aa27fa3 + 4073b1b
commit 5493df6
Show file tree

Hide file tree

Showing 3 changed files with 184 additions and 26 deletions.
diff --git a/deepchem/data/data_loader.py b/deepchem/data/data_loader.py
@@ -1297,15 +1297,49 @@ def _add_sequence(sequences: np.ndarray,
 
 
 class ImageLoader(DataLoader):
-    """Handles loading of image files.
+    """Creates `Dataset` objects from input image files.
 
     This class allows for loading of images in various formats.
     For user convenience, also accepts zip-files and directories
     of images and uses some limited intelligence to attempt to
     traverse subdirectories which contain images.
+
+    Currently, only .png and .tif files are supported. If the
+    inputs or labels are given as a list of files, the list must contain
+    only image files.
+
+    Examples
+    --------
+    Let's suppose we have some input image files in a zip folder.
+
+    >>> import os
+    >>> import deepchem as dc
+    >>> data_dir = dc.utils.get_data_dir()
+    >>> dataset_files = os.path.join(data_dir, "images.zip")
+    >>> labels = [1,2,3,4,5,6,7,8,9,10]
+
+    The label files can also be images similar to the inputs, in which case we
+    can provide a list of label files instead of a list of labels.
+
+    >>> label_files = os.path.join(data_dir, "labels.zip")
+
+    Continuing with the label files example, let's now write this to disk somewhere.
+    We can now use `ImageLoader` to process this Image dataset. We do not use a
+    featurizer here, hence the `UserDefinedFeaturizer` with an empty list.
+
+    >>> featurizer = dc.feat.UserDefinedFeaturizer([])
+    >>> loader = dc.data.ImageLoader(tasks=['demo-task'], sorting=False)
+    >>> dataset = loader.create_dataset(inputs=(dataset_files, label_files),
+    ...                                 in_memory=False)
+    >>> len(dataset)
+    10
+    >>> dataset.X.shape
+    (10, 256, 256)
+    >>> dataset.y.shape
+    (10, 256, 256)
     """
 
-    def __init__(self, tasks: Optional[List[str]] = None):
+    def __init__(self, tasks: Optional[List[str]] = None, sorting: bool = True):
         """Initialize image loader.
 
         At present, custom image featurizers aren't supported by this
@@ -1315,10 +1349,13 @@ def __init__(self, tasks: Optional[List[str]] = None):
         ----------
         tasks: List[str], optional (default None)
             List of task names for image labels.
+        sorting: bool, optional (default True)
+            Whether to sort image files by filename.
         """
         if tasks is None:
             tasks = []
         self.tasks = tasks
+        self.sorting = sorting
 
     def create_dataset(self,
                        inputs: Union[OneOrMany[str], Tuple[Any]],
@@ -1335,26 +1372,30 @@ def create_dataset(self,
               - filename
               - list of filenames
               - Tuple (list of filenames, labels)
+              - Tuple (list of filenames, list of label filenames)
               - Tuple (list of filenames, labels, weights)
+              - Tuple (list of filenames, list of label filenames, weights)
 
             Each file in a given list of filenames should either be of a supported
             image format (.png, .tif only for now) or of a compressed folder of
             image files (only .zip for now). If `labels` or `weights` are provided,
             they must correspond to the sorted order of all filenames provided, with
-            one label/weight per file.
+            one label/weight per file. Labels can be filenames too, in which case the
+            labels are loaded as images.
+
         data_dir: str, optional (default None)
             Directory to store featurized dataset.
         shard_size: int, optional (default 8192)
             Shard size when loading data.
         in_memory: bool, optioanl (default False)
-            If true, return in-memory NumpyDataset. Else return ImageDataset.
+            If true, return in-memory `NumpyDataset`. Else return `ImageDataset`.
 
         Returns
         -------
-        ImageDataset or NumpyDataset or DiskDataset
-            - if `in_memory == False`, the return value is ImageDataset.
-            - if `in_memory == True` and `data_dir is None`, the return value is NumpyDataset.
-            - if `in_memory == True` and `data_dir is not None`, the return value is DiskDataset.
+        `ImageDataset` or `NumpyDataset` or `DiskDataset`
+            - if `in_memory == False`, the return value is `ImageDataset`.
+            - if `in_memory == True` and `data_dir is None`, the return value is `NumpyDataset`.
+            - if `in_memory == True` and `data_dir is not None`, the return value is `DiskDataset`.
         """
         labels, weights = None, None
         if isinstance(inputs, tuple):
@@ -1380,7 +1421,6 @@ def create_dataset(self,
             for input_file in input_files:
                 filename, extension = os.path.splitext(input_file)
                 extension = extension.lower()
-                # TODO(rbharath): Add support for more extensions
                 if os.path.isdir(input_file):
                     dirfiles = [
                         os.path.join(input_file, subfile)
@@ -1408,30 +1448,94 @@ def create_dataset(self,
             input_files = remainder
 
         # Sort image files
-        image_files = sorted(image_files)
+        if self.sorting:
+            image_files = sorted(image_files)
+
+        if isinstance(labels, str):
+            label_files = [labels]
+        else:
+            label_files = []
+
+        label_image_files = []
+        # Sometimes zip files contain directories within. Traverse directories
+        while len(label_files) > 0:
+            remainder = []
+            for label_file in label_files:
+                filename, extension = os.path.splitext(label_file)
+                extension = extension.lower()
+                if os.path.isdir(label_file):
+                    dirfiles = [
+                        os.path.join(label_file, subfile)
+                        for subfile in os.listdir(label_file)
+                    ]
+                    remainder += dirfiles
+                elif extension == ".zip":
+                    zip_dir = tempfile.mkdtemp()
+                    zip_ref = zipfile.ZipFile(label_file, 'r')
+                    zip_ref.extractall(path=zip_dir)
+                    zip_ref.close()
+                    zip_files = [
+                        os.path.join(zip_dir, name)
+                        for name in zip_ref.namelist()
+                    ]
+                    for zip_file in zip_files:
+                        _, extension = os.path.splitext(zip_file)
+                        extension = extension.lower()
+                        if extension in [".png", ".tif"]:
+                            label_image_files.append(zip_file)
+                elif extension in [".png", ".tif"]:
+                    label_image_files.append(label_file)
+                else:
+                    raise ValueError("Unsupported file format")
+            label_files = remainder
+
+        # Sort label image files
+        if self.sorting:
+            label_image_files = sorted(label_image_files)
 
         if in_memory:
             if data_dir is None:
-                return NumpyDataset(load_image_files(image_files),
-                                    y=labels,
-                                    w=weights,
-                                    ids=image_files)
+                if isinstance(labels, str):
+                    return NumpyDataset(load_image_files(image_files),
+                                        y=load_image_files(label_image_files),
+                                        w=weights,
+                                        ids=image_files)
+                else:
+                    return NumpyDataset(load_image_files(image_files),
+                                        y=labels,
+                                        w=weights,
+                                        ids=image_files)
             else:
-                dataset = DiskDataset.from_numpy(load_image_files(image_files),
-                                                 y=labels,
-                                                 w=weights,
-                                                 ids=image_files,
-                                                 tasks=self.tasks,
-                                                 data_dir=data_dir)
+                if isinstance(labels, str):
+                    dataset = DiskDataset.from_numpy(
+                        load_image_files(image_files),
+                        y=load_image_files(label_image_files),
+                        w=weights,
+                        ids=image_files,
+                        tasks=self.tasks,
+                        data_dir=data_dir)
+                else:
+                    dataset = DiskDataset.from_numpy(
+                        load_image_files(image_files),
+                        y=labels,
+                        w=weights,
+                        ids=image_files,
+                        tasks=self.tasks,
+                        data_dir=data_dir)
                 if shard_size is not None:
                     dataset.reshard(shard_size)
                 return dataset
         else:
-            return ImageDataset(
-                image_files,
-                y=labels,  # type: ignore
-                w=weights,
-                ids=image_files)
+            if isinstance(labels, str):
+                return ImageDataset(image_files,
+                                    y=label_image_files,
+                                    w=weights,
+                                    ids=image_files)
+            else:
+                return ImageDataset(image_files,
+                                    y=labels,
+                                    w=weights,
+                                    ids=image_files)
 
 
 class InMemoryLoader(DataLoader):

diff --git a/deepchem/data/tests/test_image_loader.py b/deepchem/data/tests/test_image_loader.py
@@ -29,6 +29,23 @@ def setUp(self):
         self.face_copy_path = os.path.join(self.data_dir, "face_copy.png")
         Image.fromarray(self.face).save(self.face_copy_path)
 
+        # Create directory of multiple image files
+        self.order_path = os.path.join(self.data_dir, "order_check")
+        os.mkdir(self.order_path)
+        self.face_path = os.path.join(self.order_path, "face.png")
+        Image.fromarray(self.face).save(self.face_path)
+        self.face_copy_path = os.path.join(self.order_path, "face_copy.png")
+        Image.fromarray(self.face).save(self.face_copy_path)
+
+        # Zip directory of multiple image files
+        self.order_zip_path = os.path.join(self.data_dir, "order_check.zip")
+        with zipfile.ZipFile(self.order_zip_path, 'w') as zip_file:
+            for foldername, subfolders, filenames in os.walk(self.order_path):
+                for filename in filenames:
+                    file_path = os.path.join(foldername, filename)
+                    arcname = os.path.relpath(file_path, self.order_path)
+                    zip_file.write(file_path, arcname=arcname)
+
         # Create zip of image file
         self.zip_path = os.path.join(self.data_dir, "face.zip")
         zipf = zipfile.ZipFile(self.zip_path, "w", zipfile.ZIP_DEFLATED)
@@ -71,12 +88,34 @@ def test_png_simple_load_with_labels(self):
         assert dataset.X.shape == (1, 768, 1024, 3)
         assert (dataset.y == np.ones((1,))).all()
 
+    def test_png_simple_load_with_label_as_image(self):
+        loader = dc.data.ImageLoader()
+        dataset = loader.create_dataset((self.face_path, self.face_path))
+        # These are the known dimensions of face.png
+        assert dataset.X.shape == (1, 768, 1024, 3)
+        assert dataset.y.shape == (1, 768, 1024, 3)
+
     def test_tif_simple_load(self):
         loader = dc.data.ImageLoader()
         dataset = loader.create_dataset(self.tif_image_path)
         # TODO(rbharath): Where are the color channels?
         assert dataset.X.shape == (1, 44, 330)
 
+    def test_tif_simple_load_with_labels(self):
+        loader = dc.data.ImageLoader()
+        dataset = loader.create_dataset((self.tif_image_path, np.array(1)))
+        # These are the known dimensions of a_image.tif
+        assert dataset.X.shape == (1, 44, 330)
+        assert (dataset.y == np.ones((1,))).all()
+
+    def test_tif_simple_load_with_label_as_image(self):
+        loader = dc.data.ImageLoader()
+        dataset = loader.create_dataset(
+            (self.tif_image_path, self.tif_image_path))
+        # These are the known dimensions of a_image.tif
+        assert dataset.X.shape == (1, 44, 330)
+        assert dataset.y.shape == (1, 44, 330)
+
     def test_png_multi_load(self):
         loader = dc.data.ImageLoader()
         dataset = loader.create_dataset([self.face_path, self.face_copy_path])
@@ -102,3 +141,14 @@ def test_directory_load(self):
         loader = dc.data.ImageLoader()
         dataset = loader.create_dataset(self.image_dir)
         assert dataset.X.shape == (2, 768, 1024, 3)
+
+    def test_zip_order(self):
+        # Test that the order of the contents of an unzipped file is preserved.
+        # Load the zip file
+        loader = dc.data.ImageLoader()
+        dataset_dir = loader.create_dataset(self.order_path)
+        # Load multi_path directly
+        loader = dc.data.ImageLoader()
+        dataset_zipped = loader.create_dataset(self.order_zip_path)
+        # Check that the order of the files is the same
+        assert np.all(dataset_dir.X == dataset_zipped.X)
diff --git a/deepchem/utils/data_utils.py b/deepchem/utils/data_utils.py
@@ -211,7 +211,11 @@ def load_image_files(input_files: List[str]) -> np.ndarray:
             images.append(imarray)
         else:
             raise ValueError("Unsupported image filetype for %s" % input_file)
-    return np.array(images)
+    # dtype=object allows for arrays(images here) of arbitrary size
+    try:
+        return np.array(images)
+    except:
+        return np.array(images, dtype=object)
 
 
 def load_sdf_files(input_files: List[str],