HazyResearch · seyuboglu · May 8, 2022 · May 8, 2022 · May 8, 2022
diff --git a/meerkat/block/manager.py b/meerkat/block/manager.py
@@ -78,7 +78,7 @@ def apply(self, method_name: str = "_get", *args, **kwargs) -> BlockManager:
             results.reorder(self.keys())
         return results
 
-    def consolidate(self):
+    def consolidate(self, consolidate_unitary_groups: bool = False):
         column_order = list(
             self._columns.keys()
         )  # need to maintain order after consolidate
@@ -88,7 +88,7 @@ def consolidate(self):
             block_ref_groups[block_ref.block.signature].append(block_ref)
 
         for block_refs in block_ref_groups.values():
-            if len(block_refs) == 1:
+            if (not consolidate_unitary_groups) and len(block_refs) == 1:
                 # if there is only one block ref in the group, do not consolidate
                 continue
 
@@ -241,7 +241,9 @@ def write(self, path: str):
         os.makedirs(columns_dir)
 
         # consolidate before writing
-        self.consolidate()
+        # we also want to consolidate unitary groups (i.e. groups with only one block
+        # ref) so that we don't write any data not actually in the dataframe
+        self.consolidate(consolidate_unitary_groups=True)
         for block_id, block_ref in self._block_refs.items():
             block: AbstractBlock = block_ref.block
             block_dir = os.path.join(blocks_dir, str(block_id))

diff --git a/meerkat/columns/audio_column.py b/meerkat/columns/audio_column.py
@@ -10,11 +10,12 @@
 
 
 class AudioColumn(FileColumn):
-    """A lambda column where each cell represents an audio file on disk. The underlying
-    data is a `PandasSeriesColumn` of strings, where each string is the path to an
-    image. The column materializes the images into memory when indexed. If the column
-    is lazy indexed with the ``lz`` indexer, the images are not materialized and an
-    ``FileCell`` or an ``AudioColumn`` is returned instead.
+    """A lambda column where each cell represents an audio file on disk. The
+    underlying data is a `PandasSeriesColumn` of strings, where each string is
+    the path to an audio file. The column materializes the audio into memory
+    when indexed. If the column is lazy indexed with the ``lz`` indexer, the
+    audio is not materialized and a ``FileCell`` or an ``AudioColumn`` is
+    returned instead.
 
     Args:
         data (Sequence[str]): A list of filepaths to images.

diff --git a/meerkat/columns/file_column.py b/meerkat/columns/file_column.py
@@ -68,11 +68,12 @@ def __repr__(self):
 
 
 class FileColumn(FileLoaderMixin, LambdaColumn):
-    """A column where each cell represents an file stored on disk or the web. The
-    underlying data is a `PandasSeriesColumn` of strings, where each string is the path
-    to a file. The column materializes the files into memory when indexed. If the column
-    is lazy indexed with the ``lz`` indexer, the files are not materialized and a
-    ``FileCell`` or a ``FileColumn`` is returned instead.
+    """A column where each cell represents a file stored on disk or the web.
+    The underlying data is a `PandasSeriesColumn` of strings, where each string
+    is the path to a file. The column materializes the files into memory when
+    indexed. If the column is lazy indexed with the ``lz`` indexer, the files
+    are not materialized and a ``FileCell`` or a ``FileColumn`` is returned
+    instead.
 
     Args:
         data (Sequence[str]): A list of filepaths to images.
@@ -93,8 +94,6 @@ class FileColumn(FileLoaderMixin, LambdaColumn):
 
         base_dir (str): A base directory that the paths in ``data`` are relative to. If
             ``None``, the paths are assumed to be absolute.
-
-
     """
 
     def __init__(

diff --git a/meerkat/columns/image_column.py b/meerkat/columns/image_column.py
@@ -13,11 +13,12 @@
 
 
 class ImageColumn(FileColumn):
-    """A column where each cell represents an image stored on disk. The underlying data
-    is a `PandasSeriesColumn` of strings, where each string is the path to an image.
-    The column materializes the images into memory when indexed. If the column is
-    lazy indexed with the ``lz`` indexer, the images are not materialized and an
-    ``ImageCell`` or an ``ImageColumn`` is returned instead.
+    """A column where each cell represents an image stored on disk. The
+    underlying data is a `PandasSeriesColumn` of strings, where each string is
+    the path to an image. The column materializes the images into memory when
+    indexed. If the column is lazy indexed with the ``lz`` indexer, the images
+    are not materialized and an ``ImageCell`` or an ``ImageColumn`` is returned
+    instead.
 
     Args:
         data (Sequence[str]): A list of filepaths to images.
@@ -38,8 +39,6 @@ class ImageColumn(FileColumn):
 
         base_dir (str): A base directory that the paths in ``data`` are relative to. If
             ``None``, the paths are assumed to be absolute.
-
-
     """
 
     @staticmethod

diff --git a/meerkat/contrib/audioset.py b/meerkat/contrib/audioset.py
@@ -11,12 +11,11 @@ def build_audioset_dp(
     audio_column: bool = True,
     overwrite: bool = False,
 ) -> Dict[str, mk.DataPanel]:
-    """
-    Build DataPanels for the audioset dataset downloaded to ``dataset_dir``. By
-    default, the resulting DataPanels will be written to ``dataset_dir`` under the
-    filenames "audioset_examples.mk" and "audioset_labels.mk". If these files already
-    exist and ``overwrite`` is False, the DataPanels will not be built anew, and instead
-    will be simply loaded from disk.
+    """Build DataPanels for the audioset dataset downloaded to ``dataset_dir``.
+    By default, the resulting DataPanels will be written to ``dataset_dir``
+    under the filenames "audioset_examples.mk" and "audioset_labels.mk". If
+    these files already exist and ``overwrite`` is False, the DataPanels will
+    not be built anew, and instead will be simply loaded from disk.
 
     Args:
         dataset_dir: The directory where the dataset is stored
@@ -99,8 +98,7 @@ def build_audioset_dp(
 
 
 def build_ontology_dp(dataset_dir: str) -> Dict[str, mk.DataPanel]:
-    """
-    Build a DataPanel from the ontology.json file
+    """Build a DataPanel from the ontology.json file.
 
     Args:
         dataset_dir: The directory where the ontology.json file is stored
@@ -124,8 +122,7 @@ def find_submids(
     relations: mk.DataPanel = None,
     dataset_dir: str = None,
 ) -> List[str]:
-    """
-    Returns a list of IDs of all subcategories of an audio category
+    """Returns a list of IDs of all subcategories of an audio category.
 
     Args:
         ids: ID or list of IDs for which to find the subcategories

diff --git a/meerkat/contrib/inaturalist.py b/meerkat/contrib/inaturalist.py
@@ -23,8 +23,7 @@
 def build_inaturalist_dp(
     dataset_dir: str, download: bool = True, splits: List[str] = None
 ) -> mk.DataPanel:
-    """
-    Build a DataPanel from the inaturalist dataset.
+    """Build a DataPanel from the inaturalist dataset.
 
     Args:
         dataset_dir: The directory to store the dataset in.

diff --git a/meerkat/mixins/cloneable.py b/meerkat/mixins/cloneable.py
@@ -19,7 +19,7 @@ def __init__(self, *args, **kwargs):
 
     @classmethod
     def _state_keys(cls) -> set:
-        """ """
+        """"""
         raise NotImplementedError()
 
     @classmethod

diff --git a/meerkat/tools/utils.py b/meerkat/tools/utils.py
@@ -6,8 +6,9 @@
 
 
 class MeerkatLoader(yaml.FullLoader):
-    """PyYaml does not load unimported modules for safety reasons. We want to allow
-    importing only meerkat modules
+    """PyYaml does not load unimported modules for safety reasons.
+
+    We want to allow importing only meerkat modules.
     """
 
     def find_python_module(self, name: str, mark, unsafe=False):