From fcf67da6cc0a267ecc1b549aa58c84230528c530 Mon Sep 17 00:00:00 2001
From: Doug Branton
Date: Thu, 11 Jan 2024 14:58:56 -0800
Subject: [PATCH] address review comments

---
 src/tape/ensemble.py              | 53 +++++++++++++++-------------
 src/tape/ensemble_readers.py      | 17 ---------
 tests/tape_tests/conftest.py      | 57 ++++++++++++++++++++++++++++---
 tests/tape_tests/test_ensemble.py |  4 ++-
 4 files changed, 84 insertions(+), 47 deletions(-)

diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py
index 2deb5186..90bf1a7c 100644
--- a/src/tape/ensemble.py
+++ b/src/tape/ensemble.py
@@ -34,6 +34,8 @@
 
 DEFAULT_FRAME_LABEL = "result"  # A base default label for an Ensemble's result frames.
 
+METADATA_FILENAME = "ensemble_metadata.json"
+
 
 class Ensemble:
     """Ensemble object is a collection of light curve ids"""
@@ -1286,9 +1288,9 @@ def save_ensemble(self, path=".", dirname="ensemble", additional_frames=True, **
         # Determine the path
         ens_path = os.path.join(path, dirname)
 
-        # First look for an existing metadata.json file in the path
+        # First look for an existing metadata file in the path
         try:
-            with open(os.path.join(ens_path, "metadata.json"), "r") as oldfile:
+            with open(os.path.join(ens_path, METADATA_FILENAME), "r") as oldfile:
                 # Reading from json file
                 old_metadata = json.load(oldfile)
                 old_subdirs = old_metadata["subdirs"]
@@ -1302,7 +1304,7 @@ def save_ensemble(self, path=".", dirname="ensemble", additional_frames=True, **
         if additional_frames is True:
             frames_to_save = list(self.frames.keys())  # save all frames
         elif additional_frames is False:
-            frames_to_save = ["object", "source"]  # save just object and source
+            frames_to_save = [OBJECT_FRAME_LABEL, SOURCE_FRAME_LABEL]  # save just object and source
         elif isinstance(additional_frames, Iterable):
             frames_to_save = set(additional_frames)
             invalid_frames = frames_to_save.difference(set(self.frames.keys()))
@@ -1314,14 +1316,14 @@ def save_ensemble(self, path=".", dirname="ensemble", additional_frames=True, **
             frames_to_save = list(frames_to_save)
 
             # Make sure object and source are in the frame list
-            if "object" not in frames_to_save:
-                frames_to_save.append("object")
-            if "source" not in frames_to_save:
-                frames_to_save.append("source")
+            if OBJECT_FRAME_LABEL not in frames_to_save:
+                frames_to_save.append(OBJECT_FRAME_LABEL)
+            if SOURCE_FRAME_LABEL not in frames_to_save:
+                frames_to_save.append(SOURCE_FRAME_LABEL)
         else:
             raise ValueError("Invalid input to `additional_frames`, must be boolean or list-like")
 
-        # Save the frame list to disk
+        # Generate the metadata first
         created_subdirs = []  # track the list of created subdirectories
         divisions_known = []  # log whether divisions were known for each frame
         for frame_label in frames_to_save:
@@ -1331,17 +1333,14 @@ def save_ensemble(self, path=".", dirname="ensemble", additional_frames=True, **
             # When the frame has no columns, avoid the save as parquet doesn't handle it
             # Most commonly this applies to the object table when it's built from source
             if len(frame.columns) == 0:
-                print(f"Frame: {frame_label} was not saved as no columns were present.")
+                print(f"Frame: {frame_label} will not be saved as no columns are present.")
                 continue
 
-            # creates a subdirectory for the frame partition files
-            frame.to_parquet(os.path.join(ens_path, frame_label), write_metadata_file=True, **kwargs)
             created_subdirs.append(frame_label)
             divisions_known.append(frame.known_divisions)
 
         # Save a metadata file
         col_map = self.make_column_map()  # grab the current column_mapper
-
         metadata = {
             "subdirs": created_subdirs,
             "known_divisions": divisions_known,
@@ -1349,10 +1348,14 @@ def save_ensemble(self, path=".", dirname="ensemble", additional_frames=True, **
         }
         json_metadata = json.dumps(metadata, indent=4)
 
-        with open(os.path.join(ens_path, "metadata.json"), "w") as outfile:
+        # Make the directory if it doesn't already exist
+        os.makedirs(ens_path, exist_ok=True)
+        with open(os.path.join(ens_path, METADATA_FILENAME), "w") as outfile:
             outfile.write(json_metadata)
 
-        # np.save(os.path.join(ens_path, "column_mapper.npy"), col_map.map)
+        # Now write out the frames to subdirectories
+        for subdir in created_subdirs:
+            self.frames[subdir].to_parquet(os.path.join(ens_path, subdir), write_metadata_file=True, **kwargs)
 
         print(f"Saved to {os.path.join(path, dirname)}")
 
@@ -1390,8 +1393,8 @@ def from_ensemble(
             The ensemble object.
         """
 
-        # Read in the metadata.json file
-        with open(os.path.join(dirpath, "metadata.json"), "r") as metadatafile:
+        # Read in the metadata file
+        with open(os.path.join(dirpath, METADATA_FILENAME), "r") as metadatafile:
             # Reading from json file
             metadata = json.load(metadatafile)
 
@@ -1405,16 +1408,16 @@ def from_ensemble(
 
 
         # Load Object and Source
         # Check for whether or not object is present, it's not saved when no columns are present
-        if "object" in subdirs:
+        if OBJECT_FRAME_LABEL in subdirs:
             # divisions should be known for both tables to use the sorted kwarg
             use_sorted = (
-                frame_known_divisions[subdirs.index("object")]
-                and frame_known_divisions[subdirs.index("source")]
+                frame_known_divisions[subdirs.index(OBJECT_FRAME_LABEL)]
+                and frame_known_divisions[subdirs.index(SOURCE_FRAME_LABEL)]
             )
             self.from_parquet(
-                os.path.join(dirpath, "source"),
-                os.path.join(dirpath, "object"),
+                os.path.join(dirpath, SOURCE_FRAME_LABEL),
+                os.path.join(dirpath, OBJECT_FRAME_LABEL),
                 column_mapper=column_mapper,
                 sorted=use_sorted,
                 sort=False,
@@ -1422,9 +1425,9 @@ def from_ensemble(
                 **kwargs,
             )
         else:
-            use_sorted = frame_known_divisions[subdirs.index("source")]
+            use_sorted = frame_known_divisions[subdirs.index(SOURCE_FRAME_LABEL)]
             self.from_parquet(
-                os.path.join(dirpath, "source"),
+                os.path.join(dirpath, SOURCE_FRAME_LABEL),
                 column_mapper=column_mapper,
                 sorted=use_sorted,
                 sort=False,
@@ -1446,7 +1449,9 @@ def from_ensemble(
 
         # Filter out object and source from additional frames
         frames_to_load = [
-            frame for frame in frames_to_load if os.path.split(frame)[1] not in ["object", "source"]
+            frame
+            for frame in frames_to_load
+            if os.path.split(frame)[1] not in [OBJECT_FRAME_LABEL, SOURCE_FRAME_LABEL]
         ]
         if len(frames_to_load) > 0:
            for frame in frames_to_load:
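For orientation, here is a minimal round-trip sketch of the reordered save path: with this patch, `save_ensemble` collects the metadata first, creates the directory, writes `ensemble_metadata.json`, and only then writes the per-frame parquet subdirectories. The input file name and the `ColumnMapper` import path are illustrative assumptions, not taken from the patch.

```python
from tape import Ensemble
from tape.utils import ColumnMapper  # import path assumed from TAPE's test usage

# Build a small ensemble from a (hypothetical) source parquet file
colmap = ColumnMapper(
    id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band"
)
ens = Ensemble(client=False)  # assumed: client=False skips creating a dask client
ens.from_parquet("my_source.parquet", column_mapper=colmap)  # hypothetical input

# After this patch, ensemble_metadata.json is written before any frame data,
# and the frames recorded in it are then saved to per-frame subdirectories.
ens.save_ensemble(path=".", dirname="ensemble", additional_frames=False)

# Reloading reads ensemble_metadata.json to discover the saved subdirectories
# and whether divisions were known for each frame.
ens2 = Ensemble(client=False)
ens2.from_ensemble("./ensemble")
```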
diff --git a/src/tape/ensemble_readers.py b/src/tape/ensemble_readers.py
index 55b93e26..91f9add2 100644
--- a/src/tape/ensemble_readers.py
+++ b/src/tape/ensemble_readers.py
@@ -15,8 +15,6 @@ def read_ensemble(
     additional_frames=True,
     column_mapper=None,
     dask_client=True,
-    additional_cols=True,
-    partition_size=None,
     **kwargs,
 ):
     """Load an ensemble from an on-disk ensemble.
@@ -37,19 +35,6 @@ def read_ensemble(
         Supplies a ColumnMapper to the Ensemble, if None (default) searches
         for a column_mapper.npy file in the directory, which should be
         created when the ensemble is saved.
-    additional_cols: 'bool', optional
-        Boolean to indicate whether to carry in columns beyond the
-        critical columns, true will, while false will only load the columns
-        containing the critical quantities (id,time,flux,err,band)
-    partition_size: `int`, optional
-        If specified, attempts to repartition the ensemble to partitions
-        of size `partition_size`.
-    sorted: bool, optional
-        If the index column is already sorted in increasing order.
-        Defaults to False
-    sort: `bool`, optional
-        If True, sorts the DataFrame by the id column. Otherwise set the
-        index on the individual existing partitions. Defaults to False.
     dask_client: `dask.distributed.client` or `bool`, optional
         Accepts an existing `dask.distributed.Client`, or creates one if
         `client=True`, passing any additional kwargs to a
@@ -68,8 +53,6 @@ def read_ensemble(
         dirpath,
         additional_frames=additional_frames,
         column_mapper=column_mapper,
-        additional_cols=additional_cols,
-        partition_size=partition_size,
         **kwargs,
     )
 
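A sketch of calling the slimmed-down reader after this change; the directory path is hypothetical, and anything beyond the remaining named parameters now travels through `**kwargs` to `Ensemble.from_ensemble`.

```python
from tape.ensemble_readers import read_ensemble

# additional_cols and partition_size are no longer named parameters here;
# any remaining options are forwarded via **kwargs to Ensemble.from_ensemble.
ens = read_ensemble(
    "./ensemble",            # hypothetical saved-ensemble directory
    additional_frames=True,  # load every saved frame, not just object/source
    column_mapper=None,      # fall back to the mapping stored at save time
    dask_client=False,       # assumed: skip creating a dask.distributed.Client
)
```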
diff --git a/tests/tape_tests/conftest.py b/tests/tape_tests/conftest.py
index c0af84c3..ec6b4521 100644
--- a/tests/tape_tests/conftest.py
+++ b/tests/tape_tests/conftest.py
@@ -233,6 +233,7 @@ def parquet_ensemble_without_client():
 
     return ens
 
+
 @pytest.fixture
 def parquet_files_and_ensemble_without_client():
     """Create an Ensemble from parquet data without a dask client."""
@@ -246,12 +247,10 @@ def parquet_files_and_ensemble_without_client():
         err_col="psFluxErr",
         band_col="filterName",
     )
-    ens = ens.from_parquet(
-        source_file,
-        object_file,
-        column_mapper=colmap)
+    ens = ens.from_parquet(source_file, object_file, column_mapper=colmap)
     return ens, source_file, object_file, colmap
 
+
 # pylint: disable=redefined-outer-name
 @pytest.fixture
 def parquet_ensemble(dask_client):
@@ -270,6 +269,25 @@ def parquet_ensemble(dask_client):
 
     return ens
 
+
+# pylint: disable=redefined-outer-name
+@pytest.fixture
+def parquet_ensemble_partition_size(dask_client):
+    """Create an Ensemble from parquet data with a specified partition size."""
+    ens = Ensemble(client=dask_client)
+    ens.from_parquet(
+        "tests/tape_tests/data/source/test_source.parquet",
+        "tests/tape_tests/data/object/test_object.parquet",
+        id_col="ps1_objid",
+        time_col="midPointTai",
+        band_col="filterName",
+        flux_col="psFlux",
+        err_col="psFluxErr",
+        partition_size="1MB",
+    )
+
+    return ens
+
 # pylint: disable=redefined-outer-name
 @pytest.fixture
 def parquet_ensemble_with_divisions(dask_client):
@@ -386,6 +404,34 @@ def dask_dataframe_ensemble(dask_client):
 
     return ens
 
+
+# pylint: disable=redefined-outer-name
+@pytest.fixture
+def dask_dataframe_ensemble_partition_size(dask_client):
+    """Create an Ensemble from a Dask dataframe with a specified partition size."""
+    ens = Ensemble(client=dask_client)
+
+    num_points = 1000
+    all_bands = np.array(["r", "g", "b", "i"])
+    rows = {
+        "id": 8000 + (np.arange(num_points) % 5),
+        "time": np.arange(num_points),
+        "flux": np.arange(num_points) % len(all_bands),
+        "band": np.repeat(all_bands, num_points // len(all_bands)),
+        "err": 0.1 * (np.arange(num_points) % 10),
+        "count": np.arange(num_points),
+        "something_else": np.full(num_points, None),
+    }
+    cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
+
+    ens.from_dask_dataframe(
+        source_frame=dd.from_dict(rows, npartitions=1),
+        column_mapper=cmap,
+        partition_size="1MB",
+    )
+
+    return ens
+
 # pylint: disable=redefined-outer-name
 @pytest.fixture
 def dask_dataframe_with_object_ensemble(dask_client):
@@ -490,6 +536,7 @@ def pandas_with_object_ensemble(dask_client):
 
     return ens
 
+
 # pylint: disable=redefined-outer-name
 @pytest.fixture
 def ensemble_from_source_dict(dask_client):
@@ -511,4 +558,4 @@ def ensemble_from_source_dict(dask_client):
     cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="error", band_col="band")
     ens.from_source_dict(source_dict, column_mapper=cmap)
 
-    return ens, source_dict
\ No newline at end of file
+    return ens, source_dict
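The two new `*_partition_size` fixtures exercise the `partition_size` argument of the loaders, which maps onto Dask's size-based repartitioning. A standalone sketch of that mechanism, with illustrative data that is not from the test suite:

```python
import numpy as np
import pandas as pd
import dask.dataframe as dd

# Illustrative data, sized so a 1MB target should yield multiple partitions
df = pd.DataFrame({"id": np.arange(100_000), "flux": np.random.rand(100_000)})
ddf = dd.from_pandas(df, npartitions=1)

# Size-based repartitioning: this is what partition_size="1MB" requests when
# passed through from_parquet / from_dask_dataframe in the fixtures above.
ddf = ddf.repartition(partition_size="1MB")
print(ddf.npartitions)  # expected to be > 1 for this much data
```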
diff --git a/tests/tape_tests/test_ensemble.py b/tests/tape_tests/test_ensemble.py
index 7fc52c2e..c2a8945c 100644
--- a/tests/tape_tests/test_ensemble.py
+++ b/tests/tape_tests/test_ensemble.py
@@ -50,6 +50,7 @@ def test_with_client():
         "parquet_ensemble_from_hipscat",
         "parquet_ensemble_with_column_mapper",
         "parquet_ensemble_with_known_column_mapper",
+        "parquet_ensemble_partition_size",
         "read_parquet_ensemble",
         "read_parquet_ensemble_without_client",
         "read_parquet_ensemble_from_source",
@@ -102,6 +103,7 @@ def test_parquet_construction(data_fixture, request):
     "data_fixture",
     [
         "dask_dataframe_ensemble",
+        "dask_dataframe_ensemble_partition_size",
         "dask_dataframe_with_object_ensemble",
         "pandas_ensemble",
         "pandas_with_object_ensemble",
@@ -533,7 +535,7 @@ def test_save_and_load_ensemble(dask_client, tmp_path, add_frames, obj_nocols, use_client):
     dircontents = os.listdir(os.path.join(save_path, "ensemble"))
 
     assert "source" in dircontents  # Source should always be there
-    assert "metadata.json" in dircontents  # should make a metadata file
+    assert "ensemble_metadata.json" in dircontents  # should make a metadata file
     if obj_nocols:  # object shouldn't if it was empty
         assert "object" not in dircontents
     else:  # otherwise it should be present
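The new fixture names slot into the existing parametrized lists because the construction tests resolve fixtures by name at runtime. A minimal illustration of that pytest idiom (the test body here is a placeholder, not code from the suite):

```python
import pytest

@pytest.mark.parametrize(
    "data_fixture",
    ["parquet_ensemble", "parquet_ensemble_partition_size"],
)
def test_fixture_lookup(data_fixture, request):
    # request.getfixturevalue resolves the fixture named by the parameter,
    # so adding a fixture name to the parametrize list is all that is needed
    # for it to be exercised by the shared test body.
    ens = request.getfixturevalue(data_fixture)
    assert ens is not None
```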