performance improvements for saving / loading .hdf5 #451

Merged Aug 5, 2022 · 2 commits
3 changes: 2 additions & 1 deletion requirements/dev.txt
@@ -10,4 +10,5 @@ pylint
tox
pytest
pytest-timeout
gdspy
gdspy
memory_profiler
122 changes: 122 additions & 0 deletions tests/test_data_performance.py
@@ -0,0 +1,122 @@
import pytest
import numpy as np
import os
import sys
from memory_profiler import profile

from tidy3d.components.data import SimulationData, FieldData, ScalarFieldDataArray
from tidy3d.components.monitor import FieldMonitor
from tidy3d.components.simulation import Simulation
from tidy3d.components.source import PointDipole, GaussianPulse
from tidy3d.components.grid import GridSpec

import tidy3d as td

sys.path.append("/users/twhughes/Documents/Flexcompute/tidy3d-core")
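# NOTE: Profile is a profiling context manager from the private tidy3d-core backend;
# the path above is developer-specific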
from tidy3d_backend.utils import Profile

PATH = "tests/tmp/memory.hdf5"

""" Testing the memory usage of writing SimulationData to and from .hdf5 file.

pip install memory_profiler
python -m memory_profiler tests/test_data_memory.py

note, units are in MiB, so need to convert to MB / GB.

https://www.thecalculatorsite.com/conversions/datastorage.php

"""

# will set size of sim_data1 to give a file size of this many GB
FILE_SIZE_GB = 4.0


def make_sim_data_1(file_size_gb=FILE_SIZE_GB):
# approximate # of points in the scalar field data
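# (each point is complex128, i.e. 16 bytes, so 2.528e8 points is about 4.04 GB;
# the fourth root below sets the length of each of the 4 dims x, y, z, f)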

N = int(2.528e8 / 4 * file_size_gb)

n = int(N ** (0.25))

data = (1 + 1j) * np.random.random((n, n, n, n))
x = np.linspace(-1, 1, n)
y = np.linspace(-1, 1, n)
z = np.linspace(-1, 1, n)
f = np.linspace(2e14, 4e14, n)
src = PointDipole(
center=(0, 0, 0), source_time=GaussianPulse(freq0=3e14, fwidth=1e14), polarization="Ex"
)
coords = dict(x=x, y=y, z=z, f=f)
Ex = ScalarFieldDataArray(data, coords=coords)
monitor = FieldMonitor(size=(2, 2, 2), freqs=f, name="test", fields=["Ex"])
field_data = FieldData(monitor=monitor, Ex=Ex)
sim = Simulation(
size=(2, 2, 2),
grid_spec=GridSpec(wavelength=1),
monitors=(monitor,),
sources=(src,),
run_time=1e-12,
)
return SimulationData(
simulation=sim,
monitor_data=dict(test=field_data),
)


SIM_DATA_1 = make_sim_data_1()


@profile
def test_memory_1_save():
print(f'sim_data_size = {SIM_DATA_1.monitor_data["test"].Ex.nbytes:.2e} Bytes')
SIM_DATA_1.to_file(PATH)
print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")


@profile
def test_memory_2_load():
print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")
return SimulationData.from_file(PATH)


def test_core_profile_small_1_save():

Nx, Ny, Nz, Nt = 100, 100, 100, 10

x = np.arange(Nx)
y = np.arange(Ny)
z = np.arange(Nz)
t = np.arange(Nt)
coords = dict(x=x, y=y, z=z, t=t)
scalar_field = td.ScalarFieldTimeDataArray(np.random.random((Nx, Ny, Nz, Nt)), coords=coords)
monitor = td.FieldTimeMonitor(size=(2, 4, 6), interval=100, name="field", fields=["Ex", "Hz"])
data = td.FieldTimeData(monitor=monitor, Ex=scalar_field, Hz=scalar_field)
with Profile():
data.to_file(PATH)
print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")


def test_core_profile_small_2_load():

with Profile():
print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")
data = td.FieldTimeData.from_file(PATH)


def test_core_profile_large():

sim_data = make_sim_data_1()

with Profile():
sim_data.to_file(PATH)

print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")

with Profile():
sim_data.from_file(PATH)


if __name__ == "__main__":
test_memory_1_save()
sim_data1 = test_memory_2_load()
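For context, here is a minimal, self-contained sketch of the memory_profiler pattern these tests rely on (illustrative only; the array shape and the rough MiB figure are assumptions, not measurements from this PR):

```python
# Minimal sketch of the memory_profiler usage pattern (not part of the PR).
import numpy as np
from memory_profiler import profile


@profile
def allocate():
    # 1000 x 1000 float64 array: 8e6 bytes, i.e. roughly 7.6 MiB
    arr = np.random.random((1000, 1000))
    # converting to nested Python lists typically costs several times more
    # memory than the raw array; this is the overhead the PR avoids on save/load
    as_list = arr.tolist()
    return len(as_list)


if __name__ == "__main__":
    allocate()  # prints a line-by-line memory report to stdout
```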
35 changes: 27 additions & 8 deletions tidy3d/components/base.py
@@ -298,13 +298,17 @@ def to_hdf5(self, fname: str) -> None:
"""

@staticmethod
def unpack_dataset(dataset: h5py.Dataset) -> Any: # pylint:disable=too-many-return-statements
def unpack_dataset( # pylint:disable=too-many-return-statements
dataset: h5py.Dataset, keep_numpy: bool = False
) -> Any:
"""Gets the value contained in a dataset in a form ready to insert into final dict.

Parameters
----------
dataset : h5py.Dataset
The raw value coming from the dataset, which needs to be decoded.
keep_numpy : bool = False
Whether to load a ``np.ndarray`` as-is or convert it to a list.

Returns
-------
@@ -320,7 +324,9 @@ def unpack_dataset(dataset: h5py.Dataset) -> Any:  # pylint:disable=too-many-return-statements
return [val.decode("utf-8") for val in value]
if value.dtype == bool:
return value.astype(bool)
return value.tolist()
if not keep_numpy:
return value.tolist()
return value

# decoding special types
if isinstance(value, np.bool_):
@@ -349,18 +355,28 @@ def load_from_handle(cls, hdf5_group: h5py.Group, **kwargs) -> Tidy3dBaseModel:
return cls.parse_obj(data_dict, **kwargs)

@classmethod
def _load_group_data(cls, data_dict: dict, hdf5_group: h5py.Group) -> dict:
def _load_group_data(
cls, data_dict: dict, hdf5_group: h5py.Group, keep_numpy: bool = False
) -> dict:
"""Recusively load the data from the group with dataset unpacking as base case."""

if "keep_numpy" in hdf5_group:
keep_numpy = hdf5_group["keep_numpy"]

for key, value in hdf5_group.items():

if key == "keep_numpy":
continue

# recursive case: try to load the group into data_dict[key]
if isinstance(value, h5py.Group):
data_dict[key] = cls._load_group_data(data_dict={}, hdf5_group=value)
data_dict[key] = cls._load_group_data(
data_dict={}, hdf5_group=value, keep_numpy=keep_numpy
)

# base case, unpack the value in the dataset
elif isinstance(value, h5py.Dataset):
data_dict[key] = cls.unpack_dataset(value)
data_dict[key] = cls.unpack_dataset(value, keep_numpy=keep_numpy)

if any("TUPLE_ELEMENT_" in key for key in data_dict.keys()):
return tuple(data_dict.values())
@@ -395,8 +411,10 @@ def pack_dataset(hdf5_group: h5py.Group, key: str, value: Any) -> None:
return
if isinstance(value, str):
value = value.encode("utf-8")
elif isinstance(value, bool):
value = np.array(value)

# numpy array containing strings (usually direction=['-','+'])
elif isinstance(value, np.ndarray) and (value.dtype == "<U1"):
value = value.tolist()

_ = hdf5_group.create_dataset(name=key, data=value)

@@ -421,7 +439,8 @@ def _save_group_data(self, data_dict: dict, hdf5_group: h5py.Group) -> None:
key = "_".join((str(i) for i in key))

if isinstance(value, xr.DataArray):
value = value.to_dict()
coords = {key: np.array(val) for key, val in value.coords.items()}
value = dict(data=value.data, coords=coords, keep_numpy=True)

# if a tuple of dicts, convert to a dict with special "TUPLE_ELEMENT_" keys
elif isinstance(value, tuple) and any(isinstance(val, dict) for val in value):
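Taken together, these base.py changes write xarray data to HDF5 as raw numpy arrays rather than nested lists (the old `DataArray.to_dict()` path), and store a `keep_numpy` flag next to the data so the loader can return numpy arrays directly instead of calling `.tolist()`. A minimal sketch of that flag mechanism in plain h5py (hypothetical helper names; not tidy3d's actual API):

```python
# Sketch of the "keep_numpy" flag round-trip, assuming plain h5py.
import h5py
import numpy as np


def save_array_group(fname: str, data: np.ndarray) -> None:
    with h5py.File(fname, "w") as f:
        group = f.create_group("field")
        group.create_dataset("keep_numpy", data=True)  # flag read back on load
        group.create_dataset("data", data=data)


def load_array_group(fname: str):
    with h5py.File(fname, "r") as f:
        group = f["field"]
        keep_numpy = bool(group["keep_numpy"][()])
        value = group["data"][()]  # h5py returns an np.ndarray
        # only fall back to the slow list conversion when the flag is absent/false
        return value if keep_numpy else value.tolist()


save_array_group("tmp.hdf5", np.random.random((10, 10)))
loaded = load_array_group("tmp.hdf5")  # np.ndarray, no tolist() round-trip
```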
19 changes: 15 additions & 4 deletions tidy3d/components/data/data_array.py
@@ -78,8 +78,13 @@ def validate(cls, value):
if isinstance(value, dict):
data = value.get("data")
coords = value.get("coords")
coords = {name: np.array(val.get("data")) for name, val in coords.items()}
return cls(np.array(data), coords=coords)

# convert to numpy if not already
coords = {k: v if isinstance(v, np.ndarray) else np.array(v) for k, v in coords.items()}
if not isinstance(data, np.ndarray):
data = np.array(data)

return cls(data, coords=coords)

return cls(value)

@@ -90,8 +95,14 @@ def __modify_schema__(cls, field_schema):

def __eq__(self, other) -> bool:
"""Whether two data array objects are equal."""

return self.to_dict() == other.to_dict()
if not np.all(self.data == other.data):
return False
for key, val in self.coords.items():
if not np.all(np.array(val) == np.array(other.coords[key])):
return False
return True

# return self.to_dict() == other.to_dict()


class ScalarFieldDataArray(DataArray):
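And for the data_array.py change: the new `__eq__` compares the underlying numpy data and each coordinate array element-wise instead of serializing both objects through `to_dict()`. A rough usage sketch (coordinate names follow the test file above; treat this as illustrative, not a canonical example):

```python
import numpy as np
import tidy3d as td

coords = dict(x=[0.0, 1.0], y=[0.0, 1.0], z=[0.0], f=[2e14])
values = (1 + 1j) * np.random.random((2, 2, 1, 1))

a = td.ScalarFieldDataArray(values, coords=coords)
b = td.ScalarFieldDataArray(values.copy(), coords=coords)

assert a == b  # element-wise data/coords comparison; no to_dict() round-trip
```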