performance improvements for saving / loading .hdf5 #451

Merged Aug 5, 2022 · 2 commits
3 changes: 2 additions & 1 deletion requirements/dev.txt
@@ -10,4 +10,5 @@ pylint
tox
pytest
pytest-timeout
gdspy
gdspy
memory_profiler
122 changes: 122 additions & 0 deletions tests/test_data_performance.py
@@ -0,0 +1,122 @@
import pytest
import numpy as np
import os
import sys
from memory_profiler import profile

from tidy3d.components.data import SimulationData, FieldData, ScalarFieldDataArray
from tidy3d.components.monitor import FieldMonitor
from tidy3d.components.simulation import Simulation
from tidy3d.components.source import PointDipole, GaussianPulse
from tidy3d.components.grid import GridSpec

import tidy3d as td

sys.path.append("/users/twhughes/Documents/Flexcompute/tidy3d-core")
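# NOTE: Profile is a profiling context manager from the private tidy3d-core backend;
# the path above is developer-specific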
from tidy3d_backend.utils import Profile

PATH = "tests/tmp/memory.hdf5"

""" Testing the memory usage of writing SimulationData to and from .hdf5 file.

pip install memory_profiler
python -m memory_profiler tests/test_data_memory.py

note, units are in MiB, so need to convert to MB / GB.

https://www.thecalculatorsite.com/conversions/datastorage.php

"""

# will set size of sim_data1 to give a file size of this many GB
FILE_SIZE_GB = 4.0


def make_sim_data_1(file_size_gb=FILE_SIZE_GB):
# approximate # of points in the scalar field data
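# (each point is complex128, i.e. 16 bytes, so 2.528e8 points is about 4.04 GB;
# the fourth root below sets the length of each of the 4 dims x, y, z, f)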

N = int(2.528e8 / 4 * file_size_gb)

n = int(N ** (0.25))

data = (1 + 1j) * np.random.random((n, n, n, n))
x = np.linspace(-1, 1, n)
y = np.linspace(-1, 1, n)
z = np.linspace(-1, 1, n)
f = np.linspace(2e14, 4e14, n)
src = PointDipole(
center=(0, 0, 0), source_time=GaussianPulse(freq0=3e14, fwidth=1e14), polarization="Ex"
)
coords = dict(x=x, y=y, z=z, f=f)
Ex = ScalarFieldDataArray(data, coords=coords)
monitor = FieldMonitor(size=(2, 2, 2), freqs=f, name="test", fields=["Ex"])
field_data = FieldData(monitor=monitor, Ex=Ex)
sim = Simulation(
size=(2, 2, 2),
grid_spec=GridSpec(wavelength=1),
monitors=(monitor,),
sources=(src,),
run_time=1e-12,
)
return SimulationData(
simulation=sim,
monitor_data=dict(test=field_data),
)


SIM_DATA_1 = make_sim_data_1()


@profile
def test_memory_1_save():
print(f'sim_data_size = {SIM_DATA_1.monitor_data["test"].Ex.nbytes:.2e} Bytes')
SIM_DATA_1.to_file(PATH)
print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")


@profile
def test_memory_2_load():
print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")
return SimulationData.from_file(PATH)


def test_core_profile_small_1_save():

Nx, Ny, Nz, Nt = 100, 100, 100, 10

x = np.arange(Nx)
y = np.arange(Ny)
z = np.arange(Nz)
t = np.arange(Nt)
coords = dict(x=x, y=y, z=z, t=t)
scalar_field = td.ScalarFieldTimeDataArray(np.random.random((Nx, Ny, Nz, Nt)), coords=coords)
monitor = td.FieldTimeMonitor(size=(2, 4, 6), interval=100, name="field", fields=["Ex", "Hz"])
data = td.FieldTimeData(monitor=monitor, Ex=scalar_field, Hz=scalar_field)
with Profile():
data.to_file(PATH)
print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")


def test_core_profile_small_2_load():

with Profile():
print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")
data = td.FieldTimeData.from_file(PATH)


def test_core_profile_large():

sim_data = make_sim_data_1()

with Profile():
sim_data.to_file(PATH)

print(f"file_size = {os.path.getsize(PATH):.2e} Bytes")

with Profile():
sim_data.from_file(PATH)


if __name__ == "__main__":
test_memory_1_save()
sim_data1 = test_memory_2_load()
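For context, here is a minimal, self-contained sketch of the memory_profiler pattern these tests rely on (illustrative only; the array shape and the rough MiB figure are assumptions, not measurements from this PR):

```python
# Minimal sketch of the memory_profiler usage pattern (not part of the PR).
import numpy as np
from memory_profiler import profile


@profile
def allocate():
    # 1000 x 1000 float64 array: 8e6 bytes, i.e. roughly 7.6 MiB
    arr = np.random.random((1000, 1000))
    # converting to nested Python lists typically costs several times more
    # memory than the raw array; this is the overhead the PR avoids on save/load
    as_list = arr.tolist()
    return len(as_list)


if __name__ == "__main__":
    allocate()  # prints a line-by-line memory report to stdout
```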
35 changes: 27 additions & 8 deletions tidy3d/components/base.py
@@ -298,13 +298,17 @@ def to_hdf5(self, fname: str) -> None:
"""

@staticmethod
def unpack_dataset(dataset: h5py.Dataset) -> Any: # pylint:disable=too-many-return-statements
def unpack_dataset( # pylint:disable=too-many-return-statements
dataset: h5py.Dataset, keep_numpy: bool = False
) -> Any:
"""Gets the value contained in a dataset in a form ready to insert into final dict.

Parameters
----------
dataset : h5py.Dataset
The raw value coming from the dataset, which needs to be decoded.
keep_numpy : bool = False
Whether to load a ``np.ndarray`` as-is or convert it to a list.

Returns
-------
@@ -320,7 +324,9 @@ def unpack_dataset(dataset: h5py.Dataset) -> Any:  # pylint:disable=too-many-return-statements
return [val.decode("utf-8") for val in value]
if value.dtype == bool:
return value.astype(bool)
return value.tolist()
if not keep_numpy:
return value.tolist()
return value

# decoding special types
if isinstance(value, np.bool_):
@@ -349,18 +355,28 @@ def load_from_handle(cls, hdf5_group: h5py.Group, **kwargs) -> Tidy3dBaseModel:
return cls.parse_obj(data_dict, **kwargs)

@classmethod
def _load_group_data(cls, data_dict: dict, hdf5_group: h5py.Group) -> dict:
def _load_group_data(
cls, data_dict: dict, hdf5_group: h5py.Group, keep_numpy: bool = False
) -> dict:
"""Recusively load the data from the group with dataset unpacking as base case."""

if "keep_numpy" in hdf5_group:
keep_numpy = hdf5_group["keep_numpy"]

for key, value in hdf5_group.items():

if key == "keep_numpy":
continue

# recursive case: try to load the group into data_dict[key]
if isinstance(value, h5py.Group):
data_dict[key] = cls._load_group_data(data_dict={}, hdf5_group=value)
data_dict[key] = cls._load_group_data(
data_dict={}, hdf5_group=value, keep_numpy=keep_numpy
)

# base case, unpack the value in the dataset
elif isinstance(value, h5py.Dataset):
data_dict[key] = cls.unpack_dataset(value)
data_dict[key] = cls.unpack_dataset(value, keep_numpy=keep_numpy)

if any("TUPLE_ELEMENT_" in key for key in data_dict.keys()):
return tuple(data_dict.values())
@@ -395,8 +411,10 @@ def pack_dataset(hdf5_group: h5py.Group, key: str, value: Any) -> None:
return
if isinstance(value, str):
value = value.encode("utf-8")
elif isinstance(value, bool):
value = np.array(value)

# numpy array containing strings (usually direction=['-','+'])
elif isinstance(value, np.ndarray) and (value.dtype == "<U1"):
value = value.tolist()

_ = hdf5_group.create_dataset(name=key, data=value)

@@ -421,7 +439,8 @@ def _save_group_data(self, data_dict: dict, hdf5_group: h5py.Group) -> None:
key = "_".join((str(i) for i in key))

if isinstance(value, xr.DataArray):
value = value.to_dict()
coords = {key: np.array(val) for key, val in value.coords.items()}
value = dict(data=value.data, coords=coords, keep_numpy=True)

# if a tuple of dicts, convert to a dict with special "TUPLE_ELEMENT_" keys
elif isinstance(value, tuple) and any(isinstance(val, dict) for val in value):
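Taken together, these base.py changes write xarray data to HDF5 as raw numpy arrays rather than nested lists (the old `DataArray.to_dict()` path), and store a `keep_numpy` flag next to the data so the loader can return numpy arrays directly instead of calling `.tolist()`. A minimal sketch of that flag mechanism in plain h5py (hypothetical helper names; not tidy3d's actual API):

```python
# Sketch of the "keep_numpy" flag round-trip, assuming plain h5py.
import h5py
import numpy as np


def save_array_group(fname: str, data: np.ndarray) -> None:
    with h5py.File(fname, "w") as f:
        group = f.create_group("field")
        group.create_dataset("keep_numpy", data=True)  # flag read back on load
        group.create_dataset("data", data=data)


def load_array_group(fname: str):
    with h5py.File(fname, "r") as f:
        group = f["field"]
        keep_numpy = bool(group["keep_numpy"][()])
        value = group["data"][()]  # h5py returns an np.ndarray
        # only fall back to the slow list conversion when the flag is absent/false
        return value if keep_numpy else value.tolist()


save_array_group("tmp.hdf5", np.random.random((10, 10)))
loaded = load_array_group("tmp.hdf5")  # np.ndarray, no tolist() round-trip
```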
19 changes: 15 additions & 4 deletions tidy3d/components/data/data_array.py
@@ -78,8 +78,13 @@ def validate(cls, value):
if isinstance(value, dict):
data = value.get("data")
coords = value.get("coords")
coords = {name: np.array(val.get("data")) for name, val in coords.items()}
return cls(np.array(data), coords=coords)

# convert to numpy if not already
coords = {k: v if isinstance(v, np.ndarray) else np.array(v) for k, v in coords.items()}
if not isinstance(data, np.ndarray):
data = np.array(data)

return cls(data, coords=coords)

return cls(value)

@@ -90,8 +95,14 @@ def __modify_schema__(cls, field_schema):

def __eq__(self, other) -> bool:
"""Whether two data array objects are equal."""

return self.to_dict() == other.to_dict()
if not np.all(self.data == other.data):
return False
for key, val in self.coords.items():
if not np.all(np.array(val) == np.array(other.coords[key])):
return False
return True

# return self.to_dict() == other.to_dict()


class ScalarFieldDataArray(DataArray):
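And for the data_array.py change: the new `__eq__` compares the underlying numpy data and each coordinate array element-wise instead of serializing both objects through `to_dict()`. A rough usage sketch (coordinate names follow the test file above; treat this as illustrative, not a canonical example):

```python
import numpy as np
import tidy3d as td

coords = dict(x=[0.0, 1.0], y=[0.0, 1.0], z=[0.0], f=[2e14])
values = (1 + 1j) * np.random.random((2, 2, 1, 1))

a = td.ScalarFieldDataArray(values, coords=coords)
b = td.ScalarFieldDataArray(values.copy(), coords=coords)

assert a == b  # element-wise data/coords comparison; no to_dict() round-trip
```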