Skip to content

Commit

Permalink
Allow different storage engines for saving pandas dataframes in experiments.
Browse files Browse the repository at this point in the history
  • Loading branch information
bojan-karlas committed Sep 17, 2024
1 parent 7bc4301 commit 9615e35
Showing 1 changed file with 50 additions and 9 deletions.
59 changes: 50 additions & 9 deletions experiments/datascope/experiments/bench/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,11 @@ def __init__(
fdel: Optional[Callable[[Any], None]] = None,
doc: Optional[str] = None,
lazy: bool = False,
storage: Optional[str] = None,
) -> None:
super().__init__(fget, fset, fdel, doc)
self.lazy = lazy
self.storage = storage

def __call__(
self,
Expand All @@ -174,7 +176,7 @@ def __call__(
fdel = self.fdel
if doc is None:
doc = self.__doc__
return type(self)(fget, fset, fdel, doc, lazy=self.lazy)
return type(self)(fget, fset, fdel, doc, lazy=self.lazy, storage=self.storage)

__isresult__ = True

Expand Down Expand Up @@ -319,6 +321,17 @@ def has_attribute_value(target: object, name: str, value: Any, ignore_none: bool
identifier_outside_of_brackets = re.compile(r"[\w-]+\s*(?![^\(\)\[\]\{\}]*[\)\]\}])")


def remove_brackets(source: str) -> str:
    """Strip surrounding whitespace and redundant enclosing brackets from *source*.

    Layers of ``()``, ``[]`` or ``{}`` are peeled off one at a time, as long as
    the first character is an opening bracket and the last character is the
    closing bracket of the same kind. A layer is only removed when the leading
    bracket is not closed before the final character, so sibling groups such as
    ``"(1, 2), (3, 4)"`` are left intact instead of being turned into
    ``"1, 2), (3, 4"``.

    Args:
        source: The text to clean up.

    Returns:
        The text without outer whitespace and without enclosing bracket layers.
    """
    closing_for = {"(": ")", "[": "]", "{": "}"}
    closers = frozenset(closing_for.values())

    source = source.strip()
    while len(source) >= 2 and closing_for.get(source[0]) == source[-1]:
        # Find where the nesting opened by the first character returns to zero.
        depth = 0
        for position, char in enumerate(source):
            if char in closing_for:
                depth += 1
            elif char in closers:
                depth -= 1
                if depth == 0:
                    break
        # If the leading bracket closes before the end, the first and last
        # characters belong to different groups and must not be stripped.
        # Unbalanced input (depth never reaches zero) is stripped as before.
        if depth == 0 and position < len(source) - 1:
            break
        source = source[1:-1].strip()
    return source


def make_type_parser(target: Optional[type]) -> Callable[[str], Any]:

parser: Callable[[str], Any]
Expand Down Expand Up @@ -423,7 +436,7 @@ def parser(source: str) -> Configurable:
source = source[class_id_match.end() :] # noqa: E203

# We assume the attributes lie within brackets separated with commas.
source = source.strip(string.whitespace + "()[]{}")
source = remove_brackets(source)
if len(source) > 0:
source_attributes = comma_outside_of_brackets.split(source)
for i, source_attribute in enumerate(source_attributes):
Expand All @@ -436,7 +449,7 @@ def parser(source: str) -> Configurable:
source_value = source_attribute_splits[-1].strip()
if source_name not in attribute_names:
raise ValueError(
"Attribute '%s' is not supported by configurable '%s'." % source_name, target._class_id
"Attribute '%s' is not supported by configurable '%s'." % (source_name, target._class_id)
)
attributes[source_name] = attribute_parsers[source_name](source_value)

Expand All @@ -447,7 +460,7 @@ def parser(source: str) -> Configurable:
raise ValueError("Unsupported type '%s'." % str(target))

def strip_and_parse(source: str) -> Any:
return parser(source.strip(string.whitespace + "()[]{}"))
return parser(remove_brackets(source))

return strip_and_parse

Expand All @@ -469,12 +482,19 @@ def __call__(self) -> T:
return self.value


def save_dict(source: Dict[str, Any], dirpath: str, basename: str, saveonly: Optional[Sequence[str]] = None) -> None:
def save_dict(
source: Dict[str, Any],
dirpath: str,
basename: str,
saveonly: Optional[Sequence[str]] = None,
storage: Optional[Dict[str, Optional[str]]] = None,
) -> None:
basedict: Dict[str, Any] = dict((k, v) for (k, v) in source.items() if type(v) in [int, float, bool, str])
basedict.update(dict((k, v.value) for (k, v) in source.items() if isinstance(v, Enum)))
if len(basedict) > 0:
with open(os.path.join(dirpath, ".".join([basename, "yaml"])), "w") as f:
yaml.safe_dump(basedict, f)
storage = {} if storage is None else storage

for name, data in source.items():
if data is None:
Expand All @@ -487,8 +507,15 @@ def save_dict(source: Dict[str, Any], dirpath: str, basename: str, saveonly: Opt
filename = os.path.join(dirpath, ".".join([basename, name, "npy"]))
np.save(filename, data)
elif isinstance(data, DataFrame):
filename = os.path.join(dirpath, ".".join([basename, name, "csv"]))
data.to_csv(filename)
if storage.get(name, None) == "feather":
filename = os.path.join(dirpath, ".".join([basename, name, "feather"]))
data.to_feather(filename)
elif storage.get(name, None) == "parquet":
filename = os.path.join(dirpath, ".".join([basename, name, "parquet"]))
data.to_parquet(filename, index=True)
else:
filename = os.path.join(dirpath, ".".join([basename, name, "csv"]))
data.to_csv(filename)
elif isinstance(data, dict):
filename = os.path.join(dirpath, ".".join([basename, name, "yaml"]))
with open(filename, "w") as f:
Expand Down Expand Up @@ -535,6 +562,18 @@ def load_dict(dirpath: str, basename: str, lazy: Optional[Sequence[str]] = None)
res[name] = LazyLoader(lambda path=path: pd.read_csv(path, index_col=0)) # type: ignore
else:
res[name] = pd.read_csv(path, index_col=0)
elif ext == ".feather":
if name in lazy:
# The path from the outside scope is captured according to https://stackoverflow.com/a/21054087.
res[name] = LazyLoader(lambda path=path: pd.read_feather(path)) # type: ignore
else:
res[name] = pd.read_feather(path)
elif ext == ".parquet":
if name in lazy:
# The path from the outside scope is captured according to https://stackoverflow.com/a/21054087.
res[name] = LazyLoader(lambda path=path: pd.read_parquet(path)) # type: ignore
else:
res[name] = pd.read_parquet(path)
elif ext == ".yaml":
with open(path) as f:
res[name] = yaml.safe_load(f)
Expand Down Expand Up @@ -1112,7 +1151,8 @@ def save(self, path: str, include_results: bool = True) -> None:
if include_results:
props: Dict[str, result] = result.get_properties(type(self))
results = dict((name, prop.fget(self) if prop.fget is not None else None) for (name, prop) in props.items())
save_dict(results, path, "results")
storage = {name: prop.storage for (name, prop) in props.items()}
save_dict(results, path, "results", storage=storage)

@classmethod
def load(cls, path: str) -> "Scenario":
Expand Down Expand Up @@ -1857,7 +1897,8 @@ def save(
# Save results as separate files.
props = result.get_properties(type(self))
results = dict((name, prop.fget(self) if prop.fget is not None else None) for (name, prop) in props.items())
save_dict(results, path, basename, saveonly=saveonly)
storage = {name: prop.storage for (name, prop) in props.items()}
save_dict(results, path, basename, saveonly=saveonly, storage=storage)

@classmethod
def get_instances(
Expand Down

0 comments on commit 9615e35

Please sign in to comment.