From a40007df20e3da487fd3297003a20cc1d87b1c9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Diego=20Alonso=20=C3=81lvarez?=
 <d.alonso-alvarez@imperial.ac.uk>
Date: Tue, 14 Nov 2023 12:46:10 +0000
Subject: [PATCH] Implements loading date into the settings

---
 wsimod/validation.py | 64 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/wsimod/validation.py b/wsimod/validation.py
index 590f7e73..59630c98 100644
--- a/wsimod/validation.py
+++ b/wsimod/validation.py
@@ -1,7 +1,9 @@
+import ast
 import os
 from pathlib import Path
 from typing import Any, Optional
 
+import pandas as pd
 from tomllib import load
 
 
@@ -89,3 +91,65 @@ def _validate_output_dir(output_dir: Optional[Path]) -> Path:
 
     os.makedirs(output_dir, exist_ok=True)
     return output_dir.absolute()
+
+
+def load_data_into_settings(
+    settings: dict[str, Any], inputs: Optional[Path]
+) -> dict[str, Any]:
+    input_dir: Path = inputs if inputs else settings["inputs"]
+    loaded_settings: dict[str, Any] = {}
+
+    for k, v in settings.items():
+        if isinstance(v, dict):
+            loaded_settings[k] = load_data_into_settings(v, input_dir)
+        elif isinstance(v, str) and v.startswith("file:"):
+            loaded_settings[k] = load_data(v.strip("file:"), inputs)
+        else:
+            loaded_settings[k] = v
+
+    return loaded_settings
+
+
+def load_data(instruction: str, inputs: Path) -> pd.DataFrame:
+    """Parses a string with information on how to load data, and then loads it.
+
+    The instruction string must follow the format:
+
+        FILENAME[:comma_separated_reading_options]
+
+    Where the reading options must be valid input arguments to `pandas.read_csv`. For
+    example, if instruction is simply `"data_file.csv"`, the reading command will be
+    `pd.read_csv("data_file.csv")`. However if the instruction string is
+    `"data_file.csv:sep=' ',index_col='datetime'"`, it will result in
+    `pd.read_csv("data_file.csv", sep=' ', index_col='datetime')` being called.
+
+    Args:
+        instruction (str): A string detailing how to load the data.
+        inputs (Path): Base directory of inputs.
+
+    Returns:
+        pd.DataFrame: Loaded dataframe following the instructions.
+    """
+    filename, _, options = instruction.partition(":")
+    options_: dict[str, Any] = process_options(options)
+    return pd.read_csv(inputs / Path(filename), **options_)
+
+
+def process_options(options: str) -> dict[str, Any]:
+    """Formats the options string as keyword arguments.
+
+    >>> process_options("sep=' ',index_col='datetime'")
+    {'sep': ' ', 'index_col': 'datetime'}
+
+    Args:
+        options (str): The strings with the arguments to process.
+
+    Returns:
+        dict[str, Any]: The dictionary with the processed keyword arguments.
+    """
+    args = "f({})".format(options)
+    tree = ast.parse(args)
+    funccall = tree.body[0].value
+
+    kwargs = {arg.arg: ast.literal_eval(arg.value) for arg in funccall.keywords}
+    return kwargs