diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bd106724..b7b71cfa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,9 +23,11 @@ jobs: - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - + - name: Install and build - run: python -m pip install -r requirements-dev.txt + run: | + python -m pip install -r requirements-dev.txt + python -m pip install . - name: Run tests run: python -m pytest diff --git a/.gitignore b/.gitignore index 4921f22b..bdcf5200 100644 --- a/.gitignore +++ b/.gitignore @@ -152,4 +152,8 @@ cython_debug/ .idea/ # VScode configuration -.vscode \ No newline at end of file +.vscode + +# Scratch folder to do testing +scratch +results \ No newline at end of file diff --git a/docs/demo/examples/quickstart_demo.yaml b/docs/demo/examples/quickstart_demo.yaml new file mode 100644 index 00000000..167dfc93 --- /dev/null +++ b/docs/demo/examples/quickstart_demo.yaml @@ -0,0 +1,87 @@ +inputs: docs/demo/data/processed +outputs: results/quickstart_results + +data: + my_land_data: + filename: timeseries_data.csv + filter: + - where: site + is: oxford_land + scaling: + - where: variable + is: precipitation + variable: value + factor: "MM_TO_M" + format: dict + index: ['variable', 'date'] + output: 'value' + options: parse_dates=['date'] + + dates_data: + filename: timeseries_data.csv + options: usecols=['date'],parse_dates=['date'] + +dates: data:dates_data + +nodes: +- type_: Sewer + name: my_sewer + capacity: 0.04 + +- type_: Land + name: my_land + data_input_dict: data:my_land_data + surfaces: + - type_: ImperviousSurface + surface: urban + area: 10 + pollutant_load: + phosphate: 1.0e-07 + - type_: PerviousSurface + surface: rural + area: 100 + depth: 0.5 + pollutant_load: + phosphate: 1.0e-07 + +- type_: Groundwater + name: my_groundwater + capacity: 100 + area: 100 + +- type_: Node + name: my_river + +- type_: Waste + name: my_outlet + +arcs: +- type_: Arc + name: urban_drainage + in_port: my_land + out_port: my_sewer + +- type_: Arc + name: percolation + in_port: my_land + out_port: my_groundwater + +- type_: Arc + name: runoff + in_port: my_land + out_port: my_river + +- type_: Arc + name: storm_outflow + in_port: my_sewer + out_port: my_river + +- type_: Arc + name: baseflow + in_port: my_groundwater + out_port: my_river + +- type_: Arc + name: catchment_outflow + in_port: my_river + out_port: my_outlet diff --git a/pyproject.toml b/pyproject.toml index fee3524d..ddb42540 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,9 +19,13 @@ requires-python = ">=3.9" dependencies = [ "PyYAML", "tqdm", - "dill" + "dill", + "pandas" ] +[project.scripts] +wsimod = "wsimod.__main__:run" + [project.optional-dependencies] dev = [ "pytest", @@ -33,7 +37,6 @@ dev = [ ] demos = [ - "pandas", "geopandas", "matplotlib", "shapely" diff --git a/requirements-dev.txt b/requirements-dev.txt index 1e9dbfbe..333b4f37 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -36,11 +36,15 @@ mypy-extensions==1.0.0 # via black nodeenv==1.8.0 # via pre-commit +numpy==1.26.2 + # via pandas packaging==23.2 # via # black # build # pytest +pandas==2.1.3 + # via wsimod (pyproject.toml) pathspec==0.11.2 # via black pip-tools==7.3.0 @@ -61,14 +65,22 @@ pytest==7.4.3 # wsimod (pyproject.toml) pytest-cov==4.1.0 # via wsimod (pyproject.toml) +python-dateutil==2.8.2 + # via pandas +pytz==2023.3.post1 + # via pandas pyyaml==6.0.1 # via # pre-commit # wsimod (pyproject.toml) ruff==0.1.4 # via wsimod (pyproject.toml) +six==1.16.0 + 
    # via python-dateutil
 tqdm==4.66.1
     # via wsimod (pyproject.toml)
+tzdata==2023.3
+    # via pandas
 virtualenv==20.24.6
     # via pre-commit
 wheel==0.41.3
diff --git a/requirements.txt b/requirements.txt
index d58181de..31a11e07 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,13 +2,25 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-# pip-compile
+# pip-compile --output-file=requirements.txt
 #
 colorama==0.4.6
     # via tqdm
 dill==0.3.7
     # via wsimod (pyproject.toml)
+numpy==1.26.2
+    # via pandas
+pandas==2.1.3
+    # via wsimod (pyproject.toml)
+python-dateutil==2.8.2
+    # via pandas
+pytz==2023.3.post1
+    # via pandas
 pyyaml==6.0.1
     # via wsimod (pyproject.toml)
+six==1.16.0
+    # via python-dateutil
 tqdm==4.66.1
     # via wsimod (pyproject.toml)
+tzdata==2023.3
+    # via pandas
diff --git a/tests/test_example_files.py b/tests/test_example_files.py
new file mode 100644
index 00000000..332ee091
--- /dev/null
+++ b/tests/test_example_files.py
@@ -0,0 +1,20 @@
+from pytest import mark
+from pathlib import Path
+import subprocess
+
+
+def collect_examples() -> list[Path]:
+    root = Path.cwd() / "docs" / "demo" / "examples"
+    return list(root.glob("**/*.yaml"))
+
+
+@mark.parametrize("example", collect_examples())
+def test_examples(example: Path, tmp_path: Path) -> None:
+    subprocess.run(
+        f"wsimod {str(example)} -o {str(tmp_path)}",
+        shell=True,
+        check=True,
+    )
+    assert (tmp_path / "flows.csv").exists()
+    assert (tmp_path / "tanks.csv").exists()
+    assert (tmp_path / "surfaces.csv").exists()
diff --git a/wsimod/__main__.py b/wsimod/__main__.py
new file mode 100644
index 00000000..8cc990ef
--- /dev/null
+++ b/wsimod/__main__.py
@@ -0,0 +1,72 @@
+"""The entry point for the wsimod program."""
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import Any, cast
+
+import pandas as pd
+
+from wsimod.orchestration.model import Model
+from wsimod.validation import assign_data_to_settings, load_data_files, validate_io_args
+
+
+def create_parser() -> ArgumentParser:
+    """Create the CLI argument parser."""
+    parser = ArgumentParser(prog="WSIMOD")
+    parser.add_argument(
+        "settings",
+        type=Path,
+        help="Path to the WSIMOD input file, in YAML format.",
+    )
+    parser.add_argument(
+        "--inputs",
+        "-i",
+        type=Path,
+        help="Base directory for all input files. If present, overwrites value in the"
+        " settings file.",
+    )
+    parser.add_argument(
+        "--outputs",
+        "-o",
+        type=Path,
+        help="Base directory for all output files. If present, overwrites value in the"
+        " settings file.",
+    )
+
+    return parser
+
+
+def run_model(settings: dict[str, Any], outputs: Path) -> None:
+    """Runs the model with the chosen settings and saves the outputs as CSV.
+
+    Args:
+        settings (dict[str, Any]): Settings dictionary with loaded data.
+        outputs (Path): Directory where the outputs will be saved.
+    """
+    model = Model()
+
+    model.dates = cast(pd.Series, settings["dates"]).drop_duplicates()
+    model.add_nodes(settings["nodes"])
+    model.add_arcs(settings["arcs"])
+
+    flows, tanks, _, surfaces = model.run()
+
+    pd.DataFrame(flows).to_csv(outputs / "flows.csv")
+    pd.DataFrame(tanks).to_csv(outputs / "tanks.csv")
+    pd.DataFrame(surfaces).to_csv(outputs / "surfaces.csv")
+
+
+def run() -> None:
+    """Main entry point of the application."""
+    args = vars(create_parser().parse_args())
+    settings = validate_io_args(**args)
+
+    inputs = settings.pop("inputs")
+    outputs = settings.pop("outputs")
+    loaded_data = load_data_files(settings.pop("data", {}), inputs)
+    loaded_settings = assign_data_to_settings(settings, loaded_data)
+
+    run_model(loaded_settings, outputs)
+
+
+if __name__ == "__main__":
+    run()
diff --git a/wsimod/validation.py b/wsimod/validation.py
new file mode 100644
index 00000000..c42fb6f6
--- /dev/null
+++ b/wsimod/validation.py
@@ -0,0 +1,254 @@
+import ast
+import os
+from pathlib import Path
+from typing import Any, Optional, Union
+
+import pandas as pd
+import yaml
+
+from wsimod.core import constants
+
+
+def validate_io_args(
+    settings: Path, inputs: Optional[Path], outputs: Optional[Path]
+) -> dict[str, Any]:
+    """Validate the I/O arguments, including their definition in settings.
+
+    This does not include validating the existence of data input files, which is done
+    at a later stage.
+
+    Args:
+        settings (Path): The path to the file, in YAML format, containing all the
+            configuration required for the simulation.
+        inputs (Optional[Path]): Base directory for all input files. If present,
+            overwrites value in the settings file.
+        outputs (Optional[Path]): Base directory for all output files. If present,
+            overwrites value in the settings file.
+
+    Returns:
+        dict[str, Any]: The loaded settings file with validated inputs and outputs.
+    """
+    if settings.is_dir() or not settings.exists():
+        raise ValueError(
+            f"The settings file at {settings.absolute()} could not be found."
+        )
+
+    with settings.open("rb") as f:
+        settings_ = yaml.safe_load(f)
+
+    # Validate inputs folder
+    settings_["inputs"] = _validate_input_dir(
+        inputs if inputs else settings_.get("inputs", None), default=settings.parent
+    )
+
+    # Validate outputs folder
+    settings_["outputs"] = _validate_output_dir(
+        outputs if outputs else settings_.get("outputs", None), default=settings.parent
+    )
+
+    return settings_
+
+
+def _validate_input_dir(input_dir: Optional[Path], default: Path) -> Path:
+    """Validates the directory of input files.
+
+    If not provided, the default directory is used.
+
+    Args:
+        input_dir (Optional[Path]): The potential directory with the inputs.
+        default (Path): Default input path if none provided.
+
+    Raises:
+        ValueError: If the inputs base directory is not actually a directory.
+
+    Returns:
+        Path: The validated path containing the inputs.
+    """
+    if not input_dir:
+        return default.absolute()
+
+    input_dir = Path(input_dir).absolute()
+    if not input_dir.is_dir():
+        raise ValueError(
+            f"The inputs base directory at {input_dir} is not a directory."
+        )
+    return input_dir
+
+
+def _validate_output_dir(output_dir: Optional[Path], default: Path) -> Path:
+    """Validates the directory for output files.
+
+    If not provided, the default path is used. If it does not exist, it is created.
+
+    Args:
+        output_dir (Optional[Path]): The potential directory for the outputs.
+        default (Path): Default output path if none provided.
+
+    Raises:
+        ValueError: If a file with the same name already exists.
+
+    Returns:
+        Path: The validated path where outputs will be saved.
+    """
+    if not output_dir:
+        return default.absolute()
+
+    output_dir = Path(output_dir).absolute()
+    if output_dir.exists() and not output_dir.is_dir():
+        raise ValueError(f"A file at {output_dir} exists and is not a directory.")
+
+    os.makedirs(output_dir, exist_ok=True)
+    return output_dir
+
+
+def load_data_files(
+    data_settings: dict[str, Any], input_dir: Path
+) -> dict[str, Union[pd.DataFrame, pd.Series, dict]]:
+    """Reads the data section of the settings and loads the required data from files.
+
+    Args:
+        data_settings (dict[str, Any]): The data section of the settings file.
+        input_dir (Path): The directory where input files are located.
+
+    Returns:
+        dict[str, Union[pd.DataFrame, pd.Series, dict]]: A dictionary mapping each
+            data key to the dataframe, series or dictionary loaded for it.
+    """
+    return {
+        f"data:{key}": read_data(var, input_dir) for key, var in data_settings.items()
+    }
+
+
+def assign_data_to_settings(
+    settings: dict[str, Any],
+    data_settings: dict[str, Union[pd.DataFrame, pd.Series, dict]],
+) -> dict[str, Any]:
+    """Assigns the loaded data to the right variables in the settings dictionary.
+
+    The search for data references to replace is done recursively, walking through
+    the whole settings dictionary tree.
+
+    Args:
+        settings (dict[str, Any]): The settings dictionary.
+        data_settings: The data previously loaded by `load_data_files`.
+
+    Returns:
+        dict[str, Any]: A new settings dictionary where the data has been assigned.
+    """
+    loaded_settings: dict[str, Any] = {}
+
+    for k, v in settings.items():
+        if isinstance(v, dict):
+            loaded_settings[k] = assign_data_to_settings(v, data_settings)
+        elif isinstance(v, list):
+            loaded_settings[k] = [
+                assign_data_to_settings(item, data_settings) for item in v
+            ]
+        elif isinstance(v, str) and v.startswith("data:"):
+            try:
+                loaded_settings[k] = data_settings[v]
+            except KeyError:
+                raise ValueError(
+                    f"{v} could not be found. Did you configure loading that data in"
+                    " the data section of the settings file?"
+                )
+        else:
+            loaded_settings[k] = v
+
+    return loaded_settings
+
+
+def read_data(
+    instructions: dict[str, Any], inputs: Path
+) -> Union[pd.DataFrame, pd.Series, dict]:
+    """Uses the instructions to load tabular data.
+
+    The instructions are a dictionary of options that define what file to load, how
+    to load it and some simple manipulations to apply to the loaded pandas DataFrame
+    before returning it.
+
+    The keys that control this process are:
+
+    filename: Filename of the data to load
+    filter (optional): List of filters for the dataframe, each a dictionary in the form:
+        where: column to filter
+        is: value of that column
+    scaling (optional): List of variable scalings, each a dictionary of the form:
+        where: column to filter (optional)
+        is: value of that column (optional)
+        variable: name of the column to scale
+        factor: unit conversion factor, as defined in `wsimod.core.constants`,
+            e.g. MM_TO_M
+    format (optional): How the output should be provided. If format is `dict` then
+        the output is provided as a dictionary, otherwise a DataFrame or a Series
+        (if there is only 1 column) is output.
+    index (optional): Column(s) to use as index.
+    output (optional): Column to provide as output.
+    options (optional): Options to pass to the `pandas.read_csv` function.
+
+    The order in which operations are done is:
+
+        read -> filter -> scale -> set_index -> select_output -> convert_format
+
+    Only the `read` step will always happen. The others depend on the inputs.
+
+    Args:
+        instructions (dict[str, Any]): A dictionary with instructions to load the data.
+        inputs (Path): Base directory of inputs.
+
+    Returns:
+        Union[pd.DataFrame, pd.Series, dict]: Loaded dataframe, series or dictionary
+            following the instructions.
+    """
+    filename = inputs / Path(instructions["filename"])
+    options_: dict[str, Any] = process_options(instructions.get("options", ""))
+    data = pd.read_csv(filename, **options_)
+
+    for filter in instructions.get("filter", []):
+        data = data.loc[data[filter["where"]] == filter["is"]]
+
+    for scaler in instructions.get("scaling", []):
+        idx = data[scaler["where"]] == scaler["is"] if "is" in scaler else slice(None)
+        factor = (
+            getattr(constants, scaler["factor"])
+            if isinstance(scaler["factor"], str)
+            else scaler["factor"]
+        )
+        data.loc[idx, scaler["variable"]] *= factor
+
+    if index := instructions.get("index", None):
+        data = data.set_index(index)
+
+    if output := instructions.get("output", None):
+        data = data[output]
+
+    if isinstance(data, pd.DataFrame) and len(data.columns) == 1:
+        data = data.squeeze()
+
+    if instructions.get("format", "") == "dict":
+        return data.to_dict()
+
+    return data
+
+
+def process_options(options: str) -> dict[str, Any]:
+    """Formats the options string as keyword arguments.
+
+    >>> process_options("sep=' ',index_col='datetime'")
+    {'sep': ' ', 'index_col': 'datetime'}
+
+    Args:
+        options (str): The string with the arguments to process.
+
+    Returns:
+        dict[str, Any]: The dictionary with the processed keyword arguments.
+    """
+    if not options:
+        return {}
+
+    args = "f({})".format(options)
+    tree = ast.parse(args)
+    funccall = tree.body[0].value
+
+    kwargs = {arg.arg: ast.literal_eval(arg.value) for arg in funccall.keywords}
+    return kwargs
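
Note (not part of the diff): the snippet below is a minimal usage sketch of the data-loading helpers added in wsimod/validation.py, mirroring the data section of docs/demo/examples/quickstart_demo.yaml above. It assumes the quickstart input file docs/demo/data/processed/timeseries_data.csv exists with the site, variable, date and value columns implied by that example, and that MM_TO_M is the conversion constant named there.

    from pathlib import Path

    from wsimod.validation import process_options, read_data

    # process_options turns the YAML "options" string into pandas.read_csv keyword
    # arguments, e.g. for the dates_data entry of the quickstart example.
    kwargs = process_options("usecols=['date'],parse_dates=['date']")
    print(kwargs)  # {'usecols': ['date'], 'parse_dates': ['date']}

    # read_data applies the read -> filter -> scale -> set_index -> select_output ->
    # convert_format pipeline described in its docstring, here with instructions
    # equivalent to the my_land_data entry of the quickstart example.
    instructions = {
        "filename": "timeseries_data.csv",
        "filter": [{"where": "site", "is": "oxford_land"}],
        "scaling": [
            {
                "where": "variable",
                "is": "precipitation",
                "variable": "value",
                "factor": "MM_TO_M",
            }
        ],
        "format": "dict",
        "index": ["variable", "date"],
        "output": "value",
        "options": "parse_dates=['date']",
    }
    land_data = read_data(instructions, Path("docs/demo/data/processed"))
    print(type(land_data))  # dict keyed by (variable, date) tuples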