From 5ff20fe829c969ca31a94f723a5d8f928ab777da Mon Sep 17 00:00:00 2001 From: larsevj Date: Wed, 2 Oct 2024 10:59:21 +0200 Subject: [PATCH] Adapt to main --- src/ert/config/__init__.py | 2 + src/ert/config/analysis_config.py | 4 +- src/ert/config/design_matrix.py | 166 +++++++++++++++++- src/ert/sensitivity_analysis/design_matrix.py | 164 ----------------- .../test_design_matrix.py | 8 +- 5 files changed, 173 insertions(+), 171 deletions(-) delete mode 100644 src/ert/sensitivity_analysis/design_matrix.py diff --git a/src/ert/config/__init__.py b/src/ert/config/__init__.py index ec01d6db8b3..971625cc36d 100644 --- a/src/ert/config/__init__.py +++ b/src/ert/config/__init__.py @@ -1,6 +1,7 @@ from .analysis_config import AnalysisConfig from .analysis_module import AnalysisModule, ESSettings, IESSettings from .capture_validation import capture_validation +from .design_matrix import DesignMatrix from .enkf_observation_implementation_type import EnkfObservationImplementationType from .ensemble_config import EnsembleConfig from .ert_config import ErtConfig @@ -48,6 +49,7 @@ "ConfigValidationError", "ConfigValidationError", "ConfigWarning", + "DesignMatrix", "ESSettings", "EnkfObs", "EnkfObservationImplementationType", diff --git a/src/ert/config/analysis_config.py b/src/ert/config/analysis_config.py index a2afdf9ad6f..5bdd376f8c9 100644 --- a/src/ert/config/analysis_config.py +++ b/src/ert/config/analysis_config.py @@ -41,7 +41,7 @@ class AnalysisConfig: ies_module: IESSettings = field(default_factory=IESSettings) observation_settings: UpdateSettings = field(default_factory=UpdateSettings) num_iterations: int = 1 - design_matrix_args: Optional[DesignMatrix] = None + design_matrix: Optional[DesignMatrix] = None @no_type_check @classmethod @@ -194,7 +194,7 @@ def from_dict(cls, config_dict: ConfigDict) -> "AnalysisConfig": observation_settings=obs_settings, es_module=es_settings, ies_module=ies_settings, - design_matrix_args=DesignMatrix.from_config_list(design_matrix_config_list) + design_matrix=DesignMatrix.from_config_list(design_matrix_config_list) if design_matrix_config_list is not None else None, ) diff --git a/src/ert/config/design_matrix.py b/src/ert/config/design_matrix.py index 20b5fd8df0d..538821db208 100644 --- a/src/ert/config/design_matrix.py +++ b/src/ert/config/design_matrix.py @@ -1,8 +1,13 @@ from __future__ import annotations +from collections import defaultdict from dataclasses import dataclass from pathlib import Path -from typing import List +from typing import TYPE_CHECKING, List + +import pandas as pd + +from ert.config.gen_kw_config import GenKwConfig from ._option_dict import option_dict from .parsing import ( @@ -10,6 +15,13 @@ ErrorInfo, ) +if TYPE_CHECKING: + from ert.config import ( + ParameterConfig, + ) + +DESIGN_MATRIX_GROUP = "DESIGN_MATRIX" + @dataclass class DesignMatrix: @@ -41,6 +53,12 @@ def from_config_list(cls, config_list: List[str]) -> "DesignMatrix": errors.append( ErrorInfo("Missing required DEFAULT_SHEET").set_context(config_list) ) + if design_sheet is not None and design_sheet == default_sheet: + errors.append( + ErrorInfo( + "DESIGN_SHEET and DEFAULT_SHEET can not be the same." + ).set_context(config_list) + ) if errors: raise ConfigValidationError.from_collected(errors) assert design_sheet is not None @@ -50,3 +68,149 @@ def from_config_list(cls, config_list: List[str]) -> "DesignMatrix": design_sheet=design_sheet, default_sheet=default_sheet, ) + + def read_design_matrix( + self, + parameter_configurations: List[ParameterConfig], + ) -> pd.DataFrame: + """ + Reads out all file content from different files and create dataframes + """ + design_matrix_df = DesignMatrix._read_excel( + self.xls_filename, self.design_sheet + ) + if "REAL" in design_matrix_df.columns: + design_matrix_df.set_index(design_matrix_df["REAL"]) + del design_matrix_df["REAL"] + try: + DesignMatrix._validate_design_matrix_header(design_matrix_df) + except ValueError as err: + raise ValueError(f"Design matrix not valid, error: {err!s}") from err + + # Todo: Check for invalid realizations, drop them maybe? + # This should probably handle/(fill in) missing values in design_matrix_sheet as well + defaults = DesignMatrix._read_defaultssheet( + self.xls_filename, self.default_sheet + ) + for k, v in defaults.items(): + if k not in design_matrix_df.columns: + design_matrix_df[k] = v + + # ignoring errors here is deprecated in pandas, should find another solution + # design_matrix_sheet = design_matrix_sheet.apply(pd.to_numeric, errors="ignore") + + parameter_groups = defaultdict(list) + parameter_map = [] + all_genkw_configs = [ + param_group + for param_group in parameter_configurations + if isinstance(param_group, GenKwConfig) + ] + errors = {} + for param in design_matrix_df.columns: + par_gp = [] + for param_group in all_genkw_configs: + if param in param_group: + par_gp.append(param_group.name) + + if not par_gp: + parameter_name = "DESIGN_MATRIX" + parameter_groups[parameter_name].append(param) + parameter_map.append((parameter_name, param)) + elif len(par_gp) == 1: + parameter_name = par_gp[0] + parameter_groups[parameter_name].append(param) + parameter_map.append((parameter_name, param)) + else: + errors[param] = par_gp + + if errors: + msg = "" + for key, value in errors.items(): + msg += ( + f"The following parameter '{key}' was found in multiple" + f" GenKw parameters groups: {value}." + ) + raise ValueError(msg) + design_matrix_df.columns = pd.MultiIndex.from_tuples(parameter_map) + return design_matrix_df + + @staticmethod + def _read_excel( + file_name: Path | str, + sheet_name: str, + usecols: int | list[int] | None = None, + header: int | None = 0, + ) -> pd.DataFrame: + """ + Make dataframe from excel file + :return: Dataframe + :raises: OsError if file not found + :raises: ValueError if file not loaded correctly + """ + dframe: pd.DataFrame = pd.read_excel( + file_name, + sheet_name, + usecols=usecols, + header=header, + ) + return dframe.dropna(axis=1, how="all") + + def _validate_design_matrix_header(design_matrix: pd.DataFrame) -> None: + """ + Validate header in user inputted design matrix + :raises: ValueError if design matrix contains empty headers + """ + if design_matrix.empty: + return + try: + unnamed = design_matrix.loc[ + :, design_matrix.columns.str.contains("^Unnamed") + ] + except ValueError as err: + # We catch because int/floats as column headers + # in xlsx gets read as int/float and is not valid to index by. + raise ValueError( + f"Invalid value in design matrix header, error: {err !s}" + ) from err + column_indexes = [int(x.split(":")[1]) for x in unnamed.columns.to_numpy()] + if len(column_indexes) > 0: + raise ValueError(f"Column headers not present in column {column_indexes}") + + @staticmethod + def _read_defaultssheet( + xlsfilename: Path | str, defaultssheetname: str + ) -> dict[str, str]: + """ + Construct a dataframe of keys and values to be used as defaults from the + first two columns in a spreadsheet. + + Returns a dict of default values + + :raises: ValueError if defaults sheet is non-empty but non-parsable + """ + if defaultssheetname: + default_df = DesignMatrix._read_excel( + xlsfilename, defaultssheetname, usecols=[0, 1], header=None + ) + if default_df.empty: + return {} + if len(default_df.columns) < 2: + raise ValueError("Defaults sheet must have at least two columns") + # Look for initial or trailing whitespace in parameter names. This + # is disallowed as it can create user confusion and has no use-case. + for paramname in default_df.loc[:, 0]: + if paramname != paramname.strip(): + raise ValueError( + f'Parameter name "{paramname}" in default values contains ' + "initial or trailing whitespace." + ) + + else: + return {} + + default_df = default_df.rename(columns={0: "keys", 1: "defaults"}) + defaults = {} + for _, row in default_df.iterrows(): + defaults[row["keys"]] = row["defaults"] + return defaults diff --git a/src/ert/sensitivity_analysis/design_matrix.py b/src/ert/sensitivity_analysis/design_matrix.py deleted file mode 100644 index 887fbf2ecc2..00000000000 --- a/src/ert/sensitivity_analysis/design_matrix.py +++ /dev/null @@ -1,164 +0,0 @@ -from __future__ import annotations - -from collections import defaultdict -from pathlib import Path -from typing import TYPE_CHECKING - -import pandas as pd - -from ert.config.gen_kw_config import GenKwConfig - -if TYPE_CHECKING: - from ert.config import ( - ErtConfig, - ) - -DESIGN_MATRIX_GROUP = "DESIGN_MATRIX" - - -def read_design_matrix( - ert_config: ErtConfig, - xlsfilename: Path | str, - designsheetname: str = "DesignSheet01", - defaultssheetname: str = "DefaultValues", -) -> pd.DataFrame: - """ - Reads out all file content from different files and create dataframes - """ - design_matrix_sheet = _read_excel(xlsfilename, designsheetname) - if "REAL" in design_matrix_sheet.columns: - design_matrix_sheet.set_index(design_matrix_sheet["REAL"]) - del design_matrix_sheet["REAL"] - try: - _validate_design_matrix_header(design_matrix_sheet) - except ValueError as err: - raise ValueError(f"Design matrix not valid, error: {err!s}") from err - - # Todo: Check for invalid realizations, drop them maybe? - - if designsheetname == defaultssheetname: - raise ValueError("Design-sheet and defaults-sheet can not be the same") - - # This should probably handle/(fill in) missing values in design_matrix_sheet as well - defaults = _read_defaultssheet(xlsfilename, defaultssheetname) - for k, v in defaults.items(): - if k not in design_matrix_sheet.columns: - design_matrix_sheet[k] = v - - # ignoring errors here is deprecated in pandas, should find another solution - # design_matrix_sheet = design_matrix_sheet.apply(pd.to_numeric, errors="ignore") - - parameter_groups = defaultdict(list) - parameter_map = [] - all_genkw_configs = [ - param_group - for param_group in ert_config.ensemble_config.parameter_configuration - if isinstance(param_group, GenKwConfig) - ] - errors = {} - for param in design_matrix_sheet.columns: - par_gp = [] - for param_group in all_genkw_configs: - if param in param_group: - par_gp.append(param_group.name) - - if not par_gp: - parameter_name = "DESIGN_MATRIX" - parameter_groups[parameter_name].append(param) - parameter_map.append((parameter_name, param)) - elif len(par_gp) == 1: - parameter_name = par_gp[0] - parameter_groups[parameter_name].append(param) - parameter_map.append((parameter_name, param)) - else: - errors[param] = par_gp - - if errors: - msg = "" - for key, value in errors.items(): - msg += ( - f"The following parameter '{key}' was found in multiple" - f" GenKw parameters groups: {value}." - ) - raise ValueError(msg) - design_matrix_sheet.columns = pd.MultiIndex.from_tuples(parameter_map) - return design_matrix_sheet - - -def _read_excel( - file_name: Path | str, - sheet_name: str, - usecols: int | list[int] | None = None, - header: int | None = 0, -) -> pd.DataFrame: - """ - Make dataframe from excel file - :return: Dataframe - :raises: OsError if file not found - :raises: ValueError if file not loaded correctly - """ - dframe: pd.DataFrame = pd.read_excel( - file_name, - sheet_name, - usecols=usecols, - header=header, - ) - return dframe.dropna(axis=1, how="all") - - -def _validate_design_matrix_header(design_matrix: pd.DataFrame) -> None: - """ - Validate header in user inputted design matrix - :raises: ValueError if design matrix contains empty headers - """ - if design_matrix.empty: - return - try: - unnamed = design_matrix.loc[:, design_matrix.columns.str.contains("^Unnamed")] - except ValueError as err: - # We catch because int/floats as column headers - # in xlsx gets read as int/float and is not valid to index by. - raise ValueError( - f"Invalid value in design matrix header, error: {err !s}" - ) from err - column_indexes = [int(x.split(":")[1]) for x in unnamed.columns.to_numpy()] - if len(column_indexes) > 0: - raise ValueError(f"Column headers not present in column {column_indexes}") - - -def _read_defaultssheet( - xlsfilename: Path | str, defaultssheetname: str -) -> dict[str, str]: - """ - Construct a dataframe of keys and values to be used as defaults from the - first two columns in a spreadsheet. - - Returns a dict of default values - - :raises: ValueError if defaults sheet is non-empty but non-parsable - """ - if defaultssheetname: - default_df = _read_excel( - xlsfilename, defaultssheetname, usecols=[0, 1], header=None - ) - if default_df.empty: - return {} - if len(default_df.columns) < 2: - raise ValueError("Defaults sheet must have at least two columns") - # Look for initial or trailing whitespace in parameter names. This - # is disallowed as it can create user confusion and has no use-case. - for paramname in default_df.loc[:, 0]: - if paramname != paramname.strip(): - raise ValueError( - f'Parameter name "{paramname}" in default values contains ' - "initial or trailing whitespace." - ) - - else: - return {} - - default_df = default_df.rename(columns={0: "keys", 1: "defaults"}) - defaults = {} - for _, row in default_df.iterrows(): - defaults[row["keys"]] = row["defaults"] - return defaults diff --git a/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py b/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py index c7b10b1ee9e..347c085f950 100644 --- a/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py +++ b/tests/ert/unit_tests/sensitivity_analysis/test_design_matrix.py @@ -2,9 +2,6 @@ import pytest from ert.config import ErtConfig -from ert.sensitivity_analysis.design_matrix import ( - read_design_matrix, -) @pytest.mark.usefixtures("copy_poly_case") @@ -22,4 +19,7 @@ def test_reading_design_matrix(copy_poly_case): "DESIGN_MATRIX design_matrix.xlsx DESIGN_SHEET:DesignSheet01 DEFAULT_SHEET:DefaultValues" ) ert_config = ErtConfig.from_file("poly.ert") - _design_frame = read_design_matrix(ert_config, "design_matrix.xlsx") + parameter_configurations = ert_config.ensemble_config.parameter_configuration + _design_frame = ert_config.analysis_config.design_matrix.read_design_matrix( + parameter_configurations + )