From fd66870e4f0807d02ca494ca80adf73429c86129 Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Tue, 11 Apr 2023 11:42:58 -0400 Subject: [PATCH 01/38] feat: add typer --- autora/theorist/__main__.py | 9 ++++ poetry.lock | 98 +++++++++++++++++++++++++++++++++++-- pyproject.toml | 3 ++ 3 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 autora/theorist/__main__.py diff --git a/autora/theorist/__main__.py b/autora/theorist/__main__.py new file mode 100644 index 000000000..bb1e5c7b5 --- /dev/null +++ b/autora/theorist/__main__.py @@ -0,0 +1,9 @@ +import typer + + +def main(name: str): + print(f"Hello {name}") + + +if __name__ == "__main__": + typer.run(main) diff --git a/poetry.lock b/poetry.lock index 25083ab1a..f03e8a32c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "anyio" @@ -313,7 +313,7 @@ unicode-backport = ["unicodedata2"] name = "click" version = "8.1.3" description = "Composable command line interface toolkit" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1312,6 +1312,31 @@ importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} [package.extras] testing = ["coverage", "pyyaml"] +[[package]] +name = "markdown-it-py" +version = "2.2.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "markdown-it-py-2.2.0.tar.gz", hash = "sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"}, + {file = "markdown_it_py-2.2.0-py3-none-any.whl", hash = "sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["attrs", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "markupsafe" version = "2.1.1" @@ -1452,6 +1477,18 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "mergedeep" version = "1.3.4" @@ -2349,7 +2386,7 @@ files = [ name = "pygments" version = "2.14.0" description = "Pygments is a syntax highlighting package written in Python." -category = "dev" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2870,6 +2907,26 @@ files = [ {file = "rfc3986_validator-0.1.1.tar.gz", hash = "sha256:3d44bde7921b3b9ec3ae4e3adca370438eccebc676456449b145d533b240d055"}, ] +[[package]] +name = "rich" +version = "13.3.3" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +category = "main" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.3.3-py3-none-any.whl", hash = "sha256:540c7d6d26a1178e8e8b37e9ba44573a3cd1464ff6348b99ee7061b95d1c6333"}, + {file = "rich-13.3.3.tar.gz", hash = "sha256:dc84400a9d842b3a9c5ff74addd8eb798d155f36c1c91303888e0a66850d2a15"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0,<3.0.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = "scikit-learn" version = "1.2.2" @@ -3008,6 +3065,18 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-g testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "shellingham" +version = "1.5.0.post1" +description = "Tool to Detect Surrounding Shell" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "shellingham-1.5.0.post1-py2.py3-none-any.whl", hash = "sha256:368bf8c00754fd4f55afb7bbb86e272df77e4dc76ac29dbcbb81a59e9fc15744"}, + {file = "shellingham-1.5.0.post1.tar.gz", hash = "sha256:823bc5fb5c34d60f285b624e7264f4dda254bc803a3774a147bf99c0e3004a28"}, +] + [[package]] name = "six" version = "1.16.0" @@ -3255,6 +3324,27 @@ lint = ["black (>=22.6.0)", "mdformat (>0.7)", "ruff (>=0.0.156)"] test = ["pre-commit", "pytest"] typing = ["mypy (>=0.990)"] +[[package]] +name = "typer" +version = "0.7.0" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "typer-0.7.0-py3-none-any.whl", hash = "sha256:b5e704f4e48ec263de1c0b3a2387cd405a13767d2f907f44c1a08cbad96f606d"}, + {file = "typer-0.7.0.tar.gz", hash = "sha256:ff797846578a9f2a201b53442aedeb543319466870fbe1c701eab66dd7681165"}, +] + +[package.dependencies] +click = ">=7.1.1,<9.0.0" + +[package.extras] +all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] +doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] +test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] + [[package]] name = "typing-extensions" version = "4.4.0" @@ -3463,4 +3553,4 @@ tinkerforge = ["tinkerforge"] [metadata] lock-version = "2.0" python-versions = ">=3.8.10,<3.11" -content-hash = "462fb631f876e5145435416ab989b93a1b71cef94cb8d1afbf3e88762719824d" +content-hash = "be2c95a830dc249a8c33600793aa41c6f842fc6266db8b4363a53afd95fe8b30" diff --git a/pyproject.toml b/pyproject.toml index 90b4895cc..43876b9f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,9 @@ sympy = "^1.10.1" tinkerforge = {version = "^2.1.25", optional=true} torch = "1.13.1" tqdm = "^4.64.0" +typer = "^0.7.0" +rich = "^13.3.3" +shellingham = "^1.5.0.post1" [tool.poetry.extras] tinkerforge = ["tinkerforge"] From 67c095aa2664b7476af9f40856a1c1d08116b075 Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Thu, 13 Apr 2023 11:54:04 -0400 Subject: [PATCH 02/38] docs: add import for class plus debug flags --- autora/theorist/__main__.py | 38 +++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/autora/theorist/__main__.py b/autora/theorist/__main__.py index bb1e5c7b5..d67b7f2c9 100644 --- a/autora/theorist/__main__.py +++ b/autora/theorist/__main__.py @@ -1,8 +1,42 @@ +import importlib +import logging +from typing import Type + import typer +from sklearn.base import BaseEstimator + +_logger = logging.getLogger(__name__) + + +def import_class(name: str) -> Type[BaseEstimator]: + """ + Load a class from a module by name. + + Args: + name: + + Examples: + >>> import_class("sklearn.linear_model.LinearRegressor") + + """ + components = name.split(".") + module_name, class_name = ".".join(components[:-1]), components[-1] + _logger.info(f"loading {module_name=}, {class_name=}") + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + return cls + + +def main(regressor: str, verbose: bool = False, debug: bool = False): + + if verbose: + logging.basicConfig(level=logging.INFO) + if debug: + logging.basicConfig(level=logging.DEBUG) + regressor_class = import_class(regressor) -def main(name: str): - print(f"Hello {name}") + print(f"{regressor}: {regressor_class}") if __name__ == "__main__": From e0c8f36651fdf65f0cd317048e65bf7b9e8f98bb Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Thu, 13 Apr 2023 11:54:18 -0400 Subject: [PATCH 03/38] chore: remove broken plot_utils for darts --- autora/theorist/darts/plot_utils.py | 1129 --------------------------- 1 file changed, 1129 deletions(-) delete mode 100755 autora/theorist/darts/plot_utils.py diff --git a/autora/theorist/darts/plot_utils.py b/autora/theorist/darts/plot_utils.py deleted file mode 100755 index d256bf412..000000000 --- a/autora/theorist/darts/plot_utils.py +++ /dev/null @@ -1,1129 +0,0 @@ -import os -import typing -from typing import Optional - -import imageio -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -import pandas -import seaborn as sns -import torch.nn -from matplotlib import pyplot -from matplotlib.gridspec import GridSpec - -import autora.config as aer_config -import autora.theorist.darts.darts_config as darts_config -from autora.theorist.object_of_study import Object_Of_Study - - -def generate_darts_summary_figures( - figure_names: typing.List[str], - titles: typing.List[str], - filters: typing.List[str], - title_suffix: str, - study_name: str, - y_name: str, - y_label: str, - y_sem_name: str, - x1_name: str, - x1_label: str, - x2_name: str, - x2_label: str, - x_limit: typing.List[float], - y_limit: typing.List[float], - best_model_name: str, - figure_size: typing.Tuple[int, int], - y_reference: Optional[typing.List[float]] = None, - y_reference_label: str = "", - arch_samp_filter: Optional[str] = None, -): - """ - Generates a summary figure for a given DARTS study. - The figure can be composed of different summary plots. - - Arguments: - figure_names: list of strings with the names of the figures to be generated - titles: list of strings with the titles of the figures to be generated - filters: list of strings with the theorist filters to be used to select the models to be - used in the figures - title_suffix: string with the suffix to be added to the titles of the figures - study_name: string with the name of the study (used to identify the study folder) - y_name: string with the name of the y-axis variable - y_label: string with the label of the y-axis variable - y_sem_name: string with the name of the y-axis coding the standard error of the mean - x1_name: string with the name of the (first) x-axis variable - x1_label: string with the label of the (first) x-axis variable - x2_name: string with the name of the second x-axis variable - x2_label: string with the label of the second x-axis variable - x_limit: list with the limits of the x-axis - y_limit: list with the limits of the y-axis - best_model_name: string with the name of the best model to be highlighted in the figure - figure_size: list with the size of the figure - y_reference: list with the values of the reference line - y_reference_label: string with the label of the reference line - arch_samp_filter: string with the name of the filter to be used to select the - samples of the architecture - - """ - - for idx, (figure_name, title, theorist_filter) in enumerate( - zip(figure_names, titles, filters) - ): - - print("##########################: " + figure_name) - title = title + title_suffix - if idx > 0: # after legend - show_legend = False - figure_dimensions = figure_size - else: - show_legend = True - figure_dimensions = (6, 6) - if idx > 1: # after original darts - y_label = " " - - plot_darts_summary( - study_name=study_name, - title=title, - y_name=y_name, - y_label=y_label, - y_sem_name=y_sem_name, - x1_name=x1_name, - x1_label=x1_label, - x2_name=x2_name, - x2_label=x2_label, - metric="mean_min", - x_limit=x_limit, - y_limit=y_limit, - best_model_name=best_model_name, - theorist_filter=theorist_filter, - arch_samp_filter=arch_samp_filter, - figure_name=figure_name, - figure_dimensions=figure_dimensions, - legend_loc=aer_config.legend_loc, - legend_font_size=aer_config.legend_font_size, - axis_font_size=aer_config.axis_font_size, - title_font_size=aer_config.title_font_size, - show_legend=show_legend, - y_reference=y_reference, - y_reference_label=y_reference_label, - save=True, - ) - - -def plot_darts_summary( - study_name: str, - y_name: str, - x1_name: str, - x2_name: str = "", - y_label: str = "", - x1_label: str = "", - x2_label: str = "", - y_sem_name: Optional[str] = None, - metric: str = "min", - y_reference: Optional[typing.List[float]] = None, - y_reference_label: str = "", - figure_dimensions: Optional[typing.Tuple[int, int]] = None, - title: str = "", - legend_loc: int = 0, - legend_font_size: int = 8, - axis_font_size: int = 10, - title_font_size: int = 10, - show_legend: bool = True, - y_limit: Optional[typing.List[float]] = None, - x_limit: Optional[typing.List[float]] = None, - theorist_filter: Optional[str] = None, - arch_samp_filter: Optional[str] = None, - best_model_name: Optional[str] = None, - save: bool = False, - figure_name: str = "figure", -): - """ - Generates a single summary plot for a given DARTS study. - - Arguments: - study_name: string with the name of the study (used to identify the study folder) - y_name: string with the name of the y-axis variable - x1_name: string with the name of the (first) x-axis variable - x2_name: string with the name of the second x-axis variable - y_label: string with the label of the y-axis variable - x1_label: string with the label of the (first) x-axis variable - x2_label: string with the label of the second x-axis variable - y_sem_name: string with the name of the y-axis coding the standard error of the mean - metric: string with the metric to be used to select the best model - y_reference: list with the values of the reference line - y_reference_label: string with the label of the reference line - figure_dimensions: list with the size of the figure - title: string with the title of the figure - legend_loc: integer with the location of the legend - legend_font_size: integer with the font size of the legend - axis_font_size: integer with the font size of the axis - title_font_size: integer with the font size of the title - show_legend: boolean with the flag to show the legend - y_limit: list with the limits of the y-axis - x_limit: list with the limits of the x-axis - theorist_filter: string with the name of the filter to be used to select the theorist - arch_samp_filter: string with the name of the filter to be used to select the architecture - best_model_name: string with the name of the best model to be highlighted in the figure - save: boolean with the flag to save the figure - figure_name: string with the name of the figure - """ - - palette = "PuBu" - - if figure_dimensions is None: - figure_dimensions = (4, 3) - - if y_label == "": - y_label = y_name - - if x1_label == "": - x1_label = x1_name - - if x2_label == "": - x2_label = x2_name - - if y_reference_label == "": - y_reference_label = "Data Generating Model" - - # determine directory for study results and figures - results_path = ( - aer_config.studies_folder - + study_name - + "/" - + aer_config.models_folder - + aer_config.models_results_folder - ) - - figures_path = ( - aer_config.studies_folder - + study_name - + "/" - + aer_config.models_folder - + aer_config.models_results_figures_folder - ) - - # read in all csv files - files = list() - for file in os.listdir(results_path): - if file.endswith(".csv"): - if "model_" not in file: - continue - - if theorist_filter is not None: - if theorist_filter not in file: - continue - files.append(os.path.join(results_path, file)) - - print("Found " + str(len(files)) + " files.") - - # generate a plot dictionary - plot_dict: typing.Dict[typing.Optional[str], typing.List] = dict() - plot_dict[darts_config.csv_arch_file_name] = list() - plot_dict[y_name] = list() - plot_dict[x1_name] = list() - if x2_name != "": - plot_dict[x2_name] = list() - if y_sem_name is not None: - plot_dict[y_sem_name] = list() - - # load csv files into a common dictionary - for file in files: - data = pandas.read_csv(file, header=0) - - valid_data = list() - - # filter for arch samp - if arch_samp_filter is not None: - for idx, arch_file_name in enumerate(data[darts_config.csv_arch_file_name]): - arch_samp = int( - float(arch_file_name.split("_sample", 1)[1].split("_", 1)[0]) - ) - if arch_samp == arch_samp_filter: - valid_data.append(idx) - else: - for idx in range(len(data[darts_config.csv_arch_file_name])): - valid_data.append(idx) - - plot_dict[darts_config.csv_arch_file_name].extend( - data[darts_config.csv_arch_file_name][valid_data] - ) - if y_name in data.keys(): - plot_dict[y_name].extend(data[y_name][valid_data]) - else: - raise Exception( - 'Could not find key "' + y_name + '" in the data file: ' + str(file) - ) - if x1_name in data.keys(): - plot_dict[x1_name].extend(data[x1_name][valid_data]) - else: - raise Exception( - 'Could not find key "' + x1_name + '" in the data file: ' + str(file) - ) - if x2_name != "": - if x2_name in data.keys(): - plot_dict[x2_name].extend(data[x2_name][valid_data]) - else: - raise Exception( - 'Could not find key "' - + x2_name - + '" in the data file: ' - + str(file) - ) - if y_sem_name is not None: - # extract seed number from model file name - - if y_sem_name in data.keys(): - plot_dict[y_sem_name].extend(data[y_sem_name]) - elif y_sem_name == "seed": - y_sem_list = list() - for file_name in data[darts_config.csv_arch_file_name][valid_data]: - y_sem_list.append( - int(float(file_name.split("_s_", 1)[1].split("_sample", 1)[0])) - ) - plot_dict[y_sem_name].extend(y_sem_list) - - else: - - raise Exception( - 'Could not find key "' - + y_sem_name - + '" in the data file: ' - + str(file) - ) - - model_name_list = plot_dict[darts_config.csv_arch_file_name] - x1_data = np.asarray(plot_dict[x1_name]) - y_data = np.asarray(plot_dict[y_name]) - if x2_name == "": # determine for each value of x1 the corresponding y - x1_data = np.asarray(plot_dict[x1_name]) - x1_unique = np.sort(np.unique(x1_data)) - - y_plot = np.empty(x1_unique.shape) - y_plot[:] = np.nan - y_sem_plot = np.empty(x1_unique.shape) - y_sem_plot[:] = np.nan - y2_plot = np.empty(x1_unique.shape) - y2_plot[:] = np.nan - x1_plot = np.empty(x1_unique.shape) - x1_plot[:] = np.nan - for idx_unique, x1_unique_val in enumerate(x1_unique): - y_match = list() - model_name_match = list() - for idx_data, x_data_val in enumerate(x1_data): - if x1_unique_val == x_data_val: - y_match.append(y_data[idx_data]) - model_name_match.append(model_name_list[idx_data]) - x1_plot[idx_unique] = x1_unique_val - - if metric == "min": - y_plot[idx_unique] = np.min(y_match) - idx_target = np.argmin(y_match) - legend_label_spec = " (min)" - elif metric == "max": - y_plot[idx_unique] = np.max(y_match) - idx_target = np.argmax(y_match) - legend_label_spec = " (max)" - elif metric == "mean": - y_plot[idx_unique] = np.mean(y_match) - idx_target = 0 - legend_label_spec = " (avg)" - elif metric == "mean_min": - y_plot[idx_unique] = np.mean(y_match) - y2_plot[idx_unique] = np.min(y_match) - idx_target = np.argmin(y_match) - legend_label_spec = " (avg)" - legend_label2_spec = " (min)" - elif metric == "mean_max": - y_plot[idx_unique] = np.mean(y_match) - y2_plot[idx_unique] = np.max(y_match) - idx_target = np.argmax(y_match) - legend_label_spec = " (avg)" - legend_label2_spec = " (max)" - else: - raise Exception( - 'Argument "metric" may either be "min", "max", "mean", "mean_min" or "min_max".' - ) - - # compute standard error along given dimension - if y_sem_name is not None: - y_sem_data = np.asarray(plot_dict[y_sem_name]) - y_sem_unique = np.sort(np.unique(y_sem_data)) - y_sem = np.empty(y_sem_unique.shape) - # first average y over all other variables - for idx_y_sem_unique, y_sem_unique_val in enumerate(y_sem_unique): - y_sem_match = list() - for idx_y_sem, ( - y_sem_data_val, - x1_data_val, - y_data_val, - ) in enumerate(zip(y_sem_data, x1_data, y_data)): - if ( - y_sem_unique_val == y_sem_data_val - and x1_unique_val == x1_data_val - ): - y_sem_match.append(y_data_val) - y_sem[idx_y_sem_unique] = np.mean(y_sem_match) - # now compute sem - y_sem_plot[idx_unique] = np.nanstd(y_sem) / np.sqrt(len(y_sem)) - - print( - x1_label - + " = " - + str(x1_unique_val) - + " (" - + str(y_plot[idx_unique]) - + "): " - + model_name_match[idx_target] - ) - - else: # determine for each combination of x1 and x2 (unique rows) the lowest y - x2_data = np.asarray(plot_dict[x2_name]) - x2_unique = np.sort(np.unique(x2_data)) - - y_plot = list() - y_sem_plot = list() - y2_plot = list() - x1_plot = list() - x2_plot = list() - for idx_x2_unique, x2_unique_val in enumerate(x2_unique): - - # collect all x1 and y values matching the current x2 value - model_name_x2_match = list() - y_x2_match = list() - x1_x2_match = list() - for idx_x2_data, x2_data_val in enumerate(x2_data): - if x2_unique_val == x2_data_val: - model_name_x2_match.append(model_name_list[idx_x2_data]) - y_x2_match.append(y_data[idx_x2_data]) - x1_x2_match.append(x1_data[idx_x2_data]) - - # now determine unique x1 values for current x2 value - x1_unique = np.sort(np.unique(x1_x2_match)) - x1_x2_plot = np.empty(x1_unique.shape) - x1_x2_plot[:] = np.nan - y_x2_plot = np.empty(x1_unique.shape) - y_x2_plot[:] = np.nan - y_sem_x2_plot = np.empty(x1_unique.shape) - y_sem_x2_plot[:] = np.nan - y2_x2_plot = np.empty(x1_unique.shape) - y2_x2_plot[:] = np.nan - for idx_x1_unique, x1_unique_val in enumerate(x1_unique): - y_x2_x1_match = list() - model_name_x2_x1_match = list() - for idx_x1_data, x1_data_val in enumerate(x1_x2_match): - if x1_unique_val == x1_data_val: - model_name_x2_x1_match.append(model_name_x2_match[idx_x1_data]) - y_x2_x1_match.append(y_x2_match[idx_x1_data]) - x1_x2_plot[idx_x1_unique] = x1_unique_val - - if metric == "min": - y_x2_plot[idx_x1_unique] = np.min(y_x2_x1_match) - idx_target = np.argmin(y_x2_x1_match) - legend_label_spec = " (min)" - elif metric == "max": - y_x2_plot[idx_x1_unique] = np.max(y_x2_x1_match) - idx_target = np.argmax(y_x2_x1_match) - legend_label_spec = " (max)" - elif metric == "mean": - y_x2_plot[idx_x1_unique] = np.mean(y_x2_x1_match) - idx_target = 0 - legend_label_spec = " (avg)" - elif metric == "mean_min": - y_x2_plot[idx_x1_unique] = np.mean(y_x2_x1_match) - y2_x2_plot[idx_x1_unique] = np.min(y_x2_x1_match) - idx_target = np.argmin(y_x2_x1_match) - legend_label_spec = " (avg)" - legend_label2_spec = " (min)" - elif metric == "mean_max": - y_x2_plot[idx_x1_unique] = np.mean(y_x2_x1_match) - y2_x2_plot[idx_x1_unique] = np.max(y_x2_x1_match) - idx_target = np.argmax(y_x2_x1_match) - legend_label_spec = " (avg)" - legend_label2_spec = " (max)" - else: - raise Exception( - 'Argument "metric" may either be "min", "max", "mean", ' - '"mean_min" or "min_max".' - ) - - # compute standard error along given dimension - if y_sem_name is not None: - y_sem_data = np.asarray(plot_dict[y_sem_name]) - y_sem_unique = np.sort(np.unique(y_sem_data)) - y_sem = np.empty(y_sem_unique.shape) - # first average y over all other variables - for idx_y_sem_unique, y_sem_unique_val in enumerate(y_sem_unique): - y_sem_match = list() - for idx_y_sem, ( - y_sem_data_val, - x1_data_val, - x2_data_val, - y_data_val, - ) in enumerate(zip(y_sem_data, x1_data, x2_data, y_data)): - if ( - y_sem_unique_val == y_sem_data_val - and x1_unique_val == x1_data_val - and x2_unique_val == x2_data_val - ): - y_sem_match.append(y_data_val) - y_sem[idx_y_sem_unique] = np.nanmean(y_sem_match) - # now compute sem - y_sem_x2_plot[idx_x1_unique] = np.nanstd(y_sem) / np.sqrt( - len(y_sem) - ) - - if metric == "mean_min" or metric == "mean_max": - best_val_str = str(y2_x2_plot[idx_x1_unique]) - else: - best_val_str = str(y_x2_plot[idx_x1_unique]) - - print( - x1_label - + " = " - + str(x1_unique_val) - + ", " - + x2_label - + " = " - + str(x2_unique_val) - + " (" - + best_val_str - + "): " - + model_name_x2_x1_match[idx_target] - ) - - y_plot.append(y_x2_plot) - y2_plot.append(y2_x2_plot) - y_sem_plot.append(y_sem_x2_plot) - x1_plot.append(x1_x2_plot) - x2_plot.append(x2_unique_val) - # plot - # plt.axhline - - # determine best model coordinates - best_model_x1 = None - best_model_x2 = None - best_model_y = None - if best_model_name is not None: - theorist = best_model_name.split("weights_", 1)[1].split("_v_", 1)[0] - if theorist_filter is not None: - if theorist_filter == theorist: - determine_best_model = True - else: - determine_best_model = False - else: - determine_best_model = True - - if determine_best_model: - idx = plot_dict[darts_config.csv_arch_file_name].index(best_model_name) - best_model_x1 = plot_dict[x1_name][idx] - best_model_x2 = plot_dict[x2_name][idx] - best_model_y = plot_dict[y_name][idx] - - fig, ax = pyplot.subplots(figsize=figure_dimensions) - - if x2_name == "": - - colors = sns.color_palette(palette, 10) - color = colors[-1] - full_label = "Reconstructed Model" + legend_label_spec - sns.lineplot( - x=x1_plot, - y=y_plot, - marker="o", - linewidth=2, - ax=ax, - label=full_label, - color=color, - ) - - # draw error bars - if y_sem_name is not None: - ax.errorbar(x=x1_plot, y=y_plot, yerr=y_sem_plot, color=color) - - # draw second y value - if metric == "mean_min" or metric == "mean_max": - full_label = "Reconstructed Model" + legend_label2_spec - ax.plot(x1_plot, y2_plot, "*", linewidth=2, label=full_label, color=color) - - if show_legend: - handles, _ = ax.get_legend_handles_labels() - ax.legend(handles=handles, loc=legend_loc) - plt.setp(ax.get_legend().get_texts(), fontsize=legend_font_size) - - # draw selected model - if best_model_x1 is not None and best_model_y is not None: - ax.plot( - best_model_x1, - best_model_y, - "o", - fillstyle="none", - color="black", - markersize=10, - ) - - ax.set_xlabel(x1_label, fontsize=axis_font_size) - ax.set_ylabel(y_label, fontsize=axis_font_size) - ax.set_title(title, fontsize=title_font_size) - - if y_limit is not None: - ax.set_ylim(y_limit) - - if x_limit is not None: - ax.set_xlim(x_limit) - - # generate legend - # ax.scatter(x1_plot, y_plot, marker='.', c='r') - # g = sns.relplot(data=data_plot, x=x1_label, y=y_label, ax=ax) - # g._legend.remove() - if y_reference is not None: - ax.axhline( - y_reference, c="black", linestyle="dashed", label=y_reference_label - ) - - if show_legend: - # generate legend - handles, _ = ax.get_legend_handles_labels() - ax.legend(handles=handles, loc=legend_loc) - plt.setp(ax.get_legend().get_texts(), fontsize=legend_font_size) - else: - - colors = sns.color_palette(palette, len(x2_plot)) - - for idx, x2 in enumerate(x2_plot): - - x1_plot_line = x1_plot[idx] - y_plot_line = y_plot[idx] - label = x2_label + "$ = " + str(x2) + "$" + legend_label_spec - color = colors[idx] - - sns.lineplot( - x=x1_plot_line, - y=y_plot_line, - marker="o", - linewidth=2, - ax=ax, - label=label, - color=color, - alpha=1, - ) - - # draw error bars - if y_sem_name is not None: - y_sem_plot_line = y_sem_plot[idx] - ax.errorbar( - x=x1_plot_line, - y=y_plot_line, - yerr=y_sem_plot_line, - color=color, - alpha=1, - ) - - # # draw second y value on top - # for idx, x2 in enumerate(x2_plot): - # x1_plot_line = x1_plot[idx] - # color = colors[idx] - # - # if metric == 'mean_min' or metric == 'mean_max': - # y2_plot_line = y2_plot[idx] - # label = x2_label + '$ = ' + str(x2) + "$" + legend_label2_spec - # ax.plot(x1_plot_line, y2_plot_line, '*', linewidth=2, label=label, color=color) - - # draw selected model - if best_model_x1 is not None and best_model_y is not None: - ax.plot( - best_model_x1, - best_model_y, - "o", - fillstyle="none", - color="black", - markersize=10, - ) - - for idx, x2 in enumerate(x2_plot): - if best_model_x2 == x2: - color = colors[idx] - ax.plot( - best_model_x1, - best_model_y, - "*", - linewidth=2, - label="Best Model", - color=color, - ) - - if y_reference is not None: - ax.axhline( - y_reference, c="black", linestyle="dashed", label=y_reference_label - ) - - handles, _ = ax.get_legend_handles_labels() - leg = ax.legend( - handles=handles, loc=legend_loc, bbox_to_anchor=(1.05, 1) - ) # , title='Legend' - plt.setp(ax.get_legend().get_texts(), fontsize=legend_font_size) - - if not show_legend: - leg.remove() - - if y_limit is not None: - ax.set_ylim(y_limit) - - if x_limit is not None: - ax.set_xlim(x_limit) - - sns.despine(trim=True) - ax.set_ylabel(y_label, fontsize=axis_font_size) - ax.set_xlabel(x1_label, fontsize=axis_font_size) - ax.set_title(title, fontsize=title_font_size) - plt.show() - - # save plot - if save: - if not os.path.exists(figures_path): - os.mkdir(figures_path) - fig.savefig(os.path.join(figures_path, figure_name)) - - -def plot_model_graph( - study_name: str, - arch_weights_name: str, - model_weights_name: str, - object_of_study: Object_Of_Study, - figure_name: str = "graph", -): - """ - Plot the graph of the DARTS model. - - Arguments: - study_name: name of the study (used to identify the relevant study folder) - arch_weights_name: name of the architecture weights file - model_weights_name: name of the model weights file (that contains the trained parameters) - object_of_study: name of the object of study - figure_name: name of the figure - """ - - import os - - import autora.theorist.darts.utils as utils - import autora.theorist.darts.visualize as viz - - figures_path = ( - aer_config.studies_folder - + study_name - + "/" - + aer_config.models_folder - + aer_config.models_results_figures_folder - ) - - model = load_model( - study_name, model_weights_name, arch_weights_name, object_of_study - ) - - (n_params_total, n_params_base, param_list) = model.countParameters( - print_parameters=True - ) - genotype = model.genotype() - filepath = os.path.join(figures_path, figure_name) - viz.plot( - genotype.normal, - filepath, - file_format="png", - view_file=True, - full_label=True, - param_list=param_list, - input_labels=object_of_study.__get_input_labels__(), - out_dim=object_of_study.__get_output_dim__(), - out_fnc=utils.get_output_str(object_of_study.__get_output_type__()), - ) - - -# old - - -def load_model( - study_name: str, - model_weights_name: str, - arch_weights_name: str, - object_of_study: Object_Of_Study, -) -> torch.nn.Module: - """ - Load the model. - - Arguments: - study_name: name of the study (used to identify the relevant study folder) - model_weights_name: name of the model weights file (that contains the trained parameters) - arch_weights_name: name of the architecture weights file - object_of_study: name of the object of study - - Returns: - model: DARTS model - """ - - import os - - import torch - - import autora.theorist.darts.utils as utils - from autora.theorist.darts.model_search import Network - - num_output = object_of_study.__get_output_dim__() - num_input = object_of_study.__get_input_dim__() - k = int(float(arch_weights_name.split("_k_", 1)[1].split("_s_", 1)[0])) - - results_weights_path = ( - aer_config.studies_folder - + study_name - + "/" - + aer_config.models_folder - + aer_config.models_results_weights_folder - ) - - model_path = os.path.join(results_weights_path, model_weights_name + ".pt") - arch_path = os.path.join(results_weights_path, arch_weights_name + ".pt") - criterion = utils.sigmid_mse - model = Network(num_output, criterion, steps=k, n_input_states=num_input) - utils.load(model, model_path) - alphas_normal = torch.load(arch_path) - model.fix_architecture(True, new_weights=alphas_normal) - - return model - - -class DebugWindow: - """ - A window with plots that are used for debugging. - """ - - def __init__( - self, - num_epochs: int, - numArchEdges: int = 1, - numArchOps: int = 1, - ArchOpsLabels: typing.Tuple = (), - fitPlot3D: bool = False, - show_arch_weights: bool = True, - ): - """ - Initializes the debug window. - - Arguments: - num_epochs: number of architecture training epochs - numArchEdges: number of architecture edges - numArchOps: number of architecture operations - ArchOpsLabels: list of architecture operation labels - fitPlot3D: if True, the 3D plot of the fit is shown - show_arch_weights: if True, the architecture weights are shown - """ - - # initialization - matplotlib.use("TkAgg") # need to add this for PyCharm environment - - plt.ion() - - # SETTINGS - self.show_arch_weights = show_arch_weights - self.fontSize = 10 - - self.performancePlot_limit = (0, 1) - self.modelFitPlot_limit = (0, 500) - self.mismatchPlot_limit = (0, 1) - self.architectureWeightsPlot_limit = (0.1, 0.2) - - self.numPatternsShown = 100 - - # FIGURE - self.fig = plt.figure() - self.fig.set_size_inches(13, 7) - - if self.show_arch_weights is False: - numArchEdges = 0 - - # set up grid - numRows = np.max((1 + np.ceil((numArchEdges + 1) / 4), 2)) - gs = GridSpec(numRows.astype(int), 4, figure=self.fig) - - self.fig.subplots_adjust( - left=0.1, bottom=0.1, right=0.90, top=0.9, wspace=0.4, hspace=0.5 - ) - self.modelGraph = self.fig.add_subplot(gs[1, 0]) - self.performancePlot = self.fig.add_subplot(gs[0, 0]) - self.modelFitPlot = self.fig.add_subplot(gs[0, 1]) - if fitPlot3D: - self.mismatchPlot = self.fig.add_subplot(gs[0, 2], projection="3d") - else: - self.mismatchPlot = self.fig.add_subplot(gs[0, 2]) - self.examplePatternsPlot = self.fig.add_subplot(gs[0, 3]) - - self.architecturePlot = [] - - for edge in range(numArchEdges): - row = np.ceil((edge + 2) / 4).astype(int) - col = (edge + 1) % 4 - self.architecturePlot.append(self.fig.add_subplot(gs[row, col])) - - self.colors = ( - "black", - "red", - "green", - "blue", - "purple", - "orange", - "brown", - "pink", - "grey", - "olive", - "cyan", - "yellow", - "skyblue", - "coral", - "magenta", - "seagreen", - "sandybrown", - ) - - # PERFORMANCE PLOT - x = 1 - y = 1 - (self.train_error,) = self.performancePlot.plot(x, y, "r-") - (self.valid_error,) = self.performancePlot.plot(x, y, "b", linestyle="dashed") - - # set labels - self.performancePlot.set_xlabel("Epoch", fontsize=self.fontSize) - self.performancePlot.set_ylabel("Cross-Entropy Loss", fontsize=self.fontSize) - self.performancePlot.set_title("Performance", fontsize=self.fontSize) - self.performancePlot.legend( - (self.train_error, self.valid_error), ("training error", "validation error") - ) - - # adjust axes - self.performancePlot.set_xlim(0, num_epochs) - self.performancePlot.set_ylim( - self.performancePlot_limit[0], self.performancePlot_limit[1] - ) - - # MODEL FIT PLOT - x = 1 - y = 1 - (self.BIC,) = self.modelFitPlot.plot(x, y, color="black") - (self.AIC,) = self.modelFitPlot.plot(x, y, color="grey") - - # set labels - self.modelFitPlot.set_xlabel("Epoch", fontsize=self.fontSize) - self.modelFitPlot.set_ylabel("Information Criterion", fontsize=self.fontSize) - self.modelFitPlot.set_title("Model Fit", fontsize=self.fontSize) - self.modelFitPlot.legend((self.BIC, self.AIC), ("BIC", "AIC")) - - # adjust axes - self.modelFitPlot.set_xlim(0, num_epochs) - self.modelFitPlot.set_ylim( - self.modelFitPlot_limit[0], self.modelFitPlot_limit[1] - ) - - # RANGE PREDICTION FIT PLOT - x = 1 - y = 1 - if fitPlot3D: - x = np.arange(0, 1, 0.1) - y = np.arange(0, 1, 0.1) - X, Y = np.meshgrid(x, y) - Z = X * np.exp(-X - Y) - - self.range_target = self.mismatchPlot.plot_surface(X, Y, Z) - self.range_prediction = self.mismatchPlot.plot_surface(X, Y, Z) - self.mismatchPlot.set_zlim( - self.mismatchPlot_limit[0], self.mismatchPlot_limit[1] - ) - - # set labels - self.mismatchPlot.set_xlabel("Stimulus 1", fontsize=self.fontSize) - self.mismatchPlot.set_ylabel("Stimulus 2", fontsize=self.fontSize) - self.mismatchPlot.set_zlabel("Outcome Value", fontsize=self.fontSize) - - else: - (self.range_target,) = self.mismatchPlot.plot(x, y, color="black") - (self.range_prediction,) = self.mismatchPlot.plot(x, y, "--", color="red") - - # set labels - self.mismatchPlot.set_xlabel("Stimulus Value", fontsize=self.fontSize) - self.mismatchPlot.set_ylabel("Outcome Value", fontsize=self.fontSize) - self.mismatchPlot.legend( - (self.range_target, self.range_prediction), ("target", "prediction") - ) - - self.mismatchPlot.set_title("Target vs. Prediction", fontsize=self.fontSize) - - # adjust axes - self.mismatchPlot.set_xlim(0, 1) - self.mismatchPlot.set_ylim(0, 1) - - # ARCHITECTURE WEIGHT PLOT - if self.show_arch_weights: - - self.architectureWeights = [] - for idx, architecturePlot in enumerate(self.architecturePlot): - plotWeights = [] - x = 1 - y = 1 - for op in range(numArchOps): - (plotWeight,) = architecturePlot.plot(x, y, color=self.colors[op]) - plotWeights.append(plotWeight) - - # set legend - if idx == 0: - architecturePlot.legend( - plotWeights, ArchOpsLabels, prop={"size": 6} - ) - - # add labels - architecturePlot.set_ylabel("Weight", fontsize=self.fontSize) - architecturePlot.set_title( - "(" + str(idx) + ") Edge Weight", fontsize=self.fontSize - ) - if idx == len(self.architecturePlot) - 1: - architecturePlot.set_xlabel("Epoch", fontsize=self.fontSize) - - # adjust axes - architecturePlot.set_xlim(0, num_epochs) - architecturePlot.set_ylim( - self.architectureWeightsPlot_limit[0], - self.architectureWeightsPlot_limit[1], - ) - - self.architectureWeights.append(plotWeights) - - # draw - plt.draw() - - def update( - self, - train_error: Optional[np.array] = None, - valid_error: Optional[np.array] = None, - weights: Optional[np.array] = None, - BIC: Optional[np.array] = None, - AIC: Optional[np.array] = None, - model_graph: Optional[str] = None, - range_input1: Optional[np.array] = None, - range_input2: Optional[np.array] = None, - range_target: Optional[np.array] = None, - range_prediction: Optional[np.array] = None, - target: Optional[np.array] = None, - prediction: Optional[np.array] = None, - ): - """ - Update the debug plot with new data. - - Arguments: - train_error: training error - valid_error: validation error - weights: weights of the model - BIC: Bayesian information criterion of the model - AIC: Akaike information criterion of the model - model_graph: the graph of the model - range_input1: the range of the first input - range_input2: the range of the second input - range_target: the range of the target - range_prediction: the range of the prediction - target: the target - prediction: the prediction - """ - - # update training error - if train_error is not None: - self.train_error.set_xdata( - np.linspace(1, len(train_error), len(train_error)) - ) - self.train_error.set_ydata(train_error) - - # update validation error - if valid_error is not None: - self.valid_error.set_xdata( - np.linspace(1, len(valid_error), len(valid_error)) - ) - self.valid_error.set_ydata(valid_error) - - # update BIC - if BIC is not None: - self.BIC.set_xdata(np.linspace(1, len(BIC), len(BIC))) - self.BIC.set_ydata(BIC) - - # update AIC - if AIC is not None: - self.AIC.set_xdata(np.linspace(1, len(AIC), len(AIC))) - self.AIC.set_ydata(AIC) - - # update target vs. prediction plot - if ( - range_input1 is not None - and range_target is not None - and range_prediction is not None - and range_input2 is None - ): - self.range_target.set_xdata(range_input1) - self.range_target.set_ydata(range_target) - self.range_prediction.set_xdata(range_input1) - self.range_prediction.set_ydata(range_prediction) - elif ( - range_input1 is not None - and range_target is not None - and range_prediction is not None - and range_input2 is not None - ): - - # update plot - self.mismatchPlot.cla() - self.range_target = self.mismatchPlot.plot_surface( - range_input1, range_input2, range_target, color=(0, 0, 0, 0.5) - ) - self.range_prediction = self.mismatchPlot.plot_surface( - range_input1, range_input2, range_prediction, color=(1, 0, 0, 0.5) - ) - - # set labels - self.mismatchPlot.set_xlabel("Stimulus 1", fontsize=self.fontSize) - self.mismatchPlot.set_ylabel("Stimulus 2", fontsize=self.fontSize) - self.mismatchPlot.set_zlabel("Outcome Value", fontsize=self.fontSize) - self.mismatchPlot.set_title("Target vs. Prediction", fontsize=self.fontSize) - - # update example pattern plot - if target is not None and prediction is not None: - - # select limited number of patterns - self.numPatternsShown = np.min((self.numPatternsShown, target.shape[0])) - target = target[0 : self.numPatternsShown, :] - prediction = prediction[0 : self.numPatternsShown, :] - - im = np.concatenate((target, prediction), axis=1) - self.examplePatternsPlot.cla() - self.examplePatternsPlot.imshow(im, interpolation="nearest", aspect="auto") - x = np.ones(target.shape[0]) * (target.shape[1] - 0.5) - y = np.linspace(1, target.shape[0], target.shape[0]) - self.examplePatternsPlot.plot(x, y, color="red") - - # set labels - self.examplePatternsPlot.set_xlabel("Output", fontsize=self.fontSize) - self.examplePatternsPlot.set_ylabel("Pattern", fontsize=self.fontSize) - self.examplePatternsPlot.set_title( - "Target vs. Prediction", fontsize=self.fontSize - ) - - if self.show_arch_weights: - # update weights - if weights is not None: - for plotIdx, architectureWeights in enumerate(self.architectureWeights): - for lineIdx, plotWeight in enumerate(architectureWeights): - plotWeight.set_xdata( - np.linspace(1, weights.shape[0], weights.shape[0]) - ) - plotWeight.set_ydata(weights[:, plotIdx, lineIdx]) - - # draw current graph - if model_graph is not None: - im = imageio.imread(model_graph) - self.modelGraph.cla() - self.modelGraph.imshow(im) - self.modelGraph.axis("off") - - # re-draw plot - plt.draw() - plt.pause(0.02) From b4509e9b37d86416a3b5455e71bed93afec9b96e Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Thu, 13 Apr 2023 16:48:28 -0400 Subject: [PATCH 04/38] feat: add example running theorist --- .../autora_theorist_basic_usage.xml | 25 +++++ autora/theorist/__main__.py | 96 ++++++++++++++++++- tests/cli/theorist/basic-usage/.gitignore | 1 + tests/cli/theorist/basic-usage/data.csv | 5 + tests/cli/theorist/basic-usage/parameters.yml | 0 tests/cli/theorist/basic-usage/variables.yml | 10 ++ 6 files changed, 132 insertions(+), 5 deletions(-) create mode 100644 .idea/runConfigurations/autora_theorist_basic_usage.xml create mode 100644 tests/cli/theorist/basic-usage/.gitignore create mode 100644 tests/cli/theorist/basic-usage/data.csv create mode 100644 tests/cli/theorist/basic-usage/parameters.yml create mode 100644 tests/cli/theorist/basic-usage/variables.yml diff --git a/.idea/runConfigurations/autora_theorist_basic_usage.xml b/.idea/runConfigurations/autora_theorist_basic_usage.xml new file mode 100644 index 000000000..a11982787 --- /dev/null +++ b/.idea/runConfigurations/autora_theorist_basic_usage.xml @@ -0,0 +1,25 @@ + + + + + \ No newline at end of file diff --git a/autora/theorist/__main__.py b/autora/theorist/__main__.py index d67b7f2c9..83b530ea1 100644 --- a/autora/theorist/__main__.py +++ b/autora/theorist/__main__.py @@ -1,10 +1,18 @@ import importlib import logging -from typing import Type +import pathlib +import pickle +import pprint +from typing import Dict, Optional, Type +import pandas as pd import typer +import yaml +from pandas import DataFrame from sklearn.base import BaseEstimator +from autora.variable import VariableCollection + _logger = logging.getLogger(__name__) @@ -27,16 +35,94 @@ def import_class(name: str) -> Type[BaseEstimator]: return cls -def main(regressor: str, verbose: bool = False, debug: bool = False): +def main( + variables: pathlib.Path, + parameters: pathlib.Path, + regressor: str, + data: pathlib.Path, + output: pathlib.Path, + verbose: bool = False, + debug: bool = False, + overwrite: bool = False, +): - if verbose: - logging.basicConfig(level=logging.INFO) + configure_logger(debug, verbose) + + regressor_class_ = load_regressor_class(regressor) + data_ = load_data(data) + variables_ = load_variables(variables) + parameters_ = load_parameters(parameters) + model = fit_model(data_, parameters_, regressor_class_, variables_) + dump_model(model, output, overwrite) + + return + + +def dump_model(model_, output, overwrite): + if overwrite: + mode = "wb" + else: + mode = "xb" + with open(output, mode) as o: + pickle.dump(model_, o) + + +def fit_model(data_, parameters_, regressor_class_, variables_): + model = regressor_class_(**parameters_) + X = data_[[v.name for v in variables_.independent_variables]] + y = data_[[v.name for v in variables_.dependent_variables]] + _logger.debug(f"fitting the regressor with X:\n{X}\nand y:\n{y}") + model.fit(X, y) + try: + _logger.info( + f"fitted {model=}\nmodel.__dict__:" f"\n{pprint.pformat(model.__dict__)}" + ) + except AttributeError: + _logger.warning( + f"fitted {model=} " + f"model has no __dict__ attribute, so no results are shown" + ) + return model + + +def configure_logger(debug, verbose): if debug: logging.basicConfig(level=logging.DEBUG) + _logger.debug("using DEBUG logging level") + if verbose: + logging.basicConfig(level=logging.INFO) + _logger.info("using INFO logging level") + +def load_regressor_class(regressor): regressor_class = import_class(regressor) + _logger.info(f"{regressor}: {regressor_class}") + return regressor_class + + +def load_data(data: pathlib.Path) -> DataFrame: + _logger.debug(f"load_data: loading from {data=}") + with open(data, "r") as fd: + data_: DataFrame = pd.read_csv(fd) + return data_ + + +def load_variables(path: pathlib.Path) -> VariableCollection: + _logger.debug(f"load_variables: loading from {path=}") + variables_: VariableCollection + with open(path, "r") as fv: + variables_ = yaml.load(fv, yaml.Loader) + assert isinstance(variables_, VariableCollection) + return variables_ + - print(f"{regressor}: {regressor_class}") +def load_parameters(path: pathlib.Path) -> Dict: + _logger.debug(f"load_parameters: loading from {path=}") + with open(path, "r") as fp: + parameters_: Optional[Dict] = yaml.load(fp, yaml.Loader) + if parameters_ is None: + parameters_ = dict() + return parameters_ if __name__ == "__main__": diff --git a/tests/cli/theorist/basic-usage/.gitignore b/tests/cli/theorist/basic-usage/.gitignore new file mode 100644 index 000000000..4c2273862 --- /dev/null +++ b/tests/cli/theorist/basic-usage/.gitignore @@ -0,0 +1 @@ +out.pickle diff --git a/tests/cli/theorist/basic-usage/data.csv b/tests/cli/theorist/basic-usage/data.csv new file mode 100644 index 000000000..52c6adc05 --- /dev/null +++ b/tests/cli/theorist/basic-usage/data.csv @@ -0,0 +1,5 @@ +x1,x2,c1,y +1,1,7,2 +1,2,7,3 +2,2,7,4 +0,0,7,0 diff --git a/tests/cli/theorist/basic-usage/parameters.yml b/tests/cli/theorist/basic-usage/parameters.yml new file mode 100644 index 000000000..e69de29bb diff --git a/tests/cli/theorist/basic-usage/variables.yml b/tests/cli/theorist/basic-usage/variables.yml new file mode 100644 index 000000000..c5e11e9e0 --- /dev/null +++ b/tests/cli/theorist/basic-usage/variables.yml @@ -0,0 +1,10 @@ +!!python/object:autora.variable.VariableCollection +covariates: [] +dependent_variables: + - !!python/object:autora.variable.Variable + name: y +independent_variables: + - !!python/object:autora.variable.Variable + name: x1 + - !!python/object:autora.variable.Variable + name: x2 From 93b8d2ed7507a9ba53edb9792a9217a07caa7982 Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Thu, 13 Apr 2023 16:52:09 -0400 Subject: [PATCH 05/38] feat: add example using parameters --- tests/cli/theorist/basic-usage/parameters.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/cli/theorist/basic-usage/parameters.yml b/tests/cli/theorist/basic-usage/parameters.yml index e69de29bb..8f78d83ae 100644 --- a/tests/cli/theorist/basic-usage/parameters.yml +++ b/tests/cli/theorist/basic-usage/parameters.yml @@ -0,0 +1 @@ +fit_intercept: True From 48699b304ea2af3b9eb746527016e8887d2ecba0 Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Thu, 13 Apr 2023 16:58:27 -0400 Subject: [PATCH 06/38] docs: updating docstrings --- autora/theorist/__main__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/autora/theorist/__main__.py b/autora/theorist/__main__.py index 83b530ea1..298017c7e 100644 --- a/autora/theorist/__main__.py +++ b/autora/theorist/__main__.py @@ -45,14 +45,19 @@ def main( debug: bool = False, overwrite: bool = False, ): - + # Initialization configure_logger(debug, verbose) + # Data Loading regressor_class_ = load_regressor_class(regressor) data_ = load_data(data) variables_ = load_variables(variables) parameters_ = load_parameters(parameters) + + # Fitting model = fit_model(data_, parameters_, regressor_class_, variables_) + + # Writing results dump_model(model, output, overwrite) return @@ -61,8 +66,10 @@ def main( def dump_model(model_, output, overwrite): if overwrite: mode = "wb" + _logger.info(f"overwriting {output=} if it already exists") else: mode = "xb" + _logger.info(f"writing to new file {output=}") with open(output, mode) as o: pickle.dump(model_, o) From 6d3b641780d9d0273359c5ea49edbc9c8aa48cfc Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Thu, 13 Apr 2023 17:00:41 -0400 Subject: [PATCH 07/38] refactor: reorder file to match execution order --- autora/theorist/__main__.py | 88 ++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/autora/theorist/__main__.py b/autora/theorist/__main__.py index 298017c7e..7c6d82632 100644 --- a/autora/theorist/__main__.py +++ b/autora/theorist/__main__.py @@ -49,10 +49,10 @@ def main( configure_logger(debug, verbose) # Data Loading - regressor_class_ = load_regressor_class(regressor) - data_ = load_data(data) variables_ = load_variables(variables) parameters_ = load_parameters(parameters) + regressor_class_ = load_regressor_class(regressor) + data_ = load_data(data) # Fitting model = fit_model(data_, parameters_, regressor_class_, variables_) @@ -63,35 +63,6 @@ def main( return -def dump_model(model_, output, overwrite): - if overwrite: - mode = "wb" - _logger.info(f"overwriting {output=} if it already exists") - else: - mode = "xb" - _logger.info(f"writing to new file {output=}") - with open(output, mode) as o: - pickle.dump(model_, o) - - -def fit_model(data_, parameters_, regressor_class_, variables_): - model = regressor_class_(**parameters_) - X = data_[[v.name for v in variables_.independent_variables]] - y = data_[[v.name for v in variables_.dependent_variables]] - _logger.debug(f"fitting the regressor with X:\n{X}\nand y:\n{y}") - model.fit(X, y) - try: - _logger.info( - f"fitted {model=}\nmodel.__dict__:" f"\n{pprint.pformat(model.__dict__)}" - ) - except AttributeError: - _logger.warning( - f"fitted {model=} " - f"model has no __dict__ attribute, so no results are shown" - ) - return model - - def configure_logger(debug, verbose): if debug: logging.basicConfig(level=logging.DEBUG) @@ -101,19 +72,6 @@ def configure_logger(debug, verbose): _logger.info("using INFO logging level") -def load_regressor_class(regressor): - regressor_class = import_class(regressor) - _logger.info(f"{regressor}: {regressor_class}") - return regressor_class - - -def load_data(data: pathlib.Path) -> DataFrame: - _logger.debug(f"load_data: loading from {data=}") - with open(data, "r") as fd: - data_: DataFrame = pd.read_csv(fd) - return data_ - - def load_variables(path: pathlib.Path) -> VariableCollection: _logger.debug(f"load_variables: loading from {path=}") variables_: VariableCollection @@ -132,5 +90,47 @@ def load_parameters(path: pathlib.Path) -> Dict: return parameters_ +def load_regressor_class(regressor): + regressor_class = import_class(regressor) + _logger.info(f"{regressor}: {regressor_class}") + return regressor_class + + +def load_data(data: pathlib.Path) -> DataFrame: + _logger.debug(f"load_data: loading from {data=}") + with open(data, "r") as fd: + data_: DataFrame = pd.read_csv(fd) + return data_ + + +def fit_model(data_, parameters_, regressor_class_, variables_): + model = regressor_class_(**parameters_) + X = data_[[v.name for v in variables_.independent_variables]] + y = data_[[v.name for v in variables_.dependent_variables]] + _logger.debug(f"fitting the regressor with X:\n{X}\nand y:\n{y}") + model.fit(X, y) + try: + _logger.info( + f"fitted {model=}\nmodel.__dict__:" f"\n{pprint.pformat(model.__dict__)}" + ) + except AttributeError: + _logger.warning( + f"fitted {model=} " + f"model has no __dict__ attribute, so no results are shown" + ) + return model + + +def dump_model(model_, output, overwrite): + if overwrite: + mode = "wb" + _logger.info(f"overwriting {output=} if it already exists") + else: + mode = "xb" + _logger.info(f"writing to new file {output=}") + with open(output, mode) as o: + pickle.dump(model_, o) + + if __name__ == "__main__": typer.run(main) From a7be18b440b5e802e899a044a19dd81409e20560 Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Thu, 13 Apr 2023 17:01:18 -0400 Subject: [PATCH 08/38] refactor: reorder file to match execution order --- autora/theorist/__main__.py | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/autora/theorist/__main__.py b/autora/theorist/__main__.py index 7c6d82632..1c4877a94 100644 --- a/autora/theorist/__main__.py +++ b/autora/theorist/__main__.py @@ -16,25 +16,6 @@ _logger = logging.getLogger(__name__) -def import_class(name: str) -> Type[BaseEstimator]: - """ - Load a class from a module by name. - - Args: - name: - - Examples: - >>> import_class("sklearn.linear_model.LinearRegressor") - - """ - components = name.split(".") - module_name, class_name = ".".join(components[:-1]), components[-1] - _logger.info(f"loading {module_name=}, {class_name=}") - module = importlib.import_module(module_name) - cls = getattr(module, class_name) - return cls - - def main( variables: pathlib.Path, parameters: pathlib.Path, @@ -96,6 +77,25 @@ def load_regressor_class(regressor): return regressor_class +def import_class(name: str) -> Type[BaseEstimator]: + """ + Load a class from a module by name. + + Args: + name: + + Examples: + >>> import_class("sklearn.linear_model.LinearRegressor") + + """ + components = name.split(".") + module_name, class_name = ".".join(components[:-1]), components[-1] + _logger.info(f"loading {module_name=}, {class_name=}") + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + return cls + + def load_data(data: pathlib.Path) -> DataFrame: _logger.debug(f"load_data: loading from {data=}") with open(data, "r") as fd: From 3b23df366925c60867c864a28fc4e6c964b6c9cc Mon Sep 17 00:00:00 2001 From: John Gerrard Holland Date: Thu, 13 Apr 2023 17:12:34 -0400 Subject: [PATCH 09/38] feat: add ability to pre-set parameters of regressor --- .../autora_theorist_basic_usage.xml | 2 +- autora/theorist/__main__.py | 70 +++++++------------ tests/cli/theorist/basic-usage/regressor.yml | 6 ++ 3 files changed, 32 insertions(+), 46 deletions(-) create mode 100644 tests/cli/theorist/basic-usage/regressor.yml diff --git a/.idea/runConfigurations/autora_theorist_basic_usage.xml b/.idea/runConfigurations/autora_theorist_basic_usage.xml index a11982787..b70e88f22 100644 --- a/.idea/runConfigurations/autora_theorist_basic_usage.xml +++ b/.idea/runConfigurations/autora_theorist_basic_usage.xml @@ -14,7 +14,7 @@