From e9b768d8d8f9ddf4219a2c06545d29bb667b98dd Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Mon, 11 Dec 2023 11:03:43 +0100 Subject: [PATCH 01/19] FilterOutliers overhaul --- ipsuite/configuration_selection/filter.py | 82 +++++++++++++++++------ 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index c6419367..a6f2e29a 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -7,57 +7,83 @@ from ipsuite import base -class FilterOutlier(base.ProcessAtoms): +def direct_cutoff(values, threshold, cutoffs): + # Filtering the direct cutoff values + if cutoffs is None: + raise ValueError("cutoffs not specified.") + return (cutoffs[0], cutoffs[1]) + +def cutoff_around_mean(values, threshold, cutoffs): + # Filtering in multiples of the standard deviation around the mean. + mean = np.mean(values) + std = np.std(values) + + upper_limit = mean + threshold * std + lower_limit = mean - threshold * std + return (upper_limit, lower_limit) + +CUTOFF = { + "direct": direct_cutoff, + "around_mean": cutoff_around_mean +} + + +class FilterOutliers(base.ProcessAtoms): """Remove outliers from the data based on a given property. Attributes ---------- key : str, default="energy" The property to filter on. - threshold : float, default=3 - The threshold for filtering in units of standard deviations. + cutoff_type : {"direct", "around_mean"}, default="around_mean" + Defines the cutoff type. direction : {"above", "below", "both"}, default="both" The direction to filter in. + threshold : float, default=3 + The threshold for filtering in units of standard deviations. + cutoffs : list(float), default=None + Upper and lower cutoff. """ key: str = zntrack.params("energy") - threshold: float = zntrack.params(3) + cutoff_type: t.Literal["direct", "around_mean"] = zntrack.params("around_mean") direction: t.Literal["above", "below", "both"] = zntrack.params("both") + threshold: float = zntrack.params(3) + cutoffs: list(float) = zntrack.params(None) + filtered_indices: list = zntrack.outs() histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png") - def run(self): + def run(self): values = [x.calc.results[self.key] for x in self.data] - mean = np.mean(values) - std = np.std(values) + + if len(values[0][0]) == 3: + # calculates the maximal magnetude of cartesian values + values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] + + upper_limit, lower_limit = CUTOFF(self.cutoff_type)( + values, + self.threshold, + self.cutoffs, + ) if self.direction == "above": self.filtered_indices = [ - i for i, x in enumerate(values) if x > mean + self.threshold * std + i for i, x in enumerate(values) if x > upper_limit ] elif self.direction == "below": self.filtered_indices = [ - i for i, x in enumerate(values) if x < mean - self.threshold * std + i for i, x in enumerate(values) if x < lower_limit ] else: self.filtered_indices = [ i for i, x in enumerate(values) - if x > mean + self.threshold * std or x < mean - self.threshold * std + if x > upper_limit or x < lower_limit ] - fig, ax = plt.subplots(3, figsize=(10, 10)) - ax[0].hist(values, bins=100) - ax[0].set_title("All") - ax[1].hist( - [values[i] for i in range(len(values)) if i not in self.filtered_indices], - bins=100, - ) - ax[1].set_title("Filtered") - ax[2].hist([values[i] for i in self.filtered_indices], bins=100) - ax[2].set_title("Excluded") - fig.savefig(self.histogram, bbox_inches="tight") + plot_hist(values, self.filtered_indices, self.histogram) @property def atoms(self): @@ -68,3 +94,17 @@ def atoms(self): @property def excluded_atoms(self): return [self.data[i] for i in self.filtered_indices] + + +def plot_hist(values, filtered_indices, histogram): + fig, ax = plt.subplots(3, figsize=(10, 10)) + ax[0].hist(values, bins=100) + ax[0].set_title("All") + ax[1].hist( + [values[i] for i in range(len(values)) if i not in filtered_indices], + bins=100, + ) + ax[1].set_title("Filtered") + ax[2].hist([values[i] for i in filtered_indices], bins=100) + ax[2].set_title("Excluded") + fig.savefig(histogram, bbox_inches="tight") \ No newline at end of file From 3110a6bd428e6da52b6c185cc7d898280f7d871a Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Mon, 11 Dec 2023 14:25:27 +0100 Subject: [PATCH 02/19] Filter Node becomes childclass of ConfigurationSelection --- ipsuite/configuration_selection/filter.py | 88 +++++++++++------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index a6f2e29a..e7b25b89 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -1,16 +1,17 @@ import typing as t +import ase import matplotlib.pyplot as plt import numpy as np import zntrack -from ipsuite import base +from ipsuite.configuration_selection import ConfigurationSelection def direct_cutoff(values, threshold, cutoffs): # Filtering the direct cutoff values if cutoffs is None: - raise ValueError("cutoffs not specified.") + raise ValueError("cutoffs have to be specified for using the direct cutoff filter.") return (cutoffs[0], cutoffs[1]) def cutoff_around_mean(values, threshold, cutoffs): @@ -18,9 +19,9 @@ def cutoff_around_mean(values, threshold, cutoffs): mean = np.mean(values) std = np.std(values) - upper_limit = mean + threshold * std - lower_limit = mean - threshold * std - return (upper_limit, lower_limit) + upper_cutoff = mean + threshold * std + lower_cutoff = mean - threshold * std + return (lower_cutoff, upper_cutoff) CUTOFF = { "direct": direct_cutoff, @@ -28,7 +29,7 @@ def cutoff_around_mean(values, threshold, cutoffs): } -class FilterOutliers(base.ProcessAtoms): +class FilterOutlier(ConfigurationSelection): """Remove outliers from the data based on a given property. Attributes @@ -42,69 +43,68 @@ class FilterOutliers(base.ProcessAtoms): threshold : float, default=3 The threshold for filtering in units of standard deviations. cutoffs : list(float), default=None - Upper and lower cutoff. + Lower and upper cutoff. """ key: str = zntrack.params("energy") cutoff_type: t.Literal["direct", "around_mean"] = zntrack.params("around_mean") direction: t.Literal["above", "below", "both"] = zntrack.params("both") threshold: float = zntrack.params(3) - cutoffs: list(float) = zntrack.params(None) + cutoffs: t.Union[t.List[float], None] = zntrack.params(None) - - filtered_indices: list = zntrack.outs() histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png") + + def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: + values = [atoms.calc.results[self.key] for atoms in atoms_lst] - def run(self): - values = [x.calc.results[self.key] for x in self.data] - - if len(values[0][0]) == 3: + # get maximal atomic value per struckture + if np.array(values).ndim == 3: # calculates the maximal magnetude of cartesian values values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] - - upper_limit, lower_limit = CUTOFF(self.cutoff_type)( + elif np.array(values).ndim == 2: + # calculates the maximal atomic values + values = [np.max(value, axis=0) for value in values] + + lower_limit, upper_limit = CUTOFF[self.cutoff_type]( values, self.threshold, self.cutoffs, ) if self.direction == "above": - self.filtered_indices = [ - i for i, x in enumerate(values) if x > upper_limit + selection = [ + i for i, x in enumerate(values) if x < upper_limit ] elif self.direction == "below": - self.filtered_indices = [ - i for i, x in enumerate(values) if x < lower_limit + selection = [ + i for i, x in enumerate(values) if x > lower_limit ] else: - self.filtered_indices = [ + selection = [ i for i, x in enumerate(values) - if x > upper_limit or x < lower_limit + if x > lower_limit and x < upper_limit ] - plot_hist(values, self.filtered_indices, self.histogram) + return selection - @property - def atoms(self): - return [ - self.data[i] for i in range(len(self.data)) if i not in self.filtered_indices - ] - @property - def excluded_atoms(self): - return [self.data[i] for i in self.filtered_indices] - + def get_hist(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): + values = [atoms.calc.results[self.key] for atoms in atoms_lst] -def plot_hist(values, filtered_indices, histogram): - fig, ax = plt.subplots(3, figsize=(10, 10)) - ax[0].hist(values, bins=100) - ax[0].set_title("All") - ax[1].hist( - [values[i] for i in range(len(values)) if i not in filtered_indices], - bins=100, - ) - ax[1].set_title("Filtered") - ax[2].hist([values[i] for i in filtered_indices], bins=100) - ax[2].set_title("Excluded") - fig.savefig(histogram, bbox_inches="tight") \ No newline at end of file + # check if property is in cartesian basis + if np.array(values).ndim == 3: + # calculates the maximal magnetude of cartesian values + values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] + + fig, ax = plt.subplots(3, figsize=(10, 10)) + ax[0].hist(values, bins=100) + ax[0].set_title("All") + ax[1].hist( + [values[i] for i in range(len(values)) if i not in indices], + bins=100, + ) + ax[1].set_title("Filtered") + ax[2].hist([values[i] for i in indices], bins=100) + ax[2].set_title("Excluded") + fig.savefig(self.img_selection, bbox_inches="tight") \ No newline at end of file From 90fe016a07026118684a0f4cf5a388f694d3725a Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Mon, 11 Dec 2023 14:25:46 +0100 Subject: [PATCH 03/19] introduced test for filter selection --- .../configuration_selection/test_filter.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tests/unit_tests/configuration_selection/test_filter.py diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py new file mode 100644 index 00000000..cb8a7ce8 --- /dev/null +++ b/tests/unit_tests/configuration_selection/test_filter.py @@ -0,0 +1,36 @@ +import numpy as np +import pytest + +from ipsuite.configuration_selection import FilterOutlier + + +@pytest.mark.parametrize( + "key, cutoff_type, direction, cutoffs", + [ + ("forces", "direct", "both", [7, 13]), + ("forces", "direct", "both", None), + ("forces", "around_mean", "both", None), + ], +) +def test_get_selected_atoms(atoms_list, key, cutoff_type, direction, cutoffs): + for idx, atoms in enumerate(atoms_list): + atoms.calc.results[key] = [idx, 0, 0] + + filter = FilterOutlier( + key=key, + cutoff_type=cutoff_type, + direction=direction, + data=None, + cutoffs=cutoffs, + threshold=0.4, + ) + + if "direct" in cutoff_type and cutoffs is None: + with pytest.raises(ValueError): + selected_atoms = filter.select_atoms(atoms_list) + else: + test_selection = [8, 9, 10, 11, 12] + selected_atoms = filter.select_atoms(atoms_list) + assert isinstance(selected_atoms, list) + assert len(set(selected_atoms)) == 5 + assert selected_atoms == test_selection From 813a0c7331706855e044f78e8000e1e0881a057f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Dec 2023 13:55:50 +0000 Subject: [PATCH 04/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ipsuite/configuration_selection/filter.py | 32 +++++++++-------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index e7b25b89..16e6202a 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -11,9 +11,12 @@ def direct_cutoff(values, threshold, cutoffs): # Filtering the direct cutoff values if cutoffs is None: - raise ValueError("cutoffs have to be specified for using the direct cutoff filter.") + raise ValueError( + "cutoffs have to be specified for using the direct cutoff filter." + ) return (cutoffs[0], cutoffs[1]) + def cutoff_around_mean(values, threshold, cutoffs): # Filtering in multiples of the standard deviation around the mean. mean = np.mean(values) @@ -23,10 +26,8 @@ def cutoff_around_mean(values, threshold, cutoffs): lower_cutoff = mean - threshold * std return (lower_cutoff, upper_cutoff) -CUTOFF = { - "direct": direct_cutoff, - "around_mean": cutoff_around_mean -} + +CUTOFF = {"direct": direct_cutoff, "around_mean": cutoff_around_mean} class FilterOutlier(ConfigurationSelection): @@ -53,8 +54,8 @@ class FilterOutlier(ConfigurationSelection): cutoffs: t.Union[t.List[float], None] = zntrack.params(None) histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png") - - def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: + + def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: values = [atoms.calc.results[self.key] for atoms in atoms_lst] # get maximal atomic value per struckture @@ -64,7 +65,7 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: elif np.array(values).ndim == 2: # calculates the maximal atomic values values = [np.max(value, axis=0) for value in values] - + lower_limit, upper_limit = CUTOFF[self.cutoff_type]( values, self.threshold, @@ -72,23 +73,16 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: ) if self.direction == "above": - selection = [ - i for i, x in enumerate(values) if x < upper_limit - ] + selection = [i for i, x in enumerate(values) if x < upper_limit] elif self.direction == "below": - selection = [ - i for i, x in enumerate(values) if x > lower_limit - ] + selection = [i for i, x in enumerate(values) if x > lower_limit] else: selection = [ - i - for i, x in enumerate(values) - if x > lower_limit and x < upper_limit + i for i, x in enumerate(values) if x > lower_limit and x < upper_limit ] return selection - def get_hist(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): values = [atoms.calc.results[self.key] for atoms in atoms_lst] @@ -107,4 +101,4 @@ def get_hist(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): ax[1].set_title("Filtered") ax[2].hist([values[i] for i in indices], bins=100) ax[2].set_title("Excluded") - fig.savefig(self.img_selection, bbox_inches="tight") \ No newline at end of file + fig.savefig(self.img_selection, bbox_inches="tight") From 7f278a3cdfadf79e9bc63498af093ffc8e6c8670 Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Tue, 12 Dec 2023 08:33:29 +0100 Subject: [PATCH 05/19] _get_plot() fix --- ipsuite/configuration_selection/filter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index e7b25b89..b0b9fbee 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -51,8 +51,6 @@ class FilterOutlier(ConfigurationSelection): direction: t.Literal["above", "below", "both"] = zntrack.params("both") threshold: float = zntrack.params(3) cutoffs: t.Union[t.List[float], None] = zntrack.params(None) - - histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png") def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: values = [atoms.calc.results[self.key] for atoms in atoms_lst] @@ -89,7 +87,7 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: return selection - def get_hist(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): + def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): values = [atoms.calc.results[self.key] for atoms in atoms_lst] # check if property is in cartesian basis From 113989ade9793853a588c872a3bd1079ab663991 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Dec 2023 07:37:19 +0000 Subject: [PATCH 06/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ipsuite/configuration_selection/filter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index be5e440c..4a40e23e 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -52,8 +52,8 @@ class FilterOutlier(ConfigurationSelection): direction: t.Literal["above", "below", "both"] = zntrack.params("both") threshold: float = zntrack.params(3) cutoffs: t.Union[t.List[float], None] = zntrack.params(None) - - def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: + + def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: values = [atoms.calc.results[self.key] for atoms in atoms_lst] # get maximal atomic value per struckture @@ -81,7 +81,6 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: return selection - def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): values = [atoms.calc.results[self.key] for atoms in atoms_lst] From 21c57b7ce1980a25ea8bad7ccb9097198933f2b3 Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Thu, 14 Dec 2023 12:51:15 +0100 Subject: [PATCH 07/19] Ragged values fix --- ipsuite/configuration_selection/filter.py | 27 ++++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index be5e440c..2e98151e 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -57,12 +57,13 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: values = [atoms.calc.results[self.key] for atoms in atoms_lst] # get maximal atomic value per struckture - if np.array(values).ndim == 3: - # calculates the maximal magnetude of cartesian values - values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] - elif np.array(values).ndim == 2: - # calculates the maximal atomic values - values = [np.max(value, axis=0) for value in values] + if isinstance(values[0], np.ndarray): + if values[0].ndim == 2: + # calculates the maximal magnetude of atomic cartesian property + values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] + elif values[0].ndim == 1: + # calculates the maximal atomic property + values = [np.max(value, axis=0) for value in values] lower_limit, upper_limit = CUTOFF[self.cutoff_type]( values, @@ -76,7 +77,7 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: selection = [i for i, x in enumerate(values) if x > lower_limit] else: selection = [ - i for i, x in enumerate(values) if x > lower_limit and x < upper_limit + i for i, x in enumerate(values) if x < lower_limit or x > upper_limit ] return selection @@ -85,10 +86,14 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): values = [atoms.calc.results[self.key] for atoms in atoms_lst] - # check if property is in cartesian basis - if np.array(values).ndim == 3: - # calculates the maximal magnetude of cartesian values - values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] + # get maximal atomic value per struckture + if isinstance(values[0], np.ndarray): + if values[0].ndim == 2: + # calculates the maximal magnetude of atomic cartesian property + values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] + elif values[0].ndim == 1: + # calculates the maximal atomic property + values = [np.max(value, axis=0) for value in values] fig, ax = plt.subplots(3, figsize=(10, 10)) ax[0].hist(values, bins=100) From 401177a7e2aa0908be2564f9a69ca73de242ad1f Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Thu, 14 Dec 2023 13:20:00 +0100 Subject: [PATCH 08/19] node name change test fix --- ipsuite/configuration_selection/__init__.py | 4 ++-- ipsuite/configuration_selection/filter.py | 16 ++++++++-------- ipsuite/nodes.py | 2 +- .../configuration_selection/test_filter.py | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ipsuite/configuration_selection/__init__.py b/ipsuite/configuration_selection/__init__.py index 0c9664e6..a63791d0 100644 --- a/ipsuite/configuration_selection/__init__.py +++ b/ipsuite/configuration_selection/__init__.py @@ -1,7 +1,7 @@ """Configuration Selection Nodes.""" from ipsuite.configuration_selection.base import ConfigurationSelection -from ipsuite.configuration_selection.filter import FilterOutlier +from ipsuite.configuration_selection.filter import PropertyFilter from ipsuite.configuration_selection.index import IndexSelection from ipsuite.configuration_selection.kernel import KernelSelection from ipsuite.configuration_selection.random import RandomSelection @@ -21,5 +21,5 @@ "IndexSelection", "ThresholdSelection", "SplitSelection", - "FilterOutlier", + "PropertyFilter", ] diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index 2e98151e..83969bc1 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -30,8 +30,8 @@ def cutoff_around_mean(values, threshold, cutoffs): CUTOFF = {"direct": direct_cutoff, "around_mean": cutoff_around_mean} -class FilterOutlier(ConfigurationSelection): - """Remove outliers from the data based on a given property. +class PropertyFilter(ConfigurationSelection): + """Filter structures from the dataset based on a given property. Attributes ---------- @@ -72,12 +72,12 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: ) if self.direction == "above": - selection = [i for i, x in enumerate(values) if x < upper_limit] + selection = [i for i, x in enumerate(values) if x > upper_limit] elif self.direction == "below": - selection = [i for i, x in enumerate(values) if x > lower_limit] + selection = [i for i, x in enumerate(values) if x < lower_limit] else: selection = [ - i for i, x in enumerate(values) if x < lower_limit or x > upper_limit + i for i, x in enumerate(values) if x > lower_limit and x < upper_limit ] return selection @@ -98,11 +98,11 @@ def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): fig, ax = plt.subplots(3, figsize=(10, 10)) ax[0].hist(values, bins=100) ax[0].set_title("All") - ax[1].hist( + ax[1].hist([values[i] for i in indices], bins=100) + ax[1].set_title("Selected") + ax[2].hist( [values[i] for i in range(len(values)) if i not in indices], bins=100, ) - ax[1].set_title("Filtered") - ax[2].hist([values[i] for i in indices], bins=100) ax[2].set_title("Excluded") fig.savefig(self.img_selection, bbox_inches="tight") diff --git a/ipsuite/nodes.py b/ipsuite/nodes.py index b24d66e5..ad527f44 100644 --- a/ipsuite/nodes.py +++ b/ipsuite/nodes.py @@ -26,7 +26,7 @@ class _Nodes: ) UniformTemporalSelection = "ipsuite.configuration_selection.UniformTemporalSelection" ThresholdSelection = "ipsuite.configuration_selection.ThresholdSelection" - FilterOutlier = "ipsuite.configuration_selection.FilterOutlier" + PropertyFilter = "ipsuite.configuration_selection.PropertyFilter" BatchKernelSelection = "ipsuite.models.apax.BatchKernelSelection" # Configuration Comparison diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py index cb8a7ce8..adb5662c 100644 --- a/tests/unit_tests/configuration_selection/test_filter.py +++ b/tests/unit_tests/configuration_selection/test_filter.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from ipsuite.configuration_selection import FilterOutlier +from ipsuite.configuration_selection import PropertyFilter @pytest.mark.parametrize( @@ -14,9 +14,9 @@ ) def test_get_selected_atoms(atoms_list, key, cutoff_type, direction, cutoffs): for idx, atoms in enumerate(atoms_list): - atoms.calc.results[key] = [idx, 0, 0] + atoms.calc.results[key] = np.array([[idx, 0, 0], [0, 0, 0]]) - filter = FilterOutlier( + filter = PropertyFilter( key=key, cutoff_type=cutoff_type, direction=direction, From 2f1a994c5da80d308cb28bf46e04ef4035959e59 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Dec 2023 12:20:18 +0000 Subject: [PATCH 09/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ipsuite/configuration_selection/filter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index 5a41bf00..3aa3ab96 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -60,7 +60,9 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: if isinstance(values[0], np.ndarray): if values[0].ndim == 2: # calculates the maximal magnetude of atomic cartesian property - values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] + values = [ + np.max(np.linalg.norm(value, axis=1), axis=0) for value in values + ] elif values[0].ndim == 1: # calculates the maximal atomic property values = [np.max(value, axis=0) for value in values] @@ -89,7 +91,9 @@ def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): if isinstance(values[0], np.ndarray): if values[0].ndim == 2: # calculates the maximal magnetude of atomic cartesian property - values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values] + values = [ + np.max(np.linalg.norm(value, axis=1), axis=0) for value in values + ] elif values[0].ndim == 1: # calculates the maximal atomic property values = [np.max(value, axis=0) for value in values] From 2168bd3797c271673147c6dec2708dedfabac6a4 Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Thu, 14 Dec 2023 13:59:35 +0100 Subject: [PATCH 10/19] fexed integration test --- tests/integration/configuration_selection/test_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/configuration_selection/test_index.py b/tests/integration/configuration_selection/test_index.py index e493a79f..7264dbce 100644 --- a/tests/integration/configuration_selection/test_index.py +++ b/tests/integration/configuration_selection/test_index.py @@ -142,8 +142,8 @@ def test_exclude_configurations_list(proj_path, traj_file): def test_filter_outlier(proj_path, traj_file): with ips.Project() as project: data = ips.AddData(file=traj_file) - filtered_data = ips.configuration_selection.FilterOutlier( - data=data.atoms, key="energy", threshold=1, direction="both" + filtered_data = ips.configuration_selection.PropertyFilter( + data=data.atoms, key="energy", cutoff_type='around_mean', threshold=1, direction="both" ) project.run() From 4404c6ae7c80f6c2905ddf32042609506ef4cce8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Dec 2023 13:00:06 +0000 Subject: [PATCH 11/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/integration/configuration_selection/test_index.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration/configuration_selection/test_index.py b/tests/integration/configuration_selection/test_index.py index 7264dbce..73a13669 100644 --- a/tests/integration/configuration_selection/test_index.py +++ b/tests/integration/configuration_selection/test_index.py @@ -143,7 +143,11 @@ def test_filter_outlier(proj_path, traj_file): with ips.Project() as project: data = ips.AddData(file=traj_file) filtered_data = ips.configuration_selection.PropertyFilter( - data=data.atoms, key="energy", cutoff_type='around_mean', threshold=1, direction="both" + data=data.atoms, + key="energy", + cutoff_type="around_mean", + threshold=1, + direction="both", ) project.run() From 8832668f790bf0ca3cfd19719ab5729d64668a8f Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Fri, 19 Apr 2024 12:55:12 +0200 Subject: [PATCH 12/19] PropertyFilter fix --- ipsuite/configuration_selection/filter.py | 168 +++++++++++----------- 1 file changed, 85 insertions(+), 83 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index 3aa3ab96..26a033a5 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -8,104 +8,106 @@ from ipsuite.configuration_selection import ConfigurationSelection -def direct_cutoff(values, threshold, cutoffs): - # Filtering the direct cutoff values - if cutoffs is None: - raise ValueError( - "cutoffs have to be specified for using the direct cutoff filter." - ) - return (cutoffs[0], cutoffs[1]) +def mean_reduction(values, axis): + return np.mean(values, axis=axis) -def cutoff_around_mean(values, threshold, cutoffs): - # Filtering in multiples of the standard deviation around the mean. - mean = np.mean(values) - std = np.std(values) +def max_reduction(values, axis): + return np.max(values, axis=axis) - upper_cutoff = mean + threshold * std - lower_cutoff = mean - threshold * std - return (lower_cutoff, upper_cutoff) +def check_dimension(values): + if values.ndim > 1: + raise ValueError( + f"Value dimension is {values.ndim} != 1. " + "Reduce the dimension by defining dim_reduction, " + "use mean or max to get (n_structures,) shape." + ) -CUTOFF = {"direct": direct_cutoff, "around_mean": cutoff_around_mean} +REDUCTIONS = { + "mean": mean_reduction, + "max": max_reduction, +} class PropertyFilter(ConfigurationSelection): - """Filter structures from the dataset based on a given property. - - Attributes - ---------- - key : str, default="energy" - The property to filter on. - cutoff_type : {"direct", "around_mean"}, default="around_mean" - Defines the cutoff type. - direction : {"above", "below", "both"}, default="both" - The direction to filter in. - threshold : float, default=3 - The threshold for filtering in units of standard deviations. - cutoffs : list(float), default=None - Lower and upper cutoff. - """ - - key: str = zntrack.params("energy") - cutoff_type: t.Literal["direct", "around_mean"] = zntrack.params("around_mean") + + reference = zntrack.params("energy") + cutoffs: t.Union[t.List[float]] = zntrack.params() direction: t.Literal["above", "below", "both"] = zntrack.params("both") - threshold: float = zntrack.params(3) - cutoffs: t.Union[t.List[float], None] = zntrack.params(None) - - def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: - values = [atoms.calc.results[self.key] for atoms in atoms_lst] - - # get maximal atomic value per struckture - if isinstance(values[0], np.ndarray): - if values[0].ndim == 2: - # calculates the maximal magnetude of atomic cartesian property - values = [ - np.max(np.linalg.norm(value, axis=1), axis=0) for value in values - ] - elif values[0].ndim == 1: - # calculates the maximal atomic property - values = [np.max(value, axis=0) for value in values] - - lower_limit, upper_limit = CUTOFF[self.cutoff_type]( - values, - self.threshold, - self.cutoffs, - ) + n_configurations = zntrack.params(None) + min_distance: int = zntrack.params(1) + dim_reduction: str = zntrack.params(None) + reduction_axis = zntrack.params((1, 2)) + + def _post_init_(self): + if self.direction not in ["above", "below", "both"]: + raise ValueError("'direction' should be set to 'above', 'below', or 'both'.") + + return super()._post_init_() + + def select_atoms( + self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: + self.reduction_axis = tuple(self.reduction_axis) + values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst]) + + if self.dim_reduction is not None: + reduction_fn = REDUCTIONS[self.dim_reduction] + values = reduction_fn(values, self.reduction_axis) + + check_dimension(values) + + lower_limit, upper_limit = self.cutoffs[0], self.cutoffs[1] if self.direction == "above": - selection = [i for i, x in enumerate(values) if x > upper_limit] + pre_selection = np.array([i for i, x in enumerate(values) if x > upper_limit]) + sorting_idx = np.argsort(values[pre_selection])[::-1] elif self.direction == "below": - selection = [i for i, x in enumerate(values) if x < lower_limit] + pre_selection = np.array([i for i, x in enumerate(values) if x < lower_limit]) + sorting_idx = np.argsort(values[pre_selection]) else: - selection = [ - i for i, x in enumerate(values) if x > lower_limit and x < upper_limit - ] + pre_selection = np.array([ + i for i, x in enumerate(values) if x < lower_limit or x > upper_limit + ]) + mean = (lower_limit+upper_limit)/2 + dist_to_mean = abs(values[pre_selection]-mean) + sorting_idx = np.argsort(dist_to_mean)[::-1] + + selection = self.get_selection(pre_selection[sorting_idx]) + + return selection + + def get_selection(self, indices): + selection = [] + for idx in indices: + # If the value is close to any of the already selected values, skip it. + if not any(np.abs(idx - np.array(selection)) < self.min_distance): + selection.append(idx) + if len(selection) == self.n_configurations: + break return selection def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): - values = [atoms.calc.results[self.key] for atoms in atoms_lst] - - # get maximal atomic value per struckture - if isinstance(values[0], np.ndarray): - if values[0].ndim == 2: - # calculates the maximal magnetude of atomic cartesian property - values = [ - np.max(np.linalg.norm(value, axis=1), axis=0) for value in values - ] - elif values[0].ndim == 1: - # calculates the maximal atomic property - values = [np.max(value, axis=0) for value in values] - - fig, ax = plt.subplots(3, figsize=(10, 10)) - ax[0].hist(values, bins=100) - ax[0].set_title("All") - ax[1].hist([values[i] for i in indices], bins=100) - ax[1].set_title("Selected") - ax[2].hist( - [values[i] for i in range(len(values)) if i not in indices], - bins=100, - ) - ax[2].set_title("Excluded") + indices = np.array(indices) + values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst]) + + if self.dim_reduction is not None: + reduction_fn = REDUCTIONS[self.dim_reduction] + values = reduction_fn(values, self.reduction_axis) + + fig, ax = plt.subplots() + ax.plot(values, label=self.reference) + ax.plot(indices, values[indices], "x", color="red") + ax.fill_between( + np.arange(len(values)), + self.cutoffs[0], + self.cutoffs[1], + color="black", + alpha=0.2, + label=f"{self.reference} +- std", + ) + ax.set_ylabel(self.reference) + ax.set_xlabel("configuration") + fig.savefig(self.img_selection, bbox_inches="tight") From 02bdaf7b9198b446957e87fc5dfbee2a4d3a10ba Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Fri, 19 Apr 2024 12:55:42 +0200 Subject: [PATCH 13/19] ThresholdSelection fix --- ipsuite/configuration_selection/threshold.py | 116 +++++++------------ 1 file changed, 43 insertions(+), 73 deletions(-) diff --git a/ipsuite/configuration_selection/threshold.py b/ipsuite/configuration_selection/threshold.py index 6fceda7a..72388298 100644 --- a/ipsuite/configuration_selection/threshold.py +++ b/ipsuite/configuration_selection/threshold.py @@ -34,67 +34,25 @@ def check_dimension(values): class ThresholdSelection(ConfigurationSelection): - """Select atoms based on a given threshold. - - Select atoms above a given threshold or the n_configurations with the - highest / lowest value. Typically useful for uncertainty based selection. - - Attributes - ---------- - key: str - The key in 'calc.results' to select from - threshold: float, optional - All values above (or below if negative) this threshold will be selected. - If n_configurations is given, 'self.threshold' will be prioritized, - but a maximum of n_configurations will be selected. - reference: str, optional - For visualizing the selection a reference value can be given. - For 'energy_uncertainty' this would typically be 'energy'. - n_configurations: int, optional - Number of configurations to select. - min_distance: int, optional - Minimum distance between selected configurations. - dim_reduction: str, optional - Reduces the dimensionality of the chosen uncertainty along the specified axis - by calculating either the maximum or mean value. - - Choose from ["max", "mean"] - reduction_axis: tuple(int), optional - Specifies the axis along which the reduction occurs. - """ - - key = zntrack.params("energy_uncertainty") + reference = zntrack.params("energy") - threshold = zntrack.params(None) + threshold = zntrack.params(2) + direction: typing.Literal["above", "below", "both"] = zntrack.params("both") n_configurations = zntrack.params(None) min_distance: int = zntrack.params(1) dim_reduction: str = zntrack.params(None) reduction_axis = zntrack.params((1, 2)) def _post_init_(self): - if self.threshold is None and self.n_configurations is None: - raise ValueError("Either 'threshold' or 'n_configurations' must not be None.") + if self.direction not in ["above", "below", "both"]: + raise ValueError("'direction' should be set to 'above', 'below', or 'both'.") return super()._post_init_() def select_atoms( - self, atoms_lst: typing.List[ase.Atoms], save_fig: bool = True - ) -> typing.List[int]: - """Take every nth (step) object of a given atoms list. - - Parameters - ---------- - atoms_lst: typing.List[ase.Atoms] - list of atoms objects to arange - - Returns - ------- - typing.List[int]: - list containing the taken indices - """ - + self, atoms_lst: typing.List[ase.Atoms]) -> typing.List[int]: self.reduction_axis = tuple(self.reduction_axis) - values = np.array([atoms.calc.results[self.key] for atoms in atoms_lst]) + values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst]) if self.dim_reduction is not None: reduction_fn = REDUCTIONS[self.dim_reduction] @@ -102,48 +60,60 @@ def select_atoms( check_dimension(values) - if self.threshold is not None: - if self.threshold < 0: - indices = np.where(values < self.threshold)[0] - if self.n_configurations is not None: - indices = np.argsort(values)[indices] - else: - indices = np.where(values > self.threshold)[0] - if self.n_configurations is not None: - indices = np.argsort(values)[::-1][indices] + mean = np.mean(values) + std = np.std(values) + upper_limit = mean + self.threshold * std + lower_limit = mean - self.threshold * std + + if self.direction == "above": + pre_selection = np.array([i for i, x in enumerate(values) if x > upper_limit]) + sorting_idx = np.argsort(values[pre_selection])[::-1] + elif self.direction == "below": + pre_selection = np.array([i for i, x in enumerate(values) if x < lower_limit]) + sorting_idx = np.argsort(values[pre_selection]) else: - if np.mean(values) > 0: - indices = np.argsort(values)[::-1] - else: - indices = np.argsort(values) + pre_selection = np.array([ + i for i, x in enumerate(values) if x < lower_limit or x > upper_limit + ]) + limit_mean = (lower_limit+upper_limit)/2 + dist_to_mean = abs(values[pre_selection]-limit_mean) + sorting_idx = np.argsort(dist_to_mean)[::-1] - selection = self.get_selection(indices) + selection = self.get_selection(pre_selection[sorting_idx]) return selection def get_selection(self, indices): - selected = [] - for val in indices: + selection = [] + for idx in indices: # If the value is close to any of the already selected values, skip it. - if not any(np.abs(val - np.array(selected)) < self.min_distance): - selected.append(val) - if len(selected) == self.n_configurations: + if not any(np.abs(idx - np.array(selection)) < self.min_distance): + selection.append(idx) + if len(selection) == self.n_configurations: break - return selected + return selection def _get_plot(self, atoms_lst: typing.List[ase.Atoms], indices: typing.List[int]): indices = np.array(indices) - values = np.array([atoms.calc.results[self.key] for atoms in atoms_lst]) + values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst]) if self.dim_reduction is not None: reduction_fn = REDUCTIONS[self.dim_reduction] values = reduction_fn(values, self.reduction_axis) fig, ax = plt.subplots() - ax.plot(values, label=self.key) + ax.plot(values, label=self.reference) ax.plot(indices, values[indices], "x", color="red") - ax.set_ylabel(self.key) + ax.fill_between( + np.arange(len(values)), + np.mean(values) + self.threshold * np.std(values), + np.mean(values) - self.threshold * np.std(values), + color="black", + alpha=0.2, + label=f"{self.reference} +- std", + ) + ax.set_ylabel(self.reference) ax.set_xlabel("configuration") - fig.savefig(self.img_selection, bbox_inches="tight") + fig.savefig(self.img_selection, bbox_inches="tight") \ No newline at end of file From ae4800bf90e6106a015e920cdd778b79d703a256 Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Fri, 19 Apr 2024 21:10:09 +0200 Subject: [PATCH 14/19] node fix --- ipsuite/configuration_selection/filter.py | 2 ++ ipsuite/configuration_selection/threshold.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index 26a033a5..520b5bd3 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -81,6 +81,8 @@ def get_selection(self, indices): selection = [] for idx in indices: # If the value is close to any of the already selected values, skip it. + if not selection: + selection.append(idx) if not any(np.abs(idx - np.array(selection)) < self.min_distance): selection.append(idx) if len(selection) == self.n_configurations: diff --git a/ipsuite/configuration_selection/threshold.py b/ipsuite/configuration_selection/threshold.py index 72388298..4c6368e4 100644 --- a/ipsuite/configuration_selection/threshold.py +++ b/ipsuite/configuration_selection/threshold.py @@ -87,6 +87,8 @@ def get_selection(self, indices): selection = [] for idx in indices: # If the value is close to any of the already selected values, skip it. + if not selection: + selection.append(idx) if not any(np.abs(idx - np.array(selection)) < self.min_distance): selection.append(idx) if len(selection) == self.n_configurations: From 36b45e051e5fea4fea9e9bb0b72a7553a0d0809c Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Fri, 19 Apr 2024 21:10:18 +0200 Subject: [PATCH 15/19] test fix --- .../configuration_selection/test_filter.py | 62 ++++++++++++++----- .../configuration_selection/test_threshold.py | 59 +++++++++++++----- 2 files changed, 88 insertions(+), 33 deletions(-) diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py index adb5662c..0f0b2975 100644 --- a/tests/unit_tests/configuration_selection/test_filter.py +++ b/tests/unit_tests/configuration_selection/test_filter.py @@ -2,35 +2,63 @@ import pytest from ipsuite.configuration_selection import PropertyFilter - +from ipsuite.configuration_selection.filter import REDUCTIONS @pytest.mark.parametrize( - "key, cutoff_type, direction, cutoffs", + "reference, dim_reduction, reduction_axis", [ - ("forces", "direct", "both", [7, 13]), - ("forces", "direct", "both", None), - ("forces", "around_mean", "both", None), + ("energy", None, (1, 2)), + ("forces", "max", (1, 2)), + ("forces_uncertainty", "mean", (1, 2)), + ("forces_uncertainty", None, (1, 2)), ], ) -def test_get_selected_atoms(atoms_list, key, cutoff_type, direction, cutoffs): - for idx, atoms in enumerate(atoms_list): - atoms.calc.results[key] = np.array([[idx, 0, 0], [0, 0, 0]]) +@pytest.mark.parametrize( + "direction", + [ + "above", + "below", + "both", + ] +) +def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis, direction): + values = np.array([atoms.calc.results[reference] for atoms in atoms_list]) + if dim_reduction is not None: + reduction_fn = REDUCTIONS[dim_reduction] + values = reduction_fn(values, reduction_axis) + + mean = np.mean(values) + std = np.std(values) + upper_limit = mean + 0.5 * std + lower_limit = mean - 0.5 * std filter = PropertyFilter( - key=key, - cutoff_type=cutoff_type, - direction=direction, + reference=reference, + dim_reduction=dim_reduction, + reduction_axis=reduction_axis, data=None, - cutoffs=cutoffs, - threshold=0.4, + cutoffs=[lower_limit, upper_limit], + n_configurations=4, + min_distance=1, + direction=direction, ) - if "direct" in cutoff_type and cutoffs is None: + if reference in ["forces", "forces_uncertainty"] and dim_reduction is None: with pytest.raises(ValueError): selected_atoms = filter.select_atoms(atoms_list) else: - test_selection = [8, 9, 10, 11, 12] selected_atoms = filter.select_atoms(atoms_list) + print(selected_atoms) + + assert len(set(selected_atoms)) == 4 assert isinstance(selected_atoms, list) - assert len(set(selected_atoms)) == 5 - assert selected_atoms == test_selection + + if direction == "above": + assert np.argmax(values) in selected_atoms + + elif direction == "below": + assert np.argmin(values) in selected_atoms + + else: + assert np.argmin(values) in selected_atoms + assert np.argmax(values) in selected_atoms \ No newline at end of file diff --git a/tests/unit_tests/configuration_selection/test_threshold.py b/tests/unit_tests/configuration_selection/test_threshold.py index 87efe45d..5f545268 100644 --- a/tests/unit_tests/configuration_selection/test_threshold.py +++ b/tests/unit_tests/configuration_selection/test_threshold.py @@ -2,35 +2,62 @@ import pytest from ipsuite.configuration_selection import ThresholdSelection +from ipsuite.configuration_selection.threshold import REDUCTIONS @pytest.mark.parametrize( - "key, reference, dim_reduction, reduction_axis", + "reference, dim_reduction, reduction_axis", [ - ("energy_uncertainty", "energy", None, (1, 2)), - ("forces_uncertainty", "forces", "max", (1, 2)), - ("forces_uncertainty", "forces", "mean", (1, 2)), - ("forces_uncertainty", "forces", None, (1, 2)), + ("energy", None, (1, 2)), + ("forces", "max", (1, 2)), + ("forces_uncertainty", "mean", (1, 2)), + ("forces_uncertainty", None, (1, 2)), ], ) -def test_get_selected_atoms(atoms_list, key, reference, dim_reduction, reduction_axis): +@pytest.mark.parametrize( + "direction", + [ + "above", + "below", + "both", + ] +) +def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis, direction): threshold = ThresholdSelection( - key=key, reference=reference, dim_reduction=dim_reduction, reduction_axis=reduction_axis, data=None, - threshold=1.0, - n_configurations=5, - min_distance=5, + threshold=0.5, + n_configurations=4, + min_distance=1, + direction=direction, ) - if "forces_uncertainty" in key and dim_reduction is None: + if reference in ["forces", "forces_uncertainty"] and dim_reduction is None: with pytest.raises(ValueError): - selected_atoms = threshold.select_atoms(atoms_list, save_fig=False) + selected_atoms = threshold.select_atoms(atoms_list) else: - selected_atoms = threshold.select_atoms(atoms_list, save_fig=False) - test_selection = np.linspace(20, 0, 5, dtype=int).tolist() - assert len(set(selected_atoms)) == 5 + selected_atoms = threshold.select_atoms(atoms_list) + # test_selection = np.linspace(20, 0, 5, dtype=int).tolist() + + assert len(set(selected_atoms)) == 4 assert isinstance(selected_atoms, list) - assert selected_atoms == test_selection + + values = np.array([atoms.calc.results[reference] for atoms in atoms_list]) + if dim_reduction is not None: + reduction_fn = REDUCTIONS[dim_reduction] + values = reduction_fn(values, reduction_axis) + + if direction == "above": + print(np.max(values)) + print(values[selected_atoms]) + + assert np.argmax(values) in selected_atoms + + elif direction == "below": + assert np.argmin(values) in selected_atoms + + else: + assert np.argmin(values) in selected_atoms + assert np.argmax(values) in selected_atoms \ No newline at end of file From 8ccb5b7df2345d54171c26c5021ed21349161189 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 19 Apr 2024 19:10:33 +0000 Subject: [PATCH 16/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ipsuite/configuration_selection/filter.py | 31 +++++++++---------- ipsuite/configuration_selection/threshold.py | 30 +++++++++--------- .../configuration_selection/test_filter.py | 11 ++++--- .../configuration_selection/test_threshold.py | 12 ++++--- 4 files changed, 43 insertions(+), 41 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index 520b5bd3..0222b9e6 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -30,8 +30,8 @@ def check_dimension(values): "max": max_reduction, } -class PropertyFilter(ConfigurationSelection): +class PropertyFilter(ConfigurationSelection): reference = zntrack.params("energy") cutoffs: t.Union[t.List[float]] = zntrack.params() direction: t.Literal["above", "below", "both"] = zntrack.params("both") @@ -45,9 +45,8 @@ def _post_init_(self): raise ValueError("'direction' should be set to 'above', 'below', or 'both'.") return super()._post_init_() - - def select_atoms( - self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: + + def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: self.reduction_axis = tuple(self.reduction_axis) values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst]) @@ -66,11 +65,11 @@ def select_atoms( pre_selection = np.array([i for i, x in enumerate(values) if x < lower_limit]) sorting_idx = np.argsort(values[pre_selection]) else: - pre_selection = np.array([ - i for i, x in enumerate(values) if x < lower_limit or x > upper_limit - ]) - mean = (lower_limit+upper_limit)/2 - dist_to_mean = abs(values[pre_selection]-mean) + pre_selection = np.array( + [i for i, x in enumerate(values) if x < lower_limit or x > upper_limit] + ) + mean = (lower_limit + upper_limit) / 2 + dist_to_mean = abs(values[pre_selection] - mean) sorting_idx = np.argsort(dist_to_mean)[::-1] selection = self.get_selection(pre_selection[sorting_idx]) @@ -102,13 +101,13 @@ def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): ax.plot(values, label=self.reference) ax.plot(indices, values[indices], "x", color="red") ax.fill_between( - np.arange(len(values)), - self.cutoffs[0], - self.cutoffs[1], - color="black", - alpha=0.2, - label=f"{self.reference} +- std", - ) + np.arange(len(values)), + self.cutoffs[0], + self.cutoffs[1], + color="black", + alpha=0.2, + label=f"{self.reference} +- std", + ) ax.set_ylabel(self.reference) ax.set_xlabel("configuration") diff --git a/ipsuite/configuration_selection/threshold.py b/ipsuite/configuration_selection/threshold.py index 4c6368e4..f762e29a 100644 --- a/ipsuite/configuration_selection/threshold.py +++ b/ipsuite/configuration_selection/threshold.py @@ -34,7 +34,6 @@ def check_dimension(values): class ThresholdSelection(ConfigurationSelection): - reference = zntrack.params("energy") threshold = zntrack.params(2) direction: typing.Literal["above", "below", "both"] = zntrack.params("both") @@ -49,8 +48,7 @@ def _post_init_(self): return super()._post_init_() - def select_atoms( - self, atoms_lst: typing.List[ase.Atoms]) -> typing.List[int]: + def select_atoms(self, atoms_lst: typing.List[ase.Atoms]) -> typing.List[int]: self.reduction_axis = tuple(self.reduction_axis) values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst]) @@ -72,11 +70,11 @@ def select_atoms( pre_selection = np.array([i for i, x in enumerate(values) if x < lower_limit]) sorting_idx = np.argsort(values[pre_selection]) else: - pre_selection = np.array([ - i for i, x in enumerate(values) if x < lower_limit or x > upper_limit - ]) - limit_mean = (lower_limit+upper_limit)/2 - dist_to_mean = abs(values[pre_selection]-limit_mean) + pre_selection = np.array( + [i for i, x in enumerate(values) if x < lower_limit or x > upper_limit] + ) + limit_mean = (lower_limit + upper_limit) / 2 + dist_to_mean = abs(values[pre_selection] - limit_mean) sorting_idx = np.argsort(dist_to_mean)[::-1] selection = self.get_selection(pre_selection[sorting_idx]) @@ -108,14 +106,14 @@ def _get_plot(self, atoms_lst: typing.List[ase.Atoms], indices: typing.List[int] ax.plot(values, label=self.reference) ax.plot(indices, values[indices], "x", color="red") ax.fill_between( - np.arange(len(values)), - np.mean(values) + self.threshold * np.std(values), - np.mean(values) - self.threshold * np.std(values), - color="black", - alpha=0.2, - label=f"{self.reference} +- std", - ) + np.arange(len(values)), + np.mean(values) + self.threshold * np.std(values), + np.mean(values) - self.threshold * np.std(values), + color="black", + alpha=0.2, + label=f"{self.reference} +- std", + ) ax.set_ylabel(self.reference) ax.set_xlabel("configuration") - fig.savefig(self.img_selection, bbox_inches="tight") \ No newline at end of file + fig.savefig(self.img_selection, bbox_inches="tight") diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py index 0f0b2975..040cc352 100644 --- a/tests/unit_tests/configuration_selection/test_filter.py +++ b/tests/unit_tests/configuration_selection/test_filter.py @@ -4,6 +4,7 @@ from ipsuite.configuration_selection import PropertyFilter from ipsuite.configuration_selection.filter import REDUCTIONS + @pytest.mark.parametrize( "reference, dim_reduction, reduction_axis", [ @@ -19,9 +20,11 @@ "above", "below", "both", - ] + ], ) -def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis, direction): +def test_get_selected_atoms( + atoms_list, reference, dim_reduction, reduction_axis, direction +): values = np.array([atoms.calc.results[reference] for atoms in atoms_list]) if dim_reduction is not None: reduction_fn = REDUCTIONS[dim_reduction] @@ -58,7 +61,7 @@ def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis elif direction == "below": assert np.argmin(values) in selected_atoms - + else: assert np.argmin(values) in selected_atoms - assert np.argmax(values) in selected_atoms \ No newline at end of file + assert np.argmax(values) in selected_atoms diff --git a/tests/unit_tests/configuration_selection/test_threshold.py b/tests/unit_tests/configuration_selection/test_threshold.py index 5f545268..11147e16 100644 --- a/tests/unit_tests/configuration_selection/test_threshold.py +++ b/tests/unit_tests/configuration_selection/test_threshold.py @@ -20,9 +20,11 @@ "above", "below", "both", - ] + ], ) -def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis, direction): +def test_get_selected_atoms( + atoms_list, reference, dim_reduction, reduction_axis, direction +): threshold = ThresholdSelection( reference=reference, dim_reduction=dim_reduction, @@ -40,7 +42,7 @@ def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis else: selected_atoms = threshold.select_atoms(atoms_list) # test_selection = np.linspace(20, 0, 5, dtype=int).tolist() - + assert len(set(selected_atoms)) == 4 assert isinstance(selected_atoms, list) @@ -57,7 +59,7 @@ def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis elif direction == "below": assert np.argmin(values) in selected_atoms - + else: assert np.argmin(values) in selected_atoms - assert np.argmax(values) in selected_atoms \ No newline at end of file + assert np.argmax(values) in selected_atoms From 51accca4029f29abd1f8b2b0f476607741e67e75 Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Mon, 22 Apr 2024 10:43:45 +0200 Subject: [PATCH 17/19] cleanup --- ipsuite/configuration_selection/filter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py index 520b5bd3..a78a3658 100644 --- a/ipsuite/configuration_selection/filter.py +++ b/ipsuite/configuration_selection/filter.py @@ -1,4 +1,4 @@ -import typing as t +import typing import ase import matplotlib.pyplot as plt @@ -33,8 +33,8 @@ def check_dimension(values): class PropertyFilter(ConfigurationSelection): reference = zntrack.params("energy") - cutoffs: t.Union[t.List[float]] = zntrack.params() - direction: t.Literal["above", "below", "both"] = zntrack.params("both") + cutoffs: typing.Union[typing.List[float]] = zntrack.params() + direction: typing.Literal["above", "below", "both"] = zntrack.params("both") n_configurations = zntrack.params(None) min_distance: int = zntrack.params(1) dim_reduction: str = zntrack.params(None) @@ -47,7 +47,7 @@ def _post_init_(self): return super()._post_init_() def select_atoms( - self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]: + self, atoms_lst: typing.List[ase.Atoms]) -> typing.List[int]: self.reduction_axis = tuple(self.reduction_axis) values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst]) @@ -90,7 +90,7 @@ def get_selection(self, indices): return selection - def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]): + def _get_plot(self, atoms_lst: typing.List[ase.Atoms], indices: typing.List[int]): indices = np.array(indices) values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst]) From 35c35d5377c472df2ba3bf856dec1d7c99b22047 Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Mon, 22 Apr 2024 11:25:37 +0200 Subject: [PATCH 18/19] fix integration test --- tests/integration/configuration_selection/test_index.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/integration/configuration_selection/test_index.py b/tests/integration/configuration_selection/test_index.py index eec12102..c3861846 100644 --- a/tests/integration/configuration_selection/test_index.py +++ b/tests/integration/configuration_selection/test_index.py @@ -142,10 +142,9 @@ def test_exclude_configurations_list(proj_path, traj_file): def test_filter_outlier(proj_path, traj_file): with ips.Project() as project: data = ips.AddData(file=traj_file) - filtered_data = ips.configuration_selection.PropertyFilter( + filtered_data = ips.configuration_selection.ThresholdSelection( data=data.atoms, - key="energy", - cutoff_type="around_mean", + reference="energy", threshold=1, direction="both", ) @@ -153,4 +152,4 @@ def test_filter_outlier(proj_path, traj_file): project.run() filtered_data.load() - assert len(filtered_data.atoms) == 13 + assert len(filtered_data.atoms) == 8 From 75d00b537aa661af06093037282a91e9ea4042e9 Mon Sep 17 00:00:00 2001 From: Tetracarbonylnickel Date: Mon, 1 Jul 2024 18:22:18 +0200 Subject: [PATCH 19/19] fix filter test --- tests/unit_tests/configuration_selection/test_filter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py index 040cc352..d5e0f27a 100644 --- a/tests/unit_tests/configuration_selection/test_filter.py +++ b/tests/unit_tests/configuration_selection/test_filter.py @@ -41,7 +41,7 @@ def test_get_selected_atoms( reduction_axis=reduction_axis, data=None, cutoffs=[lower_limit, upper_limit], - n_configurations=4, + n_configurations=3, min_distance=1, direction=direction, ) @@ -51,9 +51,8 @@ def test_get_selected_atoms( selected_atoms = filter.select_atoms(atoms_list) else: selected_atoms = filter.select_atoms(atoms_list) - print(selected_atoms) - assert len(set(selected_atoms)) == 4 + assert len(set(selected_atoms)) == 3 assert isinstance(selected_atoms, list) if direction == "above":