From e9b768d8d8f9ddf4219a2c06545d29bb667b98dd Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Mon, 11 Dec 2023 11:03:43 +0100
Subject: [PATCH 01/19] FilterOutliers overhaul

---
 ipsuite/configuration_selection/filter.py | 82 +++++++++++++++++------
 1 file changed, 61 insertions(+), 21 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index c6419367..a6f2e29a 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -7,57 +7,83 @@
 from ipsuite import base
 
 
-class FilterOutlier(base.ProcessAtoms):
+def direct_cutoff(values, threshold, cutoffs):
+    # Filtering the direct cutoff values
+    if cutoffs is None:
+        raise ValueError("cutoffs not specified.")
+    return (cutoffs[0], cutoffs[1])
+
+def cutoff_around_mean(values, threshold, cutoffs):
+    # Filtering in multiples of the standard deviation around the mean.
+    mean = np.mean(values)
+    std = np.std(values)
+
+    upper_limit = mean + threshold * std
+    lower_limit = mean - threshold * std
+    return (upper_limit, lower_limit)
+
+CUTOFF = {
+    "direct": direct_cutoff,
+    "around_mean": cutoff_around_mean
+}
+
+
+class FilterOutliers(base.ProcessAtoms):
     """Remove outliers from the data based on a given property.
 
     Attributes
     ----------
     key : str, default="energy"
         The property to filter on.
-    threshold : float, default=3
-        The threshold for filtering in units of standard deviations.
+    cutoff_type : {"direct", "around_mean"}, default="around_mean"
+        Defines the cutoff type.
     direction : {"above", "below", "both"}, default="both"
         The direction to filter in.
+    threshold : float, default=3
+        The threshold for filtering in units of standard deviations.
+    cutoffs : list(float), default=None
+        Upper and lower cutoff.
     """
 
     key: str = zntrack.params("energy")
-    threshold: float = zntrack.params(3)
+    cutoff_type: t.Literal["direct", "around_mean"] = zntrack.params("around_mean")
     direction: t.Literal["above", "below", "both"] = zntrack.params("both")
+    threshold: float = zntrack.params(3)
+    cutoffs: list(float) = zntrack.params(None)
+
 
     filtered_indices: list = zntrack.outs()
     histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png")
 
-    def run(self):
+    def run(self):         
         values = [x.calc.results[self.key] for x in self.data]
-        mean = np.mean(values)
-        std = np.std(values)
+
+        if len(values[0][0]) == 3:
+            # calculates the maximal magnetude of cartesian values
+            values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
+
+        upper_limit, lower_limit = CUTOFF(self.cutoff_type)(
+            values,
+            self.threshold,
+            self.cutoffs,
+        )
 
         if self.direction == "above":
             self.filtered_indices = [
-                i for i, x in enumerate(values) if x > mean + self.threshold * std
+                i for i, x in enumerate(values) if x > upper_limit
             ]
         elif self.direction == "below":
             self.filtered_indices = [
-                i for i, x in enumerate(values) if x < mean - self.threshold * std
+                i for i, x in enumerate(values) if x < lower_limit
             ]
         else:
             self.filtered_indices = [
                 i
                 for i, x in enumerate(values)
-                if x > mean + self.threshold * std or x < mean - self.threshold * std
+                if x > upper_limit or x < lower_limit
             ]
 
-        fig, ax = plt.subplots(3, figsize=(10, 10))
-        ax[0].hist(values, bins=100)
-        ax[0].set_title("All")
-        ax[1].hist(
-            [values[i] for i in range(len(values)) if i not in self.filtered_indices],
-            bins=100,
-        )
-        ax[1].set_title("Filtered")
-        ax[2].hist([values[i] for i in self.filtered_indices], bins=100)
-        ax[2].set_title("Excluded")
-        fig.savefig(self.histogram, bbox_inches="tight")
+        plot_hist(values, self.filtered_indices, self.histogram)
 
     @property
     def atoms(self):
@@ -68,3 +94,17 @@ def atoms(self):
     @property
     def excluded_atoms(self):
         return [self.data[i] for i in self.filtered_indices]
+    
+
+def plot_hist(values, filtered_indices, histogram):
+    fig, ax = plt.subplots(3, figsize=(10, 10))
+    ax[0].hist(values, bins=100)
+    ax[0].set_title("All")
+    ax[1].hist(
+        [values[i] for i in range(len(values)) if i not in filtered_indices],
+        bins=100,
+    )
+    ax[1].set_title("Filtered")
+    ax[2].hist([values[i] for i in filtered_indices], bins=100)
+    ax[2].set_title("Excluded")
+    fig.savefig(histogram, bbox_inches="tight")
\ No newline at end of file

From 3110a6bd428e6da52b6c185cc7d898280f7d871a Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Mon, 11 Dec 2023 14:25:27 +0100
Subject: [PATCH 02/19] Filter Node becomes child class of
 ConfigurationSelection

---
 ipsuite/configuration_selection/filter.py | 88 +++++++++++------------
 1 file changed, 44 insertions(+), 44 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index a6f2e29a..e7b25b89 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -1,16 +1,17 @@
 import typing as t
 
+import ase
 import matplotlib.pyplot as plt
 import numpy as np
 import zntrack
 
-from ipsuite import base
+from ipsuite.configuration_selection import ConfigurationSelection
 
 
 def direct_cutoff(values, threshold, cutoffs):
     # Filtering the direct cutoff values
     if cutoffs is None:
-        raise ValueError("cutoffs not specified.")
+        raise ValueError("cutoffs have to be specified for using the direct cutoff filter.")
     return (cutoffs[0], cutoffs[1])
 
 def cutoff_around_mean(values, threshold, cutoffs):
@@ -18,9 +19,9 @@ def cutoff_around_mean(values, threshold, cutoffs):
     mean = np.mean(values)
     std = np.std(values)
 
-    upper_limit = mean + threshold * std
-    lower_limit = mean - threshold * std
-    return (upper_limit, lower_limit)
+    upper_cutoff = mean + threshold * std
+    lower_cutoff = mean - threshold * std
+    return (lower_cutoff, upper_cutoff)
 
 CUTOFF = {
     "direct": direct_cutoff,
@@ -28,7 +29,7 @@ def cutoff_around_mean(values, threshold, cutoffs):
 }
 
 
-class FilterOutliers(base.ProcessAtoms):
+class FilterOutlier(ConfigurationSelection):
     """Remove outliers from the data based on a given property.
 
     Attributes
@@ -42,69 +43,68 @@ class FilterOutliers(base.ProcessAtoms):
     threshold : float, default=3
         The threshold for filtering in units of standard deviations.
     cutoffs : list(float), default=None
-        Upper and lower cutoff.
+        Lower and upper cutoff.
     """
 
     key: str = zntrack.params("energy")
     cutoff_type: t.Literal["direct", "around_mean"] = zntrack.params("around_mean")
     direction: t.Literal["above", "below", "both"] = zntrack.params("both")
     threshold: float = zntrack.params(3)
-    cutoffs: list(float) = zntrack.params(None)
+    cutoffs: t.Union[t.List[float], None] = zntrack.params(None)
 
-
-    filtered_indices: list = zntrack.outs()
     histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png")
+    
+    def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:         
+        values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 
-    def run(self):         
-        values = [x.calc.results[self.key] for x in self.data]
-
-        if len(values[0][0]) == 3:
+        # get maximal atomic value per struckture
+        if np.array(values).ndim == 3:
             # calculates the maximal magnetude of cartesian values
             values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
-
-        upper_limit, lower_limit = CUTOFF(self.cutoff_type)(
+        elif np.array(values).ndim == 2:
+            # calculates the maximal atomic values
+            values = [np.max(value, axis=0) for value in values]
+            
+        lower_limit, upper_limit = CUTOFF[self.cutoff_type](
             values,
             self.threshold,
             self.cutoffs,
         )
 
         if self.direction == "above":
-            self.filtered_indices = [
-                i for i, x in enumerate(values) if x > upper_limit
+            selection = [
+                i for i, x in enumerate(values) if x < upper_limit
             ]
         elif self.direction == "below":
-            self.filtered_indices = [
-                i for i, x in enumerate(values) if x < lower_limit
+            selection = [
+                i for i, x in enumerate(values) if x > lower_limit
             ]
         else:
-            self.filtered_indices = [
+            selection = [
                 i
                 for i, x in enumerate(values)
-                if x > upper_limit or x < lower_limit
+                if x > lower_limit and x < upper_limit
             ]
 
-        plot_hist(values, self.filtered_indices, self.histogram)
+        return selection
 
-    @property
-    def atoms(self):
-        return [
-            self.data[i] for i in range(len(self.data)) if i not in self.filtered_indices
-        ]
 
-    @property
-    def excluded_atoms(self):
-        return [self.data[i] for i in self.filtered_indices]
-    
+    def get_hist(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
+        values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 
-def plot_hist(values, filtered_indices, histogram):
-    fig, ax = plt.subplots(3, figsize=(10, 10))
-    ax[0].hist(values, bins=100)
-    ax[0].set_title("All")
-    ax[1].hist(
-        [values[i] for i in range(len(values)) if i not in filtered_indices],
-        bins=100,
-    )
-    ax[1].set_title("Filtered")
-    ax[2].hist([values[i] for i in filtered_indices], bins=100)
-    ax[2].set_title("Excluded")
-    fig.savefig(histogram, bbox_inches="tight")
\ No newline at end of file
+        # check if property is in cartesian basis
+        if np.array(values).ndim == 3:
+            # calculates the maximal magnetude of cartesian values
+            values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
+
+        fig, ax = plt.subplots(3, figsize=(10, 10))
+        ax[0].hist(values, bins=100)
+        ax[0].set_title("All")
+        ax[1].hist(
+            [values[i] for i in range(len(values)) if i not in indices],
+            bins=100,
+        )
+        ax[1].set_title("Filtered")
+        ax[2].hist([values[i] for i in indices], bins=100)
+        ax[2].set_title("Excluded")
+        fig.savefig(self.img_selection, bbox_inches="tight")
\ No newline at end of file

From 90fe016a07026118684a0f4cf5a388f694d3725a Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Mon, 11 Dec 2023 14:25:46 +0100
Subject: [PATCH 03/19] introduced test for filter selection

---
 .../configuration_selection/test_filter.py    | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 tests/unit_tests/configuration_selection/test_filter.py

diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py
new file mode 100644
index 00000000..cb8a7ce8
--- /dev/null
+++ b/tests/unit_tests/configuration_selection/test_filter.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pytest
+
+from ipsuite.configuration_selection import FilterOutlier
+
+
+@pytest.mark.parametrize(
+    "key, cutoff_type, direction, cutoffs",
+    [
+        ("forces", "direct", "both", [7, 13]),
+        ("forces", "direct", "both", None),
+        ("forces", "around_mean", "both", None),
+    ],
+)
+def test_get_selected_atoms(atoms_list, key, cutoff_type, direction, cutoffs):
+    for idx, atoms in enumerate(atoms_list):
+        atoms.calc.results[key] = [idx, 0, 0]
+
+    filter = FilterOutlier(
+        key=key,
+        cutoff_type=cutoff_type,
+        direction=direction,
+        data=None,
+        cutoffs=cutoffs,
+        threshold=0.4,
+    )
+
+    if "direct" in cutoff_type and cutoffs is None:
+        with pytest.raises(ValueError):
+            selected_atoms = filter.select_atoms(atoms_list)
+    else:
+        test_selection = [8, 9, 10, 11, 12]
+        selected_atoms = filter.select_atoms(atoms_list)
+        assert isinstance(selected_atoms, list)
+        assert len(set(selected_atoms)) == 5
+        assert selected_atoms == test_selection

From 813a0c7331706855e044f78e8000e1e0881a057f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 11 Dec 2023 13:55:50 +0000
Subject: [PATCH 04/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ipsuite/configuration_selection/filter.py | 32 +++++++++--------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index e7b25b89..16e6202a 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -11,9 +11,12 @@
 def direct_cutoff(values, threshold, cutoffs):
     # Filtering the direct cutoff values
     if cutoffs is None:
-        raise ValueError("cutoffs have to be specified for using the direct cutoff filter.")
+        raise ValueError(
+            "cutoffs have to be specified for using the direct cutoff filter."
+        )
     return (cutoffs[0], cutoffs[1])
 
+
 def cutoff_around_mean(values, threshold, cutoffs):
     # Filtering in multiples of the standard deviation around the mean.
     mean = np.mean(values)
@@ -23,10 +26,8 @@ def cutoff_around_mean(values, threshold, cutoffs):
     lower_cutoff = mean - threshold * std
     return (lower_cutoff, upper_cutoff)
 
-CUTOFF = {
-    "direct": direct_cutoff,
-    "around_mean": cutoff_around_mean
-}
+
+CUTOFF = {"direct": direct_cutoff, "around_mean": cutoff_around_mean}
 
 
 class FilterOutlier(ConfigurationSelection):
@@ -53,8 +54,8 @@ class FilterOutlier(ConfigurationSelection):
     cutoffs: t.Union[t.List[float], None] = zntrack.params(None)
 
     histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png")
-    
-    def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:         
+
+    def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 
         # get maximal atomic value per struckture
@@ -64,7 +65,7 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         elif np.array(values).ndim == 2:
             # calculates the maximal atomic values
             values = [np.max(value, axis=0) for value in values]
-            
+
         lower_limit, upper_limit = CUTOFF[self.cutoff_type](
             values,
             self.threshold,
@@ -72,23 +73,16 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         )
 
         if self.direction == "above":
-            selection = [
-                i for i, x in enumerate(values) if x < upper_limit
-            ]
+            selection = [i for i, x in enumerate(values) if x < upper_limit]
         elif self.direction == "below":
-            selection = [
-                i for i, x in enumerate(values) if x > lower_limit
-            ]
+            selection = [i for i, x in enumerate(values) if x > lower_limit]
         else:
             selection = [
-                i
-                for i, x in enumerate(values)
-                if x > lower_limit and x < upper_limit
+                i for i, x in enumerate(values) if x > lower_limit and x < upper_limit
             ]
 
         return selection
 
-
     def get_hist(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
         values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 
@@ -107,4 +101,4 @@ def get_hist(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
         ax[1].set_title("Filtered")
         ax[2].hist([values[i] for i in indices], bins=100)
         ax[2].set_title("Excluded")
-        fig.savefig(self.img_selection, bbox_inches="tight")
\ No newline at end of file
+        fig.savefig(self.img_selection, bbox_inches="tight")

From 7f278a3cdfadf79e9bc63498af093ffc8e6c8670 Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Tue, 12 Dec 2023 08:33:29 +0100
Subject: [PATCH 05/19] _get_plot() fix

---
 ipsuite/configuration_selection/filter.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index e7b25b89..b0b9fbee 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -51,8 +51,6 @@ class FilterOutlier(ConfigurationSelection):
     direction: t.Literal["above", "below", "both"] = zntrack.params("both")
     threshold: float = zntrack.params(3)
     cutoffs: t.Union[t.List[float], None] = zntrack.params(None)
-
-    histogram: str = zntrack.outs_path(zntrack.nwd / "histogram.png")
     
     def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:         
         values = [atoms.calc.results[self.key] for atoms in atoms_lst]
@@ -89,7 +87,7 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         return selection
 
 
-    def get_hist(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
+    def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
         values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 
         # check if property is in cartesian basis

From 113989ade9793853a588c872a3bd1079ab663991 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 12 Dec 2023 07:37:19 +0000
Subject: [PATCH 06/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ipsuite/configuration_selection/filter.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index be5e440c..4a40e23e 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -52,8 +52,8 @@ class FilterOutlier(ConfigurationSelection):
     direction: t.Literal["above", "below", "both"] = zntrack.params("both")
     threshold: float = zntrack.params(3)
     cutoffs: t.Union[t.List[float], None] = zntrack.params(None)
-    
-    def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:         
+
+    def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 
         # get maximal atomic value per struckture
@@ -81,7 +81,6 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
 
         return selection
 
-
     def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
         values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 

From 21c57b7ce1980a25ea8bad7ccb9097198933f2b3 Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Thu, 14 Dec 2023 12:51:15 +0100
Subject: [PATCH 07/19] Ragged values fix

---
 ipsuite/configuration_selection/filter.py | 27 ++++++++++++++---------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index be5e440c..2e98151e 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -57,12 +57,13 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 
         # get maximal atomic value per struckture
-        if np.array(values).ndim == 3:
-            # calculates the maximal magnetude of cartesian values
-            values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
-        elif np.array(values).ndim == 2:
-            # calculates the maximal atomic values
-            values = [np.max(value, axis=0) for value in values]
+        if isinstance(values[0], np.ndarray):
+            if values[0].ndim == 2:
+                # calculates the maximal magnetude of atomic cartesian property
+                values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
+            elif values[0].ndim == 1:
+                # calculates the maximal atomic property
+                values = [np.max(value, axis=0) for value in values]
 
         lower_limit, upper_limit = CUTOFF[self.cutoff_type](
             values,
@@ -76,7 +77,7 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
             selection = [i for i, x in enumerate(values) if x > lower_limit]
         else:
             selection = [
-                i for i, x in enumerate(values) if x > lower_limit and x < upper_limit
+                i for i, x in enumerate(values) if x < lower_limit or x > upper_limit
             ]
 
         return selection
@@ -85,10 +86,14 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
     def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
         values = [atoms.calc.results[self.key] for atoms in atoms_lst]
 
-        # check if property is in cartesian basis
-        if np.array(values).ndim == 3:
-            # calculates the maximal magnetude of cartesian values
-            values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
+        # get maximal atomic value per struckture
+        if isinstance(values[0], np.ndarray):
+            if values[0].ndim == 2:
+                # calculates the maximal magnetude of atomic cartesian property
+                values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
+            elif values[0].ndim == 1:
+                # calculates the maximal atomic property
+                values = [np.max(value, axis=0) for value in values]
 
         fig, ax = plt.subplots(3, figsize=(10, 10))
         ax[0].hist(values, bins=100)

From 401177a7e2aa0908be2564f9a69ca73de242ad1f Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Thu, 14 Dec 2023 13:20:00 +0100
Subject: [PATCH 08/19] node name change test fix

---
 ipsuite/configuration_selection/__init__.py      |  4 ++--
 ipsuite/configuration_selection/filter.py        | 16 ++++++++--------
 ipsuite/nodes.py                                 |  2 +-
 .../configuration_selection/test_filter.py       |  6 +++---
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ipsuite/configuration_selection/__init__.py b/ipsuite/configuration_selection/__init__.py
index 0c9664e6..a63791d0 100644
--- a/ipsuite/configuration_selection/__init__.py
+++ b/ipsuite/configuration_selection/__init__.py
@@ -1,7 +1,7 @@
 """Configuration Selection Nodes."""
 
 from ipsuite.configuration_selection.base import ConfigurationSelection
-from ipsuite.configuration_selection.filter import FilterOutlier
+from ipsuite.configuration_selection.filter import PropertyFilter
 from ipsuite.configuration_selection.index import IndexSelection
 from ipsuite.configuration_selection.kernel import KernelSelection
 from ipsuite.configuration_selection.random import RandomSelection
@@ -21,5 +21,5 @@
     "IndexSelection",
     "ThresholdSelection",
     "SplitSelection",
-    "FilterOutlier",
+    "PropertyFilter",
 ]
diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index 2e98151e..83969bc1 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -30,8 +30,8 @@ def cutoff_around_mean(values, threshold, cutoffs):
 CUTOFF = {"direct": direct_cutoff, "around_mean": cutoff_around_mean}
 
 
-class FilterOutlier(ConfigurationSelection):
-    """Remove outliers from the data based on a given property.
+class PropertyFilter(ConfigurationSelection):
+    """Filter structures from the dataset based on a given property.
 
     Attributes
     ----------
@@ -72,12 +72,12 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         )
 
         if self.direction == "above":
-            selection = [i for i, x in enumerate(values) if x < upper_limit]
+            selection = [i for i, x in enumerate(values) if x > upper_limit]
         elif self.direction == "below":
-            selection = [i for i, x in enumerate(values) if x > lower_limit]
+            selection = [i for i, x in enumerate(values) if x < lower_limit]
         else:
             selection = [
-                i for i, x in enumerate(values) if x < lower_limit or x > upper_limit
+                i for i, x in enumerate(values) if x > lower_limit and x < upper_limit
             ]
 
         return selection
@@ -98,11 +98,11 @@ def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
         fig, ax = plt.subplots(3, figsize=(10, 10))
         ax[0].hist(values, bins=100)
         ax[0].set_title("All")
-        ax[1].hist(
+        ax[1].hist([values[i] for i in indices], bins=100)
+        ax[1].set_title("Selected")
+        ax[2].hist(
             [values[i] for i in range(len(values)) if i not in indices],
             bins=100,
         )
-        ax[1].set_title("Filtered")
-        ax[2].hist([values[i] for i in indices], bins=100)
         ax[2].set_title("Excluded")
         fig.savefig(self.img_selection, bbox_inches="tight")
diff --git a/ipsuite/nodes.py b/ipsuite/nodes.py
index b24d66e5..ad527f44 100644
--- a/ipsuite/nodes.py
+++ b/ipsuite/nodes.py
@@ -26,7 +26,7 @@ class _Nodes:
     )
     UniformTemporalSelection = "ipsuite.configuration_selection.UniformTemporalSelection"
     ThresholdSelection = "ipsuite.configuration_selection.ThresholdSelection"
-    FilterOutlier = "ipsuite.configuration_selection.FilterOutlier"
+    PropertyFilter = "ipsuite.configuration_selection.PropertyFilter"
     BatchKernelSelection = "ipsuite.models.apax.BatchKernelSelection"
 
     # Configuration Comparison
diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py
index cb8a7ce8..adb5662c 100644
--- a/tests/unit_tests/configuration_selection/test_filter.py
+++ b/tests/unit_tests/configuration_selection/test_filter.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from ipsuite.configuration_selection import FilterOutlier
+from ipsuite.configuration_selection import PropertyFilter
 
 
 @pytest.mark.parametrize(
@@ -14,9 +14,9 @@
 )
 def test_get_selected_atoms(atoms_list, key, cutoff_type, direction, cutoffs):
     for idx, atoms in enumerate(atoms_list):
-        atoms.calc.results[key] = [idx, 0, 0]
+        atoms.calc.results[key] = np.array([[idx, 0, 0], [0, 0, 0]])
 
-    filter = FilterOutlier(
+    filter = PropertyFilter(
         key=key,
         cutoff_type=cutoff_type,
         direction=direction,

From 2f1a994c5da80d308cb28bf46e04ef4035959e59 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 14 Dec 2023 12:20:18 +0000
Subject: [PATCH 09/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ipsuite/configuration_selection/filter.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index 5a41bf00..3aa3ab96 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -60,7 +60,9 @@ def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         if isinstance(values[0], np.ndarray):
             if values[0].ndim == 2:
                 # calculates the maximal magnetude of atomic cartesian property
-                values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
+                values = [
+                    np.max(np.linalg.norm(value, axis=1), axis=0) for value in values
+                ]
             elif values[0].ndim == 1:
                 # calculates the maximal atomic property
                 values = [np.max(value, axis=0) for value in values]
@@ -89,7 +91,9 @@ def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
         if isinstance(values[0], np.ndarray):
             if values[0].ndim == 2:
                 # calculates the maximal magnetude of atomic cartesian property
-                values = [np.max(np.linalg.norm(value, axis=1), axis=0) for value in values]
+                values = [
+                    np.max(np.linalg.norm(value, axis=1), axis=0) for value in values
+                ]
             elif values[0].ndim == 1:
                 # calculates the maximal atomic property
                 values = [np.max(value, axis=0) for value in values]

From 2168bd3797c271673147c6dec2708dedfabac6a4 Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Thu, 14 Dec 2023 13:59:35 +0100
Subject: [PATCH 10/19] fixed integration test

---
 tests/integration/configuration_selection/test_index.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/configuration_selection/test_index.py b/tests/integration/configuration_selection/test_index.py
index e493a79f..7264dbce 100644
--- a/tests/integration/configuration_selection/test_index.py
+++ b/tests/integration/configuration_selection/test_index.py
@@ -142,8 +142,8 @@ def test_exclude_configurations_list(proj_path, traj_file):
 def test_filter_outlier(proj_path, traj_file):
     with ips.Project() as project:
         data = ips.AddData(file=traj_file)
-        filtered_data = ips.configuration_selection.FilterOutlier(
-            data=data.atoms, key="energy", threshold=1, direction="both"
+        filtered_data = ips.configuration_selection.PropertyFilter(
+            data=data.atoms, key="energy", cutoff_type='around_mean', threshold=1, direction="both"
         )
 
     project.run()

From 4404c6ae7c80f6c2905ddf32042609506ef4cce8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 14 Dec 2023 13:00:06 +0000
Subject: [PATCH 11/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/integration/configuration_selection/test_index.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/integration/configuration_selection/test_index.py b/tests/integration/configuration_selection/test_index.py
index 7264dbce..73a13669 100644
--- a/tests/integration/configuration_selection/test_index.py
+++ b/tests/integration/configuration_selection/test_index.py
@@ -143,7 +143,11 @@ def test_filter_outlier(proj_path, traj_file):
     with ips.Project() as project:
         data = ips.AddData(file=traj_file)
         filtered_data = ips.configuration_selection.PropertyFilter(
-            data=data.atoms, key="energy", cutoff_type='around_mean', threshold=1, direction="both"
+            data=data.atoms,
+            key="energy",
+            cutoff_type="around_mean",
+            threshold=1,
+            direction="both",
         )
 
     project.run()

From 8832668f790bf0ca3cfd19719ab5729d64668a8f Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Fri, 19 Apr 2024 12:55:12 +0200
Subject: [PATCH 12/19] PropertyFilter fix

---
 ipsuite/configuration_selection/filter.py | 168 +++++++++++-----------
 1 file changed, 85 insertions(+), 83 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index 3aa3ab96..26a033a5 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -8,104 +8,106 @@
 from ipsuite.configuration_selection import ConfigurationSelection
 
 
-def direct_cutoff(values, threshold, cutoffs):
-    # Filtering the direct cutoff values
-    if cutoffs is None:
-        raise ValueError(
-            "cutoffs have to be specified for using the direct cutoff filter."
-        )
-    return (cutoffs[0], cutoffs[1])
+def mean_reduction(values, axis):
+    return np.mean(values, axis=axis)
 
 
-def cutoff_around_mean(values, threshold, cutoffs):
-    # Filtering in multiples of the standard deviation around the mean.
-    mean = np.mean(values)
-    std = np.std(values)
+def max_reduction(values, axis):
+    return np.max(values, axis=axis)
 
-    upper_cutoff = mean + threshold * std
-    lower_cutoff = mean - threshold * std
-    return (lower_cutoff, upper_cutoff)
 
+def check_dimension(values):
+    if values.ndim > 1:
+        raise ValueError(
+            f"Value dimension is {values.ndim} != 1. "
+            "Reduce the dimension by defining dim_reduction, "
+            "use mean or max to get (n_structures,) shape."
+        )
 
-CUTOFF = {"direct": direct_cutoff, "around_mean": cutoff_around_mean}
 
+REDUCTIONS = {
+    "mean": mean_reduction,
+    "max": max_reduction,
+}
 
 class PropertyFilter(ConfigurationSelection):
-    """Filter structures from the dataset based on a given property.
-
-    Attributes
-    ----------
-    key : str, default="energy"
-        The property to filter on.
-    cutoff_type : {"direct", "around_mean"}, default="around_mean"
-        Defines the cutoff type.
-    direction : {"above", "below", "both"}, default="both"
-        The direction to filter in.
-    threshold : float, default=3
-        The threshold for filtering in units of standard deviations.
-    cutoffs : list(float), default=None
-        Lower and upper cutoff.
-    """
-
-    key: str = zntrack.params("energy")
-    cutoff_type: t.Literal["direct", "around_mean"] = zntrack.params("around_mean")
+
+    reference = zntrack.params("energy")
+    cutoffs: t.Union[t.List[float]] = zntrack.params()
     direction: t.Literal["above", "below", "both"] = zntrack.params("both")
-    threshold: float = zntrack.params(3)
-    cutoffs: t.Union[t.List[float], None] = zntrack.params(None)
-
-    def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
-        values = [atoms.calc.results[self.key] for atoms in atoms_lst]
-
-        # get maximal atomic value per struckture
-        if isinstance(values[0], np.ndarray):
-            if values[0].ndim == 2:
-                # calculates the maximal magnetude of atomic cartesian property
-                values = [
-                    np.max(np.linalg.norm(value, axis=1), axis=0) for value in values
-                ]
-            elif values[0].ndim == 1:
-                # calculates the maximal atomic property
-                values = [np.max(value, axis=0) for value in values]
-
-        lower_limit, upper_limit = CUTOFF[self.cutoff_type](
-            values,
-            self.threshold,
-            self.cutoffs,
-        )
+    n_configurations = zntrack.params(None)
+    min_distance: int = zntrack.params(1)
+    dim_reduction: str = zntrack.params(None)
+    reduction_axis = zntrack.params((1, 2))
+
+    def _post_init_(self):
+        if self.direction not in ["above", "below", "both"]:
+            raise ValueError("'direction' should be set to 'above', 'below', or 'both'.")
+
+        return super()._post_init_()
+    
+    def select_atoms(
+            self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
+        self.reduction_axis = tuple(self.reduction_axis)
+        values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst])
+
+        if self.dim_reduction is not None:
+            reduction_fn = REDUCTIONS[self.dim_reduction]
+            values = reduction_fn(values, self.reduction_axis)
+
+        check_dimension(values)
+
+        lower_limit, upper_limit = self.cutoffs[0], self.cutoffs[1]
 
         if self.direction == "above":
-            selection = [i for i, x in enumerate(values) if x > upper_limit]
+            pre_selection = np.array([i for i, x in enumerate(values) if x > upper_limit])
+            sorting_idx = np.argsort(values[pre_selection])[::-1]
         elif self.direction == "below":
-            selection = [i for i, x in enumerate(values) if x < lower_limit]
+            pre_selection = np.array([i for i, x in enumerate(values) if x < lower_limit])
+            sorting_idx = np.argsort(values[pre_selection])
         else:
-            selection = [
-                i for i, x in enumerate(values) if x > lower_limit and x < upper_limit
-            ]
+            pre_selection = np.array([
+                i for i, x in enumerate(values) if x < lower_limit or x > upper_limit
+            ])
+            mean = (lower_limit+upper_limit)/2
+            dist_to_mean = abs(values[pre_selection]-mean)
+            sorting_idx = np.argsort(dist_to_mean)[::-1]
+
+        selection = self.get_selection(pre_selection[sorting_idx])
+
+        return selection
+
+    def get_selection(self, indices):
+        selection = []
+        for idx in indices:
+            # If the value is close to any of the already selected values, skip it.
+            if not any(np.abs(idx - np.array(selection)) < self.min_distance):
+                selection.append(idx)
+            if len(selection) == self.n_configurations:
+                break
 
         return selection
 
     def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
-        values = [atoms.calc.results[self.key] for atoms in atoms_lst]
-
-        # get maximal atomic value per struckture
-        if isinstance(values[0], np.ndarray):
-            if values[0].ndim == 2:
-                # calculates the maximal magnetude of atomic cartesian property
-                values = [
-                    np.max(np.linalg.norm(value, axis=1), axis=0) for value in values
-                ]
-            elif values[0].ndim == 1:
-                # calculates the maximal atomic property
-                values = [np.max(value, axis=0) for value in values]
-
-        fig, ax = plt.subplots(3, figsize=(10, 10))
-        ax[0].hist(values, bins=100)
-        ax[0].set_title("All")
-        ax[1].hist([values[i] for i in indices], bins=100)
-        ax[1].set_title("Selected")
-        ax[2].hist(
-            [values[i] for i in range(len(values)) if i not in indices],
-            bins=100,
-        )
-        ax[2].set_title("Excluded")
+        indices = np.array(indices)
+        values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst])
+
+        if self.dim_reduction is not None:
+            reduction_fn = REDUCTIONS[self.dim_reduction]
+            values = reduction_fn(values, self.reduction_axis)
+
+        fig, ax = plt.subplots()
+        ax.plot(values, label=self.reference)
+        ax.plot(indices, values[indices], "x", color="red")
+        ax.fill_between(
+                np.arange(len(values)),
+                self.cutoffs[0],
+                self.cutoffs[1],
+                color="black",
+                alpha=0.2,
+                label=f"{self.reference} +- std",
+            )
+        ax.set_ylabel(self.reference)
+        ax.set_xlabel("configuration")
+
         fig.savefig(self.img_selection, bbox_inches="tight")

From 02bdaf7b9198b446957e87fc5dfbee2a4d3a10ba Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Fri, 19 Apr 2024 12:55:42 +0200
Subject: [PATCH 13/19] ThresholdSelection fix: select values beyond mean +- threshold * std

---
 ipsuite/configuration_selection/threshold.py | 116 +++++++------------
 1 file changed, 43 insertions(+), 73 deletions(-)

diff --git a/ipsuite/configuration_selection/threshold.py b/ipsuite/configuration_selection/threshold.py
index 6fceda7a..72388298 100644
--- a/ipsuite/configuration_selection/threshold.py
+++ b/ipsuite/configuration_selection/threshold.py
@@ -34,67 +34,25 @@ def check_dimension(values):
 
 
 class ThresholdSelection(ConfigurationSelection):
-    """Select atoms based on a given threshold.
-
-    Select atoms above a given threshold or the n_configurations with the
-    highest / lowest value. Typically useful for uncertainty based selection.
-
-    Attributes
-    ----------
-    key: str
-        The key in 'calc.results' to select from
-    threshold: float, optional
-        All values above (or below if negative) this threshold will be selected.
-        If n_configurations is given, 'self.threshold' will be prioritized,
-        but a maximum of n_configurations will be selected.
-    reference: str, optional
-        For visualizing the selection a reference value can be given.
-        For 'energy_uncertainty' this would typically be 'energy'.
-    n_configurations: int, optional
-        Number of configurations to select.
-    min_distance: int, optional
-        Minimum distance between selected configurations.
-    dim_reduction: str, optional
-        Reduces the dimensionality of the chosen uncertainty along the specified axis
-        by calculating either the maximum or mean value.
-
-        Choose from ["max", "mean"]
-    reduction_axis: tuple(int), optional
-        Specifies the axis along which the reduction occurs.
-    """
-
-    key = zntrack.params("energy_uncertainty")
+
     reference = zntrack.params("energy")
-    threshold = zntrack.params(None)
+    threshold = zntrack.params(2)
+    direction: typing.Literal["above", "below", "both"] = zntrack.params("both")
     n_configurations = zntrack.params(None)
     min_distance: int = zntrack.params(1)
     dim_reduction: str = zntrack.params(None)
     reduction_axis = zntrack.params((1, 2))
 
     def _post_init_(self):
-        if self.threshold is None and self.n_configurations is None:
-            raise ValueError("Either 'threshold' or 'n_configurations' must not be None.")
+        if self.direction not in ["above", "below", "both"]:
+            raise ValueError("'direction' should be set to 'above', 'below', or 'both'.")
 
         return super()._post_init_()
 
     def select_atoms(
-        self, atoms_lst: typing.List[ase.Atoms], save_fig: bool = True
-    ) -> typing.List[int]:
-        """Take every nth (step) object of a given atoms list.
-
-        Parameters
-        ----------
-        atoms_lst: typing.List[ase.Atoms]
-            list of atoms objects to arange
-
-        Returns
-        -------
-        typing.List[int]:
-            list containing the taken indices
-        """
-
+            self, atoms_lst: typing.List[ase.Atoms]) -> typing.List[int]:
         self.reduction_axis = tuple(self.reduction_axis)
-        values = np.array([atoms.calc.results[self.key] for atoms in atoms_lst])
+        values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst])
 
         if self.dim_reduction is not None:
             reduction_fn = REDUCTIONS[self.dim_reduction]
@@ -102,48 +60,60 @@ def select_atoms(
 
         check_dimension(values)
 
-        if self.threshold is not None:
-            if self.threshold < 0:
-                indices = np.where(values < self.threshold)[0]
-                if self.n_configurations is not None:
-                    indices = np.argsort(values)[indices]
-            else:
-                indices = np.where(values > self.threshold)[0]
-                if self.n_configurations is not None:
-                    indices = np.argsort(values)[::-1][indices]
+        mean = np.mean(values)
+        std = np.std(values)
+        upper_limit = mean + self.threshold * std
+        lower_limit = mean - self.threshold * std
+
+        if self.direction == "above":
+            pre_selection = np.array([i for i, x in enumerate(values) if x > upper_limit])
+            sorting_idx = np.argsort(values[pre_selection])[::-1]
+        elif self.direction == "below":
+            pre_selection = np.array([i for i, x in enumerate(values) if x < lower_limit])
+            sorting_idx = np.argsort(values[pre_selection])
         else:
-            if np.mean(values) > 0:
-                indices = np.argsort(values)[::-1]
-            else:
-                indices = np.argsort(values)
+            pre_selection = np.array([
+                i for i, x in enumerate(values) if x < lower_limit or x > upper_limit
+            ])
+            limit_mean = (lower_limit+upper_limit)/2
+            dist_to_mean = abs(values[pre_selection]-limit_mean)
+            sorting_idx = np.argsort(dist_to_mean)[::-1]
 
-        selection = self.get_selection(indices)
+        selection = self.get_selection(pre_selection[sorting_idx])
 
         return selection
 
     def get_selection(self, indices):
-        selected = []
-        for val in indices:
+        selection = []
+        for idx in indices:
             # If the value is close to any of the already selected values, skip it.
-            if not any(np.abs(val - np.array(selected)) < self.min_distance):
-                selected.append(val)
-            if len(selected) == self.n_configurations:
+            if not any(np.abs(idx - np.array(selection)) < self.min_distance):
+                selection.append(idx)
+            if len(selection) == self.n_configurations:
                 break
 
-        return selected
+        return selection
 
     def _get_plot(self, atoms_lst: typing.List[ase.Atoms], indices: typing.List[int]):
         indices = np.array(indices)
-        values = np.array([atoms.calc.results[self.key] for atoms in atoms_lst])
+        values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst])
 
         if self.dim_reduction is not None:
             reduction_fn = REDUCTIONS[self.dim_reduction]
             values = reduction_fn(values, self.reduction_axis)
 
         fig, ax = plt.subplots()
-        ax.plot(values, label=self.key)
+        ax.plot(values, label=self.reference)
         ax.plot(indices, values[indices], "x", color="red")
-        ax.set_ylabel(self.key)
+        ax.fill_between(
+                np.arange(len(values)),
+                np.mean(values) + self.threshold * np.std(values),
+                np.mean(values) - self.threshold * np.std(values),
+                color="black",
+                alpha=0.2,
+                label=f"{self.reference} +- std",
+            )
+        ax.set_ylabel(self.reference)
         ax.set_xlabel("configuration")
 
-        fig.savefig(self.img_selection, bbox_inches="tight")
+        fig.savefig(self.img_selection, bbox_inches="tight")
\ No newline at end of file

From ae4800bf90e6106a015e920cdd778b79d703a256 Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Fri, 19 Apr 2024 21:10:09 +0200
Subject: [PATCH 14/19] node fix: seed get_selection with the first index

---
 ipsuite/configuration_selection/filter.py    | 2 ++
 ipsuite/configuration_selection/threshold.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index 26a033a5..520b5bd3 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -81,6 +81,8 @@ def get_selection(self, indices):
         selection = []
         for idx in indices:
             # If the value is close to any of the already selected values, skip it.
+            if not selection:
+                selection.append(idx)
             if not any(np.abs(idx - np.array(selection)) < self.min_distance):
                 selection.append(idx)
             if len(selection) == self.n_configurations:
diff --git a/ipsuite/configuration_selection/threshold.py b/ipsuite/configuration_selection/threshold.py
index 72388298..4c6368e4 100644
--- a/ipsuite/configuration_selection/threshold.py
+++ b/ipsuite/configuration_selection/threshold.py
@@ -87,6 +87,8 @@ def get_selection(self, indices):
         selection = []
         for idx in indices:
             # If the value is close to any of the already selected values, skip it.
+            if not selection:
+                selection.append(idx)
             if not any(np.abs(idx - np.array(selection)) < self.min_distance):
                 selection.append(idx)
             if len(selection) == self.n_configurations:

From 36b45e051e5fea4fea9e9bb0b72a7553a0d0809c Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Fri, 19 Apr 2024 21:10:18 +0200
Subject: [PATCH 15/19] test fix: update filter and threshold unit tests for the new parameters

---
 .../configuration_selection/test_filter.py    | 62 ++++++++++++++-----
 .../configuration_selection/test_threshold.py | 59 +++++++++++++-----
 2 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py
index adb5662c..0f0b2975 100644
--- a/tests/unit_tests/configuration_selection/test_filter.py
+++ b/tests/unit_tests/configuration_selection/test_filter.py
@@ -2,35 +2,63 @@
 import pytest
 
 from ipsuite.configuration_selection import PropertyFilter
-
+from ipsuite.configuration_selection.filter import REDUCTIONS
 
 @pytest.mark.parametrize(
-    "key, cutoff_type, direction, cutoffs",
+    "reference, dim_reduction, reduction_axis",
     [
-        ("forces", "direct", "both", [7, 13]),
-        ("forces", "direct", "both", None),
-        ("forces", "around_mean", "both", None),
+        ("energy", None, (1, 2)),
+        ("forces", "max", (1, 2)),
+        ("forces_uncertainty", "mean", (1, 2)),
+        ("forces_uncertainty", None, (1, 2)),
     ],
 )
-def test_get_selected_atoms(atoms_list, key, cutoff_type, direction, cutoffs):
-    for idx, atoms in enumerate(atoms_list):
-        atoms.calc.results[key] = np.array([[idx, 0, 0], [0, 0, 0]])
+@pytest.mark.parametrize(
+    "direction",
+    [
+        "above",
+        "below",
+        "both",
+     ]
+)
+def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis, direction):
+    values = np.array([atoms.calc.results[reference] for atoms in atoms_list])
+    if dim_reduction is not None:
+        reduction_fn = REDUCTIONS[dim_reduction]
+        values = reduction_fn(values, reduction_axis)
+
+    mean = np.mean(values)
+    std = np.std(values)
+    upper_limit = mean + 0.5 * std
+    lower_limit = mean - 0.5 * std
 
     filter = PropertyFilter(
-        key=key,
-        cutoff_type=cutoff_type,
-        direction=direction,
+        reference=reference,
+        dim_reduction=dim_reduction,
+        reduction_axis=reduction_axis,
         data=None,
-        cutoffs=cutoffs,
-        threshold=0.4,
+        cutoffs=[lower_limit, upper_limit],
+        n_configurations=4,
+        min_distance=1,
+        direction=direction,
     )
 
-    if "direct" in cutoff_type and cutoffs is None:
+    if reference in ["forces", "forces_uncertainty"] and dim_reduction is None:
         with pytest.raises(ValueError):
             selected_atoms = filter.select_atoms(atoms_list)
     else:
-        test_selection = [8, 9, 10, 11, 12]
         selected_atoms = filter.select_atoms(atoms_list)
+        print(selected_atoms)
+
+        assert len(set(selected_atoms)) == 4
         assert isinstance(selected_atoms, list)
-        assert len(set(selected_atoms)) == 5
-        assert selected_atoms == test_selection
+
+        if direction == "above":
+            assert np.argmax(values) in selected_atoms
+
+        elif direction == "below":
+            assert np.argmin(values) in selected_atoms
+            
+        else:
+            assert np.argmin(values) in selected_atoms
+            assert np.argmax(values) in selected_atoms
\ No newline at end of file
diff --git a/tests/unit_tests/configuration_selection/test_threshold.py b/tests/unit_tests/configuration_selection/test_threshold.py
index 87efe45d..5f545268 100644
--- a/tests/unit_tests/configuration_selection/test_threshold.py
+++ b/tests/unit_tests/configuration_selection/test_threshold.py
@@ -2,35 +2,62 @@
 import pytest
 
 from ipsuite.configuration_selection import ThresholdSelection
+from ipsuite.configuration_selection.threshold import REDUCTIONS
 
 
 @pytest.mark.parametrize(
-    "key, reference, dim_reduction, reduction_axis",
+    "reference, dim_reduction, reduction_axis",
     [
-        ("energy_uncertainty", "energy", None, (1, 2)),
-        ("forces_uncertainty", "forces", "max", (1, 2)),
-        ("forces_uncertainty", "forces", "mean", (1, 2)),
-        ("forces_uncertainty", "forces", None, (1, 2)),
+        ("energy", None, (1, 2)),
+        ("forces", "max", (1, 2)),
+        ("forces_uncertainty", "mean", (1, 2)),
+        ("forces_uncertainty", None, (1, 2)),
     ],
 )
-def test_get_selected_atoms(atoms_list, key, reference, dim_reduction, reduction_axis):
+@pytest.mark.parametrize(
+    "direction",
+    [
+        "above",
+        "below",
+        "both",
+     ]
+)
+def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis, direction):
     threshold = ThresholdSelection(
-        key=key,
         reference=reference,
         dim_reduction=dim_reduction,
         reduction_axis=reduction_axis,
         data=None,
-        threshold=1.0,
-        n_configurations=5,
-        min_distance=5,
+        threshold=0.5,
+        n_configurations=4,
+        min_distance=1,
+        direction=direction,
     )
 
-    if "forces_uncertainty" in key and dim_reduction is None:
+    if reference in ["forces", "forces_uncertainty"] and dim_reduction is None:
         with pytest.raises(ValueError):
-            selected_atoms = threshold.select_atoms(atoms_list, save_fig=False)
+            selected_atoms = threshold.select_atoms(atoms_list)
     else:
-        selected_atoms = threshold.select_atoms(atoms_list, save_fig=False)
-        test_selection = np.linspace(20, 0, 5, dtype=int).tolist()
-        assert len(set(selected_atoms)) == 5
+        selected_atoms = threshold.select_atoms(atoms_list)
+        # test_selection = np.linspace(20, 0, 5, dtype=int).tolist()
+        
+        assert len(set(selected_atoms)) == 4
         assert isinstance(selected_atoms, list)
-        assert selected_atoms == test_selection
+
+        values = np.array([atoms.calc.results[reference] for atoms in atoms_list])
+        if dim_reduction is not None:
+            reduction_fn = REDUCTIONS[dim_reduction]
+            values = reduction_fn(values, reduction_axis)
+
+        if direction == "above":
+            print(np.max(values))
+            print(values[selected_atoms])
+
+            assert np.argmax(values) in selected_atoms
+
+        elif direction == "below":
+            assert np.argmin(values) in selected_atoms
+            
+        else:
+            assert np.argmin(values) in selected_atoms
+            assert np.argmax(values) in selected_atoms
\ No newline at end of file

From 8ccb5b7df2345d54171c26c5021ed21349161189 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 19 Apr 2024 19:10:33 +0000
Subject: [PATCH 16/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ipsuite/configuration_selection/filter.py     | 31 +++++++++----------
 ipsuite/configuration_selection/threshold.py  | 30 +++++++++---------
 .../configuration_selection/test_filter.py    | 11 ++++---
 .../configuration_selection/test_threshold.py | 12 ++++---
 4 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index 520b5bd3..0222b9e6 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -30,8 +30,8 @@ def check_dimension(values):
     "max": max_reduction,
 }
 
-class PropertyFilter(ConfigurationSelection):
 
+class PropertyFilter(ConfigurationSelection):
     reference = zntrack.params("energy")
     cutoffs: t.Union[t.List[float]] = zntrack.params()
     direction: t.Literal["above", "below", "both"] = zntrack.params("both")
@@ -45,9 +45,8 @@ def _post_init_(self):
             raise ValueError("'direction' should be set to 'above', 'below', or 'both'.")
 
         return super()._post_init_()
-    
-    def select_atoms(
-            self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
+
+    def select_atoms(self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
         self.reduction_axis = tuple(self.reduction_axis)
         values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst])
 
@@ -66,11 +65,11 @@ def select_atoms(
             pre_selection = np.array([i for i, x in enumerate(values) if x < lower_limit])
             sorting_idx = np.argsort(values[pre_selection])
         else:
-            pre_selection = np.array([
-                i for i, x in enumerate(values) if x < lower_limit or x > upper_limit
-            ])
-            mean = (lower_limit+upper_limit)/2
-            dist_to_mean = abs(values[pre_selection]-mean)
+            pre_selection = np.array(
+                [i for i, x in enumerate(values) if x < lower_limit or x > upper_limit]
+            )
+            mean = (lower_limit + upper_limit) / 2
+            dist_to_mean = abs(values[pre_selection] - mean)
             sorting_idx = np.argsort(dist_to_mean)[::-1]
 
         selection = self.get_selection(pre_selection[sorting_idx])
@@ -102,13 +101,13 @@ def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
         ax.plot(values, label=self.reference)
         ax.plot(indices, values[indices], "x", color="red")
         ax.fill_between(
-                np.arange(len(values)),
-                self.cutoffs[0],
-                self.cutoffs[1],
-                color="black",
-                alpha=0.2,
-                label=f"{self.reference} +- std",
-            )
+            np.arange(len(values)),
+            self.cutoffs[0],
+            self.cutoffs[1],
+            color="black",
+            alpha=0.2,
+            label=f"{self.reference} +- std",
+        )
         ax.set_ylabel(self.reference)
         ax.set_xlabel("configuration")
 
diff --git a/ipsuite/configuration_selection/threshold.py b/ipsuite/configuration_selection/threshold.py
index 4c6368e4..f762e29a 100644
--- a/ipsuite/configuration_selection/threshold.py
+++ b/ipsuite/configuration_selection/threshold.py
@@ -34,7 +34,6 @@ def check_dimension(values):
 
 
 class ThresholdSelection(ConfigurationSelection):
-
     reference = zntrack.params("energy")
     threshold = zntrack.params(2)
     direction: typing.Literal["above", "below", "both"] = zntrack.params("both")
@@ -49,8 +48,7 @@ def _post_init_(self):
 
         return super()._post_init_()
 
-    def select_atoms(
-            self, atoms_lst: typing.List[ase.Atoms]) -> typing.List[int]:
+    def select_atoms(self, atoms_lst: typing.List[ase.Atoms]) -> typing.List[int]:
         self.reduction_axis = tuple(self.reduction_axis)
         values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst])
 
@@ -72,11 +70,11 @@ def select_atoms(
             pre_selection = np.array([i for i, x in enumerate(values) if x < lower_limit])
             sorting_idx = np.argsort(values[pre_selection])
         else:
-            pre_selection = np.array([
-                i for i, x in enumerate(values) if x < lower_limit or x > upper_limit
-            ])
-            limit_mean = (lower_limit+upper_limit)/2
-            dist_to_mean = abs(values[pre_selection]-limit_mean)
+            pre_selection = np.array(
+                [i for i, x in enumerate(values) if x < lower_limit or x > upper_limit]
+            )
+            limit_mean = (lower_limit + upper_limit) / 2
+            dist_to_mean = abs(values[pre_selection] - limit_mean)
             sorting_idx = np.argsort(dist_to_mean)[::-1]
 
         selection = self.get_selection(pre_selection[sorting_idx])
@@ -108,14 +106,14 @@ def _get_plot(self, atoms_lst: typing.List[ase.Atoms], indices: typing.List[int]
         ax.plot(values, label=self.reference)
         ax.plot(indices, values[indices], "x", color="red")
         ax.fill_between(
-                np.arange(len(values)),
-                np.mean(values) + self.threshold * np.std(values),
-                np.mean(values) - self.threshold * np.std(values),
-                color="black",
-                alpha=0.2,
-                label=f"{self.reference} +- std",
-            )
+            np.arange(len(values)),
+            np.mean(values) + self.threshold * np.std(values),
+            np.mean(values) - self.threshold * np.std(values),
+            color="black",
+            alpha=0.2,
+            label=f"{self.reference} +- std",
+        )
         ax.set_ylabel(self.reference)
         ax.set_xlabel("configuration")
 
-        fig.savefig(self.img_selection, bbox_inches="tight")
\ No newline at end of file
+        fig.savefig(self.img_selection, bbox_inches="tight")
diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py
index 0f0b2975..040cc352 100644
--- a/tests/unit_tests/configuration_selection/test_filter.py
+++ b/tests/unit_tests/configuration_selection/test_filter.py
@@ -4,6 +4,7 @@
 from ipsuite.configuration_selection import PropertyFilter
 from ipsuite.configuration_selection.filter import REDUCTIONS
 
+
 @pytest.mark.parametrize(
     "reference, dim_reduction, reduction_axis",
     [
@@ -19,9 +20,11 @@
         "above",
         "below",
         "both",
-     ]
+    ],
 )
-def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis, direction):
+def test_get_selected_atoms(
+    atoms_list, reference, dim_reduction, reduction_axis, direction
+):
     values = np.array([atoms.calc.results[reference] for atoms in atoms_list])
     if dim_reduction is not None:
         reduction_fn = REDUCTIONS[dim_reduction]
@@ -58,7 +61,7 @@ def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis
 
         elif direction == "below":
             assert np.argmin(values) in selected_atoms
-            
+
         else:
             assert np.argmin(values) in selected_atoms
-            assert np.argmax(values) in selected_atoms
\ No newline at end of file
+            assert np.argmax(values) in selected_atoms
diff --git a/tests/unit_tests/configuration_selection/test_threshold.py b/tests/unit_tests/configuration_selection/test_threshold.py
index 5f545268..11147e16 100644
--- a/tests/unit_tests/configuration_selection/test_threshold.py
+++ b/tests/unit_tests/configuration_selection/test_threshold.py
@@ -20,9 +20,11 @@
         "above",
         "below",
         "both",
-     ]
+    ],
 )
-def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis, direction):
+def test_get_selected_atoms(
+    atoms_list, reference, dim_reduction, reduction_axis, direction
+):
     threshold = ThresholdSelection(
         reference=reference,
         dim_reduction=dim_reduction,
@@ -40,7 +42,7 @@ def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis
     else:
         selected_atoms = threshold.select_atoms(atoms_list)
         # test_selection = np.linspace(20, 0, 5, dtype=int).tolist()
-        
+
         assert len(set(selected_atoms)) == 4
         assert isinstance(selected_atoms, list)
 
@@ -57,7 +59,7 @@ def test_get_selected_atoms(atoms_list, reference, dim_reduction, reduction_axis
 
         elif direction == "below":
             assert np.argmin(values) in selected_atoms
-            
+
         else:
             assert np.argmin(values) in selected_atoms
-            assert np.argmax(values) in selected_atoms
\ No newline at end of file
+            assert np.argmax(values) in selected_atoms

From 51accca4029f29abd1f8b2b0f476607741e67e75 Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Mon, 22 Apr 2024 10:43:45 +0200
Subject: [PATCH 17/19] cleanup: replace the "typing as t" alias with plain typing in filter.py

---
 ipsuite/configuration_selection/filter.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ipsuite/configuration_selection/filter.py b/ipsuite/configuration_selection/filter.py
index 520b5bd3..a78a3658 100644
--- a/ipsuite/configuration_selection/filter.py
+++ b/ipsuite/configuration_selection/filter.py
@@ -1,4 +1,4 @@
-import typing as t
+import typing
 
 import ase
 import matplotlib.pyplot as plt
@@ -33,8 +33,8 @@ def check_dimension(values):
 class PropertyFilter(ConfigurationSelection):
 
     reference = zntrack.params("energy")
-    cutoffs: t.Union[t.List[float]] = zntrack.params()
-    direction: t.Literal["above", "below", "both"] = zntrack.params("both")
+    cutoffs: typing.Union[typing.List[float]] = zntrack.params()
+    direction: typing.Literal["above", "below", "both"] = zntrack.params("both")
     n_configurations = zntrack.params(None)
     min_distance: int = zntrack.params(1)
     dim_reduction: str = zntrack.params(None)
@@ -47,7 +47,7 @@ def _post_init_(self):
         return super()._post_init_()
     
     def select_atoms(
-            self, atoms_lst: t.List[ase.Atoms]) -> t.List[int]:
+            self, atoms_lst: typing.List[ase.Atoms]) -> typing.List[int]:
         self.reduction_axis = tuple(self.reduction_axis)
         values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst])
 
@@ -90,7 +90,7 @@ def get_selection(self, indices):
 
         return selection
 
-    def _get_plot(self, atoms_lst: t.List[ase.Atoms], indices: t.List[int]):
+    def _get_plot(self, atoms_lst: typing.List[ase.Atoms], indices: typing.List[int]):
         indices = np.array(indices)
         values = np.array([atoms.calc.results[self.reference] for atoms in atoms_lst])
 

From 35c35d5377c472df2ba3bf856dec1d7c99b22047 Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Mon, 22 Apr 2024 11:25:37 +0200
Subject: [PATCH 18/19] fix integration test

---
 tests/integration/configuration_selection/test_index.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/integration/configuration_selection/test_index.py b/tests/integration/configuration_selection/test_index.py
index eec12102..c3861846 100644
--- a/tests/integration/configuration_selection/test_index.py
+++ b/tests/integration/configuration_selection/test_index.py
@@ -142,10 +142,9 @@ def test_exclude_configurations_list(proj_path, traj_file):
 def test_filter_outlier(proj_path, traj_file):
     with ips.Project() as project:
         data = ips.AddData(file=traj_file)
-        filtered_data = ips.configuration_selection.PropertyFilter(
+        filtered_data = ips.configuration_selection.ThresholdSelection(
             data=data.atoms,
-            key="energy",
-            cutoff_type="around_mean",
+            reference="energy",
             threshold=1,
             direction="both",
         )
@@ -153,4 +152,4 @@ def test_filter_outlier(proj_path, traj_file):
     project.run()
 
     filtered_data.load()
-    assert len(filtered_data.atoms) == 13
+    assert len(filtered_data.atoms) == 8

From 75d00b537aa661af06093037282a91e9ea4042e9 Mon Sep 17 00:00:00 2001
From: Tetracarbonylnickel <segreto@theochem.uni-stuttgart.de>
Date: Mon, 1 Jul 2024 18:22:18 +0200
Subject: [PATCH 19/19] fix filter test

---
 tests/unit_tests/configuration_selection/test_filter.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/unit_tests/configuration_selection/test_filter.py b/tests/unit_tests/configuration_selection/test_filter.py
index 040cc352..d5e0f27a 100644
--- a/tests/unit_tests/configuration_selection/test_filter.py
+++ b/tests/unit_tests/configuration_selection/test_filter.py
@@ -41,7 +41,7 @@ def test_get_selected_atoms(
         reduction_axis=reduction_axis,
         data=None,
         cutoffs=[lower_limit, upper_limit],
-        n_configurations=4,
+        n_configurations=3,
         min_distance=1,
         direction=direction,
     )
@@ -51,9 +51,8 @@ def test_get_selected_atoms(
             selected_atoms = filter.select_atoms(atoms_list)
     else:
         selected_atoms = filter.select_atoms(atoms_list)
-        print(selected_atoms)
 
-        assert len(set(selected_atoms)) == 4
+        assert len(set(selected_atoms)) == 3
         assert isinstance(selected_atoms, list)
 
         if direction == "above":