feat(JAQPOT-135): Update MeanVar (#29)
* feat: Update Leverage DOA Tests

* fix: Disabled numeric test

* Revert "fix: Disabled numeric test"

This reverts commit d2abf6f.

* fix: Leverage_DOA removed numerical test

* fix: Delete calculate() method in MeanVar class

Deleted calculate() method because it is already replaced by predict()

* fix: Delete commented-out code

* feat: Update `test_mean_var` test

* feat: Refactor `DOA` classes

This commit refactors `DOA` into an abstract base class. Additionally, `Leverage` and `MeanVar` now inherit from `DOA`.

* fix: MeanVar test

* Update doa.py

Deleted some comments
vassilismin authored Jun 13, 2024
1 parent 25f2328 commit d015e7f
Showing 2 changed files with 12 additions and 148 deletions.
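Before the diffs, a minimal sketch of the class hierarchy the commit message describes: `DOA` becomes an abstract base class exposing abstract `fit` and `predict` methods, with `Leverage` and `MeanVar` inheriting from it. The subclass bodies below are placeholders for illustration only, not the committed implementations.

from abc import ABC, abstractmethod
from typing import Any, Iterable

import numpy as np


class DOA(ABC):
    """Abstract base class for domain-of-applicability (DOA) methods."""

    @abstractmethod
    def fit(self, X: np.ndarray):
        raise NotImplementedError

    @abstractmethod
    def predict(self, data: Iterable[Any]) -> Iterable[Any]:
        raise NotImplementedError


class Leverage(DOA):
    # Placeholder body; the committed implementation is in doa.py below.
    def fit(self, X: np.ndarray): ...
    def predict(self, data: Iterable[Any]) -> Iterable[Any]: ...


class MeanVar(DOA):
    # Placeholder body; the committed implementation is in doa.py below.
    def fit(self, X: np.ndarray): ...
    def predict(self, data: Iterable[Any]) -> Iterable[Any]: ...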
87 changes: 6 additions & 81 deletions jaqpotpy/doa/doa.py
@@ -1,57 +1,27 @@
from abc import ABC
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from typing import Iterable, Any
import math
from jaqpotpy.descriptors.molecular import RDKitDescriptors, MordredDescriptors
import pickle
# import dill

# def calculate_a(X):
# shape = X.shape
# a = (3 * (shape[1] + 1)) / shape[0]
# return a


# def calculate_doa_matrix(X):
# x_T = X.transpose()
# x_out = x_T.dot(X)
# x_out_inv = pd.DataFrame(np.linalg.pinv(x_out.values), x_out.columns, x_out.index)
# return x_out_inv


# def calc_doa(doa_matrix, new_data):
# doaAll = []
# for nd in new_data:
# d1 = np.dot(nd, doa_matrix)
# ndt = np.transpose(nd)
# d2 = np.dot(d1, ndt)
# doa = {'DOA': d2}
# doaAll.append(doa)
# return doaAll


class DOA(object):
class DOA(ABC):
"""
Abstract class for DOA methods
"""
def calculate_threshold(self):
raise NotImplementedError

def calculate_matrix(self):
raise NotImplementedError

def calculate(self, data: Iterable[Any]) -> Iterable[Any]:
raise NotImplementedError

@abstractmethod
def fit(self, X: np.array):
raise NotImplementedError

@abstractmethod
def predict(self, data: Iterable[Any]) -> Iterable[Any]:
raise NotImplementedError


class Leverage(DOA, ABC):
class Leverage(DOA):
"""
Implements DOA method leverage.
Initialized upon training data and holds the doa matrix and the threshold 'A' value.
@@ -147,7 +117,7 @@ def predict(self, new_data: np.array) -> Iterable[Any]:
return doaAll


class MeanVar(DOA, ABC):
class MeanVar(DOA):
"""
Implements Mean and Variance domain of applicability.
Initialized upon training data and holds the doa mean and the variance of the data.
@@ -161,10 +131,7 @@ def __name__(self):
return 'MeanVar'

def __init__(self) -> None:
# self._scaler: BaseEstimator = scaler
self._data: np.array = None
self._doa_matrix = None
self._a = None

@property
def doa_new(self):
@@ -182,21 +149,6 @@ def IN(self):
def IN(self, value):
self._in = value

@property
def doa_matrix(self):
return self._doa_matrix

@doa_matrix.setter
def doa_matrix(self, value):
self._doa_matrix = value

@property
def a(self):
return self._a

@a.setter
def a(self, value):
self._a = value

@property
def data(self):
@@ -208,42 +160,17 @@ def data(self, value):

def fit(self, X: np.array):
self._data = X
# self._scaler.fit(X)
# self._data = self._scaler.transform(X)
columns = list(zip(*self._data))
shape = X.shape
list_m_var = []
for i in range(shape[1]):
list_m_var.append([np.mean(columns[i]), np.std(columns[i]), np.var(columns[i])])
self._data = np.array(list_m_var)
self._doa_matrix = np.array(list_m_var)
self._a = np.array(list_m_var)

def calculate(self, new_data: np.array) -> Iterable[Any]:
doaAll = []
self._doa = []
self._in = []
# new_data = self._scaler.transform(new_data)
in_doa = True
for nd in new_data:
for index, row in enumerate(nd):
bounds = self._data[index]
bounds_data = [bounds[0]-4*bounds[1], bounds[0]+4*bounds[1]]
if row >= bounds_data[0] and row <= bounds_data[1]:
continue
else:
in_doa = False
# if len(new_data[0]) > 100 and many > 5:
# in_doa = False
doa = {'IN': in_doa}
doaAll.append(doa)
return doaAll

def predict(self, new_data: np.array) -> Iterable[Any]:
doaAll = []
self._doa = []
self._in = []
# new_data = self._scaler.transform(new_data)
in_doa = True
for nd in new_data:
for index, row in enumerate(nd):
@@ -253,8 +180,6 @@ def predict(self, new_data: np.array) -> Iterable[Any]:
continue
else:
in_doa = False
# if len(new_data[0]) > 100 and many > 5:
# in_doa = False
doa = {'IN': in_doa}
doaAll.append(doa)
self._doa.append(new_data)
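For readers who want the MeanVar behaviour in one place, here is a self-contained sketch of the logic in the hunks above: fit() stores one [mean, std, var] row per descriptor column, and predict() marks a row as inside the domain when every feature lies within mean ± 4·std. This is a simplified reading rather than the committed code; in particular, in_doa is reset per row here, whereas the committed loop initialises it once before iterating over new_data.

import numpy as np


class MeanVarSketch:
    """Illustrative stand-in for jaqpotpy's MeanVar; the class name is hypothetical."""

    def fit(self, X: np.ndarray) -> None:
        # One [mean, std, var] row per descriptor column, as in MeanVar.fit().
        self._data = np.array([[col.mean(), col.std(), col.var()] for col in X.T])

    def predict(self, new_data: np.ndarray) -> list:
        results = []
        for row in new_data:
            in_doa = True
            for j, value in enumerate(row):
                mean, std, _var = self._data[j]
                # Inside the domain means lying between mean - 4*std and mean + 4*std.
                if not (mean - 4 * std <= value <= mean + 4 * std):
                    in_doa = False
            results.append({'IN': in_doa})
        return results


# Toy usage (data invented for illustration):
doa = MeanVarSketch()
doa.fit(np.array([[0.0, 1.0], [0.1, 1.2], [0.2, 0.9]]))
print(doa.predict(np.array([[0.1, 1.0], [5.0, 1.0]])))  # [{'IN': True}, {'IN': False}]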
73 changes: 6 additions & 67 deletions jaqpotpy/doa/tests/test_doa.py
@@ -94,7 +94,6 @@ def test_smiles_leverage(self):
assert doa.doa_new == [90575896122526.53, 0.9804306739393107, 0.9992936436413169]
assert len(calc) == len(mol)

@unittest.skip("This test needs refactoring")
def test_mean_var(self):
mols = [
'C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO',
@@ -115,80 +114,20 @@
]

featurizer = RDKitDescriptors(use_fragment=False, ipc_avg=False)
# featurizer = MordredDescriptors()

descriptors = featurizer(mols)
# descriptors = np.array([[0,1,2], [1,2,3], [1,2,1], [2,2,2], [3,3,3], [1,1,1], [2,1,3], [2,2,2], [2,2,1]])

minmax = MinMaxScaler()
# doa = MeanVar(minmax)
doa = MeanVar()

doa.fit(descriptors)
# doa.data = descriptors
# doa.calculate_matrix()
# doa.calculate_threshold()

mol = [
'C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO',
'COc1ccc2c(N)nn(C(=O)Cc3cccc(Cl)c3)c2c1'
, 'O=C(Cc1cncc2ccccc12)N(CCC1CCCCC1)c1cccc(Cl)c1'
, 'Cc1ccncc1NC(=O)Cc1cc(Cl)cc(-c2cnn(C)c2C(F)F)c1'
, 'OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2'
, 'Cc1ccncc1NC(=O)Cc1cc(Cl)cc(-c2cnn(C)c2C(F)F)c1'
, 'Cc1cc(C(F)(F)F)nc2c1c(N)nn2C(=O)Cc1cccc(Cl)c1'
, 'Cc1cc(C(F)(F)F)nc2c1c(N)nn2C(=O)C1CCOc2ccc(Cl)cc21'
, 'O=C(c1cc(=O)[nH]c2ccccc12)N1CCN(c2cccc(Cl)c2)C(=O)C1'
, 'O=C1NC2(CCOc3ccc(Cl)cc32)C(=O)N1c1cncc2ccccc12'
'CCC'
]
descriptors = featurizer(mol)
# descriptors = np.array([[5,1,0], [3,3,3], [-1,-2,0], [5,5,5], [100,10,40]])
calc = doa.predict(descriptors)
# print(doa.a)
# print(doa.doa_matrix)
# print(calc)
diag = np.diag(doa.data)

assert len(calc) == len(mol)

@unittest.skip("This test needs refactoring")
def test_with_other_data(self):
# basedir = os.path.dirname(sys.argv[0])
# filename = "gdp-countries.csv"
# path = os.path.join(basedir, "results", filename)

# data = pd.read_csv(path)
# data = pd.read_csv('../../test_data/gdp-countries.csv')

# data = data[['GDP', 'LFG', 'EQP', 'NEQ', 'GAP']].to_numpy()

data = np.array([[0.0089, 0.0118, 0.0214, 0.2286, 0.6079], [0.0332, 0.0014, 0.0991, 0.1349, 0.5809],
[0.0256, 0.0061, 0.0684, 0.1653, 0.4109], [0.0124, 0.0209, 0.0167, 0.1133, 0.8634],
[0.0676, 0.0239, 0.131, 0.149, 0.9474], [0.0437, 0.0306, 0.0646, 0.1588, 0.8498]])

minmax = MinMaxScaler()
# doa = Leverage(minmax)
doa = Leverage()
doa.fit(data)
calc = doa.predict(data)
assert len(calc) == len(data)

@unittest.skip("This test needs refactoring")
def test_with_other_data_mean_var(self):
# basedir = os.path.dirname(sys.argv[0])
# filename = "gdp-countries.csv"
# path = os.path.join(basedir, "results", filename)
#
# data = pd.read_csv(path)

# data = pd.read_csv('../../test_data/gdp-countries.csv')
# data = data[['GDP', 'LFG', 'EQP', 'NEQ', 'GAP']].to_numpy()

data = np.array([[0.0089, 0.0118, 0.0214, 0.2286, 0.6079], [0.0332, 0.0014, 0.0991, 0.1349, 0.5809],
[0.0256, 0.0061, 0.0684, 0.1653, 0.4109], [0.0124, 0.0209, 0.0167, 0.1133, 0.8634],
[0.0676, 0.0239, 0.131, 0.149, 0.9474], [0.0437, 0.0306, 0.0646, 0.1588, 0.8498]])

minmax = MinMaxScaler()
# doa = Leverage(minmax)
doa = MeanVar()
doa.fit(data)
calc = doa.predict(data)
assert len(calc) == len(data)
assert calc[0]['IN']==True, f"Expected calc[0]['IN'] == True, got {calc[0]['IN']} != True"
assert calc[1]['IN']==False, f"Expected calc[1]['IN'] == False, got {calc[1]['IN']} != False"
assert np.allclose(diag, [1.31511044e+01, 6.69162726e-01, 5.37187947e-03], atol= 1e-5), f"Expected diag == [1.31511044e+01, 6.69162726e-01, 5.37187947e-03], got diag != {diag}"
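As a side note on the new diag assertion: assuming doa.data holds one [mean, std, var] row per descriptor column after fit(), np.diag(doa.data) returns the main diagonal of that (n_columns, 3) matrix, i.e. the mean of column 0, the std of column 1 and the variance of column 2, which is why the expected diag has exactly three values. A toy illustration (numbers reused from the array above, not the RDKit descriptors used in the test):

import numpy as np

data = np.array([[0.0089, 0.0118, 0.0214, 0.2286, 0.6079],
                 [0.0332, 0.0014, 0.0991, 0.1349, 0.5809],
                 [0.0256, 0.0061, 0.0684, 0.1653, 0.4109]])

# One [mean, std, var] row per column, as MeanVar.fit() builds it.
stats = np.array([[col.mean(), col.std(), col.var()] for col in data.T])

# np.diag of the (5, 3) matrix gives [stats[0, 0], stats[1, 1], stats[2, 2]]:
# mean of column 0, std of column 1, variance of column 2.
print(np.diag(stats))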
