feat(JAQPOT-135): Update MeanVar (#29)
* feat: Update Leverage DOA Tests

* fix: Disabled numeric test

* Revert "fix: Disabled numeric test"

This reverts commit d2abf6f.

* fix: Leverage_DOA removed numerical test

* fix: Delete calculate() method in MeanVar class

Deleted calculate() method because it is already replaced by predict()

* fix: Delete commented-out code

* feat: Update `test_mean_var` test

* feat: Refactor `DOA` classes

This commit refactors `DOA` into an abstract base class. Additionally, `Leverage` and `MeanVar` now inherit from `DOA`.

* fix: MeanVar test

* Update doa.py

Deleted some comments
vassilismin authored Jun 13, 2024
1 parent 25f2328 commit d015e7f
Showing 2 changed files with 12 additions and 148 deletions.
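Before the diffs, a minimal sketch of the class hierarchy the commit message describes: `DOA` becomes an abstract base class exposing abstract `fit` and `predict` methods, with `Leverage` and `MeanVar` inheriting from it. The subclass bodies below are placeholders for illustration only, not the committed implementations.

from abc import ABC, abstractmethod
from typing import Any, Iterable

import numpy as np


class DOA(ABC):
    """Abstract base class for domain-of-applicability (DOA) methods."""

    @abstractmethod
    def fit(self, X: np.ndarray):
        raise NotImplementedError

    @abstractmethod
    def predict(self, data: Iterable[Any]) -> Iterable[Any]:
        raise NotImplementedError


class Leverage(DOA):
    # Placeholder body; the committed implementation is in doa.py below.
    def fit(self, X: np.ndarray): ...
    def predict(self, data: Iterable[Any]) -> Iterable[Any]: ...


class MeanVar(DOA):
    # Placeholder body; the committed implementation is in doa.py below.
    def fit(self, X: np.ndarray): ...
    def predict(self, data: Iterable[Any]) -> Iterable[Any]: ...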
87 changes: 6 additions & 81 deletions jaqpotpy/doa/doa.py
@@ -1,57 +1,27 @@
from abc import ABC
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from typing import Iterable, Any
import math
from jaqpotpy.descriptors.molecular import RDKitDescriptors, MordredDescriptors
import pickle
# import dill

# def calculate_a(X):
# shape = X.shape
# a = (3 * (shape[1] + 1)) / shape[0]
# return a


# def calculate_doa_matrix(X):
# x_T = X.transpose()
# x_out = x_T.dot(X)
# x_out_inv = pd.DataFrame(np.linalg.pinv(x_out.values), x_out.columns, x_out.index)
# return x_out_inv


# def calc_doa(doa_matrix, new_data):
# doaAll = []
# for nd in new_data:
# d1 = np.dot(nd, doa_matrix)
# ndt = np.transpose(nd)
# d2 = np.dot(d1, ndt)
# doa = {'DOA': d2}
# doaAll.append(doa)
# return doaAll


class DOA(object):
class DOA(ABC):
"""
Abstract class for DOA methods
"""
def calculate_threshold(self):
raise NotImplementedError

def calculate_matrix(self):
raise NotImplementedError

def calculate(self, data: Iterable[Any]) -> Iterable[Any]:
raise NotImplementedError

@abstractmethod
def fit(self, X: np.array):
raise NotImplementedError

@abstractmethod
def predict(self, data: Iterable[Any]) -> Iterable[Any]:
raise NotImplementedError


class Leverage(DOA, ABC):
class Leverage(DOA):
"""
Implements DOA method leverage.
Initialized upon training data and holds the doa matrix and the threshold 'A' value.
@@ -147,7 +117,7 @@ def predict(self, new_data: np.array) -> Iterable[Any]:
return doaAll


class MeanVar(DOA, ABC):
class MeanVar(DOA):
"""
Implements Mean and Variance domain of applicability.
Initialized upon training data and holds the doa mean and the variance of the data.
@@ -161,10 +131,7 @@ def __name__(self):
return 'MeanVar'

def __init__(self) -> None:
# self._scaler: BaseEstimator = scaler
self._data: np.array = None
self._doa_matrix = None
self._a = None

@property
def doa_new(self):
@@ -182,21 +149,6 @@ def IN(self):
def IN(self, value):
self._in = value

@property
def doa_matrix(self):
return self._doa_matrix

@doa_matrix.setter
def doa_matrix(self, value):
self._doa_matrix = value

@property
def a(self):
return self._a

@a.setter
def a(self, value):
self._a = value

@property
def data(self):
@@ -208,42 +160,17 @@ def data(self, value):

def fit(self, X: np.array):
self._data = X
# self._scaler.fit(X)
# self._data = self._scaler.transform(X)
columns = list(zip(*self._data))
shape = X.shape
list_m_var = []
for i in range(shape[1]):
list_m_var.append([np.mean(columns[i]), np.std(columns[i]), np.var(columns[i])])
self._data = np.array(list_m_var)
self._doa_matrix = np.array(list_m_var)
self._a = np.array(list_m_var)

def calculate(self, new_data: np.array) -> Iterable[Any]:
doaAll = []
self._doa = []
self._in = []
# new_data = self._scaler.transform(new_data)
in_doa = True
for nd in new_data:
for index, row in enumerate(nd):
bounds = self._data[index]
bounds_data = [bounds[0]-4*bounds[1], bounds[0]+4*bounds[1]]
if row >= bounds_data[0] and row <= bounds_data[1]:
continue
else:
in_doa = False
# if len(new_data[0]) > 100 and many > 5:
# in_doa = False
doa = {'IN': in_doa}
doaAll.append(doa)
return doaAll

def predict(self, new_data: np.array) -> Iterable[Any]:
doaAll = []
self._doa = []
self._in = []
# new_data = self._scaler.transform(new_data)
in_doa = True
for nd in new_data:
for index, row in enumerate(nd):
@@ -253,8 +180,6 @@ def predict(self, new_data: np.array) -> Iterable[Any]:
continue
else:
in_doa = False
# if len(new_data[0]) > 100 and many > 5:
# in_doa = False
doa = {'IN': in_doa}
doaAll.append(doa)
self._doa.append(new_data)
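For readers who want the MeanVar behaviour in one place, here is a self-contained sketch of the logic in the hunks above: fit() stores one [mean, std, var] row per descriptor column, and predict() marks a row as inside the domain when every feature lies within mean ± 4·std. This is a simplified reading rather than the committed code; in particular, in_doa is reset per row here, whereas the committed loop initialises it once before iterating over new_data.

import numpy as np


class MeanVarSketch:
    """Illustrative stand-in for jaqpotpy's MeanVar; the class name is hypothetical."""

    def fit(self, X: np.ndarray) -> None:
        # One [mean, std, var] row per descriptor column, as in MeanVar.fit().
        self._data = np.array([[col.mean(), col.std(), col.var()] for col in X.T])

    def predict(self, new_data: np.ndarray) -> list:
        results = []
        for row in new_data:
            in_doa = True
            for j, value in enumerate(row):
                mean, std, _var = self._data[j]
                # Inside the domain means lying between mean - 4*std and mean + 4*std.
                if not (mean - 4 * std <= value <= mean + 4 * std):
                    in_doa = False
            results.append({'IN': in_doa})
        return results


# Toy usage (data invented for illustration):
doa = MeanVarSketch()
doa.fit(np.array([[0.0, 1.0], [0.1, 1.2], [0.2, 0.9]]))
print(doa.predict(np.array([[0.1, 1.0], [5.0, 1.0]])))  # [{'IN': True}, {'IN': False}]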
73 changes: 6 additions & 67 deletions jaqpotpy/doa/tests/test_doa.py
@@ -94,7 +94,6 @@ def test_smiles_leverage(self):
assert doa.doa_new == [90575896122526.53, 0.9804306739393107, 0.9992936436413169]
assert len(calc) == len(mol)

@unittest.skip("This test needs refactoring")
def test_mean_var(self):
mols = [
'C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO',
@@ -115,80 +114,20 @@
]

featurizer = RDKitDescriptors(use_fragment=False, ipc_avg=False)
# featurizer = MordredDescriptors()

descriptors = featurizer(mols)
# descriptors = np.array([[0,1,2], [1,2,3], [1,2,1], [2,2,2], [3,3,3], [1,1,1], [2,1,3], [2,2,2], [2,2,1]])

minmax = MinMaxScaler()
# doa = MeanVar(minmax)
doa = MeanVar()

doa.fit(descriptors)
# doa.data = descriptors
# doa.calculate_matrix()
# doa.calculate_threshold()

mol = [
'C[C@@](C)(O1)C[C@@H](O)[C@@]1(O2)[C@@H](C)[C@@H]3CC=C4[C@]3(C2)C(=O)C[C@H]5[C@H]4CC[C@@H](C6)[C@]5(C)Cc(n7)c6nc(C[C@@]89(C))c7C[C@@H]8CC[C@@H]%10[C@@H]9C[C@@H](O)[C@@]%11(C)C%10=C[C@H](O%12)[C@]%11(O)[C@H](C)[C@]%12(O%13)[C@H](O)C[C@@]%13(C)CO',
'COc1ccc2c(N)nn(C(=O)Cc3cccc(Cl)c3)c2c1'
, 'O=C(Cc1cncc2ccccc12)N(CCC1CCCCC1)c1cccc(Cl)c1'
, 'Cc1ccncc1NC(=O)Cc1cc(Cl)cc(-c2cnn(C)c2C(F)F)c1'
, 'OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2'
, 'Cc1ccncc1NC(=O)Cc1cc(Cl)cc(-c2cnn(C)c2C(F)F)c1'
, 'Cc1cc(C(F)(F)F)nc2c1c(N)nn2C(=O)Cc1cccc(Cl)c1'
, 'Cc1cc(C(F)(F)F)nc2c1c(N)nn2C(=O)C1CCOc2ccc(Cl)cc21'
, 'O=C(c1cc(=O)[nH]c2ccccc12)N1CCN(c2cccc(Cl)c2)C(=O)C1'
, 'O=C1NC2(CCOc3ccc(Cl)cc32)C(=O)N1c1cncc2ccccc12'
'CCC'
]
descriptors = featurizer(mol)
# descriptors = np.array([[5,1,0], [3,3,3], [-1,-2,0], [5,5,5], [100,10,40]])
calc = doa.predict(descriptors)
# print(doa.a)
# print(doa.doa_matrix)
# print(calc)
diag = np.diag(doa.data)

assert len(calc) == len(mol)

@unittest.skip("This test needs refactoring")
def test_with_other_data(self):
# basedir = os.path.dirname(sys.argv[0])
# filename = "gdp-countries.csv"
# path = os.path.join(basedir, "results", filename)

# data = pd.read_csv(path)
# data = pd.read_csv('../../test_data/gdp-countries.csv')

# data = data[['GDP', 'LFG', 'EQP', 'NEQ', 'GAP']].to_numpy()

data = np.array([[0.0089, 0.0118, 0.0214, 0.2286, 0.6079], [0.0332, 0.0014, 0.0991, 0.1349, 0.5809],
[0.0256, 0.0061, 0.0684, 0.1653, 0.4109], [0.0124, 0.0209, 0.0167, 0.1133, 0.8634],
[0.0676, 0.0239, 0.131, 0.149, 0.9474], [0.0437, 0.0306, 0.0646, 0.1588, 0.8498]])

minmax = MinMaxScaler()
# doa = Leverage(minmax)
doa = Leverage()
doa.fit(data)
calc = doa.predict(data)
assert len(calc) == len(data)

@unittest.skip("This test needs refactoring")
def test_with_other_data_mean_var(self):
# basedir = os.path.dirname(sys.argv[0])
# filename = "gdp-countries.csv"
# path = os.path.join(basedir, "results", filename)
#
# data = pd.read_csv(path)

# data = pd.read_csv('../../test_data/gdp-countries.csv')
# data = data[['GDP', 'LFG', 'EQP', 'NEQ', 'GAP']].to_numpy()

data = np.array([[0.0089, 0.0118, 0.0214, 0.2286, 0.6079], [0.0332, 0.0014, 0.0991, 0.1349, 0.5809],
[0.0256, 0.0061, 0.0684, 0.1653, 0.4109], [0.0124, 0.0209, 0.0167, 0.1133, 0.8634],
[0.0676, 0.0239, 0.131, 0.149, 0.9474], [0.0437, 0.0306, 0.0646, 0.1588, 0.8498]])

minmax = MinMaxScaler()
# doa = Leverage(minmax)
doa = MeanVar()
doa.fit(data)
calc = doa.predict(data)
assert len(calc) == len(data)
assert calc[0]['IN']==True, f"Expected calc[0]['IN'] == True, got {calc[0]['IN']} != True"
assert calc[1]['IN']==False, f"Expected calc[1]['IN'] == False, got {calc[1]['IN']} != False"
assert np.allclose(diag, [1.31511044e+01, 6.69162726e-01, 5.37187947e-03], atol= 1e-5), f"Expected diag == [1.31511044e+01, 6.69162726e-01, 5.37187947e-03], got diag != {diag}"
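As a side note on the new diag assertion: assuming doa.data holds one [mean, std, var] row per descriptor column after fit(), np.diag(doa.data) returns the main diagonal of that (n_columns, 3) matrix, i.e. the mean of column 0, the std of column 1 and the variance of column 2, which is why the expected diag has exactly three values. A toy illustration (numbers reused from the array above, not the RDKit descriptors used in the test):

import numpy as np

data = np.array([[0.0089, 0.0118, 0.0214, 0.2286, 0.6079],
                 [0.0332, 0.0014, 0.0991, 0.1349, 0.5809],
                 [0.0256, 0.0061, 0.0684, 0.1653, 0.4109]])

# One [mean, std, var] row per column, as MeanVar.fit() builds it.
stats = np.array([[col.mean(), col.std(), col.var()] for col in data.T])

# np.diag of the (5, 3) matrix gives [stats[0, 0], stats[1, 1], stats[2, 2]]:
# mean of column 0, std of column 1, variance of column 2.
print(np.diag(stats))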
