# drugdiscovery.py

# -*- coding: utf-8 -*-
"""DrugDiscovery.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1N6Oh_s--bqdLrbUaftnmNTcJH2en7Ztk

# PART 1 **Computational Drug Discovery**

Sina ShenZi-Ast

***Collecting Bioactivity Data***

📘 **Steps :**

1.   Installing & importing required libraries. 📚
2.   Using [ChEMBL](https://www.ebi.ac.uk/chembl/) data-base to select our desired target.
3. Filtering potency of a substance **(IC50)**.
4. Processing unique **ChEMBL ID, SMILES, Standard Value** of each compound.
5. Labeling compounds as (Active ✅, Intermediate 🟨, Inactive 🟥).
6. Creating "First_Tabel.csv".

📘 **Step 1.1 :**

Installing and importing required libraries
"""

# Install the ChEMBL web-service client. A bare `pip install ...` line only
# works as a notebook magic and is a syntax error in a plain Python file, so
# pip is invoked through the current interpreter instead.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "chembl_webresource_client"])

import pandas as pd
from chembl_webresource_client.new_client import new_client

"""📘 **Step 2.1:**

Searching the ChEMBL database for the desired target, which in this case is **acetylcholinesterase**.
"""

# Search the ChEMBL target endpoint for 'acetylcholinesterase' and load the
# hits into a DataFrame so they can be inspected.
target = new_client.target
target_query = target.search('acetylcholinesterase')
targets = pd.DataFrame.from_dict(target_query)
targets

# Take the ChEMBL ID of the first hit as the target for the rest of the file.
# NOTE(review): assumes the first search result is the intended target - confirm.
selected_target = targets.target_chembl_id[0]
selected_target

"""📘 **Step 3.1:**


*   Filtering the activity data of the selected target based on **standard_type="IC50"**
*   Saving it to a DataFrame and to a CSV file named Bio_activity.csv


"""

import os

# Lazy query for all IC50 measurements recorded against the selected target.
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

# Bio_activity.csv acts as a local cache of the query result. Download it only
# when it is missing, so a fresh checkout still works and repeated runs do not
# hit the web service again.
if not os.path.exists('Bio_activity.csv'):
    pd.DataFrame.from_dict(res).to_csv('Bio_activity.csv', index=False)

# Always read the data back from the CSV so the column dtypes are the same
# whether the file was just downloaded or already cached.
df = pd.read_csv('Bio_activity.csv')
df

df.info()

"""📘 **Step 4.1:**

Removing **null** and **duplicated** values from *standard_value* and *canonical_smiles*.

"""

# Keep only rows that have BOTH a standard_value and a SMILES string.
# The two conditions are combined into one mask; filtering `df` twice in a
# row would let the second filter overwrite the first.
df2 = df[df.standard_value.notna() & df.canonical_smiles.notna()]
df2

# Keep the first row for each distinct SMILES string.
df2_nr = df2.drop_duplicates(['canonical_smiles'])
df2_nr

"""🟢 Creating a combined DataFrame to indicate each compound Bioactivity class."""

# Reduce the de-duplicated table to the three columns used downstream:
# compound ID, structure (SMILES) and the measured standard_value.
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2_nr[selection]
df3

"""📘 **Step 5.1:**

Labeling each compound as **(Active ✅, Intermediate 🟨, Inactive 🟥)** by creating a bioactivity class and defining the thresholds.
"""

# One label per row of df3, in row order:
#   standard_value >= 10000 -> "inactive"
#   standard_value <= 1000  -> "active"
#   anything else           -> "intermediate"
bioactivity_threshold = [
    "inactive" if float(value) >= 10000
    else "active" if float(value) <= 1000
    else "intermediate"
    for value in df3.standard_value
]

"""📘 **Step 6.1:**

Creating First_Tabel.csv
"""

# bioactivity_threshold holds one label per row of df3, in row order. Give the
# Series df3's own index so that the column-wise concat (which aligns on index
# labels) attaches each label to the row it was computed from. With a default
# 0..n-1 index the labels land on the wrong rows whenever df3 has gaps in its
# index, and the dropna below then discards the unmatched rows.
bioactivity_class = pd.Series(bioactivity_threshold, name='class', index=df3.index)
df4 = pd.concat([df3, bioactivity_class], axis=1)
df4 = df4.dropna()
df4.to_csv('First_Tabel.csv', index=False)
df4

# List the working directory (shows the CSV files written so far). The
# notebook shell magic `! ls -l` is not valid Python, so the same command is
# run through subprocess and its output printed.
import subprocess

print(subprocess.run(["ls", "-l"], capture_output=True, text=True).stdout)

"""# PART 2: Lipinski & Mann-Whitney Test

📘 **Steps:**

1.   Deep cleaning data from **null** and **duplicate** SMILES
2.   Installing **Rdkit** & Creating **"df_lipinski"** with *Lipinski's rule of five* 🤚
3. Convert IC50 to **pIC50**
4. Normalizing **standard_value** & **pIC50**
5. MW / LogP Bar ✅
6. **Mann-Whitney U-Test** & **Statistical Bars** 📊

---


**Step 1.2:**

This type of cleaning can be helpful in ensuring consistency and accuracy when working with SMILES data in cheminformatics or drug discovery tasks.

In essence, this code extracts the longest substring from each canonical smiles string in the dataframe, likely to remove any additional information or fragments attached to the main smiles representation.
"""

df_no_smiles = df4.drop(columns='canonical_smiles')

# A SMILES string may contain several '.'-separated components; keep only the
# longest one. The strings are taken from df4 (the labelled table being
# cleaned) rather than from the raw `df`, and the resulting Series reuses
# df4's index so the concat below pairs every cleaned SMILES with its own row.
smiles = [max(str(entry).split('.'), key=len) for entry in df4.canonical_smiles]
smiles = pd.Series(smiles, name='canonical_smiles', index=df4.index)

df_clean_smiles = pd.concat([df_no_smiles, smiles], axis=1)
df5 = df_clean_smiles.dropna()
df5

# List the working directory. The notebook shell magic `! ls -l` is not valid
# Python, so the same command is run through subprocess and its output printed.
import subprocess

print(subprocess.run(["ls", "-l"], capture_output=True, text=True).stdout)

"""

---


📘 **Step 2.2:**

> Investigating Druglikness with Lipinski's rule of five

**Lipinski's Rule stated the following:**

1.  Molecular weight < 500 Dalton
2.  Octanol-water partition coefficient (LogP) ≤ 5
3. Hydrogen bond donors ≤ 5
4. Hydrogen bond acceptors ≤ 10


"""

# Install RDKit. A bare `pip install ...` line only works as a notebook magic
# and is a syntax error in a plain Python file, so pip is invoked through the
# current interpreter instead.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "rdkit"])

import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

# Inspired by: https://codeocean.com/explore/capsules?query=tag:data-curation
def lipinski(smiles, verbose=False):
    """Compute the four Lipinski descriptors for each SMILES string.

    Parameters
    ----------
    smiles : iterable
        SMILES strings, one per molecule.
    verbose : bool, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with float columns
        ``MW``, ``LogP``, ``NumHDonors`` and ``NumHAcceptors``. Entries that
        are not strings, or that RDKit cannot parse, produce a row of NaN so
        the output stays row-aligned with the input. Empty input produces an
        empty frame with the four columns.
    """
    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    rows = []
    for elem in smiles:
        # MolFromSmiles returns None for a SMILES it cannot parse; non-string
        # entries (e.g. NaN) are never handed to RDKit at all.
        mol = Chem.MolFromSmiles(elem) if isinstance(elem, str) else None
        if mol is None:
            rows.append([np.nan] * len(columnNames))
            continue
        rows.append([Descriptors.MolWt(mol),
                     Descriptors.MolLogP(mol),
                     Lipinski.NumHDonors(mol),
                     Lipinski.NumHAcceptors(mol)])

    # Build the array once; the reshape keeps it two-dimensional even for
    # zero or one molecule.
    baseData = np.array(rows, dtype=float).reshape(-1, len(columnNames))
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)

    return descriptors

# lipinski() returns rows numbered 0..n-1 in input order, so df5 is renumbered
# the same way first; the column-wise concat then pairs each compound with its
# own descriptors.
df5 = df5.reset_index(drop=True)
df_lipinski = lipinski(df5.canonical_smiles)
df_lipinski

df_combined = pd.concat([df5, df_lipinski], axis=1)

# Drop incomplete rows only AFTER combining, and keep the result: dropping
# rows from df_lipinski alone would shift it against df5, and a dropna() whose
# return value is discarded changes nothing.
df_combined = df_combined.dropna(
    subset=['MW','LogP','class','standard_value','molecule_chembl_id','canonical_smiles']
).reset_index(drop=True)
df_combined.to_csv('Second_Tabel.csv', index=False)
df_combined

"""

---


📘 **Step 3.2:**

**Converting IC50 to pIC50**

Comparing IC50 values can be cumbersome due to the wide range.

**pIC50** values provide a more linear scale for comparison.

pIC50 is simply the negative logarithm of the IC50 value. It's a transformed value that makes it easier to compare compounds with widely different IC50 values.


---


This custom function pIC50() will accept a DataFrame as input and will:


*   Take the (capped) IC50 values from the ``standard_value_norm`` column and convert them from nM to M by multiplying by 10$^{-9}$
*   Take the molar value and apply -log10
*   Delete the ``standard_value_norm`` column and create a new ``pIC50`` column
"""

#Point to note: Values greater than 100,000,000 will be fixed at 100,000,000 otherwise the negative logarithmic value will become negative.
# https://github.com/chaninlab/estrogen-receptor-alpha-qsar/blob/master/02_ER_alpha_RO5.ipynb

def pIC50(input):
    """Return a copy of *input* with ``standard_value_norm`` replaced by ``pIC50``.

    ``standard_value_norm`` is multiplied by 10**-9 (nM to M) and transformed
    with ``-log10``. The result is stored in a ``pIC50`` column and the
    ``standard_value_norm`` column is removed.

    Parameters
    ----------
    input : pandas.DataFrame
        Must contain a numeric ``standard_value_norm`` column. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index as *input*.
    """
    molar = input['standard_value_norm'] * (10**-9)  # Converts nM to M
    # drop() and assign() both return new frames, so the caller's DataFrame is
    # left untouched.
    return input.drop(columns='standard_value_norm').assign(pIC50=-np.log10(molar))

# Summary statistics of the standard_value column before capping.
df_combined.standard_value.describe()

# -log10 of 100,000,000 * 10**-9 (= 0.1): approximately 1.
-np.log10( (10**-9)* 100000000 )

# -log10 of 10,000,000,000 * 10**-9 (= 10): approximately -1, i.e. negative.
-np.log10( (10**-9)* 10000000000 )

"""

---


📘 **Step 4.2:**

Normalizing the **standard_value** and **pIC50**

The result is a DataFrame with a normalized **standard_value_norm** column in place of the original standard_value column."""

def norm_value(input, max_value=100000000):
    """Return a copy of *input* with ``standard_value`` capped and renamed.

    Values in ``standard_value`` greater than *max_value* are replaced by
    *max_value*. The result is stored in a ``standard_value_norm`` column and
    the ``standard_value`` column is removed.

    Parameters
    ----------
    input : pandas.DataFrame
        Must contain a numeric ``standard_value`` column. The frame is not
        modified.
    max_value : number, optional
        Upper cap applied to ``standard_value`` (default 100000000).

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index as *input*.
    """
    capped = input['standard_value'].clip(upper=max_value)
    # drop() and assign() both return new frames, so the caller's DataFrame is
    # left untouched.
    return input.drop(columns='standard_value').assign(standard_value_norm=capped)

# Cap the standard values; the result carries 'standard_value_norm' instead of
# 'standard_value'.
df_norm = norm_value(df_combined)
df_norm

df_norm.standard_value_norm.describe()

# Convert the capped values to pIC50; the result carries 'pIC50' instead of
# 'standard_value_norm'.
df_final = pIC50(df_norm)
df_final

df_final.pIC50.describe()

"""

---


📘 **Step 5.2:**

Removing the **'intermediate'** 🟨 class from dataset and saving the result as Third_Tabel.csv

"""

# Keep only rows whose class is not 'intermediate' and save them to
# Third_Tabel.csv.
df_2class = df_final[df_final['class'] != 'intermediate']
df_2class.to_csv('Third_Tabel.csv', index=False)
df_2class

import seaborn as sns
sns.set(style='ticks')
import matplotlib.pyplot as plt

# Bar chart of how many compounds fall into each remaining bioactivity class.
plt.figure(figsize=(5.5, 5.5))

# Define a color palette
# NOTE(review): which class receives which color depends on the order in
# which seaborn places the 'class' categories on the x axis - confirm that red
# really ends up on 'inactive' and blue on 'active'.
colors = ["#FF0000", "#0000FF"]  # Example: Red for inactive, blue for active

sns.countplot(x='class', data=df_2class, edgecolor='black', palette=colors)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('Frequency', fontsize=14, fontweight='bold')

# Written to the current working directory.
plt.savefig('plot_bioactivity_class.pdf')

"""

---


📘 **Step 6.2:** 📊

Scatter plot of **"LogP"** vs **"MW"**

**MW:**
* **Size of the molecule:** Higher MW typically indicates a larger molecule.

* **Potential for drug-like properties:** Very high MW compounds often have poor drug-like properties due to issues like poor solubility and permeability.


**LogP**

* **Lipophilicity:** Higher LogP values indicate a more lipophilic (fat-soluble) compound.

* **Distribution:** LogP influences how a compound distributes between aqueous and lipid environments (e.g., blood and cell membranes).

* **Potential for drug-like properties**: Both very high and very low LogP values can lead to poor drug-like properties.


❗
**Solubility:** High **LogP** values often correlate with poor water solubility, which can affect drug formulation and delivery."""

# Scatter plot of LogP against MW; point color encodes the bioactivity class
# and point size encodes pIC50.
plt.figure(figsize=(5.5, 5.5))

sns.scatterplot(x='MW', y='LogP', data=df_2class, hue='class', size='pIC50', edgecolor='black', alpha=0.7)

plt.xlabel('MW', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.savefig('plot_MW_vs_LogP.pdf')

"""📘 **Steps 7.2:**

**Mann-Whitney U Test:**

The Mann-Whitney U test is a **non-parametric** statistical test used to compare two independent samples.

 It assesses whether there is a significant difference in the distributions of the two groups.


---
**T-test:**
Parametric test: Assumes data follows a normal distribution.
Compares the means of two groups.
Sensitive to outliers.
More powerful when assumptions are met.

**Mann-Whitney U test:**
Non-parametric test: Doesn't require data to follow a specific distribution.
Compares the distributions of two groups.
Robust to outliers.
Less powerful than the T-test when assumptions of the T-test are met.

❕ **When to use which:**

**T-test:** If your data is normally distributed and you want to compare means.

**Mann-Whitney U test:** If your data is not normally distributed, you have outliers, or you're unsure about the distribution.

"""

def mannwhitney(descriptor, verbose=False):
    """Run a Mann-Whitney U test on *descriptor* between actives and inactives.

    Reads the module-level ``df_2class`` frame, splits the *descriptor* column
    by the ``class`` column ('active' vs 'inactive'), runs
    ``scipy.stats.mannwhitneyu`` on the two samples and interprets the p-value
    at alpha = 0.05.

    Side effects: writes the one-row result to
    ``mannwhitneyu_<descriptor>.csv`` in the working directory and reseeds
    NumPy's global random generator with 1.

    Parameters
    ----------
    descriptor : str
        Name of a column of ``df_2class``.
    verbose : bool, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row with columns Descriptor, Statistics, p, alpha, Interpretation.
    """
    # https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
    from numpy.random import seed
    from scipy.stats import mannwhitneyu

    # Nothing in this function draws random numbers; the reseed is kept only
    # so NumPy's global RNG state after the call is the same as before.
    seed(1)

    # actives and inactives
    groups = df_2class[[descriptor, 'class']]
    active = groups.loc[groups['class'] == 'active', descriptor]
    inactive = groups.loc[groups['class'] == 'inactive', descriptor]

    # compare samples
    stat, p = mannwhitneyu(active, inactive)

    # interpret
    alpha = 0.05
    if p > alpha:
        interpretation = 'Same distribution (fail to reject H0)'
    else:
        interpretation = 'Different distribution (reject H0)'

    # Build the result once; the same frame is written to disk and returned.
    results = pd.DataFrame({'Descriptor':descriptor,
                            'Statistics':stat,
                            'p':p,
                            'alpha':alpha,
                            'Interpretation':interpretation}, index=[0])
    filename = 'mannwhitneyu_' + descriptor + '.csv'
    results.to_csv(filename)

    return results

"""✅ **Interpreting the Results:** 🔴

* It sets a significance level **(alpha)** of *0.05.*

* If the ***p-value*** (p) is greater than the alpha level (alpha), the function interprets the results as "**Same distribution** (fail to reject H0).

* This means you fail to reject the null hypothesis, which states that the **distributions of the two groups are the same**.

If the ***p-value*** (p) is **less than or equal to the alpha** level (alpha), the function interprets the results as "**Different distribution**"- (reject H0). This means you reject the null hypothesis and conclude that the distributions of the two groups are different.


---

⛓ **"pIC50" Distribution:**

Result: *reject H0* ✅
"""

# Test pIC50 between the two classes; also writes mannwhitneyu_pIC50.csv.
mannwhitney('pIC50')

# Box plot of pIC50 per bioactivity class.
plt.figure(figsize=(5.5, 5.5))
colors = ["#FF0000", "#0000FF"]
sns.boxplot(x = 'class', y = 'pIC50', data = df_2class, palette=colors)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('pIC50 value', fontsize=14, fontweight='bold')

# Note the file name says "ic50" although the plotted column is pIC50.
plt.savefig('plot_ic50.pdf')

"""

---


⛓ **"MW" Distribution:**  

Result : *FAIL TO REJECT H0* ❎
"""

# Test MW between the two classes; also writes mannwhitneyu_MW.csv.
mannwhitney('MW')

# Box plot of MW per bioactivity class.
plt.figure(figsize=(5.5, 5.5))
colors = ["#FF0000", "#0000FF"]
sns.boxplot(x = 'class', y = 'MW', data = df_2class,palette=colors)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('MW', fontsize=14, fontweight='bold')

plt.savefig('plot_MW.pdf')

"""

---


⛓ **"LogP" Distribution:**

**Result:** *Reject H0* ✅"""

# Test LogP between the two classes; also writes mannwhitneyu_LogP.csv.
mannwhitney('LogP')

# Box plot of LogP per bioactivity class.
plt.figure(figsize=(5.5, 5.5))
colors = ["#FF0000", "#0000FF"]
sns.boxplot(x = 'class', y = 'LogP', data = df_2class,palette=colors)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')

plt.savefig('plot_LogP.pdf')

"""

---


⛓ **"NumHDonors" Distribution:**

**Result:** *Fail to reject H0* ❎
"""

# Test NumHDonors between the two classes; also writes
# mannwhitneyu_NumHDonors.csv.
mannwhitney('NumHDonors')

# Box plot of NumHDonors per bioactivity class.
plt.figure(figsize=(5.5, 5.5))
colors = ["#FF0000", "#0000FF"]
sns.boxplot(x = 'class', y = 'NumHDonors', data = df_2class, palette=colors)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('NumHDonors', fontsize=14, fontweight='bold')

plt.savefig('plot_NumHDonors.pdf')

"""

---


⛓ **NumHAcceptors Distribution:**

**Result:** *reject H0* ✅"""

# Test NumHAcceptors between the two classes; also writes
# mannwhitneyu_NumHAcceptors.csv.
mannwhitney('NumHAcceptors')

# Box plot of NumHAcceptors per bioactivity class.
plt.figure(figsize=(5.5, 5.5))
colors = ["#FF0000", "#0000FF"]
sns.boxplot(x = 'class', y = 'NumHAcceptors', data = df_2class, palette=colors)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('NumHAcceptors', fontsize=14, fontweight='bold')

plt.savefig('plot_NumHAcceptors.pdf')

"""❗
Taking a look at pIC50 values, the actives and inactives displayed statistically significant difference, which is to be expected since threshold values (IC50 < 1,000 nM = Actives while IC50 > 10,000 nM = Inactives, corresponding to pIC50 > 6 = Actives and pIC50 < 5 = Inactives) were used to define actives and inactives.


---

❗
**Lipinski's descriptors:**

Of the 4 Lipinski's descriptors, LogP and NumHAcceptors exhibited a statistically significant difference between the actives and inactives, while MW and NumHDonors did not (H0 was not rejected for them).

---

# PART 3: Calculating Molecular FP

📘 **Steps:**

**TUT** Descriptor Calculation and Dataset Preparation ⚡


1.   Installing & importing required libraries **padelpy** & **padel-wrapper** 📚
2.   Calculating Molecular Fingerprints by using PaDEL
3.   Combining pIC50 with the PubChem fingerprints to create **"dataset4.csv"**

**Recalling df_final**
"""

df_final

"""📘 **Step 3.1 :**

Installing required libraries


"""

# Install the two PaDEL wrapper packages. Bare `pip install ...` lines only
# work as notebook magics and are syntax errors in a plain Python file, so pip
# is invoked through the current interpreter instead.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "padelpy", "padel-pywrapper"])

# Write the PaDEL input file: one "<SMILES>\t<ChEMBL ID>" line per compound.
# The rows are taken from df_final, the same frame the pIC50 target is read
# from later, so the descriptor input has exactly one line per pIC50 value and
# in the same order.
selection = ['canonical_smiles','molecule_chembl_id']
df4_selection = df_final[selection]
df4_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)

# Show the first five lines and the total line count of the written file
# (replaces the notebook-only `! cat ... | head -5` / `| wc -l` magics).
with open('molecule.smi', encoding='utf-8') as smi_file:
    smi_lines = smi_file.readlines()
print(''.join(smi_lines[:5]), end='')
print(len(smi_lines))

"""📘 **Step 3.2:**

Calculating Molecular Fingerprints by using PaDEL

* **Molecular descriptors**

 are numerical representations of molecular properties. They convert chemical information into data for analysis and prediction, used in fields like drug discovery, materials science, and environmental science.


---


[PaDEL-Descriptor](http://www.yapcwsoft.com/dd/)

**PaDEL descriptors** are molecular descriptors generated using PaDEL-Descriptor, an open-source Java-based software. This tool calculates over 800 molecular descriptors and more than 10 types of molecular fingerprints, facilitating cheminformatics and **computational chemistry** tasks. Key types of PaDEL descriptors include:

 ℹ *More Info:*
[Additional Link](https://pubmed.ncbi.nlm.nih.gov/21425294/)

⏬ **Downloading [PaDEL-Descriptor]**
"""

# Download the PaDEL-Descriptor bundle and its launcher script, unpack the
# bundle and run the script. The notebook `!` shell magics are not valid
# Python, so each command is run through subprocess. check=True stops the
# pipeline as soon as a required step fails.
import subprocess

subprocess.run(["wget", "https://github.com/ShenZi-Ast/Drug-Discovery/raw/main/padel.zip"], check=True)
subprocess.run(["wget", "https://github.com/ShenZi-Ast/Drug-Discovery/raw/main/padel.sh"], check=True)

# -o overwrites already-extracted files instead of prompting, which would
# block a non-interactive run.
subprocess.run(["unzip", "-o", "padel.zip"], check=True)

# Print the launcher script for inspection before running it.
subprocess.run(["cat", "padel.sh"], check=False)

subprocess.run(["bash", "padel.sh"], check=True)

# List the working directory to see the files produced by the run.
subprocess.run(["ls", "-l"], check=False)

"""📘 **Step 3.3:**

**Creating X & Y Tabels**

👿 **X tabel (Molecular Fingerprints):**

1. Creating **df_4X** by using pd to read **"descriptors_output.csv"**

2. Removing the names from it.


"""

import pandas as pd

# Load the descriptor table.
# NOTE(review): presumably written by padel.sh - confirm the output file name
# against the script.
df4_X = pd.read_csv('descriptors_output.csv')  #FROM PADEL

# Drop the 'Name' column; the remaining columns are used as the feature
# matrix X below.
df4_X = df4_X.drop(columns=['Name'])

df4_X

"""**Y Tabel (pIC50)**

---
Recalling pIC50 from df_final to join it to df4_X (molecular fingerprints); it is the target variable (Y) that the models in Part 4 are trained to predict.
"""

# Target column: the pIC50 values of df_final.
df4_Y = df_final['pIC50']
df4_Y

"""Concating Y and X matrices"""

# Join features and target column-wise. pd.concat pairs rows by index label.
# NOTE(review): this presumes df4_X and df4_Y have the same index labels and
# list the molecules in the same order - confirm against the PaDEL output.
dataset4 = pd.concat([df4_X,df4_Y], axis=1)
dataset4

#dataset4.to_csv('dataset4.csv', index=False)

# X / Y are the names used by the modelling code below.
X = df4_X
Y = df4_Y

X.shape

Y.shape

"""# PART 4 : ML

📘 **Steps:**

1.   Importing **Seaborn** & **Sklearn** 📚
2.   Creating **ML** model Using **RandomForest** 📬
3.   *Scatter* Plot of results 📈

📘 **Step 4.1:**

* Importing libraries

* Removing Low-Variance data
"""

import seaborn as sns #Second_Time_Importing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_selection import VarianceThreshold
# Low-variance feature filter with threshold 0.8 * (1 - 0.8) = 0.16.
# Note: `selection` was used earlier in the file for lists of column names and
# is rebound here to the selector object; X is rebound to the filtered output.
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X = selection.fit_transform(X)

"""Observing VarianceThreshold effect on X value Change"""

X.shape

"""📘 **Step 4.2:**


* **Data split (80/20 ratio)**
* **Using RandomForest as Model**

🔷 80% Training - 20% Testing 🔷
"""

# 80/20 train/test split. random_state is fixed (same value as the split used
# for the LazyPredict comparison later in the file) so the reported score is
# reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train.shape, Y_train.shape

X_test.shape, Y_test.shape

# Random forest with 100 trees; seeded for the same reason as the split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)
# score() is evaluated on the held-out test split.
r2 = model.score(X_test, Y_test)
r2

"""
**Interpretation:**

**0:** The model explains no variance in the dependent variable.

**1:** The model explains all variance in the dependent variable (perfect fit, which is uncommon in real-world applications)."""

# Predictions of the fitted model for the held-out test split.
Y_pred = model.predict(X_test)

"""📘 **Step 4.3:**

**Experimental vs Predicted pIC50 Values**
"""

sns.set(color_codes=True)
sns.set_style("white")

# Combine Y_test and Y_pred into a single DataFrame for sns.regplot()
df_ML = pd.DataFrame({'Experimental pIC50': Y_test, 'Predicted pIC50': Y_pred})

# Scatter of predicted against experimental pIC50 with a fitted regression line.
ax = sns.regplot(x='Experimental pIC50', y='Predicted pIC50', data=df_ML, scatter_kws={'alpha':0.4,'edgecolor':'black'})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
# Same limits on both axes and a square figure.
ax.set_xlim(0, 15)
ax.set_ylim(0, 15)
ax.figure.set_size_inches(6, 6)
plt.show()

"""# Part 5: Finding best ML model

📘 **Steps:**

1. Importing Lazypredict library 📚

2. Using **LazyPredict** for Model comparison

3. Charts

---

 ☁ **Lazypredict:**

 offers a rapid way to compare multiple algorithms on your dataset without extensive hyperparameter tuning.

📘 **Step 5.1**

Installing the library
"""

# Install LazyPredict. A bare `pip install ...` line only works as a notebook
# magic and is a syntax error in a plain Python file, so pip is invoked
# through the current interpreter instead.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "lazypredict"])

import lazypredict
from lazypredict.Supervised import LazyRegressor

"""📘 **Step 5.2:**

Running LazyPredict
"""

# 80/20 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

clf = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
# First call: the training split is passed in both the train and the test
# positions, so these results describe performance on the training data.
# Second call: the held-out test split is passed in the test positions.
# NOTE(review): argument order assumed to be (X_train, X_test, y_train, y_test) - confirm against the lazypredict docs.
models_train,predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)     # train passed twice on purpose: training-set performance
models_test,predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)

predictions_train

predictions_test

"""📘 **Step 5.3:**

Model performance charts 📊

**R-squared values**
"""

# Horizontal bar chart of the "R-Squared" column of predictions_train, one bar
# per row (the row index is used for the y axis).
plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
# One palette color per row.
palette = sns.cubehelix_palette(n_colors=len(predictions_train), start=0.5, rot=-3, dark=0.3, light=0.8, reverse=True)
ax = sns.barplot(y=predictions_train.index, x="R-Squared", data=predictions_train, palette=palette)
# The x axis is limited to 0..1, so bars outside that range are cut off.
ax.set(xlim=(0, 1))

"""**RMSE values**"""

# Horizontal bar chart of the "RMSE" column of predictions_train, one bar per
# row (the row index is used for the y axis).
plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
# One palette color per row.
palette = sns.cubehelix_palette(n_colors=len(predictions_train), start=0.5, rot=-0.75, dark=0.2, light=0.8, reverse=True)
ax = sns.barplot(y=predictions_train.index, x="RMSE", data=predictions_train, palette=palette)
# The x axis is limited to 0..10, so bars longer than 10 are cut off.
ax.set(xlim=(0, 10))

"""**calculation time**"""

# Horizontal bar chart of the "Time Taken" column of predictions_train, one
# bar per row (the row index is used for the y axis).
plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
# One palette color per row.
palette = sns.cubehelix_palette(n_colors=len(predictions_train), start=0.5, rot=-0.75, dark=0.2, light=0.8, reverse=True)
ax = sns.barplot(y=predictions_train.index, x="Time Taken", data=predictions_train, palette=palette)
# The x axis is limited to 0..10, so bars longer than 10 are cut off.
ax.set(xlim=(0, 10))