diff --git a/QligFEP.py b/QligFEP.py index 7d6ed0f..92b8530 100644 --- a/QligFEP.py +++ b/QligFEP.py @@ -713,61 +713,115 @@ def write_runfile(self, writedir, file_list): run_threads = '{}'.format(int(replacements['NTASKS'])) with open(src) as infile, open(tgt, 'w') as outfile: - for line in infile: - if line.strip() == '#SBATCH -A ACCOUNT': - try: - replacements['ACCOUNT'] - - except: - line = '' - outline = IO.replace(line, replacements) - outfile.write(outline) - - if line.strip() == '#EQ_FILES': - for line in EQ_files: - file_base = line.split('/')[-1][:-4] - outline = 'time mpirun -np {} $qdyn {}.inp' \ - ' > {}.log\n'.format(ntasks, - file_base, - file_base) - outfile.write(outline) - - if line.strip() == '#RUN_FILES': - if self.start == '1': - for line in MD_files: + if self.cluster=='CSB': + + for line in infile: + if line.strip() == '#SBATCH -A ACCOUNT': + try: + replacements['ACCOUNT'] + + except: + line = '' + outline = IO.replace(line, replacements) + outfile.write(outline) + + if line.strip() == '#EQ_FILES': + for line in EQ_files: file_base = line.split('/')[-1][:-4] - outline = 'time mpirun -np {} $qdyn {}.inp' \ - ' > {}.log\n'.format(ntasks, - file_base, - file_base) + outline = 'time mpirun -np {} $qdyn {}.inp' \ + ' > {}.log\n'.format(ntasks, + file_base, + file_base) + outfile.write(outline) + + if line.strip() == '#RUN_FILES': + if self.start == '1': + for line in MD_files: + file_base = line.split('/')[-1][:-4] + outline = 'time mpirun -np {} $qdyn {}.inp' \ + ' > {}.log\n'.format(ntasks, + file_base, + file_base) + outfile.write(outline) + + elif self.start == '0.5': + outline = 'time mpirun -np {} $qdyn {}.inp' \ + ' > {}.log\n\n'.format(ntasks, + 'md_0500_0500', + 'md_0500_0500') outfile.write(outline) + for i, md in enumerate(md_1): + outline1 = 'time mpirun -np {:d} $qdyn {}.inp' \ + ' > {}.log &\n'.format(int(int(ntasks)/2), + md_1[i][:-4], + md_1[i][:-4]) + + outline2 = 'time mpirun -np {:d} $qdyn {}.inp' \ + ' > {}.log\n'.format(int(int(ntasks)/2), + md_2[i][:-4], + md_2[i][:-4]) + + outfile.write(outline1) + outfile.write(outline2) + outfile.write('\n') + elif self.cluster=='TETRA': + + for line in infile: + if line.strip() == '#SBATCH -A ACCOUNT': + try: + replacements['ACCOUNT'] - elif self.start == '0.5': - outline = 'time mpirun -np {} $qdyn {}.inp' \ - ' > {}.log\n\n'.format(ntasks, - 'md_0500_0500', - 'md_0500_0500') - outfile.write(outline) - for i, md in enumerate(md_1): - outline1 = 'time mpirun -np {:d} $qdyn {}.inp' \ - ' > {}.log &\n'.format(int(int(ntasks)/2), - md_1[i][:-4], - md_1[i][:-4]) - - outline2 = 'time mpirun -np {:d} $qdyn {}.inp' \ - ' > {}.log\n'.format(int(int(ntasks)/2), - md_2[i][:-4], - md_2[i][:-4]) - - outfile.write(outline1) - outfile.write(outline2) - outfile.write('\n') + except: + line = '' + outline = IO.replace(line, replacements) + outfile.write(outline) + + if line.strip() == '#EQ_FILES': + for line in EQ_files: + file_base = line.split('/')[-1][:-4] + outline = 'time mpprun -np {} $qdyn {}.inp' \ + ' > {}.log\n'.format(ntasks, + file_base, + file_base) + outfile.write(outline) - if self.cluster == 'ALICE': + if line.strip() == '#RUN_FILES': + if self.start == '1': + for line in MD_files: + file_base = line.split('/')[-1][:-4] + outline = 'time mpprun -np {} $qdyn {}.inp' \ + ' > {}.log\n'.format(ntasks, + file_base, + file_base) + outfile.write(outline) + + elif self.start == '0.5': + outline = 'time mpprun -np {} $qdyn {}.inp' \ + ' > {}.log\n\n'.format(ntasks, + 'md_0500_0500', + 'md_0500_0500') + outfile.write(outline) + for i, md in enumerate(md_1): + outline1 = 'time mpprun -np {:d} $qdyn {}.inp' \ + ' > {}.log &\n'.format(int(int(ntasks)/2), + md_1[i][:-4], + md_1[i][:-4]) + + outline2 = 'time mpprun -np {:d} $qdyn {}.inp' \ + ' > {}.log\n'.format(int(int(ntasks)/2), + md_2[i][:-4], + md_2[i][:-4]) + + outfile.write(outline1) + outfile.write(outline2) + outfile.write('\n') + elif self.cluster == 'ALICE': outfile.write('rm *.dcd\n') outfile.write('rm *.en\n') outfile.write('rm *.re\n') - + else: + raise ValueError(f"Unknown cluster: {self.cluster}") + def write_qfep(self, inputdir, windows, lambdas): qfep_in = s.ROOT_DIR + '/INPUTS/qfep.inp' qfep_out = writedir + '/inputfiles/qfep.inp' @@ -978,5 +1032,4 @@ def parseargs(args: list[str] = []) -> argparse.Namespace: run.write_submitfile(writedir) run.write_qfep(inputdir, args.windows, lambdas) run.write_qprep(inputdir) - run.qprep(inputdir) - + run.qprep(inputdir) \ No newline at end of file diff --git a/call_analyze_FEP.py b/call_analyze_FEP.py new file mode 100644 index 0000000..4fa44c6 --- /dev/null +++ b/call_analyze_FEP.py @@ -0,0 +1,25 @@ +import os +import subprocess +import sys + +def run_analyze_FEP(parent_folder, command_args): + for root, dirs, files in os.walk(parent_folder): + for dir_name in dirs: + if dir_name.startswith('FEP_'): + full_path = os.path.join(root, dir_name) + cmd = ['python', 'analyze_FEP.py', '-F', full_path] + command_args + subprocess.run(cmd) + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("Usage: python script_name.py ") + sys.exit(1) + + parent_folder = sys.argv[1] + command_args = sys.argv[2:] + + if not os.path.isdir(parent_folder): + print(f"Error: '{parent_folder}' is not a valid directory.") + sys.exit(1) + + run_analyze_FEP(parent_folder, command_args) diff --git a/template/1.ligprep/ligand.md b/template/1.ligprep/ligand.md new file mode 100644 index 0000000..f0b5030 --- /dev/null +++ b/template/1.ligprep/ligand.md @@ -0,0 +1 @@ +Move all your ligands here. See README.md for further instructions diff --git a/template/2.protprep/prot.md b/template/2.protprep/prot.md new file mode 100644 index 0000000..73fc2d8 --- /dev/null +++ b/template/2.protprep/prot.md @@ -0,0 +1 @@ +Move the protein here. See README.md for further instructions diff --git a/template/3.setupFEP/analyze_FEP.py b/template/3.setupFEP/analyze_FEP.py new file mode 100644 index 0000000..46ba6a1 --- /dev/null +++ b/template/3.setupFEP/analyze_FEP.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python + +import argparse +import glob +import numpy as np +import pandas as pd +import os +import csv + +import functions as f +import settings as s +import IO + +try: + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + plot = True +except: + print('cannot import matplotlib, skipping plot generation') + plot = False + +class Run(object): + """ + """ + def __init__(self, FEP, color, PDB, cluster, esc, *args, **kwargs): + self.esc = esc + self.cluster=cluster + self.FEP = FEP.strip('/') + self.energies = {} + self.FEPstages = [] + FEPfiles = glob.glob(self.FEP + '/inputfiles/FEP*.fep') + inputs = glob.glob(self.FEP + '/inputfiles/md*.inp') + inputs = [x for x in inputs if not '_F.inp' in x] + FEPfiles.sort() + self.failed = [] + for FEPfile in FEPfiles: + FEPstage = FEPfile.split('/')[-1] + FEPstage = FEPstage.split('.')[0] + self.FEPstages.append(FEPstage) + + self.lambda_sum = len(FEPfiles) * (len(inputs)-1) + + colors = {'blue':['navy','lightblue'], + 'red' :['darkred','mistyrose'] + } + + self.color = colors[color] + + def create_environment(self): + self.analysisdir = self.FEP + '/analysis' + # Add overwrite function? + if os.path.isdir(self.analysisdir) != True: + os.mkdir(self.analysisdir) + + def read_FEPs(self): + methods_list = ['dG', 'dGf', 'dGr', 'dGos', 'dGbar'] + methods = {'dG' : {}, + 'dGf' : {}, + 'dGr' : {}, + 'dGos' : {}, + 'dGbar' : {} + } + results = {} + out = [] + + FEPs = sorted(glob.glob(self.FEP + '/*/*/*/qfep.out')) + for filename in FEPs: + i = -1 + file_parse = filename.split('/') + FEP = file_parse[1] + temperature = file_parse[2] + replicate = file_parse[3] + + try: + if self.esc: + energies = IO.read_qfep_esc(filename) + else: + energies = IO.read_qfep(filename) + except: + print("Could not retrieve energies for: " + filename) + energies = [np.nan, np.nan, np.nan, np.nan, np.nan] + if not replicate in self.failed: + self.failed.append(replicate) + + for key in methods_list: + i += 1 + try: + methods[key][FEP].append(energies[i]) + except: + methods[key][FEP] = [energies[i]] + + # Construct for the energy figure + if not replicate in self.energies: + self.energies[replicate] = {} + + self.energies[replicate][FEP] = IO.read_qfep_verbose(filename) + for method in methods: + dG_array = [] + for key in sorted(methods[method]): + # print(method, key, methods[method][key]) + dG_array.append(methods[method][key]) + dG_array = [[float(i) for i in fep_stage] for fep_stage in dG_array] + dG_array = np.array(dG_array) + dG = f.avg_sem(dG_array) + results[method]='{:6.2f}{:6.2f}'.format(*dG) + + self.methods = methods + print(methods) + + key_method = [] + dG_values = [] + + # Iterate through the dictionary + for key, value in methods.items(): + key_method.append(key) + dG_values.append(value['1.QligFEP_CDK2']) + + # for method in methods: + # print(method) + # print('crashes {}'.format(len(self.failed))) + + def read_mdlog(self): + mapping = {} + cnt = -1 + # Add temperature variable later + md_files = glob.glob(self.FEP + '/FEP*/*/*/md*.log') + md_files.sort() + md_ref = glob.glob(self.FEP + '/inputfiles/md*.inp') + windows = len(glob.glob(self.FEP + '/inputfiles/md*.inp')) - 1 + stages = len(glob.glob(self.FEP + '/inputfiles/FEP*.fep')) + for ref in md_ref: + w = ref.split('/')[-1].split('_')[2].split('.')[0] + cnt += 1 + mapping[w]=cnt + + for md_file in md_files: + stage = md_file.split('/')[1][-1] + l = md_file.split('/')[-1].split('_')[2].split('.')[0] + offset = (int(stage) - 1) * int(windows) + cumulative_l = (mapping[l] + offset) + (cumulative_l) + + + + def SEM_calculator(self, output_file, FEP): + methods = self.methods + existing_file = os.path.isfile(output_file) + if existing_file: + filename_base, file_extension = os.path.splitext(output_file) + index = 1 + while existing_file: + new_filename = f"{filename_base}-{index}{file_extension}" + existing_file = os.path.isfile(new_filename) # Update the existing_file variable + index += 1 + output_file = new_filename + + with open(output_file, 'w', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + + # Add the FEP directory path as a row at the top (header) + csv_writer.writerow(['FEP Directory Path']) + csv_writer.writerow([FEP]) + + csv_writer.writerow(['Key', 'Value', 'SEM']) + for key in methods: + value = methods[key]['1.QligFEP_CDK2'] + value_without_nan = np.array(value)[~np.isnan(value)] + SEM = np.std(value_without_nan) / (np.sqrt(len(value_without_nan))) + csv_writer.writerow([key, value, SEM]) + + def plot_data(self): + y_axis = {} + x_axis = range(0,self.lambda_sum+1) + avg = [] + for replicate in self.energies: + for FEPstage in self.FEPstages: + if not FEPstage in self.energies[replicate] and not replicate in self.failed: + self.failed.append(replicate) + for replicate in self.failed: + try: + del self.energies[replicate] + except: + continue + for replicate in self.energies: + y_axis[replicate] = [0] + dG = 0 + for FEPstage in self.FEPstages: + for energy in self.energies[replicate][FEPstage][0][1:]: + energy = dG + energy + y_axis[replicate].append(energy) + dG+=self.energies[replicate][FEPstage][0][-1] + + for y in y_axis: + for i,energy in enumerate(y_axis[y]): + if len(avg) < self.lambda_sum + 1: + avg.append(energy) + else: + avg[i] += energy + + plt.plot(x_axis,y_axis[y],color=self.color[1]) + y_avg = [x / len(y_axis) for x in avg] + plt.plot(x_axis,y_avg,color=self.color[0]) + axes = plt.gca() + axes.set_xlim([0,self.lambda_sum]) + plt.xlabel(r'cumulative $\lambda$', fontsize=18) + plt.ylabel(r'$\Delta$G (kcal/mol)', fontsize=16) + plt.savefig(self.analysisdir+'/dG.png',dpi=300,transparent=True) + + def write_re2pdb(self): + curdir = os.getcwd() + os.chdir(self.FEP + '/analysis') + if not os.path.exists('pdbs'): + os.mkdir('pdbs') + + libfiles = glob.glob('../inputfiles/*.lib') + re_files = glob.glob('../FEP*/*/*/*.re') + topology = glob.glob('../inputfiles/*.top')[0] + + with open('../inputfiles/qprep.inp') as f: + protlib = f.readline() + + with open('re2pdb.inp', 'w') as outfile: + outfile.write('{}'.format(protlib)) + + for libfile in libfiles: + outfile.write('rl {}\n'.format(libfile)) + + outfile.write('rt {}\n'.format(topology)) + + for re_file in re_files: + pdb_out = re_file.split('/')[-1][:-3] + repeat = '{:02d}'.format(int(re_file.split('/')[3])) + fep_stage = re_file.split('/')[1] + pdb_out = 'pdbs/{}_{}_{}'.format(repeat, fep_stage, pdb_out) + outfile.write('rx {}\n'.format(re_file)) + outfile.write('wp {}.pdb\n'.format(pdb_out)) + outfile.write('y\n') + + outfile.write('mask none\n') + outfile.write('mask not excluded\n') + outfile.write('wp pdbs/complexnotexcluded.pdb\n') + outfile.write('y\n') + + outfile.write('q\n') + + cluster_options = getattr(s, self.cluster) + qprep = cluster_options['QPREP'] + options = ' < re2pdb.inp > re2pdb.out' + # Somehow Q is very annoying with this < > input style so had to implement + # another function that just calls os.system instead of using the preferred + # subprocess module.... + IO.run_command(qprep, options, string = True) + + os.chdir(curdir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='protPREP', + formatter_class=argparse.RawDescriptionHelpFormatter, + description = ' == Analyse FEP == ') + + + parser.add_argument('-F', '--FEP', + dest = "FEP", + required = True, + help = "name of FEP directory (FEP_$)") + + parser.add_argument('-pdb', '--PDB', + dest = "PDB", + required = False, + default = False, + action = 'store_true', + help = "Add this argument if you want .pdb files of the trajectory") + + parser.add_argument('-c', '--color', + dest = "color", + required = False, + default = 'blue', + choices = ['blue', 'red'], + help = "color for the plot") + + parser.add_argument('-C', '--cluster', + dest = "cluster", + required = False, + help = "cluster information") + parser.add_argument('-esc', '--end-state-catastrophe', + dest = "esc", + required = False, + help = "Add this argument in case you have singularities in the final lambda windows resulting in only nan values") + + args = parser.parse_args() + run = Run(FEP = args.FEP, + color = args.color, + cluster = args.cluster, + PDB = args.PDB, + esc = args.esc + ) + + run.create_environment() + run.read_FEPs() + run.SEM_calculator('dG.csv', args.FEP) + run.read_mdlog() + + if plot == True: + run.plot_data() + + if args.PDB == True: + run.write_re2pdb() \ No newline at end of file diff --git a/template/3.setupFEP/analyze_allFEP.py b/template/3.setupFEP/analyze_allFEP.py new file mode 100644 index 0000000..f00d41f --- /dev/null +++ b/template/3.setupFEP/analyze_allFEP.py @@ -0,0 +1,172 @@ +import glob +import numpy as np +import math + +def get_results(): + results = {} + residues = [] + + for filename in glob.glob('1.*/*/FEP_*/FEP*/*/*'): + line = filename.split('/') + run = line[0] + FEP = line[1] + name = line[2] + '_' + line[1].split('.')[1] + replicate = line[-1] + ERROR = False + + with open(filename + '/qfep.out') as infile: + block = 0 + for line in infile: + line = line.split() + if len(line) > 3: + if line[0] == 'ERROR:' or line[1] == 'ERROR:' or line[1] == 'Failed': + ERROR = True + + if line[3] == 'Free': + block = 1 + + if line[3] == 'Termodynamic': + block = 2 + + if line[3] == 'Overlap': + block = 3 + + if line[3] == 'BAR': + block = 4 + + if line[3] == 'Reaction': + block = 0 + + if len(line) > 1: + if block == 1: + if line[0] == '1.000000': + dGr = line[4] + + elif line[0] == '0.000000': + dGf = line[2] + + if line[5] == '-Infinity': + dG = np.nan + + else: + dG = float(line[5]) + + if block == 2 and line[0] == '0.000000': + #dG_TI = line[2] + dG_TI = np.nan + + if block == 3 and line[0] == '0.000000': + if line[2] == '-Infinity': + dG_overlap = np.nan + + else: + dG_overlap = float(line[2]) + + if block == 4 and line[0] == '0.000000': + if line[2] == '-Infinity': + dG_BAR = np.nan + else: + dG_BAR = float(line[2]) + + if ERROR != True: + data = [name, replicate, dG, dG_overlap, dG_BAR] + + else: + data = [name, replicate, np.nan, np.nan, np.nan] + + if name in results: + #if len(results[name]) < 10: # temp fix + results[name].append(data) + else: + results[name] = [data] + return results + +def calc_sum_error(data): + cnt = 0 + for line in data: + if np.isnan(line) == True: + continue + else: + cnt = cnt + 1 + #data = np.array(data) + try: + data = np.array(data) + mean = np.nanmean(data) + sem = np.nanstd(np.array(data), ddof =1)/np.sqrt(cnt) #Check if this is correct! + return mean, sem + except: + return None, None + #return mean, sem + +def calc_ddG(raw_data): + with open('results.txt', 'w') as outfile: + outfile.write('FEP Zwanzig error OS error BAR error\n') + dG = {} + for name in raw_data: + scores = [] + a = name.split('_') + FEP = a[0] + '_' + a[1] + print(FEP) + system = a[-1] + scoring = [[], [], []] + for line in raw_data[name]: + scoring[0].append(line[2]) + scoring[1].append(line[3]) + scoring[2].append(line[4]) + + for data in scoring: + scores.append(calc_sum_error(data)) + + scores = [system,scores] + if FEP not in dG: + dG[FEP] = [scores] + + else: + dG[FEP].append(scores) + + for key in dG: + data = dG[key] + for system in data: + if system[0] == 'protein': + Zwanzig_avg_prot= system[1][0][0] + Zwanzig_err_prot= system[1][0][1] + OS_avg_prot= system[1][1][0] + OS_err_prot= system[1][1][1] + BAR_avg_prot= system[1][2][0] + BAR_err_prot= system[1][2][1] + + if system[0] == 'water': + Zwanzig_avg_wat= system[1][0][0] + Zwanzig_err_wat= system[1][0][1] + OS_avg_wat= system[1][1][0] + OS_err_wat= system[1][1][1] + BAR_avg_wat= system[1][2][0] + BAR_err_wat= system[1][2][1] + + try: + ddG_Zwanzig = Zwanzig_avg_prot - Zwanzig_avg_wat + ddG_OS = OS_avg_prot - OS_avg_wat + ddG_BAR = BAR_avg_prot - BAR_avg_wat + ddG_Zwanzig_error = (Zwanzig_err_prot + Zwanzig_err_wat)/math.sqrt(2) + ddG_OS_error = (OS_err_prot + OS_err_wat)/math.sqrt(2) + ddG_BAR_error = (BAR_err_prot + BAR_err_wat)/math.sqrt(2) + except: + ddG_Zwanzig = 0 + ddG_Zwanzig_error = 0 + ddG_OS = 0 + ddG_OS_error = 0 + ddG_BAR = 0 + ddG_BAR_error = 0 + outfile.write('{:30}{:8.2f}{:8.2f}{:8.2f}{:8.2f}{:8.2f}{:8.2f}\n'.format(key, + ddG_Zwanzig, + ddG_Zwanzig_error, + ddG_OS, + ddG_OS_error, + ddG_BAR, + ddG_BAR_error + )) + + + +raw_data = get_results() +calc_ddG(raw_data) diff --git a/template/3.setupFEP/pairs.txt b/template/3.setupFEP/pairs.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/template/3.setupFEP/pairs.txt @@ -0,0 +1 @@ + diff --git a/template/3.setupFEP/setup.py b/template/3.setupFEP/setup.py new file mode 100644 index 0000000..cded706 --- /dev/null +++ b/template/3.setupFEP/setup.py @@ -0,0 +1,103 @@ +import os +import shutil +import glob +import re + +# Get the path to the '1.ligprep' directory +current_folder = os.path.dirname(os.path.abspath(__file__)) +ligandprep_folder = os.path.join(current_folder, '..', '1.ligprep') + +# Check if the '1.ligprep' directory exists +if not os.path.exists(ligandprep_folder): + print("Error: '1.ligprep' directory not found.") +else: + # Import ligand files needed for QligFEP + lig_files = glob.glob(os.path.join(ligandprep_folder, '*.pdb')) + \ + glob.glob(os.path.join(ligandprep_folder, '*.lib')) + \ + glob.glob(os.path.join(ligandprep_folder, '*.prm')) + + for file in lig_files: + shutil.copy(file, '.') + +# Get the path to the '2.protprep' directory +protprep_folder = os.path.join(current_folder, '..', '2.protprep') + +# Check if the '2.protprep' directory exists +if not os.path.exists(protprep_folder): + print("Error: '2.protprep' directory not found.") + print("Have you used --noclean when calling qligfep/protprep.py?") +else: + # Import 'protein.pdb' + protein_pdb = os.path.join(protprep_folder, 'protein.pdb') + if os.path.exists(protein_pdb): + shutil.copy(protein_pdb, '.') + + # Import 'water.pdb' + water_pdb = os.path.join(protprep_folder, 'water.pdb') + if os.path.exists(water_pdb): + shutil.copy(water_pdb, '.') + +# Name for folders. Can be changed. +systems = ['protein-TETRA', 'water-TETRA'] +cnt = 0 + +# Change this to where you installed QligFEP +setupFEP = 'python /home/USER/QLIGFEP/qligfep/QligFEP.py' +windows = '100' + +# Open qprep.inp where cysbond to add are specified. +if os.path.isfile(protprep_folder+'/qprep.inp'): + with open(protprep_folder+'/qprep.inp', 'r') as file: + file_content = file.read() + + # Use regular expressions to find lines with 'addbond' and extract the numbers. + bond_lines = re.findall(r'addbond (\d+):SG (\d+):SG', file_content) + bonds = {} + for idx, (atom1, atom2) in enumerate(bond_lines, 1): + bond_name = f"bond-{idx}" + bonds[bond_name] = [int(atom1), int(atom2)] + + cys = [] + for x, i in bonds.values(): + with open('protein.pdb', 'r') as file: + for line in file: + column = line.split() + if column[0] == 'ATOM' and column[2] == 'SG': + if column[4] == str(x): + atom_1= column[1] + cys.append(atom_1) + if column[4] == str(i): + atom_2 = column[1] + cys.append(atom_2) + pairs = [cys[i] + '_' + cys[i+1] for i in range(0, len(cys), 2)] # Add '_' between atoms and match style required by QligFEP + cysbond = ','.join(pairs) + print('cysbond added: ', cysbond) + for system in systems: + cnt += 1 + directory = str(cnt) + '.' + system + + # Check if the directory already exists and create a new one with a different name if needed + new_directory = directory + suffix = 2 + while os.path.exists(new_directory): + new_directory = f"{directory}-{suffix}" + suffix += 1 + + os.mkdir(new_directory) + + # Call QligFEP + with open('pairs.txt') as infile: + for line in infile: + line = line.split() + mol1 = line[0] + mol2 = line[1] + if system == systems[1]: + call = setupFEP + ' -l1 ' + mol1 + ' -l2 ' + mol2 + ' -FF OPLS2015 -s water -S sigmoidal -c TETRA -r 25 -l 1 -w' + windows + if system == systems[0]: + call = setupFEP + ' -l1 ' + mol1 + ' -l2 ' + mol2 + ' -FF OPLS2015 -b' + cysbond + ' -S sigmoidal -s protein -c TETRA -r 25 -l 1 -w' + windows + src = 'FEP_' + mol1 + '-' + mol2 + dst = os.path.join(new_directory, 'FEP_' + mol1 + '-' + mol2) + os.system(call) + shutil.move(src, dst) +else: + print('Error: qprep.inp file not found in 2.protprep folder. Have you used --noclean when calling protprep.py?') diff --git a/template/3.setupFEP/submit_FEP.py b/template/3.setupFEP/submit_FEP.py new file mode 100644 index 0000000..2b63b45 --- /dev/null +++ b/template/3.setupFEP/submit_FEP.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +import os + +# Get all folders starting with "FEP" +folders = [folder for folder in os.listdir('.') if folder.startswith('FEP') and os.path.isdir(folder)] + +# Loop through folders +for folder in folders: + print(f"Entering folder: {folder}") + os.chdir(folder) + print(f"Running FEP_submit.sh in {folder}") + os.system("sbatch FEP_submit.sh") + # Return to the original directory + os.chdir('..') diff --git a/template/README.md b/template/README.md new file mode 100644 index 0000000..35b725b --- /dev/null +++ b/template/README.md @@ -0,0 +1,95 @@ +Before starting: +To make your workflow easier, copy the `template` folder in the repository. This contains everything you will use for the common usecase for QligFEP. +Locate the .pdb files for your ligands in `1.ligprep` folder and your_protein.pdb in the `2.protprep` folder. + +# 1. Ligand Preparation +We initiate the process with ligands generated in Maestro, typically provided as PDB files. However, to conduct MD simulations, we require specific parameters that are not available initially. Hence, the ligands undergo a crucial step known as ligand preparation. + + +The folder contains pre-generated 3D-coordinates of the ligands, which are not generated by setupFEP. Therefore, it is our responsibility to ensure that these coordinates are reliable. It is essential to bear in mind that FEP handles small changes in the system effectively, but reliable results are obtained only when the phase space sufficiently overlaps. The ligands in this folder share a similar core region, with variations in small substituents being investigated. For AMBER parameters, please refer to qtools: + + +### Generate the files using a script from the script folder. +To generate the necessary files using a the script located in the script folder, follow these steps: +1. Move to 1.ligprep folder from the template. If you're not using the template provided, create a folder to store your ligands and call it `1.ligprep`: +* `mkdir 1.ligprep` +2. Navigate to the newly created directory: +* `cd 1.ligprep` +3. Place all your ligand.pdb files inside this folder. If your files are in your local machine and you want to use a Cluster, you can use the following command: +* `scp /path/to/local/file username@cluster_ip:/path/on/cluster/` +4. Finally, run the following command to execute the script: +* `python $qligfep/scripts/generate_opls.py`. +This script will produce the necessary .lib, .prm, and .pdb files required for the third stage. +The .log files are generated by ffld_server. + +Important! + * Make sure that you've exported qligfep to your .bashrc file (or any other of the common Unix shells) so you can call qligfep from a different folder by using `$qligfep`. +If you haven't, you can do it following these steps: + 1. `cd` to your home folder + 2. `nano ~/.bashrc` + 3. Add this to the end of the file: `export qligfep=/home/USERNAME/QLIGFEP/qligfep` + - Keep in mind that `/home/USERNAME/QLIGFEP/qligfep` should be replaced with the PATH to your qligfep folder. If you're not sure of what the path to it is, go to your qligfep folder and run `pwd` to find out. Then, simply replace `/home/USERNAME/QLIGFEP/qligfep` with the returned path from pwd. + 5. Use `source ~/.bashrc` to apply the changes + + +# 2. Protprep +The second step involves preparing the protein for simulations under spherical boundary conditions (SBC). Please note that this script does not assign protonation states, so a preparation step should be executed beforehand (e.g., using Maestro's Protein Preparation Wizard). + +The center of the sphere can be determined based on a residue in the protein or, as in this case, by explicitly providing the center of geometry (COG) coordinates from a ligand. For example, to obtain the COG coordinates used in this example, run: + +`python "$qligfep/scripts/COG.py" -i ../1.ligprep/17.pdb` + +! Make sure to **add -i before the pdb file**. +This will return the coordinates [0.535:26.772:8.819], which can be directly put in protprep.py: + +`python "$qligfep/protprep.py" -p 1h1s_PrepWiz.pdb -r 22 -c [0.535:26.772:8.819] -w -P CSB` + +After this, you should have three new files in 2.protprep: protPREP.log protein.pdb water.pdb. + +**Note: Prior to executing the above command, ensure that you have navigated to the folder containing the "1h1s_PrepWiz.pdb" file.** + +Furthermore, it is essential to bear in mind that the ligands under investigation share a similar core region, with only minor variations in small substituents. During previous phases, these ligands were superimposed, likely using Maestro. As a result, their centers of geometry are nearly identical. Therefore, we need to perform this process for just one ligand, saving time and effort. + +# 3.setupFEP + +In the next stage we will prepare the input files for the actual simulations. This stage uses $qligfep/QligFEP.py, you can use the -h flag to get more specifics on all the input variables. This folder includes a useful top level script, generate.py, that takes a .txt file as input that specifies the ligand pairs that will be used in this simulation. You can simply run: + +`python setup.py` + +And this should result in a 1.protein and 2.water folder, containing the two systems necessary to perform the calculation. Simply go into the folder (1.PROTEIN OR 2.WATER) and run + +`./FEP_submit.sh` + +This script will either submit your jobs to a slurm queue (see the setup section in $setupFEP/README.md for a more detailed description). Alternatively the jobs can be run on your local machine. + +A typical procedure is to put the two systems in a top layer folder to easily track running calculations (e.g. 1.TESTRUN) in the case of this example. Of course, these toplayer scripts can be easily adjusted for any other use (e.g. to calculate solvation free energies, you add a water and vacuum leg, instead of protein water. + + +# 4. Analyze + +Run `python analyze_FEP.py -F tutorials/1.QligFEP_CDK2/3.setupFEP/1.protein/FEP_22-17 -C CSB` from the `QLIGFEP/qligfep` folder. +Otherwise it won't work because functions.py cannot be imported. + +Additionally, you can try using collect_dG.py to analyze several folders at once. See the instructions provided on how to use this script below the page. The output of the analysis will contain dG, dGf, dGr, dGos, and dGbar, representing different ways of calculating dG. For most cases, dGbar is used. There will be 10 values corresponding to 10 replicas (runs), and some values may be "nan," which is normal. Thanks to changes in this forked repository, SEM is calculated only with existing values (nan values are dropped). + +To change the number of runs, modify the FEP_submit.sh file located in the path: /home/USER/QLIGFEP/qligfep/tutorials/1.QligFEP_CDK2/3.setupFEP/1.protein/FEP_17-22 + + +## How to Use collect_dG.py + +`collect_dG.py` is a python script that allows you to call analyze_FEP.py for different folders (usually 1.protein/FEP_pair and 2.water/FEP_pair) with a single command and stores the results in different .csv files so it's easier to work with the results later on. + +To use collect_dG.py, you need to provide at least 4 arguments when calling it on the command line: +- collect_dG.py +- folder_name (the folder where the .csv files will be stored) +- path_to_folder1 +- path_to_folder2 + +and is called from the qligfep folder like this: `python collect_dG.py folder_name path_to_folder1 path_to_folder2` + +#### Example for tutorial. +After going trough steps 1, 2, 3 and from `qligfep`folder write +`python collect_dG.py dG-FEP_17-22_2 tutorials/1.QligFEP_CDK2/3.setupFEP/1.protein/FEP_17-22 tutorials/1.QligFEP_CDK2/3.setupFEP/2.water/FEP_17-22` + +Now, there should be 2 .csv files dG.csv and dG-1.csv inside the dG-FEP_17-22 folder. Each .csv file contains the path to the folder from which the dG values were calculated along with the calculation results. In later versions, it is expected to create a single .csv file with all the results (problems with merging the files are being addressed). + diff --git a/tutorials/1.QligFEP_CDK2/3.setupFEP/setup.py b/tutorials/1.QligFEP_CDK2/3.setupFEP/setup.py index 0dc6daa..cded706 100644 --- a/tutorials/1.QligFEP_CDK2/3.setupFEP/setup.py +++ b/tutorials/1.QligFEP_CDK2/3.setupFEP/setup.py @@ -1,31 +1,103 @@ import os import shutil import glob +import re -systems = ['protein', 'water'] +# Get the path to the '1.ligprep' directory +current_folder = os.path.dirname(os.path.abspath(__file__)) +ligandprep_folder = os.path.join(current_folder, '..', '1.ligprep') + +# Check if the '1.ligprep' directory exists +if not os.path.exists(ligandprep_folder): + print("Error: '1.ligprep' directory not found.") +else: + # Import ligand files needed for QligFEP + lig_files = glob.glob(os.path.join(ligandprep_folder, '*.pdb')) + \ + glob.glob(os.path.join(ligandprep_folder, '*.lib')) + \ + glob.glob(os.path.join(ligandprep_folder, '*.prm')) + + for file in lig_files: + shutil.copy(file, '.') + +# Get the path to the '2.protprep' directory +protprep_folder = os.path.join(current_folder, '..', '2.protprep') + +# Check if the '2.protprep' directory exists +if not os.path.exists(protprep_folder): + print("Error: '2.protprep' directory not found.") + print("Have you used --noclean when calling qligfep/protprep.py?") +else: + # Import 'protein.pdb' + protein_pdb = os.path.join(protprep_folder, 'protein.pdb') + if os.path.exists(protein_pdb): + shutil.copy(protein_pdb, '.') + + # Import 'water.pdb' + water_pdb = os.path.join(protprep_folder, 'water.pdb') + if os.path.exists(water_pdb): + shutil.copy(water_pdb, '.') + +# Name for folders. Can be changed. +systems = ['protein-TETRA', 'water-TETRA'] cnt = 0 + # Change this to where you installed QligFEP -setupFEP = 'python $qligfep/QligFEP.py' -cysbond = ' ' - -for system in systems: - cnt += 1 - directory = str(cnt) + '.' + system - os.mkdir(directory) - with open('pairs.txt') as infile: - - for line in infile: - line = line.split() - mol1 = line[0] - mol2 = line[1] - if system == 'water': - - call = setupFEP + ' -l1 ' + mol1 + ' -l2 ' + mol2 + ' -FF OPLS2015 -s water -c CSB -r 22 -l 0.5' - - if system == 'protein': - call = setupFEP + ' -l1 ' + mol1 + ' -l2 ' + mol2 + ' -FF OPLS2015 -s protein -c CSB -r 22 -l 0.5' - - src = 'FEP_' + mol1 + '-' + mol2 - dst = directory + '/FEP_' + mol1 + '-' + mol2 - os.system(call) - shutil.move(src, dst) +setupFEP = 'python /home/USER/QLIGFEP/qligfep/QligFEP.py' +windows = '100' + +# Open qprep.inp where cysbond to add are specified. +if os.path.isfile(protprep_folder+'/qprep.inp'): + with open(protprep_folder+'/qprep.inp', 'r') as file: + file_content = file.read() + + # Use regular expressions to find lines with 'addbond' and extract the numbers. + bond_lines = re.findall(r'addbond (\d+):SG (\d+):SG', file_content) + bonds = {} + for idx, (atom1, atom2) in enumerate(bond_lines, 1): + bond_name = f"bond-{idx}" + bonds[bond_name] = [int(atom1), int(atom2)] + + cys = [] + for x, i in bonds.values(): + with open('protein.pdb', 'r') as file: + for line in file: + column = line.split() + if column[0] == 'ATOM' and column[2] == 'SG': + if column[4] == str(x): + atom_1= column[1] + cys.append(atom_1) + if column[4] == str(i): + atom_2 = column[1] + cys.append(atom_2) + pairs = [cys[i] + '_' + cys[i+1] for i in range(0, len(cys), 2)] # Add '_' between atoms and match style required by QligFEP + cysbond = ','.join(pairs) + print('cysbond added: ', cysbond) + for system in systems: + cnt += 1 + directory = str(cnt) + '.' + system + + # Check if the directory already exists and create a new one with a different name if needed + new_directory = directory + suffix = 2 + while os.path.exists(new_directory): + new_directory = f"{directory}-{suffix}" + suffix += 1 + + os.mkdir(new_directory) + + # Call QligFEP + with open('pairs.txt') as infile: + for line in infile: + line = line.split() + mol1 = line[0] + mol2 = line[1] + if system == systems[1]: + call = setupFEP + ' -l1 ' + mol1 + ' -l2 ' + mol2 + ' -FF OPLS2015 -s water -S sigmoidal -c TETRA -r 25 -l 1 -w' + windows + if system == systems[0]: + call = setupFEP + ' -l1 ' + mol1 + ' -l2 ' + mol2 + ' -FF OPLS2015 -b' + cysbond + ' -S sigmoidal -s protein -c TETRA -r 25 -l 1 -w' + windows + src = 'FEP_' + mol1 + '-' + mol2 + dst = os.path.join(new_directory, 'FEP_' + mol1 + '-' + mol2) + os.system(call) + shutil.move(src, dst) +else: + print('Error: qprep.inp file not found in 2.protprep folder. Have you used --noclean when calling protprep.py?') diff --git a/tutorials/1.QligFEP_CDK2/README.md b/tutorials/1.QligFEP_CDK2/README.md index 8d72a7a..3703f6a 100644 --- a/tutorials/1.QligFEP_CDK2/README.md +++ b/tutorials/1.QligFEP_CDK2/README.md @@ -1,128 +1,246 @@ -# QligFEP +#### Why perform Molecular Dynamics (MD) of a Docking Pose? + +Molecular Dynamics (MD) simulations of docking poses serve a crucial purpose in validating and refining initial docking results. When docking is initially performed, it is often done under vacuum conditions, leading to the generation of hypothetical conformations for the protein-ligand complex. However, to ensure the accuracy and reliability of the identified pose, it becomes imperative to explore the system's behavior and dynamics in various solvent environments. + + +#### Workflow Template + +To streamline your workflow, you can copy the template folder provided in the repository. This folder contains all the necessary files and resources for the common use case in QligFEP. Specifically, you will need to place your ligand .pdb files in the 1.ligprep folder and your_protein.pdb in the 2.protprep folder. +For each step, move to the folder named as the step itself. + + +# 1. Ligand Preparation +We initiate the process with ligands generated in Maestro, typically provided as PDB files. However, to conduct MD simulations, we require specific parameters that are not available initially. Hence, the ligands undergo a crucial step known as ligand preparation. +For correct usage of QligFEP, dock every ligand in the desired protein, find the best pose for it (i.e. Minimised energy, known interactions etc.) and then align all the ligands with similar core region. +We won't go in detail on how to get to this. + +The folder contains pre-generated 3D-coordinates of the ligands, which are not generated by setupFEP. Therefore, it is our responsibility to ensure that these coordinates are reliable. It is essential to bear in mind that FEP handles small changes in the system effectively, but reliable results are obtained only when the phase space sufficiently overlaps. The ligands in this folder share a similar core region, with variations in small substituents being investigated. For AMBER parameters, please refer to qtools. + + +### 1.1 Generate the files using a script from the script folder. +To generate the necessary files using a the script located in the script folder, follow these steps: +1. Move to 1.ligprep folder from the template. If you're not using the template provided, create a folder to store your ligands and call it `1.ligprep`: +* `mkdir 1.ligprep` +2. Navigate to the newly created directory: +* `cd 1.ligprep` +3. Place all your ligand.pdb files inside this folder. If your files are in your local machine and you want to use a Cluster, you can use the following command: +* `scp /path/to/local/file username@cluster_ip:/path/on/cluster/` +4. Finally, run the following command to execute the script: +* `python $qligfep/scripts/generate_opls.py`. +This script will produce the necessary .lib, .prm, and .log files required for the third stage. +The .log files are generated by ffld_server. + +Important! + * Make sure that you've exported qligfep to your .bashrc file (or any other of the common Unix shells) so you can call qligfep from a different folder by using `$qligfep`. +If you haven't, you can do it following these steps: + 1. `cd` to your home folder + 2. `nano ~/.bashrc` + 3. Add this to the end of the file: `export qligfep=/home/USERNAME/QLIGFEP/qligfep` + - Keep in mind that `/home/USERNAME/QLIGFEP/qligfep` should be replaced with the PATH to your qligfep folder. If you're not sure of what the path to it is, go to your qligfep folder and run `pwd` to find out. Then, simply replace `/home/USERNAME/QLIGFEP/qligfep` with the returned path from pwd. + 5. Use `source ~/.bashrc` to apply the changes + + +# 2.Protprep +The second step involves preparing the protein for simulations under spherical boundary conditions (SBC). Please note that this script does not assign protonation states, so a preparation step should be executed beforehand (e.g., using Maestro's Protein Preparation Wizard). +The center of the sphere can be determined based on a residue in the protein or, as in this case, by explicitly providing the center of geometry (COG) coordinates from a ligand. + +### 2.1 Obtain COG coordinates. +To do so run: +`python "$qligfep/scripts/COG.py" -i ligand_path.pdb` + +For example, to obtain the COG coordinates used in this example, run: +`python "$qligfep/scripts/COG.py" -i ../1.ligprep/17.pdb` +This will return the coordinates [0.535:26.772:8.819] +! Make sure to **add -i before the pdb file**. + +### 2.2 Add the COG coordinates directly in protprep.py: +`python "$qligfep/protprep.py" -p path_to_protein.pdb -r RADIUS -c COORDINATES -w -P CLUSTER_NAME --noclean` + +_The '--noclean' flag serves a specific purpose by retaining files such as 'qprep.inp' for display. In subsequent stages, particularly during utilization by 'setup.py', the contents of this file become instrumental in gathering data concerning necessary cysteine bond additions. Throughout this process, it's possible to encounter an error marked by 'FileNotFoundError: [Errno 2] No such file or directory: 'top_p.pdb'.' For a more comprehensive understanding of this issue, refer to the 'Common Errors messages' section located at the end of this document._ + +For this tutorial: + +- `python "$qligfep/protprep.py" -p 1h1s_PrepWiz.pdb -r 22 -c [0.535:26.772:8.819] -w -P CSB --noclean` + +After this, you should have three new files in 2.protprep: protPREP.log protein.pdb water.pdb. + +**Note: Prior to executing the above command, ensure that you have navigated to the folder containing the "1h1s_PrepWiz.pdb" file.** +Furthermore, it is essential to bear in mind that the ligands under investigation share a similar core region, with only minor variations in small substituents. During previous phases, these ligands were superimposed, likely using Maestro. As a result, their centers of geometry are nearly identical. Therefore, we need to perform this process **for just one ligand**, saving time and effort. + +# 3.setupFEP +In the next stage we will prepare the input files for the actual simulations. + +### 3.1 Prepare input files + +#### 3.1.1 Pairs.txt +To ensure accurate calculation of ddG, QligFEP requires a well-defined list of paired combinations. Following a recommended protocol, it's advisable to curate pairs that exhibit maximal similarity while progressing from intricate to simpler compositions. The 'pairs.txt' file plays a pivotal role in this process and mandates specific conventions. + +Each entry within 'pairs.txt' corresponds to a pair of ligands, and it should contain the ligand's identifier, matching the name in the corresponding '.pdb' file within the '1.ligprep' folder. The identifier should exclude the '.pdb' extension, with pairs differentiated by a single space. This procedure is to be repeated for every pair of ligands. + +For instance, let's consider an illustrative scenario with four distinct ligands, each characterized by a different moiety (R): + +Ligand_A (R = 2-F-Benzyl), stored as 'ligand_a.pdb' in the '1.ligprep' folder. +Ligand_B (R = Benzyl), stored as 'ligand_b.pdb' in the '1.ligprep' folder. +Ligand_C (R = Propyl), stored as 'ligand_c.pdb' in the '1.ligprep' folder. +Ligand_D (R = Methyl), stored as 'ligand_d.pdb' in the '1.ligprep' folder. + +The corresponding 'pairs.txt' file should be structured as follows: +``` +ligand_a ligand_b +ligand_c ligand_d +``` + +One can also invert the pairs to see if the results are similar as they should be equal in absolute value. So: + +``` +ligand_a ligand_b +ligand_b ligand_a +ligand_c ligand_d +ligand_d ligand_c +``` +#### 3.1.2 QligFEP.py +This stage uses `$qligfep/QligFEP.py`. You can use the -h flag to get more specifics on all the input variables. This folder includes a useful top level script, `generate.py`, that takes the pairs.txt file as input that specifies the ligand pairs that will be used in this simulation. + +We then run QligFEP.py by running setup.py: +- `python setup.py` + +_Note: In this forked version, setup.py is capable of moving the files necessary to use QligFEP.py. If you're using a different version of setup.py, you will have to manually move all the ligands.pbd from `1.ligprep`, the protein.pdb and water.pdb files from `2.protprep`_ in order to do this step. + +This should result in a 1.protein and 2.water folder, containing folders for each pair of ligands described in the `pairs.txt` file. A message will be printed specifying the cystein bonds added. Make sure they are correct (i.e. no atoms duplicated). A `qprep ended normally`message should be printed for each pair correctly added to the folder. + +To perform the calculation. Simply go into the folder (1.PROTEIN OR 2.WATER), then go into the pair folder whose job you want to submit and run +`./FEP_submit.sh` + +This script will either submit your jobs to a slurm queue (see the setup section in $setupFEP/README.md for a more detailed description). Alternatively the jobs can be run on your local machine. +A typical procedure is to put the two systems in a top layer folder to easily track running calculations (e.g. 1.TESTRUN) in the case of this example. Of course, these toplayer scripts can be easily adjusted for any other use (e.g. to calculate solvation free energies, you add a water and vacuum leg, instead of protein water. + +If you want to submit all the jobs at once without having to go into each pair folder, you can copy `submit_FEP.py` to your `1.protein` or `2.water` folder and then run it. This python script will find all the folders starting as 'FEP_' (basically, all the folders named after the pairs.txt file) and submit their job. + +_Note: Only in this forked version submit_FEP.py exists. If your using the original repository you will have to submit all the jobs manually or copy the code for submit_FEP.py from [here](https://github.com/afloresep/qligfep/blob/master/submit_FEP.py) and add it to your workspace._ + +# 4. Analyze dG in protein and water. +Run the analyze_FEP.py script with the appropriate parameters: +`python analyze_FEP.py -F path_to_FEP_pair1-pair2_folder -C CLUSTER` +In this tutorial this will be: +`python analyze_FEP.py -F tutorials/1.QligFEP_CDK2/3.setupFEP/1.protein/FEP_22-17 -C CSB` + + +_Note: You should run this command for each FEP_ligand1-ligand2 folder specified in the pairs.txt file, located in both the `1.protein` and `2.water` folders._ + +This will give us the dG calculations for the pair 22-17 in 1.protein. You will have as many FEP_ligand1-ligand2 folders as pairs indicated in the `pairs.txt` file. Thus, you will need to run `analyze_FEP.py` in all of them, for both `1.protein` and `2.water` folders. +Nonetheless, if you want to call analzye_FEP.py for all folders located in 1.protein or 2.water (those named after the pair.txt file: i.e. FEP_17-22) you may use `call_analyzeFEP.py` located in qligfep folder as: +`python call_analyze_FEP.py path_to_1.protein_folder -C CSB` and then `python call_analyze_FEP.py path_to_2.water_folder -C CSB`. + +_Note: Again, these are python scripts specific to this repository and may not be available in the original repository where you will have to do it manually._ + +**To change the number of runs**, modify the FEP_submit.sh file located in the path of each pair: +/home/USER/QLIGFEP/qligfep/tutorials/1.QligFEP_CDK2/3.setupFEP/1.protein/FEP_17-22 + +It is necessary to collect dG values for both ligand pairs in water and protein since a complete thermodynamic cycle is needed to calculate ddG. +(for further understanding of the thermodynamic cycle: https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.7b00564) + +## Following steps: +### Calculating ddG: +First, we have to ask the question: +#### _Why do we want to calculate ddG in the first place?_ +Before diving into the calculation of ddG, let's understand its purpose. The primary objective is to determine the difference in binding free energy (ddGbind) between two ligands or multiple pairs of ligands with slight variations. This allows us to assess their relative binding affinities to the receptor. ddGbind represents the energy required for one ligand (B) to bind the receptor compared to another ligand (A). + + +#### _How can we calculate ddG?_ +To calculate ddG, we rely on Free Energy Perturbation (FEP) simulations conducted in both the protein and water environments. The difference in free energy between these two transformations ($$\Delta G_{R} - \Delta G_{w}$$) will be theoretically equivalent to the differnece in binding free energy between the two ligands $$\Delta\Delta G_{\text{bind (B-A)}} = \Delta G_{\text{bind, B}} - \Delta G_{\text{bind, A}}$$). For a more detailed explanation of this concept you can refer to [this article](https://link.springer.com/protocol/10.1007/978-1-0716-1209-5_12) + +In summary, we need to take the dG results obtained from step 4 and calculate the difference between the two states (protein and water) for all ligand pairs. + + + + +_For ease of calculation, we can utilize an Excel file. An Excel template is available for download from [here](https://github.com/afloresep/qligfep/blob/master/ddG_template.xlsx)_ + +In summary, with the help of the template we will calculate: +1. Calculate the average dG for each ligand pair in the protein environment: + $$\bar{dG_{\text{protein}}} = \frac{\sum_{i=1}^{n} dG_{\text{}, i}}{n_{\text{}}}$$ +2. Calculate the average dG for each ligand pair in the water environment: + $$\bar{dG_{\text{water}}} = \frac{\sum_{i=1}^{n} dG_{\text{}, i}}{n_{\text{}}}$$ +3. Calculate the ddG for each pair (ddG(A->B) = AVG(dGprotein) - AVG(dGwater)): +$$ddG(A \to B) = \bar{dG_{\text{protein}}} - \bar{dG_{\text{water}}}$$ +4. Calculate the Standard Error of the Mean (SEM) for the protein and water environments: +$$SEM = \frac{\sigma}{\sqrt{n}}$$ +5. Calculate the final SEM: +$$SEM_{f} = \sqrt{SEM_{\text{water}}^2 + SEM_{\text{protein}}^2}$$ +6. Finally, obtain the ddG value along with the error margin: +$$ddG (A \to B) \pm \sqrt{SEM_{\text{water}}^2 + SEM_{\text{protein}}^2}$$ + +Typically, you would compare these results with experimental data. To calculate experimental ddG from Ki (constant of inhibition), you can use the following formula: + +$$\Delta G^\circ = -RT \ln K$$ + +### Comparing ddG with experimental data: +So far, we have obtained theoretical values for ddG between various pairs of ligands based on docking poses under optimal conditions. However, this information alone may not suffice since the actual binding mode of the ligands could differ. Hence, it is essential to compare the theoretical ddG with experimental ddG. + +$$\Delta G^\circ = -RT \ln K$$ +_R_ is the gas constant with a value of 8.314 J 8.314 J K-1mol-1 +_T_ is the temperature of the reaction in Kelvin. +_ΔG°_ the standard Gibbs free energy change between two states. + +The predicted ΔG° values from relative binding free energy calculations can be directly compared to Ki values. However, in many cases, only experimental IC50 values are available. Although IC50 and Ki values are sometimes used interchangeably, it's crucial to consider the compound's mechanism of inhibition and the specific assay conditions. The Cheng−Prusoff equation provides a mathematical description of the relationship between Ki and IC50 for competitive inhibitors that bind to the free enzyme: + +$$K_i = \frac{IC_{50}}{1 + \frac{[S]}{K_m}}$$ + +_Ki_ = the inhibitory constant, defined as the equilibrium concentration of an inhibitory ligand when 50% of the receptor sites are occupied if no competing substrate is present.   + +_IC50_ = The concentration at which the inhibitory ligand displaces 50% of the substrate.   + +_[S]_ = The concentration of the substrate used in the binding assay.  + +_Km_ = the affinity constant of the substrate, defined as the equilibrium concentration that results in substrate occupying 50% of the receptor sites in the absence of competitio + +If the _[S]_ is small enough (depends on what your data) one can assume then that [S] = 0 so that: +$$K_i = \frac{IC_{50}}{1 + \frac{0}{K_m}}$$ +and therefore: +$$K_i = {IC_{50}}$$ + +For a more detailed understanding of the topic, refer to [further reading](https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.7b00564). + + +## Common Error messages: +### - qligfep/protprep.py Error Handling: +`FileNotFoundError: [Errno 2] No such file or directory: 'top_p.pdb'`: +The most prevalent cause for encountering this error is an erroneous configuration within the 'qprep.inp' file. In certain instances, the 'qprep.inp' file contains two or more cysteine bonds utilizing the same atom, which disrupts the process of generating the topology PDB file. To address this, manual intervention is required to remove the conflicting cysteine bond that employs atoms that have been previously used. The following steps outline how to rectify this issue: + +#### 1. Manual Correction of qprep.inp +Open your 'qprep.inp' file using a text editor such as vim or nano. Examine the content and locate the lines involving 'addbond' commands, such as +Practical example: +### +addbond 69:SG 164:SG y +addbond 75:SG 168:SG y +addbond 151:SG 163:SG y +addbond 163:SG 164:SG +### + +In the final line, observe that 'addbond' 3 and 'addbond' 4 utilize the same atom '163:SG'. Delete the problematic line (last 'addbond') to resolve the conflict. Save the modified 'qprep.inp' file. + +#### 2. Utilize Modified qprep.inp with qprep: +Execute the 'qprep' program from the 'q6/bin' directory, utilizing the newly edited 'qprep.inp' file: +`path_to_q6_folder/q6/bin/qprep < qprep.inp` +Please adapt the command based on the location of your 'q6/bin' directory. For instance, if the directory is located at '/home/user/QLIGFEP/q6/', the command would be: +`/home/user/QLIGFEP/q6/bin/qprep < qprep.inp` + +#### 3. Substitute Protein PDB: +Upon completing the previous step, you should find a file named 'complexnotexcluded.pdb' alongside other files. Replace your original protein PDB file ('your_protein.pdb') with this modified PDB file using the 'cp' command +`cp complexnotexcluded.pdb your_protein.pdb` + + +#### 4. Invoke qprep.py with Updated PDB: +Once the replacement is done, you can rerun the 'qprep.py' script with the newly updated PDB file using the following command: +`python "$qligfep/protprep.py" -p your_protein.pdb -r RADIUS -c COORDINATES -w -P CLUSTER_NAME --noclean` +Ensure to replace placeholders such as 'RADIUS', 'COORDINATES', and 'CLUSTER_NAME' with actual values as needed. + + + + + -This folder contains a step by step tutorial to setup and -analyze QligFEP calculations, as reported in Jespers et al. -(https://doi.org/10.1186/s13321-019-0348-5). -The workflow consists of four main steps: -- 1 Generate ligand parameters, based on 3D atom coordinates -- 2 Prepare the protein to run under spherical boundary conditions -- 3 Generate all the FEP related input files -- 4 Analyse the results -The folders in $setupFEP/tutorials are ordered such that it -follows the scheme presented above. A step by step how-to is given -below. -# 1.ligprep -Ths folder contains pregenerated 3D-coordinates of the ligand. -Note that these 3D coordinates are not generated by setupFEP, and -you are responsible to get reliable coordinates. It is important -to keep in mind that FEP handles well small changes in the system, -but if the phase space is not sufficiently overlapping the results -become increasingly unreliable. The ligands in this folder are -all overlaying a similar core region, and small substituents on -are investigated. - -For AMBER parameters, please refer to qtools: - -https://zenodo.org/badge/latestdoi/80016679 - -Since we have quite a few ligands (n = 16), we will generate the -files using a script from the script folder, e.g. run: - - python $qligfep/scripts/generate_opls.py - -This will write out the .lib, .prm and .pdb files needed for -stage 3. The .log files are generated by ffld_server. - --- Note -- -To successfully run this script, a working version of Maestro is -needed, since the OPLS parameters are derived from ffld_server -output. See the setup section in $setupFEP/README.md - --- Note -- -In order to successfully run generate_charmm.py: -- Have cgenff installed and properly referenced in the script -- Have a running version of openbabel -- make sure that you have .mol2 files with partial charges added - --- Note -- -Due to an implementation error in **Q**, 4 letter atom names are not -properly written out. In the case of particularly halogen atoms -you would have to double check and adjust the .pdb file, e.g. -in the case of 17.pdb in this tutorial, change: - -HETATM 25 Br25 LIG 900 3.933 25.020 9.894 - -to: - -HETATM 25 Br25LIG 900 3.933 25.020 9.894 - -# 2.protprep -The second step is needed to prepare a protein for simulations -under spherical boundary conditions (SBC). Note that this script -does not attempt to assign protonation states, so one needs to run -a preparation step first (e.g. Maestro's Protein Preparation -Wizard). Alternatively, protonation states can be assigned manually -by using the explicit names as provided in $setupFEP/FF/*.lib files. - -The center of the sphere can be based on a residue in the protein, -or as in this case by explicitly providing the center of geometry -(COG) coordinates from a ligand. E.g. to get those used in this -example run: - - python $qligfep/scripts/COG.py ../1.ligprep/17.pdb - -Which returns the coordinates 0.535:26.772:8.819, which can be -directly put in protprep.py: - - python $qligfep/protprep.py -p 1h1s_PrepWiz.pdb -r 22 -c 0.535:26.772:8.819 -w -P CSB - -You can run $qligfep/protprep.py -h for a full list of options. -The outputs are a .log file, containing some information on the -system and the adaptions made to run it in Q, a protein.pdb and -a water.pdb file. The latter two are needed in the next stage. - --- NOTE -- -The use of molprobity output is currently under construction! - -# 3.setupFEP -In the next stage we will prepare the input files for the actual -simulations. This stage uses $qligfep/QligFEP.py, you can use -the -h flag to get more specifics on all the input variables. -This folder includes a useful top level script, generate.py, that -takes a .txt file as input that specifies the ligand pairs that -will be used in this simulation. You can simply run: - - python setup.py - -And this should result in a 1.protein and 2.water folder, containing -the two systems necessary to perform the calculation. Simply go into -the folder and run - - ./FEP_submit.sh - -This will either submit your jobs to a slurm queue (see the setup -section in $setupFEP/READE.md for a more detailed description). -Alternatively the jobs can be run on your local machine. - -A typical procedure is to put the two systems in a top layer folder -to easily track running calculations (e.g. 1.TESTRUN) in the case -of this example. Of course, these toplayer scripts can be easily -adjusted for any other use (e.g. to calculate solvation free -energies, you add a water and vacuum leg, instead of protein water. - -# 4.analyzeFEP -The last step includes running the analyze_allFEP.py script, also -present in the 3.setupFEP folder. In the particular case of this -example, the analysis will be conduced on the 1.TESTRUN folder. -The output is a results.txt file containing the DDG values obtained -for every ligand pair. Exponential averaging (Zwanzig), Overlap -Sampling (OS) and Bennet acceptance ratio (BAR) are used, and the -error is a standard error of the mean over the (finished) replicates. -Alternatively, one can get the free energies per leg by using the -analyze_FEP.py in the main folder. (e.g. you can run: -python $/qligfep/analyze_FEP.py -h to get an overview of the -required input).