#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#
# AlphaFold2 Step 2 -- Run models 1-5 to produce the unrelaxed models
# Usage: run_af2_step2.py [--params_parent_dir AF2_db_dir] /path/to/features.pkl /path/to/output_dir
#
#
import os
import pathlib
import pickle
import sys
import gzip
import configparser
import argparse
import inspect
# Enable CUDA unified memory and let JAX allocate beyond physical GPU RAM
# (a MEM_FRACTION above 1.0 is intentional when unified memory is on).
os.environ['TF_FORCE_UNIFIED_MEMORY'] = '1'
os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '4.0'
cur_path = pathlib.Path(__file__).parent.resolve()
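# Expected config.ini layout (inferred from the keys read below; values are placeholders):
#   [ALPHAFOLD2]
#   alphafold_path = /path/to/alphafold
#   [DATABASE]
#   params_parent_dir = /path/to/af2_database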
ini_config = configparser.ConfigParser(allow_no_value=True)
assert len(ini_config.read(os.path.join(cur_path, 'config.ini'))) > 0, "Failed to read config.ini"
sys.path.insert(0, ini_config['ALPHAFOLD2']['alphafold_path'])
from alphafold.common import protein
from alphafold.common import residue_constants
from alphafold.data import pipeline
from alphafold.model import config
from alphafold.model import model
from alphafold.model import data
import numpy as np
parser = argparse.ArgumentParser(description='AlphaFold2 Step 2 -- Run models 1-5 to produce the unrelaxed models')
parser.add_argument('input_file', metavar='input_file', type=str, help='The features.pkl file generated by AlphaFold2 step 1')
parser.add_argument('output_dir', metavar='output_dir', type=str, help='Path to a directory that will store the results.')
parser.add_argument('--params_parent_dir', default=ini_config['DATABASE']['params_parent_dir'], type=str, help='Path to the AlphaFold database; must contain the params/ subdirectory')
parser.add_argument('--models', default='1,2,3,4,5', type=str, help='Models to run, separated by commas (e.g. 1,2,3 or 1_ptm,2_ptm)')
parser.add_argument('--num_recycle', default=3, type=int, help='Number of recycles')
parser.add_argument('--num_ensemble', default=1, type=int, help='Number of ensembles')
args = parser.parse_args()
######################
## Util functions
######################
def func_has_arg(func, arg):
    """Return True if callable `func` accepts a parameter named `arg`."""
    param_keys = list(inspect.signature(func).parameters.keys())
    return arg in param_keys
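# Example: func_has_arg(print, 'sep') is True. Used below to stay compatible
# with AlphaFold versions whose call signatures differ.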
model_names = [f'model_{i}' for i in range(1, 6)]
def get_model_runner(i, ptm=False):
    """Build a RunModel for model index i (0-based); use the pTM variant if ptm=True."""
    model_name = model_names[i]
    if ptm:
        model_name += "_ptm"
    model_config = config.model_config(model_name)
    model_config.data.eval.num_ensemble = args.num_ensemble
    model_config.model.num_recycle = args.num_recycle
    # Optionally set model_config.model.global_config.subbatch_size = 1 to save memory
    model_params = data.get_model_haiku_params(model_name=model_name, data_dir=args.params_parent_dir)
    model_runner = model.RunModel(model_config, model_params)
    return model_runner, model_params
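# Example (illustrative): a runner for model_3 with the pTM head would be
#   model_runner, _ = get_model_runner(2, ptm=True)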
######################
## Read features.pkl file
######################
if args.input_file.endswith('.gz'):
    with gzip.open(args.input_file, 'rb') as fh:
        feature_dict = pickle.load(fh)
else:
    with open(args.input_file, 'rb') as fh:
        feature_dict = pickle.load(fh)
print("Input length:", feature_dict['aatype'].shape[0], flush=True)
######################
## Run models 1-5 separately
######################
output_dir = args.output_dir
assert os.path.exists(output_dir), "Error: output_dir does not exist"
models_to_run = []
models_are_ptm = []
for item in args.models.split(','):
    if '_' in item:
        id_, ptm_ = item.split('_')
    else:
        id_ = item
        ptm_ = ''
    id_ = int(id_) - 1
    assert 0 <= id_ <= 4, "Error: --models entries must satisfy 1 <= model <= 5"
    models_to_run.append(id_)
    models_are_ptm.append(ptm_ == 'ptm')
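# Example: --models 1,3_ptm yields models_to_run=[0, 2], models_are_ptm=[False, True]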
for i, ptm in zip(models_to_run, models_are_ptm):
    ptm_token = '_ptm' if ptm else ''
    unrelaxed_pdb_path = os.path.join(output_dir, f'unrelaxed_{model_names[i]}{ptm_token}.pdb')
    result_output_path = os.path.join(output_dir, f'result_{model_names[i]}{ptm_token}.pkl.gz')
    if os.path.exists(unrelaxed_pdb_path) and os.path.exists(result_output_path):
        print(f"Info: {unrelaxed_pdb_path} and {result_output_path} already exist; delete them to re-run", flush=True)
        continue
    print(f"Start to run model_{i+1}{ptm_token}", flush=True)
    ###########################
    ### Get the runner
    ###########################
    model_runner, model_params = get_model_runner(i, ptm)
    processed_feature_dict = model_runner.process_features(feature_dict, random_seed=None)
    # Newer AlphaFold versions require a random_seed argument to predict()
    if func_has_arg(model_runner.predict, 'random_seed'):
        prediction_result = model_runner.predict(processed_feature_dict, random_seed=0)
    else:
        prediction_result = model_runner.predict(processed_feature_dict)
    # Free the model to save memory before the next iteration
    del model_runner
    del model_params
    ###########################
    ### Build the Protein object
    ###########################
    # Broadcast the per-residue pLDDT to every atom so it lands in the PDB B-factor column
    plddt = prediction_result['plddt']
    plddt_b_factors = np.repeat(plddt[:, None], residue_constants.atom_type_num, axis=-1)
    params = {
        'features': processed_feature_dict,
        'result': prediction_result,
        'b_factors': plddt_b_factors,
    }
    # Newer AlphaFold versions take a remove_leading_feature_dimension argument
    if func_has_arg(protein.from_prediction, 'remove_leading_feature_dimension'):
        params['remove_leading_feature_dimension'] = True
    unrelaxed_protein = protein.from_prediction(**params)
    ###########################
    ### Save as PDB file
    ###########################
    unrelaxed_pdb = protein.to_pdb(unrelaxed_protein)
    with open(unrelaxed_pdb_path, 'w') as fh:
        fh.write(unrelaxed_pdb)
    ###########################
    ### Save as pkl.gz file
    ###########################
    with gzip.open(result_output_path, 'wb') as fh:
        pickle.dump(prediction_result, fh, protocol=4)
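
# Example invocation (paths are placeholders):
#   python run_af2_step2.py --models 1,2_ptm /path/to/features.pkl /path/to/output_dir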