-
Notifications
You must be signed in to change notification settings - Fork 2
/
functions.py
224 lines (191 loc) · 7.87 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
"""Set of diverse functions to solve biological and technical problems during the analysis."""
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import Bio.PDB
import numpy
import os
import collections as col
import string
import utilities
import copy
import itertools
import classes
import re
def check_input(inputs_dir, user_dir):
"""Checks if the input given by the user is a directory."""
if os.path.isdir(inputs_dir) == False:
error = classes.IncorrectInputDir(inputs_dir, user_dir)
raise error
return "Checked."
def get_distance(list1, list2):
"""Given two lists of atom coordinates, it calculates the distance."""
import math
distance_result = (list1[0] - list2[0]) ** 2 + (list1[1] - list2[1]) ** 2 + (list1[2] - list2[2]) ** 2
return math.sqrt(abs(distance_result))
def check_type(chain):
"""Function to check if the chain is a protein or a nucleic acid."""
atoms = chain.get_atoms()
type_chain = ""
list_c = []
for element in atoms:
list_c.append(element.get_name())
if "CA" in list_c:
type_chain = "protein"
else:
type_chain = "nucleic_acid"
return type_chain
def adapt_chain(chain):
"""Checks the type of the chain and changes the C1' to CA if it is nucleic acid.
This will allow us to measure the distances between atoms in the same way as we do for proteins."""
type_chain = check_type(chain)
name = chain.id
if type_chain == "nucleic_acid":
new_chain = Bio.PDB.Chain.Chain(name)
chain = copy.copy(chain)
for residue in chain:
new_chain.add(residue.copy())
for residue in new_chain:
for atom in residue:
if atom.id == "C1'":
atom.id = "CA"
residue.add(atom.copy())
return new_chain
else:
return chain
def unique_chains_fasta (number_to_letter, outputs_dir):
"""Returns a multifasta file with the sequence of the unique proteins from the total sequences of the macrocomplex."""
fo = open(outputs_dir+"/unique_chains_fasta.mfa", "w")
for key, value in number_to_letter.items():
name_chain = "chain_" + value
fo.write(">%s\n%s\n" % (name_chain, get_seq_from_pdbchain(key)))
fo.close()
return fo
def get_atoms_list(chain):
"""Creates a list of the atoms only taking CA or P for protein and acid nucleics, respectively.
This list of atoms will be lately used in the superimposition process."""
type_chain = check_type(chain)
if type_chain == "protein":
atom_id = "CA"
elif type_chain == "nucleic_acid":
atom_id = "P"
atoms = chain.get_atoms()
atoms_list = []
for atom in atoms:
if atom.id == atom_id:
atoms_list.append(atom)
return atoms_list
def three_to_one(three_res_list):
"""Returns sequence string (one amino acid code format). Used in the get_seq_from_pdbchain."""
one_res_list = []
for res in three_res_list:
try:
one = utilities.three_to_one[res]
one_res_list.append(one)
except KeyError:
return False
return "".join(one_res_list)
def get_seq_from_pdbchain(chain):
"""Returns the pdb amino acid or nucleic acid sequence. In case it is a protein, the sequence is returned in one
amino acid code format. In case it is a nucleic acid, the sequence is composed by the nucleotides."""
type_chain = check_type(chain)
if type_chain == "protein":
three_res_list = []
for res in chain:
residues_atoms = res.get_atoms()
for atom in residues_atoms:
if atom.get_name() == 'CA':
residue = atom.get_parent()
three_res_list.append(residue.get_resname())
return three_to_one(three_res_list) # three_to_one function
else:
nucleic_acid_res = []
for res in chain:
residues_atoms = res.get_atoms()
for atom in residues_atoms:
if atom.get_name() == 'P':
residue = atom.get_parent()
nucleic_acid_res.append(residue.get_resname())
nucleic_acid_seq = [x[2] for x in nucleic_acid_res]
return "".join(nucleic_acid_seq)
def refine_for_superimpose(fixedchain, movingchain):
"""Given two chains with different sequence length, returns a pattern of 0 and 1 (for each chain) based on the pairwise alignment, to
include or exclude the residue of the original chains. This is needed for the superimposition."""
fixedchain_seq = get_seq_from_pdbchain(fixedchain)
movingchain_seq = get_seq_from_pdbchain(movingchain)
alignments = pairwise2.align.globalxx(fixedchain_seq, movingchain_seq)
fixedchain_seq = alignments[0][0]
movingchain_seq = alignments[0][1]
fixedchain_pattern = []
movingchain_pattern = []
for res1, res2 in zip(fixedchain_seq, movingchain_seq):
if res1 == res2:
fixedchain_pattern.append(1)
movingchain_pattern.append(1)
elif res1 == '-':
movingchain_pattern.append(0)
elif res2 == '-':
fixedchain_pattern.append(0)
chains_pattern = (fixedchain_pattern, movingchain_pattern)
return chains_pattern
def get_chain_refined(chain_original, chain_pattern):
"""Creates new chain objects filtering the residues of the original chains, to get their atoms later for the
superimposition process."""
new_chain = Bio.PDB.Chain.Chain('X')
for residue, pattern in zip(chain_original.get_residues(), chain_pattern):
if pattern == 1:
new_chain.add(residue.copy())
return new_chain
def one_to_three(chain_refined):
"""Returns amino acid sequence of a chain (three amino acid code format) from a sequence in one amino acid code format.
Used in refine_for_superimpose function."""
three_res_list = []
for res in chain_refined:
three = utilities.one_to_three[res]
three_res_list.append(three)
return three_res_list
def check_clashes_multi(current_model, list_models, newchain, clash_distance_cutoff):
"""Checks clashes between the last added chain and the current model depending on the clash_distance_cutoff, which
is set by the user (default 1.2A)."""
neighbors2 = set()
atom_list = Bio.PDB.Selection.unfold_entities(current_model, 'A')
for model in list_models:
atom_list2 = Bio.PDB.Selection.unfold_entities(model, 'A')
for element in atom_list2:
atom_list.append(element)
ns = Bio.PDB.NeighborSearch(atom_list)
for at in newchain.get_atoms():
center = at.get_coord()
neighbors = ns.search(center, clash_distance_cutoff, level='C')
for element in neighbors:
if element != newchain:
neighbors2.add(element)
if len(neighbors2) != 0:
return True
else:
return False
def create_tempPDB(outputs):
"""Erases the acid nucleic chains of the pdb files that will be used in the refining process, once the model
has been built (to avoid Modeller errors)."""
temp_dir = outputs + "temp/"
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
files_list = []
models_list = os.listdir(outputs)
for file in models_list:
if file.endswith(".pdb") or file.endswith(".B"):
files_list.append(file)
for element in files_list:
pdb = open(outputs + element, 'r')
fo = open(temp_dir + "mod_" + element, "w")
for line in pdb:
if line.startswith('ATOM'):
line = line.strip()
search = re.search(r'[A-Z]*\s*\d{1,5}\s*[A-Z]{1,3}[0-9]?\'?\s*([A-Z]{1,3}).*', line)
residue = search.group(1)
if residue in utilities.nucleic_list:
continue
else:
fo.write("%s\n" % (line))
fo.close()
pdb.close()
return temp_dir