-
Notifications
You must be signed in to change notification settings - Fork 0
/
SocialEntities_ver01.py
275 lines (234 loc) · 12.5 KB
/
SocialEntities_ver01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/usr/bin/env python3
#
###############################################################################
#
# Name: Social Entities
#
# Author: Åsmund Kaupang, 2018-2022
# Thank you: Johannes Karwounopoulos, for suggesting the inclusion
# of multiprocess and tqdm!
#
# Description: A PyMOL/python3 script that returns a selection of entities
# (atoms, residues, etc) that have a certain number of other
# entities (from other objects) around them within a specified
# distance. Depending on the input data, there may be several
# applications for such a function.
#
# Here is an example of a usage in which we look for structural
# water molecules:
# One way to look for structural water molecules, i.e. water
# molecules that appear in several protein crystal structures
# (perhaps across crystallization conditions, unit cells, space
# groups or multimeric complexes), would be to superimpose the
# protein structures and see whether the locations of the
# crystallographically determined water molecules coincide. This
# script helps you select those waters whose positions are
# corroborated as structurally relevant by the presence of water
# molecules from other protein structures in their vicinity.
#
# Load the script with:
# run SocialEntities.py
#
# To select those water molecules ("resn HOH") from a selection
# of all the water molecules of all the loaded objects
# ('allWaters'), that are surrounded by a minimum of 5 water
# molecules belonging to other objects (e.g. other protein
# structures) at a maximum distance of 2 Å from their position,
# issue:
#
# UPDATE TEXT socialEntities('allWaters', 'resn HOH', 'resn HOH', 0.90, 1, 'Waters')
#
# A part of the name of the produced selections can be modified
# to reflect the type of entity you are investigating ('Waters')
# or if not provided, it will appear as 'Entities', in the
# following three groups;
#
# socialEntities Those entities that have neighbours
# within the specified criteria (number,
# distance).
# pseudosocialEntities Those entities that have fewer
# neighbours than specified within the
# specified distance.
# lonelyEntities Those entities that have no neighbours
# within the specified distance.
#
################################################################################
from pymol import cmd
from pymol import stored
from tqdm import tqdm
from itertools import repeat, chain
import multiprocess as mp
def modIndTupleListToSelctionString(modindtuplelist):
print(f"start to go through {len(modindtuplelist)} items")
if len(modindtuplelist) == 0:
selection = f"none"
else:
for idx, modind in enumerate(modindtuplelist):
# First pass; create the start of the selection string
if idx == 0:
selection = f"({modind[0]} and index {modind[1]})"
# Subsequent passes; grow the selection string
else:
selection = f"{selection} or ({modind[0]} and index {modind[1]})"
print("end loop")
return selection
def neighbourSearch(start, end, objectindexlist, poolSel, numberOfEntitiesAround):
#print(mp.current_process())
# Initiate lists for entities with enough found neighbours, for entities
# with too few neighbours and for entities with no neighbours.
nblist = []
fewnblist = []
nonblist = []
for entitynum in tqdm(range(start, end)):
# Define a selection for the neighbours of the current entity, that are present in the swarm/pool.
# Recall that poolSel is:
# '(allEntSelName and nbEntTypeSel) near_to distanceToEntitiesAround of ', e.g.
# '(allwaters and resn HOH) near_to 2 of '
nbSel = poolSel + '(' + str(objectindexlist[entitynum][0]) + ' and index ' + str(objectindexlist[entitynum][1]) + ')'
# which then results in e.g.:
# '(allwaters and resn HOH) near_to 2 of (2ABC_A and index 3052)', in which 3052 is the current index dictated by entitynum
# Get the indexes of these neighbours defined by the selection
nbs = cmd.index(nbSel)
# If no neighbours are found, add the entity to a list of lonely entities (no neighbours)
# If neighbours are found, evaluate if there are enough of them, and if so, add the
# (model, index) tuple to a list of social entities (enough neighbours). If there are
# some, but to few neighbours, add the entity to a list of less lonely entities
# (few neighbours).
if nbs == []:
nonblist.append(objectindexlist[entitynum])
elif len(nbs) >= numberOfEntitiesAround:
nblist.extend(nbs)
else:
fewnblist.append(objectindexlist[entitynum])
return nonblist, fewnblist, nblist
def socialEntities( allEntSelName, allEntTypeSel, nbEntTypeSel, objFractOfEntitiesAround, distanceToEntitiesAround, newSelectionType='Entities', cpus='all' ):
# Initiate a list in the PyMOL name space to hold the models and indexes
stored.modelsindexes = []
# Select all of the pool entities
cmd.select(allEntSelName, allEntTypeSel)
# Fill the list with the (model, index) tuples from the user selection
cmd.iterate(allEntSelName, 'stored.modelsindexes.append((model,index))')
# Patch together a generic opening to the neighbour selection string
poolSel = '(' + allEntSelName + ' and ' + nbEntTypeSel + ') near_to ' + str(distanceToEntitiesAround) + ' of '
# Get the total number of input entities ('the whole swarm')
# stored.modelsindexes is a list of tuples; ..., (object/model name, index number), ...
totnumentities = len(stored.modelsindexes)
# Get the number of objects (for later)
numberOfObjects = len(cmd.get_object_list())
# Evaluate the requested fraction of objects containing neighbours
numberOfEntitiesAround = int(objFractOfEntitiesAround * numberOfObjects)
# Get the number of CPUs/threads available to divide the workload
availcpus = mp.cpu_count()
# Moderate CPU usage if requested by user
if cpus == 'all':
cpus = availcpus
elif cpus > availcpus:
cpus = availcpus
else:
pass
# Print some information before the calculations start
print(#
f"The requested neighbour fraction ({objFractOfEntitiesAround*100}%) corresponds to neighbours from {numberOfEntitiesAround} of {numberOfObjects} objects.\n" \
f"The total number of entities of type <{allEntTypeSel}> whose neighbourhoods will be searched for neighbours of type <{nbEntTypeSel}> is {totnumentities}.\n" \
f"{availcpus} CPUs/threads were found and {cpus} will be used in the calculation. "
)
# Define the pool based on the CPU count
pool = mp.Pool(cpus)
# The map function applies a function (the first argument) to each of the
# items in an interable, which is passed as the second argument.
# If more than one iterable is passed, the number of iterables must
# correspond to the number of arguments taken by the function
# If the iterables are of different lengths, the iteration through them
# will stop when the shortes iterable is exhausted.
#
# The map function returns a map object, which is an iterator. This object
# can be transformed to a sequence object (a list, a tuple, etc) by calling
# the respective constructor; list(), tuple(), etc
# The map object can also be iterated through using a for loop
#
# If starmap is used instead of map, the iterators must be "pre-zipped" so
# that the input is a zip object. The zip object can be visualized when
# converted to a list, and is then a list of tuples, where the length of
# each tuple corresponds to the number of iterators zipped together.
# The zip function can work to transpose a list of lists
# Partition the entities equally (the last part is longer or shorter) for
# each CPU.
wholepartslength = totnumentities // cpus
numwholeparts = (totnumentities // wholepartslength) - 1
# In practice, the indices that refer to their placement in the
# stored.modelsindexes are noted down and added to lists of start-
# and end indices
startindexes = []
endindexes = []
for idx, i in enumerate(range(0, totnumentities, wholepartslength)):
if idx < numwholeparts:
startindexes.append(i)
endindexes.append(i + wholepartslength - 1)
elif idx == numwholeparts:
startindexes.append(i)
endindexes.append(totnumentities - 1)
else:
pass
# Now we throw a bone to each dog.
# The input data structure is the zip transposition of the following lists:
# [ start1, start2, start3, ...]
# [ end1, end2, end3, ...]
# [ MI[()],[MI[()],[MI[()], ...]
# [poolSel,poolSel,poolSel, ...]
# [ numOEA, numOEA, numOEA, ...]
#
# Where '[MI[()]' is [[(m1,i1),(m1,i2),...,(m2,i1),(m2,i2),...],
# [(m1,i1),(m1,i2),...,(m2,i1),(m2,i2),...],
# ...]
#
result = pool.starmap(neighbourSearch, zip(startindexes, endindexes, repeat(stored.modelsindexes), repeat(poolSel), repeat(numberOfEntitiesAround)))
# End the parallelized part of the script
pool.close()
#pool.join()
# The output data structure is a list of lists, whose length in the top
# level is defined by the number of processes/workers/forks/cpus. Each
# process has a number of sublists (here, three) defined by the number of
# output variables from the function.
# Gather the outputs of each process to reconstitute the outputs of the
# neighbourSearch function and make new lists containing only the unique
# entities in each raw listv(as neighbours will have been found multiple
# times)
nonblist = list(set(chain(*[result[i][0] for i in range(0, len(result))])))
fewnblist = list(set(chain(*[result[i][1] for i in range(0, len(result))])))
nblist = list(set(chain(*[result[i][2] for i in range(0, len(result))])))
# Print some rudimentary statistics to help tune the search parameters
frcnonb = len(nonblist)/totnumentities
frcfewnb = len(fewnblist)/totnumentities
frcnb = len(nblist)/totnumentities
print(#
f"The requested neighbour fraction ({objFractOfEntitiesAround*100}%) corresponds to neighbours from {numberOfEntitiesAround} of {numberOfObjects} objects.\n" \
f"Out of {totnumentities} entities of type <{allEntTypeSel}>, across {numberOfObjects} objects:\n" \
f"{frcnb*100:.1f}% had >= {numberOfEntitiesAround} neighbours of type <{nbEntTypeSel}> within {distanceToEntitiesAround} Å\n" \
f"{frcfewnb*100:.1f}% had < {numberOfEntitiesAround} neighbours of type <{nbEntTypeSel}> within {distanceToEntitiesAround} Å\n" \
f"{frcnonb*100:.1f}% had no neighbours of type <{nbEntTypeSel}> within {distanceToEntitiesAround} Å" \
)
if numberOfEntitiesAround > numberOfObjects:
print(f"WARNING: The number of entities in the search is larger than the number of objects. Please check your input!")
else:
pass
# Get a selection string from each list of unique entities
print('Preparing selections and visualization. Please wait...')
nbsel = modIndTupleListToSelctionString(nblist)
#fewnbsel = modIndTupleListToSelctionString(fewnblist)
#nonbsel = modIndTupleListToSelctionString(nonblist)
print('selecting in pymol')
# Export these selections to the PyMOL GUI
cmd.select(f"social{newSelectionType}", nbsel)
#cmd.select(f"pseudosocial{newSelectionType}", fewnbsel)
#cmd.select(f"lonely{newSelectionType}", nonbsel)
#cmd.show_as(f"nb_spheres", f"({nbsel} or {fewnbsel} or {nonbsel})")
cmd.show_as(f"nb_spheres", nbsel)
#cmd.color(f"red", nonbsel)
#cmd.color(f"orange", fewnbsel)
cmd.color(f"green", nbsel)
cmd.deselect()
print('Done')
if __name__ == "__main__":
socialEntities()
# Make the function runnable in PyMOL
cmd.extend("socialEntities", socialEntities)