forked from CAHanke/flr2mmcif
-
Notifications
You must be signed in to change notification settings - Fork 0
/
flr2mmcif.py
6041 lines (5743 loc) · 357 KB
/
flr2mmcif.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright (c) 2019-2024 Christian A. Hanke
# This script reads in a spreadsheet file (e.g. Microsoft Excel) of a fixed
# format and uses the python-ihm implementation to generate the objects and
# write the mmcif file in the end
# version 1.06
# Note: In case of non-mandatory parameters, it should be checked whether the
# column is present at all or whether the respective cell in the excel sheet
# is empty (using pandas.isnull).
# Note: Currently, the script requires to be used with Python 3 in order to
# ensure proper output in the mmcif files of the unicode strings.
# Note: The script requires the python-ihm, pandas, and openpyxl packages.
import os
import pandas
import argparse
import ihm
import ihm.flr
import ihm.dumper
import ihm.dataset
import ihm.location
import ihm.startmodel
import ihm.analysis
import ihm.protocol
import ihm.model
import ihm.source
import ihm.reader
# Module-level switches.
# NOTE(review): neither name is referenced in the visible part of this file
# (the converter class takes its own ``be_verbose`` argument); presumably they
# are read further down -- confirm before changing or removing them.
BEVERBOSE = True
DEBUG = False
class DummyClass:
    """Minimal placeholder object that only carries an ``_id`` attribute."""

    def __init__(self, value):
        """Store ``value`` unchanged as ``self._id``.

        :param value: The value to expose as ``_id``.
        """
        self._id = value
def occurs_in_list(curobject, list):
    """Tell whether ``list`` holds an entry with the same attributes as
    ``curobject``.

    Two objects count as the same when their ``__dict__`` mappings compare
    equal; object identity and type are not considered.

    :param curobject: The object to look for.
    :param list: The entries to search.
    :return: True if any entry has an equal ``__dict__``, False otherwise.
    """
    return any(curobject.__dict__ == candidate.__dict__
               for candidate in list)
def get_index_from_list(curobject, list):
    """Find the position of the first entry that equals ``curobject``.

    Entries are compared through their ``__dict__`` mappings.

    :param curobject: The object to look for.
    :param list: The entries to search.
    :return: Index of the first entry with an equal ``__dict__``, or -1 if
        there is none.
    """
    matching_positions = (
        position for position, candidate in enumerate(list)
        if curobject.__dict__ == candidate.__dict__)
    return next(matching_positions, -1)
# Sentinel for "this slot has never been assigned"; reading such a slot with
# a plain getattr() would raise AttributeError.
_RESATOM_UNSET = object()


def _resatom_slots_match(first, second, values_match):
    """Return True if both objects declare the same ``__slots__`` and every
    slot holds matching values (as judged by ``values_match``)."""
    slot_names = first.__slots__
    # An object of another type (or without __slots__) can never be a match.
    if slot_names != getattr(second, '__slots__', None):
        return False
    for name in slot_names:
        left = getattr(first, name, _RESATOM_UNSET)
        right = getattr(second, name, _RESATOM_UNSET)
        if left is _RESATOM_UNSET or right is _RESATOM_UNSET:
            # Unset on both sides is fine; unset on only one side is a
            # difference.
            if left is not right:
                return False
            continue
        if not values_match(left, right):
            return False
    return True


def _resatom_values_match(first, second):
    """Compare two slot values; a value that itself uses ``__slots__`` is
    compared slot by slot, anything else with ``==``."""
    if hasattr(first, '__slots__'):
        return _resatom_slots_match(first, second, lambda a, b: a == b)
    return first == second


def get_resatom_from_list(curobject, list):
    """ Get the residue or atom from a list by checking for identical entries
    in the list. This is not achieved by simply comparing, since Atom and
    Residue use __slots__.
    In case of an Atom, also the respective Residue has to be checked for
    equality.

    An entry only matches if it declares the same ``__slots__`` as
    ``curobject`` and all slot values agree. Slot values that use
    ``__slots__`` themselves are compared one level deep, slot by slot; all
    other values are compared with ``==``. A slot that was never assigned
    matches only a slot that is unassigned on the other object as well.

    :param curobject: Atom or Residue
    :param list: The resatom list
    :return: The first object that has all the same attribute values as
        curobject or None.
    """
    for candidate in list:
        if _resatom_slots_match(curobject, candidate, _resatom_values_match):
            return candidate
    return None
class Flr2mmcifConverter:
def __init__(self,
             xls_filename,
             cifout_filename,
             atom_site_filename,
             be_verbose=True):
    """Set up the converter for one spreadsheet.

    Creates an empty ``ihm.System``, opens the spreadsheet with
    ``pandas.ExcelFile`` and prepares the storage containers via
    ``initialize_lists``.

    :param xls_filename: Path of the spreadsheet to read.
    :param cifout_filename: Stored as ``self.cifout_filename``.
    :param atom_site_filename: Stored as ``self.atom_site_filename``.
    :param be_verbose: Stored as ``self.verbose``; the ``add_*`` methods
        print a progress line per tab when it is true.
    """
    self.verbose = be_verbose
    # Remember the file names handed in by the caller
    (self.xls_filename,
     self.cifout_filename,
     self.atom_site_filename) = (xls_filename,
                                 cifout_filename,
                                 atom_site_filename)
    # The IHM system that collects everything read from the spreadsheet
    self.system = ihm.System()
    # Handle on the spreadsheet; the individual tabs are parsed from it later
    self.xls_file = pandas.ExcelFile(self.xls_filename)
    # Fresh containers for the collected objects and their ids
    self.initialize_lists()
def run(self):
    """Convert the spreadsheet: collect all data, then write the cif file.

    Calls ``self.fill()`` followed by ``self.write_cif_file()``.
    """
    # Step 1: read and process every tab of the spreadsheet
    self.fill()
    # Step 2: dump the collected data
    self.write_cif_file()
def initialize_lists(self):
    """Create the empty containers that the ``add_*`` methods fill.

    Every attribute named below is set on ``self`` to a fresh, independent
    empty list or empty dict, in the order given.
    """
    # (attribute name, factory producing its empty start value)
    empty_containers = (
        # Collection of information to be stored
        ('list_resatoms', list),
        ('list_citations', list),
        ('list_citation_ids', list),
        ('list_ihm_softwares', list),
        ('list_ihm_software_ids', list),
        ('list_locations', list),
        # Datasets
        ('list_datasets', list),
        ('list_dataset_ids', list),
        ('list_dataset_groups', list),
        ('list_dataset_group_ids', list),
        # Dataset subgroups correspond to the entries in the Dataset list
        # (i.e. there might be a subgroup for Single-molecule FRET data and
        # another one for De Novo models)
        ('list_dataset_subgroup_by_dataset_list_id', list),
        ('list_dataset_subgroup_by_dataset_list_id_ids', list),
        # External reference ids for connection to the external files
        ('list_dataset_external_reference_ids', list),
        # connect datasets with external files; store datasets and the
        # respective external_file ids
        ('list_connection_dataset_external_file_datasets', list),
        ('list_connection_dataset_external_file_ids', list),
        # repositories
        ('list_repositories', list),
        # (attribute name spelled exactly as in the original code)
        ('list_respository_ids', list),
        # External files
        ('list_external_files', list),
        ('list_external_files_locations', list),
        ('list_external_files_ids', list),
        ('tmp_dict_for_dataset_groups', dict),
        # Entity (IHM) and Entity assembly (FLR)
        ('list_ihm_entities', list),
        ('list_ihm_entity_ids', list),
        ('list_entity_assemblies', list),
        ('list_entity_assembly_ids', list),
        # Comparative starting model
        ('list_starting_model_templates', list),
        ('tmp_dict_templates_for_starting_models', dict),
        # Instance (AsymUnit)
        ('list_asym_units', list),
        ('list_asym_units_ids', list),
        ('list_asym_unit_ranges', list),
        ('list_ihm_model_representations', list),
        ('list_ihm_model_representation_ids', list),
        ('list_ihm_starting_model_details', list),
        ('list_structure_assemblies', list),
        ('list_structure_assembly_ids', list),
        ('tmp_dict_for_structure_assembly', dict),
        ('tmp_dict_for_structure_assembly_names', dict),
        ('tmp_dict_for_structure_assembly_description', dict),
        ('tmp_dict_for_model_representations', dict),
        # Modeling protocol and modeling post process
        ('list_ihm_modeling_protocol_analysis_steps', list),
        ('list_ihm_modeling_protocol_analysis_step_ids', list),
        ('list_ihm_modeling_protocols', list),
        ('list_ihm_modeling_protocols_ids', list),
        ('list_ihm_modeling_protocol_analyses', list),
        ('list_ihm_modeling_protocol_analyses_ids', list),
        ('list_ihm_modeling_protocol_modeling_steps', list),
        ('list_ihm_modeling_protocol_modeling_step_ids', list),
        ('dict_ihm_modeling_protocol_software_ids', dict),
        # Multi-state modeling
        ('list_empty_states', list),
        ('list_empty_state_ids', list),
        # Models
        ('list_models', list),
        ('list_models_ids', list),
        # Models - Model groups
        ('list_model_groups', list),
        ('list_model_group_ids', list),
        # Models - models that belong to one state
        ('list_models_states', list),
        ('list_models_state_ids', list),
        ('list_models_state_groups', list),
        ('list_models_state_group_ids', list),
        # Ensembles
        ('list_ensembles', list),
        # FLR
        # FLR - FPS global parameters
        ('list_flr_fps_global_parameters', list),
        # A dictionary to identify the global parameters by the protocol id.
        # The dictionary, which uses the protocol id as keys contains
        # additional dictionaries that use the protocol steps as keys
        ('dict_flr_fps_global_parameters_by_protocol_id', dict),
        # FLR - FPS MPP
        ('dict_flr_fps_mpp_groups', dict),
        # The following lists are for both, reference measurements and FLR
        # FLR - sample conditions
        ('list_sample_conditions', list),
        # FLR - instruments
        ('list_instruments', list),
        # FLR - instrument settings
        ('list_inst_settings', list),
        # FLR - experimental conditions
        ('list_exp_conditions', list),
        # FLR - samples
        ('list_samples', list),
        ('list_sample_ids', list),
        # FLR - experiments
        ('list_experiments', list),
        # FLR - chemical descriptors
        ('list_chemical_descriptors', list),
        # FLR - chemical components
        ('list_chemical_components', list),
        # FLR - probes
        ('list_probe_donors', list),
        ('list_probe_acceptors', list),
        # FLR - modified and mutated residues
        ('list_modified_residues', list),
        ('list_mutated_residues', list),
        # Poly_probe_positions
        ('list_poly_probe_positions', list),
        ('list_residue_objects', list),
        # Sample_probe_details
        ('list_sample_probe_details', list),
        # Probe_descriptor_conjugate
        ('list_probe_conjugate_descriptors', list),
        # Poly_probe_conjugate
        ('list_poly_probe_conjugates', list),
        # Reference measurement lifetimes and fractions
        ('list_ref_measurement_lifetimes', list),
        # Reference measurements
        ('list_ref_measurements', list),
        # Reference measurement groups
        ('list_ref_measurement_groups', list),
        ('list_ref_measurement_group_ids', list),
        ('list_of_object_indices_refmeas', list),
        # FLR
        ('list_of_object_indices', list),
        ('list_fret_forster_radius', list),
        ('list_fret_calibration_parameters', list),
        ('list_lifetime_fit_models', list),
        ('list_fret_analysis', list),
        ('list_fret_analysis_ids', list),
        ('list_peak_assignments', list),
        ('list_fret_distance_restraint_groups', list),
        ('list_fret_distance_restraints', list),
        ('list_fret_distance_restraint_ids', list),
        ('list_FPS_AV_parameters', list),
        ('list_FPS_modeling_by_AV', list),
        ('list_FPS_AV_modeling', list),
        ('list_FPS_mean_probe_positions', list),
        ('list_FPS_modeling_by_MPP', list),
        ('list_FPS_MPP_modeling', list),
        ('list_flr_model_quality', list),
        ('list_flr_model_distances', list),
        # IHM multi-state schemes
        ('list_ihm_multi_state_schemes', list),
        ('list_ihm_multi_state_scheme_ids', list),
        ('list_ihm_multi_state_scheme_connectivities', list),
        ('list_ihm_multi_state_scheme_connectivity_ids', list),
        ('list_ihm_mss_relaxation_times', list),
        ('list_ihm_mss_relaxation_time_ids', list),
        ('list_ihm_mss_eq_constants', list),
        ('list_ihm_mss_eq_constant_ids', list),
        ('list_ihm_mss_kinetic_rates', list),
        ('list_ihm_mss_kinetic_rate_ids', list),
        ('list_ihm_mss_relaxation_time_fret_analysis_connections', list),
        ('list_ihm_mss_kinetic_rate_fret_analysis_connections', list),
    )
    # Call the factory once per attribute so that no two attributes share
    # the same container object.
    for attribute_name, make_empty in empty_containers:
        setattr(self, attribute_name, make_empty())
def fill(self):
    """Run every processing stage once, in a fixed order.

    Each stage is a method of this object that takes no arguments; the
    stages are called one after the other in the order listed below.
    """
    stage_names = (
        'add_general',
        'add_citation',
        'add_ihm_software',
        'add_ihm_dataset_and_external_files',
        'add_ihm_entity',
        'add_ihm_comparative_starting_model',
        'add_ihm_instance_and_AsymUnits',
        'add_ihm_modeling_protocol',
        'add_ihm_multi_state_modeling_and_models',
        'add_ihm_ensemble',
        'add_flr_fps_global_parameters',
        'add_flr_fps_mpp',
        'add_flr_reference_measurements',
        'add_flr',
        'add_flr_model_quality',
        'add_flr_model_distances',
        'add_ihm_multi_state_schemes',
        'create_flrdata',
        'handle_atom_site_file',
    )
    for stage_name in stage_names:
        # Look the stage up right before calling it, so that lookup and call
        # happen stage by stage exactly as with consecutive method calls.
        getattr(self, stage_name)()
def add_general(self, sheet_name='General', skiprows=3, header=0):
    """Read the general information tab and set the title of the system.

    ``self.system.title`` is assigned once per row of the tab, so the value
    of the last row is the one that remains. The title is None if the
    'IHM_Struct_title' column is missing or the cell is empty.

    :param sheet_name: Name of the tab to read.
    :param skiprows: Passed to ``pandas.read_excel``.
    :param header: Passed to ``pandas.read_excel``.
    """
    if self.verbose:
        print(" ... Processing tab \'General\' ...")
    xls_data = pandas.read_excel(self.xls_file,
                                 sheet_name=sheet_name,
                                 skiprows=skiprows,
                                 header=header)
    # The id column is mandatory and determines the number of rows
    for row in range(len(xls_data['IHM_Struct_id'])):
        if ('IHM_Struct_title' in xls_data.keys()
                and not pandas.isnull(xls_data['IHM_Struct_title'][row])):
            title = xls_data['IHM_Struct_title'][row]
        else:
            # optional column absent or cell left empty
            title = None
        self.system.title = title
def add_citation(self, sheet_name='Citation', skiprows=3, header=0):
    """Read the information on the Citation.

    One ``ihm.Citation`` is created per row. Citations that are not yet in
    ``self.list_citations`` (compared with ``occurs_in_list``) are stored
    there together with their id in ``self.list_citation_ids``; afterwards
    all stored citations are appended to ``self.system.citations``.

    The authors cell is split at ';'. Surrounding whitespace is removed from
    each name and empty names are dropped; an empty authors cell yields an
    empty author list instead of an error.

    :param sheet_name: Name of the tab to read.
    :param skiprows: Passed to ``pandas.read_excel``.
    :param header: Passed to ``pandas.read_excel``.
    """
    if self.verbose:
        print(" ... Processing tab \'Citation\' ...")
    xls_data = pandas.read_excel(self.xls_file,
                                 sheet_name=sheet_name,
                                 skiprows=skiprows,
                                 header=header)

    def optional_cell(column, row):
        """Return the cell value, or None if the column is missing or the
        cell is empty."""
        if column not in xls_data.keys() \
                or pandas.isnull(xls_data[column][row]):
            return None
        return xls_data[column][row]

    nr_entries = len(xls_data['IHM_Citation_id'])
    for i in range(nr_entries):
        # Mandatory columns
        cur_citation_id = xls_data['IHM_Citation_id'][i]
        cur_citation_title = xls_data['IHM_Citation_Title'][i]
        # Optional columns
        cur_citation_journal_abbreviation = optional_cell(
            'IHM_Citation_Journal_abbreviation', i)
        cur_citation_journal_volume = optional_cell(
            'IHM_Citation_Journal_volume', i)
        cur_citation_first_page = optional_cell(
            'IHM_Citation_First_page', i)
        cur_citation_last_page = optional_cell(
            'IHM_Citation_Last_page', i)
        cur_citation_year = optional_cell('IHM_Citation_Year', i)
        cur_citation_pubmed_id = optional_cell('IHM_Citation_Pubmed_id', i)
        cur_citation_doi = optional_cell('IHM_Citation_DOI', i)
        cur_citation_authors = xls_data['IHM_Citation_Authors'][i]
        if pandas.isnull(cur_citation_authors):
            # An empty cell is read as NaN, which has no split()
            cur_citation_authors_list = []
        else:
            # "A; B;" -> ['A', 'B']: no padded or empty author names
            cur_citation_authors_list = [
                author.strip()
                for author in str(cur_citation_authors).split(';')
                if author.strip()]
        # Create the citation object
        cur_citation = ihm.Citation(
            pmid=cur_citation_pubmed_id,
            title=cur_citation_title,
            journal=cur_citation_journal_abbreviation,
            volume=cur_citation_journal_volume,
            page_range=(cur_citation_first_page, cur_citation_last_page),
            year=cur_citation_year,
            authors=cur_citation_authors_list,
            doi=cur_citation_doi)
        # Store the citation and its id only once
        if not occurs_in_list(cur_citation, self.list_citations):
            self.list_citations.append(cur_citation)
            self.list_citation_ids.append(cur_citation_id)
    # add all citation objects to the system
    for entry in self.list_citations:
        self.system.citations.append(entry)
def add_ihm_software(self, sheet_name='Software', skiprows=3, header=0):
    """Read the Software tab and register the software with the system.

    One ``ihm.Software`` is created per row. Objects that are not yet
    contained in ``self.list_ihm_softwares`` are stored there together with
    their id in ``self.list_ihm_software_ids``; afterwards all stored
    objects are appended to ``self.system.software``.

    :param sheet_name: Name of the tab to read.
    :param skiprows: Passed to ``pandas.read_excel``.
    :param header: Passed to ``pandas.read_excel``.
    """
    if self.verbose:
        print(" ... Processing tab \'Software\' ...")
    xls_data = pandas.read_excel(self.xls_file,
                                 sheet_name=sheet_name,
                                 skiprows=skiprows,
                                 header=header)

    def cell_or_none(column, row):
        """Value of an optional cell; None if the column is missing or the
        cell is empty."""
        if column in xls_data.keys() \
                and not pandas.isnull(xls_data[column][row]):
            return xls_data[column][row]
        return None

    for row in range(len(xls_data['IHM_Software_id'])):
        software_id = xls_data['IHM_Software_id'][row]
        # name, classification and type are mandatory columns; description,
        # location and version are optional
        software = ihm.Software(
            name=xls_data['IHM_Software_name'][row],
            classification=xls_data['IHM_Software_classification'][row],
            description=cell_or_none('IHM_Software_description', row),
            location=cell_or_none('IHM_Software_location', row),
            type=xls_data['IHM_Software_type'][row],
            version=cell_or_none('IHM_Software_version', row))
        if software in self.list_ihm_softwares:
            # already stored
            continue
        self.list_ihm_softwares.append(software)
        self.list_ihm_software_ids.append(software_id)
    for stored_software in self.list_ihm_softwares:
        self.system.software.append(stored_software)
def add_ihm_dataset_and_external_files(self,
                                       sheet_name_dataset='Dataset',
                                       skiprows_dataset=3,
                                       header_dataset=0,
                                       sheet_name_external_files=
                                       'External_files',
                                       skiprows_external_files=3,
                                       header_external_files=0):
    """Read the Datasets and external file information from the
    excel sheet.

    The method works in four passes:

    1. Dataset tab: rows flagged as database entries ('Yes', 'YES', 'yes')
       are turned into PDB/SASBDB/BMRB locations and datasets right away.
       For all other rows only the repository and the row's metadata are
       remembered, keyed by the external reference id.
    2. External_files tab: every row becomes a file location and a dataset
       whose type is taken from the Dataset row with the same reference id.
    3. Dataset tab again: the datasets collected per external reference id
       are added to every dataset group that references them.
    4. The dataset groups and the subgroups (one per external reference id)
       are created and ``self.system.update_locations_in_repositories`` is
       called with the collected repositories.

    :param sheet_name_dataset: Name of the dataset tab.
    :param skiprows_dataset: Passed to ``pandas.read_excel`` (dataset tab).
    :param header_dataset: Passed to ``pandas.read_excel`` (dataset tab).
    :param sheet_name_external_files: Name of the external files tab.
    :param skiprows_external_files: Passed to ``pandas.read_excel``
        (external files tab).
    :param header_external_files: Passed to ``pandas.read_excel``
        (external files tab).
    """
    if self.verbose:
        print(" ... Processing tab \'Dataset\' ...")
    # Note: Many of these temporary lists could possibly be avoided when
    # using more classes. => Future TODO
    # storage for datasets that belong to one group and create a
    # dataset_group later
    # (local dictionary; not the self.tmp_dict_for_dataset_groups attribute
    # created in initialize_lists)
    tmp_dict_for_dataset_groups = {}
    # name and details per dataset group
    tmp_info_for_dataset_groups = {}
    # datasets per external reference id
    tmp_dict_for_dataset_subgroups = {}
    # store the type of the dataset for each external reference id. This
    # will be used for the external files to create the datasets
    tmp_dict_for_external_reference_store_dataset_type = {}
    tmp_dict_for_external_reference_store_dataset_group = {}
    tmp_dict_for_external_reference_store_dataset_details = {}
    tmp_dict_for_external_reference_store_repository = {}
    tmp_dict_for_external_reference_store_dataset_list_id = {}
    # Read the data
    xls_data_d = pandas.read_excel(self.xls_file,
                                   sheet_name=sheet_name_dataset,
                                   skiprows=skiprows_dataset,
                                   header=header_dataset)
    nr_entries_d = len(xls_data_d['IHM_Dataset_Dataset_list_id'])
    # ---- Pass 1: the rows of the dataset tab ----
    for i in range(nr_entries_d):
        cur_dataset_list_id = xls_data_d['IHM_Dataset_Dataset_list_id'][i]
        # required to create the dataset_group
        cur_dataset_group = xls_data_d['IHM_Dataset_Dataset_group'][i]
        cur_dataset_group_name = \
            None if ('IHM_Dataset_Dataset_group_name' not in
                     xls_data_d.keys() or pandas.isnull(
                        xls_data_d['IHM_Dataset_Dataset_group_name'][i])) \
            else xls_data_d['IHM_Dataset_Dataset_group_name'][i]
        cur_dataset_group_details = \
            None if (
                'IHM_Dataset_Dataset_group_details' not in
                xls_data_d.keys() or pandas.isnull(
                    xls_data_d['IHM_Dataset_Dataset_group_details'][i])) \
            else xls_data_d['IHM_Dataset_Dataset_group_details'][i]
        # required to create the dataset of the correct type
        cur_dataset_data_type = xls_data_d['IHM_Dataset_Data_type'][i]
        # required for the creation of the database location
        # (True only for the exact spellings listed here)
        cur_dataset_DB_flag = xls_data_d['IHM_Dataset_DB_flag'][i] in \
            ['Yes', 'YES', 'yes']
        cur_dataset_DB_name = xls_data_d['IHM_Dataset_DB_name'][i]
        cur_dataset_DB_accession_code = \
            xls_data_d['IHM_Dataset_DB_accession_code'][i]
        cur_dataset_DB_version = \
            None if ('IHM_Dataset_DB_version' not in xls_data_d.keys()
                     or pandas.isnull(
                        xls_data_d['IHM_Dataset_DB_version'][i])) \
            else xls_data_d['IHM_Dataset_DB_version'][i]
        # required for the connection to the external files
        cur_dataset_external_reference_id = \
            xls_data_d['IHM_Dataset_External_reference_id'][i]
        # required for the creation of the repository
        # (provider, type and refers_to are read but only
        # cur_dataset_reference_type and cur_dataset_reference are used
        # further down in this method)
        cur_dataset_reference_provider = \
            xls_data_d['IHM_Dataset_Reference_provider'][i]
        cur_dataset_reference_type = \
            xls_data_d['IHM_Dataset_Reference_type'][i]
        cur_dataset_reference = xls_data_d['IHM_Dataset_Reference'][i]
        cur_dataset_refers_to = xls_data_d['IHM_Dataset_Refers_to'][i]
        cur_dataset_associated_url = \
            xls_data_d['IHM_Dataset_Associated_url'][i]
        cur_dataset_top_directory = \
            None if ('IHM_Dataset_top_directory' not in xls_data_d.keys()
                     or pandas.isnull(
                        xls_data_d['IHM_Dataset_top_directory'][i])) \
            else xls_data_d['IHM_Dataset_top_directory'][i]
        cur_dataset_details = \
            None if ('IHM_Dataset_Details' not in xls_data_d.keys()
                     or pandas.isnull(
                        xls_data_d['IHM_Dataset_Details'][i])) \
            else xls_data_d['IHM_Dataset_Details'][i]
        # if the dataset is a database entry
        if cur_dataset_DB_flag:
            cur_dataset = None
            cur_location = None
            # PDB
            if cur_dataset_DB_name == 'PDB':
                cur_location = ihm.location.PDBLocation(
                    db_code=cur_dataset_DB_accession_code,
                    version=cur_dataset_DB_version,
                    details=cur_dataset_details)
                if cur_location is not None and \
                        cur_location not in self.list_locations:
                    self.list_locations.append(cur_location)
                cur_dataset = ihm.dataset.PDBDataset(cur_location)
            # SASBDB
            elif cur_dataset_DB_name == 'SASBDB':
                cur_location = ihm.location.SASBDBLocation(
                    db_code=cur_dataset_DB_accession_code,
                    version=cur_dataset_DB_version,
                    details=cur_dataset_details)
                if cur_location not in self.list_locations:
                    self.list_locations.append(cur_location)
                cur_dataset = ihm.dataset.SASDataset(cur_location)
            # BMRB
            elif cur_dataset_DB_name == 'BMRB':
                cur_location = ihm.location.BMRBLocation(
                    db_code=cur_dataset_DB_accession_code,
                    version=cur_dataset_DB_version,
                    details=cur_dataset_details)
                if cur_location not in self.list_locations:
                    self.list_locations.append(cur_location)
                cur_dataset = ihm.dataset.NMRDataset(cur_location)
            # TODO: handle other databases
            else:
                # cur_dataset stays None, so the row is skipped below
                print('NOTE! Database name %s not handled.' % (
                    cur_dataset_DB_name))
            if cur_dataset is not None:
                if cur_dataset not in self.list_datasets:
                    self.list_datasets.append(cur_dataset)
                    self.list_dataset_ids.append(cur_dataset_list_id)
                    self.list_dataset_external_reference_ids.append(
                        cur_dataset_external_reference_id)
                # First dataset of a group: create the group entry and
                # remember the group's name and details
                if cur_dataset_group not in \
                        tmp_dict_for_dataset_groups.keys():
                    tmp_dict_for_dataset_groups[cur_dataset_group] = []
                    tmp_dict_for_dataset_groups[cur_dataset_group].append(
                        cur_dataset)
                    tmp_info_for_dataset_groups[cur_dataset_group] = {}
                    tmp_info_for_dataset_groups[cur_dataset_group]['name'] \
                        = cur_dataset_group_name
                    tmp_info_for_dataset_groups[
                        cur_dataset_group]['details'] = \
                        cur_dataset_group_details
                else:
                    tmp_dict_for_dataset_groups[cur_dataset_group].append(
                        cur_dataset)
        # otherwise it is stored in a repository
        else:
            cur_dataset = None
            # NOTE(review): cur_repo is only assigned for the reference type
            # 'DOI'. With the nesting used here any other type would reuse
            # the repository of a previous row (or fail on the first row) --
            # confirm the intended handling of other reference types.
            if cur_dataset_reference_type == 'DOI':
                # !!! TODO: root
                cur_repo = ihm.location.Repository(
                    doi=cur_dataset_reference,
                    root='.',
                    url=cur_dataset_associated_url,
                    top_directory=cur_dataset_top_directory)
            if not occurs_in_list(cur_repo, self.list_repositories):
                self.list_repositories.append(cur_repo)
            # Only the first row seen for an external reference id defines
            # the metadata that the external files are combined with later
            if not cur_dataset_external_reference_id in \
                    tmp_dict_for_external_reference_store_dataset_list_id.keys():
                tmp_dict_for_external_reference_store_dataset_list_id[
                    cur_dataset_external_reference_id] = cur_dataset_list_id
                tmp_dict_for_external_reference_store_dataset_type[
                    cur_dataset_external_reference_id] = cur_dataset_data_type
                tmp_dict_for_external_reference_store_dataset_group[
                    cur_dataset_external_reference_id] = cur_dataset_group
                tmp_dict_for_external_reference_store_repository[
                    cur_dataset_external_reference_id] = cur_repo
                tmp_dict_for_external_reference_store_dataset_details[
                    cur_dataset_external_reference_id] = cur_dataset_details
            # Add dataset group id to the list of dataset groups;
            # Still needs to be filled.
            # NOTE(review): placed at the level of the else-branch so that
            # every repository-backed row registers its group -- confirm.
            if cur_dataset_group not in \
                    tmp_dict_for_dataset_groups.keys():
                tmp_dict_for_dataset_groups[cur_dataset_group] = []
                tmp_info_for_dataset_groups[cur_dataset_group] = {}
                tmp_info_for_dataset_groups[cur_dataset_group]['name'] = \
                    cur_dataset_group_name
                tmp_info_for_dataset_groups[cur_dataset_group]['details'] = \
                    cur_dataset_group_details
    # ---- Pass 2: the rows of the external files tab ----
    # Now read the external file information
    if self.verbose:
        print(" ... Processing tab \'External_files\' ... ")
    xls_data_e = pandas.read_excel(self.xls_file,
                                   sheet_name=sheet_name_external_files,
                                   skiprows=skiprows_external_files,
                                   header=header_external_files)
    nr_entries_e = len(xls_data_e['IHM_External_file_Ordinal'])
    for i in range(nr_entries_e):
        cur_external_files_ordinal = \
            xls_data_e['IHM_External_file_Ordinal'][i]
        cur_external_files_reference_id = \
            xls_data_e['IHM_External_file_Reference_id'][i]
        cur_external_files_file_path = \
            xls_data_e['IHM_External_file_File_path'][i]
        # file format and file size are read but not used in this method
        cur_external_files_file_format = \
            xls_data_e['IHM_External_file_File_format'][i]
        cur_external_files_content_type = \
            xls_data_e['IHM_External_file_Content_type'][i]
        cur_external_files_file_size = \
            xls_data_e['IHM_External_file_File_size'][i]
        cur_external_files_details = \
            None if ('IHM_External_file_Details' not in xls_data_e.keys()
                     or pandas.isnull(
                        xls_data_e['IHM_External_file_Details'][i])) \
            else xls_data_e['IHM_External_file_Details'][i]
        # from the dataset tab
        # (these lookups raise KeyError if the reference id was not
        # registered by a repository-backed row of the dataset tab)
        cur_external_files_dataset_type = \
            tmp_dict_for_external_reference_store_dataset_type[
                cur_external_files_reference_id]
        cur_external_files_dataset_group = \
            tmp_dict_for_external_reference_store_dataset_group[
                cur_external_files_reference_id]
        cur_external_files_dataset_details = \
            tmp_dict_for_external_reference_store_dataset_details[
                cur_external_files_reference_id]
        cur_external_files_repository = \
            tmp_dict_for_external_reference_store_repository[
                cur_external_files_reference_id]
        cur_external_files_dataset_list_id = \
            tmp_dict_for_external_reference_store_dataset_list_id[
                cur_external_files_reference_id]
        cur_location = None
        cur_dataset = None
        # Pick the location class that matches the content type; for an
        # unlisted content type cur_location stays None
        if cur_external_files_content_type == 'Input data or restraints':
            cur_location = \
                ihm.location.InputFileLocation(
                    path=cur_external_files_file_path,
                    repo=cur_external_files_repository,
                    details=cur_external_files_details)
        if cur_external_files_content_type == \
                'Modeling or post-processing output':
            cur_location = \
                ihm.location.OutputFileLocation(
                    path=cur_external_files_file_path,
                    repo=cur_external_files_repository,
                    details=cur_external_files_details)
        if cur_external_files_content_type == \
                'Modeling workflow or script':
            cur_location = \
                ihm.location.WorkflowFileLocation(
                    path=cur_external_files_file_path,
                    repo=cur_external_files_repository,
                    details=cur_external_files_details)
        if cur_external_files_content_type == 'Visualization script':
            cur_location = \
                ihm.location.VisualizationFileLocation(
                    path=cur_external_files_file_path,
                    repo=cur_external_files_repository,
                    details=cur_external_files_details)
        if cur_external_files_content_type == 'Other':
            cur_location = \
                ihm.location.FileLocation(
                    path=cur_external_files_file_path,
                    repo=cur_external_files_repository,
                    details=cur_external_files_details)
        if cur_location not in self.list_external_files_locations:
            self.list_external_files_locations.append(cur_location)
            self.list_external_files_ids.append(cur_external_files_ordinal)
            self.system.locations.append(cur_location)
        # create the dataset
        cur_dataset = None
        if cur_external_files_dataset_type == 'Single molecule FRET data':
            cur_dataset = ihm.dataset.FRETDataset(
                location=cur_location,
                details=cur_external_files_dataset_details)
        elif cur_external_files_dataset_type == 'Ensemble FRET data':
            cur_dataset = ihm.dataset.EnsembleFRETDataset(
                location=cur_location,
                details=cur_external_files_dataset_details)
        elif cur_external_files_dataset_type == 'De Novo model':
            cur_dataset = ihm.dataset.DeNovoModelDataset(
                location=cur_location,
                details=cur_external_files_dataset_details)
        elif cur_external_files_dataset_type == 'Integrative model':
            cur_dataset = ihm.dataset.IntegrativeModelDataset(
                location=cur_location,
                details=cur_external_files_dataset_details)
        elif cur_external_files_dataset_type == 'Other':
            cur_dataset = ihm.dataset.Dataset(
                location=cur_location,
                details=cur_external_files_dataset_details)
        else:
            # any unlisted data type falls back to the generic dataset
            cur_dataset = ihm.dataset.Dataset(
                location=cur_location,
                details=cur_external_files_dataset_details)
        # store the dataset in the list for dataset groups
        if cur_dataset is not None and not occurs_in_list(
                cur_dataset,
                self.list_datasets):
            self.list_datasets.append(cur_dataset)
            self.list_dataset_ids.append(cur_external_files_dataset_list_id)
            # Store the external_files_reference_id as well in order to be
            # able to use it for the Dataset groups
            self.list_dataset_external_reference_ids.append(
                cur_external_files_reference_id)
            if cur_external_files_dataset_group not in \
                    tmp_dict_for_dataset_groups.keys():
                tmp_dict_for_dataset_groups[
                    cur_external_files_dataset_group] = []
            tmp_dict_for_dataset_groups[
                cur_external_files_dataset_group].append(cur_dataset)
        # otherwise, use the previously generated dataset
        else:
            # replace the new object by the stored one with an equal
            # __dict__
            cur_dataset = self.list_datasets[get_index_from_list(
                cur_dataset,
                self.list_datasets)]
            if cur_external_files_dataset_group not in \
                    tmp_dict_for_dataset_groups.keys():
                tmp_dict_for_dataset_groups[
                    cur_external_files_dataset_group] = []
            if cur_dataset is not None:
                tmp_dict_for_dataset_groups[
                    cur_external_files_dataset_group].append(cur_dataset)
        # store the current dataset for the subgroups that correspond to the
        # dataset_list_ids
        # (the subgroups are keyed by the external reference id)
        if cur_external_files_reference_id not in \
                tmp_dict_for_dataset_subgroups.keys():
            tmp_dict_for_dataset_subgroups[
                cur_external_files_reference_id] = []
        if cur_dataset is not None:
            if cur_dataset not in \
                    tmp_dict_for_dataset_subgroups[
                        cur_external_files_reference_id]:
                tmp_dict_for_dataset_subgroups[
                    cur_external_files_reference_id].append(cur_dataset)
        # store the connection between datasets and external file ordinal ids
        if cur_dataset is not None:
            self.list_connection_dataset_external_file_datasets.append(
                cur_dataset)
            self.list_connection_dataset_external_file_ids.append(
                cur_external_files_ordinal)
    # ---- Pass 3: complete the dataset groups ----
    # Go through the Dataset entries again to collect all the external
    # references belonging to one of the dataset groups
    # This is important because an external file could be used in multiple
    # dataset groups
    # Only if external files are present at all.
    for i in range(nr_entries_d):
        cur_dataset_group = \
            xls_data_d['IHM_Dataset_Dataset_group'][i]
        cur_dataset_DB_flag = \
            xls_data_d['IHM_Dataset_DB_flag'][i] in \
            ['Yes', 'YES', 'yes']
        cur_dataset_external_reference_id = \
            xls_data_d['IHM_Dataset_External_reference_id'][i]
        # Only if the dataset entry has an external file and is not in a
        # database
        if not cur_dataset_DB_flag:
            # if the current dataset group is not in the list of dataset
            # groups yet
            if not cur_dataset_group in tmp_dict_for_dataset_groups.keys():
                # add it
                tmp_dict_for_dataset_groups[cur_dataset_group] = []
            # then add the data
            # check for each of the previously collected entries in the
            # subgroups (which were collected by external reference id)
            # whether they are already added
            # (raises KeyError if no external file referenced this id)
            for cur_entry in tmp_dict_for_dataset_subgroups[
                    cur_dataset_external_reference_id]:
                # check whether the entry is already in the list for the
                # dataset groups
                if cur_entry not in \
                        tmp_dict_for_dataset_groups[cur_dataset_group]:
                    tmp_dict_for_dataset_groups[
                        cur_dataset_group].append(cur_entry)
    # ---- Pass 4: create the group objects ----
    # create the dataset_group
    for groupkey in tmp_dict_for_dataset_groups.keys():
        cur_dataset_group = \
            ihm.dataset.DatasetGroup(
                elements=tmp_dict_for_dataset_groups[groupkey],
                name=tmp_info_for_dataset_groups[groupkey]['name'],
                details=tmp_info_for_dataset_groups[groupkey]['details'])
        self.list_dataset_groups.append(cur_dataset_group)
        self.list_dataset_group_ids.append(groupkey)
    # create the subgroups (corresponding to the dataset_list_id)
    for groupkey in tmp_dict_for_dataset_subgroups.keys():
        cur_dataset_subgroup = \
            ihm.dataset.DatasetGroup(tmp_dict_for_dataset_subgroups[groupkey])
        self.list_dataset_subgroup_by_dataset_list_id.append(cur_dataset_subgroup)
        self.list_dataset_subgroup_by_dataset_list_id_ids.append(groupkey)
    # update the locations in the repositories
    self.system.update_locations_in_repositories(self.list_repositories)
def add_ihm_entity(self, sheet_name='Entity', skiprows=3, header=0):
    """Read entity information from the excel sheet.

    For every row of the sheet an ``ihm.Entity`` is created (polymer,
    non-polymer or water). Entities that are not yet known are appended
    to ``self.list_ihm_entities`` together with their molecular entity
    id in ``self.list_ihm_entity_ids``. Each entity is then added to the
    ``ihm.flr.EntityAssembly`` that belongs to the row's entity assembly
    id; the assembly is created on first use.

    Polymer sequences may contain modified residues written as ``(Xyz)``
    (three letters in brackets). For those, a new chemical component is
    created whose canonical code is taken from the canonical sequence
    column at the same residue position.

    :param sheet_name: Name of the excel sheet to read.
    :param skiprows: Number of rows to skip at the top of the sheet.
    :param header: Row (after skipping) that holds the column names.
    :raises KeyError: If a polymer row has an unknown polymer type.
    :raises ValueError: If a polymer sequence cannot be interpreted,
        i.e. it contains codes unknown to the alphabet that are not
        ``(Xyz)`` modified residues, or it does not line up with the
        canonical sequence.
    """
    if self.verbose:
        print(" ... Processing tab \'Entity and Entity Assembly (FLR)\' "
              "...")
    xls_data = pandas.read_excel(self.xls_file,
                                 sheet_name=sheet_name,
                                 skiprows=skiprows,
                                 header=header)
    nr_entries = len(xls_data['IHM_Entity_Ordinal'])
    # The description column is optional.
    has_description = 'IHM_Entity_Description' in xls_data.keys()

    # Polymer type -> (alphabet class, chemical component class).
    # 'polydeoxribonucleotide' is the spelling this converter has always
    # accepted; the correctly spelled variant is accepted as well.
    polymer_classes = {
        'polypeptide(D)': (ihm.DPeptideAlphabet, ihm.DPeptideChemComp),
        'polypeptide(L)': (ihm.LPeptideAlphabet, ihm.LPeptideChemComp),
        'polyribonucleotide': (ihm.RNAAlphabet, ihm.RNAChemComp),
        'polydeoxribonucleotide': (ihm.DNAAlphabet, ihm.DNAChemComp),
        'polydeoxyribonucleotide': (ihm.DNAAlphabet, ihm.DNAChemComp)}

    def parse_modified_sequence(sequence, canonical, chem_comp_class,
                                entity_id):
        """Split a sequence with ``(Xyz)`` residues into a residue list.

        Unmodified residues are kept as one-letter codes; every
        ``(Xyz)`` becomes a new chemical component with the canonical
        code found at the same residue position in ``canonical``.
        """
        residues = []
        seq_index = 0    # position in the non-canonical sequence
        canon_index = 0  # position in the canonical sequence
        while seq_index < len(sequence):
            if canon_index >= len(canonical):
                raise ValueError(
                    "Entity %r: sequence %r has more residues than its "
                    "canonical sequence %r"
                    % (entity_id, sequence, canonical))
            cur_char = sequence[seq_index]
            # If non-canonical and canonical sequence match, this
            # residue is kept as it is.
            if cur_char == canonical[canon_index]:
                residues.append(cur_char)
                seq_index += 1
                canon_index += 1
            # An opening bracket starts a modified residue: the next
            # three letters name the new chemical component, which gets
            # the respective canonical code.
            elif cur_char == '(':
                comp_name = sequence[seq_index + 1:seq_index + 4]
                residues.append(chem_comp_class(
                    id=comp_name,
                    code=comp_name,
                    code_canonical=canonical[canon_index]))
                # Skip '(', the three letters and ')'.
                seq_index += 5
                canon_index += 1
            else:
                # Neither index would advance here, so fail loudly
                # instead of looping forever.
                raise ValueError(
                    "Entity %r: cannot interpret character %r at "
                    "position %d of sequence %r (canonical sequence %r)"
                    % (entity_id, cur_char, seq_index, sequence,
                       canonical))
        return residues

    # Entity (IHM) and Entity assembly (FLR)
    for i in range(nr_entries):
        cur_entity_molecular_entity = \
            xls_data['IHM_Entity_Molecular_entity'][i]
        cur_entity_type = xls_data['IHM_Entity_Entity_type'][i]
        cur_entity_source_method = xls_data['IHM_Entity_Source_method'][i]
        if has_description and not pandas.isnull(
                xls_data['IHM_Entity_Description'][i]):
            cur_entity_description = xls_data['IHM_Entity_Description'][i]
        else:
            cur_entity_description = None
        cur_entity_polymer_type = xls_data['IHM_Entity_Polymer_type'][i]
        cur_entity_polymer_one_letter_code = \
            xls_data['IHM_Entity_Polymer_one_letter_code'][i]
        cur_entity_polymer_one_letter_code_canonical = \
            xls_data['IHM_Entity_Polymer_one_letter_code_canonical'][i]
        cur_entity_nonpolymer_chem_comp_id = \
            xls_data['IHM_Entity_Nonpolymer_chem_comp_ID'][i]

        # Source
        # !!! TODO: For natural and synthetic: extend for the details
        if cur_entity_source_method == 'genetically manipulated source':
            cur_source = ihm.source.Manipulated()
        elif cur_entity_source_method == 'natural source':
            cur_source = ihm.source.Natural()
        elif cur_entity_source_method == 'synthetic source':
            cur_source = ihm.source.Synthetic()
        else:
            cur_source = ihm.source.Source()

        # Any entity type other than the three handled below leaves
        # cur_entity as None.
        cur_entity = None
        # If the entity is a polymer
        if cur_entity_type == 'polymer':
            if cur_entity_polymer_type not in polymer_classes:
                raise KeyError(
                    "Unknown polymer type %r for entity %r"
                    % (cur_entity_polymer_type,
                       cur_entity_molecular_entity))
            alphabet_class, chem_comp_class = \
                polymer_classes[cur_entity_polymer_type]
            cur_alphabet = alphabet_class()
            try:
                cur_entity = ihm.Entity(
                    sequence=cur_entity_polymer_one_letter_code,
                    alphabet=cur_alphabet,
                    description=cur_entity_description,
                    source=cur_source)
            except KeyError:
                # The sequence contains a code the alphabet does not
                # know. The only supported reason is a modified residue
                # denoted by (Xyz), i.e. three letters in brackets.
                if '(' not in cur_entity_polymer_one_letter_code:
                    raise ValueError(
                        "Entity %r: sequence %r contains codes that are "
                        "not part of the alphabet for polymer type %r"
                        % (cur_entity_molecular_entity,
                           cur_entity_polymer_one_letter_code,
                           cur_entity_polymer_type))
                list_of_sequence = parse_modified_sequence(
                    cur_entity_polymer_one_letter_code,
                    cur_entity_polymer_one_letter_code_canonical,
                    chem_comp_class,
                    cur_entity_molecular_entity)
                # Create the entity with the sequence including the
                # modified residues.
                cur_entity = ihm.Entity(
                    sequence=list_of_sequence,
                    alphabet=cur_alphabet,
                    description=cur_entity_description,
                    source=cur_source)
        # If the entity is a non-polymer
        elif cur_entity_type == 'non-polymer':
            cur_entity = ihm.Entity(
                sequence=[ihm.NonPolymerChemComp(
                    id=cur_entity_nonpolymer_chem_comp_id)],
                description=cur_entity_description)
        # If the entity is water
        elif cur_entity_type == 'water':
            cur_entity = ihm.Entity(
                sequence=[ihm.WaterChemComp()],
                description=cur_entity_description)

        # Only store entities that are not known yet.
        if cur_entity not in self.list_ihm_entities:
            self.list_ihm_entities.append(cur_entity)
            self.list_ihm_entity_ids.append(cur_entity_molecular_entity)

        # possibly generate an entity assembly
        cur_entity_assembly_id = \
            xls_data['IHM_Entity_Entity_assembly_id'][i]
        cur_entity_number_of_copies = \
            xls_data['IHM_Entity_Number_of_copies'][i]
        # if the entity_assembly_id is not in the list of
        # entity_assembly_ids
        if cur_entity_assembly_id not in self.list_entity_assembly_ids:
            # create the entity assembly and add the id and the entity
            # assembly to the respective lists
            self.list_entity_assembly_ids.append(cur_entity_assembly_id)
            self.list_entity_assemblies.append(ihm.flr.EntityAssembly())
        # and add the current entity to the respective assembly
        cur_entity_assembly_id_index = \
            self.list_entity_assembly_ids.index(cur_entity_assembly_id)
        self.list_entity_assemblies[
            cur_entity_assembly_id_index].add_entity(
                entity=cur_entity,
                num_copies=cur_entity_number_of_copies)