diff --git a/modbase_pdb_to_cif.py b/modbase_pdb_to_cif.py index 5345174..f035737 100755 --- a/modbase_pdb_to_cif.py +++ b/modbase_pdb_to_cif.py @@ -12,7 +12,8 @@ # Single sequence in a Modeller alignment Sequence = collections.namedtuple( - "Sequence", ["seqtyp", "chain", "method", "gapped", "primary"]) + "Sequence", ["seqtyp", "chain", "method", "gapped", "primary", + "primary_can", "primary_print"]) # Reference sequence database @@ -30,6 +31,7 @@ } one_to_three = {val: key for key, val in three_to_one.items()} +one_to_three['UNK'] = 'UNK' def split_resnum(resnum): @@ -71,9 +73,20 @@ def _read_seq(self, fh): if seqlines[-1].endswith('*'): break gapped = "".join(seqlines)[:-1] + # "Canonical" primary sequence is always a sequence of one-letter + # codes; regular primary sequence is 1-letter for standard amino + # acids, but can be longer for any non-standard residues (currently + # only UNK is handled here, assuming X always means UNK in the + # template). + primary_can = gapped.replace('-', '') + primary = ['UNK' if x == 'X' else x for x in primary_can] + # Primary sequence suitable for printing, e.g. "ACG(UNK)" + primary_print = "".join('(%s)' % x if len(x) > 1 else x + for x in primary) return Sequence( seqtyp=header[0], chain=header[3], method=header[7], - gapped=gapped, primary=gapped.replace('-', '')) + gapped=gapped, primary=primary, primary_can=primary_can, + primary_print=primary_print) class CifWriter: @@ -201,16 +214,17 @@ def write_entity_details(self, sequence3): "pdbx_seq_one_letter_code", "pdbx_seq_one_letter_code_can"]) as lp: if self.align: - if self.align.target.primary != target_primary: + if self.align.target.primary_can != target_primary: raise ValueError( "Model sequence does not match target " - "sequence in alignment:", - target_primary, self.align.target.primary) - p = self.align.template.primary + "canonical sequence in alignment:", + target_primary, self.align.target.primary_can) + p = self.align.template.primary_print + p_can = self.align.template.primary_can lp.write(entity_id=self.template.entity_id, type="polypeptide(L)", nstd_linkage="no", pdbx_seq_one_letter_code=p, - pdbx_seq_one_letter_code_can=p) + pdbx_seq_one_letter_code_can=p_can) lp.write(entity_id=self.target.entity_id, type="polypeptide(L)", nstd_linkage="no", pdbx_seq_one_letter_code=target_primary, @@ -269,9 +283,10 @@ def write_template_details(self, chain_id, tmpbeg, tmpend, tmpasym, "_ma_template_poly", ["template_id", "seq_one_letter_code", "seq_one_letter_code_can"]) as lp: - p = self.align.template.primary + p = self.align.template.primary_print + p_can = self.align.template.primary_can lp.write(template_id=1, seq_one_letter_code=p, - seq_one_letter_code_can=p) + seq_one_letter_code_can=p_can) if self.align: # template_id makes no sense if we have no alignment diff --git a/test/input/align.ali b/test/input/align.ali index f38ed79..c403ddc 100644 --- a/test/input/align.ali +++ b/test/input/align.ali @@ -9,7 +9,7 @@ STSTKDKNSSTSNTEPIPRSNRHGSVVEASRRRRSSTNKSKRPSITEAMMLKEEGLQQFNLHEILSEPAIEEE* >P1;3icqT structureX:3icq:401:T:835:T:3icqT:Seq-Prf (0001):-1.00:-1.00 -PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGP +PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFXEGLRGP D----AFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHN TNERVRPRAWY--LFYRFVKSIKKVNYTE-SSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFET VGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSAL------ENIISVYCSLMAIGNFAKGFPAREEVAW diff --git a/test/input/output_with_align.cif b/test/input/output_with_align.cif index 67aefbd..c11d82c 100644 --- a/test/input/output_with_align.cif +++ b/test/input/output_with_align.cif @@ -83,6 +83,7 @@ SER 'L-peptide linking' SERINE 'C3 H7 N O3' 105.093 THR 'L-peptide linking' THREONINE 'C4 H9 N O3' 119.120 TRP 'L-peptide linking' TRYPTOPHAN 'C11 H12 N2 O2' 204.229 TYR 'L-peptide linking' TYROSINE 'C9 H11 N O3' 181.191 +UNK 'L-peptide linking' UNKNOWN 'C4 H9 N O2' 103.121 VAL 'L-peptide linking' VALINE 'C5 H11 N O2' 117.148 # # @@ -102,8 +103,8 @@ _entity_poly.nstd_linkage _entity_poly.pdbx_seq_one_letter_code _entity_poly.pdbx_seq_one_letter_code_can 1 polypeptide(L) no -PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY -PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY +PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIF(UNK)EGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY +PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFXEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY 2 polypeptide(L) no ETLDSMIELFKDYKPGSITLENITRLCQTLGLESFTEELSNELSRLSTASKIIVIDVDYNKKQDRIQDVKLVLASNFDNFDYFNQRDGEHEKSNILLNSLTKYPDLKAFHNNLKFLYLLDAYSHIESDSTSHNNGSSDKSLDSSNASFNNQGKLDLFKYFTELSHYIRQCFQDNCCDFKVRTNLNDKFGIYILTQGINGKEVPLAKIYLEENKSDSQYRFYEYIYSQETKSWINESAENFSNGISLVMEIVANAKESNYTDLIWFPEDFISPELIIDKVTCSSNSSSSPPIIDLFSNNNYNSRIQLMNDFTTKLINIKKFDISNDNLDLISEILKWVQWSRIVLQNVFKLVSTPSSNSNSSELEPDYQAPFSTSTKDKNSSTSNTEPIPRSNRHGSVVEASRRRRSSTNKSKRPSITEAMMLKEEGLQQFNLHEILSEPAIEEE ETLDSMIELFKDYKPGSITLENITRLCQTLGLESFTEELSNELSRLSTASKIIVIDVDYNKKQDRIQDVKLVLASNFDNFDYFNQRDGEHEKSNILLNSLTKYPDLKAFHNNLKFLYLLDAYSHIESDSTSHNNGSSDKSLDSSNASFNNQGKLDLFKYFTELSHYIRQCFQDNCCDFKVRTNLNDKFGIYILTQGINGKEVPLAKIYLEENKSDSQYRFYEYIYSQETKSWINESAENFSNGISLVMEIVANAKESNYTDLIWFPEDFISPELIIDKVTCSSNSSSSPPIIDLFSNNNYNSRIQLMNDFTTKLINIKKFDISNDNLDLISEILKWVQWSRIVLQNVFKLVSTPSSNSNSSELEPDYQAPFSTSTKDKNSSTSNTEPIPRSNRHGSVVEASRRRRSSTNKSKRPSITEAMMLKEEGLQQFNLHEILSEPAIEEE @@ -182,7 +183,7 @@ _entity_poly_seq.hetero 1 66 TYR . 1 67 ILE . 1 68 PHE . -1 69 GLY . +1 69 UNK . 1 70 GLU . 1 71 GLY . 1 72 LEU . @@ -1027,8 +1028,8 @@ _ma_template_poly.template_id _ma_template_poly.seq_one_letter_code _ma_template_poly.seq_one_letter_code_can 1 -PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY -PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY +PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIF(UNK)EGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY +PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFXEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY # # loop_ @@ -1108,7 +1109,7 @@ _ma_alignment.alignment_id _ma_alignment.target_template_flag _ma_alignment.sequence 1 1 2 -PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPD----AFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWY--LFYRFVKSIKKVNYTE-SSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSAL------ENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFN--EDIRGAVRFTSGRIIN--VVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY +PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFXEGLRGPD----AFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWY--LFYRFVKSIKKVNYTE-SSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSAL------ENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFN--EDIRGAVRFTSGRIIN--VVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY 2 1 1 ETLDSMIELFKDYKPGSITLENITRLCQTLGLESFTEELSNELSRLSTASKIIVIDVDYNKKQDRIQDVKLVLASNFDNFDYFNQRDGEHEKSNILLNSLTKYPDLKAFHNNLK--FLYLLDAYSHIESDSTSHNNGSSDKSLDSSNASFNNQGKLDLFKYFTELSHYIRQCFQDNCCDFKVRTNLNDKFGIYILTQGINGKEVPLAKIYLEENKSDSQYRFYEYIYS--QETKSWINESAENFSNGISLVMEIVANAKESNYTDLIWFPEDFISPELIIDKVTCSSNSSSSPPIIDLFSNNNYNSRIQLMNDFTTKLINIKKFDISNDNLDLISEILKWVQWSRIVLQNVFKLVSTPSSNSNSSELEPDYQAPFSTSTKDKNSSTSNTEPIPRSNRHGSVVEASRRRRSSTNKSKRPSITEAMMLKEEGLQQFNLHEILSEPAIEEE #