Skip to content

Commit

Permalink
Handle UNK/X in template sequence
Browse files Browse the repository at this point in the history
If the template sequence contains any X residues
(ModBase never generates models containing X),
assume these map to UNK and populate the relevant
tables. Closes #5.
  • Loading branch information
benmwebb committed Dec 2, 2021
1 parent 38e58bf commit f2f8907
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 16 deletions.
33 changes: 24 additions & 9 deletions modbase_pdb_to_cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

# Single sequence in a Modeller alignment
Sequence = collections.namedtuple(
"Sequence", ["seqtyp", "chain", "method", "gapped", "primary"])
"Sequence", ["seqtyp", "chain", "method", "gapped", "primary",
"primary_can", "primary_print"])


# Reference sequence database
Expand All @@ -30,6 +31,7 @@
}

one_to_three = {val: key for key, val in three_to_one.items()}
one_to_three['UNK'] = 'UNK'


def split_resnum(resnum):
Expand Down Expand Up @@ -71,9 +73,20 @@ def _read_seq(self, fh):
if seqlines[-1].endswith('*'):
break
gapped = "".join(seqlines)[:-1]
# "Canonical" primary sequence is always a sequence of one-letter
# codes; regular primary sequence is 1-letter for standard amino
# acids, but can be longer for any non-standard residues (currently
# only UNK is handled here, assuming X always means UNK in the
# template).
primary_can = gapped.replace('-', '')
primary = ['UNK' if x == 'X' else x for x in primary_can]
# Primary sequence suitable for printing, e.g. "ACG(UNK)"
primary_print = "".join('(%s)' % x if len(x) > 1 else x
for x in primary)
return Sequence(
seqtyp=header[0], chain=header[3], method=header[7],
gapped=gapped, primary=gapped.replace('-', ''))
gapped=gapped, primary=primary, primary_can=primary_can,
primary_print=primary_print)


class CifWriter:
Expand Down Expand Up @@ -201,16 +214,17 @@ def write_entity_details(self, sequence3):
"pdbx_seq_one_letter_code",
"pdbx_seq_one_letter_code_can"]) as lp:
if self.align:
if self.align.target.primary != target_primary:
if self.align.target.primary_can != target_primary:
raise ValueError(
"Model sequence does not match target "
"sequence in alignment:",
target_primary, self.align.target.primary)
p = self.align.template.primary
"canonical sequence in alignment:",
target_primary, self.align.target.primary_can)
p = self.align.template.primary_print
p_can = self.align.template.primary_can
lp.write(entity_id=self.template.entity_id,
type="polypeptide(L)", nstd_linkage="no",
pdbx_seq_one_letter_code=p,
pdbx_seq_one_letter_code_can=p)
pdbx_seq_one_letter_code_can=p_can)
lp.write(entity_id=self.target.entity_id,
type="polypeptide(L)", nstd_linkage="no",
pdbx_seq_one_letter_code=target_primary,
Expand Down Expand Up @@ -269,9 +283,10 @@ def write_template_details(self, chain_id, tmpbeg, tmpend, tmpasym,
"_ma_template_poly",
["template_id", "seq_one_letter_code",
"seq_one_letter_code_can"]) as lp:
p = self.align.template.primary
p = self.align.template.primary_print
p_can = self.align.template.primary_can
lp.write(template_id=1, seq_one_letter_code=p,
seq_one_letter_code_can=p)
seq_one_letter_code_can=p_can)

if self.align:
# template_id makes no sense if we have no alignment
Expand Down
2 changes: 1 addition & 1 deletion test/input/align.ali
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ STSTKDKNSSTSNTEPIPRSNRHGSVVEASRRRRSSTNKSKRPSITEAMMLKEEGLQQFNLHEILSEPAIEEE*

>P1;3icqT
structureX:3icq:401:T:835:T:3icqT:Seq-Prf (0001):-1.00:-1.00
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGP
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFXEGLRGP
D----AFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHN
TNERVRPRAWY--LFYRFVKSIKKVNYTE-SSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFET
VGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSAL------ENIISVYCSLMAIGNFAKGFPAREEVAW
Expand Down
13 changes: 7 additions & 6 deletions test/input/output_with_align.cif
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ SER 'L-peptide linking' SERINE 'C3 H7 N O3' 105.093
THR 'L-peptide linking' THREONINE 'C4 H9 N O3' 119.120
TRP 'L-peptide linking' TRYPTOPHAN 'C11 H12 N2 O2' 204.229
TYR 'L-peptide linking' TYROSINE 'C9 H11 N O3' 181.191
UNK 'L-peptide linking' UNKNOWN 'C4 H9 N O2' 103.121
VAL 'L-peptide linking' VALINE 'C5 H11 N O2' 117.148
#
#
Expand All @@ -102,8 +103,8 @@ _entity_poly.nstd_linkage
_entity_poly.pdbx_seq_one_letter_code
_entity_poly.pdbx_seq_one_letter_code_can
1 polypeptide(L) no
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIF(UNK)EGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFXEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
2 polypeptide(L) no
ETLDSMIELFKDYKPGSITLENITRLCQTLGLESFTEELSNELSRLSTASKIIVIDVDYNKKQDRIQDVKLVLASNFDNFDYFNQRDGEHEKSNILLNSLTKYPDLKAFHNNLKFLYLLDAYSHIESDSTSHNNGSSDKSLDSSNASFNNQGKLDLFKYFTELSHYIRQCFQDNCCDFKVRTNLNDKFGIYILTQGINGKEVPLAKIYLEENKSDSQYRFYEYIYSQETKSWINESAENFSNGISLVMEIVANAKESNYTDLIWFPEDFISPELIIDKVTCSSNSSSSPPIIDLFSNNNYNSRIQLMNDFTTKLINIKKFDISNDNLDLISEILKWVQWSRIVLQNVFKLVSTPSSNSNSSELEPDYQAPFSTSTKDKNSSTSNTEPIPRSNRHGSVVEASRRRRSSTNKSKRPSITEAMMLKEEGLQQFNLHEILSEPAIEEE
ETLDSMIELFKDYKPGSITLENITRLCQTLGLESFTEELSNELSRLSTASKIIVIDVDYNKKQDRIQDVKLVLASNFDNFDYFNQRDGEHEKSNILLNSLTKYPDLKAFHNNLKFLYLLDAYSHIESDSTSHNNGSSDKSLDSSNASFNNQGKLDLFKYFTELSHYIRQCFQDNCCDFKVRTNLNDKFGIYILTQGINGKEVPLAKIYLEENKSDSQYRFYEYIYSQETKSWINESAENFSNGISLVMEIVANAKESNYTDLIWFPEDFISPELIIDKVTCSSNSSSSPPIIDLFSNNNYNSRIQLMNDFTTKLINIKKFDISNDNLDLISEILKWVQWSRIVLQNVFKLVSTPSSNSNSSELEPDYQAPFSTSTKDKNSSTSNTEPIPRSNRHGSVVEASRRRRSSTNKSKRPSITEAMMLKEEGLQQFNLHEILSEPAIEEE
Expand Down Expand Up @@ -182,7 +183,7 @@ _entity_poly_seq.hetero
1 66 TYR .
1 67 ILE .
1 68 PHE .
1 69 GLY .
1 69 UNK .
1 70 GLU .
1 71 GLY .
1 72 LEU .
Expand Down Expand Up @@ -1027,8 +1028,8 @@ _ma_template_poly.template_id
_ma_template_poly.seq_one_letter_code
_ma_template_poly.seq_one_letter_code_can
1
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIF(UNK)EGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFXEGLRGPDAFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWYLFYRFVKSIKKVNYTESSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSALENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFNEDIRGAVRFTSGRIINVVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
#
#
loop_
Expand Down Expand Up @@ -1108,7 +1109,7 @@ _ma_alignment.alignment_id
_ma_alignment.target_template_flag
_ma_alignment.sequence
1 1 2
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFGEGLRGPD----AFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWY--LFYRFVKSIKKVNYTE-SSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSAL------ENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFN--EDIRGAVRFTSGRIIN--VVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
PDSEEEAEFQEMRKKLKIFQDTINSIDSSLFSSYMYSAITSSLSTAATLSPENSWQLIEFALYETYIFXEGLRGPD----AFFNEVDKSPTVLSQILALVTTSQVCRHPHPLVQLLYMEILVRYASFFDYESAAIPALIEYFVGPRGIHNTNERVRPRAWY--LFYRFVKSIKKVNYTE-SSLAMLGDLLNISVSPVTDMDAPVPTLNSSIRNSDFNSQLYLFETVGVLISSGNLTPEEQALYCDSLINALIGKANAALSSDLSAL------ENIISVYCSLMAIGNFAKGFPAREEVAWLASFNKASDEIFLILDRMGFN--EDIRGAVRFTSGRIIN--VVGPDMLPKVPQLISILLNSIDMNELVDVLSFISQLIHIYKDNMMEITNRMLPTLLMRIFSSLSAAPQGTDDAVKQNDLRKSYISFILQLLNKGFGSILFTEENQVY
2 1 1
ETLDSMIELFKDYKPGSITLENITRLCQTLGLESFTEELSNELSRLSTASKIIVIDVDYNKKQDRIQDVKLVLASNFDNFDYFNQRDGEHEKSNILLNSLTKYPDLKAFHNNLK--FLYLLDAYSHIESDSTSHNNGSSDKSLDSSNASFNNQGKLDLFKYFTELSHYIRQCFQDNCCDFKVRTNLNDKFGIYILTQGINGKEVPLAKIYLEENKSDSQYRFYEYIYS--QETKSWINESAENFSNGISLVMEIVANAKESNYTDLIWFPEDFISPELIIDKVTCSSNSSSSPPIIDLFSNNNYNSRIQLMNDFTTKLINIKKFDISNDNLDLISEILKWVQWSRIVLQNVFKLVSTPSSNSNSSELEPDYQAPFSTSTKDKNSSTSNTEPIPRSNRHGSVVEASRRRRSSTNKSKRPSITEAMMLKEEGLQQFNLHEILSEPAIEEE
#
Expand Down

0 comments on commit f2f8907

Please sign in to comment.