From 5ffb871f42688348393762bf9a63fbfb3a71d781 Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Tue, 12 Mar 2024 13:59:25 +0000 Subject: [PATCH 01/13] First Version of FASTA with sequence sources --- .../uniprot/core/uniparc/UniParcDatabase.java | 95 +++++++----- .../core/uniparc/UniParcDatabaseTest.java | 10 ++ .../uniprot/core/parser/fasta/FastaUtils.java | 17 ++ .../core/parser/fasta/UniRefFastaParser.java | 22 +-- .../{ => uniparc}/UniParcFastaParser.java | 16 +- .../uniparc/UniParcProteomeFastaParser.java | 86 +++++++++++ .../uniprot/UniProtKBFastaParserWriter.java | 17 +- .../core/parser/fasta/FastaUtilsTest.java | 54 +++++++ .../parser/fasta/UniParcFastaParserTest.java | 116 -------------- .../fasta/uniparc/UniParcFastaParserTest.java | 44 ++++++ .../uniparc/UniParcFastaParserTestUtils.java | 59 +++++++ .../UniParcProteomeFastaParserTest.java | 146 ++++++++++++++++++ 12 files changed, 482 insertions(+), 200 deletions(-) create mode 100644 core-parser/src/main/java/org/uniprot/core/parser/fasta/FastaUtils.java rename core-parser/src/main/java/org/uniprot/core/parser/fasta/{ => uniparc}/UniParcFastaParser.java (64%) create mode 100644 core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java create mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/FastaUtilsTest.java delete mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/UniParcFastaParserTest.java create mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java create mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java create mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java diff --git a/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcDatabase.java b/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcDatabase.java index d3caa5501..cf58f6c74 100644 --- a/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcDatabase.java +++ b/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcDatabase.java @@ -6,92 +6,99 @@ import org.uniprot.core.util.EnumDisplay; public enum UniParcDatabase implements Database, EnumDisplay { - EG_BACTERIA(900, "EnsemblBacteria", true, "https://www.ensemblgenomes.org/id/%id"), - EG_FUNGI(1000, "EnsemblFungi", true, "https://www.ensemblgenomes.org/id/%id"), - EG_METAZOA(1100, "EnsemblMetazoa", true, "https://www.ensemblgenomes.org/id/%id"), - EG_PLANTS(1200, "EnsemblPlants", true, "https://www.ensemblgenomes.org/id/%id"), - EG_PROTISTS(1300, "EnsemblProtists", true, "https://www.ensemblgenomes.org/id/%id"), - - EMBL(300, "EMBL", true, "https://www.ebi.ac.uk/ena/browser/view/%id"), - EMBL_CON(400, "EMBL_CON", true, "https://www.ebi.ac.uk/ena/browser/view/%id"), - EMBL_TPA(500, "EMBL_TPA", false, "https://www.ebi.ac.uk/ena/browser/view/%id"), - EMBL_TSA(600, "EMBL_TSA", true, "https://www.ebi.ac.uk/ena/browser/view/%id"), - EMBLWGS(700, "EMBLWGS", true, "https://www.ebi.ac.uk/ena/browser/view/%id"), - - ENSEMBL_VERTEBRATE(800, "Ensembl", true, "https://www.ensembl.org/id/%id"), - ENSEMBL_RAPID(1350, "EnsemblRapid", true, "https://rapid.ensembl.org/id/%id"), - - EPO(1400, "EPO", true, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=epo_prt&id=%id"), - FLYBASE(1500, "FlyBase", true, "https://flybase.org/reports/%id.html"), + EG_BACTERIA(900, "EnsemblBacteria", true, true, "https://www.ensemblgenomes.org/id/%id"), + EG_FUNGI(1000, "EnsemblFungi", true, true, "https://www.ensemblgenomes.org/id/%id"), + EG_METAZOA(1100, "EnsemblMetazoa", true, true, "https://www.ensemblgenomes.org/id/%id"), + EG_PLANTS(1200, "EnsemblPlants", true, true, "https://www.ensemblgenomes.org/id/%id"), + EG_PROTISTS(1300, "EnsemblProtists", true, true, "https://www.ensemblgenomes.org/id/%id"), + + EMBL(300, "EMBL", true, true, "https://www.ebi.ac.uk/ena/browser/view/%id"), + EMBL_CON(400, "EMBL_CON", true, true, "https://www.ebi.ac.uk/ena/browser/view/%id"), + EMBL_TPA(500, "EMBL_TPA", false, true, "https://www.ebi.ac.uk/ena/browser/view/%id"), + EMBL_TSA(600, "EMBL_TSA", true, true, "https://www.ebi.ac.uk/ena/browser/view/%id"), + EMBLWGS(700, "EMBLWGS", true, true, "https://www.ebi.ac.uk/ena/browser/view/%id"), + + ENSEMBL_VERTEBRATE(800, "Ensembl", true, true, "https://www.ensembl.org/id/%id"), + ENSEMBL_RAPID(1350, "EnsemblRapid", true, true, "https://rapid.ensembl.org/id/%id"), + + EPO(1400, "EPO", true, false, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=epo_prt&id=%id"), + FLYBASE(1500, "FlyBase", true,false, "https://flybase.org/reports/%id.html"), FUSION_GDB( 1550, "FusionGDB", true, + false, "https://compbio.uth.edu/FusionGDB2/gene_search_result.cgi?type=quick_search&quick_search=%id"), - H_INV(1600, "H-InvDB", false), - IPI(1700, "IPI", false), + H_INV(1600, "H-InvDB",false, false), + IPI(1700, "IPI",false, false), - JPO(1800, "JPO", true, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=jpo_prt&id=%id"), - KIPO(1900, "KIPO", true, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=kipo_prt&id=%id"), - PATRIC(2000, "PATRIC", true, "https://www.patricbrc.org/view/Feature/%id"), + JPO(1800, "JPO", true,false, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=jpo_prt&id=%id"), + KIPO(1900, "KIPO", true,false, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=kipo_prt&id=%id"), + PATRIC(2000, "PATRIC", true,false, "https://www.patricbrc.org/view/Feature/%id"), PDB( 2100, "PDB", true, + false, "https://www.ebi.ac.uk/pdbe/entry/pdb/%id"), // need to remove the chain, eg "4q8n_A", // just use "4q8n" as id - PIR(2200, "PIR", false), + PIR(2200, "PIR",false, false), - PIRARC(2300, "PIRARC", false), - PRF(2400, "PRF", false, "http://www.prf.or.jp/cgi-bin/seqget.pl?id=%id"), - REFSEQ(2500, "RefSeq", true, "https://www.ncbi.nlm.nih.gov/protein/%id"), - REMTREMBL(2600, "REMTREMBL", false), + PIRARC(2300, "PIRARC",false, false), + PRF(2400, "PRF",false, false, "http://www.prf.or.jp/cgi-bin/seqget.pl?id=%id"), + REFSEQ(2500, "RefSeq", true, true,"https://www.ncbi.nlm.nih.gov/protein/%id"), + REMTREMBL(2600, "REMTREMBL",false, false), SEED( 2700, "SEED", true, + false, "https://pubseed.theseed.org/seedviewer.cgi?page=Annotation&feature=%id"), - SGD(2800, "SGD", true, "https://www.yeastgenome.org/locus/%id"), - SWISSPROT(100, "UniProtKB/Swiss-Prot", true, "https://www.uniprot.org/uniprot/%id"), + SGD(2800, "SGD", true, false,"https://www.yeastgenome.org/locus/%id"), + SWISSPROT(100, "UniProtKB/Swiss-Prot", true, false,"https://www.uniprot.org/uniprot/%id"), SWISSPROT_VARSPLIC( 200, "UniProtKB/Swiss-Prot protein isoforms", true, + false, "https://www.uniprot.org/uniprot/%id"), // swissprot isoform TAIR_ARABIDOPSIS( 2900, "TAIR", true, + false, "https://www.arabidopsis.org/servlets/TairObject?type=aa_sequence&name=%id"), - TREMBL(100, "UniProtKB/TrEMBL", true, "https://www.uniprot.org/uniprot/%id"), + TREMBL(100, "UniProtKB/TrEMBL", true, false, "https://www.uniprot.org/uniprot/%id"), - TREMBLNEW(3000, "TREMBLNEW", false), - TREMBL_VARSPLIC(3100, "TREMBL_VARSPLIC", false), - TROME(3200, "TROME", true), // no link - UNIMES(3300, "UNIMES", false), - USPTO(3400, "USPTO", true, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uspto_prt&id=%id"), + TREMBLNEW(3000, "TREMBLNEW", false, false), + TREMBL_VARSPLIC(3100, "TREMBL_VARSPLIC", false, false), + TROME(3200, "TROME", true, false), // no link + UNIMES(3300, "UNIMES", false, false), + USPTO(3400, "USPTO", true, false, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uspto_prt&id=%id"), - VECTORBASE(3500, "VectorBase", false), - VEGA(3600, "VEGA", true, "https://vega.sanger.ac.uk/id/%id"), - WORMBASE_PARASITE(3700, "WBParaSite", true, "https://parasite.wormbase.org/id/%id"), - WORMBASE(3800, "WormBase", true, "https://wormbase.org/db/seq/protein?name=%id;class=CDS"); + VECTORBASE(3500, "VectorBase", false, false), + VEGA(3600, "VEGA", true, false, "https://vega.sanger.ac.uk/id/%id"), + WORMBASE_PARASITE(3700, "WBParaSite", true, true, "https://parasite.wormbase.org/id/%id"), + WORMBASE(3800, "WormBase", true, true,"https://wormbase.org/db/seq/protein?name=%id;class=CDS"); private final String displayName; private final boolean alive; private final String url; private final int index; + private final boolean source; - UniParcDatabase(int index, String displayName, boolean alive) { - this(index, displayName, alive, ""); + UniParcDatabase(int index, String displayName, boolean alive, boolean source) { + this(index, displayName, alive, source, ""); } - UniParcDatabase(int index, String displayName, boolean alive, String url) { + UniParcDatabase(int index, String displayName, boolean alive, boolean source,String url) { this.index = index; this.displayName = displayName; this.alive = alive; this.url = url; + this.source = source; } public int getIndex() { @@ -110,6 +117,10 @@ public String getUrl() { return url; } + public boolean isSource() { + return source; + } + public static @Nonnull UniParcDatabase typeOf(@Nonnull String displayName) { return EnumDisplay.typeOf(displayName, UniParcDatabase.class); } diff --git a/core-domain/src/test/java/org/uniprot/core/uniparc/UniParcDatabaseTest.java b/core-domain/src/test/java/org/uniprot/core/uniparc/UniParcDatabaseTest.java index 082858429..878ab96da 100644 --- a/core-domain/src/test/java/org/uniprot/core/uniparc/UniParcDatabaseTest.java +++ b/core-domain/src/test/java/org/uniprot/core/uniparc/UniParcDatabaseTest.java @@ -64,6 +64,16 @@ void hasNoUrl() { assertEquals("", UniParcDatabase.VECTORBASE.getUrl()); } + @Test + void isSource() { + assertTrue( UniParcDatabase.EMBL.isSource()); + } + + @Test + void isNotSource() { + assertFalse( UniParcDatabase.FLYBASE.isSource()); + } + @Test void canGetIndex() { assertEquals(100, UniParcDatabase.SWISSPROT.getIndex()); diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/FastaUtils.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/FastaUtils.java new file mode 100644 index 000000000..6faabd814 --- /dev/null +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/FastaUtils.java @@ -0,0 +1,17 @@ +package org.uniprot.core.parser.fasta; + +public class FastaUtils { + + public static String parseSequence(String sequence) { + StringBuilder sb = new StringBuilder(); + int columnCounter = 0; + for (char c : sequence.toCharArray()) { + if (columnCounter % 60 == 0 && columnCounter > 0) { + sb.append("\n"); + } + sb.append(c); + columnCounter++; + } + return sb.toString(); + } +} diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/UniRefFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/UniRefFastaParser.java index 5f5e0224e..984736cba 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/UniRefFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/UniRefFastaParser.java @@ -4,6 +4,8 @@ import org.uniprot.core.uniref.UniRefEntry; import org.uniprot.core.uniref.UniRefEntryLight; +import static org.uniprot.core.parser.fasta.FastaUtils.parseSequence; + /** * @author jluo * @date: 22 Aug 2019 @@ -15,30 +17,14 @@ private UniRefFastaParser() {} public static String toFasta(UniRefEntryLight entry) { StringBuilder sb = new StringBuilder(); sb.append(getHeader(entry)).append("\n"); - int columnCounter = 0; - String sequence = entry.getRepresentativeMember().getSequence().getValue(); - for (char c : sequence.toCharArray()) { - if (columnCounter % 60 == 0 && columnCounter > 0) { - sb.append("\n"); - } - sb.append(c); - columnCounter++; - } + sb.append(parseSequence(entry.getRepresentativeMember().getSequence().getValue())); return sb.toString(); } public static String toFasta(UniRefEntry entry) { StringBuilder sb = new StringBuilder(); sb.append(getHeader(entry)).append("\n"); - String sequence = entry.getRepresentativeMember().getSequence().getValue(); - int columnCounter = 0; - for (char c : sequence.toCharArray()) { - if (columnCounter % 60 == 0 && columnCounter > 0) { - sb.append("\n"); - } - columnCounter++; - sb.append(c); - } + sb.append(parseSequence(entry.getRepresentativeMember().getSequence().getValue())); return sb.toString(); } diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/UniParcFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java similarity index 64% rename from core-parser/src/main/java/org/uniprot/core/parser/fasta/UniParcFastaParser.java rename to core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java index 5e7a860b5..0742c7240 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/UniParcFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java @@ -1,12 +1,15 @@ -package org.uniprot.core.parser.fasta; +package org.uniprot.core.parser.fasta.uniparc; import org.uniprot.core.uniparc.UniParcEntry; +import static org.uniprot.core.parser.fasta.FastaUtils.*; + /** * @author jluo * @date: 24 Jun 2019 */ public class UniParcFastaParser { + public static String toFasta(UniParcEntry entry) { StringBuilder sb = new StringBuilder(); String status = "active"; @@ -18,15 +21,8 @@ public static String toFasta(UniParcEntry entry) { sb.append(">").append(entry.getUniParcId().getValue()).append(" "); sb.append("status=").append(status); sb.append("\n"); - int columnCounter = 0; - String sequence = entry.getSequence().getValue(); - for (char c : sequence.toCharArray()) { - if (columnCounter % 60 == 0 && columnCounter > 0) { - sb.append("\n"); - } - sb.append(c); - columnCounter++; - } + sb.append(parseSequence(entry.getSequence().getValue())); return sb.toString(); } } + diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java new file mode 100644 index 000000000..77ebef0e8 --- /dev/null +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -0,0 +1,86 @@ +package org.uniprot.core.parser.fasta.uniparc; + +import org.uniprot.core.uniparc.UniParcCrossReference; +import org.uniprot.core.uniparc.UniParcDatabase; +import org.uniprot.core.uniparc.UniParcEntry; +import org.uniprot.core.uniprotkb.taxonomy.Organism; +import org.uniprot.core.util.Utils; + +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +import static org.uniprot.core.parser.fasta.FastaUtils.parseSequence; + +public class UniParcProteomeFastaParser { + + private static final Set uniProtDatabases = Set.of( + UniParcDatabase.SWISSPROT, UniParcDatabase.TREMBL); + + public static String toFasta(UniParcEntry entry, String proteomeID) { + String id = entry.getUniParcId().getValue(); + Set sourceIds = new HashSet<>(); + Set organisms = new HashSet<>(); + Set component = new HashSet<>(); + entry.getUniParcCrossReferences().stream() + .filter(UniParcCrossReference::isActive) + .filter(xref -> Objects.nonNull(xref.getProteomeId())) + .filter(xref -> xref.getProteomeId().equals(proteomeID)) + .forEach(xref -> { + if(xref.hasDatabase() && xref.getDatabase().isSource()){ + sourceIds.add(xref.getDatabase().getName() + ":" + xref.getId()); + } + if(Utils.notNull(xref.getOrganism())){ + organisms.add(xref.getOrganism()); + } + if(Utils.notNullNotEmpty(xref.getComponent())){ + component.add(xref.getComponent()); + } + }); + StringBuilder sb = new StringBuilder(); + sb.append(">").append(id); + if(!sourceIds.isEmpty()){ + sb.append("|").append(String.join("|", sourceIds)); + } + sb.append(" UP=").append(proteomeID); + if(!component.isEmpty()){ + sb.append(":").append(String.join("|", component)); + } + sb.append(parseOrganismAndAccession(entry.getUniParcCrossReferences(), organisms)); + sb.append("\n"); + sb.append(parseSequence(entry.getSequence().getValue())); + return sb.toString(); + } + + private static String parseOrganismAndAccession(List xrefs, Set organisms) { + StringBuilder sb = new StringBuilder(); + if(!organisms.isEmpty()) { + Organism organism = organisms.stream().findFirst().get(); + if (organism.getTaxonId() > 0L) { + sb.append(" OX=").append(organism.getTaxonId()); + } + if (organism.hasScientificName()) { + sb.append(" OS=").append(organism.getScientificName()); + } + if (organism.getTaxonId() > 0L) { + Set accessions = new HashSet<>(); + xrefs.stream() + .filter(xref -> filterOrganism(xref, organism)) + .filter(xref -> uniProtDatabases.contains(xref.getDatabase())) + .map(UniParcCrossReference::getId) + .filter(Objects::nonNull) + .forEach(accessions::add); + if(!accessions.isEmpty()) { + sb.append(" AC=").append(String.join("|", accessions)); + } + } + } + return sb.toString(); + } + + private static boolean filterOrganism(UniParcCrossReference xref, Organism organism) { + return xref.getOrganism() != null && + xref.getOrganism().getTaxonId() == organism.getTaxonId(); + } +} diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniprot/UniProtKBFastaParserWriter.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniprot/UniProtKBFastaParserWriter.java index 9a06ee943..c53ae126b 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniprot/UniProtKBFastaParserWriter.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniprot/UniProtKBFastaParserWriter.java @@ -1,6 +1,5 @@ package org.uniprot.core.parser.fasta.uniprot; -import org.uniprot.core.Sequence; import org.uniprot.core.fasta.UniProtKBFasta; import org.uniprot.core.uniprotkb.ProteinExistence; import org.uniprot.core.uniprotkb.UniProtKBEntryType; @@ -9,6 +8,8 @@ import java.util.Objects; +import static org.uniprot.core.parser.fasta.FastaUtils.*; + /** * @author lgonzales * @since 22/10/2020 @@ -45,7 +46,7 @@ static String toString(UniProtKBFasta entry) { } sb.append("\n"); - sb.append(getSequence(entry.getSequence())); + sb.append(parseSequence(entry.getSequence().getValue())); return sb.toString(); } @@ -80,16 +81,4 @@ private static String getProteinName(UniProtKBFasta entry) { } return desc.toString(); } - - private static String getSequence(Sequence sequence) { - StringBuilder sb = new StringBuilder(); - int columnCounter = 0; - for (char c : sequence.getValue().toCharArray()) { - sb.append(c); - if ((++columnCounter % 60 == 0) && (columnCounter < sequence.getLength())) { - sb.append("\n"); - } - } - return sb.toString(); - } } diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/FastaUtilsTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/FastaUtilsTest.java new file mode 100644 index 000000000..7f966f1a7 --- /dev/null +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/FastaUtilsTest.java @@ -0,0 +1,54 @@ +package org.uniprot.core.parser.fasta; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class FastaUtilsTest { + @Test + void parseSequenceOneLineSimple() { + String input = getSequence(10); + String result = FastaUtils.parseSequence(input); + assertEquals(input, result); + } + + @Test + void parseSequenceOneLineFullDoNotAddLineBreak() { + String input = getSequence(60); + String result = FastaUtils.parseSequence(input); + assertEquals(input, result); + } + + @Test + void parseSequenceTwoLines() { + String input = getSequence(61); + String result = FastaUtils.parseSequence(input); + String expect = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n" + + "A"; + assertEquals(expect, result); + } + + @Test + void parseSequenceTwoLinesFull() { + String input = getSequence(120); + String result = FastaUtils.parseSequence(input); + String expect = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n" + + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + assertEquals(expect, result); + } + + @Test + void parseSequenceThreeLines() { + String input = getSequence(121); + String result = FastaUtils.parseSequence(input); + String expect = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n" + + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n" + + "A"; + assertEquals(expect, result); + } + + private String getSequence(int length){ + String s = "A"; + return s.repeat(length); + } +} \ No newline at end of file diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/UniParcFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/UniParcFastaParserTest.java deleted file mode 100644 index cfbdcbcf7..000000000 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/UniParcFastaParserTest.java +++ /dev/null @@ -1,116 +0,0 @@ -package org.uniprot.core.parser.fasta; - -import static org.junit.jupiter.api.Assertions.*; - -import java.time.LocalDate; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.junit.jupiter.api.Test; -import org.uniprot.core.Location; -import org.uniprot.core.Property; -import org.uniprot.core.Sequence; -import org.uniprot.core.impl.SequenceBuilder; -import org.uniprot.core.uniparc.*; -import org.uniprot.core.uniparc.UniParcCrossReference; -import org.uniprot.core.uniparc.impl.InterProGroupBuilder; -import org.uniprot.core.uniparc.impl.SequenceFeatureBuilder; -import org.uniprot.core.uniparc.impl.UniParcCrossReferenceBuilder; -import org.uniprot.core.uniparc.impl.UniParcEntryBuilder; -import org.uniprot.core.uniparc.impl.UniParcIdBuilder; -import org.uniprot.core.uniprotkb.taxonomy.Organism; -import org.uniprot.core.uniprotkb.taxonomy.impl.OrganismBuilder; - -/** - * @author jluo - * @date: 24 Jun 2019 - */ -class UniParcFastaParserTest { - - @Test - void testToFasta() { - UniParcEntry entry = create(); - String fasta = UniParcFastaParser.toFasta(entry); - System.out.println(fasta); - String expected = - ">UPI0000083A08 status=active\n" - + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" - + "LLRAIDWFRDNGYFNA"; - assertEquals(expected, fasta); - } - - private UniParcEntry create() { - String seq = - "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT" + "LLRAIDWFRDNGYFNA"; - Sequence sequence = new SequenceBuilder(seq).build(); - List xrefs = getXrefs(); - List seqFeatures = getSeqFeatures(); - UniParcEntry entry = - new UniParcEntryBuilder() - .uniParcId(new UniParcIdBuilder("UPI0000083A08").build()) - .uniParcCrossReferencesSet(xrefs) - .sequence(sequence) - .sequenceFeaturesSet(seqFeatures) - .build(); - return entry; - } - - private List getSeqFeatures() { - List locations = Arrays.asList(new Location(12, 23), new Location(45, 89)); - InterProGroup domain = new InterProGroupBuilder().name("name1").id("id1").build(); - SequenceFeature sf = - new SequenceFeatureBuilder() - .interproGroup(domain) - .signatureDbType(SignatureDbType.PFAM) - .signatureDbId("sigId2") - .locationsSet(locations) - .build(); - SequenceFeature sf3 = - SequenceFeatureBuilder.from(sf).signatureDbType(SignatureDbType.PROSITE).build(); - return Arrays.asList(sf, sf3); - } - - private List getXrefs() { - Organism taxonomy = - new OrganismBuilder().taxonId(9606).scientificName("Homo sapiens").build(); - List properties = new ArrayList<>(); - properties.add(new Property("prop1", "pvalue")); - UniParcCrossReference xref = - new UniParcCrossReferenceBuilder() - .versionI(3) - .database(UniParcDatabase.SWISSPROT) - .id("P12345") - .version(7) - .active(true) - .created(LocalDate.of(2017, 5, 17)) - .lastUpdated(LocalDate.of(2017, 2, 27)) - .propertiesSet(properties) - .organism(taxonomy) - .proteinName("some pname") - .geneName("some gname") - .build(); - - List properties2 = new ArrayList<>(); - properties.add(new Property("prop2", "pvalue2")); - Organism taxonomy2 = new OrganismBuilder().taxonId(10090).scientificName("MOUSE").build(); - - UniParcCrossReference xref2 = - new UniParcCrossReferenceBuilder() - .versionI(1) - .database(UniParcDatabase.TREMBL) - .id("P52346") - .version(7) - .active(true) - .created(LocalDate.of(2017, 2, 12)) - .lastUpdated(LocalDate.of(2017, 4, 23)) - .propertiesSet(properties2) - .organism(taxonomy2) - .proteinName("some pname") - .proteomeId("UP00000564") - .component("chromosome 1") - .build(); - - return Arrays.asList(xref, xref2); - } -} diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java new file mode 100644 index 000000000..849ec85fa --- /dev/null +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java @@ -0,0 +1,44 @@ +package org.uniprot.core.parser.fasta.uniparc; + +import static org.junit.jupiter.api.Assertions.*; +import static org.uniprot.core.parser.fasta.uniparc.UniParcFastaParserTestUtils.*; + +import org.junit.jupiter.api.Test; +import org.uniprot.core.uniparc.*; +import org.uniprot.core.uniparc.impl.UniParcEntryBuilder; +import org.uniprot.core.uniparc.impl.UniParcIdBuilder; + +/** + * @author jluo + * @date: 24 Jun 2019 + */ +class UniParcFastaParserTest { + + @Test + void testToFastaActive() { + UniParcEntry entry = create(); + String fasta = UniParcFastaParser.toFasta(entry); + String expected = + ">UPI0000083A08 status=active\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } + + @Test + void testToFastaInactive() { + UniParcEntry entry = + new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A08").build()) + .uniParcCrossReferencesSet(getUniProtXrefs(false)) + .sequence(getSequence()) + .build(); + String fasta = UniParcFastaParser.toFasta(entry); + String expected = + ">UPI0000083A08 status=inactive\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } + +} diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java new file mode 100644 index 000000000..c4128734f --- /dev/null +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java @@ -0,0 +1,59 @@ +package org.uniprot.core.parser.fasta.uniparc; + +import org.uniprot.core.Sequence; +import org.uniprot.core.impl.SequenceBuilder; +import org.uniprot.core.uniparc.*; +import org.uniprot.core.uniparc.impl.*; +import org.uniprot.core.uniprotkb.taxonomy.Organism; +import org.uniprot.core.uniprotkb.taxonomy.impl.OrganismBuilder; + +import java.util.ArrayList; +import java.util.List; + +public class UniParcFastaParserTestUtils { + + static UniParcEntry create() { + List xrefs = getUniProtXrefs(true); + Organism organism = getOrganism(9606L, "Homo Sapiens"); + xrefs.add(getXref(UniParcDatabase.EMBL,"CQR81549", true, organism, "UP000005640", "Chromosome 1")); + return new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A08").build()) + .uniParcCrossReferencesSet(xrefs) + .sequence(getSequence()) + .build(); + } + + static Sequence getSequence() { + String seq = "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT" + + "LLRAIDWFRDNGYFNA"; + return new SequenceBuilder(seq).build(); + } + + static List getUniProtXrefs(boolean active) { + List result = new ArrayList<>(); + Organism human = getOrganism(9606, "Homo sapiens"); + + result.add(getXref(UniParcDatabase.SWISSPROT, "P12345", active, human)); + + return result; + } + + static Organism getOrganism(long taxonId, String name) { + return new OrganismBuilder().taxonId(taxonId).scientificName(name).build(); + } + + static UniParcCrossReference getXref(UniParcDatabase database, String id, boolean active, Organism organism) { + return getXref(database, id, active, organism, null, null); + } + + static UniParcCrossReference getXref(UniParcDatabase database, String id, boolean active, Organism organism, String proteomeId, String component) { + return new UniParcCrossReferenceBuilder() + .database(database) + .id(id) + .active(active) + .organism(organism) + .proteomeId(proteomeId) + .component(component) + .build(); + } +} diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java new file mode 100644 index 000000000..745547e41 --- /dev/null +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java @@ -0,0 +1,146 @@ +package org.uniprot.core.parser.fasta.uniparc; + +import org.junit.jupiter.api.Test; +import org.uniprot.core.uniparc.UniParcCrossReference; +import org.uniprot.core.uniparc.UniParcDatabase; +import org.uniprot.core.uniparc.UniParcEntry; +import org.uniprot.core.uniparc.impl.UniParcEntryBuilder; +import org.uniprot.core.uniparc.impl.UniParcIdBuilder; +import org.uniprot.core.uniprotkb.taxonomy.Organism; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; +import static org.uniprot.core.parser.fasta.uniparc.UniParcFastaParserTestUtils.*; +import static org.uniprot.core.parser.fasta.uniparc.UniParcFastaParserTestUtils.create; + +class UniParcProteomeFastaParserTest { + + @Test + void toFastaFullSingleValues() { + UniParcEntry entry = create(); + String fasta = UniParcProteomeFastaParser.toFasta(entry, "UP000005640"); + String expected = + ">UPI0000083A08|EMBL:CQR81549 UP=UP000005640:Chromosome 1 OX=9606 OS=Homo Sapiens AC=P12345\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } + + @Test + void toFastaFullMultiValues() { + String proteomeId = "UP000005640"; + Organism organism = getOrganism(9606L, "Homo Sapiens"); + List xrefs = new ArrayList<>(); + xrefs.add(getXref(UniParcDatabase.SWISSPROT, "P21802", true, organism)); + xrefs.add(getXref(UniParcDatabase.TREMBL, "P12345", true, organism)); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, organism, proteomeId, "C1")); + xrefs.add(getXref(UniParcDatabase.EMBL_TPA, "XP54321", true, organism, proteomeId, "C2")); + UniParcEntry entry = new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) + .uniParcCrossReferencesSet(xrefs) + .sequence(getSequence()) + .build(); + String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String expected = + ">UPI0000083A09|EMBL_CON:XP12345|EMBL_TPA:XP54321 UP=UP000005640:C1|C2 OX=9606 OS=Homo Sapiens AC=P21802|P12345\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } + + @Test + void toFastaWithoutAccessions() { + String proteomeId = "UP000005640"; + Organism organism = getOrganism(9606L, "Homo Sapiens"); + List xrefs = new ArrayList<>(); + xrefs.add(getXref(UniParcDatabase.SWISSPROT_VARSPLIC, "P21802-1", true, organism)); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, organism, proteomeId, "C1")); + UniParcEntry entry = new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) + .uniParcCrossReferencesSet(xrefs) + .sequence(getSequence()) + .build(); + String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String expected = + ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C1 OX=9606 OS=Homo Sapiens\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } + + @Test + void toFastaWithoutComponent() { + String proteomeId = "UP000005640"; + Organism organism = getOrganism(9606L, "Homo Sapiens"); + List xrefs = new ArrayList<>(); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, organism, proteomeId, null)); + UniParcEntry entry = new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) + .uniParcCrossReferencesSet(xrefs) + .sequence(getSequence()) + .build(); + String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String expected = + ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640 OX=9606 OS=Homo Sapiens\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } + + @Test + void toFastaWithoutOrganism() { + String proteomeId = "UP000005640"; + List xrefs = new ArrayList<>(); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, null, proteomeId, "C8")); + UniParcEntry entry = new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) + .uniParcCrossReferencesSet(xrefs) + .sequence(getSequence()) + .build(); + String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String expected = + ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C8\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } + + @Test + void toFastaWithoutSource() { + String proteomeId = "UP000005640"; + List xrefs = new ArrayList<>(); + xrefs.add(getXref(UniParcDatabase.PDB, "PDB12345", true, null, proteomeId, "C9")); + UniParcEntry entry = new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) + .uniParcCrossReferencesSet(xrefs) + .sequence(getSequence()) + .build(); + String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String expected = + ">UPI0000083A09 UP=UP000005640:C9\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } + + @Test + void toFastaFilterInactiveSources() { + String proteomeId = "UP000005640"; + List xrefs = new ArrayList<>(); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, null, proteomeId, "C5")); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP54321", false, null, proteomeId, "C3")); + UniParcEntry entry = new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) + .uniParcCrossReferencesSet(xrefs) + .sequence(getSequence()) + .build(); + String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String expected = + ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C5\n" + + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + + "LLRAIDWFRDNGYFNA"; + assertEquals(expected, fasta); + } +} \ No newline at end of file From 805dd6234021e324fc9b42af26a5fbe2be771138 Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Thu, 9 May 2024 09:35:11 +0100 Subject: [PATCH 02/13] Updated second Version of FASTA with sequence sources --- .../uniparc/UniParcProteomeFastaParser.java | 56 ++++++++++++------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index 77ebef0e8..f7cf129e5 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -6,10 +6,7 @@ import org.uniprot.core.uniprotkb.taxonomy.Organism; import org.uniprot.core.util.Utils; -import java.util.HashSet; -import java.util.List; -import java.util.Objects; -import java.util.Set; +import java.util.*; import static org.uniprot.core.parser.fasta.FastaUtils.parseSequence; @@ -20,34 +17,53 @@ public class UniParcProteomeFastaParser { public static String toFasta(UniParcEntry entry, String proteomeID) { String id = entry.getUniParcId().getValue(); - Set sourceIds = new HashSet<>(); - Set organisms = new HashSet<>(); - Set component = new HashSet<>(); - entry.getUniParcCrossReferences().stream() + Organism organism = entry.getUniParcCrossReferences().stream() .filter(UniParcCrossReference::isActive) .filter(xref -> Objects.nonNull(xref.getProteomeId())) .filter(xref -> xref.getProteomeId().equals(proteomeID)) + .filter(xref -> Utils.notNull(xref.getOrganism())) + .findFirst() + .map(UniParcCrossReference::getOrganism) + .orElse(null); + + List proteinName = new ArrayList<>(); + List geneNames = new ArrayList<>(); + List accessions = new ArrayList<>(); + + entry.getUniParcCrossReferences().stream() + .filter(UniParcCrossReference::isActive) + .filter(xref -> filterOrganism(xref, organism)) + .filter(xref -> uniProtDatabases.contains(xref.getDatabase())) .forEach(xref -> { - if(xref.hasDatabase() && xref.getDatabase().isSource()){ - sourceIds.add(xref.getDatabase().getName() + ":" + xref.getId()); + if(Utils.notNullNotEmpty(xref.getId())){ + accessions.add(xref.getId()); } - if(Utils.notNull(xref.getOrganism())){ - organisms.add(xref.getOrganism()); + if(Utils.notNullNotEmpty(xref.getProteinName())){ + proteinName.add(xref.getProteinName()); } - if(Utils.notNullNotEmpty(xref.getComponent())){ - component.add(xref.getComponent()); + if(Utils.notNullNotEmpty(xref.getGeneName())){ + geneNames.add(xref.getGeneName()); } }); + StringBuilder sb = new StringBuilder(); sb.append(">").append(id); - if(!sourceIds.isEmpty()){ - sb.append("|").append(String.join("|", sourceIds)); + if(!accessions.isEmpty()){ + sb.append("|").append(String.join("|", accessions)); } - sb.append(" UP=").append(proteomeID); - if(!component.isEmpty()){ - sb.append(":").append(String.join("|", component)); + if(!proteinName.isEmpty()){ + sb.append(" ").append(String.join("|", proteinName)); + } + if (Utils.notNull(organism)) { + if (organism.hasScientificName()) { + sb.append(" OS=").append(organism.getScientificName()); + } + sb.append(" OX=").append(organism.getTaxonId()); + } + + if(!geneNames.isEmpty()){ + sb.append(" GN=").append(String.join("|", geneNames)); } - sb.append(parseOrganismAndAccession(entry.getUniParcCrossReferences(), organisms)); sb.append("\n"); sb.append(parseSequence(entry.getSequence().getValue())); return sb.toString(); From 5d93997bd198940dde724a43482675708b6e2d61 Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Tue, 14 May 2024 16:55:52 +0100 Subject: [PATCH 03/13] Add exploratory options source ids and proteome and proteome component data --- .../uniparc/UniParcProteomeFastaParser.java | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index f7cf129e5..cbe15bbc7 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -29,6 +29,8 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { List proteinName = new ArrayList<>(); List geneNames = new ArrayList<>(); List accessions = new ArrayList<>(); + Set sourceIds = new HashSet<>(); + Set component = new HashSet<>(); entry.getUniParcCrossReferences().stream() .filter(UniParcCrossReference::isActive) @@ -44,13 +46,18 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { if(Utils.notNullNotEmpty(xref.getGeneName())){ geneNames.add(xref.getGeneName()); } + if(proteomeID.equals(xref.getProteomeId())) { + if (xref.hasDatabase() && xref.getDatabase().isSource()) { + sourceIds.add(xref.getDatabase().getName() + ":" + xref.getId()); + } + if (Utils.notNullNotEmpty(xref.getComponent())) { + component.add(xref.getComponent()); + } + } }); StringBuilder sb = new StringBuilder(); sb.append(">").append(id); - if(!accessions.isEmpty()){ - sb.append("|").append(String.join("|", accessions)); - } if(!proteinName.isEmpty()){ sb.append(" ").append(String.join("|", proteinName)); } @@ -64,6 +71,17 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { if(!geneNames.isEmpty()){ sb.append(" GN=").append(String.join("|", geneNames)); } + if(!accessions.isEmpty()){ + sb.append(" AC=").append(String.join("|", accessions)); + } + if(!sourceIds.isEmpty()){ + sb.append(" SS=").append(String.join("|", sourceIds)); + } + sb.append(" UP=").append(proteomeID); + if(!component.isEmpty()){ + sb.append(" UP=").append(proteomeID); + sb.append(":").append(String.join("|", component)); + } sb.append("\n"); sb.append(parseSequence(entry.getSequence().getValue())); return sb.toString(); From 5f1113886be871b7d4c2014663ec9fc358752712 Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Wed, 15 May 2024 11:25:22 +0100 Subject: [PATCH 04/13] Fix exploratory options source ids and proteome and proteome component data --- .../uniparc/UniParcProteomeFastaParser.java | 21 ++++++++++--------- .../uniparc/UniParcFastaParserTestUtils.java | 8 ++++++- .../UniParcProteomeFastaParserTest.java | 4 ++-- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index cbe15bbc7..bbccc880b 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -35,16 +35,17 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { entry.getUniParcCrossReferences().stream() .filter(UniParcCrossReference::isActive) .filter(xref -> filterOrganism(xref, organism)) - .filter(xref -> uniProtDatabases.contains(xref.getDatabase())) .forEach(xref -> { - if(Utils.notNullNotEmpty(xref.getId())){ - accessions.add(xref.getId()); - } - if(Utils.notNullNotEmpty(xref.getProteinName())){ - proteinName.add(xref.getProteinName()); - } - if(Utils.notNullNotEmpty(xref.getGeneName())){ - geneNames.add(xref.getGeneName()); + if(uniProtDatabases.contains(xref.getDatabase())) { + if (Utils.notNullNotEmpty(xref.getId())) { + accessions.add(xref.getId()); + } + if (Utils.notNullNotEmpty(xref.getProteinName())) { + proteinName.add(xref.getProteinName()); + } + if (Utils.notNullNotEmpty(xref.getGeneName())) { + geneNames.add(xref.getGeneName()); + } } if(proteomeID.equals(xref.getProteomeId())) { if (xref.hasDatabase() && xref.getDatabase().isSource()) { @@ -79,9 +80,9 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { } sb.append(" UP=").append(proteomeID); if(!component.isEmpty()){ - sb.append(" UP=").append(proteomeID); sb.append(":").append(String.join("|", component)); } + sb.append("\n"); sb.append(parseSequence(entry.getSequence().getValue())); return sb.toString(); diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java index c4128734f..32ad2428d 100644 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java @@ -33,7 +33,7 @@ static List getUniProtXrefs(boolean active) { List result = new ArrayList<>(); Organism human = getOrganism(9606, "Homo sapiens"); - result.add(getXref(UniParcDatabase.SWISSPROT, "P12345", active, human)); + result.add(getXref(UniParcDatabase.SWISSPROT, "P12345", active, human, null, null, "Protein Name One", "Gene Name One")); return result; } @@ -47,11 +47,17 @@ static UniParcCrossReference getXref(UniParcDatabase database, String id, boolea } static UniParcCrossReference getXref(UniParcDatabase database, String id, boolean active, Organism organism, String proteomeId, String component) { + return getXref(database,id, active, organism, proteomeId, component, null, null); + } + + static UniParcCrossReference getXref(UniParcDatabase database, String id, boolean active, Organism organism, String proteomeId, String component, String proteinName, String geneName) { return new UniParcCrossReferenceBuilder() .database(database) .id(id) .active(active) .organism(organism) + .proteinName(proteinName) + .geneName(geneName) .proteomeId(proteomeId) .component(component) .build(); diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java index 745547e41..102e87cba 100644 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java @@ -33,8 +33,8 @@ void toFastaFullMultiValues() { String proteomeId = "UP000005640"; Organism organism = getOrganism(9606L, "Homo Sapiens"); List xrefs = new ArrayList<>(); - xrefs.add(getXref(UniParcDatabase.SWISSPROT, "P21802", true, organism)); - xrefs.add(getXref(UniParcDatabase.TREMBL, "P12345", true, organism)); + xrefs.add(getXref(UniParcDatabase.SWISSPROT, "P21802", true, organism, null, null, "Protein Name 1", "Gene Name1")); + xrefs.add(getXref(UniParcDatabase.TREMBL, "P12345", true, organism, null, null, "Protein Name 2", "Gene Name2")); xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, organism, proteomeId, "C1")); xrefs.add(getXref(UniParcDatabase.EMBL_TPA, "XP54321", true, organism, proteomeId, "C2")); UniParcEntry entry = new UniParcEntryBuilder() From 01e806cba6fdf63ded6d432b416e190eceacccad Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Wed, 15 May 2024 13:43:15 +0100 Subject: [PATCH 05/13] add SWISSPROT_VARSPLIC database --- .../core/parser/fasta/uniparc/UniParcProteomeFastaParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index bbccc880b..1ed5cae8a 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -13,7 +13,7 @@ public class UniParcProteomeFastaParser { private static final Set uniProtDatabases = Set.of( - UniParcDatabase.SWISSPROT, UniParcDatabase.TREMBL); + UniParcDatabase.SWISSPROT, UniParcDatabase.TREMBL, UniParcDatabase.SWISSPROT_VARSPLIC); public static String toFasta(UniParcEntry entry, String proteomeID) { String id = entry.getUniParcId().getValue(); From 2cb464893ffdd3de3f6be19454c0f6c4f88fe146 Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Thu, 13 Jun 2024 16:00:11 +0100 Subject: [PATCH 06/13] Add stream to uniparc proteome endpoint --- .../core/parser/fasta/uniparc/UniParcProteomeFastaParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index 1ed5cae8a..0562ea731 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -33,8 +33,8 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { Set component = new HashSet<>(); entry.getUniParcCrossReferences().stream() - .filter(UniParcCrossReference::isActive) .filter(xref -> filterOrganism(xref, organism)) + .sorted(Comparator.comparing(UniParcCrossReference::isActive,Comparator.reverseOrder())) .forEach(xref -> { if(uniProtDatabases.contains(xref.getDatabase())) { if (Utils.notNullNotEmpty(xref.getId())) { From fc381ebc1621e57e93c976743543c8e7d625d169 Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Tue, 27 Aug 2024 14:35:24 +0100 Subject: [PATCH 07/13] Merge changes --- .../core/parser/fasta/uniparc/UniParcFastaParserTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java index 849ec85fa..dd48848fc 100644 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java @@ -4,6 +4,7 @@ import static org.uniprot.core.parser.fasta.uniparc.UniParcFastaParserTestUtils.*; import org.junit.jupiter.api.Test; +import org.uniprot.core.parser.fasta.UniParcFastaParser; import org.uniprot.core.uniparc.*; import org.uniprot.core.uniparc.impl.UniParcEntryBuilder; import org.uniprot.core.uniparc.impl.UniParcIdBuilder; From 149a8cfda7105b623364c37bf2c65108eee4a50f Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Tue, 3 Sep 2024 17:18:44 +0100 Subject: [PATCH 08/13] add sequence source for proteome fasta --- .../uniparc/UniParcProteomeFastaParser.java | 79 +++++++------------ .../UniParcDBCrossReferenceConverter.java | 4 + 2 files changed, 32 insertions(+), 51 deletions(-) diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index 0562ea731..15c695823 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -1,5 +1,6 @@ package org.uniprot.core.parser.fasta.uniparc; +import org.uniprot.core.Property; import org.uniprot.core.uniparc.UniParcCrossReference; import org.uniprot.core.uniparc.UniParcDatabase; import org.uniprot.core.uniparc.UniParcEntry; @@ -7,6 +8,7 @@ import org.uniprot.core.util.Utils; import java.util.*; +import java.util.concurrent.atomic.AtomicReference; import static org.uniprot.core.parser.fasta.FastaUtils.parseSequence; @@ -17,23 +19,17 @@ public class UniParcProteomeFastaParser { public static String toFasta(UniParcEntry entry, String proteomeID) { String id = entry.getUniParcId().getValue(); - Organism organism = entry.getUniParcCrossReferences().stream() - .filter(UniParcCrossReference::isActive) - .filter(xref -> Objects.nonNull(xref.getProteomeId())) - .filter(xref -> xref.getProteomeId().equals(proteomeID)) - .filter(xref -> Utils.notNull(xref.getOrganism())) - .findFirst() - .map(UniParcCrossReference::getOrganism) - .orElse(null); List proteinName = new ArrayList<>(); List geneNames = new ArrayList<>(); List accessions = new ArrayList<>(); Set sourceIds = new HashSet<>(); Set component = new HashSet<>(); + AtomicReference organism = new AtomicReference<>(); + AtomicReference proteomeIdValue = new AtomicReference<>(); entry.getUniParcCrossReferences().stream() - .filter(xref -> filterOrganism(xref, organism)) + .filter(xref -> uniProtDatabases.contains(xref.getDatabase())) .sorted(Comparator.comparing(UniParcCrossReference::isActive,Comparator.reverseOrder())) .forEach(xref -> { if(uniProtDatabases.contains(xref.getDatabase())) { @@ -46,13 +42,25 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { if (Utils.notNullNotEmpty(xref.getGeneName())) { geneNames.add(xref.getGeneName()); } - } - if(proteomeID.equals(xref.getProteomeId())) { - if (xref.hasDatabase() && xref.getDatabase().isSource()) { - sourceIds.add(xref.getDatabase().getName() + ":" + xref.getId()); + if (Utils.notNull(xref.getOrganism())) { + organism.set(xref.getOrganism()); } - if (Utils.notNullNotEmpty(xref.getComponent())) { - component.add(xref.getComponent()); + if (Utils.notNullNotEmpty(xref.getProperties())) { + xref.getProperties().stream() + .filter(p -> "source".equals(p.getKey())) + .map(Property::getValue) + .forEach(source -> { + String[] sources = source.split(":"); + if(sources.length > 0){ + sourceIds.add(sources[0]); + } + if(sources.length > 1){ + proteomeIdValue.set(sources[1]); + } + if(sources.length > 2){ + component.add(sources[2]); + } + }); } } }); @@ -62,11 +70,11 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { if(!proteinName.isEmpty()){ sb.append(" ").append(String.join("|", proteinName)); } - if (Utils.notNull(organism)) { - if (organism.hasScientificName()) { - sb.append(" OS=").append(organism.getScientificName()); + if (Utils.notNull(organism.get())) { + if (organism.get().hasScientificName()) { + sb.append(" OS=").append(organism.get().getScientificName()); } - sb.append(" OX=").append(organism.getTaxonId()); + sb.append(" OX=").append(organism.get().getTaxonId()); } if(!geneNames.isEmpty()){ @@ -78,7 +86,7 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { if(!sourceIds.isEmpty()){ sb.append(" SS=").append(String.join("|", sourceIds)); } - sb.append(" UP=").append(proteomeID); + sb.append(" UP=").append(proteomeIdValue.get()); if(!component.isEmpty()){ sb.append(":").append(String.join("|", component)); } @@ -87,35 +95,4 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { sb.append(parseSequence(entry.getSequence().getValue())); return sb.toString(); } - - private static String parseOrganismAndAccession(List xrefs, Set organisms) { - StringBuilder sb = new StringBuilder(); - if(!organisms.isEmpty()) { - Organism organism = organisms.stream().findFirst().get(); - if (organism.getTaxonId() > 0L) { - sb.append(" OX=").append(organism.getTaxonId()); - } - if (organism.hasScientificName()) { - sb.append(" OS=").append(organism.getScientificName()); - } - if (organism.getTaxonId() > 0L) { - Set accessions = new HashSet<>(); - xrefs.stream() - .filter(xref -> filterOrganism(xref, organism)) - .filter(xref -> uniProtDatabases.contains(xref.getDatabase())) - .map(UniParcCrossReference::getId) - .filter(Objects::nonNull) - .forEach(accessions::add); - if(!accessions.isEmpty()) { - sb.append(" AC=").append(String.join("|", accessions)); - } - } - } - return sb.toString(); - } - - private static boolean filterOrganism(UniParcCrossReference xref, Organism organism) { - return xref.getOrganism() != null && - xref.getOrganism().getTaxonId() == organism.getTaxonId(); - } } diff --git a/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java b/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java index 9a71713be..5a26d2177 100644 --- a/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java +++ b/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java @@ -36,6 +36,7 @@ public class UniParcDBCrossReferenceConverter public static final String PROPERTY_COMPONENT = "component"; public static final String PROPERTY_NCBI_TAXONOMY_ID = "NCBI_taxonomy_id"; public static final String PROPERTY_UNIPROTKB_ACCESSION = "UniProtKB_accession"; + public static final String PROPERTY_SOURCE = "source"; private final ObjectFactory xmlFactory; private final TaxonomyRepo taxonomyRepo; @@ -85,6 +86,9 @@ public UniParcCrossReference fromXml(DbReferenceType xmlObj) { case PROPERTY_UNIPROTKB_ACCESSION: builder.propertiesAdd(PROPERTY_UNIPROTKB_ACCESSION, property.getValue()); break; + case PROPERTY_SOURCE: + builder.propertiesAdd(PROPERTY_SOURCE, property.getValue()); + break; default: throw new XmlReaderException( "Unable to read xml property: " From 335954df1128f66aeaebffeee214b2663687510a Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Tue, 1 Oct 2024 11:25:32 +0100 Subject: [PATCH 09/13] Improvements in the fasta Format parser and sources attribute --- .../core/uniparc/UniParcCrossReference.java | 2 + .../uniparc/UniParcProteomeFastaParser.java | 123 ++++++++++-------- .../UniParcProteomeFastaParserTest.java | 14 +- 3 files changed, 81 insertions(+), 58 deletions(-) diff --git a/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcCrossReference.java b/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcCrossReference.java index 0871d624f..783081643 100644 --- a/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcCrossReference.java +++ b/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcCrossReference.java @@ -11,6 +11,8 @@ */ public interface UniParcCrossReference extends CrossReference { + public static final String PROPERTY_SOURCES = "sources"; + int getVersionI(); Integer getVersion(); diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index 15c695823..25025789d 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -11,70 +11,94 @@ import java.util.concurrent.atomic.AtomicReference; import static org.uniprot.core.parser.fasta.FastaUtils.parseSequence; +import static org.uniprot.core.util.Utils.*; public class UniParcProteomeFastaParser { private static final Set uniProtDatabases = Set.of( UniParcDatabase.SWISSPROT, UniParcDatabase.TREMBL, UniParcDatabase.SWISSPROT_VARSPLIC); - public static String toFasta(UniParcEntry entry, String proteomeID) { + public static String toFasta(UniParcEntry entry) { String id = entry.getUniParcId().getValue(); - List proteinName = new ArrayList<>(); - List geneNames = new ArrayList<>(); - List accessions = new ArrayList<>(); - Set sourceIds = new HashSet<>(); - Set component = new HashSet<>(); - AtomicReference organism = new AtomicReference<>(); - AtomicReference proteomeIdValue = new AtomicReference<>(); + List uniProtXrefs = new ArrayList<>(); + List sourceXrefs = new ArrayList<>(); + boolean hasActive = false; + boolean hasSourceActive = false; + String proteomeId = null; + for(UniParcCrossReference xref: entry.getUniParcCrossReferences()){ + if(uniProtDatabases.contains(xref.getDatabase())){ + uniProtXrefs.add(xref); + if(xref.isActive()){ + hasActive = true; + } + } else { + sourceXrefs.add(xref); + proteomeId = xref.getProteomeId(); + if(xref.isActive()){ + hasSourceActive = true; + } + } + } + StringBuilder sb = new StringBuilder(); + if(!uniProtXrefs.isEmpty()){ + sb.append(getFastaHeader(uniProtXrefs, hasActive, id, proteomeId, false)); + } else { + sb.append(getFastaHeader(sourceXrefs, hasSourceActive, id, proteomeId, true)); + } + sb.append("\n"); + sb.append(parseSequence(entry.getSequence().getValue())); + return sb.toString(); + } - entry.getUniParcCrossReferences().stream() - .filter(xref -> uniProtDatabases.contains(xref.getDatabase())) - .sorted(Comparator.comparing(UniParcCrossReference::isActive,Comparator.reverseOrder())) - .forEach(xref -> { - if(uniProtDatabases.contains(xref.getDatabase())) { - if (Utils.notNullNotEmpty(xref.getId())) { - accessions.add(xref.getId()); - } - if (Utils.notNullNotEmpty(xref.getProteinName())) { - proteinName.add(xref.getProteinName()); - } - if (Utils.notNullNotEmpty(xref.getGeneName())) { - geneNames.add(xref.getGeneName()); - } - if (Utils.notNull(xref.getOrganism())) { - organism.set(xref.getOrganism()); - } - if (Utils.notNullNotEmpty(xref.getProperties())) { - xref.getProperties().stream() - .filter(p -> "source".equals(p.getKey())) - .map(Property::getValue) - .forEach(source -> { - String[] sources = source.split(":"); - if(sources.length > 0){ - sourceIds.add(sources[0]); - } - if(sources.length > 1){ - proteomeIdValue.set(sources[1]); + private static StringBuilder getFastaHeader(List xrefs, boolean hasActive, String id, String proteomeId, boolean isSource) { + Set proteinName = new LinkedHashSet<>(); + Set geneNames = new LinkedHashSet<>(); + Set accessions = new LinkedHashSet<>(); + Set sourceIds = new LinkedHashSet<>(); + Set component = new LinkedHashSet<>(); + Organism organism = null; + for(UniParcCrossReference xref: xrefs) { + if (xref.isActive() == hasActive) { + addOrIgnoreNull(xref.getProteinName(), proteinName); + addOrIgnoreNull(xref.getGeneName(), geneNames); + organism = xref.getOrganism(); + + if (isSource) { + addOrIgnoreNull(xref.getId(), sourceIds); + addOrIgnoreNull(xref.getComponent(), component); + } else { + addOrIgnoreNull(xref.getId(), accessions); + if (notNullNotEmpty(xref.getProperties())) { + xref.getProperties().stream() + .filter(p -> UniParcCrossReference.PROPERTY_SOURCES.equals(p.getKey())) + .map(Property::getValue) + .forEach(value -> { + String[] sources = value.split(","); + for (String source : sources) { + String[] ids = source.split(":"); + if (ids.length > 1 && proteomeId.equals(ids[1])) { + sourceIds.add(ids[0]); } - if(sources.length > 2){ - component.add(sources[2]); + if (ids.length > 2 && proteomeId.equals(ids[1])) { + component.add(ids[2]); } - }); - } + } + }); } - }); - + } + } + } StringBuilder sb = new StringBuilder(); sb.append(">").append(id); if(!proteinName.isEmpty()){ sb.append(" ").append(String.join("|", proteinName)); } - if (Utils.notNull(organism.get())) { - if (organism.get().hasScientificName()) { - sb.append(" OS=").append(organism.get().getScientificName()); + if (notNull(organism)) { + if (organism.hasScientificName()) { + sb.append(" OS=").append(organism.getScientificName()); } - sb.append(" OX=").append(organism.get().getTaxonId()); + sb.append(" OX=").append(organism.getTaxonId()); } if(!geneNames.isEmpty()){ @@ -86,13 +110,10 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { if(!sourceIds.isEmpty()){ sb.append(" SS=").append(String.join("|", sourceIds)); } - sb.append(" UP=").append(proteomeIdValue.get()); + sb.append(" UP=").append(proteomeId); if(!component.isEmpty()){ sb.append(":").append(String.join("|", component)); } - - sb.append("\n"); - sb.append(parseSequence(entry.getSequence().getValue())); - return sb.toString(); + return sb; } } diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java index 102e87cba..b9a115d32 100644 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java @@ -20,7 +20,7 @@ class UniParcProteomeFastaParserTest { @Test void toFastaFullSingleValues() { UniParcEntry entry = create(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, "UP000005640"); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A08|EMBL:CQR81549 UP=UP000005640:Chromosome 1 OX=9606 OS=Homo Sapiens AC=P12345\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -42,7 +42,7 @@ void toFastaFullMultiValues() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345|EMBL_TPA:XP54321 UP=UP000005640:C1|C2 OX=9606 OS=Homo Sapiens AC=P21802|P12345\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -62,7 +62,7 @@ void toFastaWithoutAccessions() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C1 OX=9606 OS=Homo Sapiens\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -81,7 +81,7 @@ void toFastaWithoutComponent() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640 OX=9606 OS=Homo Sapiens\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -99,7 +99,7 @@ void toFastaWithoutOrganism() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C8\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -117,7 +117,7 @@ void toFastaWithoutSource() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09 UP=UP000005640:C9\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -136,7 +136,7 @@ void toFastaFilterInactiveSources() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C5\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + From a13cefc870b3fa9f62013b70c6b6489674154eee Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Thu, 28 Nov 2024 16:37:11 +0000 Subject: [PATCH 10/13] improve test in UniParcProteomeFastaParser.java --- ...java => UniParcEntryLightFastaParser.java} | 19 +-- .../uniparc/UniParcProteomeFastaParser.java | 2 +- .../parser/fasta/UniParcFastaParserTest.java | 141 ------------------ .../UniParcEntryLightFastaParserTest.java | 49 ++++++ .../fasta/uniparc/UniParcFastaParserTest.java | 45 ------ .../uniparc/UniParcFastaParserTestUtils.java | 12 +- .../UniParcProteomeFastaParserTest.java | 46 +++--- 7 files changed, 87 insertions(+), 227 deletions(-) rename core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/{UniParcFastaParser.java => UniParcEntryLightFastaParser.java} (65%) delete mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/UniParcFastaParserTest.java create mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParserTest.java delete mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParser.java similarity index 65% rename from core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java rename to core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParser.java index 898752dda..36d76b553 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParser.java @@ -1,8 +1,6 @@ -package org.uniprot.core.parser.fasta; +package org.uniprot.core.parser.fasta.uniparc; import org.uniprot.core.Sequence; -import org.uniprot.core.uniparc.UniParcCrossReference; -import org.uniprot.core.uniparc.UniParcEntry; import org.uniprot.core.uniparc.UniParcEntryLight; import static org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder.HAS_ACTIVE_CROSS_REF; @@ -11,19 +9,8 @@ * @author jluo * @date: 24 Jun 2019 */ -public class UniParcFastaParser { - private UniParcFastaParser(){} - - public static String toFasta(UniParcEntry entry) { - String status = "active"; - boolean isActive = entry.getUniParcCrossReferences() - .stream() - .anyMatch(UniParcCrossReference::isActive); - if (!isActive) { - status = "inactive"; - } - return getFastaString(entry.getUniParcId().getValue(), status, entry.getSequence()); - } +public class UniParcEntryLightFastaParser { + private UniParcEntryLightFastaParser(){} public static String toFasta(UniParcEntryLight entry) { String status = "active"; diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index 25025789d..d8863d473 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -110,7 +110,7 @@ private static StringBuilder getFastaHeader(List xrefs, b if(!sourceIds.isEmpty()){ sb.append(" SS=").append(String.join("|", sourceIds)); } - sb.append(" UP=").append(proteomeId); + sb.append(" PC=").append(proteomeId); if(!component.isEmpty()){ sb.append(":").append(String.join("|", component)); } diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/UniParcFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/UniParcFastaParserTest.java deleted file mode 100644 index 9b14a50f7..000000000 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/UniParcFastaParserTest.java +++ /dev/null @@ -1,141 +0,0 @@ -package org.uniprot.core.parser.fasta; - -import org.junit.jupiter.api.Test; -import org.uniprot.core.Property; -import org.uniprot.core.Sequence; -import org.uniprot.core.impl.SequenceBuilder; -import org.uniprot.core.uniparc.*; -import org.uniprot.core.uniparc.impl.*; -import org.uniprot.core.uniprotkb.taxonomy.Organism; -import org.uniprot.core.uniprotkb.taxonomy.impl.OrganismBuilder; - -import java.time.LocalDate; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder.HAS_ACTIVE_CROSS_REF; - -/** - * @author jluo - * @date: 24 Jun 2019 - */ -class UniParcFastaParserTest { - - public static final String EXPECTED_FASTA_RESULT = """ - >UPI0000083A08 status=active - MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT - LLRAIDWFRDNGYFNA"""; - public static final String EXPECTED_FASTA_RESULT_INACTIVE = """ - >UPI0000083A08 status=inactive - MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT - LLRAIDWFRDNGYFNA"""; - @Test - void testUniParcEntryToFasta() { - UniParcEntry entry = create(); - String fasta = UniParcFastaParser.toFasta(entry); - assertEquals(EXPECTED_FASTA_RESULT, fasta); - } - - @Test - void testUniParcEntryLightToFasta() { - UniParcEntryLight entry = createEntryLight(); - String fasta = UniParcFastaParser.toFasta(entry); - assertEquals(EXPECTED_FASTA_RESULT, fasta); - } - - @Test - void testUniParcEntryLightToFastaInactive() { - UniParcEntryLight entry = createEntryLight(); - entry = UniParcEntryLightBuilder.from(entry).extraAttributesAdd(HAS_ACTIVE_CROSS_REF, false).build(); - String fasta = UniParcFastaParser.toFasta(entry); - assertEquals(EXPECTED_FASTA_RESULT_INACTIVE, fasta); - } - - private UniParcEntry create() { - Sequence sequence = getSequence(); - List xrefs = getXrefs(); - List seqFeatures = getSeqFeatures(); - UniParcEntry entry = - new UniParcEntryBuilder() - .uniParcId(new UniParcIdBuilder("UPI0000083A08").build()) - .uniParcCrossReferencesSet(xrefs) - .sequence(sequence) - .sequenceFeaturesSet(seqFeatures) - .build(); - return entry; - } - - private UniParcEntryLight createEntryLight() { - return new UniParcEntryLightBuilder() - .uniParcId("UPI0000083A08") - .sequence(getSequence()) - .build(); - } - - private static Sequence getSequence() { - String seq = - "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT" + "LLRAIDWFRDNGYFNA"; - Sequence sequence = new SequenceBuilder(seq).build(); - return sequence; - } - - private List getSeqFeatures() { - List locations = Arrays.asList(new SequenceFeatureLocationBuilder().range(12, 23).alignment("55M").build(), new SequenceFeatureLocationBuilder().range(45, 89).build()); - InterProGroup domain = new InterProGroupBuilder().name("name1").id("id1").build(); - SequenceFeature sf = - new SequenceFeatureBuilder() - .interproGroup(domain) - .signatureDbType(SignatureDbType.PFAM) - .signatureDbId("sigId2") - .locationsSet(locations) - .build(); - SequenceFeature sf3 = - SequenceFeatureBuilder.from(sf).signatureDbType(SignatureDbType.PROSITE).build(); - return Arrays.asList(sf, sf3); - } - - private List getXrefs() { - Organism taxonomy = - new OrganismBuilder().taxonId(9606).scientificName("Homo sapiens").build(); - List properties = new ArrayList<>(); - properties.add(new Property("prop1", "pvalue")); - UniParcCrossReference xref = - new UniParcCrossReferenceBuilder() - .versionI(3) - .database(UniParcDatabase.SWISSPROT) - .id("P12345") - .version(7) - .active(true) - .created(LocalDate.of(2017, 5, 17)) - .lastUpdated(LocalDate.of(2017, 2, 27)) - .propertiesSet(properties) - .organism(taxonomy) - .proteinName("some pname") - .geneName("some gname") - .build(); - - List properties2 = new ArrayList<>(); - properties.add(new Property("prop2", "pvalue2")); - Organism taxonomy2 = new OrganismBuilder().taxonId(10090).scientificName("MOUSE").build(); - - UniParcCrossReference xref2 = - new UniParcCrossReferenceBuilder() - .versionI(1) - .database(UniParcDatabase.TREMBL) - .id("P52346") - .version(7) - .active(true) - .created(LocalDate.of(2017, 2, 12)) - .lastUpdated(LocalDate.of(2017, 4, 23)) - .propertiesSet(properties2) - .organism(taxonomy2) - .proteinName("some pname") - .proteomeId("UP00000564") - .component("chromosome 1") - .build(); - - return Arrays.asList(xref, xref2); - } -} diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParserTest.java new file mode 100644 index 000000000..0efdeb8a4 --- /dev/null +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParserTest.java @@ -0,0 +1,49 @@ +package org.uniprot.core.parser.fasta.uniparc; + +import static org.junit.jupiter.api.Assertions.*; +import static org.uniprot.core.parser.fasta.uniparc.UniParcFastaParserTestUtils.*; +import static org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder.HAS_ACTIVE_CROSS_REF; + +import org.junit.jupiter.api.Test; +import org.uniprot.core.uniparc.*; +import org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder; + +/** + * @author jluo + * @date: 24 Jun 2019 + */ +class UniParcEntryLightFastaParserTest { + + public static final String EXPECTED_FASTA_RESULT = """ + >UPI0000083A08 status=active + MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT + LLRAIDWFRDNGYFNA"""; + public static final String EXPECTED_FASTA_RESULT_INACTIVE = """ + >UPI0000083A08 status=inactive + MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT + LLRAIDWFRDNGYFNA"""; + + + @Test + void testUniParcEntryLightToFasta() { + UniParcEntryLight entry = createEntryLight(); + String fasta = UniParcEntryLightFastaParser.toFasta(entry); + assertEquals(EXPECTED_FASTA_RESULT, fasta); + } + + @Test + void testUniParcEntryLightToFastaInactive() { + UniParcEntryLight entry = createEntryLight(); + entry = UniParcEntryLightBuilder.from(entry).extraAttributesAdd(HAS_ACTIVE_CROSS_REF, false).build(); + String fasta = UniParcEntryLightFastaParser.toFasta(entry); + assertEquals(EXPECTED_FASTA_RESULT_INACTIVE, fasta); + } + + private UniParcEntryLight createEntryLight() { + return new UniParcEntryLightBuilder() + .uniParcId("UPI0000083A08") + .sequence(getSequence()) + .build(); + } + +} diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java deleted file mode 100644 index dd48848fc..000000000 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java +++ /dev/null @@ -1,45 +0,0 @@ -package org.uniprot.core.parser.fasta.uniparc; - -import static org.junit.jupiter.api.Assertions.*; -import static org.uniprot.core.parser.fasta.uniparc.UniParcFastaParserTestUtils.*; - -import org.junit.jupiter.api.Test; -import org.uniprot.core.parser.fasta.UniParcFastaParser; -import org.uniprot.core.uniparc.*; -import org.uniprot.core.uniparc.impl.UniParcEntryBuilder; -import org.uniprot.core.uniparc.impl.UniParcIdBuilder; - -/** - * @author jluo - * @date: 24 Jun 2019 - */ -class UniParcFastaParserTest { - - @Test - void testToFastaActive() { - UniParcEntry entry = create(); - String fasta = UniParcFastaParser.toFasta(entry); - String expected = - ">UPI0000083A08 status=active\n" - + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" - + "LLRAIDWFRDNGYFNA"; - assertEquals(expected, fasta); - } - - @Test - void testToFastaInactive() { - UniParcEntry entry = - new UniParcEntryBuilder() - .uniParcId(new UniParcIdBuilder("UPI0000083A08").build()) - .uniParcCrossReferencesSet(getUniProtXrefs(false)) - .sequence(getSequence()) - .build(); - String fasta = UniParcFastaParser.toFasta(entry); - String expected = - ">UPI0000083A08 status=inactive\n" - + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" - + "LLRAIDWFRDNGYFNA"; - assertEquals(expected, fasta); - } - -} diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java index 32ad2428d..fa410e84e 100644 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTestUtils.java @@ -1,5 +1,6 @@ package org.uniprot.core.parser.fasta.uniparc; +import org.uniprot.core.Property; import org.uniprot.core.Sequence; import org.uniprot.core.impl.SequenceBuilder; import org.uniprot.core.uniparc.*; @@ -33,7 +34,7 @@ static List getUniProtXrefs(boolean active) { List result = new ArrayList<>(); Organism human = getOrganism(9606, "Homo sapiens"); - result.add(getXref(UniParcDatabase.SWISSPROT, "P12345", active, human, null, null, "Protein Name One", "Gene Name One")); + result.add(getXref(UniParcDatabase.SWISSPROT, "P12345", active, human, null, null, "Protein Name One", "Gene Name One", new Property(UniParcCrossReference.PROPERTY_SOURCES, "ABC01415:UP000005640:Chromosome 1"))); return result; } @@ -47,10 +48,14 @@ static UniParcCrossReference getXref(UniParcDatabase database, String id, boolea } static UniParcCrossReference getXref(UniParcDatabase database, String id, boolean active, Organism organism, String proteomeId, String component) { - return getXref(database,id, active, organism, proteomeId, component, null, null); + return getXref(database,id, active, organism, proteomeId, component, null, null,null); } - static UniParcCrossReference getXref(UniParcDatabase database, String id, boolean active, Organism organism, String proteomeId, String component, String proteinName, String geneName) { + static UniParcCrossReference getXref(UniParcDatabase database, String id, boolean active, Organism organism, String geneName, String proteinName, Property property) { + return getXref(database,id, active, organism, null, null, proteinName, geneName, property); + } + + static UniParcCrossReference getXref(UniParcDatabase database, String id, boolean active, Organism organism, String proteomeId, String component, String proteinName, String geneName, Property property) { return new UniParcCrossReferenceBuilder() .database(database) .id(id) @@ -60,6 +65,7 @@ static UniParcCrossReference getXref(UniParcDatabase database, String id, boolea .geneName(geneName) .proteomeId(proteomeId) .component(component) + .propertiesAdd(property) .build(); } } diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java index b9a115d32..a51686d1e 100644 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java @@ -1,6 +1,7 @@ package org.uniprot.core.parser.fasta.uniparc; import org.junit.jupiter.api.Test; +import org.uniprot.core.Property; import org.uniprot.core.uniparc.UniParcCrossReference; import org.uniprot.core.uniparc.UniParcDatabase; import org.uniprot.core.uniparc.UniParcEntry; @@ -17,12 +18,16 @@ class UniParcProteomeFastaParserTest { + private static final Property source1 = new Property(UniParcCrossReference.PROPERTY_SOURCES, "ABC01416:UP000005640:Chromosome 1"); + private static final Property source2 = new Property(UniParcCrossReference.PROPERTY_SOURCES, "ABC01417:UP000005640:Chromosome 2"); + private static final Organism humanOrganism = getOrganism(9606L, "Homo Sapiens"); + @Test void toFastaFullSingleValues() { UniParcEntry entry = create(); String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = - ">UPI0000083A08|EMBL:CQR81549 UP=UP000005640:Chromosome 1 OX=9606 OS=Homo Sapiens AC=P12345\n" + + ">UPI0000083A08 Protein Name One OS=Homo sapiens OX=9606 GN=Gene Name One AC=P12345 SS=ABC01415 PC=UP000005640:Chromosome 1\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + "LLRAIDWFRDNGYFNA"; assertEquals(expected, fasta); @@ -33,10 +38,9 @@ void toFastaFullMultiValues() { String proteomeId = "UP000005640"; Organism organism = getOrganism(9606L, "Homo Sapiens"); List xrefs = new ArrayList<>(); - xrefs.add(getXref(UniParcDatabase.SWISSPROT, "P21802", true, organism, null, null, "Protein Name 1", "Gene Name1")); - xrefs.add(getXref(UniParcDatabase.TREMBL, "P12345", true, organism, null, null, "Protein Name 2", "Gene Name2")); + xrefs.add(getXref(UniParcDatabase.SWISSPROT, "P21802", true, organism, null, null, "Protein Name 1", "Gene Name1", source1 )); + xrefs.add(getXref(UniParcDatabase.TREMBL, "P12345", true, organism, null, null, "Protein Name 2", "Gene Name2", source2)); xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, organism, proteomeId, "C1")); - xrefs.add(getXref(UniParcDatabase.EMBL_TPA, "XP54321", true, organism, proteomeId, "C2")); UniParcEntry entry = new UniParcEntryBuilder() .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) .uniParcCrossReferencesSet(xrefs) @@ -44,18 +48,19 @@ void toFastaFullMultiValues() { .build(); String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = - ">UPI0000083A09|EMBL_CON:XP12345|EMBL_TPA:XP54321 UP=UP000005640:C1|C2 OX=9606 OS=Homo Sapiens AC=P21802|P12345\n" + + ">UPI0000083A09 Protein Name 1|Protein Name 2 OS=Homo Sapiens OX=9606 GN=Gene Name1|Gene Name2 AC=P21802|P12345 SS=ABC01416|ABC01417 PC=UP000005640:Chromosome 1|Chromosome 2\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + "LLRAIDWFRDNGYFNA"; assertEquals(expected, fasta); } @Test - void toFastaWithoutAccessions() { + void toFastaFullAccessionActiveOnly() { String proteomeId = "UP000005640"; Organism organism = getOrganism(9606L, "Homo Sapiens"); List xrefs = new ArrayList<>(); - xrefs.add(getXref(UniParcDatabase.SWISSPROT_VARSPLIC, "P21802-1", true, organism)); + xrefs.add(getXref(UniParcDatabase.SWISSPROT, "P21802", true, organism, null, null, "Protein Name 1", "Gene Name1", source1 )); + xrefs.add(getXref(UniParcDatabase.TREMBL, "P12345", false, organism, null, null, "Protein Name 2", "Gene Name2", source2)); xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, organism, proteomeId, "C1")); UniParcEntry entry = new UniParcEntryBuilder() .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) @@ -64,18 +69,17 @@ void toFastaWithoutAccessions() { .build(); String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = - ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C1 OX=9606 OS=Homo Sapiens\n" + + ">UPI0000083A09 Protein Name 1 OS=Homo Sapiens OX=9606 GN=Gene Name1 AC=P21802 SS=ABC01416 PC=UP000005640:Chromosome 1\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + "LLRAIDWFRDNGYFNA"; assertEquals(expected, fasta); } @Test - void toFastaWithoutComponent() { + void toFastaWithoutAccessions() { String proteomeId = "UP000005640"; - Organism organism = getOrganism(9606L, "Homo Sapiens"); List xrefs = new ArrayList<>(); - xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, organism, proteomeId, null)); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, humanOrganism, proteomeId, "C1")); UniParcEntry entry = new UniParcEntryBuilder() .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) .uniParcCrossReferencesSet(xrefs) @@ -83,17 +87,17 @@ void toFastaWithoutComponent() { .build(); String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = - ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640 OX=9606 OS=Homo Sapiens\n" + + ">UPI0000083A09 OS=Homo Sapiens OX=9606 SS=XP12345 PC=UP000005640:C1\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + "LLRAIDWFRDNGYFNA"; assertEquals(expected, fasta); } @Test - void toFastaWithoutOrganism() { + void toFastaWithoutComponent() { String proteomeId = "UP000005640"; List xrefs = new ArrayList<>(); - xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, null, proteomeId, "C8")); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, humanOrganism, proteomeId, null)); UniParcEntry entry = new UniParcEntryBuilder() .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) .uniParcCrossReferencesSet(xrefs) @@ -101,17 +105,17 @@ void toFastaWithoutOrganism() { .build(); String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = - ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C8\n" + + ">UPI0000083A09 OS=Homo Sapiens OX=9606 SS=XP12345 PC=UP000005640\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + "LLRAIDWFRDNGYFNA"; assertEquals(expected, fasta); } @Test - void toFastaWithoutSource() { + void toFastaWithoutOrganism() { String proteomeId = "UP000005640"; List xrefs = new ArrayList<>(); - xrefs.add(getXref(UniParcDatabase.PDB, "PDB12345", true, null, proteomeId, "C9")); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, null, proteomeId, "C8")); UniParcEntry entry = new UniParcEntryBuilder() .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) .uniParcCrossReferencesSet(xrefs) @@ -119,17 +123,17 @@ void toFastaWithoutSource() { .build(); String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = - ">UPI0000083A09 UP=UP000005640:C9\n" + + ">UPI0000083A09 SS=XP12345 PC=UP000005640:C8\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + "LLRAIDWFRDNGYFNA"; assertEquals(expected, fasta); } @Test - void toFastaFilterInactiveSources() { + void toFastaActiveOnlySources() { String proteomeId = "UP000005640"; List xrefs = new ArrayList<>(); - xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, null, proteomeId, "C5")); + xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP12345", true, humanOrganism, proteomeId, "C5")); xrefs.add(getXref(UniParcDatabase.EMBL_CON, "XP54321", false, null, proteomeId, "C3")); UniParcEntry entry = new UniParcEntryBuilder() .uniParcId(new UniParcIdBuilder("UPI0000083A09").build()) @@ -138,7 +142,7 @@ void toFastaFilterInactiveSources() { .build(); String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = - ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C5\n" + + ">UPI0000083A09 OS=Homo Sapiens OX=9606 SS=XP12345 PC=UP000005640:C5\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + "LLRAIDWFRDNGYFNA"; assertEquals(expected, fasta); From 606d6ef1a22065f45d37a38676348ba7d588c19e Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Tue, 3 Dec 2024 13:21:16 +0000 Subject: [PATCH 11/13] improve uniparc xref voldemort index --- ...staParser.java => UniParcFastaParser.java} | 18 ++- .../UniParcEntryLightFastaParserTest.java | 49 ------- .../fasta/uniparc/UniParcFastaParserTest.java | 123 ++++++++++++++++++ .../xml/CrossReferenceConverterUtils.java | 5 + .../UniParcDBCrossReferenceConverter.java | 8 -- 5 files changed, 144 insertions(+), 59 deletions(-) rename core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/{UniParcEntryLightFastaParser.java => UniParcFastaParser.java} (68%) delete mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParserTest.java create mode 100644 core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java similarity index 68% rename from core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParser.java rename to core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java index 36d76b553..5bd0e2f06 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java @@ -1,6 +1,8 @@ package org.uniprot.core.parser.fasta.uniparc; import org.uniprot.core.Sequence; +import org.uniprot.core.uniparc.UniParcCrossReference; +import org.uniprot.core.uniparc.UniParcEntry; import org.uniprot.core.uniparc.UniParcEntryLight; import static org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder.HAS_ACTIVE_CROSS_REF; @@ -9,8 +11,20 @@ * @author jluo * @date: 24 Jun 2019 */ -public class UniParcEntryLightFastaParser { - private UniParcEntryLightFastaParser(){} +public class UniParcFastaParser { + private UniParcFastaParser(){} + + + public static String toFasta(UniParcEntry entry) { + String status = "active"; + boolean isActive = entry.getUniParcCrossReferences() + .stream() + .anyMatch(UniParcCrossReference::isActive); + if (!isActive) { + status = "inactive"; + } + return getFastaString(entry.getUniParcId().getValue(), status, entry.getSequence()); + } public static String toFasta(UniParcEntryLight entry) { String status = "active"; diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParserTest.java deleted file mode 100644 index 0efdeb8a4..000000000 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcEntryLightFastaParserTest.java +++ /dev/null @@ -1,49 +0,0 @@ -package org.uniprot.core.parser.fasta.uniparc; - -import static org.junit.jupiter.api.Assertions.*; -import static org.uniprot.core.parser.fasta.uniparc.UniParcFastaParserTestUtils.*; -import static org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder.HAS_ACTIVE_CROSS_REF; - -import org.junit.jupiter.api.Test; -import org.uniprot.core.uniparc.*; -import org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder; - -/** - * @author jluo - * @date: 24 Jun 2019 - */ -class UniParcEntryLightFastaParserTest { - - public static final String EXPECTED_FASTA_RESULT = """ - >UPI0000083A08 status=active - MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT - LLRAIDWFRDNGYFNA"""; - public static final String EXPECTED_FASTA_RESULT_INACTIVE = """ - >UPI0000083A08 status=inactive - MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT - LLRAIDWFRDNGYFNA"""; - - - @Test - void testUniParcEntryLightToFasta() { - UniParcEntryLight entry = createEntryLight(); - String fasta = UniParcEntryLightFastaParser.toFasta(entry); - assertEquals(EXPECTED_FASTA_RESULT, fasta); - } - - @Test - void testUniParcEntryLightToFastaInactive() { - UniParcEntryLight entry = createEntryLight(); - entry = UniParcEntryLightBuilder.from(entry).extraAttributesAdd(HAS_ACTIVE_CROSS_REF, false).build(); - String fasta = UniParcEntryLightFastaParser.toFasta(entry); - assertEquals(EXPECTED_FASTA_RESULT_INACTIVE, fasta); - } - - private UniParcEntryLight createEntryLight() { - return new UniParcEntryLightBuilder() - .uniParcId("UPI0000083A08") - .sequence(getSequence()) - .build(); - } - -} diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java new file mode 100644 index 000000000..5a92563b8 --- /dev/null +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParserTest.java @@ -0,0 +1,123 @@ +package org.uniprot.core.parser.fasta.uniparc; + +import static org.junit.jupiter.api.Assertions.*; +import static org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder.HAS_ACTIVE_CROSS_REF; + +import org.junit.jupiter.api.Test; +import org.uniprot.core.Property; +import org.uniprot.core.Sequence; +import org.uniprot.core.impl.SequenceBuilder; +import org.uniprot.core.uniparc.*; +import org.uniprot.core.uniparc.impl.*; +import org.uniprot.core.uniprotkb.taxonomy.Organism; +import org.uniprot.core.uniprotkb.taxonomy.impl.OrganismBuilder; + +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * @author jluo + * @date: 24 Jun 2019 + */ +class UniParcFastaParserTest { + + public static final String EXPECTED_FASTA_RESULT = """ + >UPI0000083A08 status=active + MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT + LLRAIDWFRDNGYFNA"""; + public static final String EXPECTED_FASTA_RESULT_INACTIVE = """ + >UPI0000083A08 status=inactive + MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT + LLRAIDWFRDNGYFNA"""; + @Test + void testUniParcEntryToFasta() { + UniParcEntry entry = create(); + String fasta = UniParcFastaParser.toFasta(entry); + assertEquals(EXPECTED_FASTA_RESULT, fasta); + } + + @Test + void testUniParcEntryLightToFasta() { + UniParcEntryLight entry = createEntryLight(); + String fasta = UniParcFastaParser.toFasta(entry); + assertEquals(EXPECTED_FASTA_RESULT, fasta); + } + + @Test + void testUniParcEntryLightToFastaInactive() { + UniParcEntryLight entry = createEntryLight(); + entry = UniParcEntryLightBuilder.from(entry).extraAttributesAdd(HAS_ACTIVE_CROSS_REF, false).build(); + String fasta = UniParcFastaParser.toFasta(entry); + assertEquals(EXPECTED_FASTA_RESULT_INACTIVE, fasta); + } + + private UniParcEntry create() { + Sequence sequence = getSequence(); + List xrefs = getXrefs(); + return new UniParcEntryBuilder() + .uniParcId(new UniParcIdBuilder("UPI0000083A08").build()) + .uniParcCrossReferencesSet(xrefs) + .sequence(sequence) + .build(); + } + + private UniParcEntryLight createEntryLight() { + return new UniParcEntryLightBuilder() + .uniParcId("UPI0000083A08") + .sequence(getSequence()) + .build(); + } + + private static Sequence getSequence() { + String seq = + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT" + "LLRAIDWFRDNGYFNA"; + Sequence sequence = new SequenceBuilder(seq).build(); + return sequence; + } + + private List getXrefs() { + Organism taxonomy = + new OrganismBuilder().taxonId(9606).scientificName("Homo sapiens").build(); + List properties = new ArrayList<>(); + properties.add(new Property("prop1", "pvalue")); + UniParcCrossReference xref = + new UniParcCrossReferenceBuilder() + .versionI(3) + .database(UniParcDatabase.SWISSPROT) + .id("P12345") + .version(7) + .active(true) + .created(LocalDate.of(2017, 5, 17)) + .lastUpdated(LocalDate.of(2017, 2, 27)) + .propertiesSet(properties) + .organism(taxonomy) + .proteinName("some pname") + .geneName("some gname") + .build(); + + List properties2 = new ArrayList<>(); + properties.add(new Property("prop2", "pvalue2")); + Organism taxonomy2 = new OrganismBuilder().taxonId(10090).scientificName("MOUSE").build(); + + UniParcCrossReference xref2 = + new UniParcCrossReferenceBuilder() + .versionI(1) + .database(UniParcDatabase.TREMBL) + .id("P52346") + .version(7) + .active(true) + .created(LocalDate.of(2017, 2, 12)) + .lastUpdated(LocalDate.of(2017, 4, 23)) + .propertiesSet(properties2) + .organism(taxonomy2) + .proteinName("some pname") + .proteomeId("UP00000564") + .component("chromosome 1") + .build(); + + return Arrays.asList(xref, xref2); + } + +} diff --git a/xml-parser/src/main/java/org/uniprot/core/xml/CrossReferenceConverterUtils.java b/xml-parser/src/main/java/org/uniprot/core/xml/CrossReferenceConverterUtils.java index c9ca29d03..dc0cefd68 100644 --- a/xml-parser/src/main/java/org/uniprot/core/xml/CrossReferenceConverterUtils.java +++ b/xml-parser/src/main/java/org/uniprot/core/xml/CrossReferenceConverterUtils.java @@ -1,6 +1,7 @@ package org.uniprot.core.xml; import com.google.common.base.Strings; +import org.uniprot.core.uniparc.UniParcCrossReference; import org.uniprot.core.uniparc.impl.UniParcCrossReferenceBuilder; import org.uniprot.core.uniprotkb.taxonomy.Organism; import org.uniprot.core.uniprotkb.taxonomy.impl.OrganismBuilder; @@ -18,6 +19,7 @@ public class CrossReferenceConverterUtils { public static final String PROPERTY_COMPONENT = "component"; public static final String PROPERTY_NCBI_TAXONOMY_ID = "NCBI_taxonomy_id"; public static final String PROPERTY_UNIPROTKB_ACCESSION = "UniProtKB_accession"; + public static final String PROPERTY_SOURCES = UniParcCrossReference.PROPERTY_SOURCES; private CrossReferenceConverterUtils(){} @@ -47,6 +49,9 @@ public static void populateUniParcCrossReferenceBuilder(String propertyType, Str case PROPERTY_UNIPROTKB_ACCESSION: builder.propertiesAdd(PROPERTY_UNIPROTKB_ACCESSION, propertyValue); break; + case PROPERTY_SOURCES: + builder.propertiesAdd(PROPERTY_SOURCES, propertyValue); + break; default: throw new XmlReaderException( "Unable to read xml property: " diff --git a/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java b/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java index 77c631e8f..4da7e0502 100644 --- a/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java +++ b/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java @@ -32,7 +32,6 @@ public class UniParcDBCrossReferenceConverter public static final String PROPERTY_COMPONENT = "component"; public static final String PROPERTY_NCBI_TAXONOMY_ID = "NCBI_taxonomy_id"; public static final String PROPERTY_UNIPROTKB_ACCESSION = "UniProtKB_accession"; - public static final String PROPERTY_SOURCE = "source"; private final ObjectFactory xmlFactory; private final TaxonomyRepo taxonomyRepo; @@ -57,13 +56,6 @@ public UniParcCrossReference fromXml(DbReferenceType xmlObj) { .lastUpdated(XmlConverterHelper.dateFromXml(xmlObj.getLast())); for (PropertyType property : xmlObj.getProperty()) { - - //TODO: Change it - /* - case PROPERTY_SOURCE: - builder.propertiesAdd(PROPERTY_SOURCE, property.getValue()); - break; - */ CrossReferenceConverterUtils.populateUniParcCrossReferenceBuilder(property.getType(), property.getValue(), builder, taxonomyRepo); } if (xmlObj.getVersion() != null) builder.version(xmlObj.getVersion()); From 47003db44e4e1fcd2c43ed3b2fd5f435a3fb9edd Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Fri, 6 Dec 2024 13:48:25 +0000 Subject: [PATCH 12/13] code format --- .../fasta/uniparc/UniParcProteomeFastaParser.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index d8863d473..9d1b1609d 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -29,6 +29,9 @@ public static String toFasta(UniParcEntry entry) { for(UniParcCrossReference xref: entry.getUniParcCrossReferences()){ if(uniProtDatabases.contains(xref.getDatabase())){ uniProtXrefs.add(xref); + if(proteomeId == null && xref.hasProperties()){ + proteomeId = getProteomeId(xref.getProperties().get(0)); + } if(xref.isActive()){ hasActive = true; } @@ -51,6 +54,18 @@ public static String toFasta(UniParcEntry entry) { return sb.toString(); } + private static String getProteomeId(Property property) { + String result = null; + String[] sourcePropertyValues = property.getValue().split(","); + if(sourcePropertyValues.length == 1){ + String[] propertyValue = sourcePropertyValues[0].split(":"); + if(propertyValue.length > 1){ + result = propertyValue[1]; + } + } + return result; + } + private static StringBuilder getFastaHeader(List xrefs, boolean hasActive, String id, String proteomeId, boolean isSource) { Set proteinName = new LinkedHashSet<>(); Set geneNames = new LinkedHashSet<>(); From 6e1c554beb3dbceec93958a2bbabbd94dad706dc Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Mon, 9 Dec 2024 10:59:14 +0000 Subject: [PATCH 13/13] improve fasta details --- .../core/parser/fasta/uniparc/UniParcFastaParser.java | 10 ++-------- .../xml/uniparc/UniParcDBCrossReferenceConverter.java | 10 ---------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java index 5bd0e2f06..7b375c6d5 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcFastaParser.java @@ -5,6 +5,7 @@ import org.uniprot.core.uniparc.UniParcEntry; import org.uniprot.core.uniparc.UniParcEntryLight; +import static org.uniprot.core.parser.fasta.FastaUtils.*; import static org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder.HAS_ACTIVE_CROSS_REF; /** @@ -40,14 +41,7 @@ private static String getFastaString(String entry, String status, Sequence seque sb.append(">").append(entry).append(" "); sb.append("status=").append(status); sb.append("\n"); - int columnCounter = 0; - for (char c : sequence.getValue().toCharArray()) { - if (columnCounter % 60 == 0 && columnCounter > 0) { - sb.append("\n"); - } - sb.append(c); - columnCounter++; - } + sb.append(parseSequence(sequence.getValue())); return sb.toString(); } } diff --git a/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java b/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java index 4da7e0502..381caa0c1 100644 --- a/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java +++ b/xml-parser/src/main/java/org/uniprot/core/xml/uniparc/UniParcDBCrossReferenceConverter.java @@ -23,16 +23,6 @@ */ public class UniParcDBCrossReferenceConverter implements Converter { - - public static final String PROPERTY_GENE_NAME = "gene_name"; - public static final String PROPERTY_PROTEIN_NAME = "protein_name"; - public static final String PROPERTY_CHAIN = "chain"; - public static final String PROPERTY_NCBI_GI = "NCBI_GI"; - public static final String PROPERTY_PROTEOME_ID = "proteome_id"; - public static final String PROPERTY_COMPONENT = "component"; - public static final String PROPERTY_NCBI_TAXONOMY_ID = "NCBI_taxonomy_id"; - public static final String PROPERTY_UNIPROTKB_ACCESSION = "UniProtKB_accession"; - private final ObjectFactory xmlFactory; private final TaxonomyRepo taxonomyRepo;