Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Trm 30823 UniParc proteome redundant fasta #249

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
5ffb871
First Version of FASTA with sequence sources
LeonardoGonzales Mar 12, 2024
805dd62
Updated second Version of FASTA with sequence sources
LeonardoGonzales May 9, 2024
d4fb112
Merge branch 'main' into TRM-30823-REDUNDANT-FASTA
LeonardoGonzales May 9, 2024
5d93997
Add exploratory options source ids and proteome and proteome componen…
LeonardoGonzales May 14, 2024
65f50d6
Merge branch 'main' into TRM-30823-REDUNDANT-FASTA
LeonardoGonzales May 14, 2024
5f11138
Fix exploratory options source ids and proteome and proteome componen…
LeonardoGonzales May 15, 2024
ac82972
Merge branch 'main' into TRM-30823-REDUNDANT-FASTA
LeonardoGonzales May 15, 2024
01e806c
add SWISSPROT_VARSPLIC database
LeonardoGonzales May 15, 2024
7a6b247
Merge branch 'main' into TRM-30823-REDUNDANT-FASTA
LeonardoGonzales Jun 6, 2024
2cb4648
Add stream to uniparc proteome endpoint
LeonardoGonzales Jun 13, 2024
cad6c55
Merge branch 'main' into TRM-30823-REDUNDANT-FASTA
LeonardoGonzales Aug 22, 2024
d3c669a
Merge branch 'uniparc_light_vd_v2' into TRM-30823-REDUNDANT-FASTA
LeonardoGonzales Aug 27, 2024
fc381eb
Merge changes
LeonardoGonzales Aug 27, 2024
149a8cf
add sequence source for proteome fasta
LeonardoGonzales Sep 3, 2024
01539d9
Merge branch 'uniparc_light_vd_v2' into TRM-30823-REDUNDANT-FASTA_TST
LeonardoGonzales Sep 24, 2024
335954d
Improvements in the fasta Format parser and sources attribute
LeonardoGonzales Oct 1, 2024
97d68a9
Merge branch 'main' into TRM-30823-REDUNDANT-FASTA_TST
LeonardoGonzales Nov 25, 2024
a13cefc
improve test in UniParcProteomeFastaParser.java
LeonardoGonzales Nov 28, 2024
606d6ef
improve uniparc xref voldemort index
LeonardoGonzales Dec 3, 2024
47003db
code format
LeonardoGonzales Dec 6, 2024
6e1c554
improve fasta details
LeonardoGonzales Dec 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
*/
public interface UniParcCrossReference extends CrossReference<UniParcDatabase> {

Copy link
Contributor

@ahmadshadab ahmadshadab Dec 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add the formula and an example of how we construct a source, i.e. `<sourceId>:<proteomeId>:<proteomeComponent>`, e.g. `ABC01415:UP000005640:Chromosome 1`.

public static final String PROPERTY_SOURCES = "sources";

int getVersionI();

Integer getVersion();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,92 +6,99 @@
import org.uniprot.core.util.EnumDisplay;

public enum UniParcDatabase implements Database, EnumDisplay {
EG_BACTERIA(900, "EnsemblBacteria", true, "https://www.ensemblgenomes.org/id/%id"),
EG_FUNGI(1000, "EnsemblFungi", true, "https://www.ensemblgenomes.org/id/%id"),
EG_METAZOA(1100, "EnsemblMetazoa", true, "https://www.ensemblgenomes.org/id/%id"),
EG_PLANTS(1200, "EnsemblPlants", true, "https://www.ensemblgenomes.org/id/%id"),
EG_PROTISTS(1300, "EnsemblProtists", true, "https://www.ensemblgenomes.org/id/%id"),

EMBL(300, "EMBL", true, "https://www.ebi.ac.uk/ena/browser/view/%id"),
EMBL_CON(400, "EMBL_CON", true, "https://www.ebi.ac.uk/ena/browser/view/%id"),
EMBL_TPA(500, "EMBL_TPA", false, "https://www.ebi.ac.uk/ena/browser/view/%id"),
EMBL_TSA(600, "EMBL_TSA", true, "https://www.ebi.ac.uk/ena/browser/view/%id"),
EMBLWGS(700, "EMBLWGS", true, "https://www.ebi.ac.uk/ena/browser/view/%id"),

ENSEMBL_VERTEBRATE(800, "Ensembl", true, "https://www.ensembl.org/id/%id"),
ENSEMBL_RAPID(1350, "EnsemblRapid", true, "https://rapid.ensembl.org/id/%id"),

EPO(1400, "EPO", true, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=epo_prt&id=%id"),
FLYBASE(1500, "FlyBase", true, "https://flybase.org/reports/%id.html"),
EG_BACTERIA(900, "EnsemblBacteria", true, true, "https://www.ensemblgenomes.org/id/%id"),
EG_FUNGI(1000, "EnsemblFungi", true, true, "https://www.ensemblgenomes.org/id/%id"),
EG_METAZOA(1100, "EnsemblMetazoa", true, true, "https://www.ensemblgenomes.org/id/%id"),
EG_PLANTS(1200, "EnsemblPlants", true, true, "https://www.ensemblgenomes.org/id/%id"),
EG_PROTISTS(1300, "EnsemblProtists", true, true, "https://www.ensemblgenomes.org/id/%id"),

EMBL(300, "EMBL", true, true, "https://www.ebi.ac.uk/ena/browser/view/%id"),
EMBL_CON(400, "EMBL_CON", true, true, "https://www.ebi.ac.uk/ena/browser/view/%id"),
EMBL_TPA(500, "EMBL_TPA", false, true, "https://www.ebi.ac.uk/ena/browser/view/%id"),
EMBL_TSA(600, "EMBL_TSA", true, true, "https://www.ebi.ac.uk/ena/browser/view/%id"),
EMBLWGS(700, "EMBLWGS", true, true, "https://www.ebi.ac.uk/ena/browser/view/%id"),

ENSEMBL_VERTEBRATE(800, "Ensembl", true, true, "https://www.ensembl.org/id/%id"),
ENSEMBL_RAPID(1350, "EnsemblRapid", true, true, "https://rapid.ensembl.org/id/%id"),

EPO(1400, "EPO", true, false, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=epo_prt&id=%id"),
FLYBASE(1500, "FlyBase", true,false, "https://flybase.org/reports/%id.html"),
FUSION_GDB(
1550,
"FusionGDB",
true,
false,
"https://compbio.uth.edu/FusionGDB2/gene_search_result.cgi?type=quick_search&quick_search=%id"),

H_INV(1600, "H-InvDB", false),
IPI(1700, "IPI", false),
H_INV(1600, "H-InvDB",false, false),
IPI(1700, "IPI",false, false),

JPO(1800, "JPO", true, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=jpo_prt&id=%id"),
KIPO(1900, "KIPO", true, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=kipo_prt&id=%id"),
PATRIC(2000, "PATRIC", true, "https://www.patricbrc.org/view/Feature/%id"),
JPO(1800, "JPO", true,false, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=jpo_prt&id=%id"),
KIPO(1900, "KIPO", true,false, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=kipo_prt&id=%id"),
PATRIC(2000, "PATRIC", true,false, "https://www.patricbrc.org/view/Feature/%id"),
PDB(
2100,
"PDB",
true,
false,
"https://www.ebi.ac.uk/pdbe/entry/pdb/%id"), // need to remove the chain, eg "4q8n_A",
// just use "4q8n" as id
PIR(2200, "PIR", false),
PIR(2200, "PIR",false, false),

PIRARC(2300, "PIRARC", false),
PRF(2400, "PRF", false, "http://www.prf.or.jp/cgi-bin/seqget.pl?id=%id"),
REFSEQ(2500, "RefSeq", true, "https://www.ncbi.nlm.nih.gov/protein/%id"),
REMTREMBL(2600, "REMTREMBL", false),
PIRARC(2300, "PIRARC",false, false),
PRF(2400, "PRF",false, false, "http://www.prf.or.jp/cgi-bin/seqget.pl?id=%id"),
REFSEQ(2500, "RefSeq", true, true,"https://www.ncbi.nlm.nih.gov/protein/%id"),
REMTREMBL(2600, "REMTREMBL",false, false),
SEED(
2700,
"SEED",
true,
false,
"https://pubseed.theseed.org/seedviewer.cgi?page=Annotation&feature=%id"),

SGD(2800, "SGD", true, "https://www.yeastgenome.org/locus/%id"),
SWISSPROT(100, "UniProtKB/Swiss-Prot", true, "https://www.uniprot.org/uniprot/%id"),
SGD(2800, "SGD", true, false,"https://www.yeastgenome.org/locus/%id"),
SWISSPROT(100, "UniProtKB/Swiss-Prot", true, false,"https://www.uniprot.org/uniprot/%id"),
SWISSPROT_VARSPLIC(
200,
"UniProtKB/Swiss-Prot protein isoforms",
true,
false,
"https://www.uniprot.org/uniprot/%id"), // swissprot isoform
TAIR_ARABIDOPSIS(
2900,
"TAIR",
true,
false,
"https://www.arabidopsis.org/servlets/TairObject?type=aa_sequence&name=%id"),
TREMBL(100, "UniProtKB/TrEMBL", true, "https://www.uniprot.org/uniprot/%id"),
TREMBL(100, "UniProtKB/TrEMBL", true, false, "https://www.uniprot.org/uniprot/%id"),

TREMBLNEW(3000, "TREMBLNEW", false),
TREMBL_VARSPLIC(3100, "TREMBL_VARSPLIC", false),
TROME(3200, "TROME", true), // no link
UNIMES(3300, "UNIMES", false),
USPTO(3400, "USPTO", true, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uspto_prt&id=%id"),
TREMBLNEW(3000, "TREMBLNEW", false, false),
TREMBL_VARSPLIC(3100, "TREMBL_VARSPLIC", false, false),
TROME(3200, "TROME", true, false), // no link
UNIMES(3300, "UNIMES", false, false),
USPTO(3400, "USPTO", true, false, "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=uspto_prt&id=%id"),

VECTORBASE(3500, "VectorBase", false),
VEGA(3600, "VEGA", true, "https://vega.sanger.ac.uk/id/%id"),
WORMBASE_PARASITE(3700, "WBParaSite", true, "https://parasite.wormbase.org/id/%id"),
WORMBASE(3800, "WormBase", true, "https://wormbase.org/db/seq/protein?name=%id;class=CDS");
VECTORBASE(3500, "VectorBase", false, false),
VEGA(3600, "VEGA", true, false, "https://vega.sanger.ac.uk/id/%id"),
WORMBASE_PARASITE(3700, "WBParaSite", true, true, "https://parasite.wormbase.org/id/%id"),
WORMBASE(3800, "WormBase", true, true,"https://wormbase.org/db/seq/protein?name=%id;class=CDS");

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do we know if a database is source? Could you please add source of this information in this class?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jie provided the list of databases that are sources in UniParc.

private final String displayName;
private final boolean alive;
private final String url;
private final int index;
private final boolean source;

UniParcDatabase(int index, String displayName, boolean alive) {
this(index, displayName, alive, "");
UniParcDatabase(int index, String displayName, boolean alive, boolean source) {
this(index, displayName, alive, source, "");
}

UniParcDatabase(int index, String displayName, boolean alive, String url) {
UniParcDatabase(int index, String displayName, boolean alive, boolean source,String url) {
this.index = index;
this.displayName = displayName;
this.alive = alive;
this.url = url;
this.source = source;
}

public int getIndex() {
Expand All @@ -110,6 +117,10 @@ public String getUrl() {
return url;
}

/**
 * Reports the source flag this database constant was declared with.
 *
 * @return the {@code source} value passed to this constant's constructor
 */
public boolean isSource() {
    return this.source;
}

public static @Nonnull UniParcDatabase typeOf(@Nonnull String displayName) {
return EnumDisplay.typeOf(displayName, UniParcDatabase.class);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,16 @@ void hasNoUrl() {
assertEquals("", UniParcDatabase.VECTORBASE.getUrl());
}

/** EMBL is declared with its source flag set, so it must report itself as a source. */
@Test
void isSource() {
    UniParcDatabase embl = UniParcDatabase.EMBL;
    assertTrue(embl.isSource());
}

/** FlyBase is declared with its source flag cleared, so it must not report itself as a source. */
@Test
void isNotSource() {
    UniParcDatabase flyBase = UniParcDatabase.FLYBASE;
    assertFalse(flyBase.isSource());
}

@Test
void canGetIndex() {
assertEquals(100, UniParcDatabase.SWISSPROT.getIndex());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package org.uniprot.core.parser.fasta;

public class FastaUtils {

public static String parseSequence(String sequence) {
StringBuilder sb = new StringBuilder();
int columnCounter = 0;
for (char c : sequence.toCharArray()) {
if (columnCounter % 60 == 0 && columnCounter > 0) {
sb.append("\n");
}
sb.append(c);
columnCounter++;
}
return sb.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import org.uniprot.core.uniref.UniRefEntry;
import org.uniprot.core.uniref.UniRefEntryLight;

import static org.uniprot.core.parser.fasta.FastaUtils.parseSequence;

/**
* @author jluo
* @date: 22 Aug 2019
Expand All @@ -15,30 +17,14 @@ private UniRefFastaParser() {}
public static String toFasta(UniRefEntryLight entry) {
StringBuilder sb = new StringBuilder();
sb.append(getHeader(entry)).append("\n");
int columnCounter = 0;
String sequence = entry.getRepresentativeMember().getSequence().getValue();
for (char c : sequence.toCharArray()) {
if (columnCounter % 60 == 0 && columnCounter > 0) {
sb.append("\n");
}
sb.append(c);
columnCounter++;
}
sb.append(parseSequence(entry.getRepresentativeMember().getSequence().getValue()));
return sb.toString();
}

public static String toFasta(UniRefEntry entry) {
StringBuilder sb = new StringBuilder();
sb.append(getHeader(entry)).append("\n");
String sequence = entry.getRepresentativeMember().getSequence().getValue();
int columnCounter = 0;
for (char c : sequence.toCharArray()) {
if (columnCounter % 60 == 0 && columnCounter > 0) {
sb.append("\n");
}
columnCounter++;
sb.append(c);
}
sb.append(parseSequence(entry.getRepresentativeMember().getSequence().getValue()));
return sb.toString();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package org.uniprot.core.parser.fasta;
package org.uniprot.core.parser.fasta.uniparc;

import org.uniprot.core.Sequence;
import org.uniprot.core.uniparc.UniParcCrossReference;
import org.uniprot.core.uniparc.UniParcEntry;
import org.uniprot.core.uniparc.UniParcEntryLight;

import static org.uniprot.core.parser.fasta.FastaUtils.*;
import static org.uniprot.core.uniparc.impl.UniParcEntryLightBuilder.HAS_ACTIVE_CROSS_REF;

/**
Expand All @@ -14,6 +15,7 @@
public class UniParcFastaParser {
private UniParcFastaParser(){}


public static String toFasta(UniParcEntry entry) {
String status = "active";
boolean isActive = entry.getUniParcCrossReferences()
Expand All @@ -39,14 +41,7 @@ private static String getFastaString(String entry, String status, Sequence seque
sb.append(">").append(entry).append(" ");
sb.append("status=").append(status);
sb.append("\n");
int columnCounter = 0;
for (char c : sequence.getValue().toCharArray()) {
if (columnCounter % 60 == 0 && columnCounter > 0) {
sb.append("\n");
}
sb.append(c);
columnCounter++;
}
sb.append(parseSequence(sequence.getValue()));
return sb.toString();
}
}
Loading
Loading