-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #91 from cdanielmachado/development
integrate development changes
- Loading branch information
Showing
48 changed files
with
747,502 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
import pandas as pd | ||
from reframed import load_cbmodel | ||
from carveme.reconstruction.utils import load_media_db | ||
from carveme.reconstruction.benchmark import benchmark_biolog, benchmark_essentiality, mcc | ||
from carveme import project_dir | ||
from subprocess import call | ||
import argparse | ||
|
||
|
||
# Display names of the benchmark organisms, keyed by the short organism id
# that the rest of this module uses for file names and media/data lookups.
# Spelling of the species/strain names matches the genome file names below
# ("Rsolanacearum_GMI1000", "Paeruginosa_PAO1").
organisms = {
    'bsub': 'Bacillus subtilis (168)',
    'ecol': 'Escherichia coli (K-12 MG1655)',
    'mgen': 'Mycoplasma genitalium (G-37)',
    'paer': 'Pseudomonas aeruginosa (PAO1)',
    'rsol': 'Ralstonia solanacearum (GMI1000)',
    'sone': 'Shewanella oneidensis (MR-1)',
}
|
||
# Protein FASTA file of each organism; build_models() looks these up under
# "<data_path>/fasta/".
genomes = {
    "bsub": "Bsubtilis_168.faa",
    "ecol": "Ecoli_K12_MG1655.faa",
    "mgen": "M_genitalium_G37.faa",
    "paer": "Paeruginosa_PAO1.faa",
    "rsol": "Rsolanacearum_GMI1000.faa",
    "sone": "Soneidensis_MR1.faa",
}
|
||
# Root directory of the benchmark inputs (fasta, models, biolog,
# essentiality, media_db.tsv) and of the results written by benchmark().
data_path = f"{project_dir}/data/benchmark"
|
||
# Medium id (a key of the media database) used for the biolog benchmark of
# each organism. Organisms missing here are not part of that benchmark.
biolog_media = {
    "bsub": "M9",
    "ecol": "M9",
    "paer": "M9",
    "rsol": "M9",
    "sone": "ShewMM",
}

# Medium id used for the gene-essentiality benchmark of each organism.
# None means no medium is handed to benchmark_essentiality().
essentiality_media = {
    "bsub": "LB",
    "ecol": "M9",
    "mgen": None,
    "paer": "M9[succ]",
    "sone": "LB",
}

# Element codes for which a biolog table is loaded per organism
# (see `elements` for the file-name suffix of each code).
biolog_sources = {
    "bsub": ["C", "N", "P", "S"],
    "ecol": ["C", "N", "P", "S"],
    "paer": ["C"],
    "rsol": ["C", "N", "P", "S"],
    "sone": ["C", "N"],
}

# Element code -> name used in the "biolog_<name>.tsv" file names.
elements = {
    "C": "carbon",
    "N": "nitrogen",
    "P": "phosphorus",
    "S": "sulfur",
}

# Compounds removed from the base medium before testing alternative
# sources of the given element.
biolog_compounds = {
    "C": {"glc__D", "lac__D", "lac__L"},
    "N": {"nh4"},
    "P": {"pi"},
    "S": {"so4"},
}
|
||
|
||
def build_models():
    """Carve one model per benchmark organism by invoking ``carve``.

    For every entry of ``genomes`` the FASTA file under
    ``<data_path>/fasta`` is passed to the ``carve`` command and the model
    is written to ``<data_path>/models/<org_id>.xml``. If the organism has
    a biolog and/or essentiality medium, those media are passed through
    ``-g`` together with ``--mediadb <data_path>/media_db.tsv``.

    A non-zero exit status of ``carve`` is reported with a printed warning;
    the remaining organisms are still processed.
    """
    mediadb = f"{data_path}/media_db.tsv"

    for org_id, genome in genomes.items():
        print(f'Carving model for {organisms[org_id]}')

        fasta_file = f"{data_path}/fasta/{genome}"
        model_file = f"{data_path}/models/{org_id}.xml"

        # De-duplicate the media while keeping a fixed order (biolog medium
        # first). Joining a set gave a different order from run to run,
        # which made the generated command non-reproducible.
        candidates = (biolog_media.get(org_id), essentiality_media.get(org_id))
        media = ','.join(dict.fromkeys(m for m in candidates if m))

        command = ['carve', fasta_file, '-o', model_file]
        if media:
            command += ['-g', media, '--mediadb', mediadb]
        command.append('--fbc2')

        # An argument list (no shell) keeps paths with spaces and media ids
        # such as "M9[succ]" intact without any manual quoting.
        status = call(command)
        if status != 0:
            print(f'Warning: carve exited with status {status} '
                  f'for {organisms[org_id]}')
|
||
|
||
def load_models():
    """Load the model file of every organism listed in ``organisms``.

    Returns:
        dict mapping organism id to the object returned by ``load_cbmodel``
        for ``<data_path>/models/<org_id>.xml`` (read with flavor 'bigg').
    """
    return {
        org_id: load_cbmodel(f"{data_path}/models/{org_id}.xml", flavor='bigg')
        for org_id in organisms
    }
|
||
|
||
def load_biolog_data():
    """Read the biolog tables of every organism in ``biolog_sources``.

    Returns:
        Nested dict ``{org_id: {element_code: DataFrame}}`` where each
        DataFrame is the tab-separated file
        ``<data_path>/biolog/<org_id>/biolog_<element name>.tsv``.
    """
    def read_table(org_id, source):
        # One table per (organism, element) pair.
        path = f'{data_path}/biolog/{org_id}/biolog_{elements[source]}.tsv'
        return pd.read_csv(path, sep='\t')

    return {
        org_id: {source: read_table(org_id, source) for source in sources}
        for org_id, sources in biolog_sources.items()
    }
|
||
|
||
def load_essentiality_data():
    """Read the gene-essentiality table of every organism in ``essentiality_media``.

    Each ``<data_path>/essentiality/<org_id>.tsv`` is split by its
    ``phenotype`` column: rows labelled "E" are essential, rows labelled
    "NE" are non-essential. Gene ids come from the ``bigg_id`` column and
    are prefixed with "G_".

    Returns:
        Tuple ``(essential, non_essential)`` of dicts mapping organism id
        to a set of prefixed gene ids.
    """
    essential, non_essential = {}, {}

    for org_id in essentiality_media:
        table = pd.read_csv(f'{data_path}/essentiality/{org_id}.tsv', sep='\t')
        for label, target in (('E', essential), ('NE', non_essential)):
            gene_ids = table.loc[table['phenotype'] == label, 'bigg_id']
            target[org_id] = {'G_' + gene for gene in gene_ids}

    return essential, non_essential
|
||
|
||
def run_biolog_benchmark(models, biolog_data, media_db):
    """Run ``benchmark_biolog`` for every organism/element pair.

    Args:
        models: dict of organism id -> model (as built by load_models()).
        biolog_data: nested dict ``{org_id: {element_code: table}}``
            (as built by load_biolog_data()).
        media_db: mapping of medium id -> iterable of compound ids.

    Returns:
        DataFrame with one row per benchmarked metabolite and the columns
        'org', 'source', 'met', 'value'.
    """
    rows = []

    for org_id, medium in biolog_media.items():
        print(f'Running biolog benchmark for {organisms[org_id]}')
        model = models[org_id]

        for source in biolog_sources[org_id]:
            # Base medium without the default sources of this element.
            compounds = set(media_db[medium]) - biolog_compounds[source]
            outcome = benchmark_biolog(model, compounds, biolog_data[org_id][source])
            rows.extend((org_id, source, met, res) for met, res in outcome.items())

    return pd.DataFrame(rows, columns=['org', 'source', 'met', 'value'])
|
||
|
||
def run_essentiality_benchmark(models, essential, non_essential, media_db):
    """Run ``benchmark_essentiality`` for every organism in ``essentiality_media``.

    Only genes present in the model are considered. When no non-essential
    genes are recorded for an organism, every model gene that is not listed
    as essential is treated as non-essential.

    Args:
        models: dict of organism id -> model.
        essential: dict of organism id -> set of essential gene ids.
        non_essential: dict of organism id -> set of non-essential gene ids.
        media_db: mapping of medium id -> compounds of that medium.

    Returns:
        DataFrame with one row per gene and the columns 'org', 'gene',
        'value'.
    """
    rows = []

    for org_id, medium in essentiality_media.items():
        print(f'Running essentiality benchmark for {organisms[org_id]}')
        model = models[org_id]
        model_genes = set(model.genes)

        marked_essential = essential[org_id] & model_genes
        if non_essential[org_id]:
            marked_non_essential = non_essential[org_id] & model_genes
        else:
            marked_non_essential = model_genes - set(essential[org_id])

        # Non-essential entries are applied last, so they win if a gene
        # appears in both sets.
        in_vivo = dict.fromkeys(marked_essential, True)
        in_vivo.update(dict.fromkeys(marked_non_essential, False))

        compounds = media_db[medium] if medium else None
        outcome = benchmark_essentiality(model, compounds, in_vivo)
        rows.extend((org_id, gene, res) for gene, res in outcome.items())

    return pd.DataFrame(rows, columns=['org', 'gene', 'value'])
|
||
|
||
def benchmark(rebuild=True, biolog=True, essentiality=True):
    """Run the selected benchmarks and write their results as TSV files.

    Args:
        rebuild: when true, carve all models again before benchmarking.
        biolog: when true, run the biolog benchmark and write
            ``<data_path>/results/biolog.tsv``.
        essentiality: when true, run the essentiality benchmark and write
            ``<data_path>/results/essentiality.tsv``.

    The MCC value of each executed benchmark is printed.
    """
    if rebuild:
        build_models()

    models = load_models()
    media_db = load_media_db(f'{data_path}/media_db.tsv')

    if biolog:
        df_biolog = run_biolog_benchmark(models, load_biolog_data(), media_db)
        df_biolog.to_csv(f'{data_path}/results/biolog.tsv', sep='\t', index=False)
        print(f'Biolog final MCC value: {mcc(df_biolog):.3f}')

    if essentiality:
        essential, non_essential = load_essentiality_data()
        df_essentiality = run_essentiality_benchmark(models, essential, non_essential, media_db)
        df_essentiality.to_csv(f'{data_path}/results/essentiality.tsv', sep='\t', index=False)
        print(f'Essentiality final MCC value: {mcc(df_essentiality):.3f}')
|
||
|
||
def main():
    """Command-line entry point: parse the skip flags and run benchmark().

    Every step runs by default; each ``--skip-*`` flag disables one step.
    """
    parser = argparse.ArgumentParser(description="Benchmark CarveMe using biolog and gene essentiality data")

    parser.add_argument('--skip-rebuild', action='store_true', dest='no_rebuild',
                        help="Do not rebuild models during this call.")
    parser.add_argument('--skip-biolog', action='store_true', dest='no_biolog',
                        help="Skip biolog benchmark.")
    # The help text used to repeat the biolog message (copy-paste slip).
    parser.add_argument('--skip-essentiality', action='store_true', dest='no_essentiality',
                        help="Skip essentiality benchmark.")

    args = parser.parse_args()

    benchmark(rebuild=(not args.no_rebuild),
              biolog=(not args.no_biolog),
              essentiality=(not args.no_essentiality))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
bigg_id compound growth seed | ||
4abut g-Amino-Butyric Acid - cpd00281 | ||
abt__L L-Arabitol + cpd00417 | ||
ac Acetic Acid ++ cpd00029 | ||
acac Acetoacetic Acid ++ cpd00142 | ||
acgam N-Acetyl-D-Glucosamine ++ cpd00122 | ||
acmana N-Acetyl-b-D-Mannosamine + cpd00492 | ||
acnam N-Acetyl-Neuraminic Acid ++ cpd00232 | ||
adn Adenosine ++ cpd00182 | ||
akg a-Keto-Glutaric Acid + cpd00024 | ||
ala__D D-Alanine ++ cpd00117 | ||
ala__L L-Alanine + cpd00035 | ||
arab__D D-Arabinose ++ cpd00185 | ||
arab__L L-Arabinose + cpd19109 | ||
arbt Arbutin ++ cpd03696 | ||
asn__L L-Asparagine ++ cpd00132 | ||
asp__L L-Aspartic Acid ++ cpd00041 | ||
btd_RR 2,3-Butanediol - | ||
cellb D-Cellobiose ++ cpd03845 | ||
cit Citric Acid + cpd00137 | ||
crn D,L-Carnitine -- cpd00266 | ||
dad_2 2`-Deoxy-Adenosine ++ cpd00438 | ||
dextrin Dextrin ++ cpd11594 | ||
dha Dihydroxy-Acetone + cpd00157 | ||
drib 2-Deoxy-D-Ribose ++ cpd01242 | ||
f6p D-Fructose-6-Phosphate ++ cpd19035 | ||
for Formic Acid + cpd00047 | ||
fru D-Fructose ++ cpd00082 | ||
fum Fumaric Acid ++ cpd00106 | ||
g1p D-Glucose-1-Phosphate ++ cpd00089 | ||
g6p D-Glucose-6-Phosphate ++ cpd00079 | ||
gal D-Galactose ++ cpd00108 | ||
galctr__D Mucic Acid ++ cpd00652 | ||
galt Dulcitol ++ cpd01171 | ||
galur D-Galacturonic Acid ++ cpd00280 | ||
gam D-Glucosamine ++ cpd00276 | ||
glc__D a-D-Glucose ++ cpd00027 | ||
glcn__D D-Gluconic Acid ++ cpd00222 | ||
glcr D-Saccharic Acid ++ cpd00609 | ||
glcur D-Glucuronic Acid ++ cpd00164 | ||
gln__L L-Glutamine ++ cpd00053 | ||
glu__L L-Glutamic Acid ++ cpd00023 | ||
glx Glyoxylic Acid + cpd00040 | ||
gly Glycine -- cpd00033 | ||
glyc Glycerol ++ cpd00100 | ||
glyc3p D,L-a-Glycerol- Phosphate ++ cpd00080 | ||
glyclt Glycolic Acid + cpd00139 | ||
glycogen Glycogen ++ cpd00155 | ||
ile__L L-Isoleucine -- cpd00322 | ||
ins Inosine ++ cpd00246 | ||
lac__L L-Lactic Acid ++ cpd00159 | ||
leu__L L-Leucine -- cpd00107 | ||
lys__L L-Lysine -- cpd00039 | ||
mal__D D-Malic Acid ++ cpd00386 | ||
mal__L D,L-Malic Acid ++ cpd00130 | ||
mal__L L-Malic Acid ++ cpd00130 | ||
malt Maltose ++ cpd00179 | ||
malttr Maltotriose ++ cpd01262 | ||
man D-Mannose ++ cpd00138 | ||
mbdg b-Methyl-D-Glucoside ++ cpd15585 | ||
melib D-Melibiose ++ cpd03198 | ||
met__L L-Methionine -- cpd00060 | ||
mnl D-Mannitol ++ cpd00314 | ||
orn__L L-Ornithine + cpd00064 | ||
pala Palatinose ++ cpd01200 | ||
phe__L L-Phenylalanine -- cpd00066 | ||
ppa Propionic Acid + cpd00141 | ||
pro__L L-Proline ++ cpd00129 | ||
pyr Pyruvic Acid + cpd00020 | ||
raffin D-Raffinose ++ cpd00382 | ||
rib__D D-Ribose ++ cpd00105 | ||
rmn L-Rhamnose + cpd00396 | ||
salcn Salicin ++ cpd01030 | ||
sbt__D D-Sorbitol ++ cpd00588 | ||
ser__D D-Serine ++ cpd00550 | ||
ser__L L-Serine ++ cpd00054 | ||
srb__L L-Sorbose + cpd00212 | ||
succ Succinic Acid ++ cpd00036 | ||
sucr Sucrose ++ cpd00076 | ||
thr__L L-Threonine ++ cpd00161 | ||
thymd Thymidine ++ cpd00184 | ||
tre D-Trehalose ++ cpd00794 | ||
uri Uridine ++ cpd00249 | ||
val__L L-Valine -- cpd00156 | ||
xyl__D D-Xylose ++ cpd00154 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
bigg_id compound growth seed | ||
4abut g-Amino-N-Butyric Acid + cpd00281 | ||
acgam N-Acetyl-D-Glucosamine + cpd00122 | ||
ade Adenine - cpd00128 | ||
ala__D D-Alanine ++ cpd00117 | ||
ala__L L-Alanine + cpd00035 | ||
arg__L L-Arginine ++ cpd00051 | ||
asn__L L-Asparagine ++ cpd00132 | ||
asp__L L-Aspartic Acid ++ cpd00041 | ||
citr__L L-Citrulline + cpd00274 | ||
cys__L L-Cysteine ++ cpd00084 | ||
cytd Cytidine ++ cpd00367 | ||
etha Ethanolamine ++ cpd00162 | ||
gam D-Glucosamine ++ cpd00276 | ||
gln__L L-Glutamine ++ cpd00053 | ||
glu__D D-Glutamic Acid + cpd00186 | ||
glu__L L-Glutamic Acid + cpd00023 | ||
gly Glycine + cpd00033 | ||
ins Inosine - cpd00246 | ||
met__L L-Methionine + cpd00060 | ||
nh4 Ammonia ++ cpd00013 | ||
orn__L L-Ornithine + cpd00064 | ||
pro__L L-Proline ++ cpd00129 | ||
ser__L L-Serine ++ cpd00054 | ||
thr__L L-Threonine + cpd00161 | ||
thym Thymine - cpd00151 | ||
ura Uracil - cpd00092 | ||
urate Uric Acid + cpd00300 | ||
urea Urea + cpd00073 | ||
uri Uridine - cpd00249 | ||
val__L L-Valine + cpd00156 | ||
xan Xanthine ++ cpd00309 |
Oops, something went wrong.