Skip to content

Commit

Permalink
update nf pipeline. should output rhogs/pickle folder in nf, otherwis…
Browse files Browse the repository at this point in the history
…e sbatch error .command.run too long
  • Loading branch information
sinamajidian committed Sep 3, 2023
1 parent ebce1c7 commit 64fc2a7
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 80 deletions.
52 changes: 19 additions & 33 deletions FastOMA.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ params.proteomes = params.proteome_folder + "/*"
params.hogmap_in = params.input_folder + "/hogmap_in"

params.hogmap_folder = params.output_folder + "/hogmap"
//params.rhogs_folder = params.output_folder + "/rhogs_all"
params.species_tree = params.input_folder + "/species_tree.nwk"
//params.pickles_rhogs_folder = params.output_folder + "/pickle_rhogs"
params.pickles_temp = params.output_folder + "/pickles_temp"
params.genetrees_folder = params.output_folder + "/genetrees"


Expand All @@ -35,13 +34,13 @@ process omamer_run{
}


process infer_roothogs{ // publishDir params.rhogs_folder
process infer_roothogs{
input:
val ready_omamer_run
path hogmap_folder
path proteome_folder
output:
path "rhogs_all" // path "rhogs_all/*"
path "omamer_rhogs"
path "gene_id_dic_xml.pickle"
val true // nextflow-io.github.io/patterns/state-dependency/
script:
Expand All @@ -51,11 +50,10 @@ process infer_roothogs{ // publishDir params.rhogs_folder
}


process batch_roothogs{ // publishDir params.output_folder
process batch_roothogs{
input:
val ready_infer_roothogs
//path rhogs_folder
path "rhogs_all"
path "omamer_rhogs"
output:
path "rhogs_rest/*", optional: true
path "rhogs_big/*" , optional: true
Expand All @@ -66,15 +64,15 @@ process batch_roothogs{ // publishDir params.output_folder
"""
}

process hog_big{ //publishDir params.pickles_rhogs_folder
process hog_big{
publishDir params.pickles_temp
cpus 6
time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume`
memory {20.GB}
input:
val rhogsbig_tree_ready
output:
path "pickle_rhogs/*.pickle"
// path "*.pickle"
path "*.pickle"
path "*.fa", optional: true // msa if write True
path "*.nwk", optional: true // gene trees if write True
val true
Expand All @@ -84,13 +82,12 @@ process hog_big{ //publishDir params.pickles_rhogs_folder
"""
}

process hog_rest{ //publishDir params.pickles_rhogs_folder
process hog_rest{
publishDir params.pickles_temp
input:
val rhogsrest_tree_ready
output:
path "pickle_rhogs/*.pickle"
// path "*.pickle"
//path "pickle_rhogs/*.pickle"
path "*.pickle"
path "*.fa" , optional: true // msa if write True
path "*.nwk" , optional: true // gene trees if write True
val true
Expand All @@ -106,10 +103,10 @@ process collect_subhogs{
publishDir params.output_folder, mode: 'copy'
input:
val ready_hog_rest
val ready_hog_big // path pickle_rhogs // this is for dependency
path "pickle_rhogs/" // "*.pickle" // path "pickle_rhogs" // this is the folder that includes pickles_rhogs
val ready_hog_big
path "pickles_temp" // this is the folder that includes pickles_rhogs
path "gene_id_dic_xml.pickle"
path "rhogs_all"
path "omamer_rhogs"
output:
path "output_hog.orthoxml"
path "OrthologousGroupsFasta"
Expand All @@ -121,28 +118,25 @@ process collect_subhogs{
"""
}


workflow {
proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true)
proteome_folder = Channel.fromPath(params.proteome_folder)
hogmap_folder = Channel.fromPath(params.hogmap_folder)
// rhogs_folder = Channel.fromPath(params.rhogs_folder)

genetrees_folder = Channel.fromPath(params.genetrees_folder)
hogmap_in = Channel.fromPath(params.hogmap_in)

// pickles_rhogs_folder = Channel.fromPath(params.pickles_rhogs_folder)
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") // proteomes.view{"prot ${it}"}
pickles_temp = Channel.fromPath(params.pickles_temp)
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5")
proteomes_omamerdb = proteomes.combine(omamerdb)
proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"}
(hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog)
ready_omamer_run_c = ready_omamer_run.collect()

(rhogs_folder, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
// rhogs_folder and "rhogs_all" are the same
(omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
ready_infer_roothogs_c = ready_infer_roothogs.collect()

(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, rhogs_folder)
(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, omamer_rhogs)
ready_batch_roothogs_c = ready_batch_roothogs.collect()

species_tree = Channel.fromPath(params.species_tree)
Expand All @@ -155,16 +149,8 @@ workflow {
rhogsrest_tree = rhogsrest.combine(species_tree)
rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
(pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
all_pickles = pickle_big_rhog.mix(pickle_rest_rhog).collect() // all_pickles.view() // pickle_rhogs_folder = Channel.fromPath(params.output_folder+"/pickle_rhogs")

(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), all_pickles, gene_id_dic_xml, rhogs_folder) // pickles_rhogs_folder
(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_temp, gene_id_dic_xml, omamer_rhogs)
orthoxml_file.view{" output orthoxml file ${it}"}

}

// memory {12.GB * (2*task.attempt - 1)}
// time {24.hour}
// errorStrategy {
// task.exitStatus in [1,99,143,137,104,134,139,145,140] ? 'retry' : 'terminate'
// }
// maxRetries 4
2 changes: 1 addition & 1 deletion FastOMA/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@

__packagename__ = "FastOMA"
__version__ = "0.0.6"
__version__ = "0.1.0"
2 changes: 1 addition & 1 deletion FastOMA/batch_roothogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def folder_1h_rhog(address_rhogs_folder, output_folder_big, output_folder_rest):

def batch_roothogs():

input_rhog = "./rhogs_all/" #
input_rhog = "./omamer_rhogs/" #
output_folder_big = "./rhogs_big/"
output_folder_rest = "./rhogs_rest/"
folder_1h_rhog(input_rhog, output_folder_big, output_folder_rest)
Expand Down
12 changes: 6 additions & 6 deletions FastOMA/collect_subhogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def collect_subhogs():
# tr|A0A0N7KCI6|A0A0N7KCI6_ORYSJ
# for the QfO benchmark, the middle field should be written in the file

pickle_folder = "./pickle_rhogs/" #pickle_rhogs
pickle_folder = "./pickles_temp/" #pickle_rhogs
output_xml_name = "./output_hog.orthoxml"
gene_id_pickle_file = "./gene_id_dic_xml.pickle"

Expand Down Expand Up @@ -136,8 +136,8 @@ def max_og_tree(tree):
return og_prot_list

input_orthoxml = output_xml_name # sys.argv[1] # "out_folder/output_hog_.orthoxml"
rhog_all_folder = "./rhogs_all/" #sys.argv[2] + "/" # "out_folder/rhogs_all/"
fasta_format = "fa" # of the rhogs_all
rhog_all_folder = "./omamer_rhogs/" #sys.argv[2] + "/" # "out_folder/rhogs_all/"
fasta_format = "fa" # of the rhogs

output_file_og_tsv = "OrthologousGroups.tsv"

Expand Down Expand Up @@ -168,12 +168,12 @@ def max_og_tree(tree):
for hog_id, og_prot_list in OGs.items(): # hog_id="HOG_0667494_sub10524"
rhog_id = "_".join(hog_id.split("_")[:2])

rhogs_all_address = rhog_all_folder + rhog_id + "." + fasta_format
rhogs_all_prots = list(SeqIO.parse(rhogs_all_address, "fasta"))
omamer_rhogs_all_address = rhog_all_folder + rhog_id + "." + fasta_format
omamer_rhogs_all_prots = list(SeqIO.parse(omamer_rhogs_all_address, "fasta"))

og_prots = []
og_prot_list = OGs[hog_id]
for rhogs_prot in rhogs_all_prots:
for rhogs_prot in omamer_rhogs_all_prots:
if rhogs_prot.id.split("||")[0] in og_prot_list:
sp = rhogs_prot.id.split("||")[1]
rhogs_prot.description += " [" + sp + "]"
Expand Down
4 changes: 2 additions & 2 deletions FastOMA/infer_roothogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@ def infer_roothogs():
query_species_names,
query_prot_names_species_mapped)
# for pure usage of this python file, you can set the output folder
# output_folder_rhog = _config.in_folder + "rhogs_all/"
# output_folder_rhog = _config.in_folder + "rhogs_all/" // omamer_rhogs
# using nextflow



# import sys
output_folder_rhog = "./rhogs_all/" # sys.argv[1] #
output_folder_rhog = "./omamer_rhogs/" # sys.argv[1] #
rhogid_num_list_filt1 = _utils_roothog.write_rhog(rhogids_list_filt, rhogids_prot_records_query_filt, output_folder_rhog, 2) # min_rhog_size, max_rhog_size
2 changes: 1 addition & 1 deletion FastOMA/infer_subhogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def infer_subhogs():
if inferhog_concurrent_on:
print("parallelization for subhog inference is on.")

pickles_rhog_folder = "./pickle_rhogs/"
pickles_rhog_folder = "./" # pickles_temp/ pickle_rhogs
if not os.path.exists(pickles_rhog_folder):
os.makedirs(pickles_rhog_folder)

Expand Down
55 changes: 19 additions & 36 deletions FastOMA_light.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,13 @@ params.proteomes = params.proteome_folder + "/*"
params.hogmap_in = params.input_folder + "/hogmap_in"

params.hogmap_folder = params.output_folder + "/hogmap"
//params.rhogs_folder = params.output_folder + "/rhogs_all"
params.species_tree = params.input_folder + "/species_tree.nwk"
//params.pickles_rhogs_folder = params.output_folder + "/pickle_rhogs"
params.pickles_temp = params.output_folder + "/pickles_temp"
params.genetrees_folder = params.output_folder + "/genetrees"


process omamer_run{
time {4.h}
memory {4.GB}
publishDir params.hogmap_folder
input:
path proteomes_omamerdb_inputhog
Expand All @@ -35,13 +33,13 @@ process omamer_run{
}


process infer_roothogs{ // publishDir params.rhogs_folder
process infer_roothogs{
input:
val ready_omamer_run
path hogmap_folder
path proteome_folder
output:
path "rhogs_all" // path "rhogs_all/*"
path "omamer_rhogs"
path "gene_id_dic_xml.pickle"
val true // nextflow-io.github.io/patterns/state-dependency/
script:
Expand All @@ -51,11 +49,10 @@ process infer_roothogs{ // publishDir params.rhogs_folder
}


process batch_roothogs{ // publishDir params.output_folder
process batch_roothogs{
input:
val ready_infer_roothogs
//path rhogs_folder
path "rhogs_all"
path "omamer_rhogs"
output:
path "rhogs_rest/*", optional: true
path "rhogs_big/*" , optional: true
Expand All @@ -66,15 +63,14 @@ process batch_roothogs{ // publishDir params.output_folder
"""
}

process hog_big{ //publishDir params.pickles_rhogs_folder
process hog_big{
publishDir params.pickles_temp
cpus 2
time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume`
memory {4.GB}
input:
val rhogsbig_tree_ready
output:
path "pickle_rhogs/*.pickle"
// path "*.pickle"
path "*.pickle"
path "*.fa", optional: true // msa if write True
path "*.nwk", optional: true // gene trees if write True
val true
Expand All @@ -84,13 +80,12 @@ process hog_big{ //publishDir params.pickles_rhogs_folder
"""
}

process hog_rest{ //publishDir params.pickles_rhogs_folder
process hog_rest{
publishDir params.pickles_temp
input:
val rhogsrest_tree_ready
output:
path "pickle_rhogs/*.pickle"
// path "*.pickle"
//path "pickle_rhogs/*.pickle"
path "*.pickle"
path "*.fa" , optional: true // msa if write True
path "*.nwk" , optional: true // gene trees if write True
val true
Expand All @@ -102,14 +97,13 @@ process hog_rest{ //publishDir params.pickles_rhogs_folder


process collect_subhogs{
memory {4.GB}
publishDir params.output_folder, mode: 'copy'
input:
val ready_hog_rest
val ready_hog_big // path pickle_rhogs // this is for dependency
path "pickle_rhogs/" // "*.pickle" // path "pickle_rhogs" // this is the folder that includes pickles_rhogs
val ready_hog_big
path "pickles_temp" // this is the folder that includes pickles_rhogs
path "gene_id_dic_xml.pickle"
path "rhogs_all"
path "omamer_rhogs"
output:
path "output_hog.orthoxml"
path "OrthologousGroupsFasta"
Expand All @@ -121,28 +115,25 @@ process collect_subhogs{
"""
}


workflow {
proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true)
proteome_folder = Channel.fromPath(params.proteome_folder)
hogmap_folder = Channel.fromPath(params.hogmap_folder)
// rhogs_folder = Channel.fromPath(params.rhogs_folder)

genetrees_folder = Channel.fromPath(params.genetrees_folder)
hogmap_in = Channel.fromPath(params.hogmap_in)

// pickles_rhogs_folder = Channel.fromPath(params.pickles_rhogs_folder)
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") // proteomes.view{"prot ${it}"}
pickles_temp = Channel.fromPath(params.pickles_temp)
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5")
proteomes_omamerdb = proteomes.combine(omamerdb)
proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"}
(hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog)
ready_omamer_run_c = ready_omamer_run.collect()

(rhogs_folder, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
// rhogs_folder and "rhogs_all" are the same
(omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
ready_infer_roothogs_c = ready_infer_roothogs.collect()

(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, rhogs_folder)
(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, omamer_rhogs)
ready_batch_roothogs_c = ready_batch_roothogs.collect()

species_tree = Channel.fromPath(params.species_tree)
Expand All @@ -155,16 +146,8 @@ workflow {
rhogsrest_tree = rhogsrest.combine(species_tree)
rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
(pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
all_pickles = pickle_big_rhog.mix(pickle_rest_rhog).collect() // all_pickles.view() // pickle_rhogs_folder = Channel.fromPath(params.output_folder+"/pickle_rhogs")

(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), all_pickles, gene_id_dic_xml, rhogs_folder) // pickles_rhogs_folder
(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_temp, gene_id_dic_xml, omamer_rhogs)
orthoxml_file.view{" output orthoxml file ${it}"}

}

// memory {12.GB * (2*task.attempt - 1)}
// time {24.hour}
// errorStrategy {
// task.exitStatus in [1,99,143,137,104,134,139,145,140] ? 'retry' : 'terminate'
// }
// maxRetries 4

0 comments on commit 64fc2a7

Please sign in to comment.