Skip to content

Commit

Permalink
Updated fasta_ltrretriever_lai
Browse files Browse the repository at this point in the history
  • Loading branch information
GallVp committed Jun 3, 2024
1 parent 3548df4 commit 019fb37
Show file tree
Hide file tree
Showing 11 changed files with 232 additions and 133 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v2.0.0 - [31-May-2024]
## v2.0.0 - [03-June-2024]

### `Added`

Expand Down
4 changes: 2 additions & 2 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
},
"custom/shortenfastaids": {
"branch": "main",
"git_sha": "da7a50af5eaac9601e23f33e00bf24e61f9c27ab",
"git_sha": "e226301053a2b0f9a6af514b454b3ae7c91daa80",
"installed_by": ["fasta_ltrretriever_lai"]
},
"gffread": {
Expand Down Expand Up @@ -121,7 +121,7 @@
},
"fasta_ltrretriever_lai": {
"branch": "main",
"git_sha": "8088445a270d324cb06e085f18790839c9684754",
"git_sha": "d57f5fb09ab3edca3d4e16dd41beb684a7192d04",
"installed_by": ["subworkflows"]
},
"fastq_bwa_mem_samblaster": {
Expand Down
2 changes: 1 addition & 1 deletion modules/pfr/custom/shortenfastaids/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ channels:

dependencies:
- biopython==1.75
- python=3.8
- python==3.8.13
3 changes: 2 additions & 1 deletion modules/pfr/custom/shortenfastaids/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ process CUSTOM_SHORTENFASTAIDS {

output:
tuple val(meta), path("*.short.ids.fasta") , emit: short_ids_fasta , optional: true
tuple val(meta), path("*.short.ids.tsv") , emit: short_ids_tsv , optional: true
tuple val(meta), path("*.short.ids.tsv") , emit: short_ids_tsv
path "versions.yml" , emit: versions

when:
Expand All @@ -25,6 +25,7 @@ process CUSTOM_SHORTENFASTAIDS {
stub:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch "${meta.id}.short.ids.tsv"
cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | cut -d' ' -f2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ def fail_if_new_ids_not_valid(ids):

if not do_ids_need_to_change(input_ids_and_descriptions):
print("IDs have acceptable length and character. No change required.")
with open(f"{output_files_prefix}.short.ids.tsv", "w") as f:
f.write("IDs have acceptable length and character. No change required.")
exit(0)

new_ids = shorten_ids(
Expand Down
6 changes: 2 additions & 4 deletions modules/pfr/custom/shortenfastaids/tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ nextflow_process {
process "CUSTOM_SHORTENFASTAIDS"

tag "modules"
tag "modules_nfcore"
tag "modules_pfr"
tag "custom"
tag "custom/shortenfastaids"

Expand All @@ -25,9 +25,7 @@ nextflow_process {
then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() },
{ assert process.out.short_ids_fasta == [] },
{ assert process.out.short_ids_tsv == [] }
{ assert snapshot(process.out).match() }
)
}

Expand Down
36 changes: 28 additions & 8 deletions modules/pfr/custom/shortenfastaids/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,12 @@

],
"1": [

[
{
"id": "test"
},
"test.short.ids.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"2": [
"versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba"
Expand All @@ -64,7 +69,12 @@

],
"short_ids_tsv": [

[
{
"id": "test"
},
"test.short.ids.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
"versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba"
Expand All @@ -73,9 +83,9 @@
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "23.10.1"
"nextflow": "24.04.1"
},
"timestamp": "2024-05-10T15:27:57.244358"
"timestamp": "2024-06-02T20:54:36.815875"
},
"homo_sapiens-genome_fasta-no_change": {
"content": [
Expand All @@ -84,7 +94,12 @@

],
"1": [

[
{
"id": "test"
},
"test.short.ids.tsv:md5,642382addc4beba37088b1ebe09d38cf"
]
],
"2": [
"versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba"
Expand All @@ -93,7 +108,12 @@

],
"short_ids_tsv": [

[
{
"id": "test"
},
"test.short.ids.tsv:md5,642382addc4beba37088b1ebe09d38cf"
]
],
"versions": [
"versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba"
Expand All @@ -102,9 +122,9 @@
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "23.10.1"
"nextflow": "24.04.1"
},
"timestamp": "2023-12-07T13:32:54.220188"
"timestamp": "2024-06-02T20:54:17.945233"
},
"homo_sapiens-genome2_fasta-length_change": {
"content": [
Expand Down
122 changes: 80 additions & 42 deletions subworkflows/pfr/fasta_ltrretriever_lai/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,39 +10,65 @@ workflow FASTA_LTRRETRIEVER_LAI {

take:
ch_fasta // channel: [ val(meta), fasta ]
ch_monoploid_seqs // channel: [ val(meta), txt ]; Optional: Set to [] if not needed
// val(meta) from ch_fasta and ch_monoploid_seqs are only required
// to have the same `id`
ch_monoploid_seqs // channel: [ val(meta2), txt ]; Optional: Set to [] if not needed
// val(meta) from ch_fasta and val(meta2) from ch_monoploid_seqs are
// only required to have the same `id`
skip_lai // val(true|false)

main:
ch_versions = Channel.empty()

// Prepare input channels
ch_monoploid_seqs_plain = ch_monoploid_seqs
?: Channel.empty()
| filter { meta2, seqs -> seqs }
// Cater to channel: [ meta2, [] ]
| map { meta2, seqs -> [ meta2.id, seqs ] }

// MODULE: CUSTOM_SHORTENFASTAIDS
CUSTOM_SHORTENFASTAIDS ( ch_fasta )

ch_short_ids_fasta = ch_fasta
| join(CUSTOM_SHORTENFASTAIDS.out.short_ids_fasta, by:0, remainder:true)
| map { meta, fasta, short_ids_fasta ->
if ( fasta ) { [ meta, short_ids_fasta ?: fasta ] }
ch_short_ids_tsv = CUSTOM_SHORTENFASTAIDS.out.short_ids_tsv
ch_shortenfastaids_branch = ch_short_ids_tsv
| branch { meta, tsv ->
change: ! tsv.text.contains('IDs have acceptable length and character')
nonchange: tsv.text.contains('IDs have acceptable length and character')
}

ch_short_ids_tsv = CUSTOM_SHORTENFASTAIDS.out.short_ids_tsv
ch_short_monoploid_seqs = ch_short_ids_tsv
ch_short_ids_fasta = ch_shortenfastaids_branch.nonchange
| join(
ch_monoploid_seqs ?: Channel.empty()
ch_fasta
)
| map { meta, tsv, fasta -> [ meta, fasta ] }
| mix(
ch_shortenfastaids_branch.change
| join(
CUSTOM_SHORTENFASTAIDS.out.short_ids_fasta
)
| map { meta, tsv, fasta -> [ meta, fasta ] }
)
| filter { meta, tsv, seqs -> seqs } // Cater to channel: [ meta, tsv, [] ]
| map { meta, short_ids_tsv, monoploid_seqs ->
map_monoploid_seqs_to_new_ids(meta, short_ids_tsv, monoploid_seqs)

ch_versions = ch_versions.mix(CUSTOM_SHORTENFASTAIDS.out.versions.first())

// collectFile: Map monoploid seqs to short IDs
ch_short_monoploid_seqs = ch_short_ids_tsv
| map { meta, tsv -> [ meta.id, tsv ] }
| join(ch_monoploid_seqs_plain)
| map { id, tsv, seqs ->
map_monoploid_seqs_to_new_ids(id, tsv, seqs)
}
| collectFile(newLine:true)
| map { seqs ->
def id = seqs.name.split('.mapped.monoploid.seqs.txt')[0]

[ [ id: id ], seqs ]
[ id, seqs ]
}
ch_versions = ch_versions.mix(CUSTOM_SHORTENFASTAIDS.out.versions.first())
| join(
ch_short_ids_tsv
| map { meta, tsv -> [ meta.id, meta, tsv ] }
)
| map { id, seqs, meta, tsv -> [ meta, seqs ] }


// MODULE: LTRHARVEST
LTRHARVEST ( ch_short_ids_fasta )
Expand Down Expand Up @@ -78,51 +104,63 @@ workflow FASTA_LTRRETRIEVER_LAI {
)

ch_pass_list = LTRRETRIEVER_LTRRETRIEVER.out.pass_list
ch_ltrlib = LTRRETRIEVER_LTRRETRIEVER.out.ltrlib
ch_annotation_out = LTRRETRIEVER_LTRRETRIEVER.out.annotation_out
ch_pass_out = ch_pass_list.join(ch_annotation_out)
ch_annotation_gff = LTRRETRIEVER_LTRRETRIEVER.out.annotation_gff
ch_ltrlib = LTRRETRIEVER_LTRRETRIEVER.out.ltrlib
ch_versions = ch_versions.mix(LTRRETRIEVER_LTRRETRIEVER.out.versions.first())

// MODULE: LAI
ch_lai_inputs = skip_lai
? Channel.empty()
: ch_short_ids_fasta
| join(ch_pass_list)
| join(ch_annotation_out)
| map { meta, fasta, pass, out ->
[ meta.id, meta, fasta, pass, out ]
}
// MODULE: LTRRETRIEVER_LAI
ch_short_ids_fasta_mono = ch_short_ids_fasta
| join(
ch_short_monoploid_seqs
| map { meta, mono -> [ meta.id, mono ] },
ch_short_monoploid_seqs,
by:0,
remainder: true
)
| map { id, meta, fasta, pass, out, mono ->
[ meta, fasta, pass, out, mono ?: [] ]
// Danger! This partial join can fail
| filter { meta, fasta, seqs -> fasta }
// This filter safeguards against a failure here when an upstream
// process fails: https://github.com/nextflow-io/nextflow/issues/5043
// fasta may come from upstream processes
// seqs also comes from upstream processes, it is optional
// and may not be present for some of the combinations
| map { id, fasta, seqs -> [ id, fasta, seqs ?: [] ] }

ch_lai_inputs = skip_lai
? Channel.empty()
: ch_short_ids_fasta_mono
| join(
ch_pass_out
)
| map { meta, fasta, seqs, pass, out ->
[ meta, fasta, pass, out, seqs ]
}
LTRRETRIEVER_LAI(
ch_lai_inputs.map { meta, fasta, pass, out, mono -> [ meta, fasta ] },
ch_lai_inputs.map { meta, fasta, pass, out, mono -> pass },
ch_lai_inputs.map { meta, fasta, pass, out, mono -> out },
ch_lai_inputs.map { meta, fasta, pass, out, mono -> mono }
ch_lai_inputs.map { meta, fasta, pass, out, seqs -> [ meta, fasta ] },
ch_lai_inputs.map { meta, fasta, pass, out, seqs -> pass },
ch_lai_inputs.map { meta, fasta, pass, out, seqs -> out },
ch_lai_inputs.map { meta, fasta, pass, out, seqs -> seqs }
)

ch_lai_log = LTRRETRIEVER_LAI.out.log
ch_lai_out = LTRRETRIEVER_LAI.out.lai_out
ch_versions = ch_versions.mix(LTRRETRIEVER_LAI.out.versions.first())

// MODULE: CUSTOM_RESTOREGFFIDS
ch_restorable_gff_tsv = ch_annotation_gff.join(ch_short_ids_tsv)
ch_gff_tsv_branch = ch_annotation_gff.join(ch_short_ids_tsv)
| branch { meta, gff, tsv ->
change: ! tsv.text.contains('IDs have acceptable length and character')
nochange: tsv.text.contains('IDs have acceptable length and character')
}

CUSTOM_RESTOREGFFIDS (
ch_restorable_gff_tsv.map { meta, gff, tsv -> [ meta, gff ] },
ch_restorable_gff_tsv.map { meta, gff, tsv -> tsv }
ch_gff_tsv_branch.change.map { meta, gff, tsv -> [ meta, gff ] },
ch_gff_tsv_branch.change.map { meta, gff, tsv -> tsv }
)

ch_restored_gff = ch_annotation_gff
| join(CUSTOM_RESTOREGFFIDS.out.restored_ids_gff3, by:0, remainder:true)
| map { meta, gff, restored_gff -> [ meta, restored_gff ?: gff ] }
ch_restored_gff = ch_gff_tsv_branch.nochange
| map { meta, gff, tsv -> [ meta, gff ] }
| mix(CUSTOM_RESTOREGFFIDS.out.restored_ids_gff3)

ch_versions = ch_versions.mix(CUSTOM_RESTOREGFFIDS.out.versions.first())

Expand All @@ -135,12 +173,12 @@ workflow FASTA_LTRRETRIEVER_LAI {
}


def map_monoploid_seqs_to_new_ids(meta, short_ids_tsv, monoploid_seqs) {
def map_monoploid_seqs_to_new_ids(id, short_ids_tsv, monoploid_seqs) {

def short_ids_head = short_ids_tsv.text.split('\n')[0]

if (short_ids_head == "IDs have acceptable length and character. No change required.") {
return [ "${meta.id}.mapped.monoploid.seqs.txt" ] + monoploid_seqs.text.split('\n')
return [ "${id}.mapped.monoploid.seqs.txt" ] + monoploid_seqs.text.split('\n')
}

def orig_to_new_ids = [:]
Expand All @@ -159,5 +197,5 @@ def map_monoploid_seqs_to_new_ids(meta, short_ids_tsv, monoploid_seqs) {
mapped_ids.add(orig_to_new_ids[original_id])
}

return [ "${meta.id}.mapped.monoploid.seqs.txt" ] + mapped_ids
return [ "${id}.mapped.monoploid.seqs.txt" ] + mapped_ids
}
Loading

0 comments on commit 019fb37

Please sign in to comment.