Skip to content

Commit

Permalink
Merge pull request #21 from nf-core/dev
Browse files Browse the repository at this point in the history
Update from Upstream
  • Loading branch information
subwaystation authored Nov 30, 2021
2 parents 75081a5 + 0d49ec1 commit f6dc602
Show file tree
Hide file tree
Showing 7 changed files with 195 additions and 30 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,5 @@ jobs:
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --smoothxg consensus_spec 10,100,1000
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --vcf_spec "gi|568815561:#,gi|568815567:#"
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --smoothxg_write_maf
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --wfmash_chunks 2
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --wfmash_only
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ RUN apt-get update \
procps \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*

COPY bin/split_approx_mappings_in_chunks.py /

# Install miniconda
RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,10 @@ You can cite the `nf-core` publication as follows:
In addition, references of tools and data used in this pipeline are as follows:
> **ODGI: understanding pangenome graphs.**
>
> Andrea Guarracino, Simon Heumos, Sven Nahnsen, Pjotr Prins & Erik Garrison.
>
> _bioRxiv_ 2021 Nov 11 doi: [10.1101/2021.11.10.467921](https://doi.org/10.1101/2021.11.10.467921).
<!-- TODO nf-core: Add bibliography of tools and data used in your pipeline -->
55 changes: 55 additions & 0 deletions bin/split_approx_mappings_in_chunks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Usage
# Run:
# python3 split_approx_mappings_in_chunks.py approximate_mappings.paf 4
# It will generate the following files:
# approximate_mappings.paf.chunk_0.paf
# approximate_mappings.paf.chunk_1.paf
# approximate_mappings.paf.chunk_2.paf
# approximate_mappings.paf.chunk_3.paf

import sys

# This script takes the approximate mappings, weighs each mapping by computing its length * (1 - estimated identity),
# and then writes N new files whose mapping sets have a similar sum of weights.

def split_chunks(l, n):
    """Distribute weighted items over ``n`` chunks with similar total weight.

    Items are assigned greedily in input order: each item goes to the chunk
    whose running weight sum is currently the smallest. When several chunks
    share the smallest sum, the one with the lowest index is chosen.

    Args:
        l: Iterable of indexable items whose second element (``e[1]``) is the
            numeric weight, e.g. ``(rank, weight)`` tuples.
        n: Number of chunks to create; must be at least 1.

    Returns:
        A list of exactly ``n`` lists holding the input items. Chunks stay
        empty when there are fewer items than chunks.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard a non-positive n either fails with an opaque
        # IndexError or silently returns no chunks at all.
        raise ValueError(f'number of chunks must be at least 1, got {n}')

    result = [[] for _ in range(n)]
    sums = [0] * n
    for e in l:
        # list.index returns the first occurrence, so ties go to the
        # lowest-numbered chunk (the very first item lands in chunk 0).
        i = sums.index(min(sums))
        result[i].append(e)
        sums[i] += e[1]
    return result


if __name__ == '__main__':
path_approx_mappings = sys.argv[1]
num_of_chunks = int(sys.argv[2])

rank_to_mapping_dict = {}
mapping_list = []

with open(path_approx_mappings) as f:
for rank, line in enumerate(f):
# We could avoid keeping everything in memory by reading the file again later
rank_to_mapping_dict[rank] = line

_, _, query_start, query_end, _, _, _, target_start, target_end, _, _, _, estimated_identity = line.strip().split('\t')

num_mapped_bases = max(int(query_end) - int(query_start), int(target_end) - int(target_start))
estimated_identity = float(estimated_identity.split('id:f:')[1]) / 100.0

# High divergence makes alignment more difficult
weight = num_mapped_bases * (1 - estimated_identity)

mapping_list.append((rank, weight))

# Chunk the tuples by looking at their weigths
chunk_list = split_chunks(mapping_list, num_of_chunks)

# Collect the ranks from the tuples to generate balanced chunks
for num_chunk, element_list in enumerate(chunk_list):
with open(path_approx_mappings + f'.chunk_{num_chunk}.paf', 'w') as fw:
for rank, _ in element_list:
fw.write(rank_to_mapping_dict[rank])
147 changes: 118 additions & 29 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,68 @@ if (!params.smoothxg_num_haps) {
n_haps = params.wfmash_n_mappings
}

// Runs wfmash with the `-m` flag on the input FASTA against itself and stores
// stdout as `<f>.<wfmash_prefix>.map.paf`.
// NOTE(review): `-m` presumably restricts wfmash to the approximate mapping
// step (the result is fed to splitApproxMappingsInChunks) — confirm against
// the wfmash CLI documentation.
// `wfmash_exclude_cmd`, `wfmash_merge_cmd`, `wfmash_split_cmd` and
// `wfmash_prefix` are defined outside this process.
process wfmashMap {
publishDir "${params.outdir}/wfmash_map", mode: "${params.publish_dir_mode}"

input:
// f: identifier used as output file name prefix; fasta: the sequences to map
tuple val(f), path(fasta)

output:
// The identifier is passed along together with the mappings PAF
tuple val(f), path("${f}.${wfmash_prefix}.map.paf")

// The FASTA is given twice: it is used as both positional inputs of wfmash.
"""
wfmash ${wfmash_exclude_cmd} \
-s ${params.wfmash_segment_length} \
-l ${params.wfmash_block_length} \
${wfmash_merge_cmd} \
${wfmash_split_cmd} \
-p ${params.wfmash_map_pct_id} \
-n ${params.wfmash_n_mappings} \
-k ${params.wfmash_mash_kmer} \
-t ${task.cpus} \
-m \
$fasta $fasta \
>${f}.${wfmash_prefix}.map.paf
"""
}

// Splits the mappings PAF into `params.wfmash_chunks` files by calling
// split_approx_mappings_in_chunks.py, which writes `<paf>.chunk_<i>.paf`
// files next to its input.
// NOTE(review): the script is invoked via the absolute path `/...py`, i.e. the
// location the Dockerfile copies it to; this presumably fails when the
// pipeline runs outside that container image — confirm for other profiles.
process splitApproxMappingsInChunks {
publishDir "${params.outdir}/wfmash_chunks", mode: "${params.publish_dir_mode}"

input:
// f: identifier used to match the generated chunk files; paf: mappings to split
tuple val(f), path(paf)
output:
// All chunk files are emitted together as one list
path("${f}*.chunk_*.paf")
"""
python3 /split_approx_mappings_in_chunks.py $paf ${params.wfmash_chunks}
"""
}

// Runs wfmash on the input FASTA against itself for one chunk PAF, passing the
// chunk with `-i`, and stores stdout as `<paf>.align.paf`.
// NOTE(review): `-i` presumably makes wfmash align the mappings from the given
// PAF instead of computing them again — confirm against the wfmash CLI docs.
// Apart from `-i $paf` replacing `-m`, the wfmash options are the same as in
// wfmashMap.
process wfmashAlign {
publishDir "${params.outdir}/wfmash_align", mode: "${params.publish_dir_mode}"

input:
// f: identifier (not used in the script); fasta: sequences; paf: one chunk of mappings
tuple val(f), path(fasta), path(paf)

output:
// One alignment PAF per chunk, named after the chunk file
path("${paf}.align.paf")

"""
wfmash ${wfmash_exclude_cmd} \
-s ${params.wfmash_segment_length} \
-l ${params.wfmash_block_length} \
${wfmash_merge_cmd} \
${wfmash_split_cmd} \
-p ${params.wfmash_map_pct_id} \
-n ${params.wfmash_n_mappings} \
-k ${params.wfmash_mash_kmer} \
-t ${task.cpus} \
-i $paf \
$fasta $fasta \
>${paf}.align.paf
"""
}

process wfmash {
publishDir "${params.outdir}/wfmash", mode: "${params.publish_dir_mode}"

Expand Down Expand Up @@ -76,17 +138,23 @@ process seqwish {

input:
tuple val(f), path(fasta)
path(wfmash)
path(pafs)

output:
tuple val(f), path("${f}${seqwish_prefix}.gfa")

script:
"""
if [[ \$(ls *.paf | wc -l) == 1 ]]; then
input=$pafs
else
input=\$(ls *.paf | tr '\\\n' ',')
input=\${input::-1}
fi
seqwish \
-t ${task.cpus} \
-s $fasta \
-p $wfmash \
-p \$input \
-k ${params.seqwish_min_match_length} \
-g ${f}${seqwish_prefix}.gfa -P \
-B ${params.seqwish_transclose_batch} \
Expand Down Expand Up @@ -319,34 +387,52 @@ process multiQC {
workflow {
main:

wfmash(fasta)
seqwish(fasta, wfmash.out.collect{it[1]})
smoothxg(seqwish.out)
gfaffix(smoothxg.out.gfa_smooth)

odgiBuild(seqwish.out.collect{it[1]}.mix(smoothxg.out.consensus_smooth.flatten(), gfaffix.out.gfa_norm))
odgiStats(odgiBuild.out)

odgiVizOut = Channel.empty()
if (do_1d) {
odgiVizOut = odgiViz(odgiBuild.out.filter( ~/.*smoothxg.*/ ))
}
odgiDrawOut = Channel.empty()
if (do_2d) {
odgiLayout(odgiBuild.out.filter( ~/.*smoothxg.*/ ))
odgiDrawOut = odgiDraw(odgiLayout.out)
}

if (params.vcf_spec != false) {
vg_deconstruct(gfaffix.out.gfa_norm)
if (params.wfmash_only) {
// TODO Once we have changed the way publish_dir_mode is handled, we have to emit the .paf file by default here, otherwise not
if (params.wfmash_chunks == 1) {
wfmash(fasta)
} else {
wfmashMap(fasta)
splitApproxMappingsInChunks(wfmashMap.out)
wfmashAlign(fasta.combine(splitApproxMappingsInChunks.out.flatten()))
}
} else {
if (params.wfmash_chunks == 1) {
wfmash(fasta)
seqwish(fasta, wfmash.out.collect{it[1]})
} else {
wfmashMap(fasta)
splitApproxMappingsInChunks(wfmashMap.out)
wfmashAlign(fasta.combine(splitApproxMappingsInChunks.out.flatten()))
seqwish(fasta, wfmashAlign.out.collect())
}
smoothxg(seqwish.out)
gfaffix(smoothxg.out.gfa_smooth)

odgiBuild(seqwish.out.collect{it[1]}.mix(smoothxg.out.consensus_smooth.flatten(), gfaffix.out.gfa_norm))
odgiStats(odgiBuild.out)

odgiVizOut = Channel.empty()
if (do_1d) {
odgiVizOut = odgiViz(odgiBuild.out.filter( ~/.*smoothxg.*/ ))
}
odgiDrawOut = Channel.empty()
if (do_2d) {
odgiLayout(odgiBuild.out.filter( ~/.*smoothxg.*/ ))
odgiDrawOut = odgiDraw(odgiLayout.out)
}

if (params.vcf_spec != false) {
vg_deconstruct(gfaffix.out.gfa_norm)
}

multiQC(
odgiStats.out.collect().ifEmpty([]),
odgiVizOut.collect().ifEmpty([]),
odgiDrawOut.collect().ifEmpty([]),
ch_multiqc_config
)
}

multiQC(
odgiStats.out.collect().ifEmpty([]),
odgiVizOut.collect().ifEmpty([]),
odgiDrawOut.collect().ifEmpty([]),
ch_multiqc_config
)
}

// /*
Expand Down Expand Up @@ -388,6 +474,9 @@ def helpMessage() {
--wfmash_no_splits disable splitting of input sequences during mapping [default: OFF]
--wfmash_exclude--delim [c] skip mappings between sequences with the same name prefix before
the given delimiter character [default: all-vs-all and !self]
--wfmash_chunks The number of files to generate from the approximate wfmash mappings to scale across a whole cluster. It is recommended to set this to the number of available nodes. If only one machine is available, leave it at 1. [default: 1]
--wfmash_only If this parameter is set, only the wfmash alignment step of the pipeline is executed. This option is offered for users who want to use wfmash on a cluster. [default: OFF]
Seqwish options:
--seqwish_min_match_length [n] ignore exact matches below this length [default: 47]
--seqwish_transclose_batch [n] number of bp to use for transitive closure batch [default: 10000000]
Expand Down
2 changes: 2 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ params {
wfmash_merge_segments = false
wfmash_no_splits = false
wfmash_exclude_delim = false
wfmash_chunks = 1
wfmash_only = false

// Seqwish options
seqwish_min_match_length = 47
Expand Down
11 changes: 10 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@
"type": "string",
"description": "skip mappings between sequences with the same name prefix before the given delimiter character",
"fa_icon": "fas fa-align-center"
},
"wfmash_chunks": {
"type": "integer",
"default": 1,
"description": "The number of files to generate from the approximate wfmash mappings to scale across a whole cluster. It is recommended to set this to the number of available nodes. If only one machine is available, leave it at 1."
},
"wfmash_only": {
"type": "boolean",
"description": "If this parameter is set, only the wfmash alignment step of the pipeline is executed. This option is offered for users who want to use wfmash on a cluster."
}
}
},
Expand Down Expand Up @@ -146,7 +155,7 @@
},
"smoothxg_block_ratio_min": {
"type": "number",
"default": 0.0,
"default": 0,
"description": "minimum small / large length ratio to cluster in a block"
},
"smoothxg_block_id_min": {
Expand Down

0 comments on commit f6dc602

Please sign in to comment.