<!-- unicycler.xml -->
<tool id="unicycler" name="Create assemblies with Unicycler" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.09">
<!-- Short tagline for the tool. -->
<description>pipeline for bacterial genomes</description>
<!-- Version tokens: @TOOL_VERSION@ pins the unicycler package requirement below and,
together with @VERSION_SUFFIX@, forms the version attribute of the tool element. -->
<macros>
<token name="@TOOL_VERSION@">0.5.1</token>
<token name="@VERSION_SUFFIX@">0</token>
</macros>
<!-- EDAM annotations. NOTE(review): presumably topic_0196 is "Sequence assembly" and
operation_0525 is "Genome assembly"; confirm against the EDAM ontology. -->
<edam_topics>
<edam_topic>topic_0196</edam_topic>
</edam_topics>
<edam_operations>
<edam_operation>operation_0525</edam_operation>
</edam_operations>
<!-- Cross-reference to the bio.tools entry of the wrapped tool. -->
<xrefs>
<xref type="bio.tools">unicycler</xref>
</xrefs>
<!-- Runtime packages: unicycler at the tool version, plus samtools, which the command
uses to turn the long read alignment SAM file into a sorted BAM file. -->
<requirements>
<requirement type="package" version="@TOOL_VERSION@">unicycler</requirement>
<requirement type="package" version="1.20">samtools</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
## Command template (Cheetah). Steps:
##   1. symlink reused checkpoint graphs and the input reads to fixed file names
##   2. run Unicycler with the working directory as output directory
##   3. depending on --keep, collect the checkpoint graphs into spades_graphs/
##      and convert the long read alignments from SAM to sorted BAM
## Reused checkpoint graphs are linked into the output directory as <checkpoint>.gfa
#for r in $reuse
ln -s '$r.reuse_file' '${r.reuse_step}.gfa' &&
#end for
## Preparing files
## The link name gets a .gz suffix for gzipped datatypes so that the compression
## is visible from the file name. All three short read branches use the same
## is_of_type("fastq.gz") check; the single end branch previously tested only
## "fastqsanger.gz", so a plain fastq.gz dataset was linked without the suffix.
#if str( $paired_unpaired.fastq_input_selector ) == "paired"
#if $paired_unpaired.fastq_input1.is_of_type("fastq.gz"):
#set fq1 = "fq1.fastq.gz"
#else
#set fq1 = "fq1.fastq"
#end if
#if $paired_unpaired.fastq_input2.is_of_type("fastq.gz"):
#set fq2 = "fq2.fastq.gz"
#else
#set fq2 = "fq2.fastq"
#end if
ln -s '${paired_unpaired.fastq_input1}' '$fq1' &&
ln -s '${paired_unpaired.fastq_input2}' '$fq2' &&
#elif str( $paired_unpaired.fastq_input_selector ) == "paired_collection"
#if $paired_unpaired.fastq_input1.forward.is_of_type("fastq.gz"):
#set fq1 = "fq1.fastq.gz"
#else
#set fq1 = "fq1.fastq"
#end if
#if $paired_unpaired.fastq_input1.reverse.is_of_type("fastq.gz"):
#set fq2 = "fq2.fastq.gz"
#else
#set fq2 = "fq2.fastq"
#end if
ln -s '${paired_unpaired.fastq_input1.forward}' '$fq1' &&
ln -s '${paired_unpaired.fastq_input1.reverse}' '$fq2' &&
#elif str( $paired_unpaired.fastq_input_selector ) == "single"
#if $paired_unpaired.fastq_input1.is_of_type("fastq.gz"):
#set fq = "fq.fastq.gz"
#else
#set fq = "fq.fastq"
#end if
ln -s '${paired_unpaired.fastq_input1}' '$fq' &&
#end if
## Long reads are optional; the link name follows the datatype (FASTQ, gzipped FASTQ or FASTA)
#if $long
#if $long.is_of_type("fastq"):
#set lr = "lr.fastq"
#elif $long.is_of_type("fastq.gz"):
#set lr = "lr.fastq.gz"
#elif $long.is_of_type("fasta")
#set lr = "lr.fasta"
#end if
ln -s '${long}' '$lr' &&
#end if
## Running Unicycler
unicycler -t "\${GALAXY_SLOTS:-4}"
-o ./
--verbosity 3
#if str( $paired_unpaired.fastq_input_selector ) == "paired"
-1 '$fq1'
-2 '$fq2'
#elif str( $paired_unpaired.fastq_input_selector ) == "paired_collection"
-1 '$fq1'
-2 '$fq2'
#elif str( $paired_unpaired.fastq_input_selector ) == "single"
-s '$fq'
#end if
#if $long
-l '$lr'
#end if
## General Unicycler Options section
## ----------------------------------------------------------
--mode '$mode'
--min_fasta_length '$min_fasta_length'
--linear_seqs '$linear_seqs'
#if str($min_anchor_seg_len) != ''
--min_anchor_seg_len '$min_anchor_seg_len'
#end if
## Spades Options section
## ----------------------------------------------------------
--min_kmer_frac '$spades.min_kmer_frac'
--max_kmer_frac '$spades.max_kmer_frac'
#if str($spades.kmers) != ''
--kmers '$spades.kmers'
#end if
--kmer_count '$spades.kmer_count'
--depth_filter '$spades.depth_filter'
#if $spades.largest_component
--largest_component
#end if
## Rotation Options section
## ----------------------------------------------------------
## no_rotate expands to its truevalue (--no_rotate) or to an empty string
$rotation.no_rotate
#if $rotation.start_genes
--start_genes '$rotation.start_genes'
#end if
--start_gene_id '$rotation.start_gene_id'
--start_gene_cov '$rotation.start_gene_cov'
## Graph cleaning Options section
## ----------------------------------------------------------
--min_component_size '$graph_clean.min_component_size'
--min_dead_end_size '$graph_clean.min_dead_end_size'
## Long Read Alignment Options
## ----------------------------------------------------------
#if $lr_align.contamination
--contamination '$lr_align.contamination'
#end if
--scores '${lr_align.scores}'
#if str($lr_align.low_score) != ''
--low_score '$lr_align.low_score'
#end if
## no_simple_bridges expands to its truevalue (--no_simple_bridges) or to an empty string
$lr_align.no_simple_bridges
--keep '$keep'
## Post-processing: the graphs moved here feed the spades_collection output,
## the BAM file feeds the bam_file output (see the output filters).
#if str($keep) != '0'
&& mkdir 'spades_graphs'
&& mv 00*gfa './spades_graphs/'
#end if
#if str($keep) == '2' and $long
&& samtools view -@ "\${GALAXY_SLOTS:-4}" -u 'read_alignment/long_read_alignments.sam' | samtools sort -@ "\${GALAXY_SLOTS:-4}" -o 'read_alignment/long_read_alignments.bam'
#end if
]]></command>
<inputs>
<!-- Short read input. The selected case decides which of the -1/-2/-s arguments the
command emits; "none" emits no short read argument at all. -->
<conditional name="paired_unpaired">
<param name="fastq_input_selector" type="select" label="Paired or Single end data?" help="Select between paired and single end data">
<option selected="True" value="paired">Paired</option>
<option value="paired_collection">Paired Collection</option>
<option value="single">Single</option>
<option value="none">None</option>
</param>
<when value="paired">
<param name="fastq_input1" argument="-1" type="data" format="fastqsanger,fastqsanger.gz,fastq,fastq.gz"
label="Select first set of reads" help="Specify dataset with forward reads"/>
<param name="fastq_input2" argument="-2" type="data" format="fastqsanger,fastqsanger.gz,fastq,fastq.gz"
label="Select second set of reads" help="Specify dataset with reverse reads"/>
</when>
<when value="paired_collection">
<param name="fastq_input1" format="fastqsanger,fastqsanger.gz,fastq,fastq.gz" type="data_collection" collection_type="paired" label="Select a paired collection" />
</when>
<when value="single">
<param name="fastq_input1" argument="-s" type="data" format="fastqsanger,fastqsanger.gz,fastq,fastq.gz"
label="Select unpaired reads" help="Specify dataset with unpaired reads"/>
</when>
<when value="none">
</when>
</conditional>
<!-- Optional long reads; when set, the command links them and passes them with -l. -->
<param argument="--long" optional="true" type="data" format="fastqsanger,fastqsanger.gz,fasta,fastq,fastq.gz" label="Select long reads. If there are no long reads, leave this empty"/>
<param argument="--mode" type="select" label="Select Bridging mode">
<option value="conservative">Conservative (smaller contigs, lower misassembly)</option>
<option value="normal" selected="True">Normal (moderate contig size and misassembly rate)</option>
<option value="bold">Bold (longest contigs, higher misassembly rate)</option>
</param>
<param argument="--min_fasta_length" type="integer" value="100" label="Exclude contigs from the FASTA file which are shorter than this length (bp)"/>
<param argument="--linear_seqs" type="integer" value="0" label="The expected number of linear (i.e. non-circular) sequences in the assembly"/>
<!-- Optional: only passed on the command line when a value is given. -->
<param argument="--min_anchor_seg_len" type="integer" min="0" optional="true" label="Unicycler will not use segments shorter than this as scaffolding anchors"/>
<section name="spades" expanded="False" title="SPAdes options"
help="Unicycler uses SPAdes to construct assembly graphs. You can modify some of the SPAdes settings here. Use this ONLY if you know what you are doing!">
<param argument="--min_kmer_frac" type="float" min="0" max="1" value="0.2"
label="Lowest k-mer size for SPAdes assembly, expressed as a fraction of the read length"/>
<param argument="--max_kmer_frac" type="float" min="0" max="1" value="0.95"
label="Highest k-mer size for SPAdes assembly, expressed as a fraction of the read length"/>
<!-- The regex only enforces a comma-separated list of odd integers; the 11 to 127 range
and the uniqueness named in the message are not checked here. -->
<param argument="--kmers" type="text" value="" optional="true" label="Exact k-mers to use for SPAdes assembly, comma-separated">
<validator type="regex" message="Kmers must be comma-separated odd integers (no repetition) without space in the range of 11 to 127 (inclusive)">^(\d*[13579],)*(\d*[13579])$</validator>
</param>
<param argument="--kmer_count" type="integer" min="0" value="10" label="Number of k-mer steps to use in SPAdes assembly"/>
<param argument="--depth_filter" type="float" min="0" max="1" value="0.25"
label="Filter out contigs lower than this fraction of the chromosomal depth" help="It is done if does not result in graph dead ends"/>
<param argument="--largest_component" type="boolean" checked="false"
label="Only keep the largest connected component of the assembly graph"/>
</section>
<section name="rotation" expanded="false" title="Rotation options"
help="These options control the rotation of completed circular sequence near the end of the Unicycler pipeline. Use this ONLY if you know what you are doing!">
<param argument="--no_rotate" type="boolean" checked="false" truevalue="--no_rotate" falsevalue=""
label="Do not rotate completed replicons to start at a standard gene." help="Unicycler uses TBLASTN to search for dnaA or repA alleles in each completed replicon. If one is found, the sequence is rotated and/or flipped so that it begins with that gene encoded on the forward strand. This provides consistently oriented assemblies and reduces the risk that a gene will be split across the start and end of the sequence."/>
<param argument="--start_genes" optional="true" type="data" format="fasta" label="FASTA file of genes for start point of rotated replicons" />
<param argument="--start_gene_id" type="float" min="0" max="100" value="90" label="The minimum required BLAST percent identity for a start gene search"/>
<param argument="--start_gene_cov" type="float" min="0" max="100" value="95" label="The minimum required BLAST percent coverage for a start gene search"/>
</section>
<section name="graph_clean" expanded="false" title="Graph cleaning options"
help="These options control the removal of small leftover sequences after bridging is complete.">
<param argument="--min_component_size" type="integer" min="0" value="1000"
label="Unbridged graph components smaller than this size will be removed from the final graph" />
<param argument="--min_dead_end_size" type="integer" min="0" value="1000"
label="Graph dead ends smaller than this size will be removed from the final graph"/>
</section>
<section name="lr_align" expanded="false" title="Long read alignment parameters" help="These options control the alignment of long reads to the assembly graph.">
<param argument="--contamination" optional="true" type="data" format="fasta"
label="FASTA file of known contamination in long reads, e.g. lambda, phiX or puc18 spike-ins." />
<param argument="--scores" type="text" value="3,-6,-5,-2" label="Comma-delimited string of alignment scores: match, mismatch, gap open, gap extend"/>
<param argument="--low_score" optional="true" type="integer" value=""
label="Score threshold - alignments below this are considered poor" help="default = set automatically"/>
<!-- Ticking this box adds the no_simple_bridges flag, i.e. it switches simple bridging OFF;
the label is worded accordingly. -->
<param argument="--no_simple_bridges" type="boolean" truevalue="--no_simple_bridges" falsevalue="" checked="false" label="Skip simple long-read bridging" help="Default: No (simple long-read bridging is used)" />
</section>
<!-- Besides being passed to Unicycler, this value gates the optional outputs:
anything but 0 yields the graph collection, 2 (with long reads) also the BAM file. -->
<param argument="--keep" type="select" label="Outputs to keep" help="Level of file retention. Default: 1">
<option value="0">0: only keep final files</option>
<option value="1" selected="true">1: save graphs at main checkpoints</option>
<option value="2">2: also keep SAM</option>
</param>
<!-- At most one checkpoint graph; the command links it as the chosen checkpoint name
with a .gfa suffix before Unicycler starts. -->
<repeat name="reuse" title="Reuse checkpoint files from earlier runs" max="1" help="">
<param name="reuse_file" type="data" optional="false" format="gfa1" label="Checkpoint file"/>
<param name="reuse_step" type="select" label="Checkpoint">
<option value="002_depth_filter">002_depth_filter</option>
<option value="003_overlaps_removed">003_overlaps_removed</option>
<option value="004_bridges_applied">004_bridges_applied</option>
</param>
</repeat>
</inputs>
<outputs>
<!-- Final results, declared without a filter. -->
<data name="assembly_graph" format="gfa1" from_work_dir="assembly.gfa" label="${tool.name} on ${on_string}: Final Assembly Graph" />
<data name="assembly" format="fasta" from_work_dir="assembly.fasta" label="${tool.name} on ${on_string}: Final Assembly"/>
<!-- Graphs that the command moves into spades_graphs/ when keep is not 0; each .gfa file
becomes one list element named after the part of the file name before ".gfa".
The angle brackets of the named group are escaped because a raw less-than sign is
not allowed inside an XML attribute value. -->
<collection name="spades_collection" type="list" label="${tool.name} on ${on_string}: SPAdes graphs">
<discover_datasets pattern="(?P&lt;designation&gt;.*)\.gfa" format="gfa1" directory="spades_graphs"/>
<filter>keep != "0"</filter>
</collection>
<!-- Sorted BAM that the command creates with samtools when keep is 2 and long reads are given. -->
<data name="bam_file" format="bam" from_work_dir="read_alignment/long_read_alignments.bam" label="${tool.name} on ${on_string}: Long read alignments BAM">
<filter>keep == "2" and long</filter>
</data>
</outputs>
<tests>
<!-- Paired short reads only, all defaults spelled out, keep=0: only the final graph and
FASTA are expected, each holding a single 5386 bp sequence.
NOTE(review): the .gz test files are declared as ftype fastqsanger here, so this
presumably exercises the uncompressed link names; confirm. -->
<test expect_num_outputs="2">
<conditional name="paired_unpaired">
<param name="fastq_input_selector" value="paired" />
<param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
<param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
</conditional>
<param name="mode" value="normal" />
<param name="min_fasta_length" value="100"/>
<param name="linear_seqs" value="0"/>
<section name="spades">
<param name="min_kmer_frac" value="0.2"/>
<param name="max_kmer_frac" value="0.95"/>
<param name="kmer_count" value="10"/>
<param name="depth_filter" value="0.25"/>
</section>
<section name="rotation">
<param name="no_rotate" value=""/>
<param name="start_gene_id" value="90"/>
<param name="start_gene_cov" value="95"/>
</section>
<section name="graph_clean">
<param name="min_component_size" value="1000"/>
<param name="min_dead_end_size" value="1000"/>
</section>
<section name="lr_align">
<param name="scores" value="3,-6,-5,-2"/>
</section>
<param name="keep" value="0"/>
<output name="assembly_graph" ftype="gfa1">
<assert_contents>
<has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
</assert_contents>
</output>
<output name="assembly" ftype="fasta">
<assert_contents>
<has_text text="length=5386" />
</assert_contents>
</output>
</test>
<!--
Following test corresponds to the command:
unicycler -t "${GALAXY_SLOTS:-8}" -o ./ - -verbose 3 - -pilon_path `pilon - -jar_dir` \
-1 test-data/phix_f.fq.gz -2 test-data/phix_r.fq.gz -l test-data/onp.fa \
- -mode 'normal' - -no_correct
This command causes a segfault with the current version of unicycler on bioconda for Linux
during the minimap step (which seems to be compiled C code). A gist of the log can be found
at: https://gist.github.com/jmchilton/b411b695170c1daea6589f5d76e326cb.
NOTE(review): this note looks out of date. The command template of this wrapper emits
neither a pilon_path nor a no_correct option and uses "verbosity" rather than "verbose",
and the test below is enabled. Confirm whether the segfault still applies and drop the
note if it does not.
-->
<!-- Hybrid assembly: gzipped paired short reads (ftype fastqsanger.gz) plus FASTA long
reads, keep=0; same expectations as the short read only test. -->
<test expect_num_outputs="2">
<conditional name="paired_unpaired">
<param name="fastq_input_selector" value="paired" />
<param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger.gz" />
<param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger.gz" />
</conditional>
<param name="long" value="onp.fa" ftype="fasta" />
<param name="mode" value="normal" />
<param name="min_fasta_length" value="100"/>
<param name="linear_seqs" value="0"/>
<section name="spades">
<param name="min_kmer_frac" value="0.2"/>
<param name="max_kmer_frac" value="0.95"/>
<param name="kmer_count" value="10"/>
<param name="depth_filter" value="0.25"/>
</section>
<section name="rotation">
<param name="no_rotate" value=""/>
<param name="start_gene_id" value="90"/>
<param name="start_gene_cov" value="95"/>
</section>
<section name="graph_clean">
<param name="min_component_size" value="1000"/>
<param name="min_dead_end_size" value="1000"/>
</section>
<section name="lr_align">
<param name="scores" value="3,-6,-5,-2"/>
</section>
<param name="keep" value="0"/>
<output name="assembly_graph" ftype="gfa1">
<assert_contents>
<has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
</assert_contents>
</output>
<output name="assembly" ftype="fasta">
<assert_contents>
<has_text text="length=5386" />
</assert_contents>
</output>
</test>
<!-- Same data and expectations as the paired short read test, but the reads are
supplied as one paired collection (forward/reverse elements), keep=0. -->
<test expect_num_outputs="2">
<conditional name="paired_unpaired">
<param name="fastq_input_selector" value="paired_collection"/>
<param name="fastq_input1">
<collection type="paired">
<element name="forward" value="phix_f.fq.gz" ftype="fastqsanger" />
<element name="reverse" value="phix_r.fq.gz" ftype="fastqsanger" />
</collection>
</param>
</conditional>
<param name="mode" value="normal" />
<param name="min_fasta_length" value="100"/>
<param name="linear_seqs" value="0"/>
<section name="spades">
<param name="min_kmer_frac" value="0.2"/>
<param name="max_kmer_frac" value="0.95"/>
<param name="kmer_count" value="10"/>
<param name="depth_filter" value="0.25"/>
</section>
<section name="rotation">
<param name="no_rotate" value=""/>
<param name="start_gene_id" value="90"/>
<param name="start_gene_cov" value="95"/>
</section>
<section name="graph_clean">
<param name="min_component_size" value="1000"/>
<param name="min_dead_end_size" value="1000"/>
</section>
<section name="lr_align">
<param name="scores" value="3,-6,-5,-2"/>
</section>
<param name="keep" value="0"/>
<output name="assembly_graph" ftype="gfa1">
<assert_contents>
<has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
</assert_contents>
</output>
<output name="assembly" ftype="fasta">
<assert_contents>
<has_text text="length=5386" />
</assert_contents>
</output>
</test>
<!-- Long reads only (short read selector "none"); also sets the optional
min_anchor_seg_len and kmers parameters. Only loose content checks: the graph
contains an "S" and the FASTA contains a record starting with ">1". -->
<test expect_num_outputs="2">
<conditional name="paired_unpaired">
<param name="fastq_input_selector" value="none"/>
</conditional>
<param name="min_anchor_seg_len" value="10"/>
<section name="spades">
<param name="kmers" value="21,23"/>
</section>
<param name="long" value="only_long.fasta" ftype="fasta" />
<param name="keep" value="0"/>
<output name="assembly_graph" ftype="gfa1">
<assert_contents>
<has_text text="S" />
</assert_contents>
</output>
<output name="assembly" ftype="fasta">
<assert_contents>
<has_text text=">1" />
</assert_contents>
</output>
</test>
<!-- Test checkpoint graph reuse: a GFA file is supplied for the 002_depth_filter
checkpoint together with a paired collection and long reads, keep=0.
TODO: make the assertions more precise and check that the result differs from a
run without reuse. -->
<test expect_num_outputs="2">
<conditional name="paired_unpaired">
<param name="fastq_input_selector" value="paired_collection"/>
<param name="fastq_input1">
<collection type="paired">
<element name="forward" value="phix_f.fq.gz" ftype="fastqsanger" />
<element name="reverse" value="phix_r.fq.gz" ftype="fastqsanger" />
</collection>
</param>
</conditional>
<param name="long" value="only_long.fasta" ftype="fasta" />
<repeat name="reuse">
<param name="reuse_file" value="phix__spades_graph.gfa1"/>
<param name="reuse_step" value="002_depth_filter"/>
</repeat>
<param name="keep" value="0"/>
<output name="assembly_graph" ftype="gfa1">
<assert_contents>
<has_text text="S" />
</assert_contents>
</output>
<output name="assembly" ftype="fasta">
<assert_contents>
<has_text text=">1" />
</assert_contents>
</output>
</test>
<!-- Test keep value = 1: in addition to the two final outputs, the SPAdes graph
collection is expected (14 elements; the k=27 graph is checked for a known sequence). -->
<test expect_num_outputs="3">
<conditional name="paired_unpaired">
<param name="fastq_input_selector" value="paired" />
<param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
<param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
</conditional>
<param name="mode" value="normal" />
<param name="keep" value="1"/>
<output name="assembly_graph" ftype="gfa1">
<assert_contents>
<has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
</assert_contents>
</output>
<output name="assembly" ftype="fasta">
<assert_contents>
<has_text text="length=5386" />
</assert_contents>
</output>
<output_collection name="spades_collection" type="list" count="14">
<element name="001_spades_graph_k027">
<assert_contents>
<has_text text="TTGAATGCCACCGGAGGCGGCTTTTTGACCGCCTCCAAAC"/>
</assert_contents>
</element>
</output_collection>
</test>
<!-- Test keep value = 2 with long reads: expects the two final outputs, the SPAdes graph
collection and the long read alignment BAM file (four outputs in total). -->
<test expect_num_outputs="4">
<conditional name="paired_unpaired">
<param name="fastq_input_selector" value="paired" />
<param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
<param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
</conditional>
<param name="long" value="onp.fa" ftype="fasta" />
<param name="mode" value="normal" />
<param name="keep" value="2"/>
<output name="assembly_graph" ftype="gfa1">
<assert_contents>
<has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
</assert_contents>
</output>
<output name="assembly" ftype="fasta">
<assert_contents>
<has_text text="length=5386" />
</assert_contents>
</output>
<output_collection name="spades_collection" type="list" count="14">
<element name="001_spades_graph_k027">
<assert_contents>
<has_text text="TTGAATGCCACCGGAGGCGGCTTTTTGACCGCCTCCAAAC"/>
</assert_contents>
</element>
<!-- There are GFA files for more values of k that are not tested explicitly.
The aim of testing the elements below is to pin the names of the graphs,
since the reuse parameter offers these names as checkpoints. If a name
changes here, update the reuse options accordingly. -->
<element name="001_spades_graph_k127">
<assert_contents>
<has_line_matching expression="^S.*"/>
</assert_contents>
</element>
<element name="002_depth_filter">
<assert_contents>
<has_line_matching expression="^S.*"/>
</assert_contents>
</element>
<element name="003_overlaps_removed">
<assert_contents>
<has_line_matching expression="^S.*"/>
</assert_contents>
</element>
<element name="004_bridges_applied">
<assert_contents>
<has_line_matching expression="^S.*"/>
</assert_contents>
</element>
<element name="005_final_clean">
<assert_contents>
<has_line_matching expression="^S.*"/>
</assert_contents>
</element>
</output_collection>
<output name="bam_file" ftype="bam">
<assert_contents>
<has_size value="2084" delta="100"/>
</assert_contents>
</output>
</test>
<!-- Test the no_simple_bridges option: with the boolean set, the generated command line
must contain the flag; the assembly expectations are the same as for the hybrid test. -->
<test expect_num_outputs="2">
<conditional name="paired_unpaired">
<param name="fastq_input_selector" value="paired" />
<param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
<param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
</conditional>
<param name="long" value="onp.fa" ftype="fasta" />
<param name="mode" value="normal" />
<param name="keep" value="0"/>
<section name="lr_align">
<param name="no_simple_bridges" value="true"/>
</section>
<output name="assembly_graph" ftype="gfa1">
<assert_contents>
<has_line_matching expression="S\t1\t[ATCG]{5386,5386}\tLN:i:5386\tdp:f:1.0"/>
</assert_contents>
</output>
<output name="assembly" ftype="fasta">
<assert_contents>
<has_text text="length=5386" />
</assert_contents>
</output>
<assert_command>
<has_text text="--no_simple_bridges" />
</assert_command>
</test>
</tests>
<help><![CDATA[
**Unicycler**
Unicycler is a hybrid assembly pipeline for bacterial genomes. It uses both Illumina reads and long reads (PacBio or Nanopore) to produce complete and accurate assemblies. It is written by `Ryan Wick`_ at the University of Melbourne's Centre for Systems Genomics. Much of the description below is lifted from Unicycler's `github page`_.
.. _`Ryan Wick`: https://github.com/rrwick
.. _`github page`: https://github.com/rrwick/Unicycler
-----
**Input data**
Unicycler accepts short (Illumina) reads in FASTQ format. Galaxy places the additional requirement of having these in FASTQ format with `Sanger encoding`_ of quality scores. Long reads (from Oxford Nanopore or PacBio) can be in either FASTQ or FASTA format.
.. _`Sanger encoding`: https://en.wikipedia.org/wiki/FASTQ_format#Quality
The input options are::
-1 SHORT1, --short1 SHORT1
FASTQ file of short reads (first reads in each pair)
-2 SHORT2, --short2 SHORT2
FASTQ file of short reads (second reads in each pair)
-s SHORT_UNPAIRED, --short_unpaired SHORT_UNPAIRED
FASTQ file of unpaired short reads
-l LONG, --long LONG
FASTQ or FASTA file of long reads, if all reads are available at start.
-----
**Bridging mode**
Unicycler can be run in three modes: conservative, normal (the default) and bold, set with the --mode option. Conservative mode is least likely to produce a complete assembly but has a very low risk of misassembly. Bold mode is most likely to produce a complete assembly but carries greater risk of misassembly. Normal mode is intermediate regarding both completeness and misassembly risk. See `description of modes`_ for more information.
.. _`description of modes`: https://github.com/rrwick/Unicycler#conservative-normal-and-bold
The available modes are::
--mode {conservative,normal,bold}
Bridging mode (default: normal)
conservative = smaller contigs, lowest misassembly rate
normal = moderate contig size and misassembly rate
bold = longest contigs, higher misassembly rate
----
**Skip SPAdes error correction step**
Sequencing data contains a substantial number of sequencing errors that manifest themselves as deviations (bulges and non-connected components) within the assembly graph. One of the ways to improve the graph even before constructing it is to minimize the number of sequencing errors by performing error correction. SPAdes, which is used by Unicycler for error correction and assembly, uses `BayesHammer`_ to correct the reads. Here is a brief summary of what it does:
1. SPAdes (or rather BayesHammer) counts *k*-mers in reads and computes *k*-mer statistics that take into account base quality values.
2. A `Hamming graph`_ is constructed in which *k*-mers are nodes. In this graph, edges connect nodes (*k*-mers) if they differ from each other by a number of nucleotides up to a certain threshold (the `Hamming distance`_). The graph is central to the error correction algorithm.
3. At this step, Bayesian subclustering of the graph produced in the previous step is performed. For each *k*-mer we now know the center of its subcluster.
4. Solid *k*-mers are derived from cluster centers and are assumed to be *error free*.
5. Solid *k*-mers are mapped back to the reads and used to correct them.
This step takes considerable time, so if one needs to quickly evaluate assemblies this step can be skipped. However, this is not recommended if one is trying to produce a final high quality assembly.
.. _`BayesHammer`: https://goo.gl/1iGkMe
.. _`Hamming graph`: https://en.wikipedia.org/wiki/Hamming_graph
.. _`Hamming distance`: https://en.wikipedia.org/wiki/Hamming_distance
-----
**Do not rotate completed replicons to start at a standard gene**
Unicycler uses TBLASTN to search for dnaA or repA alleles in each completed replicon. If one is found, the sequence is rotated and/or flipped so that it begins with that gene encoded on the forward strand. This provides consistently oriented assemblies and reduces the risk that a gene will be split across the start and end of the sequence.
The following option turns rotation on and off::
--no_rotate
Do not rotate completed replicons
to start at a standard gene
(default: completed replicons are rotated)
**Do not use Pilon to polish the final assembly**
`Pilon`_ is a tool for improving overall quality of draft assemblies and finding variation among strains. Unicycler uses it for assembly *polishing*.
The following option turns pilon part of Unicycler pipeline on and off::
--no_pilon
Do not use Pilon to polish the
final assembly (default: Pilon is used)
.. _`Pilon`: https://github.com/broadinstitute/pilon/wiki
------
**Expected number of linear sequences**
If you expect your sample to contain linear (non circular) sequences, set this option::
--linear_seqs EXPECTED_LINEAR_SEQS
The expected number of linear (i.e. non-circular)
sequences in the underlying sequence
----
**SPAdes options**
This section provides control of SPAdes options::
--min_kmer_frac MIN_KMER_FRAC
Lowest k-mer size for SPAdes assembly,
expressed as a fraction of the read length
(default: 0.2)
--max_kmer_frac MAX_KMER_FRAC
Highest k-mer size for SPAdes assembly,
expressed as a fraction of the read length
(default: 0.95)
--kmer_count KMER_COUNT
Number of k-mer steps to use in
SPAdes assembly (default: 10)
--depth_filter DEPTH_FILTER
Filter out contigs lower than this fraction
of the chromosomal depth, if doing so does
not result in graph dead ends (default: 0.25)
----
**Rotation options**
Unicycler attempts to rotate circular assemblies to make sure that they begin at a consistent starting gene. The following parameters control assembly rotation::
--start_genes START_GENES
FASTA file of genes for start point
of rotated replicons
(default: start_genes.fasta)
--start_gene_id START_GENE_ID
The minimum required BLAST percent identity
for a start gene search
(default: 90.0)
--start_gene_cov START_GENE_COV
The minimum required BLAST percent coverage
for a start gene search
(default: 95.0)
-----
**Graph cleaning options**
These options control the removal of small leftover sequences after bridging is complete::
--min_component_size MIN_COMPONENT_SIZE
Unbridged graph components smaller
than this size (bp) will be removed
from the final graph (default: 1000)
--min_dead_end_size MIN_DEAD_END_SIZE
Graph dead ends smaller than this size (bp)
will be removed from the final graph
(default: 1000)
-----
**Long read alignment options**
These options control the alignment of long reads to the assembly graph::
--contamination CONTAMINATION
FASTA file of known contamination in long reads
--scores SCORES
Comma-delimited string of alignment scores:
match, mismatch, gap open, gap extend
(default: 3,-6,-5,-2)
--low_score LOW_SCORE
Score threshold - alignments below this
are considered poor
(default: set threshold automatically)
-----
**Outputs**
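Galaxy's wrapper for Unicycler produces the following outputs:
* final assembly in FASTA format
* final assembly graph in GFA format
* (optional) a collection of SPAdes and checkpoint graphs, when *Outputs to keep* is set to 1 or 2
* (optional) a BAM file of long read alignments, when *Outputs to keep* is set to 2 and long reads are provided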
While most will likely be interested in the FASTA dataset, the graph dataset is also quite useful and can be visualized using tools such as `Bandage`_.
.. _`Bandage`: https://github.com/rrwick/Bandage
]]></help>
<citations>
<!-- Peer-reviewed Unicycler article (PLoS Computational Biology, 2017); it supersedes
the bioRxiv preprint 10.1101/096412 that was cited here before. -->
<citation type="doi">10.1371/journal.pcbi.1005595</citation>
</citations>
</tool>