forked from geraldinepascal/FROGS-wrappers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.xml
501 lines (417 loc) · 24.3 KB
/
preprocess.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
<?xml version="1.0"?>
<!--
# Copyright (C) 2015 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-->
<tool id="FROGS_preprocess" name="FROGS Pre-process" version="3.1">
<description>merging, denoising and dereplication.</description>
<requirements>
<requirement type="package" version="3.1.0">frogs</requirement>
</requirements>
<stdio>
<exit_code range="1:" />
<exit_code range=":-1" />
</stdio>
<command>
preprocess.py $sequencer_type.sequencer_selected
--output-dereplicated $dereplicated_file --output-count $count_file --summary $summary_file
--nb-cpus \${GALAXY_SLOTS:-1}
--min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size
#if $sequencer_type.sequencer_selected == "illumina"
#if $sequencer_type.sequencing_protocol.sequencing_protocol_selected == "standard"
--five-prim-primer $sequencer_type.sequencing_protocol.five_prim_primer --three-prim-primer $sequencer_type.sequencing_protocol.three_prim_primer
#else
--without-primers
#end if
#else
--five-prim-primer $sequencer_type.five_prim_primer --three-prim-primer $sequencer_type.three_prim_primer
#end if
#if $sequencer_type.input_type.input_type_selected == "archive"
--input-archive $sequencer_type.input_type.archive_file
#if $sequencer_type.sequencer_selected == "illumina" and $sequencer_type.input_type.archive_type.archive_type_selected == "already_merged"
--already-contiged
#elif $sequencer_type.sequencer_selected == "illumina"
--R1-size $sequencer_type.input_type.archive_type.R1_size --R2-size $sequencer_type.input_type.archive_type.R2_size
--mismatch-rate $sequencer_type.input_type.archive_type.mm_rate
--merge-software $sequencer_type.input_type.archive_type.merge_software_type.merge_software_selected
#if $sequencer_type.input_type.archive_type.merge_software_type.merge_software_selected == "flash"
--expected-amplicon-size $sequencer_type.input_type.archive_type.merge_software_type.expected_amplicon_size
#end if
#if $sequencer_type.input_type.archive_type.keep_unmerged
--keep-unmerged
#end if
#end if
#else
#set $sep = ' '
#if $sequencer_type.sequencer_selected == "illumina"
--samples-names
#for $current in $sequencer_type.input_type.files_by_samples_type.samples
$sep"${current.name.strip()}"
#end for
--input-R1
#for $current in $sequencer_type.input_type.files_by_samples_type.samples
$sep${current.R1_file}
#end for
#if $sequencer_type.input_type.files_by_samples_type.files_by_samples_type_selected == "already_merged"
--already-contiged
#else
--input-R2
#for $current in $sequencer_type.input_type.files_by_samples_type.samples
$sep${current.R2_file}
#end for
--R1-size $sequencer_type.input_type.files_by_samples_type.R1_size --R2-size $sequencer_type.input_type.files_by_samples_type.R2_size
--mismatch-rate $sequencer_type.input_type.files_by_samples_type.mm_rate
--merge-software $sequencer_type.input_type.files_by_samples_type.merge_software_type.merge_software_selected
#if $sequencer_type.input_type.files_by_samples_type.merge_software_type.merge_software_selected == "flash"
--expected-amplicon-size $sequencer_type.input_type.files_by_samples_type.expected_amplicon_size
#end if
#if $sequencer_type.input_type.files_by_samples_type.keep_unmerged
--keep-unmerged
#end if
#end if
#else
--input-R1
#for $current in $sequencer_type.input_type.samples
$sep${current.R1_file}
#end for
--samples-names
#for $current in $sequencer_type.input_type.samples
$sep"${current.name.strip()}"
#end for
#end if
#end if
</command>
<inputs>
<conditional name="sequencer_type">
<param name="sequencer_selected" type="select" label="Sequencer" help="Select the sequencing technology used to produce the sequences.">
<option value="illumina" selected="true">Illumina</option>
<option value="454">454</option>
</param>
<when value="illumina">
<!-- Samples -->
<conditional name="input_type">
<param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with two files (R1 and R2) by sample.">
<option value="files_by_samples" selected="true">Files by samples</option>
<option value="archive">Archive</option>
</param>
<when value="archive">
<param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file(s) for each sample." optional="false" />
<conditional name="archive_type">
<param name="archive_type_selected" type="select" label="Reads already merged ?" help="The archive contains 1 file by sample : R1 and R2 are already merged by pair.">
<option value="paired" selected="true">No</option>
<option value="already_merged">Yes</option>
</param>
<!-- $sequencer_type.input_type.archive_type.archive_type_selected == "already_merged" -->
<when value="paired">
<!-- Reads size -->
<param name="R1_size" type="integer" label="Reads 1 size" help="The maximum read1 size." value="" optional="false" />
<param name="R2_size" type="integer" label="Reads 2 size" help="The maximum read2 size." value="" optional="false" />
<param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatch in the overlap region" value="0.1" optional="false" />
<conditional name="merge_software_type">
<param name="merge_software_selected" type="select" label="Merge software" help="Select the software to merge paired-end reads.">
<option value="vsearch" selected="true">Vsearch</option>
<option value="flash">Flash</option>
</param>
<when value="flash">
<param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" />
</when>
<when value="vsearch"></when>
</conditional>
<param name="keep_unmerged" type="boolean" label="Would you like to keep unmerged reads?" help="No : Unmerged reads will be excluded; Yes : unmerged reads will be artificially combined with 100 N. (default No)" />
</when>
<when value="already_merged"></when>
</conditional>
</when>
<when value="files_by_samples">
<conditional name="files_by_samples_type">
<param name="files_by_samples_type_selected" type="select" label="Reads already contiged ?" help="The inputs contain 1 file by sample : R1 and R2 are already merged by pair.">
<option value="paired" selected="true">No</option>
<option value="already_merged">Yes</option>
</param>
<when value="paired">
<!-- Samples -->
<repeat name="samples" title="Samples" min="1">
<param name="name" type="text" label="Name" help="The sample name." optional="false">
<validator type="empty_field" message="This parameter is required." />
</param>
<param format="fastq" name="R1_file" type="data" label="Reads 1" help="R1 FASTQ file of paired-end reads." />
<param format="fastq" name="R2_file" type="data" label="reads 2" help="R2 FASTQ file of paired-end reads." />
</repeat>
<!-- Reads size -->
<param name="R1_size" type="integer" label="Reads 1 size" help="The maximum read1 size." value="" optional="false" />
<param name="R2_size" type="integer" label="Reads 2 size" help="The maximum read2 size." value="" optional="false" />
<param name="mm_rate" type="float" label="mismatch rate." help="The maximum rate of mismatches in the overlap region" value="0.1" optional="false" />
<conditional name="merge_software_type">
<param name="merge_software_selected" type="select" label="Merge software" help="Select the software to merge paired-end reads.">
<option value="vsearch" selected="true">Vsearch</option>
<option value="flash">Flash</option>
</param>
<when value="flash">
<param name="expected_amplicon_size" type="integer" label="Expected amplicon size" help="Maximum amplicon length expected in approximately 90% of the amplicons." value="" />
</when>
<when value="vsearch"></when>
</conditional>
<param name="keep_unmerged" type="boolean" label="Would you like to keep unmerged reads?" help="No : Unmerged reads will be excluded; Yes : unmerged reads will be artificially combined with 100 N. (default No)" />
</when>
<when value="already_merged">
<repeat name="samples" title="Samples" min="1">
<param name="name" type="text" label="Name" help="The sample name." optional="false">
<validator type="empty_field" message="This parameter is required." />
</param>
<param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of merged reads." />
</repeat>
</when>
</conditional>
</when>
</conditional>
<!-- Amplicons -->
<param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons (with primers)." value="" optional="false" />
<param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons (with primers)." value="" optional="false" />
<!-- Primers -->
<conditional name="sequencing_protocol">
<param name="sequencing_protocol_selected" type="select" label="Sequencing protocol" help="The protocol used for sequencing step: standard or custom with PCR primers as sequencing primers.">
<option value="standard" selected="true">Illumina standard</option>
<option value="without_primers">Custom protocol (Kozich et al. 2013)</option>
</param>
<when value="standard">
<param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
<validator type="empty_field" message="This parameter is required." />
</param>
<param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
<validator type="empty_field" message="This parameter is required." />
</param>
</when>
<when value="without_primers"></when>
</conditional>
</when>
<when value="454">
<!-- Samples -->
<conditional name="input_type">
<param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with one file by sample.">
<option value="files_by_samples" selected="true">One file by sample</option>
<option value="archive">Archive</option>
</param>
<when value="archive">
<param name="archive_file" type="data" format="tar" label="Archive file" help="The tar file containing the sequences file for each sample." optional="false" />
</when>
<when value="files_by_samples">
<repeat name="samples" title="Samples" min="1">
<param name="name" type="text" label="Name" help="The sample name." optional="false" />
<param format="fastq" name="R1_file" type="data" label="Sequence file" help="FASTQ file of sample." />
</repeat>
</when>
</conditional>
<!-- Amplicons -->
<param name="min_amplicon_size" type="integer" label="Minimum amplicon size" help="The minimum size for the amplicons (with primers)." value="" optional="false" />
<param name="max_amplicon_size" type="integer" label="Maximum amplicon size" help="The maximum size for the amplicons (with primers)." value="" optional="false" />
<!-- Primers -->
<param name="five_prim_primer" type="text" size="20" label="5' primer" help="The 5' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
<validator type="empty_field" message="This parameter is required." />
</param>
<param name="three_prim_primer" type="text" size="20" label="3' primer" help="The 3' primer sequence (wildcards are accepted). The orientation is detailed below in 'Primers parameters'." optional="false">
<validator type="empty_field" message="This parameter is required." />
</param>
</when>
</conditional>
</inputs>
<outputs>
<data format="fasta" name="dereplicated_file" label="${tool.name}: dereplicated.fasta" from_work_dir="dereplicated.fasta" />
<data format="tabular" name="count_file" label="${tool.name}: count.tsv" from_work_dir="count.tsv" />
<data format="html" name="summary_file" label="${tool.name}: report.html" from_work_dir="report.html" />
</outputs>
<tests>
<test>
<conditional name="sequencer_type">
<param name="sequencer_selected" value="illumina"/>
<conditional name="input_type">
<param name="input_type_selected" value="archive"/>
<param name="archive_file" ftype="tar" value="input/test_dataset.tar.gz"/>
<conditional name="archive_type">
<param name="archive_type_selected" value="paired"/>
<param name="R1_size" value="267"/>
<param name="R2_size" value="266"/>
<param name="mm_rate" value="0.15"/>
<conditional name="merge_software_type">
<param name="merge_software_selected" value="flash" />
<param name="expected_amplicon_size" value="420"/>
</conditional>
<param name="keep_unmerged" value="true"/>
</conditional>
</conditional>
<param name="min_amplicon_size" value="44"/>
<param name="max_amplicon_size" value="490"/>
<conditional name="sequencing_protocol">
<param name="sequencing_protocol_selected" value="standard"/>
<param name="five_prim_primer" value="GGCGVACGGGTGAGTAA"/>
<param name="three_prim_primer" value="GTGCCAGCNGCNGCGG"/>
</conditional>
</conditional>
<output name="dereplicated_file" file="references/01-prepro-flash.fasta"/>
<output name="count_file" file="references/01-prepro-flash.tsv"/>
<output name="summary_file" file="references/01-prepro-flash.html" compare="sim_size" delta="0"/>
<!--output name="summary_file">
<assert_contents>
<has_text_matching expression="FROGS\sPre-process" />
<has_text_matching expression="splA_01" />
</assert_contents>
</output-->
</test>
<test>
<conditional name="sequencer_type">
<param name="sequencer_selected" value="illumina"/>
<conditional name="input_type">
<param name="input_type_selected" value="archive"/>
<param name="archive_file" ftype="tar" value="input/test_dataset.tar.gz"/>
<conditional name="archive_type">
<param name="archive_type_selected" value="paired"/>
<param name="R1_size" value="267"/>
<param name="R2_size" value="266"/>
<param name="mm_rate" value="0.15"/>
<conditional name="merge_software_type">
<param name="merge_software_selected" value="vsearch" />
</conditional>
<param name="keep_unmerged" value="true"/>
</conditional>
</conditional>
<param name="min_amplicon_size" value="44"/>
<param name="max_amplicon_size" value="490"/>
<conditional name="sequencing_protocol">
<param name="sequencing_protocol_selected" value="standard"/>
<param name="five_prim_primer" value="GGCGVACGGGTGAGTAA"/>
<param name="three_prim_primer" value="GTGCCAGCNGCNGCGG"/>
</conditional>
</conditional>
<output name="dereplicated_file" file="references/01-prepro-vsearch.fasta"/>
<output name="count_file" file="references/01-prepro-vsearch.tsv"/>
<output name="summary_file" file="references/01-prepro-vsearch.html" compare="sim_size" delta="0"/>
<!--output name="summary_file">
<assert_contents>
<has_text_matching expression="FROGS\sPre-process" />
<has_text_matching expression="splA_01" />
</assert_contents>
</output-->
</test>
</tests>
<help>
.. image:: static/images/frogs_images/FROGS_logo.png
:height: 144
:width: 110
.. class:: infomark page-header h2
What it does
FROGS Pre-process filters and dereplicates amplicons for use in diversity analysis.
.. class:: infomark page-header h2
Inputs/Outputs
.. class:: h3
Inputs
Sample files added one after another or provide in an archive file (tar.gz).
.. container:: row
.. container:: col-md-6
**Illumina inputs**
:Usage: For samples sequenced in paired-end. The amplicon length must be inferior to the length of the R1 plus R2 length. R1 and R2 are merged by the common region.
:Files: One R1 and R2 by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTA_format>`_)
:Example: splA_R1.fastq.gz, splA_R2.fastq.gz, splB_R1.fastq.gz, splB_R2.fastq.gz
OR
:Usage: For samples sequenced in single-ends or when R1 and R2 reads are already merged.
:Files: One sequence file by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTA_format>`_).
:Example: splA.fastq.gz, splB.fastq.gz
.. container:: col-md-6
**454 inputs**
:Files: One sequence file by sample (format `FASTQ <https://en.wikipedia.org/wiki/FASTA_format>`_)
:Example: splA.fastq.gz, splB.fastq.gz
Remark: In an archive if you use R1 and R2 files they names must end with *_R1* and *_R2*.
.. class:: h3
Outputs
**Sequence file** (dereplicated.fasta):
Only one file with all samples sequences (format `FASTA <https://en.wikipedia.org/wiki/FASTA_format>`_). These sequences are dereplicated: strictly identical sequence are represented only once and the initial count is kept in count file.
**Count file** (count.tsv):
This file contains the count of all unique sequences in each sample (format `TSV <https://en.wikipedia.org/wiki/Tab-separated_values>`_).
**Summary file** (report.html):
This file reports the number of remaining sequences after each filter (format `HTML <https://en.wikipedia.org/wiki/HTML>`_).
.. image:: static/images/frogs_images/FROGS_preprocess_summary_v3.png
:height: 850
:width: 831
It also presents the length distribution of the remaining full amplicon sequences.
.. image:: static/images/frogs_images/FROGS_preprocess_lengthsSamples_v3.png
:height: 379
:width: 364
.. class:: infomark page-header h2
How it works
.. csv-table::
:header: "Steps", "Illumina", "454"
:widths: 5, 150, 150
:class: table table-striped
"1", "For un-merged data: merges R1 and R2 with a maximum of M% mismatch in the overlaped region(`VSEARCH <https://github.com/torognes/vsearch/>`_ or `FLASH <https://ccb.jhu.edu/software/FLASH/>`_ or optionnaly `PEAR <https://sco.h-its.org/exelixis/web/software/pear/>`_). Resulting un-merged reads may optionnaly be artificially combined by adding 100 N between the reads", "/"
"2", "If sequencing protocol is the illumina standard protocol : Removes sequences where the two primers are not present and then remove primers in the remaining sequence (`cutadapt <http://cutadapt.readthedocs.org/en/latest/guide.html>`_). The primer search accepts 10% of differences", "Removes sequences where the two primers are not present, removes primers sequence and reverse complement the sequences on strand - (`cutadapt <http://cutadapt.readthedocs.org/en/latest/guide.html>`_). The primer search accepts 10% of differences"
"3", "Filters sequences with ambiguous nucleotides and for merged sequences filters on their length which must be range between 'Minimum amplicon size - primer length' and 'Maximum amplicon size - primer length'", "the tool removes sequences with at least one homopolymer with more than seven nucleotides and with a distance of less than or equal to 10 nucleo-tides between two poor quality positions, i.e. with a Phred quality score lesser than 10"
"4", "Dereplicates sequences", "Dereplicates sequences"
.. class:: infomark page-header h2
Advices/details on parameters
.. class:: h3
What is the differency between overlapped sequences and combined sequences?
- **Case of a sequencing of overlapping sequences: case of 16S V3-V4 amplicon MiSeq sequencing**
.. image:: static/images/frogs_images/FROGS_preprocess_overlapped_sequence.png
:height: 261
:width: 531
- **Case of a sequencing of non-overlapping sequences: case of ITS1 amplicon MiSeq sequencing**
.. image:: static/images/frogs_images/FROGS_preprocess_combined_sequence1.png
:height: 279
:width: 797
.. class:: warningmark
**“FROGS combined” warning points**
Reads pair are not merged because:
- the real amplicon length is greater than de number of base sequences (500 bp for MiSeq 2x250bp)
- the overlapped region is smaller than 10 (fixed parameter in FROGS).
Thus, “FROGS combined” sequences are artificial and present particular features especially on size.
Imagine a MiSeq sequencing of 2x250pb with sequences that cannot overlap, the resulting “FROGS combined” sequences length will be 600 bp.
.. image:: static/images/frogs_images/FROGS_preprocess_combined_sequence2.png
:height: 357
:width: 798
.. class:: h3
Keeping or not un-merged paired reads
This option is usefull when and only when, targeted amplicon is longer than the sequencing technology can provide (ITS amplicon for example). In other case, carefully, you will only keep noise in your analysis.
.. class:: h3
Primers parameters
The (`Kozich et al. 2013 <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3753973/>`_ ) protocol uses custom sequencing primers which are also the PCR primers. In this case the reads do not contain the PCR primers.
In case of Illumina standard protocol, the primers must be provided in 5' to 3' orientation.
.. role:: alert-info
Example:
5' :alert-info:`ATGCCC` GTCGTCGTAAAATGC :alert-info:`ATTTCAG` 3'
Value for parameter 5' primer: ATGCCC
Value for parameter 3' primer: ATTTCAG
.. class:: h3
FLASH : Amplicons sizes parameters
The two following images show two examples of perfect values fors sizes parameters.
.. image:: static/images/frogs_images/FROGS_preprocess_ampliconSize_unimodal_v3.png
:height: 415
:width: 676
.. image:: static/images/frogs_images/FROGS_preprocess_ampliconSize_multimodal_v3.png
:height: 415
:width: 676
Don't worry the "Expected amplicon size" does not need to be very accurate, and only necessary for sequences merging with FLASH.
.. class:: h3
If the filter 'merged' reduce drasticaly the number of sequences:
In un-merged Illumina data, the reduction of dataset by the merged filter is classicaly inferior than 20%. A loss of more than 20% in all samples can highlight a quality problem.
If the overlap between R1 and R2 is superior to 50 nucleotides and the quality of the end of the sequences is poor (see `FastQC <http://www.bioinformatics.babraham.ac.uk/projects/fastqc/>`_) you can try to cut the end of your sequences and relaunch the preprocess tool. You can either raise the mismatch percent in the overlapped region, but not too much!
----
**Contact**
Contacts: [email protected]
Repository: https://github.com/geraldinepascal/FROGS
website: http://frogs.toulouse.inra.fr/
Please cite the **FROGS article**: *Escudie F., et al. Bioinformatics, 2018. FROGS: Find, Rapidly, OTUs with Galaxy Solution.*
</help>
<citations>
<citation type="doi">10.1093/bioinformatics/btx791</citation>
</citations>
</tool>