diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 56cb32e0..d3c39ca8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,7 +79,7 @@ jobs: NXF_VER: - "21.10.3" - "latest-everything" - profile: ["test_immcantation_devel"] + profile: ["test_assembled_immcantation_devel", "test_raw_immcantation_devel"] fail-fast: false steps: - name: Check out pipeline code diff --git a/CHANGELOG.md b/CHANGELOG.md index ac5e6ae9..8055883d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [3.0dev] - + +### `Added` + +- Added compulsory AIRR fields in input samplesheet. + +### `Fixed` + +### `Dependencies` + +### `Deprecated` + ## [2.3.0] - 2022-09-22 "Expelliarmus" ### `Added` diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 39c1b45d..7f3adf03 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -61,6 +61,10 @@ def check_samplesheet(file_in): "species", "pcr_target_locus", "single_cell", + "sex", + "tissue", + "biomaterial_provider", + "age" ] header = [x.strip('"') for x in fin.readline().strip().split("\t")] for col in REQUIRED_COLUMNS: diff --git a/conf/modules.config b/conf/modules.config index 9b542213..f002011a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -286,7 +286,7 @@ process { withName: ADD_META_TO_TAB { publishDir = [ - path: { "${params.outdir}/vdj_annotation/06-annotat-metadata/${meta.id}" }, + path: { "${params.outdir}/vdj_annotation/06-annotate-metadata/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -352,13 +352,26 @@ process { ] } - withName: DEFINE_CLONES { + withName: DEFINE_CLONES_COMPUTE { publishDir = [ path: { "${params.outdir}/clonal_analysis/define_clones" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - ext.args = ['outname':'', 'model':'hierarchical', 'method':'nt', 'linkage':'single', 'outputby':'sample_id'] + ext.args = ['outname':'', 'model':'hierarchical', + 'method':'nt', 'linkage':'single', + 'outputby':'sample_id', 'min_n':30] + } + + withName: DEFINE_CLONES_REPORT { + publishDir = [ + path: { "${params.outdir}/clonal_analysis/define_clones" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = ['outname':'', 'model':'hierarchical', + 'method':'nt', 'linkage':'single', + 'outputby':'sample_id', 'min_n':30] } withName: DOWSER_LINEAGES { diff --git a/conf/test.config b/conf/test.config index 4925c93e..cc260ec2 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test.tsv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test_airr.tsv' cprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/C_primers.fasta' vprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/V_primers.fasta' imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' @@ -35,3 +35,11 @@ params { umi_start = 6 umi_position = 'R1' } + +process{ + withName:"DEFINE_CLONES*"{ + ext.args = ['outname':'', 'model':'hierarchical', + 'method':'nt', 'linkage':'single', + 'outputby':'sample_id', 'min_n':10] + } +} diff --git a/conf/test_immcantation_devel.config b/conf/test_assembled_immcantation_devel.config 
similarity index 76% rename from conf/test_immcantation_devel.config rename to conf/test_assembled_immcantation_devel.config index 31bde2ae..49c6f5a4 100644 --- a/conf/test_immcantation_devel.config +++ b/conf/test_assembled_immcantation_devel.config @@ -4,7 +4,7 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. Use as follows: - * nextflow run nf-core/airrflow -profile test, + * nextflow run nf-core/airrflow -profile test_assembled_immcantation_devel, */ params { @@ -19,8 +19,8 @@ params { // Input data mode = 'assembled' input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-reveal/test_reveal_metadata.tsv' - imgtdb_base = '/usr/local/share/germlines/imgt' - igblast_base = '/usr/local/share/igblast' + imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' + igblast_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/igblast_base.zip' igphyml = '/usr/local/share/igphyml/src/igphyml' reassign = true diff --git a/conf/test_fetchimgt.config b/conf/test_fetchimgt.config index 9664c860..b4d020dd 100644 --- a/conf/test_fetchimgt.config +++ b/conf/test_fetchimgt.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test.tsv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test_airr.tsv' cprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/C_primers.fasta' vprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/V_primers.fasta' @@ -33,3 +33,12 @@ params { umi_start = 6 umi_position = 'R1' } + +process{ + withName:"DEFINE_CLONES*"{ + ext.args = ['outname':'', 'model':'hierarchical', + 'method':'nt', 'linkage':'single', + 
'outputby':'sample_id', 'min_n':10] + } +} + diff --git a/conf/test_full.config b/conf/test_full.config index 138e6edf..6e20a046 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,7 +15,7 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/metadata_pcr_umi_airr.tsv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/metadata_pcr_umi_airr_300.tsv' cprimers = 's3://nf-core-awsmegatests/airrflow/input_data/pcr_umi/cprimers.fasta' vprimers = 's3://nf-core-awsmegatests/airrflow/input_data/pcr_umi/vprimers.fasta' imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' diff --git a/conf/test_no_umi.config b/conf/test_no_umi.config index 03d7eb9e..a2bf1a3c 100644 --- a/conf/test_no_umi.config +++ b/conf/test_no_umi.config @@ -27,7 +27,7 @@ params { umi_length = 0 // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-no-umi/Metadata_test-no-umi.tsv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-no-umi/Metadata_test-no-umi_airr.tsv' cprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-no-umi/Greiff2014_CPrimers.fasta' vprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-no-umi/Greiff2014_VPrimers.fasta' imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' @@ -39,6 +39,11 @@ process { // When not using UMIs, set the coord parameter based on source (e.g., sra or illumina) withName: PRESTO_ASSEMBLEPAIRS_SANS_UMI { ext.args = '--rc tail --coord sra --maxerror 0.3' - } - + } + withName:"DEFINE_CLONES*"{ + ext.args = ['outname':'', 'model':'hierarchical', + 'method':'nt', 'linkage':'single', + 'outputby':'sample_id', 'min_n':10] 
+ } } + diff --git a/conf/test_nocluster.config b/conf/test_nocluster.config index 766ad1a1..e8dcf70f 100644 --- a/conf/test_nocluster.config +++ b/conf/test_nocluster.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test.tsv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test_airr.tsv' cprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/C_primers.fasta' vprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/V_primers.fasta' imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' @@ -36,3 +36,11 @@ params { umi_position = 'R1' cluster_sets = false } + +process{ + withName:"DEFINE_CLONES*"{ + ext.args = ['outname':'', 'model':'hierarchical', + 'method':'nt', 'linkage':'single', + 'outputby':'sample_id', 'min_n':10] + } +} diff --git a/conf/test_raw_immcantation_devel.config b/conf/test_raw_immcantation_devel.config new file mode 100644 index 00000000..27d638b9 --- /dev/null +++ b/conf/test_raw_immcantation_devel.config @@ -0,0 +1,54 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/airrflow -profile test_raw_immcantation_devel, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/Metadata_test_airr.tsv' + cprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/C_primers.fasta' + vprimers = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-bcr/V_primers.fasta' + + imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' + igblast_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/igblast_base.zip' + igphyml = '/usr/local/share/igphyml/src/igphyml' + + mode = 'fastq' + + library_generation_method = 'specific_pcr_umi' + cprimer_position = 'R1' + index_file = true + umi_length = 8 + umi_start = 6 + umi_position = 'R1' +} + +process{ + withLabel:immcantation{ + container = 'immcantation/suite:devel' + } + withName:"DEFINE_CLONES*"{ + ext.args = ['outname':'', 'model':'hierarchical', + 'method':'nt', 'linkage':'single', + 'outputby':'sample_id', 'min_n':10] + } +} + +env { + PYTHONNOUSERSITE = 0 +} diff --git a/conf/test_tcr.config b/conf/test_tcr.config index 3a8069b9..6fe6e315 100644 --- a/conf/test_tcr.config +++ b/conf/test_tcr.config @@ -29,10 +29,18 @@ params { // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-tcr/TCR_metadata.tsv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-tcr/TCR_metadata_airr.tsv' cprimers = 
'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-tcr/cprimers.fasta' race_linker = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-tcr/linker.fasta' imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' igblast_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/igblast_base.zip' } + +process{ + withName:"DEFINE_CLONES*"{ + ext.args = ['outname':'', 'model':'hierarchical', + 'method':'nt', 'linkage':'single', + 'outputby':'sample_id', 'min_n':10] + } +} diff --git a/docs/images/airrflow_workflow_overview.png b/docs/images/airrflow_workflow_overview.png new file mode 100644 index 00000000..17b8eac2 Binary files /dev/null and b/docs/images/airrflow_workflow_overview.png differ diff --git a/docs/images/airrflow_workflow_overview.svg b/docs/images/airrflow_workflow_overview.svg new file mode 100644 index 00000000..5a208af6 --- /dev/null +++ b/docs/images/airrflow_workflow_overview.svg @@ -0,0 +1,2743 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +. +CC-BY 4.0. 
Design originally by Zandra Fagernäs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + bcftools + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/images/bcellmagic-subway-TB.svg b/docs/images/bcellmagic-subway-TB.svg deleted file mode 100644 index f85f5df4..00000000 --- 
a/docs/images/bcellmagic-subway-TB.svg +++ /dev/null @@ -1,1023 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tsv - - - - Samplesheet - - - - - fa - - - - - - - fa - - - - cprimers - vprimers - - - - - - - - - - - - fastq - - - - - - - - - - - FastQC - - --umi_length > 0 - --cluster_sets - Assemblepairs - - Maskprimers - PairSeq - Annotate primers - Collapse duplicates - - - FastQC - Filter byquality - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Cluster sequencesby similarity - - - Add cluster tofasta header - - Build consensus - - PairSeq - - Assemblepairs - - - Collapse consensus_count - Annotate metadata - - At least 2 - - - - - - - - - - - - - diff --git a/docs/images/metro-map-airrflow.png b/docs/images/metro-map-airrflow.png new file mode 100644 index 00000000..9230e3ec Binary files /dev/null and b/docs/images/metro-map-airrflow.png differ diff --git a/docs/images/metro-map-airrflow.svg b/docs/images/metro-map-airrflow.svg new file mode 100644 index 00000000..1117253c --- /dev/null +++ b/docs/images/metro-map-airrflow.svg @@ -0,0 +1,5312 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/images/reveal-subway-LR.svg b/docs/images/reveal-subway-LR.svg deleted file mode 100644 index 5b9dd487..00000000 --- a/docs/images/reveal-subway-LR.svg +++ /dev/null @@ -1,1224 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tsv - - - - - - - tsv - - - - - - - fa - - - - - - zip - - - - Samplesheet - - - - MakeDb - Filter quality - Productive - mod3 junction - Add metadata - - - - - - Fetch databases - Convert - - - - - - - - - - - - - - - - - - - - - - - - IMGT - - - - - - - - - - - - - - - - - - - - - - Creategermlines - Single cell QC - Removechimeric - Detectcrosscontamination - Collapseduplicates - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Build lineage trees - - - - - Find threshold - - - - - - - - - - - - - - - Define clones - - - - - - - - 1 - - - - - 3 - - - - - 2 - - - - - - - - - diff --git a/docs/images/reveal-subway-TB.svg b/docs/images/reveal-subway-TB.svg deleted file mode 100644 index 21333d23..00000000 --- a/docs/images/reveal-subway-TB.svg +++ /dev/null @@ -1,1093 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tsv - - - - - - - tsv - - - 
- - - - fa - - - - - - - zip - - - - Samplesheet - - - - MakeDb - Filter quality - Productive - mod3 junction - Add metadata - - - - - - Create germlines - Single cell QC - Fetch databases - Remove chimeric - Detect cross-contamination - Collapse duplicates - - Build lineage trees - - - - Convert - AssignGenes - - - - - - - - - - - - - - - - - - - - - - - - - - IMGT - - - - - - - - - - - - - - - - - - --productive_only - - - - - - - - - - - - - - - --remove_chimeric - - - - - - --threshold auto - Find threshold - - - - - - - - - - - - - - - Define clones - - diff --git a/docs/usage.md b/docs/usage.md index f9d8ca4b..4bfe3035 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,7 +6,9 @@ ## Introduction -The airrflow pipeline allows processing bulk targeted BCR and TCR sequencing data from multiplex or RACE PCR protocols. It performs V(D)J assignment, clonotyping, lineage reconsctruction and repertoire analysis using the [Immcantation](https://immcantation.readthedocs.io/en/stable/) framework. +The airrflow pipeline allows processing BCR and TCR targeted sequencing data from bulk and single-cell sequencing protocols. It performs V(D)J assignment, clonotyping, lineage reconstruction and repertoire analysis using the [Immcantation](https://immcantation.readthedocs.io/en/stable/) framework. + +![nf-core/airrflow overview](images/airrflow_workflow_overview.png) ## Running the pipeline @@ -37,28 +39,32 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` -## AIRR fields support +## Input metadata ### Supported AIRR fields -nf-core/airrflow offers full support for the AIRR metadata fields. The minimum metadata fields that are needed by the pipeline are listed in the table below. Other metadata fields can be provided in the input samplesheet, which will be available for reporting and introducing comparisons among repertoires. 
+nf-core/airrflow offers full support for the [AIRR standards 1.4](https://docs.airr-community.org/en/stable/datarep/metadata.html) metadata annotation. The minimum metadata fields that are needed by the pipeline are listed in the table below. Other non-mandatory AIRR fields can be provided in the input samplesheet, which will be available for reporting and introducing comparisons among repertoires. -| AIRR field | Type | Name | Description | +| AIRR field | Type | Parameter Name | Description | | ------------------------- | ------------------ | ----------------------------- | ----------------------------------------------------- | -| sample_id | Samplesheet column | sample_id | Sample ID assigned by submitter, unique within study | -| subject_id | Samplesheet column | subject_id | Subject ID assigned by submitter, unique within study | -| species | Samplesheet column | species | Subject species | -| pcr_target_locus | Samplesheet column | pcr_target_locus | Designation of the target locus (IG or TR) | +| sample_id | Samplesheet column | | Sample ID assigned by submitter, unique within study | +| subject_id | Samplesheet column | | Subject ID assigned by submitter, unique within study | +| species | Samplesheet column | | Subject species | +| tissue | Samplesheet column | | Sample tissue | +| pcr_target_locus | Samplesheet column | | Designation of the target locus (IG or TR) | +| sex | Samplesheet column | | Subject sex | +| age | Samplesheet column | | Subject age | +| biomaterial_provider | Samplesheet column | | Name of sample biomaterial provider | | library_generation_method | Parameter | `--library_generation_method` | Generic type of library generation | -### Fastq input samplesheet +### Fastq input samplesheet (bulk) -The required input file is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species` and `pcr_target_locus` are required. 
An example samplesheet is: +The required input file for processing raw BCR or TCR bulk targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. An example samplesheet is: -| sample_id | filename_R1 | filename_R2 | filename_I1 | subject_id | species | pcr_target_locus | intervention | collection_time_point_relative | cell_subset | -| --------- | ------------------------------- | ------------------------------- | ------------------------------- | ---------- | ------- | ---------------- | -------------- | ------------------------------ | ------------ | -| sample01 | sample1_S8_L001_R1_001.fastq.gz | sample1_S8_L001_R2_001.fastq.gz | sample1_S8_L001_I1_001.fastq.gz | Subject02 | human | IG | Drug_treatment | Baseline | plasmablasts | -| sample02 | sample2_S8_L001_R1_001.fastq.gz | sample2_S8_L001_R2_001.fastq.gz | sample2_S8_L001_I1_001.fastq.gz | Subject02 | human | TR | Drug_treatment | Baseline | plasmablasts | +| sample_id | filename_R1 | filename_R2 | filename_I1 | subject_id | species | pcr_target_locus | tissue | sex | age | biomaterial_provider | single_cell | intervention | collection_time_point_relative | cell_subset | +| --------- | ------------------------------- | ------------------------------- | ------------------------------- | ---------- | ------- | ---------------- | ------ | ------ | --- | -------------------- | ----------- | -------------- | ------------------------------ | ------------ | +| sample01 | sample1_S8_L001_R1_001.fastq.gz | sample1_S8_L001_R2_001.fastq.gz | sample1_S8_L001_I1_001.fastq.gz | Subject02 | human | IG | blood | NA | 53 | sequencing_facility | FALSE | Drug_treatment | Baseline | plasmablasts | +| sample02 | sample2_S8_L001_R1_001.fastq.gz | sample2_S8_L001_R2_001.fastq.gz | sample2_S8_L001_I1_001.fastq.gz | Subject02 | human | TR | 
blood | female | 78 | sequencing_facility | FALSE | Drug_treatment | Baseline | plasmablasts | - sample_id: Sample ID assigned by submitter, unique within study. - filename_R1: path to fastq file with first mates of paired-end sequencing. @@ -66,7 +72,11 @@ The required input file is a sample sheet in TSV format (tab separated). The col - filename_I1 (optional): path to fastq with illumina index and UMI (unique molecular identifier) barcode. - subject_id: Subject ID assigned by submitter, unique within study. - species: species from which the sample was taken. Supported species are `human` and `mouse`. +- tissue: tissue from which the sample was taken. E.g. `blood`, `PBMC`, `brain`. - pcr_target_locus: Designation of the target locus (`IG` or `TR`). +- sex: Subject biological sex (`female`, `male`, etc.). +- age: Subject biological age. +- single_cell: TRUE or FALSE. Fastq input samplesheet only supports a FALSE value. Other optional columns can be added. These columns will be available when building the contrasts for the repertoire comparison report. It is recommended that these columns also follow the AIRR nomenclature. Examples are: @@ -78,6 +88,14 @@ Other optional columns can be added. These columns will be available when buildi The metadata specified in the input file will then be automatically annotated in a column with the same header in the tables generated by the pipeline. +### Assembled input samplesheet (bulk or single-cell) + +The required input file for processing assembled BCR or TCR sequencing data (bulk or single-cell) is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename`, `subject_id`, `species`, `tissue`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. 
+ +An example samplesheet is + + + ## Supported library generation methods (protocols) | Library generation methods (AIRR) | Description | Name in pipeline | Commercial protocols | diff --git a/modules/local/alakazam/alakazam_shazam_repertoires.nf b/modules/local/alakazam/alakazam_shazam_repertoires.nf index b062f0ce..ce3628d5 100644 --- a/modules/local/alakazam/alakazam_shazam_repertoires.nf +++ b/modules/local/alakazam/alakazam_shazam_repertoires.nf @@ -1,5 +1,5 @@ process ALAKAZAM_SHAZAM_REPERTOIRES { - tag "report" + tag "${meta.id}" label 'process_high' conda (params.enable_conda ? "conda-forge::r-base=4.1.2 bioconda::r-alakazam=1.2.0 bioconda::r-shazam=1.1.0 conda-forge::r-kableextra=1.3.4 conda-forge::r-knitr=1.33 conda-forge::r-stringr=1.4.0 conda-forge::r-dplyr=1.0.6 conda-forge::r-optparse=1.7.1" : null) @@ -8,7 +8,7 @@ process ALAKAZAM_SHAZAM_REPERTOIRES { 'quay.io/biocontainers/mulled-v2-7da73314bcc47157b442d16c3dcfbe81e75a404f:9bb35f8114dffcd97b3afb5de8587355aca16b66-0' }" input: - path(tab) // sequence tsv table in AIRR format + tuple val(meta), path(tab) // sequence tsv table in AIRR format path("Table_sequences.tsv") path(repertoire_report) path(css) diff --git a/modules/local/changeo/changeo_assigngenes.nf b/modules/local/changeo/changeo_assigngenes.nf index 9e193285..e7c7610a 100644 --- a/modules/local/changeo/changeo_assigngenes.nf +++ b/modules/local/changeo/changeo_assigngenes.nf @@ -2,6 +2,7 @@ process CHANGEO_ASSIGNGENES { tag "$meta.id" label 'process_low' label 'immcantation' + label 'changeo' conda (params.enable_conda ? "bioconda::changeo=1.2.0 bioconda::igblast=1.17.1 conda-forge::wget=1.20.1" : null) // Conda package container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/local/changeo/changeo_buildtrees.nf b/modules/local/changeo/changeo_buildtrees.nf index d4fce4fd..aa4cc961 100644 --- a/modules/local/changeo/changeo_buildtrees.nf +++ b/modules/local/changeo/changeo_buildtrees.nf @@ -2,6 +2,8 @@ process CHANGEO_BUILDTREES { tag "$meta.id" label 'process_medium' label 'immcantation' + label 'changeo' + conda (params.enable_conda ? "conda-forge::r-base=4.1.2 bioconda:r-alakazam=1.2.0 bioconda::changeo=1.2.0 bioconda::igphyml=1.1.3" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/changeo/changeo_convertdb_fasta.nf b/modules/local/changeo/changeo_convertdb_fasta.nf index c1d94965..3d1f8e99 100644 --- a/modules/local/changeo/changeo_convertdb_fasta.nf +++ b/modules/local/changeo/changeo_convertdb_fasta.nf @@ -2,6 +2,8 @@ process CHANGEO_CONVERTDB_FASTA { tag "$meta.id" label 'process_low' label 'immcantation' + label 'changeo' + conda (params.enable_conda ? "bioconda::changeo=1.2.0 bioconda::igblast=1.17.1" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/changeo/changeo_creategermlines.nf b/modules/local/changeo/changeo_creategermlines.nf index c26cccae..9a6acf41 100644 --- a/modules/local/changeo/changeo_creategermlines.nf +++ b/modules/local/changeo/changeo_creategermlines.nf @@ -2,6 +2,8 @@ process CHANGEO_CREATEGERMLINES { tag "$meta.id" label 'process_low' label 'immcantation' + label 'changeo' + conda (params.enable_conda ? "bioconda::changeo=1.2.0 bioconda::igblast=1.17.1" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/local/changeo/changeo_defineclones.nf b/modules/local/changeo/changeo_defineclones.nf index 63d6620b..884ba236 100644 --- a/modules/local/changeo/changeo_defineclones.nf +++ b/modules/local/changeo/changeo_defineclones.nf @@ -2,6 +2,8 @@ process CHANGEO_DEFINECLONES { tag "$meta.id" label 'process_medium' label 'immcantation' + label 'changeo' + conda (params.enable_conda ? "bioconda::changeo=1.2.0 bioconda::igblast=1.17.1" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/changeo/changeo_makedb.nf b/modules/local/changeo/changeo_makedb.nf index d874f5bc..36a627ad 100644 --- a/modules/local/changeo/changeo_makedb.nf +++ b/modules/local/changeo/changeo_makedb.nf @@ -2,6 +2,8 @@ process CHANGEO_MAKEDB { tag "$meta.id" label 'process_low' label 'immcantation' + label 'changeo' + conda (params.enable_conda ? "bioconda::changeo=1.2.0 bioconda::igblast=1.17.1" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/changeo/changeo_parsedb_select.nf b/modules/local/changeo/changeo_parsedb_select.nf index 32cf02b8..145c2f9a 100644 --- a/modules/local/changeo/changeo_parsedb_select.nf +++ b/modules/local/changeo/changeo_parsedb_select.nf @@ -2,6 +2,8 @@ process CHANGEO_PARSEDB_SELECT { tag "$meta.id" label 'process_low' label 'immcantation' + label 'changeo' + conda (params.enable_conda ? "bioconda::changeo=1.2.0 bioconda::igblast=1.17.1" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/local/changeo/changeo_parsedb_split.nf b/modules/local/changeo/changeo_parsedb_split.nf index 7b0a75a9..91a8ac9f 100644 --- a/modules/local/changeo/changeo_parsedb_split.nf +++ b/modules/local/changeo/changeo_parsedb_split.nf @@ -2,6 +2,8 @@ process CHANGEO_PARSEDB_SPLIT { tag "$meta.id" label 'process_low' label 'immcantation' + label 'changeo' + conda (params.enable_conda ? "bioconda::changeo=1.2.0 bioconda::igblast=1.17.1" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/enchantr/define_clones.nf b/modules/local/enchantr/define_clones.nf index c4e4bfaf..0fad973e 100644 --- a/modules/local/enchantr/define_clones.nf +++ b/modules/local/enchantr/define_clones.nf @@ -10,7 +10,7 @@ def asString (args) { return s } process DEFINE_CLONES { - tag 'all_reps' + tag "${meta.id}" label 'process_long_parallelized' label 'immcantation' @@ -21,8 +21,7 @@ process DEFINE_CLONES { 'quay.io/biocontainers/r-enchantr:0.0.3--r42hdfd78af_1' }" input: - //tuple val(meta), path(tabs) // sequence tsv in AIRR format - path(tabs) + tuple val(meta), path(tabs) // meta, sequence tsv in AIRR format val threshold path imgt_base @@ -35,19 +34,21 @@ process DEFINE_CLONES { script: def args = asString(task.ext.args) ?: '' + def thr = threshold.join("") """ Rscript -e "enchantr::enchantr_report('define_clones', \\ report_params=list('input'='${tabs.join(',')}', \\ 'imgt_db'='${imgt_base}', \\ 'cloneby'='${params.cloneby}', \\ - 'threshold'=${threshold}, \\ + 'force'=FALSE, \\ + 'threshold'=${thr}, \\ 'singlecell'='${params.singlecell}','outdir'=getwd(), \\ 'nproc'=${task.cpus},\\ - 'log'='all_reps_clone_command_log' ${args}))" + 'log'='${meta.id}_clone_command_log' ${args}))" echo "${task.process}": > versions.yml Rscript -e "cat(paste0(' enchantr: ',packageVersion('enchantr'),'\n'))" >> versions.yml - mv enchantr 'all_reps_clone_report' + mv enchantr '${meta.id}_clone_report' """ } diff 
--git a/modules/local/enchantr/dowser_lineages.nf b/modules/local/enchantr/dowser_lineages.nf index 844683cf..fad1a72e 100644 --- a/modules/local/enchantr/dowser_lineages.nf +++ b/modules/local/enchantr/dowser_lineages.nf @@ -11,7 +11,7 @@ def asString (args) { } process DOWSER_LINEAGES { - tag "$tabs" + tag "${meta.id}" label 'process_high' label 'process_long' @@ -26,7 +26,7 @@ process DOWSER_LINEAGES { input: //tuple val(meta), path(tabs) // sequence tsv in AIRR format - path(tabs) + tuple val(meta), path(tabs) output: path("*_command_log.txt"), emit: logs //process logs diff --git a/modules/local/enchantr/find_threshold.nf b/modules/local/enchantr/find_threshold.nf index 2fe6e91d..b0129668 100644 --- a/modules/local/enchantr/find_threshold.nf +++ b/modules/local/enchantr/find_threshold.nf @@ -6,7 +6,6 @@ process FIND_THRESHOLD { label 'immcantation' label 'enchantr' - cache 'lenient' conda (params.enable_conda ? "bioconda::r-enchantr=0.0.3" : null) diff --git a/modules/local/enchantr/single_cell_qc.nf b/modules/local/enchantr/single_cell_qc.nf index 2c3eb498..3147ad58 100644 --- a/modules/local/enchantr/single_cell_qc.nf +++ b/modules/local/enchantr/single_cell_qc.nf @@ -23,7 +23,7 @@ process SINGLE_CELL_QC { 'quay.io/biocontainers/r-enchantr:0.0.3--r42hdfd78af_1' }" input: - path tabs + path(tabs) output: path("*/*scqc-pass.tsv"), emit: tab // sequence tsv in AIRR format diff --git a/modules/local/presto/presto_parseheaders_metadata.nf b/modules/local/presto/presto_parseheaders_metadata.nf index 07004704..853b0e9b 100644 --- a/modules/local/presto/presto_parseheaders_metadata.nf +++ b/modules/local/presto/presto_parseheaders_metadata.nf @@ -18,7 +18,7 @@ process PRESTO_PARSEHEADERS_METADATA { script: def args = task.ext.args ?: '' """ - ParseHeaders.py add -s $reads -o "${reads.baseName}_reheader-pass.fastq" $args -u ${meta.id} ${meta.subject} ${meta.species} ${meta.locus} + ParseHeaders.py add -s $reads -o "${reads.baseName}_reheader-pass.fastq" $args -u 
${meta.id} ${meta.subject_id} ${meta.species} ${meta.locus} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 43d32eb6..6bb599e5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,15 +12,19 @@ params { // Input parameters input = null mode = "fastq" + miairr="$projectDir/assets/reveal/mapping_MiAIRR_BioSample_v1.3.1.tsv" + - // Databases options + // ---------------------------- + // database options + // ---------------------------- igblast_base = null imgtdb_base = null save_databases = true - // Bcellmagic specific options --------- - - // protocol options + // ---------------------------- + // sequencing protocol options + // ---------------------------- library_generation_method = null race_linker = null @@ -38,7 +42,9 @@ params { umi_length = -1 umi_start = 0 - // pRESTO options + // -------------------------- + // sequence assembly options + // -------------------------- filterseq_q = 20 primer_maxerror = 0.2 primer_mask_mode = 'cut' @@ -47,37 +53,48 @@ params { buildconsensus_maxgap = 0.5 cluster_sets = true - // vdj assignment options + // ----------------------- + // vdj annotation options + // ----------------------- productive_only = true reassign = true - // bulk germlines and filtering options + // ----------------------- + // bulk filtering options + // ----------------------- remove_chimeric = true detect_contamination = false + collapseby = 'sample_id' + + // ----------------------- + // clonal analysis options + // ----------------------- + cloneby = 'subject_id' + singlecell = 'single_cell' + clonal_threshold = 'auto' + skip_all_clones_report = false + + // tree lineage options + igphyml="/usr/local/bin/igphyml" + skip_lineage = false + // old bcellmagic options ---------------------- // Clustering parameters threshold_method = 'density' - // Downstream analysis + // ----------------------- + // reporting options + // ----------------------- skip_report = false - skip_lineage = false 
- - // Report report_rmd = "$projectDir/assets/repertoire_comparison.Rmd" report_css = "$projectDir/assets/nf-core_style.css" report_logo = "$projectDir/assets/nf-core-airrflow_logo_light.png" report_logo_img = "$projectDir/assets/nf-core-airrflow_logo_reports.png" - // ----------------------------------- - - // Reveal specific options ----------- - collapseby = 'sample_id' - cloneby = 'subject_id' - singlecell='single_cell' - clonal_threshold = 'auto' - miairr="$projectDir/assets/reveal/mapping_MiAIRR_BioSample_v1.3.1.tsv" - igphyml="/usr/local/bin/igphyml" + // ----------------------- + // generic nf-core options + // ----------------------- // References igenomes_base = 's3://ngi-igenomes/igenomes' @@ -205,7 +222,8 @@ profiles { test_tcr { includeConfig 'conf/test_tcr.config' } test_no_umi { includeConfig 'conf/test_no_umi.config' } test_assembled { includeConfig 'conf/test_assembled.config' } - test_immcantation_devel { includeConfig 'conf/test_immcantation_devel.config' } + test_raw_immcantation_devel { includeConfig 'conf/test_raw_immcantation_devel.config' } + test_assembled_immcantation_devel { includeConfig 'conf/test_assembled_immcantation_devel.config' } test_nocluster { includeConfig 'conf/test_nocluster.config' } test_fetchimgt { includeConfig 'conf/test_fetchimgt.config' } } diff --git a/subworkflows/local/assembled_input_check.nf b/subworkflows/local/assembled_input_check.nf index 8f4981a5..5bab29b7 100644 --- a/subworkflows/local/assembled_input_check.nf +++ b/subworkflows/local/assembled_input_check.nf @@ -14,21 +14,23 @@ workflow ASSEMBLED_INPUT_CHECK { main: // TODO: validate input should check that sample_ids are unique + VALIDATE_INPUT ( samplesheet, miairr, collapseby, cloneby ) //removed reassign - validated_input = VALIDATE_INPUT.out.validated_input - validated_input + ch_validated_input = VALIDATE_INPUT.out.validated_input + ch_validated_input .splitCsv(header: true, sep:'\t') .map { get_meta(it) } .branch { it -> fasta: it[0].filename =~ 
/[fasta|fa]$/ tsv: it[0].filename =~ /tsv$/ } - .set{ch_metadata} + .set{ ch_metadata } emit: ch_fasta = ch_metadata.fasta ch_tsv = ch_metadata.tsv - validated_input = validated_input + validated_input = ch_validated_input + versions = VALIDATE_INPUT.out.versions } // Function to map diff --git a/subworkflows/local/bulk_qc_and_filter.nf b/subworkflows/local/bulk_qc_and_filter.nf index 7c9ab70f..2eddf86b 100644 --- a/subworkflows/local/bulk_qc_and_filter.nf +++ b/subworkflows/local/bulk_qc_and_filter.nf @@ -58,9 +58,6 @@ workflow BULK_QC_AND_FILTER { .dump() COLLAPSE_DUPLICATES( - //ch_bulk_chimeric_pass - // .map{ it -> [ it[1] ] } - // .collect() ch_for_collapse ) diff --git a/subworkflows/local/clonal_analysis.nf b/subworkflows/local/clonal_analysis.nf index 944b92fa..bab8bbe3 100644 --- a/subworkflows/local/clonal_analysis.nf +++ b/subworkflows/local/clonal_analysis.nf @@ -1,5 +1,6 @@ include { FIND_THRESHOLD } from '../../modules/local/enchantr/find_threshold' -include { DEFINE_CLONES } from '../../modules/local/enchantr/define_clones' +include { DEFINE_CLONES as DEFINE_CLONES_COMPUTE } from '../../modules/local/enchantr/define_clones' +include { DEFINE_CLONES as DEFINE_CLONES_REPORT } from '../../modules/local/enchantr/define_clones' include { DOWSER_LINEAGES } from '../../modules/local/enchantr/dowser_lineages' workflow CLONAL_ANALYSIS { @@ -11,9 +12,15 @@ workflow CLONAL_ANALYSIS { main: ch_versions = Channel.empty() + if (params.clonal_threshold == "auto") { + + ch_find_threshold = ch_repertoire.map{ it -> it[1] } + .collect() + .dump(tag:'find_threshold') + FIND_THRESHOLD ( - ch_repertoire, + ch_find_threshold, ch_logo ) ch_threshold = FIND_THRESHOLD.out.mean_threshold @@ -31,22 +38,74 @@ workflow CLONAL_ANALYSIS { clone_threshold = params.clonal_threshold } - DEFINE_CLONES( - ch_repertoire, - clone_threshold, - ch_imgt + // prepare ch for define clones + ch_repertoire.map{ it -> [ it[0]."${params.cloneby}", + it[0].id, + it[0].subject_id, + it[0].species, 
+ it[0].single_cell, + it[0].locus, + it[1] ] } + .groupTuple() + .dump(tag:'cloneby') + .map{ get_meta_tabs(it) } + .dump(tag:'cloneby_after_map') + .set{ ch_define_clones } + + DEFINE_CLONES_COMPUTE( + ch_define_clones, + clone_threshold.collect(), + ch_imgt.collect() ) - ch_versions = ch_versions.mix(DEFINE_CLONES.out.versions) + ch_versions = ch_versions.mix(DEFINE_CLONES_COMPUTE.out.versions) + + // prepare ch for define clones all samples report + DEFINE_CLONES_COMPUTE.out.tab + .collect() + .map { it -> [ [id:'all_reps'], it ] } + .dump(tag: 'all_tabs_cloned') + .set{ch_all_repertoires_cloned} + + if (!params.skip_all_clones_report){ + DEFINE_CLONES_REPORT( + ch_all_repertoires_cloned, + clone_threshold.collect(), + ch_imgt.collect() + ) + } + + // prepare ch for dowser lineages + DEFINE_CLONES_COMPUTE.out.tab + .flatten() + .map { it -> [ [id: "${it.baseName}".replaceFirst("__clone-pass", "")], it ] } + .dump(tag: 'tab_cloned') + .set{ch_repertoires_cloned} if (!params.skip_lineage){ DOWSER_LINEAGES( - DEFINE_CLONES.out.tab - .flatten() + ch_repertoires_cloned ) ch_versions = ch_versions.mix(DOWSER_LINEAGES.out.versions) } emit: - repertoire = DEFINE_CLONES.out.tab + repertoire = ch_all_repertoires_cloned versions = ch_versions } + +// Function to map +def get_meta_tabs(arr) { + def meta = [:] + meta.id = [arr[0]].unique().join("") + meta.sample_ids = arr[1] + meta.subject_id = arr[2] + meta.species = arr[3] + meta.single_cell = arr[4].unique().join("") + meta.locus = arr[5].unique().join("") + + def array = [] + + array = [ meta, arr[6].flatten() ] + + return array +} diff --git a/subworkflows/local/fastq_input_check.nf b/subworkflows/local/fastq_input_check.nf index 3cf6a92b..9650e393 100644 --- a/subworkflows/local/fastq_input_check.nf +++ b/subworkflows/local/fastq_input_check.nf @@ -3,6 +3,7 @@ */ include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +//include { VALIDATE_INPUT } from '../../modules/local/enchantr/validate_input' 
workflow FASTQ_INPUT_CHECK { take: @@ -13,22 +14,38 @@ workflow FASTQ_INPUT_CHECK { .tsv .splitCsv ( header:true, sep:'\t' ) .map { create_fastq_channels(it) } - .set { reads } + .set { ch_reads } + // VALIDATE_INPUT( + // samplesheet, + // params.miairr, + // params.collapseby, + // params.cloneby + // ) + + // VALIDATE_INPUT.out.validated_input + // .splitCsv(header: true, sep:'\t') + // .map { get_meta(it) } + // .set{ ch_reads } emit: - reads // channel: [ val(meta), [ reads ] ] + reads = ch_reads // channel: [ val(meta), [ reads ] ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] samplesheet = SAMPLESHEET_CHECK.out.tsv // tsv metadata file } // Function to map def create_fastq_channels(LinkedHashMap col) { + def meta = [:] - meta.id = col.sample_id - meta.subject = col.subject_id - meta.locus = col.pcr_target_locus - meta.species = col.species - meta.single_cell = 'false' + + meta.id = col.sample_id + meta.subject_id = col.subject_id + meta.species = col.species + meta.collapseby_group = col."${params.collapseby}" + meta.cloneby_group = col."${params.cloneby}" + meta.filetype = "fastq" + meta.single_cell = col.single_cell.toLowerCase() + meta.locus = col.pcr_target_locus def array = [] if (!file(col.filename_R1).exists()) { diff --git a/subworkflows/local/merge_tables_wf.nf b/subworkflows/local/merge_tables_wf.nf index e0acfd1b..3e422a65 100644 --- a/subworkflows/local/merge_tables_wf.nf +++ b/subworkflows/local/merge_tables_wf.nf @@ -34,7 +34,7 @@ def get_meta_tabs(arr) { meta.id = arr[0] meta.samples = arr[1] meta.locus = arr[2].unique().join("") - meta.subject = arr[3].unique().join("") + meta.subject_id = arr[3].unique().join("") meta.species = arr[4].unique().join("") def array = [] diff --git a/subworkflows/local/sequence_assembly.nf b/subworkflows/local/sequence_assembly.nf index a741e34b..490a0e4d 100644 --- a/subworkflows/local/sequence_assembly.nf +++ b/subworkflows/local/sequence_assembly.nf @@ -42,7 +42,6 @@ include { 
ALAKAZAM_SHAZAM_REPERTOIRES } from '../../modules/local/alakazam/alaka // Local: Sub-workflows include { FASTQ_INPUT_CHECK } from '../../subworkflows/local/fastq_input_check' -include { MERGE_TABLES_WF } from '../../subworkflows/local/merge_tables_wf' include { PRESTO_UMI } from '../../subworkflows/local/presto_umi' include { PRESTO_SANS_UMI } from '../../subworkflows/local/presto_sans_umi' diff --git a/subworkflows/local/single_cell_qc_and_filtering.nf b/subworkflows/local/single_cell_qc_and_filtering.nf index f8d18d12..e139aa53 100644 --- a/subworkflows/local/single_cell_qc_and_filtering.nf +++ b/subworkflows/local/single_cell_qc_and_filtering.nf @@ -7,15 +7,37 @@ workflow SINGLE_CELL_QC_AND_FILTERING { main: ch_versions = Channel.empty() + repertoires + .dump(tag:"scqc-reps") + .map{ it -> [ it[0].id, + it[0] ] } + .set{ch_onlymeta} + + repertoires + .map { it -> it[1]} + .collect() + .dump(tag:'scqc-aftercollect') + .set{ch_repertoire_allsc} + SINGLE_CELL_QC( - repertoires - .map{ it -> [ it[1] ] } - .collect() + ch_repertoire_allsc ) + + SINGLE_CELL_QC.out.tab + .flatten() + .dump(tag:"scqc-output") + .map { it -> [ "${it.baseName}".replaceFirst("__scqc-pass", ""), it ] } + .dump(tag:"scqc-output-filename") + .set{ch_repertoire_after_scqc_with_sampleid} + // ch_file_sizes = ch_file_sizes.mix(SINGLE_CELL_QC.out.logs) ch_versions = ch_versions.mix(SINGLE_CELL_QC.out.versions.ifEmpty(null)) + ch_repertoire_after_scqc_withmeta = ch_onlymeta.join(ch_repertoire_after_scqc_with_sampleid) + .dump(tag:'scqc-out-joined-meta') + .map{ it -> [ it[1], it[2] ]} + emit: versions = ch_versions - repertoires = SINGLE_CELL_QC.out.tab -} + repertoires = ch_repertoire_after_scqc_withmeta +} \ No newline at end of file diff --git a/subworkflows/local/vdj_annotation.nf b/subworkflows/local/vdj_annotation.nf index 08a6d211..f9caa1d2 100644 --- a/subworkflows/local/vdj_annotation.nf +++ b/subworkflows/local/vdj_annotation.nf @@ -62,6 +62,8 @@ workflow VDJ_ANNOTATION { ch_versions = 
ch_versions.mix(FETCH_DATABASES.out.versions.ifEmpty(null)) } + ch_fasta.dump(tag:'input_assigngenes') + CHANGEO_ASSIGNGENES ( ch_fasta, ch_igblast.collect() diff --git a/workflows/airrflow.nf b/workflows/airrflow.nf index bbf433b4..f5f09fd7 100644 --- a/workflows/airrflow.nf +++ b/workflows/airrflow.nf @@ -112,6 +112,7 @@ workflow AIRRFLOW { params.miairr, params.collapseby, params.cloneby) + ch_versions = ch_versions.mix( ASSEMBLED_INPUT_CHECK.out.versions.ifEmpty(null) ) if (params.reassign) { CHANGEO_CONVERTDB_FASTA_FROM_AIRR( @@ -146,7 +147,7 @@ workflow AIRRFLOW { ch_fasta, ch_validated_samplesheet.collect() ) - ch_versions = ch_versions.mix( VDJ_ANNOTATION.out.versions. ifEmpty(null)) + ch_versions = ch_versions.mix( VDJ_ANNOTATION.out.versions.ifEmpty(null)) // Split bulk and single cell repertoires ch_repertoire_by_processing = VDJ_ANNOTATION.out.repertoire @@ -167,7 +168,6 @@ workflow AIRRFLOW { ch_versions = ch_versions.mix( BULK_QC_AND_FILTER.out.versions.ifEmpty(null)) ch_bulk_filtered = BULK_QC_AND_FILTER.out.repertoires - .map{it -> it[1]} .dump(tag: 'bulk_filt_out') // Single cell: QC and filtering @@ -182,9 +182,7 @@ workflow AIRRFLOW { // Mixing bulk and single cell channels for clonal analysis ch_repertoires_for_clones = ch_bulk_filtered .mix(SINGLE_CELL_QC_AND_FILTERING.out.repertoires) - .dump(tag: 'after mix') - .collect() - .dump(tag: 'after collect') + .dump(tag: 'sc bulk mix') // Clonal analysis CLONAL_ANALYSIS(