Merge pull request #258 from ggabernet/report-fix

Report fix and other bugfixes
nf-core · May 29, 2023 · b262243 · b262243
2 parents e5ccb67 + 7ee7461
commit b262243
Show file tree

Hide file tree

Showing 19 changed files with 148 additions and 183 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -48,11 +48,11 @@ jobs:
         NXF_VER:
           - "22.10.1"
           - "latest-everything"
-        profile: ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled", "test_igblast"]
+        profile: ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled"]
       fail-fast: false
     steps:
       - name: Check out pipeline code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
 
       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v1

diff --git a/.github/workflows/ci_immcantation.yml b/.github/workflows/ci_immcantation.yml
@@ -29,7 +29,7 @@ jobs:
       fail-fast: false
     steps:
       - name: Check out pipeline code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
 
       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v1

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
-## [3.0.1dev] - "Portus" Hotfix
+## [3.1] - "Portus" Hotfix
 
 ### `Added`
 
@@ -13,6 +13,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### `Fixed`
 
 - [#250](https://github.com/nf-core/airrflow/pull/250) Fixed log parsing with `removeprefix` instead of `lstrip`.
+- [#258](https://github.com/nf-core/airrflow/pull/258) Fixed plotly plots in the report sometimes not rendering.
+- [#258](https://github.com/nf-core/airrflow/pull/258) Removed direct call to IgBLAST in favor of a fix in Change-O.
+- [#258](https://github.com/nf-core/airrflow/pull/258) Added check for whitespaces in certain columns in samplesheet.
+- [#258](https://github.com/nf-core/airrflow/pull/258) Added missing Immcantation references in Airrflow report.
 
 ## [3.0] - 2023-03-20 "Portus"
 

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -16,10 +16,6 @@
 
   > Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu, fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics. 2018 Sept 1; 34(17):i884–i890. doi: 10.1093/bioinformatics/bty560.
 
-- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
-
-  > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
-
 - [pRESTO](https://doi.org/10.1093/bioinformatics/btu138)
 
   > Vander Heiden, J. A., Yaari, G., Uduman, M., Stern, J. N. H., O’Connor, K. C., Hafler, D. A., … Kleinstein, S. H. (2014). pRESTO: a toolkit for processing high-throughput sequencing raw reads of lymphocyte receptor repertoires. Bioinformatics, 30(13), 1930–1932.
@@ -32,9 +28,30 @@
 
   > Stern, J. N. H., Yaari, G., Vander Heiden, J. A., Church, G., Donahue, W. F., Hintzen, R. Q., … O’Connor, K. C. (2014). B cells populating the multiple sclerosis brain mature in the draining cervical lymph nodes. Science Translational Medicine, 6(248).
 
+- [SCOPer](https://doi.org/10.1093/bioinformatics/bty235)
+
+  > Nouri N, Kleinstein S (2018). “A spectral clustering-based method for identifying clones from high-throughput B cell repertoire sequencing data.” Bioinformatics, i341-i349.
+
+  > Nouri N, Kleinstein S (2020). “Somatic hypermutation analysis for improved identification of B cell clonal families from next-generation sequencing data.” PLOS Computational Biology, 16(6), e1007977.
+
+  > Gupta N, Adams K, Briggs A, Timberlake S, Vigneault F, Kleinstein S (2017). “Hierarchical clustering can identify B cell clones with high confidence in Ig repertoire sequencing data.” The Journal of Immunology, 2489-2499.
+
+- [Dowser](https://doi.org/10.1371/journal.pcbi.1009885)
+
+  > Hoehn K, Pybus O, Kleinstein S (2022). “Phylogenetic analysis of migration, differentiation, and class switching in B cells.” PLoS Computational Biology.
+
+- [IgPhyML](https://www.pnas.org/doi/10.1073/pnas.1906020116)
+
+  > Hoehn K, Vander Heiden J, Zhou J, Lunter G, Pybus O, Kleinstein S (2019). “Repertoire-wide phylogenetic models of B cell molecular evolution reveal evolutionary signatures of aging and vaccination.” PNAS.
+
 - [TIgGER](https://doi.org/10.1073/pnas.1417683112)
+
   > Gadala-Maria, D., Yaari, G., Uduman, M., & Kleinstein, S. H. (2015). Automated analysis of high-throughput B-cell sequencing data reveals a high frequency of novel immunoglobulin V gene segment alleles. Proceedings of the National Academy of Sciences, 112(8), 1–9.
 
+- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
+
+  > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
+
 ## Software packaging/containerisation tools
 
 - [Anaconda](https://anaconda.com)

diff --git a/assets/repertoire_comparison.Rmd b/assets/repertoire_comparison.Rmd
@@ -10,10 +10,6 @@ output:
     css: ./nf-core_style.css
     highlight: pygments
     pdf_document: true
-    pandoc_args: [
-      "+RTS", "-K4000m",
-      "-RTS"
-    ]
   html_notebook:
     toc: yes
 ---
@@ -46,6 +42,9 @@ dir.create(outdir)
 seq_dir <- paste(outdir, "Sequence_numbers_summary", sep="/")
 dir.create(seq_dir)
 
+# Read data from the work directory
+datadir <- "."
+
 ```
 
 # Number of sequences
@@ -143,7 +142,7 @@ ggplotly(seqs_plot_assembled)
 ```{r read_data, include=FALSE}
 # paths to the files are found in the first column of all_repertoires_report_tabs.txt,
 # in the current folder
-all_files <- read.delim("all_repertoires_report_tabs.txt", header=F, sep="\t")[[1]]
+all_files <- system(paste0("find '",datadir,"' -name '*clone-pass.tsv'"), intern=T)
 
 diversity_dir <- paste(outdir, "Diversity", sep="/")
 abundance_dir <- paste(outdir, "Abundance", sep="/")
@@ -199,13 +198,14 @@ p_ca <- ggplot(abund@abundance, aes(x = rank, y = p)) +
                     ymax = upper, fill = sample_id), alpha = 0.4) +
     geom_line(aes(color = sample_id)) +
     ggtitle(abund_main) +
-    xlab("log(Rank)") + ylab("Abundance") +
+    xlab('log(Rank)') + ylab('Abundance') +
     scale_x_log10(limits = NULL,
                 breaks = scales::trans_breaks("log10", function(x) 10^x),
                 labels = scales::trans_format("log10", scales::math_format(10^.x))) +
-    scale_y_continuous(labels = scales::percent) #+
-    #facet_grid(cols = vars(locus), rows = vars(subject_id), scales="free", drop = T)
-ggplotly(p_ca)
+    scale_y_continuous(labels = scales::percent)
+
+p_ca
+
 ```
 
 ```{r plot_abundance, include = FALSE}
@@ -214,6 +214,7 @@ ggsave(plot=p_ca, filename = paste0(abundance_dir,"/Clonal_abundance_subject.png
 write.table(abund@abundance, file = paste0(abundance_dir, "/Clonal_abundance_data_subject.tsv"), sep="\t", quote = F, row.names = F)
 ```
 
+
 # Clonal diversity
 
 The clonal diversity $D$ of the repertoire was calculated according to the general formula of Hill Diversity
@@ -268,9 +269,9 @@ div_p <- ggplot(sample_div@diversity, aes(x = q, y = d, group=sample_id)) +
                     fill = sample_id), alpha = 0.4) +
     geom_line(aes(color = sample_id)) +
     xlab("q") + ylab("Diversity(q)") +
-    ggtitle(sample_main) #+
-    #facet_grid(cols=vars(locus), rows=vars(subject_id))
-ggplotly(div_p)
+    ggtitle(sample_main)
+
+div_p
 ```
 ```{r plot_diversity, include = FALSE}
 ggsave(plot=div_p, filename=paste0(diversity_dir,"/Diversity_patient_grid.png"), device="png", width = 25, height = 10, units="cm")
@@ -299,7 +300,6 @@ g2 <- ggplot(family, aes(x=gene, y=clone_freq, fill=sample_id, group=sample_id))
     theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
     ylab("Frequency") +
     xlab("") +
-    #facet_grid(cols=vars(locus), rows=vars(subject_id)) +
     theme(legend.position = "right")
 ggplotly(g2)
 
@@ -328,9 +328,7 @@ g2 <- ggplot(family, aes(x=gene, y=clone_freq, fill=sample_id, group=sample_id))
     ggtitle("V Gene Family Usage") +
     theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
     ylab("Frequency") +
-    xlab("") #+
-    #facet_wrap(vars(subject_id,locus), scales="free_x", ncol = 1) +
-    #theme(legend.position = "none")
+    xlab("")
 ggplotly(g2)
 ggsave(filename = paste0(vfamily_dir, "/V_gene_distribution_by_clone_patient.pdf"), plot = g2, width = 20, height = 40, units = "cm")
 ggsave(filename = paste0(vfamily_dir, "/V_gene_distribution_by_clone_patient.png"), plot = g2, width = 20, height = 40, units = "cm")
@@ -352,9 +350,7 @@ g2 <- ggplot(family, aes(x=gene, y=seq_freq, fill=sample_id, group=sample_id)) +
     ggtitle("V Gene Usage") +
     theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
     ylab("Frequency") +
-    xlab("") #+
-    #facet_wrap(vars(subject_id,locus), scales="free_x", ncol = 1) +
-    #theme(legend.position = "none")
+    xlab("")
 ggplotly(g2)
 ggsave(filename = paste0(vfamily_dir, "/V_gene_distribution_by_sequence_patient.pdf"), plot = g2, width = 20, height = 40, units = "cm")
 ggsave(filename = paste0(vfamily_dir, "/V_gene_distribution_by_sequence_patient.png"), plot = g2, width = 20, height = 40, units = "cm")
@@ -366,16 +362,6 @@ write.table(family, file = paste0(vfamily_dir, "/V_gene_distribution_by_sequence
 
 If you use nf-core/airrflow for your analysis, please cite it using the following DOI: [10.5281/zenodo.3607408](https://doi.org/10.5281/zenodo.3607408)
 
-Please also cite the `nf-core` and `Nextflow` publications:
-
-- [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/)
-
-  > Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031.
-
-- [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/)
-
-  > Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311.
-
 In addition, citations for the tools and data used in this pipeline are as follows:
 
 - [pRESTO](https://doi.org/10.1093/bioinformatics/btu138)
@@ -386,12 +372,40 @@ In addition, citations for the tools and data used in this pipeline are as follo
 
   > Gupta, N. T., Vander Heiden, J. A., Uduman, M., Gadala-Maria, D., Yaari, G., & Kleinstein, S. H. (2015). Change-O: a toolkit for analyzing large-scale B cell immunoglobulin repertoire sequencing data: Table 1. Bioinformatics, 31(20), 3356–3358.
 
+- [IgBLAST](https://doi.org/10.1093/nar/gkt382)
+
+  > Ye, J., Ma, N., Madden, T. L., & Ostell, J. M. (2013). IgBLAST: An immunoglobulin variable domain sequence analysis tool. Nucleic Acids Research, 41(Web Server issue), W34.
+
 - [Alakazam](https://doi.org/10.1126/scitranslmed.3008879)
 
-  > Stern, J. N. H., Yaari, G., Vander Heiden, J. A., Church, G., Donahue, W. F., Hintzen, R. Q., … O’Connor, K. C. (2014). B cells populating the multiple sclerosis brain mature in the draining cervical lymph nodes. Science Translational Medicine, 6(248).
+  > Stern, J. N. H., Yaari, G., Vander Heiden, J. A., Church, G., Donahue, W. F., Hintzen, R. Q., … O’Connor, K. C. (2014). B cells populating the multiple sclerosis brain mature in the draining cervical lymph nodes. Science Translational Medicine, 6(248), 248ra107.
+
+- [SCOPer](https://doi.org/10.1093/bioinformatics/bty235)
+
+  > Nouri N, Kleinstein S (2018). “A spectral clustering-based method for identifying clones from high-throughput B cell repertoire sequencing data.” Bioinformatics, i341-i349.
+
+  > Nouri N, Kleinstein S (2020). “Somatic hypermutation analysis for improved identification of B cell clonal families from next-generation sequencing data.” PLOS Computational Biology, 16(6), e1007977.
+
+  > Gupta N, Adams K, Briggs A, Timberlake S, Vigneault F, Kleinstein S (2017). “Hierarchical clustering can identify B cell clones with high confidence in Ig repertoire sequencing data.” The Journal of Immunology, 2489-2499.
+
+- [Dowser](https://doi.org/10.1371/journal.pcbi.1009885)
+
+  > Hoehn K, Pybus O, Kleinstein S (2022). “Phylogenetic analysis of migration, differentiation, and class switching in B cells.” PLoS Computational Biology.
+
+- [IgPhyML](https://www.pnas.org/doi/10.1073/pnas.1906020116)
+
+  > Hoehn K, Vander Heiden J, Zhou J, Lunter G, Pybus O, Kleinstein S (2019). “Repertoire-wide phylogenetic models of B cell molecular evolution reveal evolutionary signatures of aging and vaccination.” PNAS.
+
+- [TIgGER](https://doi.org/10.1073/pnas.1417683112)
+
+  > Gadala-Maria, D., Yaari, G., Uduman, M., & Kleinstein, S. H. (2015). Automated analysis of high-throughput B-cell sequencing data reveals a high frequency of novel immunoglobulin V gene segment alleles. Proceedings of the National Academy of Sciences, 112(8), 1–9.
 
 - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
 
+- [Fastp](https://doi.org/10.1093/bioinformatics/bty560)
+
+  > Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu, fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics. 2018 Sept 1; 34(17):i884–i890.
+
 - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
 
   > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -7,6 +7,7 @@
 import errno
 import argparse
 import pandas as pd
+import re
 
 
 def parse_args(args=None):
@@ -51,8 +52,8 @@ def check_samplesheet(file_in):
     sample_run_dict = {}
     with open(file_in, "r") as fin:
         ## Check that required columns are present
-        MIN_COLS = 7
-        REQUIRED_COLUMNS = [
+        min_cols = 7
+        required_columns = [
             "sample_id",
             "filename_R1",
             "filename_R2",
@@ -65,12 +66,21 @@ def check_samplesheet(file_in):
             "biomaterial_provider",
             "age",
         ]
+        no_whitespaces = [
+            "sample_id",
+            "filename_R1",
+            "filename_R2",
+            "subject_id",
+            "species",
+            "pcr_target_locus",
+            "tissue",
+        ]
         header = [x.strip('"') for x in fin.readline().strip().split("\t")]
-        for col in REQUIRED_COLUMNS:
+        for col in required_columns:
             if col not in header:
                 print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
                 print("Header is missing column {}".format(col))
-                print("Header must contain columns {}".format("\t".join(REQUIRED_COLUMNS)))
+                print("Header must contain columns {}".format("\t".join(required_columns)))
                 sys.exit(1)
 
         ## Check that rows have the same fields as header, and at least the compulsory ones are provided
@@ -85,9 +95,9 @@ def check_samplesheet(file_in):
                     line,
                 )
             num_cols = len([x for x in lspl if x])
-            if num_cols < MIN_COLS:
+            if num_cols < min_cols:
                 print_error(
-                    "Invalid number of populated columns (should be {})!".format(MIN_COLS),
+                    "Invalid number of populated columns (should be {})!".format(min_cols),
                     "Line",
                     line,
                 )
@@ -119,6 +129,16 @@ def check_samplesheet(file_in):
                     "The same subject_id cannot belong to different species! Check input file columns 'subject_id' and 'species'."
                 )
 
+        ## Check that values in the no_whitespaces columns do not contain any whitespace
+        for col in no_whitespaces:
+            values = tab[col].tolist()
+            if any([re.search(r"\s+", s) for s in values]):
+                print_error(
+                    "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
+                        col, no_whitespaces
+                    )
+                )
+
 
 def main(args=None):
     args = parse_args(args)

diff --git a/bin/execute_report.R b/bin/execute_report.R
@@ -4,13 +4,19 @@
 library(rmarkdown)
 library(optparse)
 
+
 option_list = list(
     make_option(c("-r", "--report_file"), type="character", default=NULL, help="report rmarkdown file", metavar="character")
 )
 
+
 opt_parser = OptionParser(option_list=option_list)
 opt = parse_args(opt_parser)
 
+
+#Set pandoc stack size memory
+options(pandoc.stack.size="4000m")
+
 wd=getwd()
 
 rmarkdown::render(opt$report_file, output_file = "Airrflow_report.html", knit_root_dir = wd, output_dir = wd)
diff --git a/conf/modules.config b/conf/modules.config
@@ -299,15 +299,6 @@ process {
         ext.args = '--format blast'
     }
 
-    withName: IGBLAST_ASSIGNGENES {
-        publishDir = [
-            path: { "${params.outdir}/vdj_annotation/01-assign-genes/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-        ext.args = "-domain_system imgt -ig_seqtype Ig -outfmt 19 -show_translation"
-    }
-
     withName: CHANGEO_MAKEDB {
         publishDir = [
             path: { "${params.outdir}/vdj_annotation/02-make-db/${meta.id}" },
@@ -450,16 +441,6 @@ process {
         ext.args = ['build':'igphyml']
     }
 
-//    withName: CHANGEO_PARSEDB_SELECT {
-//        publishDir = [
-//            path: { "${params.outdir}/changeo/04-parsedb-select/${meta.id}" },
-//            mode: params.publish_dir_mode,
-//            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-//        ]
-//        ext.args = '-f v_call j_call -u "IG[HLK]" --regex --logic all'
-//        ext.args2 = '-f v_call j_call -u "TR" --regex --logic all'
-//    }
-
     // -------------------------------
     // Reports
     // -------------------------------