From 91f35cfbf876c2f1b22fbbe8e29833d52d3a7b49 Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Wed, 10 Apr 2024 10:59:02 -0400 Subject: [PATCH 1/7] General: Add WDL translations for pipeline and salmon-quant for testing (cherry picked from commit 3acdf937c06a5f78c33977348909606672f60cc3) --- pipeline.wdl | 31 ++++++ steps/salmon-quantification.wdl | 170 ++++++++++++++++++++++++++++++++ 2 files changed, 201 insertions(+) create mode 100644 pipeline.wdl create mode 100644 steps/salmon-quantification.wdl diff --git a/pipeline.wdl b/pipeline.wdl new file mode 100644 index 0000000..cd0d9f0 --- /dev/null +++ b/pipeline.wdl @@ -0,0 +1,31 @@ +## Use double '#' for workflow-level comments +## This workflow implements a one-task workflow + +# write the WDL version number 'version 1.0' -- 1 +# possible to write 'WDL development' as a version number as well +version development + +# create a workflow named 'HelloWorld' -- 2 +import "./steps/salmon-quantification.wdl" as SalmonQuantification +workflow SalmonRNAseq { + input { + Array[Directory] fastq_dir + Directory? img_dir + Directory? metadata_dir + String assay + Int threads + Int? expected_cell_count + Boolean? keep_all_barcodes + } + + call SalmonQuantification.SalmonQuantification { + input: + fastq_dir = fastq_dir, + img_dir = img_dir, + metadata_dir = metadata_dir, + assay = assay, + threads = threads, + expected_cell_count = expected_cell_count, + keep_all_barcodes = keep_all_barcodes + } +} diff --git a/steps/salmon-quantification.wdl b/steps/salmon-quantification.wdl new file mode 100644 index 0000000..d12d340 --- /dev/null +++ b/steps/salmon-quantification.wdl @@ -0,0 +1,170 @@ +version development + +workflow SalmonQuantification { + input { + Array[Directory] fastq_dir + Directory? img_dir + Directory? metadata_dir + String assay + Int threads + Int? expected_cell_count + Boolean? 
keep_all_barcodes + } + + output { + Directory salmon_output = Salmon.output_dir + File count_matrix_h5ad = AnnotateCells.annotated_h5ad_file + File? raw_count_matrix = AlevinToAnndata.raw_expr_h5ad + File genome_build_json = AlevinToAnndata.genome_build_json + } + + call AdjustBarcodes{ + input: + assay = assay, + fastq_dir = fastq_dir + } + + call TrimReads { + input: + assay = assay, + adj_fastq_dir = AdjustBarcodes.adj_fastq_dir, + orig_fastq_dirs = fastq_dir, + threads = threads + } + + call Salmon { + input: + orig_fastq_dirs = fastq_dir, + trimmed_fastq_dir = TrimReads.trimmed_fastq_dir, + assay = assay, + threads = threads, + expected_cell_count = expected_cell_count, + keep_all_barcodes = keep_all_barcodes + } + + call AlevinToAnndata { + input: + assay = assay, + alevin_dir = Salmon.output_dir + } + + call AnnotateCells { + input: + assay = assay, + orig_fastq_dirs = fastq_dir, + h5ad_file = AlevinToAnndata.expr_h5ad, + img_dir = img_dir, + metadata_dir = metadata_dir, + metadata_json = AdjustBarcodes.metadata_json + } +} + +task AdjustBarcodes { + input { + String assay + Array[Directory] fastq_dir + } + + output { + Directory adj_fastq_dir = "adj_fastq" + File? metadata_json = "metadata.json" + } + + command { + /opt/adjust_barcodes.py ~{assay} directory ~{sep(" ", fastq_dir)} + } + + runtime { + container: "hubmap/scrna-barcode-adj:latest" + } +} + +task TrimReads { + input { + String assay + Directory adj_fastq_dir + Array[Directory] orig_fastq_dirs + Int threads + } + + output { + Directory trimmed_fastq_dir = "trimmed" + } + + runtime { + container: "hubmap/scrna-trim-reads:latest" + } + + command { + /opt/trim_reads.py ~{assay} ~{adj_fastq_dir} ~{sep(" ", orig_fastq_dirs)} + } +} + +task Salmon { + input { + String assay + Directory trimmed_fastq_dir + Array[Directory] orig_fastq_dirs + Int threads + Int? expected_cell_count + Boolean? 
keep_all_barcodes + } + + output { + Directory output_dir = "salmon_out" + } + + runtime { + container: "hubmap/salmon-grch38:latest" + } + + command { + /opt/salmon_wrapper.py ~{assay} ~{trimmed_fastq_dir} ~{sep(" ", orig_fastq_dirs)} --threads ~{threads} ~{if defined(expected_cell_count) then "--expected-cell-count " + expected_cell_count else ""} ~{if select_first([keep_all_barcodes, false]) then "--keep-all-barcodes" else ""} + } +} + +task AlevinToAnndata { + input { + String assay + Directory alevin_dir + } + + output { + File? raw_expr_h5ad = "raw_expr.h5ad" + File expr_h5ad = "expr.h5ad" + File genome_build_json = "genome_build.json" + } + + runtime { + container: "hubmap/scrna-analysis:latest" + } + + # Need to fix this command + command { + /opt/alevin_to_anndata.py ~{assay} ~{alevin_dir} + } +} + +task AnnotateCells { + input { + String assay + File h5ad_file + Array[Directory] orig_fastq_dirs + Directory? img_dir + Directory? metadata_dir + File? metadata_json + } + + output { + File annotated_h5ad_file = "expr.h5ad" + } + + runtime { + container: "hubmap/scrna-analysis:latest" + } + + # Need to fix this command + command { + /opt/annotate_cells.py + } +} \ No newline at end of file From d16afc5856ae1407024aa7e4876020e2f7bc0c24 Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Wed, 10 Apr 2024 13:13:27 -0400 Subject: [PATCH 2/7] General: Adjust AnnotateCells task --- steps/salmon-quantification.wdl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/steps/salmon-quantification.wdl b/steps/salmon-quantification.wdl index d12d340..9686e81 100644 --- a/steps/salmon-quantification.wdl +++ b/steps/salmon-quantification.wdl @@ -139,7 +139,6 @@ task AlevinToAnndata { container: "hubmap/scrna-analysis:latest" } - # Need to fix this command command { /opt/alevin_to_anndata.py ~{assay} ~{alevin_dir} } @@ -163,8 +162,7 @@ task AnnotateCells { container: "hubmap/scrna-analysis:latest" } - # Need to fix this command command { - 
/opt/annotate_cells.py + /opt/annotate_cells.py ~{assay} ~{h5ad_file} ~{sep(" ", orig_fastq_dirs)} ~{if defined(img_dir) then "--img_dir " + img_dir else ""} ~{if defined(metadata_dir) then "--metadata_dir " + metadata_dir else ""} ~{if defined(metadata_json) then "--metadata_json " + metadata_json else ""} } } \ No newline at end of file From f63ff90e2e06fd34b38a968442d68cecda6abf48 Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Wed, 10 Apr 2024 13:45:40 -0400 Subject: [PATCH 3/7] General: Add more tasks for testing --- pipeline.wdl | 34 +++++++++++++++++++++++++++++++++- steps/fastqc.wdl | 20 ++++++++++++++++++++ steps/scanpy-analysis.wdl | 26 ++++++++++++++++++++++++++ steps/scvelo-analysis.wdl | 21 +++++++++++++++++++++ steps/squidpy-analysis.wdl | 27 +++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 steps/fastqc.wdl create mode 100644 steps/scanpy-analysis.wdl create mode 100644 steps/scvelo-analysis.wdl create mode 100644 steps/squidpy-analysis.wdl diff --git a/pipeline.wdl b/pipeline.wdl index cd0d9f0..be0e36c 100644 --- a/pipeline.wdl +++ b/pipeline.wdl @@ -7,6 +7,11 @@ version development # create a workflow named 'HelloWorld' -- 2 import "./steps/salmon-quantification.wdl" as SalmonQuantification +import "./steps/fastqc.wdl" as FastQC +import "./steps/scanpy-analysis.wdl" as ScanPyAnalysis +import "./steps/scvelo-analysis.wdl" as ScVeloAnalysis +import "./steps/squidpy-analysis.wdl" as SquidPyAnalysis + workflow SalmonRNAseq { input { Array[Directory] fastq_dir @@ -28,4 +33,31 @@ workflow SalmonRNAseq { expected_cell_count = expected_cell_count, keep_all_barcodes = keep_all_barcodes } -} + + scatter (fastq in fastq_dir) { + call FastQC.FastQC { + input: + fastq_dir = fastq, + threads = threads + } + } + + call ScanPyAnalysis.ScanPyAnalysis { + input: + assay = assay, + h5ad_file = SalmonQuantification.SalmonQuantification.count_matrix_h5ad + } + + call ScVeloAnalysis.ScVeloAnalysis { + input: + 
spliced_h5ad_file = SalmonQuantification.SalmonQuantification.count_matrix_h5ad, + assay_name = assay + } + + call SquidPyAnalysis.SquidPyAnalysis { + input: + assay = assay, + h5ad_file = ScanPyAnalysis.ScanPyAnalysis.filtered_data_h5ad, + img_dir = img_dir + } +} \ No newline at end of file diff --git a/steps/fastqc.wdl b/steps/fastqc.wdl new file mode 100644 index 0000000..dc70aa1 --- /dev/null +++ b/steps/fastqc.wdl @@ -0,0 +1,20 @@ +version development + +task FastQC { + input { + Directory fastq_dir + Int threads + } + + output { + Directory fastqc_dir = "fastqc_output" + } + + runtime { + container: "hubmap/scrna-analysis:latest" + } + + command { + /opt/fastqc_wrapper.py ~{fastq_dir} ~{threads} + } +} \ No newline at end of file diff --git a/steps/scanpy-analysis.wdl b/steps/scanpy-analysis.wdl new file mode 100644 index 0000000..2bba5bb --- /dev/null +++ b/steps/scanpy-analysis.wdl @@ -0,0 +1,26 @@ +version development + +task ScanPyAnalysis { + input { + String assay + File h5ad_file + } + + output { + File filtered_data_h5ad = "secondary_analysis.h5ad" + File dispersion_plot = "dispersion_plot.pdf" + File umap_plot = "umap_by_leiden_cluster.pdf" + File? spatial_plot = "spatial_pos_by_leiden_cluster.pdf" + File umap_density_plot = "umap_embedding_density.pdf" + File marker_gene_plot_t_test = "marker_genes_by_cluster_t_test.pdf" + File marker_gene_plot_logreg = "marker_genes_by_cluster_logreg.pdf" + } + + runtime { + container: "hubmap/scrna-analysis:latest" + } + + command { + /opt/scanpy_entry_point.py ~{assay} ~{h5ad_file} + } +} \ No newline at end of file diff --git a/steps/scvelo-analysis.wdl b/steps/scvelo-analysis.wdl new file mode 100644 index 0000000..c683322 --- /dev/null +++ b/steps/scvelo-analysis.wdl @@ -0,0 +1,21 @@ +version development + +task ScVeloAnalysis { + input { + File spliced_h5ad_file + String assay_name + } + + output { + File? annotated_h5ad_file = "scvelo_annotated.h5ad" + File? 
embedding_grid_plot = "scvelo_embedding_grid.pdf" + } + + runtime { + container: "hubmap/scrna-analysis:latest" + } + + command { + /opt/scvelo_analysis.py ~{spliced_h5ad_file} ~{assay_name} + } +} \ No newline at end of file diff --git a/steps/squidpy-analysis.wdl b/steps/squidpy-analysis.wdl new file mode 100644 index 0000000..2057739 --- /dev/null +++ b/steps/squidpy-analysis.wdl @@ -0,0 +1,27 @@ +version development + +task SquidPyAnalysis { + input { + String assay + File h5ad_file + Directory? img_dir + } + + output { + File? squidpy_annotated_h5ad = "squidpy_annotated.h5ad" + File? neighborhood_enrichment_plot = "neighborhood_enrichment.pdf" + File? co_occurrence_plot = "co_occurrence.pdf" + File? spatial_plot = "spatial_scatter.pdf" + File? interaction_matrix_plot = "interaction_matrix.pdf" + File? centrality_scores_plot = "centrality_scores.pdf" + File? ripley_plot = "ripley.pdf" + } + + runtime { + container: "hubmap/squidpy-analysis:latest" + } + + command { + /opt/squidpy_entry_point.py ~{assay} ~{h5ad_file} ~{img_dir} + } +} \ No newline at end of file From ba6c5d1e9335b1ceac6eb968905972f5c57f1bb5 Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Wed, 10 Apr 2024 13:48:51 -0400 Subject: [PATCH 4/7] General: Use "as" for calls --- pipeline.wdl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pipeline.wdl b/pipeline.wdl index be0e36c..ad11a7a 100644 --- a/pipeline.wdl +++ b/pipeline.wdl @@ -23,7 +23,7 @@ workflow SalmonRNAseq { Boolean? 
keep_all_barcodes } - call SalmonQuantification.SalmonQuantification { + call SalmonQuantification.SalmonQuantification as SalmonQuantificationCall { input: fastq_dir = fastq_dir, img_dir = img_dir, @@ -35,29 +35,29 @@ workflow SalmonRNAseq { } scatter (fastq in fastq_dir) { - call FastQC.FastQC { + call FastQC.FastQC as FastQCCall { input: fastq_dir = fastq, threads = threads } } - call ScanPyAnalysis.ScanPyAnalysis { + call ScanPyAnalysis.ScanPyAnalysis as ScanPyAnalysisCall { input: assay = assay, - h5ad_file = SalmonQuantification.SalmonQuantification.count_matrix_h5ad + h5ad_file = SalmonQuantificationCall.count_matrix_h5ad } - call ScVeloAnalysis.ScVeloAnalysis { + call ScVeloAnalysis.ScVeloAnalysis as ScVeloAnalysisCall { input: - spliced_h5ad_file = SalmonQuantification.SalmonQuantification.count_matrix_h5ad, + spliced_h5ad_file = SalmonQuantificationCall.count_matrix_h5ad, assay_name = assay } - call SquidPyAnalysis.SquidPyAnalysis { + call SquidPyAnalysis.SquidPyAnalysis as SquidPyAnalysisCall { input: assay = assay, - h5ad_file = ScanPyAnalysis.ScanPyAnalysis.filtered_data_h5ad, + h5ad_file = ScanPyAnalysisCall.filtered_data_h5ad, img_dir = img_dir } } \ No newline at end of file From 6e1d9ba699473adef263aad60196d2d1db31070c Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Wed, 10 Apr 2024 14:00:00 -0400 Subject: [PATCH 5/7] General: Finalize main pipeline.wdl --- pipeline.wdl | 9 +++++++++ steps/compute-qc-metrics.wdl | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 steps/compute-qc-metrics.wdl diff --git a/pipeline.wdl b/pipeline.wdl index ad11a7a..ac9dd2a 100644 --- a/pipeline.wdl +++ b/pipeline.wdl @@ -11,6 +11,7 @@ import "./steps/fastqc.wdl" as FastQC import "./steps/scanpy-analysis.wdl" as ScanPyAnalysis import "./steps/scvelo-analysis.wdl" as ScVeloAnalysis import "./steps/squidpy-analysis.wdl" as SquidPyAnalysis +import "./steps/compute-qc-metrics.wdl" as ComputeQCMetrics workflow SalmonRNAseq { input { @@ 
-60,4 +61,12 @@ workflow SalmonRNAseq { h5ad_file = ScanPyAnalysisCall.filtered_data_h5ad, img_dir = img_dir } + + call ComputeQCMetrics.ComputeQCMetrics as ComputeQCMetricsCall { + input: + assay = assay, + h5ad_primary = SalmonQuantificationCall.count_matrix_h5ad, + h5ad_secondary = ScanPyAnalysisCall.filtered_data_h5ad, + salmon_dir = SalmonQuantificationCall.salmon_output + } } \ No newline at end of file diff --git a/steps/compute-qc-metrics.wdl b/steps/compute-qc-metrics.wdl new file mode 100644 index 0000000..b499f06 --- /dev/null +++ b/steps/compute-qc-metrics.wdl @@ -0,0 +1,23 @@ +version development + +task ComputeQCMetrics { + input { + String assay + File h5ad_primary + File h5ad_secondary + Directory salmon_dir + } + + output { + File scanpy_qc_results = "qc_results.hdf5" + File qc_metrics = "qc_results.json" + } + + runtime { + container: "hubmap/scrna-analysis:latest" + } + + command { + /opt/compute_qc_metrics.py ~{assay} ~{h5ad_primary} ~{h5ad_secondary} ~{salmon_dir} + } +} \ No newline at end of file From fbe0ed6d0b51c3e32c3b1378828aa116f437c40a Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Wed, 10 Apr 2024 14:10:40 -0400 Subject: [PATCH 6/7] General: Add outputs to pipeline.wdl --- pipeline.wdl | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pipeline.wdl b/pipeline.wdl index ac9dd2a..03319e7 100644 --- a/pipeline.wdl +++ b/pipeline.wdl @@ -69,4 +69,30 @@ workflow SalmonRNAseq { h5ad_secondary = ScanPyAnalysisCall.filtered_data_h5ad, salmon_dir = SalmonQuantificationCall.salmon_output } + + output { + Directory salmon_output = SalmonQuantificationCall.salmon_output + File count_matrix_h5ad = SalmonQuantificationCall.count_matrix_h5ad + File? 
raw_count_matrix = SalmonQuantificationCall.raw_count_matrix + File genome_build_json = SalmonQuantificationCall.genome_build_json + Array[Directory] fastqc_dir = FastQCCall.fastqc_dir + File scanpy_qc_results = ComputeQCMetricsCall.scanpy_qc_results + File qc_report = ComputeQCMetricsCall.qc_metrics + File dispersion_plot = ScanPyAnalysisCall.dispersion_plot + File umap_plot = ScanPyAnalysisCall.umap_plot + File umap_density_plot = ScanPyAnalysisCall.umap_density_plot + File? spatial_plot = ScanPyAnalysisCall.spatial_plot + File filtered_data_h5ad = ScanPyAnalysisCall.filtered_data_h5ad + File marker_gene_plot_t_test = ScanPyAnalysisCall.marker_gene_plot_t_test + File marker_gene_plot_logreg = ScanPyAnalysisCall.marker_gene_plot_logreg + File? scvelo_annotated_h5ad = ScVeloAnalysisCall.annotated_h5ad_file + File? scvelo_embedding_grid_plot = ScVeloAnalysisCall.embedding_grid_plot + File? squidpy_annotated_h5ad = SquidPyAnalysisCall.squidpy_annotated_h5ad + File? neighborhood_enrichment_plot = SquidPyAnalysisCall.neighborhood_enrichment_plot + File? co_occurrence_plot = SquidPyAnalysisCall.co_occurrence_plot + File? interaction_matrix_plot = SquidPyAnalysisCall.interaction_matrix_plot + File? centrality_scores_plot = SquidPyAnalysisCall.centrality_scores_plot + File? ripley_plot = SquidPyAnalysisCall.ripley_plot + File? 
squidpy_spatial_plot = SquidPyAnalysisCall.spatial_plot + } } \ No newline at end of file From 2a6177a13edb1535cd078df145610f655e290bc1 Mon Sep 17 00:00:00 2001 From: Juan Puerto <=> Date: Tue, 16 Apr 2024 09:15:41 -0400 Subject: [PATCH 7/7] General: Add .dockstore.yml file --- .dockstore.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .dockstore.yml diff --git a/.dockstore.yml b/.dockstore.yml new file mode 100644 index 0000000..6616325 --- /dev/null +++ b/.dockstore.yml @@ -0,0 +1,8 @@ +version: 1.2 +workflows: + - subclass: WDL + primaryDescriptorPath: /pipeline.wdl + name: salmon-rnaseq-wdl + - subclass: CWL + primaryDescriptorPath: /pipeline.cwl + name: salmon-rnaseq-cwl