BIMSBbioinfo · borauyar · May 23, 2024 · May 21, 2024 · May 23, 2024 · May 23, 2024
diff --git a/Makefile.am b/Makefile.am
@@ -17,6 +17,7 @@ dist_pkglibexec_scripts_SCRIPTS =					\
   scripts/norm_counts_deseq.R	\
   scripts/count_reads.R	\
   scripts/collate_read_counts.R \
+  scripts/collate_deseq_results.R \
   scripts/validate_input_annotation.R
 
 dist_pkgdata_DATA =									\

diff --git a/pigx-common b/pigx-common
diff --git a/scripts/translate_sample_sheet_for_report.R b/scripts/translate_sample_sheet_for_report.R
@@ -28,5 +28,5 @@ s = data.frame(data.table::fread(sample_sheet), check.names = F)
 rownames(s) = s$name
 s$group = s$sample_type
 s = s[colnames(s)[-grep("name|reads", colnames(s))]]
-write.table(s, "colData.tsv", sep="\t")
+write.table(s, "", sep="\t")
 
diff --git a/snakefile.py b/snakefile.py
@@ -2,7 +2,7 @@
 #
 # Copyright © 2017, 2018 Bora Uyar <[email protected]>
 # Copyright © 2017, 2018 Jonathan Ronen <[email protected]>
-# Copyright © 2017-2021 Ricardo Wurmus <[email protected]>
+# Copyright © 2017-2024 Ricardo Wurmus <[email protected]>
 #
 # This file is part of the PiGx RNAseq Pipeline.
 #
@@ -212,7 +212,7 @@ def lookup(column, predicate, fields=[]):
     'salmon_index' : {
         'description': "Create SALMON index file.",
         'files':
-          [os.path.join(OUTPUT_DIR, 'salmon_index', "pos.bin")]
+          [os.path.join(OUTPUT_DIR, 'salmon_index.tar')]
     },
     'salmon_quant' : {
         'description': "Calculate read counts per transcript using SALMON.",
@@ -305,8 +305,8 @@ def lookup(column, predicate, fields=[]):
 
 rule translate_sample_sheet_for_report:
   input: SAMPLE_SHEET_FILE
-  output: os.path.join(os.getcwd(), "colData.tsv")
-  shell: "{RSCRIPT_EXEC} {SCRIPTS_DIR}/translate_sample_sheet_for_report.R {input}"
+  output: os.path.join(OUTPUT_DIR, "colData.tsv")
+  shell: "{RSCRIPT_EXEC} {SCRIPTS_DIR}/translate_sample_sheet_for_report.R {input} > {output}"
 
 # determine if the sample library is single end or paired end
 def isSingleEnd(args):
@@ -442,42 +442,49 @@ def hisat2_file_arguments(args):
       CDNA_FASTA,
       rules.check_annotation_files.output
   output:
-      salmon_index_file = os.path.join(OUTPUT_DIR, 'salmon_index', "pos.bin")
+      tar = os.path.join(OUTPUT_DIR, 'salmon_index.tar')
   resources:
       mem_mb = config['execution']['rules']['salmon_index']['memory']
   params:
       salmon_index_dir = os.path.join(OUTPUT_DIR, 'salmon_index')
   log: os.path.join(LOG_DIR, "salmon", 'salmon_index.log')
-  shell: "{SALMON_INDEX_EXEC} -t {input[0]} -i {params.salmon_index_dir} -p {SALMON_INDEX_THREADS} >> {log} 2>&1"
+  shell: "({SALMON_INDEX_EXEC} -t {input[0]} -i {params.salmon_index_dir} -p {SALMON_INDEX_THREADS} && cd {params.salmon_index_dir} && tar cf {output.tar}.temp . && mv {output.tar}.temp {output.tar}) >> {log} 2>&1"
 
 rule salmon_quant:
   input:
-      # This rule really depends on the whole directory (see
-      # params.index_dir), but we can't register it as an input/output
-      # in its own right since Snakemake 5.
-      index_file = rules.salmon_index.output.salmon_index_file,
+      index_tar = rules.salmon_index.output.tar,
       reads = map_input
   output:
       os.path.join(SALMON_DIR, "{sample}", "quant.sf"),
-      os.path.join(SALMON_DIR, "{sample}", "quant.genes.sf")
+      os.path.join(SALMON_DIR, "{sample}", "quant.genes.sf"),
+      salmon_quant_tar = os.path.join(OUTPUT_DIR, "{sample}", "salmon-quant.tar")
   resources:
       mem_mb = config['execution']['rules']['salmon_quant']['memory']
   params:
-      index_dir = rules.salmon_index.params.salmon_index_dir,
       outfolder = os.path.join(SALMON_DIR, "{sample}")
   log: os.path.join(LOG_DIR, "salmon", 'salmon_quant_{sample}.log')
   run:
-    if(len(input.reads) == 1):
-        COMMAND = "{SALMON_QUANT_EXEC} -i {params.index_dir} -l A -p {SALMON_QUANT_THREADS} -r {input.reads} -o {params.outfolder} --seqBias --gcBias -g {GTF_FILE} >> {log} 2>&1"
-    elif(len(input.reads) == 2):
-        COMMAND = "{SALMON_QUANT_EXEC} -i {params.index_dir} -l A -p {SALMON_QUANT_THREADS} -1 {input.reads[0]} -2 {input.reads[1]} -o {params.outfolder} --seqBias --gcBias -g {GTF_FILE} >> {log} 2>&1"
+    if (len(input.reads) == 1):
+        pe_se_args="-r {}".format(input.reads)
+    else:
+        pe_se_args="-1 {reads[0]} -2 {reads[1]}".format(reads=input.reads)
+    COMMAND = f"\
+(index_dir=$(mktemp -d) && \
+tar xf {input.index_tar} -C $index_dir && \
+{SALMON_QUANT_EXEC} -i $index_dir -l A \
+    -p {SALMON_QUANT_THREADS} {pe_se_args} \
+    -o {params.outfolder} \
+    --seqBias --gcBias \
+    -g {GTF_FILE} && \
+mkdir -p $(dirname {output.salmon_quant_tar}) && \
+cd {SALMON_DIR} && tar cf {output.salmon_quant_tar}.temp {wildcards.sample} && \
+mv {output.salmon_quant_tar}.temp {output.salmon_quant_tar}) >> {log} 2>&1"
     shell(COMMAND)
 
 rule counts_from_SALMON:
   input:
-      quantFiles = expand(os.path.join(SALMON_DIR, "{sample}", "quant.sf"), sample=SAMPLES),
-      quantGenesFiles = expand(os.path.join(SALMON_DIR, "{sample}", "quant.genes.sf"), sample=SAMPLES),
-      colDataFile = rules.translate_sample_sheet_for_report.output
+      colDataFile = rules.translate_sample_sheet_for_report.output,
+      salmon_quant_tar = expand(os.path.join(OUTPUT_DIR, "{sample}", "salmon-quant.tar"), sample=SAMPLES)
   output:
       os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.transcripts.tsv"),
       os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.genes.tsv"),
@@ -486,7 +493,7 @@ def hisat2_file_arguments(args):
   resources:
       mem_mb = config['execution']['rules']['counts_from_SALMON']['memory']
   log: os.path.join(LOG_DIR, "salmon", 'salmon_import_counts.log')
-  shell: "{RSCRIPT_EXEC} {SCRIPTS_DIR}/counts_matrix_from_SALMON.R {SALMON_DIR} {COUNTS_DIR} {input.colDataFile} >> {log} 2>&1"
+  shell: "(tempdir=$(mktemp -d); for f in {input.salmon_quant_tar}; do tar xf $f -C $tempdir; done; {RSCRIPT_EXEC} {SCRIPTS_DIR}/counts_matrix_from_SALMON.R $tempdir {COUNTS_DIR} {input.colDataFile}) >> {log} 2>&1"
 
 # compute genome coverage using megadepth
 rule coverage_megadepth:
@@ -617,7 +624,20 @@ def hisat2_file_arguments(args):
   resources:
     mem_mb = config['execution']['rules']['report1']['memory']
   shell:
-    "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}'  --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"
+    """{RSCRIPT_EXEC} {params.reportR}       \
+    --logo={params.logo}                     \
+    --prefix='{wildcards.analysis}'          \
+    --reportFile={params.reportRmd}          \
+    --countDataFile={input.counts}           \
+    --colDataFile={input.coldata}            \
+    --gtfFile={GTF_FILE}                     \
+    --caseSampleGroups='{params.case}'       \
+    --controlSampleGroups='{params.control}' \
+    --covariates='{params.covariates}'       \
+    --workdir={params.outdir}                \
+    --organism='{ORGANISM}'                  \
+    --description='{params.description}'     \
+    --selfContained='{params.selfContained}' >> {log} 2>&1"""
 
 rule deseq_collate_report1:
   input:
@@ -656,7 +676,20 @@ def hisat2_file_arguments(args):
     os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq_results.tsv')
   resources:
     mem_mb = config['execution']['rules']['report2']['memory']
-  shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.transcripts' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"
+  shell: """{RSCRIPT_EXEC} {params.reportR}          \
+  --logo={params.logo}                               \
+  --prefix='{wildcards.analysis}.salmon.transcripts' \
+  --reportFile={params.reportRmd}                    \
+  --countDataFile={input.counts}                     \
+  --colDataFile={input.coldata}                      \
+  --gtfFile={GTF_FILE}                               \
+  --caseSampleGroups='{params.case}'                 \
+  --controlSampleGroups='{params.control}'           \
+  --covariates='{params.covariates}'                 \
+  --workdir={params.outdir}                          \
+  --organism='{ORGANISM}'                            \
+  --description='{params.description}'               \
+  --selfContained='{params.selfContained}' >> {log} 2>&1"""
 
 rule deseq_collate_report2:
   input:
@@ -695,7 +728,20 @@ def hisat2_file_arguments(args):
     os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq_results.tsv')
   resources:
     mem_mb = config['execution']['rules']['report3']['memory']
-  shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.genes' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"
+  shell: """{RSCRIPT_EXEC} {params.reportR}    \
+  --logo={params.logo}                         \
+  --prefix='{wildcards.analysis}.salmon.genes' \
+  --reportFile={params.reportRmd}              \
+  --countDataFile={input.counts}               \
+  --colDataFile={input.coldata}                \
+  --gtfFile={GTF_FILE}                         \
+  --caseSampleGroups='{params.case}'           \
+  --controlSampleGroups='{params.control}'     \
+  --covariates='{params.covariates}'           \
+  --workdir={params.outdir}                    \
+  --organism='{ORGANISM}'                      \
+  --description='{params.description}'         \
+  --selfContained='{params.selfContained}' >> {log} 2>&1"""
 
 rule deseq_collate_report3:
   input: