-
Notifications
You must be signed in to change notification settings - Fork 1
/
Snakefile-reprocess
197 lines (176 loc) · 12 KB
/
Snakefile-reprocess
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
include: "Snakefile"
include: "Snakefile-proteomics"
def get_array_design_from_xml():
    """
    Collect the array designs named in the experiment's configuration XML.

    Parses '<accession>-configuration.xml' (accession taken from the
    Snakemake config) and returns one string per <array_design> element:
    the value of the element's first child node, with every run of
    whitespace collapsed to a single space and the ends trimmed.
    """
    from xml.dom import minidom

    configuration = minidom.parse( config['accession']+'-configuration.xml' )
    return [
        " ".join( element.firstChild.nodeValue.split() )
        for element in configuration.getElementsByTagName('array_design')
    ]
# This script is used to rerun studies that have new matrices generated by IRAP/ISL
def _differential_gsea_outputs():
    """
    Build the differential GSEA targets for the current accession.

    Returns a list holding the GSEA result files for every contrast id /
    external database combination, followed by the matching check markers
    under logs/.
    """
    gsea_results = expand( config['accession']+".{c_id}.{ext_db}.{type}",
                           c_id=get_contrast_ids(),
                           ext_db=get_ext_db(),
                           type=["gsea.tsv", "gsea_list.tsv"])
    gsea_checks = expand("logs/"+config['accession']+".{c_id}.{ext_db}.{type}",
                         c_id=get_contrast_ids(),
                         ext_db=get_ext_db(),
                         type=["check_differential_gsea.done", "check_differential_gsea_list.done"])
    return list(gsea_results) + list(gsea_checks)


def get_outputs():
    """
    First method to be executed since it is run by rule all.

    Builds and returns the list of target files for reprocessing the
    accession given in the Snakemake config. Which targets are collected
    depends on the experiment type; the targets for moving the data to
    atlas_exps and fetching the MAGE-TAB are added for every type.

    Side effects: sets the module-level globals `metrics` (baseline RNA-seq
    only), `skip_accession`, `run_deconv_accession` and `experiment_type`,
    and prints progress information including the final output list.

    NOTE(review): wherever a step is guarded by `if skip(...)`, a truthy
    result ADDS that step's outputs - confirm against the definition of
    skip(), which is not part of this file.
    """
    import datetime

    global metrics, skip_accession, run_deconv_accession, experiment_type

    print('Starting getting list of outputs..' + str(datetime.datetime.now()))
    outputs = []
    accession = config['accession']

    # Read this now so that it is available for all other needs.
    # It is loaded before the experiment type is resolved; presumably
    # get_from_config_or_metadata_summary() falls back to the loaded summary
    # when the key is missing from the config - confirm against its definition.
    read_metadata_summary()
    experiment_type = get_from_config_or_metadata_summary('experiment_type')

    # get metrics only for baseline
    if experiment_type == 'rnaseq_mrna_baseline':
        metrics = get_metrics_reprocess()

    skip_accession = read_skip_steps_file()
    run_deconv_accession = read_run_deconv_file()

    required_config = ['tool']
    check_config_required(fields=required_config)

    print(experiment_type)

    # collect output for reprocessing
    if experiment_type == 'rnaseq_mrna_baseline':
        #if config['tool']=="all-baseline" or 'baseline-tracks' in config['tool'] or 'baseline-heatmap' in config['tool'] or 'baseline-coexpression' in config['tool']:
        outputs.append(f"logs/{accession}-add_runs_to_db.done")
        outputs.append(f"logs/{accession}-get_irap_versions_file.done")
        outputs.append(f"{accession}-raw-counts.tsv.undecorated")
        outputs.extend(expand(accession+"-{metric}.tsv.undecorated", metric=metrics))
        outputs.extend(expand(accession+"-transcripts-{metric}.tsv.undecorated", metric=["tpms"]))
        outputs.append(f"logs/{accession}-copy_transcript_relative_isoforms.done")
        if skip(accession, 'rnaseq_qc'):
            outputs.append(f"qc/{accession}-irap-single-lib-report.tsv")
        outputs.extend(expand(accession+"-{metric}.tsv.undecorated.quantile_normalized", metric=metrics))
        outputs.extend(expand(accession+"-{metric}.tsv.undecorated.aggregated", metric=metrics))
        outputs.extend(expand("logs/"+accession+"-rule-transcripts_na_check_{metric}.done", metric=["tpms"]))
        outputs.extend(expand("logs/"+accession+"-quantile_normalise_transcripts_{metric}.done", metric=["tpms"]))
        outputs.extend(expand("logs/"+accession+"-summarize_transcripts_{metric}.done", metric=["tpms"]))
        outputs.append(f"{accession}-analysis-methods.tsv_baseline_rnaseq")
        if skip(accession, 'atlas_experiment_summary'):
            outputs.append(f"{accession}-atlasExperimentSummary.Rdata")
        outputs.extend(expand(accession+"-{metric}.tsv", metric=metrics))
        outputs.extend(expand("logs/"+accession+"-transcripts-{metric}.tsv.done", metric=["tpms"]))
        if skip(accession, 'baseline-heatmap'):
            outputs.extend(expand(accession+"-heatmap-{metric}.pdf", metric=metrics))
            outputs.append(f"{accession}-heatmap.pdf")
        # baseline tracks
        if skip(accession, 'baseline-tracks'):
            check_config_required(fields=['metadata_summary'], method='baseline-tracks')
            outputs.extend(expand(accession+".{a_id}.genes.expressions_{metric}.bedGraph",
                                  a_id=get_assay_ids(),
                                  metric=metrics))
            # symlink only for tpms
            outputs.extend(expand(accession+".{a_id}.genes.expressions.bedGraph", a_id=get_assay_ids()))
        # generating coexpressions matrix for baseline experiment
        if skip(accession, 'baseline-coexpression'):
            outputs.extend(expand(accession+"-{metric}-coexpressions.tsv.gz", metric=metrics))
            outputs.append(f"{accession}-coexpressions.tsv.gz")
        # add deconvolution results for some accessions
        if run_deconvolution(accession):
            outputs.append(f"{accession}-deconvolution.proportions.tsv")

    # collect output for differential rna-seq
    if experiment_type == 'rnaseq_mrna_differential':
        outputs.append(f"logs/{accession}-add_runs_to_db.done")
        outputs.append(f"logs/{accession}-get_irap_versions_file.done")
        outputs.append(f"{accession}-raw-counts.tsv.undecorated")
        if skip(accession, 'rnaseq_qc'):
            outputs.append(f"qc/{accession}-irap-single-lib-report.tsv")
        outputs.append(f"{accession}-analytics.tsv.undecorated")
        outputs.extend(expand("logs/"+accession+"-{c_id}-mvaPlot.png.done", c_id=get_contrast_ids()))
        outputs.append(f"{accession}-analytics.tsv.undecorated.unrounded")
        outputs.append(f"{accession}-analysis-methods.tsv_differential_rnaseq")
        if skip(accession, 'atlas_experiment_summary'):
            outputs.append(f"{accession}-atlasExperimentSummary.Rdata")
        outputs.append(f"{accession}-analytics.tsv")
        outputs.append(f"{accession}-analytics.tsv.unrounded")
        outputs.append(f"logs/{accession}.differential_statistics_rnaseq.done")
        outputs.append(f"{accession}-raw-counts.tsv")
        if skip(accession, 'percentile_ranks'):
            outputs.append(f"{accession}-percentile-ranks.tsv")
        # differential gsea
        if skip(accession, 'differential-gsea'):
            check_config_required(fields=['bioentities_properties'], method='differential-gsea')
            outputs.extend(_differential_gsea_outputs())
        # generate Genome browser tracks (bedGraph) for the experiment
        if skip(accession, 'differential-tracks'):
            outputs.extend(expand(accession+".{id}.{type}", id=get_contrast_ids(), type=["genes.pval.bedGraph", "genes.log2foldchange.bedGraph"]))
        # NOTE(review): treated as unconditional (not part of the tracks step) - confirm.
        outputs.append(f"logs/{accession}-decorate_differential_rnaseq.done")
        # add deconvolution results for some accessions
        if run_deconvolution(accession):
            outputs.append(f"{accession}-deconvolution.proportions.tsv")

    # collect output for differential microarrays
    microarray_types = (
        'microarray_1colour_mrna_differential',
        'microarray_2colour_mrna_differential',
        'microarray_1colour_microrna_differential',
    )
    if experiment_type in microarray_types:
        outputs.append(f"logs/{accession}-get_normalized_expressions_microarray.done")
        outputs.append(f"logs/{accession}-check_normalized_expressions_microarray.done")
        outputs.append(f"logs/{accession}-microarray_qc.done")
        outputs.append(f"{accession}-analysis-methods.tsv")
        outputs.append(f"logs/{accession}-decorate_temp_norm_expr_microarray.done")
        outputs.append(f"logs/{accession}-merge_probe_ids_microarray.done")
        #outputs.extend(expand(config['accession']+"_{ad}-normalized-expressions.tsv.undecorated", ad=get_array_design_from_xml() ))
        outputs.append(f"logs/{accession}-differential_statistics_microarray.done")
        outputs.append(f"logs/{accession}-check_nas_microarray.done")
        # Parse the configuration XML once and reuse the array designs below.
        array_designs = get_array_design_from_xml()
        outputs.extend(expand(accession+"_{ad}-analytics.tsv.undecorated.unrounded", ad=array_designs))
        outputs.extend(expand("logs/"+accession+"_{ad}-round_log2_fold_changes_microarray.done", ad=array_designs))
        if skip(accession, 'atlas_experiment_summary'):
            outputs.append(f"{accession}-atlasExperimentSummary.Rdata")
        # decorate
        outputs.extend(expand(accession+"_{ad}-analytics.tsv", ad=array_designs))
        outputs.extend(expand("logs/"+accession+"_{ad}-decorate_differential_microarray.done", ad=array_designs))
        if skip(accession, 'percentile_ranks'):
            outputs.append(f"{accession}-percentile-ranks.tsv")
        # differential gsea
        if skip(accession, 'differential-gsea'):
            check_config_required(fields=['bioentities_properties'], method='differential-gsea')
            outputs.extend(_differential_gsea_outputs())
        # generate Genome browser tracks (bedGraph) for the experiment
        if skip(accession, 'differential-tracks'):
            outputs.extend(expand(accession+".{id}.{type}", id=get_contrast_ids(), type=["genes.pval.bedGraph", "genes.log2foldchange.bedGraph"]))
        # delete intermediate files
        outputs.append(f"logs/{accession}-delete_intermediate_files_microarray.done")

    # collect output for baseline proteomics
    if experiment_type in ('proteomics_baseline', 'proteomics_baseline_dia'):
        outputs.append(f"logs/{accession}-final_check_tsv_data_file_not_empty_of_data")

    # collect output for differential proteomics
    if experiment_type == 'proteomics_differential':
        if skip(accession, 'percentile_ranks'):
            outputs.append(f"{accession}-percentile-ranks.tsv")
        # commenting the tracks as E-PROT-* appear not to be having any
        # outputs.extend(expand(config['accession']+".{id}.{type}", id=get_contrast_ids(), type=["genes.pval.bedGraph", "genes.log2foldchange.bedGraph"]))
        if skip(accession, 'differential-gsea'):
            outputs.extend(_differential_gsea_outputs())

    # For all experiment types, move data to atlas_exps and create condensed SDRF
    outputs.append(f"logs/{accession}-copy_experiment_from_analysis_to_atlas_exps.done")
    outputs.append(f"logs/{accession}-get_magetab_for_experiment.done")

    print(outputs)
    print('Getting list of outputs.. done')
    print(datetime.datetime.now())
    return outputs
# Aggregate rule: its only input is the list of files returned by
# get_outputs(), so requesting this rule requests every collected output.
rule all:
    input:
        # get_outputs() is called while this rule definition is evaluated.
        required_outputs=get_outputs()