-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline_isoprot.py
133 lines (94 loc) · 3.14 KB
/
pipeline_isoprot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""========================
Pipeline isoprot
By Chen-Yi Wang (2024/June)
===========================
Overview
========
This pipeline performs protein alignment and protein domain prediction after DTU analysis.
Pipeline tasks
==============
The pipeline consists of the following steps:
* Multiple sequence alignment of proteins.
* Prediction of protein domains based on protein sequences.
Usage
=====
The pipeline requires a configured pipeline.yml file,
which contains various settings and parameters required for
the pipeline to run.
To generate the config file to change the running of the pipeline you need to
run:
tallytrin isoprot config
This will generate a pipeline.yml file that the user can modify to change the
output of the pipeline. Once the pipeline.yml file has been modified, the
pipeline can be run with the following command:
tallytrin isoprot make full -v5
You can run the pipeline locally (without a cluster) using --local
tallytrin isoprot make full -v5 --local
Input files
===========
Input files should be TSV files generated in the folder "StageR_df_plotproportions" after DTU analysis.
Pipeline output
===============
The pipeline outputs PDF files of the multiple sequence alignment and the predicted protein domains.
Code
====
"""
import sys
import os
import pysam
import glob
import pandas as pd
from ruffus import *
import cgatcore.iotools as iotools
import cgatcore.pipeline as P
import cgatcore.experiment as E
from cgatcore.pipeline import cluster_runnable
# Read the pipeline configuration. The candidate files are listed in this
# order: pipeline.yml inside the directory named after this script (extension
# stripped), then ../pipeline.yml, then pipeline.yml in the working directory.
PARAMS = P.get_parameters([
    os.path.splitext(__file__)[0] + "/pipeline.yml",
    "../pipeline.yml",
    "pipeline.yml",
])

# Rmarkdown folder ("pipeline_isoprot") located next to this script
RMD_ROOT = os.path.join(os.path.dirname(__file__), "pipeline_isoprot")

# Absolute path of the "R" script folder located next to this script
R_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "R"))
def prot_align():
    '''
    R script task to run protein alignment.

    Builds the shell statement ``Rscript <R_PATH>/prot_align.R`` and hands it
    to ``P.run`` together with the job option string ``-t 48:00:00``.
    Takes no arguments and returns nothing.
    '''
    # Absolute path of the "R" folder next to this file (same expression as
    # the module-level R_PATH).
    # NOTE(review): the %(R_PATH)s placeholder in the statement is never
    # substituted inside this function, so P.run presumably fills it from
    # this function's local variables -- confirm against cgatcore before
    # removing or renaming this assignment.
    R_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__),"R"))
    # NOTE(review): not referenced again here; presumably picked up by P.run
    # as the memory request for the job -- confirm.
    job_memory = "70G"
    statement = '''
Rscript %(R_PATH)s/prot_align.R'''
    P.run(statement, job_options='-t 48:00:00')
# Declared to follow prot_align, so the alignment task is scheduled first.
@follows(prot_align)
def prot_domain():
    '''
    R script task to run protein domain prediction.

    Builds the shell statement ``Rscript <R_PATH>/prot_domain.R`` and hands
    it to ``P.run`` together with the job option string ``-t 48:00:00``.
    Takes no arguments and returns nothing.
    '''
    # Absolute path of the "R" folder next to this file (same expression as
    # the module-level R_PATH).
    # NOTE(review): the %(R_PATH)s placeholder in the statement is never
    # substituted inside this function, so P.run presumably fills it from
    # this function's local variables -- confirm against cgatcore before
    # removing or renaming this assignment.
    R_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__),"R"))
    # NOTE(review): not referenced again here; presumably picked up by P.run
    # as the memory request for the job -- confirm.
    job_memory = "70G"
    statement = '''
Rscript %(R_PATH)s/prot_domain.R'''
    P.run(statement, job_options='-t 48:00:00')
@follows(prot_align, prot_domain)
def full():
    """Checkpoint target for the whole pipeline.

    The function itself does nothing; it is declared to follow
    ``prot_align`` and ``prot_domain`` so that requesting ``full``
    requires both of those tasks to have completed.
    """
def main(argv=None):
    '''
    Run the pipeline through the cgatcore.pipeline module.

    argv is the optional command-line argument list; sys.argv is used when
    it is omitted or None.

    Returns whatever P.main returns, so that a caller can forward the
    result to sys.exit.

    Please note that the pipeline tasks call external R scripts. For a
    complete understanding of their functionality, it is necessary to
    examine the code of those scripts as well.
    '''
    if argv is None:
        argv = sys.argv
    # Return the result instead of dropping it, so main() can be used
    # interchangeably with a direct P.main call.
    return P.main(argv)


if __name__ == "__main__":
    # Go through main() so script execution and programmatic use share a
    # single code path.
    sys.exit(main(sys.argv))