-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorthomcl_parse_results.py
executable file
·65 lines (49 loc) · 2.23 KB
/
orthomcl_parse_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#! /usr/bin/env python3
from Bio import SeqIO
import pandas as pd
from pathlib import Path
import argparse
class orthomcl():
    """Turn an orthomcl-pipeline groups file into count tables and FASTA files.

    The groups file is read as a space-separated table: the first field of a
    line is the group label and every following field is one member written
    as ``<prefix>|<id>``.
    """

    def __init__(self, mclgroups, outdir):
        """Remember where to read the groups from and where to write results.

        Args:
            mclgroups: Path of the groups file.
            outdir: Directory that receives every output file.
        """
        self.mclgroups = mclgroups
        self.outdir = outdir

    def mclcount(self):
        """Count the members of every group per ``|`` prefix and save the tables.

        Two tab-separated files are written into ``outdir``:

        * ``mclCount.csv``  - one row per group, one column per prefix (the
          text before ``|``), holding the number of members with that prefix.
        * ``mclSingle.csv`` - the rows of that table in which every column
          equals 1.

        Returns:
            The parsed groups table exactly as read from disk: index = group
            label, cells = full member identifiers, NaN where a group has
            fewer members than the widest line.
        """
        data = pd.read_csv(
            self.mclgroups,
            sep=" ",
            header=None,
            index_col=0,
            encoding="utf8",
            low_memory=False,
        )

        # A column that is empty on every line (for instance caused by a
        # trailing space) is parsed as an all-NaN float column, which has no
        # ``.str`` accessor. Such columns hold no members, so they are left
        # out of the tally; the returned table is not modified.
        members = data.dropna(axis=1, how="all")
        prefixes = members.apply(lambda col: col.str.split("|").str.get(0))

        # Row-wise value_counts gives one column per distinct prefix; groups
        # lacking a prefix get NaN there, which becomes 0.
        count = prefixes.apply(lambda row: row.value_counts(), axis=1)
        count = count.fillna(0).astype("int")

        out_dir = Path(self.outdir)
        count.to_csv(out_dir.joinpath("mclCount.csv"), sep="\t", index=True)

        # Keep the groups with exactly one member for every prefix.
        single = count[(count == 1).all(axis=1)]
        single.to_csv(out_dir.joinpath("mclSingle.csv"), sep="\t", index=True)
        return data

    def sequences(self, records, data):
        """Write one FASTA file per group into ``outdir/Orthogroup_Sequences``.

        Args:
            records: Iterable of sequence records exposing an ``id``
                attribute; when an id occurs more than once the last record
                wins.
            data: Groups table as returned by ``mclcount``.

        Raises:
            KeyError: A group member has no record with a matching ``id``.
        """
        seq_dir = Path(self.outdir).joinpath("Orthogroup_Sequences")
        seq_dir.mkdir(parents=True, exist_ok=True)

        record_by_id = {record.id: record for record in records}

        # iterrows always yields a single row, even when a label is repeated.
        for group_id, row in data.iterrows():
            # Empty cells only pad shorter groups; drop them before the lookup.
            member_ids = row.dropna().astype("str")
            try:
                seqs = [record_by_id[member] for member in member_ids]
            except KeyError as err:
                raise KeyError(
                    "sequence {member!r} of group {group!r} was not found in the "
                    "FASTA input".format(member=err.args[0], group=group_id)
                ) from None
            # The label keeps its trailing colon; strip it for the file name.
            file_name = "{group}.fa".format(group=str(group_id).replace(":", ""))
            SeqIO.write(seqs, seq_dir.joinpath(file_name), "fasta")
def main():
    """Command-line entry point: parse the options and run both processing steps.

    Reads the groups file given with ``-i``, writes the count tables into the
    ``-o`` directory, then writes one FASTA file per group using the sequences
    from the ``-f`` file.
    """
    parse = argparse.ArgumentParser()
    # All three options are needed below; marking them required makes argparse
    # print a usage error instead of failing later on a None path.
    parse.add_argument(
        '-i', '--input', required=True,
        help='the groups file from orthomcl-pipeline in groups directory',
    )
    parse.add_argument(
        '-f', '--fasta', required=True,
        help="the all fasta sequences used in analysis, you can use 'cat' to "
             "merge all fasta files in 'compliant_fasta' directory",
    )
    parse.add_argument(
        '-o', '--output', required=True,
        help='the output directory',
    )
    args = parse.parse_args()

    outdir = args.output
    # Create the directory (and any missing parents) in a single call; an
    # already existing directory is accepted as is.
    Path(outdir).resolve().mkdir(parents=True, exist_ok=True)

    ortho = orthomcl(args.input, outdir)
    data = ortho.mclcount()
    records = SeqIO.parse(args.fasta, 'fasta')
    ortho.sequences(records, data)


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()