-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrarefaction.py
84 lines (72 loc) · 2.16 KB
/
rarefaction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
import argparse
import sys
import json
import operator
import random
from grimoire.io import GFF_stream
## Command line stuff ##
parser = argparse.ArgumentParser(
description='Estimates a rarefaction curve for introns.',
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('--gff3', required=True, type=str,
metavar='<path>', help='gff file, may be compressed')
parser.add_argument('--source', required=True, type=str,
metavar='<str>', help='rule-based parsing based on gff source')
parser.add_argument('--maxcov', required=False, type=int, default = 10000,
metavar='<int>', help='number of simulated reads [10000]')
parser.add_argument('--step', required=False, type=int, default = 100,
metavar='<int>', help='coverage range increment [100]')
arg = parser.parse_args()
# memorize coordinates of introns
gff = GFF_stream(arg.gff3)
wormbase = {}
splice = {}
for f in gff:
if f.type != 'intron': continue
stringy = '{}{}{}-{}'.format(f.chrom, f.strand, f.beg, f.end)
if f.score == '.': f.score = 1.0
else: f.score = float(f.score)
if f.source == 'WormBase':
wormbase[stringy] = f.score
elif f.source == 'RNASeq_splice':
splice[stringy] = f.score
def observations(ramp, cov):
obs = set()
for i in range(cov):
p = random.random()
for j in range(len(ramp)):
if p <= ramp[j]:
obs.add(j)
break
return(len(obs))
def mass_ramp(source):
mass = 0
for feature in source:
mass += source[feature]
pramp = []
for fstring, fmass in sorted(source.items(), key = operator.itemgetter(1)):
pramp.append(fmass / mass)
cramp = []
cramp.append(pramp[0])
for i in range(1, len(pramp)):
cramp.append(cramp[i - 1] + pramp[i])
obs = []
for i in range(0, arg.maxcov, arg.step):
obs.append(observations(cramp, i))
return(obs)
anno_obs = mass_ramp(wormbase)
splice_obs = mass_ramp(splice)
print('cov\tWormBase\tRNASeq_splice')
for i in range(len(anno_obs)):
print('{}\t{}\t{}'.format(i * arg.step, anno_obs[i], splice_obs[i]))
print('gff\t{}\t{}'.format(len(wormbase), len(splice)))
"""
for s in source:
count = 0
mass = 0
for string in source[s]:
count += 1
mass += source[s][string]
print(s, count, mass, mass/count)
"""