-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdna10x.py
136 lines (127 loc) · 5.53 KB
/
dna10x.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#! /usr/bin/python
import argparse
import gzip
import io
import os
from os.path import exists
from glob import glob
from pysam import AlignmentFile
from count_dna import chrfragments,chrfragments_output,fragments
import subprocess
from address import contig_address
from error_correct import cbccorrect,revcomp
from functools import partial
from multiprocessing import Pool
def parse_user_input():
parser = argparse.ArgumentParser()
parser.add_argument('-bc','--bcl',required=False,help='Directory with base calling data.')
parser.add_argument('-s','--samplesheet',required=True,help='Path to sample sheet.')
parser.add_argument('-d','--directory',required=True,help='Output directory.')
parser.add_argument('-b','--barcodes',required=True,help='Path to 10x barcode whitelist.')
parser.add_argument('-t','--threads',required=True,type=int,help='Number of cpu threads.')
parser.add_argument('-r','--reference',required=True,help='Path to reference genome.')
parser.add_argument('-i','--insert-size',required=True,type=int,help='Maximum insert size.')
parser.add_argument('-m','--minscore',required=True,type=float,help='Alignment score threshold (as a fraction of read length).')
parser.add_argument('-p','--barcode-position',type=int,required=True,help='1-indexed cell barcode position in read 2.')
parser.add_argument('-rc','--revcomp',help='Reverse complement cell barcodes.',action='store_true')
parser.add_argument('-sf','--skip-fastq',action='store_true',help='Skip fastq generation if they already exist.')
parser.add_argument('-sa','--skip-align',action='store_true',help='Skip alignment with bwa mem if bam already exists.')
parser.add_argument('-c','--cutadapt',action='store_true',help='Run cutadapt on interleaved fastq.')
parser.add_argument('-ad','--adapter',required=False,help='Adapter sequence to be trimmed by cutadapt.')
parser.add_argument('-sd','--skip-demux',action='store_true',help='Skip production of barcoded fastqs.')
return parser
parser = parse_user_input()
ui = parser.parse_args()
samplesheet=ui.samplesheet
if exists(samplesheet):
with open(samplesheet) as f:
next(f)
samples = {line.split(',')[1]:ui.directory for line in f}
else:
print("Error: can't find sample sheet.")
exit()
threads=int(ui.threads)
directory = ui.directory
bcl=ui.bcl
if not ui.skip_fastq:
print('Launching Cell Ranger to make fastqs...')
cmd = 'cellranger-atac mkfastq --run=%(bcl)s --id=%(directory)s --csv=%(samplesheet)s --project=%(directory)s' % vars()
print(cmd)
os.system(cmd)
reference = ui.reference
barcodes = ui.barcodes
pos = ui.barcode_position
for sample in samples:
project = samples[sample]
fastqpath = directory+'/outs/fastq_path/'+project+'/'+sample+'/*'
fastqs = glob(fastqpath)
R1list = [fq for fq in fastqs if fq.find('_R1_')>-1]
R2list = [fq for fq in fastqs if fq.find('_R2_')>-1]
R3list = [fq for fq in fastqs if fq.find('_R3_')>-1]
R1list.sort()
R2list.sort()
R3list.sort()
j=1
procs=[]
fastqouts=[]
bamout=directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'.bam'
if not ui.skip_demux:
for r1,r2,r3 in zip(R1list,R2list,R3list):
fastqout = directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'_'+str(j)+'.fastq.gz'
cmd='python call_demux.py -r1 %(r1)s -r2 %(r2)s -r3 %(r3)s -p %(pos)d| gzip > %(fastqout)s' % vars()
print(cmd)
p=subprocess.Popen(cmd,shell=True)
procs.append(p)
fastqouts.append(fastqout)
j+=1
p_exit = [p.wait() for p in procs]
else:
for r1,r2,r3 in zip(R1list,R2list,R3list):
fastqout = directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'_'+str(j)+'.fastq.gz'
fastqouts.append(fastqout)
j+=1
if not ui.skip_align:
fqs = ' '.join(fastqouts)
print(fastqouts)
if not ui.cutadapt:
cmd = "bwa mem -p -C -M -t %(threads)d %(reference)s '<zcat %(fqs)s' | samtools view -Sb - > %(bamout)s" % vars()
print(cmd)
os.system(cmd)
else:
adapter = ui.adapter
cmd = "zcat %(fqs)s | cutadapt -a %(adapter)s -A %(adapter)s --interleaved --cores=%(threads)s -o - - | bwa mem -p -C -M -t %(threads)s %(reference)s - | samtools view -Sb - > %(bamout)s" % vars()
print(cmd)
os.system(cmd)
addressfile=directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'.address.txt.gz'
fragfile=directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'.fragments.tsv'
chrs=[line.split()[0][1::] for line in open(reference) if line[0]=='>']
chrs=sorted(chrs)
bclen=16
if exists(barcodes):
if ui.revcomp:
bcset = set([revcomp(line.split()[0]) for line in open(barcodes)])
else:
bcset = set([line.split()[0] for line in open(barcodes)])
else:
print("Error: can't find barcode whitelist file.")
exit()
addressfile = directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample
ch_cbcdict,ch_qcbcdict,cbcfreq_dict=contig_address(addressfile,chrs,bamout,bclen,bcset,ui.insert_size,ui.minscore)
for ch in chrs:
newcbcs=cbccorrect(ch_cbcdict[ch],ch_qcbcdict[ch],cbcfreq_dict)
fragfile=directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'.'+ch+'.fragments.tsv'
addressfile = directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'.'+ch+'.address.txt.gz'
fragments(sample,reference,addressfile,fragfile,chrs,newcbcs)
fragfile = directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'.fragments.tsv'
with open(fragfile,'w') as g:
for i,ch in enumerate(chrs):
infile = directory+'/outs/fastq_path/'+project+'/'+sample+'/'+sample+'.'+ch+'.fragments.tsv'
with open(infile) as f:
for line in f:
if line[0]=='#':
if i==0:
g.write(line)
else:
g.write(line)
cmd='rm %(infile)s' % vars()
os.system(cmd)