Skip to content

Commit

Permalink
get phasing info from mpileups
Browse files Browse the repository at this point in the history
  • Loading branch information
trgaleev committed Nov 20, 2017
1 parent 29e9ae9 commit 3b696c7
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 82 deletions.
3 changes: 2 additions & 1 deletion PIPELINE.mk
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ $(PREFIX)_raw_counts_ref_allele_ratios.pdf: $(PREFIX)_raw_counts.tsv

# counts
$(PREFIX)_raw_counts.tsv: $(PREFIX)_h1_uniqreads.mpileup $(PREFIX)_h2_uniqreads.mpileup
python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_ref.bed \
$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
$(PREFIX)_discarded_HetSNVs.tsv \
$(PREFIX)_h1_uniqreads.mpileup $(PREFIX)_h2_uniqreads.mpileup > $@

Expand Down
3 changes: 2 additions & 1 deletion PIPELINE_aggregated_counts.mk
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ $(PREFIX)_raw_counts_ref_allele_ratios.pdf: $(PREFIX)_raw_counts.tsv

# counts
$(PREFIX)_raw_counts.tsv: $(INPUT_UNIQ_READS_PILEUP_FILES)
python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_ref.bed \
$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
$(PREFIX)_discarded_HetSNVs.tsv \
$(INPUT_UNIQ_READS_PILEUP_FILES) > $@

Expand Down
6 changes: 3 additions & 3 deletions filter_phase_warnings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ def check_subset(l1,l2):
with open(sys.argv[1], 'a') as rm_hetSNV_f:
for line in sys.stdin:
if not line.startswith('#'):
(chr,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
(chrm,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
cA,cC,cG,cT,cN,ref_allele_ratio,sum_ref_n_alt_cnts,p_binom,
warning_h1,warning_h2,cnv) = line.split()
if check_subset(warning_h1.split(','),['.','zero_cnt']) and check_subset(warning_h2.split(','),['.','zero_cnt']) and h1_coord != 'notLifted?' and h2_coord != 'notLifted?':
sys.stdout.write('\t'.join([
chr,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
chrm,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
cA,cC,cG,cT,cN,ref_allele_ratio,sum_ref_n_alt_cnts,p_binom, cnv
])+'\n')
else:
rm_hetSNV_f.write('\t'.join([chr, ref_coord,
rm_hetSNV_f.write('\t'.join([chrm, ref_coord,
'_'.join([cA,cC,cG,cT,'','um',warning_h1,warning_h2])])+'\n')
else: sys.stdout.write('\t'.join(line.split()[:15] + [line.split()[-1]])+'\n')
4 changes: 0 additions & 4 deletions get_hetSNV_bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,6 @@
if not line.startswith('#'):
CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT = line.split('\t')[:9]
if FILTER=='PASS':
# require all PASS records to be phased: if some aren't, vcf2diploid will phase them randomly,
# but to keep track which allele goes to which haplotype, this (random phasing) needs to be done before vcf2diploid
# todo: incorporate this step into makePersonalGenome.mk
if '/' in line.split('\t')[sample_col_idx].split(':')[0]: sys.exit('\n\n' + sys.argv[0] + '\n Error: the variant in the following line is not phased\n' + line)
GTl=line.split('\t')[sample_col_idx].split(':')[0].replace('/','|').split('|')
al=[REF]+ALT.split(',')
if al[int(GTl[0])]!=al[int(GTl[1])] and len(al[int(GTl[0])])==len(al[int(GTl[1])])==len(REF)==1:
Expand Down
153 changes: 90 additions & 63 deletions pileup2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,51 +7,58 @@


hetSNV_dict = defaultdict(dict)
hetSNV_list = []
bases = set(['A', 'C', 'G', 'T', 'N'])
discarded = set([])

rmvdhets_file = open(sys.argv[4],'w')

rmvdhets_file = open(sys.argv[5],'w')

duplicate_posns = {}
# read the h1 and h2 coords
with open(sys.argv[2],'r') as in_h1:
with open(sys.argv[3],'r') as in_h1:
for line in in_h1:
line = line.strip()
h_chr, _, h_crd, ref_info = line.split('\t')
#h_cr, h = h_chr.split('_')
r_chr, r_crd, r_a, a1, a2 = ref_info.split('_')
if 'h1_pos' in hetSNV_dict[r_chr+'_'+r_crd] or r_chr+'_'+r_crd in duplicate_posns:
duplicate_posns[r_chr+'_'+r_crd] = None
continue
if a1 not in bases or a2 not in bases:
rmvdhets_file.write('\t'.join([r_chr, r_crd, ref_info]) + '\n')
discarded.add(r_chr + '_' + r_crd)
continue
#hetSNV_dict[r_chr+'_'+r_crd][h]={'crd':h_crd}
hetSNV_dict[r_chr+'_'+r_crd]['h1_pos'] = h_chr+'_'+h_crd
hetSNV_dict[r_chr+'_'+r_crd]['h1_pos'] = h_chr + '_' + h_crd
hetSNV_dict[r_chr+'_'+r_crd]['r_a'] = r_a
hetSNV_dict[r_chr+'_'+r_crd]['a1'] = a1
hetSNV_dict[r_chr+'_'+r_crd]['a2'] = a2
hetSNV_dict[r_chr+'_'+r_crd]['h1_a'] = a1
hetSNV_dict[r_chr+'_'+r_crd]['h2_a'] = a2

with open(sys.argv[3],'r') as in_h2:
with open(sys.argv[4],'r') as in_h2:
for line in in_h2:
line=line.strip()
line = line.strip()
h_chr, _, h_crd, ref_info = line.split('\t')
#h_cr, h = h_chr.split('_')
r_chr, r_crd, r_a, a1, a2 = ref_info.split('_')
if 'h2_pos' in hetSNV_dict[r_chr+'_'+r_crd] or r_chr+'_'+r_crd in duplicate_posns:
duplicate_posns[r_chr+'_'+r_crd] = None
continue
if a1 not in bases or a2 not in bases:
if r_chr + '_' + r_crd not in discarded: rmvdhets_file.write('\t'.join([r_chr, r_crd, ref_info]) + '\n')
continue
#hetSNV_dict[r_chr+'_'+r_crd][h]={'crd':h_crd}
hetSNV_dict[r_chr+'_'+r_crd]['h2_pos'] = h_chr+'_'+h_crd
hetSNV_dict[r_chr+'_'+r_crd]['h2_pos'] = h_chr + '_' + h_crd
hetSNV_dict[r_chr+'_'+r_crd]['r_a'] = r_a
hetSNV_dict[r_chr+'_'+r_crd]['a1'] = a1
hetSNV_dict[r_chr+'_'+r_crd]['a2'] = a2
hetSNV_list.append(r_chr+'_'+r_crd)


hetSNV_dict[r_chr+'_'+r_crd]['h1_a'] = a1 # mostly redundant, but needed for cases, where
hetSNV_dict[r_chr+'_'+r_crd]['h2_a'] = a2 # an snv couldn't be lifted over for one of the haplotypes

for i in duplicate_posns:
del hetSNV_dict[i]
sys.stderr.write('WARN: same coords of two SNVs: ' + ' '.join(i.split('_')) + ' ? Removing.\n')
rmvdhets_file.write('\t'.join(i.split('_') + ['duplicate_pos']) + '\n')
# this doesn't catch homozygous SNVs though, when duplicated with a het - vcf2diploid may actually incorporate either? version
# thus, the input vcf should not have duplicated snvs
# todo: add a check when generating ref.bed

# read the pileups
pileup_dict = read_pileup.pileup_to_basecnts(sys.argv[5:])
pileup_dict = read_pileup.pileup_to_basecnts(sys.argv[6:])

sys.stdout.write('\t'.join([
'#chr',
Expand All @@ -73,50 +80,70 @@
'warn_h2'
])+'\n')

#for k in hetSNV_dict:
# to keep the original order
for k in hetSNV_list:

basecnts_h1 = pileup_dict.get(hetSNV_dict[k].get('h1_pos',None), {'A':0, 'C':0, 'G':0, 'T':0, 'N':0, 'warning':'zero_cnt'})
basecnts_h2 = pileup_dict.get(hetSNV_dict[k].get('h2_pos',None), {'A':0, 'C':0, 'G':0, 'T':0, 'N':0, 'warning':'zero_cnt'})

basecnts = {
'A':basecnts_h1['A'] + basecnts_h2['A'],
'C':basecnts_h1['C'] + basecnts_h2['C'],
'G':basecnts_h1['G'] + basecnts_h2['G'],
'T':basecnts_h1['T'] + basecnts_h2['T'],
'N':basecnts_h1['N'] + basecnts_h2['N']
}

tot_cnt = basecnts[hetSNV_dict[k]['a1']] + basecnts[hetSNV_dict[k]['a2']]

if tot_cnt >= int(sys.argv[1]):

ref_cnt = basecnts[hetSNV_dict[k]['r_a']]
if ref_cnt > tot_cnt:
rmvdhets_file.write(k.split('_')[0] + '\t' + k.split('_')[1] + '\tref_allele:' + str(ref_cnt) + '_h1:' + str(basecnts[hetSNV_dict[k]['a1']]) + '_h2:' + str(basecnts[hetSNV_dict[k]['a2']]) + '\n')
continue
#print basecnts
#print k +'\t' + str(ref_cnt) + '\t' + str(tot_cnt) + '\n'
pbinom=binom.binomtest(ref_cnt, tot_cnt, 0.5)
sys.stdout.write('\t'.join([
k.split('_')[0],
k.split('_')[1],
hetSNV_dict[k].get('h1_pos','notLifted?'),
hetSNV_dict[k].get('h2_pos','notLifted?'),
hetSNV_dict[k]['r_a'],
hetSNV_dict[k]['a1'],
hetSNV_dict[k]['a2'],
str(basecnts['A']),
str(basecnts['C']),
str(basecnts['G']),
str(basecnts['T']),
str(basecnts['N']),
str(float(ref_cnt)/float(tot_cnt)),
str(tot_cnt),
str(pbinom),
basecnts_h1['warning'],
basecnts_h2['warning']
])+'\n')

with open(sys.argv[2], 'r') as in_ref:
for line in in_ref:

k = line.split('\t')[0] + '_' + line.split('\t')[2]

# when not lifted neither to h1 nor h2
if k not in hetSNV_dict: continue

basecnts_h1 = pileup_dict.get(hetSNV_dict[k].get('h1_pos',None), {'A':0, 'C':0, 'G':0, 'T':0, 'N':0, 'warning':'zero_cnt', 'h1_a':'.'})
basecnts_h2 = pileup_dict.get(hetSNV_dict[k].get('h2_pos',None), {'A':0, 'C':0, 'G':0, 'T':0, 'N':0, 'warning':'zero_cnt', 'h2_a':'.'})

basecnts = {
'A':basecnts_h1['A'] + basecnts_h2['A'],
'C':basecnts_h1['C'] + basecnts_h2['C'],
'G':basecnts_h1['G'] + basecnts_h2['G'],
'T':basecnts_h1['T'] + basecnts_h2['T'],
'N':basecnts_h1['N'] + basecnts_h2['N']
}

tot_cnt = basecnts.get(basecnts_h1['h1_a'], 0) + basecnts.get(basecnts_h2['h2_a'], 0)

if tot_cnt >= max(int(sys.argv[1]), 1):
# need at least one read to get at least one hap nt from mpileups
# using that to figure out what phasing was assigned by vcf2diploid
# if wasn't phased in the .vcf:

if (basecnts_h1['h1_a'], basecnts_h2['h2_a']) in [(hetSNV_dict[k]['h1_a'], hetSNV_dict[k]['h2_a']), (hetSNV_dict[k]['h1_a'], '.'), ('.', hetSNV_dict[k]['h2_a'])]:
basecnts_h1['h1_a'], basecnts_h2['h2_a'] = hetSNV_dict[k]['h1_a'], hetSNV_dict[k]['h2_a']

elif (basecnts_h2['h2_a'], basecnts_h1['h1_a']) in [(hetSNV_dict[k]['h1_a'], hetSNV_dict[k]['h2_a']), (hetSNV_dict[k]['h1_a'], '.'), ('.', hetSNV_dict[k]['h2_a'])]:
basecnts_h2['h2_a'], basecnts_h1['h1_a'] = hetSNV_dict[k]['h1_a'], hetSNV_dict[k]['h2_a']

else:
#sys.exit(sys.argv[0] + '\n confused with phasing in vcf vs mpileup / hap sequences\n' + line)
sys.stderr.write(sys.argv[0] + ' WARN: confused with phasing in .vcf vs .mpileup, skipping: ' + k + '\n')
rmvdhets_file.write('\t'.join(k.split('_') + ['vcf_vs_mpileup_alleles']) + '\n')
continue


ref_cnt = basecnts[hetSNV_dict[k]['r_a']]
if ref_cnt > tot_cnt:
rmvdhets_file.write(k.split('_')[0] + '\t' + k.split('_')[1] + '\tref_allele:' + str(ref_cnt) + '_h1:' + str(basecnts[hetSNV_dict[k]['h1_a']]) + '_h2:' + str(basecnts[hetSNV_dict[k]['h2_a']]) + '\n')
continue

pbinom=binom.binomtest(ref_cnt, tot_cnt, 0.5)
sys.stdout.write('\t'.join([
k.split('_')[0],
k.split('_')[1],
hetSNV_dict[k].get('h1_pos','notLifted?'),
hetSNV_dict[k].get('h2_pos','notLifted?'),
hetSNV_dict[k]['r_a'],
basecnts_h1['h1_a'],
basecnts_h2['h2_a'],
str(basecnts['A']),
str(basecnts['C']),
str(basecnts['G']),
str(basecnts['T']),
str(basecnts['N']),
str(float(ref_cnt)/float(tot_cnt)),
str(tot_cnt),
str(pbinom),
basecnts_h1['warning'],
basecnts_h2['warning']
])+'\n')

rmvdhets_file.close()
23 changes: 13 additions & 10 deletions read_pileup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def pileup_to_basecnts (filelist):
with open(mf,'r') as in_m:
for line in in_m:
warning = '.'
chr, crd, a, tot_pileup_cnt, seq, _ = line.split()
chrm, crd, a, tot_pileup_cnt, seq, _ = line.split()
a = a.upper()
basecnts={'A':0, 'C':0, 'G':0, 'T':0, 'N':0}

Expand Down Expand Up @@ -47,24 +47,27 @@ def pileup_to_basecnts (filelist):
for base in basecnts:
if base in seq.upper(): basecnts[base] = seq.upper().count(base)
if (basecnts['A'] + basecnts['C'] + basecnts['G'] + basecnts['T'] + basecnts['N'] + deleted_bases + spliced) != int(tot_pileup_cnt):
sys.exit('error1: unexpected base counts / symbols in pileup line\n'+line)
sys.exit(sys.argv[0] + '\nerror1: unexpected base counts / symbols in pileup line\n'+line)

if sum(basecnts.values()) > basecnts[a]: warning = str(sum(basecnts.values()) - basecnts[a])+'_other_alleles'
if max(basecnts.values()) > basecnts[a]: warning = 'hap_allele_not_largest_cnt'
if sum(basecnts.values()) == 0: warning = 'zero_cnt'

basecnts['warning'] = warning

# pileup_dict[chr+'_'+crd] = basecnts
# pileup_dict[chrm+'_'+crd] = basecnts
# to accomodate more than 2 sets of mpileup files combined for the same individual and assay
if chr+'_'+crd not in pileup_dict:
pileup_dict[chr+'_'+crd] = basecnts
if chrm+'_'+crd not in pileup_dict:
pileup_dict[chrm+'_'+crd] = basecnts
pileup_dict[chrm+'_'+crd]['warning'] += (','+basecnts['warning'])
pileup_dict[chrm+'_'+crd][chrm.split('_')[1] + '_a'] = a
else:
pileup_dict[chr+'_'+crd]['A'] += basecnts['A']
pileup_dict[chr+'_'+crd]['C'] += basecnts['C']
pileup_dict[chr+'_'+crd]['G'] += basecnts['G']
pileup_dict[chr+'_'+crd]['T'] += basecnts['T']
pileup_dict[chr+'_'+crd]['warning'] += (','+basecnts['warning'])
pileup_dict[chrm+'_'+crd]['A'] += basecnts['A']
pileup_dict[chrm+'_'+crd]['C'] += basecnts['C']
pileup_dict[chrm+'_'+crd]['G'] += basecnts['G']
pileup_dict[chrm+'_'+crd]['T'] += basecnts['T']
pileup_dict[chrm+'_'+crd]['warning'] += (','+basecnts['warning'])
pileup_dict[chrm+'_'+crd][chrm.split('_')[1] + '_a'] = a

return pileup_dict

0 comments on commit 3b696c7

Please sign in to comment.