get phasing info from mpileups

aberthel · Nov 20, 2017 · 3b696c7 · 3b696c7
1 parent 29e9ae9
commit 3b696c7
Show file tree

Hide file tree

Showing 6 changed files with 110 additions and 82 deletions.
diff --git a/PIPELINE.mk b/PIPELINE.mk
@@ -145,7 +145,8 @@ $(PREFIX)_raw_counts_ref_allele_ratios.pdf: $(PREFIX)_raw_counts.tsv
 
 # counts
 $(PREFIX)_raw_counts.tsv: $(PREFIX)_h1_uniqreads.mpileup $(PREFIX)_h2_uniqreads.mpileup
-	python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
+	python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_ref.bed \
+	$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
 	$(PREFIX)_discarded_HetSNVs.tsv \
 	$(PREFIX)_h1_uniqreads.mpileup $(PREFIX)_h2_uniqreads.mpileup > $@
 

diff --git a/PIPELINE_aggregated_counts.mk b/PIPELINE_aggregated_counts.mk
@@ -79,7 +79,8 @@ $(PREFIX)_raw_counts_ref_allele_ratios.pdf: $(PREFIX)_raw_counts.tsv
 
 # counts
 $(PREFIX)_raw_counts.tsv: $(INPUT_UNIQ_READS_PILEUP_FILES)
-	python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
+	python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_ref.bed \
+	$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
 	$(PREFIX)_discarded_HetSNVs.tsv \
 	$(INPUT_UNIQ_READS_PILEUP_FILES) > $@
 

diff --git a/filter_phase_warnings.py b/filter_phase_warnings.py
@@ -8,15 +8,15 @@ def check_subset(l1,l2):
 with open(sys.argv[1], 'a') as rm_hetSNV_f:
     for line in sys.stdin:
         if not line.startswith('#'):
-            (chr,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
+            (chrm,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
                 cA,cC,cG,cT,cN,ref_allele_ratio,sum_ref_n_alt_cnts,p_binom,
                 warning_h1,warning_h2,cnv) = line.split()
             if check_subset(warning_h1.split(','),['.','zero_cnt']) and check_subset(warning_h2.split(','),['.','zero_cnt']) and h1_coord != 'notLifted?' and h2_coord != 'notLifted?': 
                 sys.stdout.write('\t'.join([
-                                               chr,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
+                                               chrm,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
                                                cA,cC,cG,cT,cN,ref_allele_ratio,sum_ref_n_alt_cnts,p_binom, cnv
                                            ])+'\n')
             else: 
-                rm_hetSNV_f.write('\t'.join([chr, ref_coord, 
+                rm_hetSNV_f.write('\t'.join([chrm, ref_coord, 
                                     '_'.join([cA,cC,cG,cT,'','um',warning_h1,warning_h2])])+'\n')
         else: sys.stdout.write('\t'.join(line.split()[:15] + [line.split()[-1]])+'\n')
diff --git a/get_hetSNV_bed.py b/get_hetSNV_bed.py
@@ -22,10 +22,6 @@
     if not line.startswith('#'):
         CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT = line.split('\t')[:9]
         if FILTER=='PASS': 
-            # require all PASS records to be phased: if some aren't, vcf2diploid will phase them randomly,
-            # but to keep track which allele goes to which haplotype, this (random phasing) needs to be done before vcf2diploid
-            # todo: incorporate this step into makePersonalGenome.mk 
-            if '/' in line.split('\t')[sample_col_idx].split(':')[0]: sys.exit('\n\n' + sys.argv[0] + '\n Error: the variant in the following line is not phased\n' + line)
             GTl=line.split('\t')[sample_col_idx].split(':')[0].replace('/','|').split('|')
             al=[REF]+ALT.split(',')
             if al[int(GTl[0])]!=al[int(GTl[1])] and len(al[int(GTl[0])])==len(al[int(GTl[1])])==len(REF)==1:

diff --git a/pileup2counts.py b/pileup2counts.py
@@ -7,51 +7,58 @@
 
 
 hetSNV_dict = defaultdict(dict)
-hetSNV_list = []
 bases = set(['A', 'C', 'G', 'T', 'N'])
 discarded = set([])
 
-rmvdhets_file = open(sys.argv[4],'w')
-
+rmvdhets_file = open(sys.argv[5],'w')
 
+duplicate_posns = {}
 # read the h1 and h2 coords
-with open(sys.argv[2],'r') as in_h1:
+with open(sys.argv[3],'r') as in_h1:
     for line in in_h1:
         line = line.strip()
         h_chr, _, h_crd, ref_info = line.split('\t')
         #h_cr, h = h_chr.split('_')
         r_chr, r_crd, r_a, a1, a2 = ref_info.split('_')
+        if 'h1_pos' in hetSNV_dict[r_chr+'_'+r_crd] or r_chr+'_'+r_crd in duplicate_posns:
+            duplicate_posns[r_chr+'_'+r_crd] = None
+            continue
         if a1 not in bases or a2 not in bases: 
             rmvdhets_file.write('\t'.join([r_chr, r_crd, ref_info]) + '\n')
             discarded.add(r_chr + '_' + r_crd)
             continue
-        #hetSNV_dict[r_chr+'_'+r_crd][h]={'crd':h_crd}
-        hetSNV_dict[r_chr+'_'+r_crd]['h1_pos'] = h_chr+'_'+h_crd
+        hetSNV_dict[r_chr+'_'+r_crd]['h1_pos'] = h_chr + '_' + h_crd
         hetSNV_dict[r_chr+'_'+r_crd]['r_a'] = r_a
-        hetSNV_dict[r_chr+'_'+r_crd]['a1'] = a1
-        hetSNV_dict[r_chr+'_'+r_crd]['a2'] = a2
+        hetSNV_dict[r_chr+'_'+r_crd]['h1_a'] = a1
+        hetSNV_dict[r_chr+'_'+r_crd]['h2_a'] = a2
 
-with open(sys.argv[3],'r') as in_h2:
+with open(sys.argv[4],'r') as in_h2:
     for line in in_h2:
-        line=line.strip()
+        line = line.strip()
         h_chr, _, h_crd, ref_info = line.split('\t')
         #h_cr, h = h_chr.split('_')
         r_chr, r_crd, r_a, a1, a2 = ref_info.split('_')
+        if 'h2_pos' in hetSNV_dict[r_chr+'_'+r_crd] or r_chr+'_'+r_crd in duplicate_posns:
+            duplicate_posns[r_chr+'_'+r_crd] = None
+            continue
         if a1 not in bases or a2 not in bases: 
             if r_chr + '_' + r_crd not in discarded: rmvdhets_file.write('\t'.join([r_chr, r_crd, ref_info]) + '\n')
             continue
-        #hetSNV_dict[r_chr+'_'+r_crd][h]={'crd':h_crd}
-        hetSNV_dict[r_chr+'_'+r_crd]['h2_pos'] = h_chr+'_'+h_crd
+        hetSNV_dict[r_chr+'_'+r_crd]['h2_pos'] = h_chr + '_' + h_crd
         hetSNV_dict[r_chr+'_'+r_crd]['r_a'] = r_a
-        hetSNV_dict[r_chr+'_'+r_crd]['a1'] = a1
-        hetSNV_dict[r_chr+'_'+r_crd]['a2'] = a2
-        hetSNV_list.append(r_chr+'_'+r_crd)
-
-
+        hetSNV_dict[r_chr+'_'+r_crd]['h1_a'] = a1  # mostly redundant, but needed for cases, where
+        hetSNV_dict[r_chr+'_'+r_crd]['h2_a'] = a2  # an snv couldn't be lifted over for one of the haplotypes
 
+for i in duplicate_posns:
+    del hetSNV_dict[i]
+    sys.stderr.write('WARN: same coords of two SNVs: ' + ' '.join(i.split('_')) + ' ? Removing.\n')
+    rmvdhets_file.write('\t'.join(i.split('_') + ['duplicate_pos']) + '\n')
+    # this doesn't catch homozygous SNVs though, when duplicated with a het - vcf2diploid may actually incorporate either? version
+    # thus, the input vcf should not have duplicated snvs
+    # todo: add a check when generating ref.bed
 
 # read the pileups
-pileup_dict = read_pileup.pileup_to_basecnts(sys.argv[5:])
+pileup_dict = read_pileup.pileup_to_basecnts(sys.argv[6:])
 
 sys.stdout.write('\t'.join([
     '#chr',
@@ -73,50 +80,70 @@
     'warn_h2'
 ])+'\n')
 
-#for k in hetSNV_dict:
-# to keep the original order
-for k in hetSNV_list:
-
-    basecnts_h1 = pileup_dict.get(hetSNV_dict[k].get('h1_pos',None), {'A':0, 'C':0, 'G':0, 'T':0, 'N':0, 'warning':'zero_cnt'})
-    basecnts_h2 = pileup_dict.get(hetSNV_dict[k].get('h2_pos',None), {'A':0, 'C':0, 'G':0, 'T':0, 'N':0, 'warning':'zero_cnt'})
-
-    basecnts = {
-        'A':basecnts_h1['A'] + basecnts_h2['A'],
-        'C':basecnts_h1['C'] + basecnts_h2['C'],
-        'G':basecnts_h1['G'] + basecnts_h2['G'],
-        'T':basecnts_h1['T'] + basecnts_h2['T'],
-        'N':basecnts_h1['N'] + basecnts_h2['N']
-    }
-
-    tot_cnt = basecnts[hetSNV_dict[k]['a1']] + basecnts[hetSNV_dict[k]['a2']]
-
-    if tot_cnt >= int(sys.argv[1]):
-
-        ref_cnt = basecnts[hetSNV_dict[k]['r_a']]
-	if ref_cnt > tot_cnt:
-		rmvdhets_file.write(k.split('_')[0] + '\t' + k.split('_')[1] + '\tref_allele:' + str(ref_cnt) + '_h1:' + str(basecnts[hetSNV_dict[k]['a1']]) + '_h2:' + str(basecnts[hetSNV_dict[k]['a2']]) + '\n')
-		continue
-	#print basecnts
-	#print k +'\t' + str(ref_cnt) + '\t' + str(tot_cnt) + '\n'
-        pbinom=binom.binomtest(ref_cnt, tot_cnt, 0.5)
-        sys.stdout.write('\t'.join([    
-            k.split('_')[0],
-            k.split('_')[1],
-            hetSNV_dict[k].get('h1_pos','notLifted?'),
-            hetSNV_dict[k].get('h2_pos','notLifted?'),
-            hetSNV_dict[k]['r_a'],
-            hetSNV_dict[k]['a1'],
-            hetSNV_dict[k]['a2'],
-            str(basecnts['A']),        
-            str(basecnts['C']),        
-            str(basecnts['G']),        
-            str(basecnts['T']),
-            str(basecnts['N']),
-            str(float(ref_cnt)/float(tot_cnt)),
-            str(tot_cnt),
-            str(pbinom),
-            basecnts_h1['warning'],
-            basecnts_h2['warning']
-        ])+'\n')
+
+with open(sys.argv[2], 'r') as in_ref:
+    for line in in_ref:
+
+        k = line.split('\t')[0] + '_' + line.split('\t')[2]
+
+        # when not lifted neither to h1 nor h2
+        if k not in hetSNV_dict: continue
+
+        basecnts_h1 = pileup_dict.get(hetSNV_dict[k].get('h1_pos',None), {'A':0, 'C':0, 'G':0, 'T':0, 'N':0, 'warning':'zero_cnt', 'h1_a':'.'})
+        basecnts_h2 = pileup_dict.get(hetSNV_dict[k].get('h2_pos',None), {'A':0, 'C':0, 'G':0, 'T':0, 'N':0, 'warning':'zero_cnt', 'h2_a':'.'})
+
+        basecnts = {
+            'A':basecnts_h1['A'] + basecnts_h2['A'],
+            'C':basecnts_h1['C'] + basecnts_h2['C'],
+            'G':basecnts_h1['G'] + basecnts_h2['G'],
+            'T':basecnts_h1['T'] + basecnts_h2['T'],
+            'N':basecnts_h1['N'] + basecnts_h2['N']
+        }
+
+        tot_cnt = basecnts.get(basecnts_h1['h1_a'], 0) + basecnts.get(basecnts_h2['h2_a'], 0)
+
+        if tot_cnt >= max(int(sys.argv[1]), 1):
+        # need at least one read to get at least one hap nt from mpileups
+        # using that to figure out what phasing was assigned by vcf2diploid
+        # if wasn't phased in the .vcf:
+
+            if   (basecnts_h1['h1_a'], basecnts_h2['h2_a']) in [(hetSNV_dict[k]['h1_a'], hetSNV_dict[k]['h2_a']), (hetSNV_dict[k]['h1_a'], '.'), ('.', hetSNV_dict[k]['h2_a'])]:
+                basecnts_h1['h1_a'], basecnts_h2['h2_a'] = hetSNV_dict[k]['h1_a'], hetSNV_dict[k]['h2_a']
+
+            elif (basecnts_h2['h2_a'], basecnts_h1['h1_a']) in [(hetSNV_dict[k]['h1_a'], hetSNV_dict[k]['h2_a']), (hetSNV_dict[k]['h1_a'], '.'), ('.', hetSNV_dict[k]['h2_a'])]:
+                basecnts_h2['h2_a'], basecnts_h1['h1_a'] = hetSNV_dict[k]['h1_a'], hetSNV_dict[k]['h2_a']
+
+            else:
+		#sys.exit(sys.argv[0] + '\n confused with phasing in vcf vs mpileup / hap sequences\n' + line)
+		sys.stderr.write(sys.argv[0] + '   WARN: confused with phasing in .vcf vs .mpileup, skipping: ' + k + '\n')
+                rmvdhets_file.write('\t'.join(k.split('_') + ['vcf_vs_mpileup_alleles']) + '\n')
+                continue
+
+
+            ref_cnt = basecnts[hetSNV_dict[k]['r_a']]
+            if ref_cnt > tot_cnt:
+                rmvdhets_file.write(k.split('_')[0] + '\t' + k.split('_')[1] + '\tref_allele:' + str(ref_cnt) + '_h1:' + str(basecnts[hetSNV_dict[k]['h1_a']]) + '_h2:' + str(basecnts[hetSNV_dict[k]['h2_a']]) + '\n')
+                continue
+
+            pbinom=binom.binomtest(ref_cnt, tot_cnt, 0.5)
+            sys.stdout.write('\t'.join([    
+                k.split('_')[0],
+                k.split('_')[1],
+                hetSNV_dict[k].get('h1_pos','notLifted?'),
+                hetSNV_dict[k].get('h2_pos','notLifted?'),
+                hetSNV_dict[k]['r_a'],
+                basecnts_h1['h1_a'],
+                basecnts_h2['h2_a'],
+                str(basecnts['A']),        
+                str(basecnts['C']),        
+                str(basecnts['G']),        
+                str(basecnts['T']),
+                str(basecnts['N']),
+                str(float(ref_cnt)/float(tot_cnt)),
+                str(tot_cnt),
+                str(pbinom),
+                basecnts_h1['warning'],
+                basecnts_h2['warning']
+            ])+'\n')
 
 rmvdhets_file.close()
diff --git a/read_pileup.py b/read_pileup.py
@@ -16,7 +16,7 @@ def pileup_to_basecnts (filelist):
         with open(mf,'r') as in_m:
             for line in in_m:
                 warning = '.'
-                chr, crd, a, tot_pileup_cnt, seq, _ = line.split()
+                chrm, crd, a, tot_pileup_cnt, seq, _ = line.split()
                 a = a.upper()
                 basecnts={'A':0, 'C':0, 'G':0, 'T':0, 'N':0}
 
@@ -47,24 +47,27 @@ def pileup_to_basecnts (filelist):
                 for base in basecnts:
                     if base in seq.upper(): basecnts[base] = seq.upper().count(base)
                 if (basecnts['A'] + basecnts['C'] + basecnts['G'] + basecnts['T'] + basecnts['N'] + deleted_bases + spliced) != int(tot_pileup_cnt):
-                    sys.exit('error1: unexpected base counts / symbols in pileup line\n'+line)
+                    sys.exit(sys.argv[0] + '\nerror1: unexpected base counts / symbols in pileup line\n'+line)
 
                 if sum(basecnts.values()) > basecnts[a]: warning = str(sum(basecnts.values()) - basecnts[a])+'_other_alleles'                
                 if max(basecnts.values()) > basecnts[a]: warning = 'hap_allele_not_largest_cnt'
                 if sum(basecnts.values()) == 0: warning = 'zero_cnt'
 
                 basecnts['warning'] = warning
 
-                # pileup_dict[chr+'_'+crd] = basecnts
+                # pileup_dict[chrm+'_'+crd] = basecnts
                 # to accomodate more than 2 sets of mpileup files combined for the same individual and assay
-                if chr+'_'+crd not in pileup_dict:
-                    pileup_dict[chr+'_'+crd] = basecnts
+                if chrm+'_'+crd not in pileup_dict:
+                    pileup_dict[chrm+'_'+crd] = basecnts
+                    pileup_dict[chrm+'_'+crd]['warning'] += (','+basecnts['warning'])
+                    pileup_dict[chrm+'_'+crd][chrm.split('_')[1] + '_a'] = a
                 else:
-                    pileup_dict[chr+'_'+crd]['A'] += basecnts['A']
-                    pileup_dict[chr+'_'+crd]['C'] += basecnts['C']
-                    pileup_dict[chr+'_'+crd]['G'] += basecnts['G']
-                    pileup_dict[chr+'_'+crd]['T'] += basecnts['T']
-                    pileup_dict[chr+'_'+crd]['warning'] += (','+basecnts['warning'])
+                    pileup_dict[chrm+'_'+crd]['A'] += basecnts['A']
+                    pileup_dict[chrm+'_'+crd]['C'] += basecnts['C']
+                    pileup_dict[chrm+'_'+crd]['G'] += basecnts['G']
+                    pileup_dict[chrm+'_'+crd]['T'] += basecnts['T']
+                    pileup_dict[chrm+'_'+crd]['warning'] += (','+basecnts['warning'])
+                    pileup_dict[chrm+'_'+crd][chrm.split('_')[1] + '_a'] = a
 
     return pileup_dict