Skip to content

Commit

Permalink
h1/h2 to hap1/hap2
Browse files Browse the repository at this point in the history
  • Loading branch information
trgaleev committed Jul 20, 2018
1 parent f89aa46 commit 5294aa2
Show file tree
Hide file tree
Showing 20 changed files with 224 additions and 224 deletions.
2 changes: 1 addition & 1 deletion FalsePos.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def simpval2(cnt,bm):
with open(ifile, 'r') as inf:
inf.readline()
for line in inf:
(chr,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,cA,cC,cG,cT,cN,
(chr,ref_coord,hap1_coord,hap2_coord,ref_allele,hap1_allele,hap2_allele,cA,cC,cG,cT,cN,
ref_allele_ratio,sum_ref_n_alt_cnts,p_binom,cnv,mmap_log) = line.split('\t')
act_pvals_list.append(float(p_binom))
counts = [int(e) for e in [cA, cC, cG, cT]]
Expand Down
4 changes: 2 additions & 2 deletions FalsePos_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,11 @@ def simpval2(cnt,bm):
with open(ifile, 'r') as inf:
inf.readline()
for line in inf:
region,h1_count,h2_count,h1_allele_ratio,p_binom,snv_count,snv_coords = line.split('\t')
region,hap1_count,hap2_count,hap1_allele_ratio,p_binom,snv_count,snv_coords = line.split('\t')
act_pvals_list.append(float(p_binom))
#counts = [int(e) for e in [cA, cC, cG, cT]]
#counts = sorted(counts, reverse=True)[:2]
counts = [int(h1_count), int(h2_count)]
counts = [int(hap1_count), int(hap2_count)]
cnt_sums_list.append(sum(counts))
act_pvals = numpy.array(act_pvals_list)
cnt_sums = numpy.array(cnt_sums_list)
Expand Down
38 changes: 19 additions & 19 deletions PIPELINE.mk
Original file line number Diff line number Diff line change
Expand Up @@ -156,42 +156,42 @@ $(PREFIX)_ref_allele_ratios.filtered_counts.chrs1-22$(KEEP_CHR).pdf: $(PREFIX)_f
# in non-autosomal chr;
# and sites with seemingly misphased/miscalled nearby variants
# filter/adjust sites imbalanced likely due to unaccounted multi-mapping reads
$(PREFIX)_filtered_counts.chrs1-22$(KEEP_CHR).tsv: $(PREFIX)_raw_counts.tsv $(PREFIX)_h1_mmapreads.mpileup $(PREFIX)_h2_mmapreads.mpileup
$(PREFIX)_filtered_counts.chrs1-22$(KEEP_CHR).tsv: $(PREFIX)_raw_counts.tsv $(PREFIX)_hap1_mmapreads.mpileup $(PREFIX)_hap2_mmapreads.mpileup
cat $< | \
python $(PL)/filter_cnv_sites.py $(PREFIX)_discarded_HetSNVs.tsv $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_rd.tab | \
python $(PL)/filter_non-autosomal_chr.py $(PREFIX)_discarded_HetSNVs.tsv $(KEEP_CHR) | \
python $(PL)/filter_phase_warnings.py $(PREFIX)_discarded_HetSNVs.tsv | \
python $(PL)/filter_sites_w_mmaps.py $(AMB_MODE) $(PREFIX)_discarded_HetSNVs.tsv $(PREFIX)_sites_w_mmaps.log \
$(PREFIX)_h1_mmapreads.mpileup $(PREFIX)_h2_mmapreads.mpileup > $@
$(PREFIX)_hap1_mmapreads.mpileup $(PREFIX)_hap2_mmapreads.mpileup > $@

# allelic ratio distrs
$(PREFIX)_ref_allele_ratios.raw_counts.pdf: $(PREFIX)_raw_counts.tsv
cat $< | python $(PL)/plot_AllelicRatio_distribution.py $(PREFIX)_raw_counts

# counts
$(PREFIX)_raw_counts.tsv: $(PREFIX)_h1_uniqreads.mpileup $(PREFIX)_h2_uniqreads.mpileup
$(PREFIX)_raw_counts.tsv: $(PREFIX)_hap1_uniqreads.mpileup $(PREFIX)_hap2_uniqreads.mpileup
python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_ref.bed \
$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed \
$(PREFIX)_discarded_HetSNVs.tsv \
$(PREFIX)_h1_uniqreads.mpileup $(PREFIX)_h2_uniqreads.mpileup > $@
$(PREFIX)_hap1_uniqreads.mpileup $(PREFIX)_hap2_uniqreads.mpileup > $@


# pileups
$(PREFIX)_h1_mmapreads.mpileup: $(HetSNV_MMAPALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_h1.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed --output $@
$(PREFIX)_hap1_mmapreads.mpileup: $(HetSNV_MMAPALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hap1.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed --output $@

$(PREFIX)_h2_mmapreads.mpileup: $(HetSNV_MMAPALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_h2.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed --output $@
$(PREFIX)_hap2_mmapreads.mpileup: $(HetSNV_MMAPALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hap2.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed --output $@

$(PREFIX)_h1_uniqreads.mpileup: $(HetSNV_UNIQALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_h1.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed --output $@
$(PREFIX)_hap1_uniqreads.mpileup: $(HetSNV_UNIQALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hap1.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed --output $@

$(PREFIX)_h2_uniqreads.mpileup: $(HetSNV_UNIQALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_h2.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed --output $@
$(PREFIX)_hap2_uniqreads.mpileup: $(HetSNV_UNIQALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hap2.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed --output $@

# by default --ff was also filtering-out some other - secondary? reads: [UNMAP,SECONDARY,QCFAIL,DUP], leaving only UNMAP for now

Expand All @@ -200,15 +200,15 @@ $(PREFIX)_h2_uniqreads.mpileup: $(HetSNV_UNIQALNS_FILENAME)

# non-uniq alns over hetSNVs:
$(PREFIX)_$(ALIGNMENT_MODE)-params_crdsorted_mmapreads_over_hetSNVs.bam: $(FINAL_ALIGNMENT_FILENAME)
cat $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed | \
cat $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed | \
$(SAMTOOLS) view -h -L - $< | awk '$$5!="255" {print $$0}' | \
$(SAMTOOLS) view -b - > $@
$(SAMTOOLS) index $@
$(SAMTOOLS) flagstat $@ > $@.stat

# uniq alns over hetSNVs:
$(PREFIX)_$(ALIGNMENT_MODE)-params_crdsorted_uniqreads_over_hetSNVs.bam: $(FINAL_ALIGNMENT_FILENAME)
cat $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed | \
cat $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed | \
$(SAMTOOLS) view -h -q 255 -L - $< | \
$(SAMTOOLS) view -b - > $@
$(SAMTOOLS) index $@
Expand Down
34 changes: 17 additions & 17 deletions PIPELINE_aggregate_over_genomic_regions.mk
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,16 @@ $(info $(empty_string))
######################


all: $(PREFIX)_region_h1_ratios.filtered_counts.chrs1-22$(KEEP_CHR).pdf $(PREFIX)_region_h1_ratios.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.pdf $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
all: $(PREFIX)_region_hap1_ratios.filtered_counts.chrs1-22$(KEEP_CHR).pdf $(PREFIX)_region_hap1_ratios.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.pdf $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv

# todo
# this seems to work, but the way it deals with paths, filenames, etc needs to be cleaned up
# here it is even worse
# fix columns
$(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
awk '{print $$1"\t"$$2"\t"$$3"\t"$$2"\t"$$3"\t0\t0\t"$$2+$$3"\t"$$4"\t1.0"}' $< | \
sed 's/h1_count\th2_count\t0\t0\t0/cA\tcC\tcG\tcT\tsum_ref_n_alt_cnts/' | \
sed 's/h1_allele_ratio\t1.0/ref_allele_ratio\tcnv/' > $(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-total.$(Cntthresh_min)-lower.tsv.tmp
sed 's/hap1_count\thap2_count\t0\t0\t0/cA\tcC\tcG\tcT\tsum_ref_n_alt_cnts/' | \
sed 's/hap1_allele_ratio\t1.0/ref_allele_ratio\tcnv/' > $(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-total.$(Cntthresh_min)-lower.tsv.tmp
Rscript $(PL)/alleledb_calcOverdispersion.R \
$(PREFIX)_region_filtered_counts_min.$(Cntthresh_tot)-total.$(Cntthresh_min)-lower.tsv.tmp \
$(PREFIX)_region_FDR-$(FDR_CUTOFF).betabinomial.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min
Expand All @@ -74,38 +74,38 @@ $(PREFIX)_interesting_regions.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntt
cat $< | python $(PL)/filter_regions_by_pval.py $(PREFIX)_region_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt > $@

# allelic ratio distrs
$(PREFIX)_region_h1_ratios.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.pdf: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
cat $< | python $(PL)/plot_AllelicRatio_distribution.py $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt h1_allele_ratio
$(PREFIX)_region_hap1_ratios.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.pdf: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
cat $< | python $(PL)/plot_AllelicRatio_distribution.py $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt hap1_allele_ratio

$(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).tsv
cat $< | python $(PL)/filter_regions_by_counts.py $(Cntthresh_tot) $(Cntthresh_min) > $@

# allelic ratio distrs
$(PREFIX)_region_h1_ratios.filtered_counts.chrs1-22$(KEEP_CHR).pdf: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).tsv
cat $< | python $(PL)/plot_AllelicRatio_distribution.py $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR) h1_allele_ratio
$(PREFIX)_region_hap1_ratios.filtered_counts.chrs1-22$(KEEP_CHR).pdf: $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).tsv
cat $< | python $(PL)/plot_AllelicRatio_distribution.py $(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR) hap1_allele_ratio

$(PREFIX)_region_filtered_counts.chrs1-22$(KEEP_CHR).tsv: $(PREFIX)_region_raw_counts_uniq.tsv $(PREFIX)_region_raw_counts_mmap.tsv
awk '{print $$1"\t"$$2"\t"$$3"\t"$$4"\t"$$5"\t"$$6}' $< | \
python $(PL)/filter_regions_non-autosomal_chr.py $(REGIONS_FILE) $(PREFIX)_discarded_regions.tsv $(KEEP_CHR) | \
python $(PL)/filter_regions_w_mmaps.py $(AMB_MODE) \
$(PREFIX)_discarded_regions.tsv $(PREFIX)_regions_w_mmaps.log $(PREFIX)_region_raw_counts_mmap.tsv > $@

$(PREFIX)_region_raw_counts_mmap.tsv: hets_regions_h2.bed
cat hets_regions_h1.bed hets_regions_h2.bed | $(BEDTOOLS_intersectBed) -a stdin -b $(MMAP_ALN_FILES) -split -wb -bed | \
python $(PL)/intersect2counts.py mmap hets_regions_h1.bed > $@
$(PREFIX)_region_raw_counts_mmap.tsv: hets_regions_hap2.bed
cat hets_regions_hap1.bed hets_regions_hap2.bed | $(BEDTOOLS_intersectBed) -a stdin -b $(MMAP_ALN_FILES) -split -wb -bed | \
python $(PL)/intersect2counts.py mmap hets_regions_hap1.bed > $@

# allowing multiple UNIQ_ALN_FILES files, but probably slower (since became -b instead of -a), also changes to intersect2counts.py to accomodate output file format
#$(PREFIX)_region_counts.tsv: hets_regions_h1.bed hets_regions_h2.bed
# $(BEDTOOLS_intersectBed) -a $(UNIQ_ALN_FILES) -b hets_regions_h1.bed hets_regions_h2.bed -split -wb -bed | \
#$(PREFIX)_region_counts.tsv: hets_regions_hap1.bed hets_regions_hap2.bed
# $(BEDTOOLS_intersectBed) -a $(UNIQ_ALN_FILES) -b hets_regions_hap1.bed hets_regions_hap2.bed -split -wb -bed | \
# python $(PL)/intersect2counts.py > $@


# -split -- not counting reads that splice over a het
$(PREFIX)_region_raw_counts_uniq.tsv: hets_regions_h2.bed
cat hets_regions_h1.bed hets_regions_h2.bed | $(BEDTOOLS_intersectBed) -a stdin -b $(UNIQ_ALN_FILES) -split -wb -bed | \
python $(PL)/intersect2counts.py uniq hets_regions_h1.bed > $@
$(PREFIX)_region_raw_counts_uniq.tsv: hets_regions_hap2.bed
cat hets_regions_hap1.bed hets_regions_hap2.bed | $(BEDTOOLS_intersectBed) -a stdin -b $(UNIQ_ALN_FILES) -split -wb -bed | \
python $(PL)/intersect2counts.py uniq hets_regions_hap1.bed > $@

hets_regions_h2.bed : $(REGIONS_FILE) $(COUNTS_FILE)
hets_regions_hap2.bed : $(REGIONS_FILE) $(COUNTS_FILE)
grep -v '^#chr' $(COUNTS_FILE) | awk '{print $$1"\t"$$2-1"\t"$$2}' | $(BEDTOOLS_intersectBed) -a $(REGIONS_FILE) -b stdin | \
python $(PL)/refbed2hapcoords.py $(COUNTS_FILE) hets_regions_h1.bed hets_regions_h2.bed
python $(PL)/refbed2hapcoords.py $(COUNTS_FILE) hets_regions_hap1.bed hets_regions_hap2.bed

6 changes: 3 additions & 3 deletions PIPELINE_aggregated_counts.mk
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ $(info $(empty_string))
### PIPELINE START ###
######################

all: $(PREFIX)_ref_allele_ratios.raw_counts.pdf $(PREFIX)_ref_allele_ratios.filtered_counts.chrs1-22$(KEEP_CHR).pdf $(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv $(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
#all: $(PREFIX)_ref_allele_ratios.raw_counts.pdf $(PREFIX)_ref_allele_ratios.filtered_counts.chrs1-22$(KEEP_CHR).pdf $(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv $(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv

#all: $(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
all: $(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv

# this seems to work, but the way it deals with paths, filenames, etc needs to be cleaned up
$(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
Expand Down Expand Up @@ -85,7 +85,7 @@ $(PREFIX)_ref_allele_ratios.raw_counts.pdf: $(PREFIX)_raw_counts.tsv
# counts
$(PREFIX)_raw_counts.tsv: $(INPUT_UNIQ_READS_PILEUP_FILES)
python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_ref.bed \
$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_h2.bed \
$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed \
$(PREFIX)_discarded_HetSNVs.tsv \
$(INPUT_UNIQ_READS_PILEUP_FILES) > $@

Expand Down
4 changes: 2 additions & 2 deletions filter_by_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
th_each = int(sys.argv[2])
sys.stdout.write(sys.stdin.readline())
for line in sys.stdin:
(chrm, ref_coord, h1_coord, h2_coord, ref_allele, h1_allele, h2_allele,
(chrm, ref_coord, hap1_coord, hap2_coord, ref_allele, hap1_allele, hap2_allele,
cA, cC, cG, cT, cN, ref_allele_ratio, sum_ref_n_alt_cnts, p_binom, cnv, mmap_log) = line.split('\t')
counts = {'A': int(cA), 'C': int(cC), 'G':int(cG), 'T':int(cT), 'N':int(cN)}
if counts[h1_allele] + counts[h2_allele] >= th_tot and counts[h1_allele] >= th_each and counts[h2_allele] >= th_each:
if counts[hap1_allele] + counts[hap2_allele] >= th_tot and counts[hap1_allele] >= th_each and counts[hap2_allele] >= th_each:
sys.stdout.write(line)
6 changes: 3 additions & 3 deletions filter_by_pval.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
if line.startswith('Target'): pth = float(line.split()[-1])

for line in sys.stdin:
(chr,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,cA,cC,cG,cT,cN,
(chr,ref_coord,hap1_coord,hap2_coord,ref_allele,hap1_allele,hap2_allele,cA,cC,cG,cT,cN,
ref_allele_ratio,sum_ref_n_alt_cnts,p_binom,cnv,mmap_log) = line.split('\t')
outcolmns = '\t'.join([
chr,
ref_coord,
ref_allele,
h1_allele,
h2_allele,
hap1_allele,
hap2_allele,
cA, cC, cG, cT, cN,
ref_allele_ratio,
p_binom
Expand Down
4 changes: 2 additions & 2 deletions filter_cnv_sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
with open(sys.argv[1], 'a') as rm_hetSNV_f:
for line in sys.stdin:
if not line.startswith('#'):
(chrm,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
(chrm,ref_coord,hap1_coord,hap2_coord,ref_allele,hap1_allele,hap2_allele,
cA,cC,cG,cT,cN,ref_allele_ratio,sum_ref_n_alt_cnts,p_binom,
warning_h1,warning_h2) = line.split()
warning_hap1,warning_hap2) = line.split()
rd = float(cnv_dict[chrm+"_"+ref_coord])
if 0.5 <= rd <= 1.5:
sys.stdout.write(line.strip()+'\t'+str(rd)+'\n')
Expand Down
4 changes: 2 additions & 2 deletions filter_non-autosomal_chr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
for line in sys.stdin:

if not line.startswith('#'):
(chrm,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
(chrm,ref_coord,hap1_coord,hap2_coord,ref_allele,hap1_allele,hap2_allele,
cA,cC,cG,cT,cN,ref_allele_ratio,sum_ref_n_alt_cnts,p_binom,
warning_h1,warning_h2,cnv) = line.split('\t')
warning_hap1,warning_hap2,cnv) = line.split('\t')

if chrm in keep_chrs: sys.stdout.write(line)
else: rm_hetSNV_f.write('\t'.join([chrm,ref_coord,
Expand Down
10 changes: 5 additions & 5 deletions filter_phase_warnings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ def check_subset(l1,l2):
with open(sys.argv[1], 'a') as rm_hetSNV_f:
for line in sys.stdin:
if not line.startswith('#'):
(chrm,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
(chrm,ref_coord,hap1_coord,hap2_coord,ref_allele,hap1_allele,hap2_allele,
cA,cC,cG,cT,cN,ref_allele_ratio,sum_ref_n_alt_cnts,p_binom,
warning_h1,warning_h2,cnv) = line.split()
if check_subset(warning_h1.split(','),['.','zero_cnt']) and check_subset(warning_h2.split(','),['.','zero_cnt']) and h1_coord != 'notLifted?' and h2_coord != 'notLifted?':
warning_hap1,warning_hap2,cnv) = line.split()
if check_subset(warning_hap1.split(','),['.','zero_cnt']) and check_subset(warning_hap2.split(','),['.','zero_cnt']) and hap1_coord != 'notLifted?' and hap2_coord != 'notLifted?':
sys.stdout.write('\t'.join([
chrm,ref_coord,h1_coord,h2_coord,ref_allele,h1_allele,h2_allele,
chrm,ref_coord,hap1_coord,hap2_coord,ref_allele,hap1_allele,hap2_allele,
cA,cC,cG,cT,cN,ref_allele_ratio,sum_ref_n_alt_cnts,p_binom, cnv
])+'\n')
else:
rm_hetSNV_f.write('\t'.join([chrm, ref_coord,
'_'.join([cA,cC,cG,cT,'','um',warning_h1,warning_h2])])+'\n')
'_'.join([cA,cC,cG,cT,'','um',warning_hap1,warning_hap2])])+'\n')
else: sys.stdout.write('\t'.join(line.split()[:15] + [line.split()[-1]])+'\n')
4 changes: 2 additions & 2 deletions filter_regions_by_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@
sys.stdout.write(sys.stdin.readline())
for line in sys.stdin:

region,h1_count,h2_count,h1_allele_ratio,p_binom,snv_count,mmap_log = line.split('\t')
if int(h1_count) + int(h2_count) >= th_tot and int(h1_count) >= th_each and int(h2_count) >= th_each:
region,hap1_count,hap2_count,hap1_allele_ratio,p_binom,snv_count,mmap_log = line.split('\t')
if int(hap1_count) + int(hap2_count) >= th_tot and int(hap1_count) >= th_each and int(hap2_count) >= th_each:
sys.stdout.write(line)
4 changes: 2 additions & 2 deletions filter_regions_by_pval.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
sys.stdout.write(sys.stdin.readline())
for line in sys.stdin:

region, h1_count, h2_count, h1_allele_ratio, p_binom, snv_count, mmap_log = line.split('\t')
outcolmns = '\t'.join([region, h1_count, h2_count, h1_allele_ratio, p_binom, snv_count])
region, hap1_count, hap2_count, hap1_allele_ratio, p_binom, snv_count, mmap_log = line.split('\t')
outcolmns = '\t'.join([region, hap1_count, hap2_count, hap1_allele_ratio, p_binom, snv_count])

if float(p_binom) <= pth: sys.stdout.write(outcolmns+'\n')
Loading

0 comments on commit 5294aa2

Please sign in to comment.