diff --git a/Simulations_test/KIN_hmm/contamination_missSpecification/Rscripts/misscontam_plot.R b/Simulations_test/KIN_hmm/contamination_missSpecification/Rscripts/misscontam_plot.R
new file mode 100644
index 0000000..ddde44c
--- /dev/null
+++ b/Simulations_test/KIN_hmm/contamination_missSpecification/Rscripts/misscontam_plot.R
@@ -0,0 +1,53 @@
+
+library(ggplot2)
+#install.packages("magrittr")
+#install.packages("dplyr")
+library(magrittr)
+library(dplyr)
+library(tidyr)
+
+#inf='/mnt/diversity/divyaratan_popli/100arc/inbreeding/fastsim_gz/hmm_numba_incorrect_roh_i1_i1_insteadOf_i1_i2/roc/contam0_inbred0_model_performance_allroc_asc0.csv.gz'
+#infile='/mnt/diversity/divyaratan_popli/review_sim/hmm_numba/miss_contam/output/missC10/contam1_inbred0_model_performance_allLikelihoods_inphapProbs_asc0.csv'
+
+contamination_plot <- function(infile, outfile){
+
+
+  compinp <- read.csv(file = infile, header = TRUE)
+  compinp$Relatedness <- factor(compinp$Relatedness, levels = c("fir", "sec", "un_all", "pc", "sib", "deg3"))
+
+  data <- compinp %>%
+    pivot_longer(c(True_positive, False_positive), names_to = "T_F", values_to = "classification_proportion") # reshape to long format for plotting
+  data$T_F <- factor(data$T_F, levels = c("True_positive", "False_positive"))
+
+  #data<-data[!(data$method=="read_inppshap" | data$method=="allLikelihoods_inppshap"),]
+  #data = subset(data, select = -c(method) )
+
+  data$Coverage <- as.factor(data$Coverage)
+
+  #human-readable facet labels:
+
+  Relatedness.labs <- c("Unrelated", "3rd Degree", "2nd Degree", "1st Degree", "Siblings", "Parent-Child")
+  names(Relatedness.labs) <- c("un_all", "deg3", "sec", "fir", "sib", "pc")
+
+  T_F.labs <- c("False Positive", "True Positive")
+  names(T_F.labs) <- c("False_positive", "True_positive")
+
+  plot <- ggplot(data = data, aes(x = contamination, y = classification_proportion, color = Coverage))
+  plot <- plot + geom_line()
+  plot <- plot + facet_grid(T_F ~ Relatedness, labeller = labeller(T_F = T_F.labs, Relatedness = Relatedness.labs), scales = "free")
+  #plot + geom_hline(yintercept=0.05, linetype="dashed")
+  plot <- plot + geom_hline(data = data.frame(T_F = "False_positive"), aes(yintercept = 0.05), linetype = "dotted")
+  plot <- plot + geom_vline(xintercept = 2.5, linetype = "dashed")
+
+  plot <- plot + labs(y = "Classification Proportions", x = "mis-specified contamination level (%)")
+  plot + theme_bw() + theme(
+    strip.background = element_rect(
+      color = "transparent", fill = "transparent", size = 1.5, linetype = "solid"
+    ), legend.position = "bottom", text = element_text(size = 12)
+  )
+  ggsave(outfile, width = 8, height = 5, dpi = 150, units = "in", device = 'png')
+
+}
+
+
+contamination_plot(infile = snakemake@input[["infile"]], outfile = snakemake@output[["outfile"]])
diff --git a/Simulations_test/KIN_hmm/contamination_missSpecification/Snakefile b/Simulations_test/KIN_hmm/contamination_missSpecification/Snakefile
new file mode 100644
index 0000000..de7e466
--- /dev/null
+++ b/Simulations_test/KIN_hmm/contamination_missSpecification/Snakefile
@@ -0,0 +1,229 @@
+import numpy as np
+import scipy
+import random
+import pandas as pd
+from scipy.stats import binom
+from math import isclose
+from scipy.special import gammaln
+from scipy import optimize
+import math
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from scipy.special import loggamma
+from scipy.special import beta as Betaf
+from scipy.optimize import minimize
+from numpy.linalg import matrix_power
+from scipy.stats import chisquare
+from hmm_inbred_constVar_testing import *
+
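+# This Snakefile runs the KIN relatedness HMM on simulated data in which the
+# contamination correction is deliberately mis-specified (the missC{mcont}_{mind}
+# wildcards), then summarises classification performance across runs and coverages.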
+########################################################################### +"""IMPORTANT*** +run all to generate all files for miss contam analysis +run allt to get pc-sib files +""" +########################################################################### +instates=[0,1,2] + + +readrel=['un','gr','fir','id'] +readrel_nf=['un','gr','fir','id','NF'] + +kinrel=['un','deg5','deg4','deg3','gr','hsib','avu','sib','pc','id'] +kinrel_nf=['un','deg5','deg4','deg3','gr','hsib','avu','sib','pc','id','NF'] + +pclist=['0_8','1_8','1_9','1_10','2_9','2_10','3_11','4_11','10_12','11_12','5_13','12_13','6_14','13_14','8_15','8_16','9_16','10_16'] +grlist=['1_12','2_12','3_12','4_12','10_13','11_13','12_14','12_16','5_14'] +idlist=['0_15', '1_16'] +siblist=['9_10'] +firlist=pclist+siblist +thirdlist=['9_13','1_13','2_13','3_13','4_13','8_12','13_16','10_14','11_14'] +fourthlist=['8_13','9_14','1_14','2_14','3_14','4_14'] +fifthlist=['8_14'] +halfsib=['8_9','8_10'] +avulist=['9_12'] +seclist=grlist+halfsib+avulist + +truerel=['un','deg5','deg4','deg3','gr','hsib','avu','sib','pc','id'] +runlist=list(range(1,61)) +totalch=list(range(1,23)) +asclist=[0,2,3] + +covlist=[4, 0.5, 0.2, 0.1, 0.05] +Mfilist=[0] +cntlist=[0,1] +inplist=['hapProbs','pshap'] +#expected overlaps: [4,000,000, 160,000-40,000, 26000-6400, 10,000-2500, 3600-900, 1600- 400] +inds=list(range(17)) + +mind_list=[10] +mcont_list=[0.5,1.5,2.5,3.5,4.5,5.5] + +e=0.01 #transition matrix error +stateno=3 + +allrel=['un','deg5','deg4','deg3','gr','hsib','avu','sib','pc','id'] +relno=len(allrel) +stateno=3 + +statlist=['res','lik'] +listf=[] +for i, l1 in enumerate(inds): + for l2 in inds[(i+1):]: + s = '%s_%s' %(str(l1).strip("['']"), str(l2).strip("['']")) + listf.append(s) +#list_models=['allfixed','pfixed','Afixed','nonefixed'] +#listf0=['0_'+str(x) for x in list(range(1,17))] + +listf10=[str(x)+'_10' for x in list(range(0,10))] + ['10_'+str(x) for x in list(range(11,17))] + +folders=["allLikelihoods"] + + +datafolder="input_files/" +paramfolder="/mnt/diversity/divyaratan_popli/100arc/inbreeding/sim_relatedA/transitions_combined/" +hbdfolder="input_files/hbd_hmm/" +readfolder="/mnt/diversity/divyaratan_popli/100arc/readscripts/" + + +rule hmm: + input: + pfile=datafolder+"missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/hmm_parameters/p1_file.gz", + dfile=datafolder+"missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/remHighdiv_diff.csv.gz", + tfile=datafolder+"missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/remHighdiv_total.csv.gz", + Afiles=expand(paramfolder+"transition_matrix_{rel}.csv", rel=allrel) + + + output: + resfiles=expand("output/missC{{mcont}}_{{mind}}/contam{{cnt}}/inbred{{inb}}/run{{RID}}/coverage{{cov}}/filtered{{Mfil}}/asc{{asc}}/relation_{rel}_file/res_inp{{inpMode}}/pw_{{listind}}.csv.gz", rel=allrel), + likmerged="output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}/allLikelihoods_inp{inpMode}/pw_{listind}.csv.gz", + + run: + l=wildcards.listind + print(l) + i1=l.split('_')[0] + i2=l.split('_')[1] + print(i1,i2) + + pairf1=hbdfolder+"missC%s_%s/contam%s/inbred%s/run%s/coverage%s/filtered%s/asc%s_res/gamma_%s/pw_%s.csv.gz" %(str(wildcards.mcont),str(wildcards.mind),str(wildcards.cnt),str(wildcards.inb),str(wildcards.RID),str(wildcards.cov),str(wildcards.Mfil),str(wildcards.asc),str(wildcards.inpMode),i1) + 
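+        # per-individual HBD posteriors (the gamma files produced by the
+        # hbd_hmm pipeline) for the two members of the pair; hmm() uses them
+        # to weight the inbred/non-inbred emission components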
pairf2=hbdfolder+"missC%s_%s/contam%s/inbred%s/run%s/coverage%s/filtered%s/asc%s_res/gamma_%s/pw_%s.csv.gz" %(str(wildcards.mcont),str(wildcards.mind),str(wildcards.cnt),str(wildcards.inb),str(wildcards.RID),str(wildcards.cov),str(wildcards.Mfil),str(wildcards.asc),str(wildcards.inpMode),i2) + + p1thresh=datafolder+"missC%s_%s/contam%s/inbred%s/run%s/coverage%s/asc%s/inputMode_id%s_fil%s/id_remHighdiv_total.csv" %(str(wildcards.mcont),str(wildcards.mind),str(wildcards.cnt),str(wildcards.inb),str(wildcards.RID),str(wildcards.cov),str(wildcards.asc),str(wildcards.inpMode),str(wildcards.Mfil)) + + + hmm(difffile=input.dfile, totalfile=input.tfile, listind=wildcards.listind, + listf=np.array(listf), targets=np.array(inds),pfile=input.pfile, + Afiles=input.Afiles, resfiles=output.resfiles, likfile=output.likmerged, + pairf1=pairf1, pairf2=pairf2, + p1thresh=p1thresh, thresh=10, instates=instates, rels=allrel) + + +rule get_relatable10: + input: + likfile=expand("output/missC{{mcont}}_10/contam{{cnt}}/inbred{{inb}}/run{{RID}}/coverage{{cov}}/filtered{{Mfil}}/asc{{asc}}/{{myfolder}}_inp{{inpMode}}/pw_{listind}.csv.gz", listind=listf10), + output: + liktab="output/missC{mcont}_10/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}/relatable_{myfolder}_inp{inpMode}.csv.gz" + run: + getRelatable(filist=input.likfile, relatable=output.liktab, pairs=listf10, rels=allrel) + + +rule model_performance: + input: + infiles= "output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}/relatable_allLikelihoods_inp{inpMode}.csv.gz" + + output: + table_cut="output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}/allLikelihoods_inp{inpMode}/performance_table_cut{c}.csv.gz" + + run: + #filenames=np.loadtxt(fname=input.infiles, dtype='str', delimiter=',') + filename=input.infiles + lik_colind=pd.read_csv(filename, sep=",", header=0,index_col=0) + + runtable_cut=pd.DataFrame(0,index=truerel, columns=kinrel_nf) + + for fi in range(len(lik_colind)): + lik=lik_colind.loc[fi,:] + l=lik_colind.loc[fi,'pair'] + + if l in grlist: + label='gr' + elif l in pclist: + label='pc' + elif l in siblist: + label='sib' + elif l in idlist: + label='id' + elif l in thirdlist : + label='deg3' + elif l in fourthlist : + label='deg4' + elif l in halfsib : + label='hsib' + elif l in avulist : + label='avu' + elif l in fifthlist : + label='deg5' + else: + label='un' + + maxli=lik_colind.loc[fi,'relatedness'] + + if lik_colind.loc[fi,'loglik_ratio'] >= float(wildcards.c): + runtable_cut.loc[label,maxli]=runtable_cut.loc[label,maxli]+1 + else: + runtable_cut.loc[label,'NF']=runtable_cut.loc[label,'NF']+1 + + + with pd.option_context('display.max_rows', len(runtable_cut.index), 'display.max_columns', len(runtable_cut.columns)): + runtable_cut.to_csv(output.table_cut, sep=',') + + +rule bigtable: + input: + tables_cut=expand("output/missC{{mcont}}_{{mind}}/contam{{cnt}}/inbred{{inb}}/run{RID}/coverage{{cov}}/filtered{{Mfil}}/asc{{asc}}/{{folder}}/performance_table_cut{{c}}.csv.gz", RID=runlist), + output: + alltable_cut="output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/model_performance_{folder}/coverage{cov}/asc{asc}/filtered{Mfil}_cut{c}.csv.gz", + run: + bigTable(tables=input.tables_cut, alltable=output.alltable_cut) + + +rule all: + input: + #read=expand("contam{cnt}/inbred{inb}/model_performance_{folder}/coverage{cov}/asc{asc}/filtered{Mfil}_cut{cutoff}.csv.gz", cnt=config["cnt"], inb=config["inbs"],folder="read_inppshap", 
cov=config["cov"], Mfil=[0,1], asc=config["asc"], cutoff=0), + kin=expand("output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/model_performance_{folder}/coverage{cov}/asc{asc}/filtered{Mfil}_cut{cutoff}.csv.gz", cnt=1, inb=0, folder="allLikelihoods_inphapProbs", cov=covlist, Mfil=0, asc=0, cutoff=1.0, mcont=config["mc"], mind=mind_list) + + + +#need pc-sib table as well + +rule pc_sib: + input: + likfile=expand("output/missC{{mcont}}_{{mind}}/contam{{cnt}}/inbred{{inb}}/run{{RID}}/coverage{{cov}}/filtered{{Mfil}}/asc{{asc}}/{{myfolder}}_inp{{inpMode}}/pw_{listind}.csv.gz", listind=listf10), + output: + #pcsib="contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}/pc_sib_{myfolder}_inp{inpMode}_{cut_pcsib}.csv.gz", + table="output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}/pc_sib_table_{myfolder}_inp{inpMode}/{cut_pcsib}.csv.gz" + run: + getpc_sib(filist=input.likfile, pairs=listf10, truerel=truerel,kinrel_nf=kinrel_nf,pclist=pclist, siblist=siblist, grlist=grlist,halfsib=halfsib, + avulist=avulist,thirdlist=thirdlist,fourthlist=fourthlist,fifthlist=fifthlist,idlist=idlist, + runtable=output.table, ctw=wildcards.cut_pcsib) + +rule pc_sib_all: + input: + tables=expand("output/missC{{mcont}}_{{mind}}/contam{{cnt}}/inbred{{inb}}/run{RID}/coverage{{cov}}/filtered{{Mfil}}/asc{{asc}}/pc_sib_table_{{myfolder}}_inp{{inpMode}}/{{cut_pcsib}}.csv.gz", RID=runlist) + + output: + alltable="output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/pc_sib_performance_{myfolder}_inp{inpMode}/coverage{cov}/asc{asc}/filtered{Mfil}_cut{cut_pcsib}/pc_sib.csv.gz" + run: + bigTable(tables=input.tables, alltable=output.alltable) + +rule allt: + input: expand("output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/pc_sib_performance_allLikelihoods_inphapProbs/coverage{cov}/asc{asc}/filtered0_cut{cut_pcsib}/pc_sib.csv.gz", cut_pcsib=1,cov=0.03,cnt=1,inb=0,asc=0, mind=10, mcont=mcont_list) +""" +rule out_file: + input: + kin=expand("output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/model_performance_{folder}/coverage{cov}/asc{asc}/filtered{Mfil}_cut{cutoff}.csv.gz", cnt=1, inb=0, folder="allLikelihoods_inphapProbs", cov=covlist, Mfil=0, asc=0, cutoff=1.0, mcont=config["mc"], mind=mind_list), + pc_sib=expand("output/missC{mcont}_{mind}/contam{cnt}/inbred{inb}/pc_sib_performance_allLikelihoods_inphapProbs/coverage{cov}/asc{asc}/filtered0_cut{cut_pcsib}/pc_sib.csv.gz", cut_pcsib=1,cov=covlist,cnt=1,inb=0,asc=0, mind=10, mcont=mcont_list) + output: + outf=expand("output/missC10/contam{cnt}_inbred{inb}_model_performance_{folder}_asc{asc}.csv.gz",cnt=1, inb=0, folder="allLikelihoods_inphapProbs", Mfil=0, asc=0) + run: +""" diff --git a/Simulations_test/KIN_hmm/contamination_missSpecification/hmm_inbred_constVar_testing.py b/Simulations_test/KIN_hmm/contamination_missSpecification/hmm_inbred_constVar_testing.py new file mode 100644 index 0000000..d7fc7de --- /dev/null +++ b/Simulations_test/KIN_hmm/contamination_missSpecification/hmm_inbred_constVar_testing.py @@ -0,0 +1,633 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Jan 21 21:36:58 2021 + +@author: divyaratan_popli +""" + + +import scipy +import numpy as np +import random +import pandas as pd +from scipy.stats import binom +from math import isclose +from scipy.special import gammaln +from scipy import optimize +import math +import matplotlib.pyplot as plt +from scipy.special import loggamma +from scipy.special import beta as Betaf +from scipy.optimize import minimize_scalar +import pylab 
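+# numba is used below to JIT-compile the forward pass and the beta-binomial
+# emission helpers; math.lgamma stands in for scipy.special.gammaln inside the
+# @numba.vectorize functions because numba does not support the scipy call.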
+from numpy.linalg import matrix_power +from scipy.special import logsumexp +import numba + +#print(scipy.__version__) + + + +""" +#to stop the program if there is a warning +import warnings + +with warnings.catch_warnings(): + warnings.simplefilter('error') + for chrm in range(chrm_no): + + for t in range(pos[chrm],pos[chrm+1]-1): + + + for i in range(M): + print.info(chrm,t,i) + numerator_l = np.log(alpha[t, i]) + np.log(A[i, :]) + np.log(B[:, t + 1].T) + np.log(beta[t + 1, :].T) - np.log(scale[t+1]) #it is the probability that hmm path has particular arc at time t + xi_l[i, :, t] = numerator_l + + +""" +@numba.vectorize +def Betal(x, y): + # numba doesn't understand scipy.special.gammaln, but it understands math.lgamma, + # which is the same, except that it doesn't understand arrays. + return math.lgamma(x) + math.lgamma(y) - math.lgamma(x+y) + +@numba.vectorize +def fact(x): + return math.lgamma(x + 1) + +@numba.njit +def forward(A,B,pi,data): + N = A.shape[0] + T = data.shape[0] + alpha = np.zeros((T,N)) + alpha_p = np.zeros((T,N)) + scale = np.zeros(T) + + alpha[0,:] = pi * B[:,0] + alpha_p[0,:] = alpha[0,:] / alpha[0,:].sum() + scale[0] = alpha[0,:].sum() + + for t in range(1,T): + alpha[t,:]= (alpha_p[t-1,:] @ A) * B[:,t] + alpha_p[t,:]= alpha[t,:] / alpha[t,:].sum() + scale[t] = alpha[t,:].sum() + + return alpha_p, scale + +#@numba.njit +def backward(A, B, data, scale): + + N= np.shape(A)[0] + T= np.shape(data)[0] + beta=np.zeros((T,N)) + beta_p=np.zeros((T,N)) + + beta[-1,:]=np.ones(N) + beta_p[-1,:]=np.ones(N) + + for t in reversed(range(T-1)): + for n in range(N): + beta[t,n]= sum( A[n,:] * beta_p[t+1,:] * B[:,t+1]) + beta_p[t,n]= beta[t,n]/scale[t+1] + + return beta_p + + + +@numba.njit +def objective(a, meanp, k, data): + cost=0 + + b=a * (1-meanp)/meanp + + for w in range(len(data)): + + ll=(fact(data[w,1]) - fact(data[w,0]) - fact(data[w,1]-data[w,0]) + \ + Betal(data[w,0]+a, data[w,1]-data[w,0]+b) - Betal(a,b)) + + + Wll= ll * k[w] + + cost= cost + Wll + + return (-1)*cost + + + +def makeB(data,xin,M,T,inbr): + + B = np.zeros((inbr,M,T)) + + for st in range(inbr): + for re in range(M): + a=xin[st,re,0] + b=xin[st,re,1] + B[st,re,:]=np.exp(fact(data[:,1]) - fact(data[:,0]) - fact(data[:,1]-data[:,0]) + Betal(data[:,0]+a, data[:,1]-data[:,0]+b) - Betal(a,b)) + + return B + +def getEmissions(B, hbd): + B2=np.zeros([np.shape(B)[1],np.shape(B)[2]]) + scale1=np.zeros(np.shape(B)[2]) + + for i in range(np.shape(B)[1]): + B2[i,:]=(B[0,i,:] * hbd[:,0]) + (B[1,i,:] * hbd[:,1]) + (B[2,i,:] * hbd[:,2]) + + scale1=B2.max(axis=0) + B2scaled=B2/scale1 + + return B2scaled, np.sum(np.log(scale1)) + + +def expectation(data, hbd, A, B, pos, chrm_no, pi): + + M = A.shape[0] + T = len(data) + alpha=np.zeros((T,M)) + beta=np.zeros((T,M)) + scale=np.zeros(T) + + B2d,log_bscale1 = getEmissions(B, hbd) + + + for chrm in range(chrm_no): + data_chr=data[pos[chrm]:pos[chrm+1],:] + + alpha[pos[chrm]:pos[chrm+1],:],scale[pos[chrm]:pos[chrm+1]] = forward(A,B2d[:,pos[chrm]:pos[chrm+1]],pi,data_chr) + beta[pos[chrm]:pos[chrm+1],:] = backward(A,B2d[:,pos[chrm]:pos[chrm+1]],data_chr,scale[pos[chrm]:pos[chrm+1]]) + + post= alpha*beta + + li=np.sum(np.log(scale))+ log_bscale1 + + + assert isclose(sum(abs(np.sum(post,axis=1) - np.ones(np.shape(B2d)[1]))), 0, abs_tol=1e-7), "sum of alpha and beta is not 1" + assert isclose(sum(abs(np.sum(A,axis=1) - np.ones(np.shape(A)[0]))), 0, abs_tol=1e-7), "sum of transition matrix columns is not 1" + + pi_new=matrix_power(A,10000)[0] + #pi_new=np.mean(post[pos[:-1],:],0) + 
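+    # matrix_power(A, 10000) effectively iterates the chain to convergence, so
+    # every row of the result approximates the stationary distribution of A;
+    # row 0 is returned as the updated initial-state distribution pi.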
+ return post.T, li, pi_new + + +def makexin(x): + xin=np.zeros([3,3,2]) + xin[0,:,:]=np.reshape(x[0:6],(3,2)) + xin[1,:,:]=[x[4:6],x[2:4],x[6:8]] + xin[2,:,:]=[x[6:8],x[2:4],x[6:8]] + #print("xin=",xin) + return xin + + +def emission(gamma, data, hbd, p1avg, A, inbr, x0, bnds): + + M = A.shape[0] + T = len(data) + + k_pc= gamma[0,:] * hbd[:,0] + k_un= gamma[1,:] + k_id= (gamma[2,:] * hbd[:,0]) + (gamma[0,:] * hbd[:,1]) + k_inb= (gamma[2,:] * hbd[:,1]) + (gamma[2,:] * hbd[:,2]) + (gamma[0,:] * hbd[:,2]) + + [mean_pc,mean_un,mean_id,mean_inb]=[p1avg[0,0],p1avg[0,1],p1avg[0,2],p1avg[2,2]] + [a_pc, a_un, a_id, a_inb]=[x0[0],x0[2],x0[4],x0[6]] + + + + + + solution_pc = minimize_scalar(objective,a_pc,args=(mean_pc,k_pc,data),method='Bounded',bounds=[bnds[0],10000],options={'disp':1}) + a_pc=solution_pc.x + b_pc= a_pc * (1-mean_pc)/mean_pc + + solution_un = minimize_scalar(objective,a_un,args=(mean_un,k_un,data),method='Bounded',bounds=[bnds[1],10000],options={'disp':1}) + a_un=solution_un.x + b_un= a_un * (1-mean_un)/mean_un + + solution_id = minimize_scalar(objective,a_id,args=(mean_id,k_id,data),method='Bounded',bounds=[bnds[2],10000],options={'disp':1}) + a_id=solution_id.x + b_id= a_id * (1-mean_id)/mean_id + + solution_inb = minimize_scalar(objective,a_inb,args=(mean_inb,k_inb,data),method='Bounded',bounds=[bnds[3],10000],options={'disp':1}) + a_inb=solution_inb.x + b_inb= a_inb * (1-mean_inb)/mean_inb + + + + x=[a_pc,b_pc,a_un,b_un,a_id,b_id,a_inb,b_inb] + + xin=makexin(x) + + + up_p=np.zeros([np.shape(xin)[0], np.shape(xin)[1]]) + for pi in range(np.shape(xin)[0]): + for ps in range(np.shape(xin)[1]): + up_p[pi,ps]=xin[pi,ps,0] / (xin[pi,ps,0] + xin[pi,ps,1]) + + + Bu=makeB(data,xin,M,T,inbr) + + return Bu, up_p, xin + + +def bnd_calc(mean_p, dist, propvar): + + r=(1-mean_p)/mean_p + t=propvar*dist*dist + + low_bnd=-1 * (t + (t*r*r) + r*((2*t)-1))/(t+ (3*t*r) + (3*r*r*t) + (r*r*r*t)) + return low_bnd + +def baum_welch(data, hbd, A, B, pos, p1avg, inbr, x0, max_iter=1000): + #print(data) + chrm_no=len(pos)-1 + n_iter=0 + diff=1 + last_lik=0 + + + #print("initial beta parameters are:",x0) + + [mean_pc,mean_un,mean_id,mean_inb]=[p1avg[0,0],p1avg[0,1],p1avg[0,2],p1avg[2,2]] + + di=(mean_un-mean_pc) + + bnd_pc= bnd_calc(mean_p=mean_pc, dist=di, propvar=3/4) + bnd_un= bnd_calc(mean_p=mean_un, dist=di, propvar=1) + bnd_id= bnd_calc(mean_p=mean_id, dist=di, propvar=1/2) + bnd_inb= bnd_calc(mean_p=mean_inb, dist=di, propvar=1) + + pi=np.array([1/3,1/3,1/3]) + + while diff > 0.000001 and n_iter1),'g_noin']=1 + pair2.loc[(pair2['g_noin']!=9) & (pair2['g_noin']>1),'g_noin']=1 + + pair1.loc[pair1['g_noin']<0,'g_noin']=0 + pair2.loc[pair2['g_noin']<0,'g_noin']=0 + +# pair1['ninb']=pair1['dis']/pair1['count'] #1 means no hbd +# pair1['inb']=1-pair1['ninb'] +# pair2['ninb']=pair2['dis']/pair2['count'] #1 means no hbd +# pair2['inb']=1-pair2['ninb'] + + pair1.loc[pair1['infocount']0.5,'g_noin']=1 +# pair2.loc[pair2['g_noin']>0.5,'g_noin']=1 + + pair1['ninb']=pair1['g_noin'] #1 means no hbd + pair1['inb']=1-pair1['g_noin'] + pair2['ninb']=pair2['g_noin'] #1 means no hbd + pair2['inb']=1-pair2['g_noin'] + + + badwin1=list(pair1.index[pair1['ninb']==9]) + badwin2=list(pair2.index[pair2['ninb']==9]) + + pair1.loc[badwin1,'inb']=0 + pair1.loc[badwin1,'ninb']=1 + pair2.loc[badwin2,'inb']=0 + pair2.loc[badwin2,'ninb']=1 + + + + hbd=np.zeros([len(total),3]) + hbd[:,0]=pair1['ninb']*pair2['ninb'] + hbd[:,1]=(pair1['ninb']*pair2['inb']) + (pair2['ninb']*pair1['inb']) + hbd[:,2]=pair1['inb']*pair2['inb'] + + 
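+    # hbd[:, k] is the probability that exactly k of the two individuals are
+    # in a homozygous-by-descent state in the window (0 = neither, 2 = both);
+    # these weights mix the three emission components in getEmissions().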
gudwin=total>0 + data=data[gudwin,:] + hbd=hbd[gudwin,:] + + chrmlist=np.asarray(pair1['chrom']) + chrmlist=chrmlist[gudwin] + + win=np.sum(gudwin) + + with open(pfile,"r") as f: + p_1=float(f.read()) + + p_12=p_1/2 + inerror=p_12/2 + initial_p=np.array([[(p_1+p_12)/2,p_1,p_12], + [p_12,p_1,inerror], + [inerror,p_1, inerror]]) + + + pos=np.where(chrmlist[:-1] != chrmlist[1:])[0]+1 + pos=np.append(pos,np.shape(total)[0]) + pos=np.insert(pos, 0, 0, axis=0) + + + + pi= np.array([1/3,1/3,1/3]) + + #A=A1 + likall=[] + for rel_cnt in range(len(rels)): + resall=np.ones(len(total))*9 + Afile=Afiles[rel_cnt] + + A=np.array(pd.read_csv(Afile,sep=',',header=None,index_col=False)) + + Bu = np.zeros((inbr_states,np.shape(A)[0],win)) #Emission probability + #for st in instates: + # for re in range(np.shape(A)[0]): + # Bu[st,re,:]=binom.pmf(data[:,0], data[:,1], initial_p[st,re]) + + [b0, b1, b2, b3]=[1000,1000,1000,1000] + [a0,a1,a2,a3]=[b0*initial_p[0,0]/(1-initial_p[0,0]), b1*initial_p[0,1]/(1-initial_p[0,1]), b2*initial_p[0,2]/(1-initial_p[0,2]), b3*initial_p[2,2]/(1-initial_p[2,2])] + x0 = [a0,b0,a1,b1,a2,b2,a3,b3] + + xin0=makexin(x=x0) + Bu=makeB(data=data,xin=xin0, M=A.shape[0], T=len(data), inbr=inbr_states) + + + + #print(A) + #[B, log_bscale]= scaleB(Bu, instates) + B=Bu + + np.argwhere(np.isnan(B)) + + + + gamma,A,B,up_p,lik, pi= baum_welch(data=data, hbd=hbd, A=A, B=B, pos=pos, p1avg=initial_p, inbr=inbr_states, x0=x0) + res=viterbi(data, A, B, pi) + resall[gudwin]=res + likall.append(lik) + np.savetxt(fname=resfiles[rel_cnt], X=resall,delimiter=',') + + np.savetxt(fname=likfile, X=likall, delimiter=',') + + +""" +p1="Chagyrskaya1141" +p2="Chagyrskaya13" +pair=p1+"_"+p2 +datafolder="/mnt/diversity/divyaratan_popli/100arc/inbreeding/laurits_writeup/Ch9_rem/neavar/noCh12_bayesContam/" +hbdfolder="/mnt/diversity/divyaratan_popli/100arc/inbreeding/laurits_writeup/Ch9_rem/neavar/noCh12_26mar_fil112_114/" +rel='sib' +fil='0' +pfile=datafolder+"hmm_parameters_fil%s/p1_file" %(fil) +phalf=datafolder+"hmm_parameters_fil%s/phalf_file" %(fil) +datafile=datafolder+"mergedwin_remHighdiv_fil%s/pw_%s.csv" %(fil,pair) +Afile="/mnt/diversity/divyaratan_popli/100arc/inbreeding/simulation_markov_inbreeding3/new_transitions1e6/transition_matrix_%s.csv" %(rel) +pairf1=hbdfolder+"hbd_win_fil%s/pw_%s.csv" %(fil,p1) +pairf2=hbdfolder+"hbd_win_fil%s/pw_%s.csv" %(fil,p2) + +p1thresh=datafolder+"identicalmergedwin_fil1/pw_%s.csv" %(p1) +p2thresh=datafolder+"identicalmergedwin_fil1/pw_%s.csv" %(p2) +thresh=20 +instates=[0,1,2] +inbr_states=3 +inerror=0.001 +upA=rel + +plt.plot(data[:,0]/data[:,1]) +plt.hlines(y=p_1,xmin=0,xmax=250) +plt.hlines(y=p_12,xmin=0,xmax=250) + + +d='0.15' +inb="1" +p1="4" +p2="11" +pair=p1+"_"+p2 +run=15 +datafolder="/mnt/diversity/divyaratan_popli/100arc/inbreeding/simulation_markov_inbreeding3/inbred%s/run%s/downsample%s/" %(inb,run,d) +rel='pc' +fil='0' + +pfile=datafolder+"asc0_hmm_parameters_fil%s/p1_file" %(fil) +phalf=datafolder+"asc0_hmm_parameters_fil%s/phalf_file" %(fil) +datafile=datafolder+"asc0_mergedwin_fil%s/pw_%s.csv" %(fil,pair) +Afile="/mnt/diversity/divyaratan_popli/100arc/inbreeding/simulation_markov_inbreeding3/new_transitions1e6/transition_matrix_%s.csv" %(rel) +pairf1="/mnt/diversity/divyaratan_popli/100arc/inbreeding/simulation_markov_inbreeding3/hbd_hmm/inbred%s/run%s/downsample%s/filtered%s/asc0_res/gamma/pw_%s.csv" %(inb,run,d,fil,p1) 
+pairf2="/mnt/diversity/divyaratan_popli/100arc/inbreeding/simulation_markov_inbreeding3/hbd_hmm/inbred%s/run%s/downsample%s/filtered%s/asc0_res/gamma/pw_%s.csv" %(inb,run,d,fil,p2) +p1thresh=datafolder+"asc0_identicalmergedwin_fil1/pw_%s.csv" %(p1) +p2thresh=datafolder+"asc0_identicalmergedwin_fil1/pw_%s.csv" %(p2) +thresh=2 +instates=[0,1,2] +inbr_states=3 +inerror=0.001 +upA=rel +""" + + + + + +def getRelatable(filist, relatable, pairs, rels): + + cols=["pair", "relatedness","second_guess", "loglik_ratio", "withinDeg_second_guess", "withinDeg_ll"] + df=pd.DataFrame(columns=cols) + + for i,fi in enumerate(filist): + pair=pairs[i] + lik=np.loadtxt(fi,dtype='float', delimiter = ",") + data=pd.DataFrame(lik.reshape(-1,len(rels)), columns=rels) + print(data) + + reldeg = { + "identical": "id", + "first": ["sib","pc"], + "second": ["gr","hsib","avu"], + "deg3": "deg3", + "un": ["deg4","deg5", "un"], + } + + + b=data.sort_values(by=0, ascending=False, axis=1) + + df.loc[i,"relatedness"]=b.columns[0] + df.loc[i,"pair"]=pair + + for r in reldeg: + if b.columns[0] in reldeg[r]: + temp=reldeg[r] + + if isinstance(temp, str): + temp=[temp] + + b_otherdeg=b[list(set(b.columns) - set(temp))].sort_values(by=0, ascending=False, axis=1) + df.loc[i,"second_guess"]=b_otherdeg.columns[0] + df.loc[i,"loglik_ratio"]=b.iloc[0,0] - b_otherdeg.iloc[0,0] + + b_indeg=b[temp].sort_values(by=0, ascending=False, axis=1) + if len(b_indeg.iloc[0]) > 1: + df.loc[i,"withinDeg_second_guess"]=b_indeg.columns[1] + df.loc[i,"withinDeg_ll"]=b_indeg.iloc[0,0] - b_indeg.iloc[0,1] + else: + df.loc[i,"withinDeg_second_guess"]=np.nan + df.loc[i,"withinDeg_ll"]=np.nan + + with pd.option_context('display.max_rows', len(df.index), 'display.max_columns', len(df.columns)): + df.to_csv(relatable, sep=',') + + +def bigTable(tables, alltable): + filenames=tables + lik_colind=pd.read_csv(filenames[0], sep=",", header=0,index_col=0) + c=lik_colind.columns.values + r=lik_colind.index.values + bigtable=pd.DataFrame(0,index=r, columns=c) + for t in tables: + lik=pd.read_csv(t, sep=",", header=0,index_col=0) + bigtable=bigtable+lik + bigtable["sum"] = bigtable.sum(axis=1) + bigtable = bigtable.loc[:,c].div(bigtable["sum"], axis=0) + + with pd.option_context('display.max_rows', len(bigtable.index), 'display.max_columns', len(bigtable.columns)): + bigtable.to_csv(alltable, sep=',') + +def getpc_sib(filist, pairs, truerel,kinrel_nf,pclist, siblist, grlist,halfsib,avulist,thirdlist,fourthlist,fifthlist,idlist,runtable,ctw): + + cols=["pair", "relatedness","second_guess", "loglik_ratio"] + df=pd.DataFrame(columns=cols) + runtable_cut=pd.DataFrame(0,index=truerel, columns=kinrel_nf) + + for i,fi in enumerate(filist): + pair=pairs[i] + lik=np.loadtxt(fi,dtype='float', delimiter = ",") + data=pd.DataFrame(lik.reshape(-1,len(truerel)), columns=truerel) + + b=data.sort_values(by=0, ascending=False, axis=1) + + print(b) + + + df.loc[i,"relatedness"]=b.columns[0] + df.loc[i,"pair"]=pair + + df.loc[i,"second_guess"]=b.columns[1] + df.loc[i,"loglik_ratio"]=(b[b.columns[0]] - b[b.columns[1]]).item() + + l=df.loc[i,"pair"] + + if l in grlist: + label='gr' + elif l in pclist: + label='pc' + elif l in siblist: + label='sib' + elif l in idlist: + label='id' + elif l in thirdlist : + label='deg3' + elif l in fourthlist : + label='deg4' + elif l in halfsib : + label='hsib' + elif l in avulist : + label='avu' + elif l in fifthlist : + label='deg5' + else: + label='un' + + + if df.loc[i,'loglik_ratio'] >= float(ctw): + 
runtable_cut.loc[label,df.loc[i,"relatedness"]]=runtable_cut.loc[label,df.loc[i,"relatedness"]]+1 + else: + runtable_cut.loc[label,'NF']=runtable_cut.loc[label,'NF']+1 + + with pd.option_context('display.max_rows', len(runtable_cut.index), 'display.max_columns', len(runtable_cut.columns)): + runtable_cut.to_csv(runtable, sep=',') diff --git a/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/Snakefile b/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/Snakefile new file mode 100644 index 0000000..dae6a21 --- /dev/null +++ b/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/Snakefile @@ -0,0 +1,480 @@ +import os +import msprime as msp +import numpy as np +import random +import pandas as pd +from scipy import stats +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +#from simulation_hapcallingall_function import * + +from input_preparation_func import makePseudohap, filterPseudohap, READInput, findDiff, getWin, mergeChrm, getP, identicalDiff, contamFilemiss, nhFile, contamAll,getHighDiv, remHighDiv + + +length=95814313 #end - start for chr 13 +interval=1e7 +#interval_1M=1e6 +pedsim_folder= "/mnt/diversity/divyaratan_popli/100arc/inbreeding/pedsim/ped-sim/" + +runlist=list(range(1,61)) +totalch=list(range(1,23)) + +allcoverage=[4, 0.5, 0.2, 0.1, 0.05, 0.03] + + +#Mfilist=[0,1] +#Mfilist=config["fil"] +#expected overlaps: [4,000,000, 160,000-40,000, 26000-6400, 10,000-2500, 3600-900, 1600- 400] +cntlist=[1] +#inblist=[0,1] +inblist=[0] +asclist=[0] +Mfilist=[0] +#asclist=config["a"] +inplist=['hapProbs'] +inds=list(range(17)) +covlist=[4, 0.5, 0.2, 0.1, 0.05, 0.03] + +mind_list=[10] +mcont_list=[0.5,1.5,2.5,3.5,4.5,5.5] +e=0.01 #transition matrix error +relno=7 +stateno=3 + +contam_est=[2,0,0,0,0,0,2.5,0,3,0,2.5,0,1.5,0,1,0.5,1] + +listf=[] +for i, l1 in enumerate(inds): + for l2 in inds[(i+1):]: + s = '%s_%s' %(str(l1).strip("['']"), str(l2).strip("['']")) + listf.append(s) + +listidentical=inds + +related_total=8+2+2+2## 4 children, 1 grandchild, 1 (3rd) degree, 1 (4th) degree +unrelated_total=16#8 individuals +identical_total=4## 2 identical +others_total=2+2+2+10 ##high quality genomes from vindija, altai, denisova, and human +human_total=10 +nea_nh=0 +human_nh=24 + +#even-numbered are females. 
odd-numbered are males +""" +rule sim: + output: + unrelated_sim= "sim_init/run{RID}/chrm{ch}/unrelated_sim.csv.gz", #only unrelated genotypes, all others are 0 now + position_file= "sim_init/run{RID}/chrm{ch}/position_sim.csv.gz" + + run: + + rec=simulate_seq(unrelated=unrelated_total, #8 individuals, + related= related_total, ## 4 children, 1 grandchild, 1 (3rd) degree, 1 (4th) degree + identical= identical_total, ## 2 identical + others= others_total, ##high quality genomes from vindija, altai, denisova, and human + out_gen=output.unrelated_sim, + pos_file= output.position_file, #output: position of all mutations + chrm=wildcards.ch, #chromosome number + '/' + RID=wildcards.RID, + Ne=1e4, + length=length, + recombination_rate=1e-8, + mutation_rate=1e-8, + recombination_rate_child=1e-8 + + ) + + +rule pedsim: + input: + genmap = pedsim_folder + "refined_mf.simmap", + intf = pedsim_folder + "interfere/nu_p_campbell.tsv", + def1 = pedsim_folder + "{sex}/gf.def" + output: + ibdfile = "pedsim/{sex}/run{RID}/chrm{ch}/index{i}.seg" + resources: + mem_mb=2000 + shell: + "/mnt/diversity/divyaratan_popli/100arc/inbreeding/pedsim/ped-sim/ped-sim -d {input.def1} -m {input.genmap} -o pedsim/{wildcards.sex}/run{wildcards.RID}/chrm{wildcards.ch}/index{wildcards.i} --intf {input.intf}" + + +rule post_sim: + input: + unrelated_sim= "sim_init/run{RID}/chrm{ch}/unrelated_sim.csv.gz", #only unrelated genotypes, all others are 0 now + position_file= "sim_init/run{RID}/chrm{ch}/position_sim.csv.gz", + ibdfiles= expand("pedsim/{sex}/run{{RID}}/chrm{{ch}}/index{i}.seg", sex=["male","female"], i=list(range(7))) + output: + diploid_sim= "post_sim/inbred0/run{RID}/chrm{ch}/diploid_sim.csv.gz", + diploid_sim_i= "post_sim/inbred1/run{RID}/chrm{ch}/diploid_sim.csv.gz", + outhbd="post_sim/inbred0/run{RID}/chrm{ch}/hbd_sim.csv.gz", + outhbd_i="post_sim/inbred1/run{RID}/chrm{ch}/hbd_sim.csv.gz", + + run: + rec=postSim(genfile=input.unrelated_sim, + posfile=input.position_file, + unrelated=unrelated_total, #8 individuals, + related=related_total, ## 4 children, 1 grandchild, 1 (3rd deg) relative + identical= identical_total, ## 2 identical + others=others_total, ##high quality genomes from vindija and altai, denisova and modern humans + diploid_sim=output.diploid_sim, + diploid_sim_i=output.diploid_sim_i, + out_hbdall=output.outhbd, + out_hbdall_i=output.outhbd_i, + chrm=wildcards.ch, #chromosome number + '/' + RID=wildcards.RID, + Ne=1e3, + length=length, + recombination_rate=1e-8, + mutation_rate=1e-8, + ibdfiles=input.ibdfiles + ) + + +rule get_reads: + input: + gen_file="post_sim/inbred0/run{RID}/chrm{ch}/diploid_sim.csv.gz" + output: + reads="post_sim/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/prereads.csv.gz" + run: + getReads(genf=input.gen_file, coverage=wildcards.cov, iscontam=wildcards.cnt, + contaml=contam_est, totalind=int((unrelated_total+related_total+identical_total)/2), c_no=human_total, outf=output.reads) + + +rule final_reads: + input: + read01="post_sim/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/prereads.csv.gz", + gen_file="post_sim/inbred{inb}/run{RID}/chrm{ch}/diploid_sim.csv.gz", + output: + reads="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/reads.csv.gz" + run: + finalReads(genf=input.gen_file,read01=input.read01,totalind=int((unrelated_total+related_total+identical_total)/2), outf=output.reads) + + +rule get_ascertainment: + input: + diploid_sim= "post_sim/inbred0/run{RID}/chrm{ch}/diploid_sim.csv.gz", + position_file= "sim_init/run{RID}/chrm{ch}/position_sim.csv.gz", + 
output: + position_out= "sim_init/run{RID}/chrm{ch}/asc{asc}_ascertained_position_sim.csv.gz", + + run: + getAscertainment(ascertainment_scheme=wildcards.asc, + dipfile=input.diploid_sim, + posfile=input.position_file, + pos_out=output.position_out, + chrm=wildcards.ch + ) + + + +rule prob_diffs: + input: + reads="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/reads.csv.gz", + pos_asc="sim_init/run{RID}/chrm{ch}/asc{asc}_ascertained_position_sim.csv.gz", + pos_all="sim_init/run{RID}/chrm{ch}/position_sim.csv.gz" + output: + probs="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/hapProbs.csv.gz", + idDiff="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/idhapProbs.csv.gz", + pshap="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/pshap.csv.gz", + idhapDiff="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/idpshap.csv.gz", + run: + probDiffs(readsf=input.reads, pos_ascf=input.pos_asc, pos_allf=input.pos_all, chrm=wildcards.ch, + fout_prob=output.probs, fout_idDiff=output.idDiff, fout_pshap=output.pshap, + fout_idhap=output.idhapDiff) + + +rule Mfilter_tped: #makes monomorphic filter and READ input file + input: #each chromosome + infiles="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/pshap.csv.gz", + posf="sim_init/run{RID}/chrm{ch}/asc{asc}_ascertained_position_sim.csv.gz" + output: + fil="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/pshap/pos_filtered{Mfil}.gz", + read="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/readfiles_fil{Mfil}/allsamples.tped.gz", + + run: + filterPseudohap(filist=input.infiles, posfile=input.posf, outfilter=output.fil, readfile=output.read, + fil=wildcards.Mfil, fil_version='hap') + + + +rule Mfilter_prob: #makes monomorphic filter and READ input file + input: #each chromosome + infiles="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/hapProbs.csv.gz", + posf="sim_init/run{RID}/chrm{ch}/asc{asc}_ascertained_position_sim.csv.gz" + output: + fil="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/hapProbs/pos_filtered{Mfil}.gz", + run: + filterPseudohap(filist=input.infiles, posfile=input.posf, outfilter=output.fil, readfile='none', + fil=wildcards.Mfil, fil_version='prob') + + +rule READ_input: + input: + tped=expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/coverage{{cov}}/contam{{cnt}}/asc{{asc}}/readfiles_fil{{Mfil}}/allsamples.tped.gz", ch=totalch), + posf=expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/coverage{{cov}}/contam{{cnt}}/asc{{asc}}/pshap/pos_filtered{{Mfil}}.gz", ch=totalch) + output: + read="contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}_readfiles_fil{Mfil}/read_allsamples.tped.gz", + tfam="contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}_readfiles_fil{Mfil}/read_allsamples.tfam.gz" + resources: + mem_mb=150000 + run: + READInput(filist=input.tped,read=output.read, posfile=input.posf, + tfam=output.tfam, libraries=inds) + + + +rule find_diff: + input: + indf="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/{inpMode}.csv.gz", + infilter="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/{inpMode}/pos_filtered{Mfil}.gz", + posallf="sim_init/run{RID}/chrm{ch}/asc{asc}_ascertained_position_sim.csv.gz" + + output: + 
diff="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/diffs/inputMode_{inpMode}_fil{Mfil}/alldiffs.csv.gz" + run: + findDiff(indfile=input.indf,infilter=input.infilter, posall=input.posallf, difffile=output.diff,fil=wildcards.Mfil) + + +rule identical_diff: + input: + ind="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/id{inpMode}.csv.gz", + infilter="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/{inpMode}/pos_filtered{Mfil}.gz", + posallf="sim_init/run{RID}/chrm{ch}/asc{asc}_ascertained_position_sim.csv.gz" + + output: + diff="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/diffs/inputMode_id{inpMode}_fil{Mfil}/iddiffs.csv.gz", + + run: + identicalDiff(indfile=input.ind,posall=input.posallf, + infilter=input.infilter, difffile=output.diff,fil=wildcards.Mfil) + + + +rule get_win: + input: + pwfile="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/diffs/inputMode_{inpMode}_fil{Mfil}/alldiffs.csv.gz", + posf="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/{inpMode}/pos_filtered{Mfil}.gz" + output: + df="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/wins/inputMode_{inpMode}_fil{Mfil}/allwind.csv.gz", + tf="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/wins/inputMode_{inpMode}_fil{Mfil}/allwint.csv.gz" + + + run: + getWin(diff=input.pwfile, + dfile=output.df, tfile=output.tf, posf=input.posf, + interval=interval) + + +rule identical_win: + input: + pwfile="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/diffs/inputMode_id{inpMode}_fil{Mfil}/iddiffs.csv.gz", + posf="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/{inpMode}/pos_filtered{Mfil}.gz" + + output: + df="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/wins/inputMode_id{inpMode}_fil{Mfil}/idwind.csv.gz", + tf="post_sim/inbred{inb}/run{RID}/chrm{ch}/coverage{cov}/contam{cnt}/asc{asc}/wins/inputMode_id{inpMode}_fil{Mfil}/idwint.csv.gz", + + run: + getWin(diff=input.pwfile, + dfile=output.df, tfile=output.tf, posf=input.posf, + interval=interval) + + +rule hbd_win: + input: + pwfile="post_sim/inbred{inb}/run{RID}/chrm{ch}/hbd_sim.csv.gz", + posf="sim_init/run{RID}/chrm{ch}/position_sim.csv.gz" + output: + df="post_sim/inbred{inb}/run{RID}/chrm{ch}/hbd_windows/diff_sites.csv.gz", + tf="post_sim/inbred{inb}/run{RID}/chrm{ch}/hbd_windows/total_sites.csv.gz" + + run: + getWin(diff=input.pwfile, + dfile=output.df, tfile=output.tf, posf=input.posf, + interval=interval) + + +rule merge_hbd: + input: + hbdd=expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/hbd_windows/diff_sites.csv.gz", ch=totalch), + hbdt=expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/hbd_windows/total_sites.csv.gz", ch=totalch), + + output: + diffall="post_sim/inbred{inb}/run{RID}/hbd_merged/diff_all.csv.gz", + totalall="post_sim/inbred{inb}/run{RID}/hbd_merged/total_all.csv.gz", + chfile="post_sim/inbred{inb}/run{RID}/hbd_merged/chrm.csv.gz" + + run: + mergeChrm(dinfiles=input.hbdd, + tinfiles=input.hbdt, + dfile=output.diffall, + tfile=output.totalall, + chfile=output.chfile) + + + +rule merge_chrm: + input: + dinfiles= expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/coverage{{cov}}/contam{{cnt}}/asc{{asc}}/wins/inputMode_{{inpMode}}_fil{{Mfil}}/allwind.csv.gz",ch=totalch), + tinfiles= 
expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/coverage{{cov}}/contam{{cnt}}/asc{{asc}}/wins/inputMode_{{inpMode}}_fil{{Mfil}}/allwint.csv.gz",ch=totalch), + + output: + dfile= "contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/merged_wind.csv.gz", + tfile= "contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/merged_wint.csv.gz", + chfile= "contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/chrm.csv.gz" + + run: + + mergeChrm(dinfiles=input.dinfiles, + tinfiles=input.tinfiles, + dfile=output.dfile, + tfile=output.tfile, + chfile=output.chfile) + + +rule merge_identicalchrm: + input: + dinfiles= expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/coverage{{cov}}/contam{{cnt}}/asc{{asc}}/wins/inputMode_id{{inpMode}}_fil{{Mfil}}/idwind.csv.gz",ch=totalch), + tinfiles= expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/coverage{{cov}}/contam{{cnt}}/asc{{asc}}/wins/inputMode_id{{inpMode}}_fil{{Mfil}}/idwint.csv.gz",ch=totalch) + + output: + dfile= "contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_wind.csv.gz", + tfile= "contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_wint.csv.gz", + chfile= "contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_chrm.csv.gz" + + run: + mergeChrm(dinfiles=input.dinfiles, + tinfiles=input.tinfiles, + dfile=output.dfile, + tfile=output.tfile, + chfile=output.chfile) + +""" +rule contam_file: + output: + indf="contam_est_cont{mcont}_ind{mind}dev.gz", + pairf="contam_est_cont{mcont}_ind{mind}dev_pairwise.gz" + run: + contamFilemiss(contaml=contam_est, outfile_i=output.indf, outfile_p=output.pairf, miss_contam=wildcards.mcont, miss_ind=wildcards.mind) + +""" +rule nh_file: + input: + genf=expand("sim_init/run{{RID}}/chrm{ch}/unrelated_sim.csv.gz", ch=totalch), + all_posf=expand("sim_init/run{{RID}}/chrm{ch}/position_sim.csv.gz", ch=totalch), + fil_posf=expand("post_sim/inbred{{inb}}/run{{RID}}/chrm{ch}/coverage{{cov}}/contam{{cnt}}/asc{{asc}}/{{inpMode}}/pos_filtered{{Mfil}}.gz", ch=totalch), + + output: + nhf="contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/NHfile.csv.gz" + run: + nhFile(genf=input.genf,fil_posf=input.fil_posf, all_posf=input.all_posf, nhf=output.nhf,ind1=nea_nh, ind2=human_nh) +""" + +rule contam_all: + input: + dfile="../../../contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/merged_wind.csv.gz", + tfile="../../../contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/merged_wint.csv.gz", + contam_est="contam_est_cont{mcont}_ind{mind}dev_pairwise.gz", + nhf="../../../contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/NHfile.csv.gz" + output: + difffile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/contam_diff.csv.gz", + totalfile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/contam_total.csv.gz", + + run: + contamAll(dfile=input.dfile, tfile=input.tfile, iscnt=wildcards.cnt, cfile=input.contam_est, Dnhfile=input.nhf, difffile=output.difffile, totalfile=output.totalfile) + + + +rule contam_id: + input: + dfile="../../../contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_wind.csv.gz", + 
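+        # identical-individual (self-comparison) windows get the same
+        # mis-specified contamination adjustment as contam_all, but using the
+        # per-individual contamination file rather than the pairwise one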
tfile="../../../contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_wint.csv.gz", + contam_est="contam_est_cont{mcont}_ind{mind}dev.gz", + nhf="../../../contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/NHfile.csv.gz", + + output: + difffile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_contam_diff.csv.gz", + totalfile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_contam_total.csv.gz" + + run: + contamAll(dfile=input.dfile, tfile=input.tfile, iscnt=wildcards.cnt, cfile=input.contam_est, Dnhfile=input.nhf, difffile=output.difffile, totalfile=output.totalfile) + + +rule get_highDiv: + input: + difffile = "missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/contam_diff.csv.gz", + totalfile = "missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/contam_total.csv.gz", + + output: + highdiv="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/windows_with_diffProp_higher_than3sd.txt.gz" + run: + getHighDiv(diff=input.difffile, total=input.totalfile, fout=output.highdiv) + + +rule rem_highDiv: + input: + diff="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/contam_diff.csv.gz", + total="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/contam_total.csv.gz", + winfile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/windows_with_diffProp_higher_than3sd.txt.gz" + output: + outd="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/remHighdiv_diff.csv.gz", + outt="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/remHighdiv_total.csv.gz", + + run: + remHighDiv(diffname=input.diff, totalname=input.total, winfile=input.winfile, outd=output.outd, outt=output.outt) + + + +rule rem_highDiv_id: + input: + diff="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_contam_diff.csv.gz", + total="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_contam_total.csv.gz", + winfile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/windows_with_diffProp_higher_than3sd.txt.gz" + output: + outd="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_remHighdiv_diff.csv.gz", + outt="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_remHighdiv_total.csv.gz", + + run: + remHighDiv(diffname=input.diff, totalname=input.total, winfile=input.winfile, outd=output.outd, outt=output.outt) + + +rule get_p1: + input: + difffile= "missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/remHighdiv_diff.csv.gz", + totalfile= "missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/remHighdiv_total.csv.gz", + + id_difffile= 
"missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_remHighdiv_diff.csv.gz", + id_totalfile= "missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_remHighdiv_total.csv.gz", + + output: + pfile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/hmm_parameters/p1_file.gz", + p_all="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/hmm_parameters/p_all.gz", + phalf="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/hmm_parameters/phalf_file.gz", + phalf_all="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/hmm_parameters/phalf_all.gz", + newpairs="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/good_pairs.txt.gz", + newid="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_good_pairs.txt.gz", + over="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/overlap.csv.gz", + id_over="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_overlap.csv.gz", + + run: + getP(dfile=input.difffile, tfile=input.totalfile, targets=listf, + pfile=output.pfile,goodpairs=output.newpairs, + allp=output.p_all, overf=output.over) + + getP(dfile=input.id_difffile, tfile=input.id_totalfile, targets=inds, + pfile=output.phalf,goodpairs=output.newid, + allp=output.phalf_all, overf=output.id_over) + + +rule all: + input: + over=expand("missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/overlap.csv.gz",cnt=cntlist, inb=inblist, RID=runlist, cov=covlist, Mfil=Mfilist,asc=asclist, inpMode=inplist, mind=mind_list, mcont=mcont_list), + pfile=expand("missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/hmm_parameters/p1_file.gz", cnt=cntlist, inb=inblist, RID=runlist, cov=covlist, Mfil=Mfilist,asc=asclist, inpMode=inplist, mind=mind_list, mcont=mcont_list), + id=expand("missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_remHighdiv_diff.csv.gz", cnt=cntlist, inb=inblist, RID=runlist, cov=covlist, Mfil=Mfilist,asc=asclist, inpMode=inplist, mind=mind_list, mcont=mcont_list), + #tped=expand("contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}_readfiles_fil{Mfil}/read_allsamples.tped.gz",cnt=cntlist, inb=inblist, RID=config["runs"], cov=covlist, Mfil=Mfilist,asc=asclist), + #hbd=expand("missC{mcont}_{mind}/post_sim/inbred{inb}/run{RID}/hbd_merged/diff_all.csv.gz", inb=inblist, RID=config["runs"], mind=mind_list, mcont=mcont_list), + + + +###################################################################################sanitycheck plots diff --git a/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/hbd_hmm/Snakefile b/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/hbd_hmm/Snakefile new file mode 100644 index 0000000..d37afdc --- /dev/null +++ b/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/hbd_hmm/Snakefile @@ -0,0 +1,180 @@ + +import numpy as np +import scipy +import random +import pandas as pd +from scipy.stats import binom +from math 
import isclose +from scipy.special import gammaln +from scipy import optimize +import math +import matplotlib +matplotlib.use('Agg') +from scipy.special import loggamma +from scipy.special import beta as Betaf +from scipy.optimize import minimize +from numpy.linalg import matrix_power +from scipy.stats import chisquare +from hmm_func_hbd import * + + + +instates=[0,1,2] + + +readrel=['gr','un','id','fir'] + +pclist=['0_8','1_8','1_9','1_10','2_9','2_10','3_11','4_11','10_12','11_12','5_13','12_13'] +grlist=['1_12','2_12','3_12','4_12','10_13','11_13','9_12','8_12'] +idlist=['0_14', '1_15'] +siblist=['9_10'] +firlist=pclist+siblist +thirdlist=['9_13','1_13','2_13','3_13','4_13'] +runlist=list(range(1,61)) +totalch=list(range(1,23)) + + +Mfilist=[0,1] +#expected overlaps: [4,000,000, 160,000-40,000, 26000-6400, 10,000-2500, 3600-900, 1600- 400] +inds=list(range(17)) + +e=0.01 #transition matrix error +stateno=3 + +stateno=3 + +covlist=[4,0.5,0.2,0.1,0.05,0.03] + +statlist=['res','lik'] +#list_models=['allfixed','pfixed','Afixed','nonefixed'] + +listf=[] +for i, l1 in enumerate(inds): + for l2 in inds[(i+1):]: + s = '%s_%s' %(str(l1).strip("['']"), str(l2).strip("['']")) + listf.append(s) + + +folders=["allLikelihoods", "read"] + +datafolder="../" +originalf="../../../../" + +rule hmm: + input: + p1file=datafolder + "missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_{inpMode}_fil{Mfil}/hmm_parameters/p1_file.gz", + dfile=datafolder + "missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_remHighdiv_diff.csv.gz", + tfile=datafolder + "missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_remHighdiv_total.csv.gz", + chfile=originalf + "contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/asc{asc}/inputMode_id{inpMode}_fil{Mfil}/id_chrm.csv.gz" + + output: + resfile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}_res/gamma_{inpMode}/pw_{listind}.csv.gz", + likfile="missC{mcont}_{mind}/contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}_res/lik_{inpMode}/pw_{listind}.csv.gz", + + run: + l=wildcards.listind + + hmm(dfile=input.dfile, tfile=input.tfile, chrmf=input.chfile, p1file=input.p1file, ind=l, + resfile=output.resfile, likfile=output.likfile, + upA='initial') + +rule all_hbd: + input: expand("missC{mcont}_{mind}/contam1/inbred0/run{RID}/coverage{cov}/filtered0/asc0_res/gamma_hapProbs/pw_{listind}.csv.gz", RID=runlist, cov=covlist, listind=inds, mcont=config["mc"], mind=config["mi"]) +""" +rule accuracy: + input: + resfile="contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}_res/gamma_{inpMode}/pw_{listind}.csv.gz", + diffall=datafolder+ "post_sim/inbred{inb}/run{RID}/hbd_merged/diff_all.csv.gz", + totalall=datafolder+ "post_sim/inbred{inb}/run{RID}/hbd_merged/total_all.csv.gz", + output: + outf="contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}_res/dev_{inpMode}/pw_{listind}.csv.gz", + outf_noROH="contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}_res/dev_noROH_{inpMode}/pw_{listind}.csv.gz", + outf_ROH="contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}_res/dev_ROH_{inpMode}/pw_{listind}.csv.gz" + run: + l=int(wildcards.listind) + reso=pd.read_csv(input.resfile, sep=',', header=0, index_col=0) + hd=np.loadtxt(input.diffall, delimiter=',', dtype='float')[:,l] + ht=np.loadtxt(input.totalall, delimiter=',', 
dtype='float')[:,l] + + hbd_array=hd/ht + + #hbd_array[hbd_array<=0.5]=0 + #hbd_array[hbd_array>0.5]=1 + + res=np.asarray(reso['g_noin']) + print(res) + xx=np.where(res!=9) + res1=res[xx] + hbd1=hbd_array[xx] + #correct=sum(res1==hbd1)/len(res1) + dev=res1-hbd1 + + + hbd1_noROH=hbd1[hbd1==1] + res1_noROH=res1[hbd1==1] + dev_noROH=res1_noROH - hbd1_noROH + + + hbd1_ROH=hbd1[hbd1<1] + res1_ROH=res1[hbd1<1] + dev_ROH=res1_ROH - hbd1_ROH + + np.savetxt(fname=output.outf, X=dev, delimiter=',') + np.savetxt(fname=output.outf_noROH, X=dev_noROH, delimiter=',') + np.savetxt(fname=output.outf_ROH, X=dev_ROH, delimiter=',') + + +rule test: + input: expand("contam{cnt}/inbred{inb}/run{RID}/coverage{cov}/filtered{Mfil}/asc{asc}_res/dev_{inpMode}/pw_{listind}.csv.gz", cnt=1, inb=0, Mfil=[0],asc=0, folder=["noinbred"], inpMode="hapProbs", listind=inds, RID=runlist, cov=4) + + + +rule model_performance: + input: + inf=expand("contam{{cnt}}/inbred{{inb}}/run{RID}/coverage{{cov}}/filtered{{Mfil}}/asc{{asc}}_res/dev_{{inpMode}}/pw_{listind}.csv.gz",RID=runlist,listind=inds), + inf_noROH=expand("contam{{cnt}}/inbred{{inb}}/run{RID}/coverage{{cov}}/filtered{{Mfil}}/asc{{asc}}_res/dev_noROH_{{inpMode}}/pw_{listind}.csv.gz",RID=runlist,listind=inds), + inf_ROH=expand("contam{{cnt}}/inbred{{inb}}/run{RID}/coverage{{cov}}/filtered{{Mfil}}/asc{{asc}}_res/dev_ROH_{{inpMode}}/pw_{listind}.csv.gz",RID=runlist,listind=inds), + output: + outf1="contam{cnt}/inbred{inb}/inbred_detection/asc{asc}/{inpMode}/coverage{cov}_fil{Mfil}_inbred_performance.csv.gz", + outf2="contam{cnt}/inbred{inb}/inbred_detection/asc{asc}/{inpMode}/coverage{cov}_fil{Mfil}_noinbred_performance.csv.gz", + outf1_noROH="contam{cnt}/inbred{inb}/inbred_detection/asc{asc}/{inpMode}/coverage{cov}_fil{Mfil}_inbred_performance_noROH.csv.gz", + outf2_noROH="contam{cnt}/inbred{inb}/inbred_detection/asc{asc}/{inpMode}/coverage{cov}_fil{Mfil}_noinbred_performance_noROH.csv.gz", + outf1_ROH="contam{cnt}/inbred{inb}/inbred_detection/asc{asc}/{inpMode}/coverage{cov}_fil{Mfil}_inbred_performance_ROH.csv.gz", + outf2_ROH="contam{cnt}/inbred{inb}/inbred_detection/asc{asc}/{inpMode}/coverage{cov}_fil{Mfil}_noinbred_performance_ROH.csv.gz", + run: + modelPerformance(files=input.inf, outf1=output.outf1, outf2=output.outf2) + modelPerformance(files=input.inf_noROH, outf1=output.outf1_noROH, outf2=output.outf2_noROH) + modelPerformance(files=input.inf_ROH, outf1=output.outf1_ROH, outf2=output.outf2_ROH) + + + +rule table: + input: + inf=expand("contam{{cnt}}/inbred{{inb}}/inbred_detection/asc{{asc}}/{{inpMode}}/coverage{cov}_fil{{Mfil}}_{{folder}}_performance.csv.gz", cov=covlist), + inf_noROH=expand("contam{{cnt}}/inbred{{inb}}/inbred_detection/asc{{asc}}/{{inpMode}}/coverage{cov}_fil{{Mfil}}_{{folder}}_performance_noROH.csv.gz", cov=covlist), + inf_ROH=expand("contam{{cnt}}/inbred{{inb}}/inbred_detection/asc{{asc}}/{{inpMode}}/coverage{cov}_fil{{Mfil}}_{{folder}}_performance_ROH.csv.gz", cov=covlist), + + output: + outf="contam{cnt}/inbred{inb}/inbred_detection/asc{asc}/{inpMode}/performance_table/fil{Mfil}_{folder}.csv.gz" + run: + tablef(inf=input.inf, inf_noROH=input.inf_noROH, inf_ROH=input.inf_ROH, outf=output.outf) + + + +rule alltables: + input: + in1=expand("contam{cnt}/inbred{inb}/inbred_detection/asc{asc}/{inpMode}/performance_table/fil{Mfil}_{folder}.csv.gz", cnt=config["cnt"], inb=config["i"], Mfil=[0],asc=config["a"], folder=["inbred", "noinbred"], inpMode=config["inp"]), + + + + + + + + +rule all: + input: + 
all=expand("inbred{inb}/model_performance_{folder}/down{down}/filtered{Mfil}.csv", inb=[0,1],folder=["allLikelihoods"], down=config["d"], Mfil=0), + tables=expand("inbred{inb}/run{RID}/downsample{down}/filtered{Mfil}/relatable_{folder}.csv",inb=[0,1], RID=runlist, down=config["d"], Mfil=0, folder=["allLikelihoods"]), + +""" diff --git a/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/hbd_hmm/hmm_func_hbd.py b/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/hbd_hmm/hmm_func_hbd.py new file mode 100644 index 0000000..5672e18 --- /dev/null +++ b/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/hbd_hmm/hmm_func_hbd.py @@ -0,0 +1,549 @@ + +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu May 6 16:33:15 2021 + +@author: divyaratan_popli +""" + + + +import scipy +import numpy as np +import random +import pandas as pd +from scipy.stats import binom +from math import isclose +from scipy.special import gammaln +from scipy import optimize +import math +import matplotlib.pyplot as plt +from scipy.special import loggamma +from scipy.special import beta as Betaf +from scipy.optimize import minimize_scalar +import pylab +from numpy.linalg import matrix_power + + + + +def Betal(x,y): + return gammaln(x)+gammaln(y) - gammaln(x+y) + +def fact(x): + return(gammaln(x+1)) + + +def scaleB(B): + scale=B.max(axis=0) + Bscaled=B/scale + return Bscaled, np.sum(np.log(scale)) + #return B, np.zeros(np.shape(B)[1]) + +def forward(A,B,pi,data): + N= np.shape(A)[0] + T= np.shape(data)[0] + alpha=np.zeros((T,N)) + alpha_p=np.zeros((T,N)) + scale=np.zeros(T) + + alpha[0,:]= (pi) * B[:,0] + alpha_p[0,:]= alpha[0,:]/sum(alpha[0,:]) + scale[0]=sum(alpha[0,:]) + + for t in range(1,T): + alpha[t,:]= (alpha_p[t-1,:] @ A) * B[:,t] + alpha_p[t,:]= alpha[t,:]/sum(alpha[t,:]) + scale[t]=sum(alpha[t,:]) + + return alpha_p, scale + + + +def backward(A, B, data, scale): + + N= np.shape(A)[0] + T= np.shape(data)[0] + beta=np.zeros((T,N)) + beta_p=np.zeros((T,N)) + + beta[-1,:]=np.ones(N) + beta_p[-1,:]=np.ones(N) + + for t in reversed(range(T-1)): + for n in range(N): + beta[t,n]= sum( A[n,:] * beta_p[t+1,:] * B[:,t+1]) + beta_p[t,n]= beta[t,n]/scale[t+1] + + return beta_p + + + +def objective_old(x,gamma,data): + if len(x)==6: + a0=x[0] + b0=x[1] + a1=x[2] + b1=x[3] + a2=x[4] + b2=x[5] + cost= np.sum((fact(data[:,1]) - fact(data[:,0]) - fact(data[:,1]-data[:,0]) + Betal(data[:,0]+a0, data[:,1]-data[:,0]+b0) - Betal(a0,b0))*gamma[0,:]) + \ + np.sum((fact(data[:,1]) - fact(data[:,0]) - fact(data[:,1]-data[:,0]) + Betal(data[:,0]+a1, data[:,1]-data[:,0]+b1) - Betal(a1,b1))*gamma[1,:]) + \ + np.sum((fact(data[:,1]) - fact(data[:,0]) - fact(data[:,1]-data[:,0]) + Betal(data[:,0]+a2, data[:,1]-data[:,0]+b2) - Betal(a2,b2))*gamma[2,:]) + + elif len(x)==4: + a0=x[0] + b0=x[1] + a1=x[2] + b1=x[3] + + cost= np.sum((fact(data[:,1]) - fact(data[:,0]) - fact(data[:,1]-data[:,0]) + Betal(data[:,0]+a0, data[:,1]-data[:,0]+b0) - Betal(a0,b0))*gamma[0,:]) + \ + np.sum((fact(data[:,1]) - fact(data[:,0]) - fact(data[:,1]-data[:,0]) + Betal(data[:,0]+a1, data[:,1]-data[:,0]+b1) - Betal(a1,b1))*gamma[1,:]) + + + return (-1)*cost + + + + +def objective(a, meanp, k, data): + cost=0 + + b=a * (1-meanp)/meanp + + for w in range(len(data)): + + ll=(fact(data[w,1]) - fact(data[w,0]) - fact(data[w,1]-data[w,0]) + \ + Betal(data[w,0]+a, data[w,1]-data[w,0]+b) - Betal(a,b)) + + + Wll= ll * k[w] + + cost= cost + Wll + + return (-1)*cost + + + + + +def 
makeB(data,up_a0,up_b0,up_a1,up_b1,M,T,hd): + + + B = np.zeros((M,T)) + B[0,:]= np.exp(fact(data[:,1]) - fact(data[:,0]) - fact(data[:,1]-data[:,0]) + Betal(data[:,0]+up_a0, data[:,1]-data[:,0]+up_b0) - Betal(up_a0,up_b0)) + B[1,:]= np.exp(fact(data[:,1]) - fact(data[:,0]) - fact(data[:,1]-data[:,0]) + Betal(data[:,0]+up_a1, data[:,1]-data[:,0]+up_b1) - Betal(up_a1,up_b1)) + + B[0,hd]=0 + B[1,hd]=1 + return B + + +def expectation(data, A, B, pos, chrm_no, log_bscale, pi): + + M = A.shape[0] + T = len(data) + alpha=np.zeros((T,M)) + beta=np.zeros((T,M)) + scale=np.zeros(T) + + + + + for chrm in range(chrm_no): + data_chr=data[pos[chrm]:pos[chrm+1],:] + + alpha[pos[chrm]:pos[chrm+1],:],scale[pos[chrm]:pos[chrm+1]] = forward(A,B[:,pos[chrm]:pos[chrm+1]],pi,data_chr) + beta[pos[chrm]:pos[chrm+1],:] = backward(A,B[:,pos[chrm]:pos[chrm+1]],data_chr,scale[pos[chrm]:pos[chrm+1]]) + + post= alpha*beta + + + li=np.sum(np.log(scale)) + log_bscale + + + assert isclose(sum(abs(np.sum(post,axis=1) - np.ones(np.shape(B)[1]))), 0, abs_tol=1e-7), "sum of alpha and beta is not 1" + assert isclose(sum(abs(np.sum(A,axis=1) - np.ones(np.shape(A)[0]))), 0, abs_tol=1e-7), "sum of transition matrix columns is not 1" + + #pi_new=np.mean(post[pos[:-1],:],0) + pi_new=matrix_power(A,10000)[0] + return post.T, li, alpha, beta, scale, pi_new + + + + +def transition(alpha1, beta1, n1, gamma1, A, B1, pos1): + break1=0 + pos=pos1[1:-1] + B=np.vsplit(B1.T, pos) + gamma=np.vsplit(gamma1.T, pos) + alpha=np.vsplit(alpha1, pos) + beta=np.vsplit(beta1, pos) + n=np.hsplit(n1, pos) + + + + A_new = np.zeros_like(A) + n_states = A.shape[0] + + for i in range(n_states): + for j in range(n_states): + for a, b, e, n_ in zip(alpha, beta, B, n): + A_new[i, j] += np.sum( + a[:-1, i] * A[i, j] * b[1:, j] * e[1:, j] / n_[1:] + ) + + gamma_sum = np.sum([np.sum(g[:-1], 0) for g in gamma], 0) + A_new /= gamma_sum[:, np.newaxis] + if (np.isnan(A_new)).any(): + break1=1 + else: + assert np.allclose(np.sum(A_new, 1), 1) + + return A_new, break1 + +def emission(gamma, data, p1avg, M, x0, highdiff): + + T = len(data) + + + + # optimize + bd = (0.01,100000) + mean_inb,mean_id=p1avg + + solution_inb = minimize_scalar(objective,x0[0],args=(mean_inb,gamma[0,:],data),method='Bounded',\ + bounds=bd) + solution_id = minimize_scalar(objective,x0[2],args=(mean_id,gamma[1,:],data),method='Bounded',\ + bounds=bd) + + a_inb=solution_inb.x + a_id=solution_id.x + b_inb= a_inb * (1-mean_inb)/mean_inb + b_id= a_id * (1-mean_id)/mean_id + + x=[a_inb,b_inb,a_id,b_id] + + up_p=np.zeros((M)) + + + up_a0=x[0] + up_b0=x[1] + up_a1=x[2] + up_b1=x[3] + + + up_p[0]=up_a0/(up_a0+up_b0) + up_p[1]=up_a1/(up_a1+up_b1) + + + + B=makeB(data=data, up_a0=up_a0, up_b0=up_b0, up_a1=up_a1, up_b1=up_b1, M=M, T=T, hd=highdiff) + [B, log_bscale]=scaleB(B) + + return B, up_p, log_bscale,x + + +def bnd_calc(mean_p, dist, propvar): + + r=(1-mean_p)/mean_p + t=propvar*dist*dist + + low_bnd=-1 * (t + (t*r*r) + r*((2*t) - 1))/(t+ (3*t*r) + (3*r*r*t) + (r*r*r*t)) + return low_bnd + +def baum_welch(data, A, B, pos, p1avg, log_bscale, x0, highdiff, max_iter=1000): + + chrm_no=len(pos)-1 + n_iter=0 + diff=1 + last_lik=0 + + + + #print("initial beta parameters are:",x0) + + [mean_inb,mean_id]=[p1avg[0],p1avg[1]] + + di=mean_id-mean_inb + + + bnd_inb= bnd_calc(mean_p=mean_inb, dist=di, propvar=1) + bnd_id= bnd_calc(mean_p=mean_id, dist=di, propvar=1) + bnds=[bnd_inb,bnd_id] + #I have not used any bounds yet + pi=[0.5,0.5] + + while diff > 0.000001 and n_iter0) + + diff=diff1[gudwin] + 
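+    # gudwin indexes windows with nonzero counts; diff (mismatches) and total
+    # (overlapping sites) per retained window are the observations for the
+    # two-state Beta-binomial HMM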
    total=total1[gudwin]
+    chrm=chrm1[gudwin]
+
+    win=len(total)
+
+    with open(p1file,"r") as f:
+        p_1=float(f.read())
+
+    p_12=p_1/2
+    p_in=p_12/2
+    initial_p=np.array([p_in, p_12])
+
+    highdiff=np.where(diff/total>p_1)
+
+    data=np.stack([diff,total]).T
+
+    pos=np.where(chrm[:-1] != chrm[1:])[0]+1
+    pos=np.append(pos,np.shape(total)[0])
+    pos=np.insert(pos, 0, 0, axis=0)
+
+    A=np.array([[0.8,0.2],[0.2,0.8]])
+
+    Bu = np.zeros((np.shape(A)[0],win)) #Emission probability
+    [b0, b1]=[1000,1000]
+    [a0,a1]=[b0*initial_p[0]/(1-initial_p[0]), b1*initial_p[1]/(1-initial_p[1])]
+    x0 = [a0,b0,a1,b1]
+
+    Bu=makeB(data=data, up_a0=x0[0], up_b0=x0[1], up_a1=x0[2], up_b1=x0[3], M=np.shape(A)[0], T=len(data), hd=highdiff)
+
+    [B, log_bscale]= scaleB(Bu)
+
+    np.argwhere(np.isnan(B))
+
+    #print("initial A=",A)
+
+    break1=0 #so the check below is defined even when baum_welch is skipped
+    if win>40: #run the HMM only if more than 40 windows have nonzero counts
+        gamma,A,B,lik,break1= baum_welch(data=data, A=A, B=B, pos=pos, p1avg=initial_p, log_bscale=log_bscale, x0=x0, highdiff=highdiff)
+    else:
+        gamma=np.ones([2,win]) * 9
+        lik='9'
+    #res=viterbi(data, A, B, pi)
+    if break1==1:
+        gamma=np.ones([2,win]) * 9
+        lik='9'
+    #resall=np.ones(totalwin)*9
+    #resall[gudwin]=res
+    #np.savetxt(fname=resfile, X=resall,delimiter=',')
+    #observations1['count']=1
+    #observations1['dis']=resall
+    gammall=np.ones((totalwin,2))*9
+    gammall[gudwin,:]=gamma.T
+
+    dfg=pd.DataFrame({'chrom':chrm1,
+                      'count': total1,
+                      'g_noin': gammall[:,1]})
+
+    harr=np.asarray(dfg['g_noin'])
+    goodprop=np.sum((harr>=0.9) & (harr<1.1))/np.sum(harr<=1.1)
+    if goodprop<0.5:
+        dfg['g_noin']=9
+
+    with pd.option_context('display.max_rows', len(dfg.index), 'display.max_columns', len(dfg.columns)):
+        dfg.to_csv(resfile, sep=',')
+
+    with open(likfile,'w') as f:
+        f.write(str(lik))
+
+
+def modelPerformance(files, outf1, outf2):
+    ai=[]
+    ani=[]
+    for i in range(len(files)):
+        a=np.loadtxt(files[i],dtype='float', delimiter = ",")
+        l=files[i].split('pw_')[1].split('.csv')[0]
+        #deviations for the listed individuals feed the 'inbred' summary (outf1);
+        #all others go to the 'noinbred' summary (outf2)
+        if l in ['0','1', '2', '3', '4','5','6','7','15','16']:
+            ai.extend(a)
+        else:
+            ani.extend(a)
+    #print('ai is', ai)
+    #print('ani is', ani)
+    avgi=np.mean(ai)
+    avgni=np.mean(ani)
+    vari=np.var(ai)
+    varni=np.var(ani)
+
+    np.savetxt(fname=outf1, X=[avgi,vari],delimiter=',')
+    np.savetxt(fname=outf2, X=[avgni,varni],delimiter=',')
+
+
+def tablef(inf, inf_noROH, inf_ROH, outf):
+
+    bias_inf=[]
+    var_inf=[]
+
+    for d in range(len(inf)):
+        a=np.loadtxt(inf[d],dtype='float', delimiter = ",")
+        bias_inf.append(a[0])
+        var_inf.append(a[1])
+
+    bias_infnoROH=[]
+    var_infnoROH=[]
+
+    for d in range(len(inf_noROH)):
+        a=np.loadtxt(inf_noROH[d],dtype='float', delimiter = ",")
+        bias_infnoROH.append(a[0])
+        var_infnoROH.append(a[1])
+
+    bias_infROH=[]
+    var_infROH=[]
+
+    for d in range(len(inf_ROH)):
+        a=np.loadtxt(inf_ROH[d],dtype='float', delimiter = ",")
+        bias_infROH.append(a[0])
+        var_infROH.append(a[1])
+
+    df = pd.DataFrame(
+        {'bias': bias_inf,
+         'var': var_inf,
+         'bias_noROH': bias_infnoROH,
+         'var_noROH': var_infnoROH,
+         'bias_ROH': bias_infROH,
+         'var_ROH': var_infROH,
+        })
+
+    with pd.option_context('display.max_rows', len(df.index), 'display.max_columns', len(df.columns)):
+        df.to_csv(outf, sep=',')
+
+
+"""
+
+p1="Chagyrskaya07"
+
+
+datafolder="/mnt/diversity/divyaratan_popli/100arc/inbreeding/laurits_writeup/Ch9_rem/neavar/noCh12_bayesContam/"
+
+datafile=datafolder+"identicalmergedwin_remHighdiv_fil0/pw_%s.csv" %(p1)
+p1file=datafolder+"hmm_parameters_fil0/p1_file"
+
+
+gammall[gammall[:,1]>1,1] = 1
+plt.plot(gammall[:,1])
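+
+# a minimal self-contained sketch (illustration only, not used by the pipeline)
+# of the scaled forward-backward pass on made-up counts, using the makeB/scaleB/
+# forward/backward functions defined above:
+#
+# toy = np.array([[2, 50], [0, 40], [1, 45], [4, 60]])  # (diff, total) per window
+# A_toy = np.array([[0.8, 0.2], [0.2, 0.8]])            # 2-state transition matrix
+# hd_toy = np.where(toy[:, 0] / toy[:, 1] > 0.5)        # windows flagged as high-divergence
+# B_toy = makeB(data=toy, up_a0=5, up_b0=995, up_a1=10, up_b1=990,
+#               M=2, T=4, hd=hd_toy)
+# B_toy, log_s = scaleB(B_toy)
+# alpha, sc = forward(A_toy, B_toy, pi=[0.5, 0.5], data=toy)
+# beta = backward(A_toy, B_toy, toy, sc)
+# post = alpha * beta                                   # per-window state posteriors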
+#plt.plot(true['dis']/true['count']) +plt.plot(observations1['dis']/observations1['count']) + + + + +from scipy.stats import beta + +ite=str(n_iter) + +jet= plt.get_cmap('jet') +colors = iter(jet(np.linspace(0,1,4))) +labels=['inb','id'] + + +plt.vlines(x=initial_p[0],ymin=0,ymax=100) +plt.vlines(x=initial_p[1],ymin=0,ymax=100) + +for i in range(2): + aa,bb=x0[2*i],x0[(2*i) + 1] + xx = np.linspace(0.0, 0.057, 10000) + plt.plot(xx, beta.pdf(xx, aa, bb),'r-', lw=5, alpha=0.6,color=next(colors),label=labels[i]) + plt.ylim(0,100) + plt.legend() + + +fname="/home/divyaratan_popli/Documents/after_seminar/var_beta_x_%s" %(ite) +plt.savefig(fname,bbox_inches='tight') +plt.close() +""" diff --git a/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/input_preparation_func.py b/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/input_preparation_func.py new file mode 100644 index 0000000..b3d8069 --- /dev/null +++ b/Simulations_test/KIN_hmm/contamination_missSpecification/input_files/input_preparation_func.py @@ -0,0 +1,616 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Oct 14 13:22:51 2019 + +@author: divyaratan_popli +""" + + +import os +import msprime as msp +import numpy as np +import random +import pandas as pd +import matplotlib.pyplot as plt +from scipy import stats +from matplotlib import figure +from scipy.stats import bernoulli +import math +#from astropy.stats import jackknife_resampling +#from astropy.stats import jackknife_stats +#from scipy.signal import savgol_filter +from ast import literal_eval +import pysam +import pybedtools +##########################################################start with simulation for relatedness distn + +def pileup2alleles(bam, snp): + """returns the number of ref and alt alleles at each position""" + ref, alt = snp.name, snp.score + ref_cnt, alt_cnt = 0, 0 + for pileupcolumn in bam.pileup(snp.chrom, snp.start, snp.end): + if pileupcolumn.reference_pos == snp.start: #find out the position in pilup that matches position in bed + for base in pileupcolumn.get_query_sequences(): #get all bases present at that position + if base.upper() == ref.upper(): ref_cnt +=1 + if base.upper() == alt.upper(): alt_cnt +=1 + return ref_cnt, alt_cnt + return 0, 0 + + +def makePseudohap(inbam,inbai,inbed,outps): + """takes bam, index, bed files to do random sampling and create pseudohaps""" + big_list=[] + chrm=[] + pos=[] + var=9 + bed = pybedtools.BedTool(inbed) + with pysam.AlignmentFile(inbam) as bam: + for i, snp in enumerate(bed): + + freq = pileup2alleles(bam, snp) + if freq[0]+freq[1]==0: + big_list.append(9) + var=9 + else: + ref_alt=[snp.name, snp.score] + p=freq[1]/(freq[0]+freq[1]) + rand=np.random.binomial(1, p) + randsnp=ref_alt[rand] + big_list.append(randsnp) + + chrm.append(snp.chrom) + pos.append(snp.start) + + df=pd.DataFrame({ + "chrm":chrm, + "pos":pos, + "allele":big_list + }) + + with pd.option_context('display.max_rows', len(df.index), 'display.max_columns', len(df.columns)): + df.to_csv(outps, sep=',') + + +def identicalHaps(inbam,inbai,inbed,outps0,outps1): + """takes bam, index, bed files to do random sampling and create pseudohaps""" + big_list0=[] + chrm0=[] + pos0=[] + big_list1=[] + chrm1=[] + pos1=[] + var=9 + bed = pybedtools.BedTool(inbed) + with pysam.AlignmentFile(inbam) as bam: + for i, snp in enumerate(bed): + + freq = pileup2alleles(bam, snp) + if freq[0]+freq[1]<2: + big_list0.append(9) + big_list1.append(9) + var=9 + else: + ref_alt=[snp.name, snp.score] + a=[snp.name] * 
freq[0] + [snp.score] * freq[1] + randhaps=random.sample(a,2) + rand0=randhaps[0] + rand1=randhaps[1] + + big_list0.append(rand0) + big_list1.append(rand1) + + chrm0.append(snp.chrom) + pos0.append(snp.start) + + chrm1.append(snp.chrom) + pos1.append(snp.start) + + df0=pd.DataFrame({ + "chrm":chrm0, + "pos":pos0, + "allele":big_list0 + }) + + df1=pd.DataFrame({ + "chrm":chrm1, + "pos":pos1, + "allele":big_list1 + }) + + with pd.option_context('display.max_rows', len(df0.index), 'display.max_columns', len(df0.columns)): + df0.to_csv(outps0, sep=',') + with pd.option_context('display.max_rows', len(df1.index), 'display.max_columns', len(df1.columns)): + df1.to_csv(outps1, sep=',') + + +def filterPseudohap(filist,posfile,outfilter,readfile,fil,fil_version): + """Find polymorphic positions in all bam files listed in filist and store it in outfilter.""" + """Create per chromosome input files for READ in tped format""" + + poly=np.loadtxt(filist,dtype='float', delimiter = ",") + pos=np.loadtxt(posfile,dtype='float', delimiter = ",") + poly[poly==-9]=np.nan + #ascpos=np.nansum(poly-np.vstack(poly[:,0]), axis=1)!=0 + if int(fil)==1: + ascpos=[] + for i in range(len(poly)): + ascpos.append(len(np.unique(poly[i,~np.isnan(poly[i,:])]))>1) + + keep=pos[ascpos] + + + elif int(fil)==0: + keep=pos + + + np.savetxt(fname=outfilter, X=keep, delimiter=",") + + if fil_version=='hap': + read=np.repeat(poly, 2, axis=1) + + if int(fil)==1: + readpos=read[ascpos] + elif int(fil)==0: + readpos=read + + np.savetxt(fname=readfile, X=readpos, delimiter=",") + + +def READInput(filist, posfile, read, tfam, libraries): + """Merge per chromosome files from filterPseudohap into both filtered/non filtered tped files for READ, also output tfam file for READ.""" + + + + fi0=np.loadtxt(filist[0],dtype='float', delimiter=",") + po0=np.loadtxt(posfile[0],dtype='float', delimiter=",") + + chrm=[1]*len(po0) + + for i in range(1,len(filist)): + + fi=np.loadtxt(filist[i],dtype='float', delimiter=",") + fi0=np.concatenate((fi0,fi),axis=0) + + po=np.loadtxt(posfile[i],dtype='float', delimiter=",") + po0=np.concatenate((po0,po),axis=0) + + chrm.extend([i+1]*len(po)) + + df=pd.DataFrame(columns=list(range(np.shape(fi0)[1])), index=list(range(np.shape(fi0)[0])), data=fi0) + df[df==0]='A' + df[df==1]='G' + df=df.replace(np.nan,0) + + + df.insert(loc=0, column='chrm', value=chrm) + df.insert(loc=1, column='pos', value=po0) + df.insert(loc=1, column='gndlist', value=0) + df.insert(loc=1, column='name', value=df['chrm'].astype(str)+'_'+df['pos'].astype(str)) + df['pos']=df['pos'].astype(int) + + + with pd.option_context('display.max_rows', len(df.index), 'display.max_columns', len(df.columns)): + df.to_csv(read, sep='\t', header=None, index=False) + + + #making the tfam files + libs=['_'+str(i)+'_' for i in libraries] + tf=pd.DataFrame() + tf[0]=libs + tf[1]=libs + tf[2]=0 + tf[3]=0 + tf[4]=0 + tf[5]=0 + + with pd.option_context('display.max_rows', len(tf.index), 'display.max_columns', len(tf.columns)): + tf.to_csv(tfam, sep='\t', header=None, index=False) + + + +def findDiff(indfile,infilter,posall,difffile,fil): + """Find pairwise differences along genome for two specimen. 
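+    For pseudohaploid genotypes x_i, x_j coded 0/1, the per-site difference is
+    x_i*(1-x_j) + (1-x_i)*x_j, i.e. 1 exactly when the two sampled alleles differ;
+    this is computed below for every pair of columns at once.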
In output file 9 is for missing data""" + """Fil=1 is for using only polymorphic sites""" + + inds=np.loadtxt(indfile, dtype='float', delimiter=",") + + if int(fil)==1: + pos_fil=np.loadtxt(infilter, dtype='float', delimiter=",") + pos_all=np.loadtxt(posall, dtype='float', delimiter=",") + + filIdx = np.isin(pos_all,pos_fil) + + inds1=inds[filIdx] + + elif int(fil)==0: + inds1=inds.copy() + + inds1[inds1==-9]=np.nan + + + diff0=(np.vstack(inds1[:,0]) * (1-inds1)) + ((1-np.vstack(inds1[:,0])) * inds1) + diffadd0=np.delete(diff0, np.s_[0:1], axis=1) + for j in range(1,np.shape(inds1)[1]): + + + diffi=(np.vstack(inds1[:,j]) * (1-inds1)) + ((1-np.vstack(inds1[:,j])) * inds1) + diffadd=np.delete(diffi, np.s_[0:j+1], axis=1) + diffadd0=np.concatenate((diffadd0,diffadd),axis=1) + + np.savetxt(fname=difffile, X=diffadd0, delimiter=",") + + + +def identicalDiff(indfile, infilter, posall, difffile, fil): + inds=np.loadtxt(indfile, dtype='float', delimiter=",") + + if int(fil)==1: + pos_fil=np.loadtxt(infilter, dtype='float', delimiter=",") + pos_all=np.loadtxt(posall, dtype='float', delimiter=",") + + filIdx = np.isin(pos_all,pos_fil) + + inds1=inds[filIdx] + elif int(fil)==0: + inds1=inds.copy() + + inds1[inds1==-9]=np.nan + np.savetxt(fname=difffile, X=inds1, delimiter=",") + + + +def getWin(diff, posf, dfile, tfile, interval): + df=np.loadtxt(diff, dtype='float', delimiter=',') + pos=np.loadtxt(posf, dtype='float', delimiter=',') + + length=pos[-1] + + bounds= np.arange(0,length,interval) + snp_bin=np.digitize(pos,bounds,right=True) + [uniq,ind]=np.unique(snp_bin,return_index=True) + ind= np.append(ind,len(pos)) + ind1= ind[1:-1] + + dfsub=np.vsplit(df,ind1) + ds=np.zeros([len(dfsub), np.shape(df)[1]]) + ts=np.zeros([len(dfsub), np.shape(df)[1]]) + for i in range(len(dfsub)): + ds[i]=np.nansum(dfsub[i],0) + ts[i]=np.sum(~np.isnan(dfsub[i]),0) + + winlen= len(ind)-1 + + totalwin=math.ceil(length/interval) + if winlen !=totalwin: + missingwins=np.setdiff1d(list(range(1,totalwin+1)), uniq) + ds=np.insert(ds,missingwins,np.zeros(np.shape(df)[1]),axis=0) + ts=np.insert(ts,missingwins,np.zeros(np.shape(df)[1]),axis=0) + + + np.savetxt(fname=dfile, X=ds, delimiter=",") + np.savetxt(fname=tfile, X=ts, delimiter=",") + + + +def mergeChrm(dinfiles, tinfiles, dfile, tfile, chfile): + """Merge per chromosome window files for a pair of individual""" + #merging the data files + + + df0=np.loadtxt(dinfiles[0], dtype='float', delimiter=',') + tf0=np.loadtxt(tinfiles[0], dtype='float', delimiter=',') + chrm0=np.ones(len(tf0)) + for fi in range(1,len(dinfiles)): + + df=np.loadtxt(dinfiles[fi], dtype='float', delimiter=',') + tf=np.loadtxt(tinfiles[fi], dtype='float', delimiter=',') + chrm=np.ones(len(tf)) * (fi+1) + + df0=np.concatenate((df0,df),axis=0) + tf0=np.concatenate((tf0,tf),axis=0) + chrm0=np.concatenate((chrm0,chrm),axis=0) + + np.savetxt(fname=dfile, X=df0, delimiter=",") + np.savetxt(fname=tfile, X=tf0, delimiter=",") + np.savetxt(fname=chfile, X=chrm0, delimiter=",") + + +def contamFilemiss(contaml, outfile_i, outfile_p, miss_contam, miss_ind): + + cnf=contaml + cnf[int(miss_ind)]=float(miss_contam) + inds=list(range(len(cnf))) + names=[] + contam=[] + for ch1 in range(len(cnf)-1): + for ch2 in range(ch1+1,len(cnf)): + names.append(str(inds[ch1]) + '_' + str(inds[ch2])) + ctotal=cnf[ch1] + cnf[ch2] + contam.append(ctotal) + + df_i= pd.DataFrame( + {'name': inds, + 'contam': 2*np.array(cnf) + }) + + df = pd.DataFrame( + {'name': names, + 'contam': contam + }) + + with 
pd.option_context('display.max_rows', len(df_i.index), 'display.max_columns', len(df_i.columns)): + df_i.to_csv(outfile_i, sep=',') + + with pd.option_context('display.max_rows', len(df.index), 'display.max_columns', len(df.columns)): + df.to_csv(outfile_p, sep=',') + + +def nhFile(genf,nhf,ind1,ind2,fil_posf,all_posf): + + gen0=np.array(pd.read_csv(genf[0], sep=',',index_col=0,header=0)) + all_pos0=np.loadtxt(all_posf[0], delimiter=',', dtype='float') + + fil_pos0=np.loadtxt(fil_posf[0], delimiter=',', dtype='float') + idx0=np.isin(all_pos0, fil_pos0) + allg=gen0[idx0] + + for i in range(1,len(genf)): + + gen=np.array(pd.read_csv(genf[i], sep=',',index_col=0,header=0)) + all_pos=np.loadtxt(all_posf[i], delimiter=',', dtype='float') + + fil_pos=np.loadtxt(fil_posf[i], delimiter=',', dtype='float') + idx=np.isin(all_pos,fil_pos) + gen1=gen[idx] + allg=np.concatenate((allg,gen1),axis=0) + + j1=allg[:,int(ind1*2)] + j2=allg[:,int(ind1*2 + 1)] + k1=allg[:,int(ind2*2)] + k2=allg[:,int(ind2*2) + 1] + + nD=j1+j2 + nA=2-nD + hD=k1+k2 + hA=2-hD + + + diff=((nA*hD)+(nD*hA)) / ((nA+nD)*(hA+hD)) + + nh_diff=np.mean(diff) + + with open(nhf, 'w') as f: + print(nh_diff,file=f) + + +def adjust_d(D_obs, c, p_c, p_e): + x1 = p_e * (1-c) + x2 = p_c * c + return D_obs * x1 / (x1 + x2) + +def adjust_s(S_obs, c, p_c, p_e): + x1 = (1-p_e) * (1-c) + x2 = (1-p_c) * c + x=S_obs * x1 / (x1 + x2) + x[np.isnan(x)]=0 + return x + +def adjust_sd(D_obs, N_obs, *args, **kwargs): + d_adj = adjust_d(D_obs, *args, **kwargs) + s_adj = adjust_s(N_obs - D_obs, *args, **kwargs) + return d_adj, d_adj+s_adj + + +def contamAll(dfile, tfile, iscnt, cfile, Dnhfile, difffile, totalfile): + + diff=np.loadtxt(dfile, delimiter=',', dtype='float') + total=np.loadtxt(tfile, delimiter=',', dtype='float') + + if str(iscnt)=='1': + with open(Dnhfile,"r") as f: + p_c=float(f.read()) + + + df=pd.read_csv(cfile, sep=",",header=0,index_col=0) + cnt1=np.array(df['contam']) + + c=cnt1/100 + + propmean=np.sum(diff, axis=0)/np.sum(total, axis=0) + + + p_e = (propmean - c * p_c)/ (1-c) + p_e[p_e>1]=1 + + d_cor, n_cor = adjust_sd(D_obs=diff, N_obs=total, p_e=p_e, p_c=p_c, c=c) + print(d_cor[np.isnan(d_cor)]) + + if np.all(np.where(np.isnan(np.sum(d_cor,0)))[0] == np.where(np.isnan(np.sum(n_cor,0)))[0]): + d_cor[np.isnan(d_cor)] = 0 + n_cor[np.isnan(n_cor)] = 0 + + + if(np.sum(np.isnan(d_cor))==0): + if(np.sum(np.isnan(n_cor))==0): + + + np.savetxt(fname=difffile, X=d_cor, delimiter=",") + np.savetxt(fname=totalfile, X=n_cor, delimiter=",") + + elif str(iscnt)=='0': + np.savetxt(fname=difffile, X=diff, delimiter=",") + np.savetxt(fname=totalfile, X=total, delimiter=",") + + +def getHighDiv(diff, total, fout): + + alld= np.loadtxt(diff, delimiter=',', dtype='float') + allt= np.loadtxt(total, delimiter=',', dtype='float') + + + prop=alld/allt + avgprop= np.nansum(prop,axis=1) / np.sum(~np.isnan(prop),1) + + m=np.mean(avgprop[~np.isnan(avgprop)]) + sd=np.sqrt(np.var(avgprop[~np.isnan(avgprop)])) + + #print(m,sd) + avgprop[np.isnan(avgprop)]=-9 + fres=np.where((avgprop>-9) & (avgprop>m+3*sd))[0] + np.savetxt(fout, fres, delimiter=',') + + + +def remHighDiv(diffname, totalname, winfile, outd, outt): + winsHD=np.loadtxt(winfile, ndmin=1).astype(int) + + df= np.loadtxt(diffname, delimiter=',', dtype='float') + dt= np.loadtxt(totalname, delimiter=',', dtype='float') + df[winsHD,:] = 0 + dt[winsHD,:] = 0 + + np.savetxt(outd, df, delimiter=',') + np.savetxt(outt, dt, delimiter=',') + + + + +def getP(dfile, tfile, targets, pfile, goodpairs, allp, overf): + 
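+    # p_1 written here is the median pairwise mismatch proportion over all
+    # sufficiently covered pairs; the HMMs read it back as their baseline rate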
"""Calculate the median proportion of differences in all pair of individuals. This will be an input for hmm""" + med=[] + names=[] + + + + obsd= np.loadtxt(dfile, delimiter=',', dtype='float') + obst= np.loadtxt(tfile, delimiter=',', dtype='float') + + prop=np.sum(obsd, axis=0)/np.sum(obst, axis=0) + #chrm= np.loadtxt(chfile, delimiter=',', dtype='float') + #calculating p1 + goodlibs=np.sum(obst>0, axis=0)>10 + + obsd1=obsd[:,goodlibs] + obst1=obst[:,goodlibs] + prop1=prop[goodlibs] + + names=np.array(targets) + + names1=names[goodlibs] + + + med=np.median(prop1) + + + over=np.sum(obst, axis=0) + + with open(pfile, 'w') as f: + print(med,file=f) + #np.savetxt(fname=allp, X=med, delimiter=',') + np.savetxt(fname=goodpairs, fmt='%s', X=names1, delimiter='/n') + + df = pd.DataFrame( + {'pair': names, + 'prop': prop + }) + + with pd.option_context('display.max_rows', len(df.index), 'display.max_columns', len(df.columns)): + df.to_csv(allp, sep=',') + + + df_over=pd.DataFrame( + {'pair': names, + 'overlap': over + }) + + with pd.option_context('display.max_rows', len(df_over.index), 'display.max_columns', len(df_over.columns)): + df_over.to_csv(overf, sep=',') + + + +def inbredwin_transform(inputall,alls,targets,fi,outlist,totalch,interval,fil): + + tb_all=pd.read_csv(inputall,sep=',',header=0,index_col=0) + unique_id=np.unique(tb_all['id']) + l=len(fi) + allids=np.array(list(range(1,l+1))) + noinbred=np.setdiff1d(allids,unique_id,assume_unique=False) + + + totalch=np.array(totalch) + #print(totalch[0]) + for idn in unique_id: + tb=tb_all.loc[tb_all['id']==idn,:] + + + allsites=pd.read_csv(alls,sep='\t',header=None,index_col=False, low_memory=False) + allsites=allsites.loc[allsites[0].isin(totalch.astype(str)),:] + allsites[0]=allsites[0].astype(int) + + ind=allsites[0].diff()[allsites[0].diff() != 0].index.values - 1 + ind=ind[1:] + ind=np.append(ind,len(allsites[0])-1) + + + wint=[] + + for ch in totalch: + + length=allsites.loc[ind[ch - 1],1] + + inbred=tb.loc[tb['chrom']==ch,['start_pos','end_pos']] + for i in range(np.shape(inbred)[0]): + + pos=inbred.iloc[i,:] + + bounds= np.arange(0,length,interval) + snp_bin=np.digitize(pos,bounds,right=True) + #print(snp_bin) + if snp_bin[0]==snp_bin[1]: + bins=snp_bin[0] + else : + bins=list(range(snp_bin[0],snp_bin[1]+1)) + wint.append((ch,bins)) + + + + pair=targets[idn-1] + + + obs=pd.read_csv(fi[idn-1],sep=',',header=0,index_col=0) + data=obs[['dis','count']] + + + inbredwin=[] + + + for i in range(len(wint)): + winlist=[] + + if type(wint[i][1]) is list: + winlist.extend(wint[i][1]) + else: + winlist.append(wint[i][1]) + + in_ch=wint[i][0] + for j in range(len(winlist)): + in_win=winlist[j] + + + zero=obs.loc[obs["chrom"]==in_ch,:].index.values[0]-1 + inbredwin.append(zero+in_win) + + hbdstate=np.ones(len(data)) + hbdstate[inbredwin]=0 + #print(hbdstate) + np.savetxt(fname=outlist[idn-1], X=hbdstate) + + + for noin in noinbred: + pair=targets[noin-1] + + obs=pd.read_csv(fi[noin-1],sep=',',header=0,index_col=0) + data=obs[['dis','count']] + hbdstate=np.ones(len(data)) + np.savetxt(fname=outlist[noin-1], X=hbdstate)