Skip to content

Commit

Permalink
fixing vcf parsing when there are dots in the alternative allele field, which can encode a fixed locus - keeping the dot as the alternative allele and adding additional allele depths of zero to represent them
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffersonfparil committed Jan 11, 2024
1 parent 3e65793 commit 481dd5a
Show file tree
Hide file tree
Showing 16 changed files with 159 additions and 38,695 deletions.
77 changes: 0 additions & 77 deletions res/LINKIMPUTE_INPUT-maf0.25-missing_rate0.8-55379007.tsv

This file was deleted.

77 changes: 0 additions & 77 deletions res/LINKIMPUTE_INPUT-maf0.25-missing_rate0.9-103760824.tsv

This file was deleted.

5,624 changes: 0 additions & 5,624 deletions res/grape-Coefficient_of_determination.svg

This file was deleted.

5,443 changes: 0 additions & 5,443 deletions res/grape-Concordance.svg

This file was deleted.

5,547 changes: 0 additions & 5,547 deletions res/grape-Mean_absolute_error.svg

This file was deleted.

4,007 changes: 0 additions & 4,007 deletions res/lucerne-Coefficient_of_determination.svg

This file was deleted.

3,861 changes: 0 additions & 3,861 deletions res/lucerne-Concordance.svg

This file was deleted.

3,799 changes: 0 additions & 3,799 deletions res/lucerne-Mean_absolute_error.svg

This file was deleted.

66 changes: 44 additions & 22 deletions res/perf.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,11 @@ fn_imputation_accuracy = function(fname_imputed, list_sim_missing, ploidy=4, str
# lukes = FALSE
# mat_genotypes = NULL
# n_threads = 32

### Pick high-depth missing data points to recompute accuracies

### Accuracies per range of allele freqs

### Extract imputed allele frequencies corresponding to the expected allele frequencies
n_missing = length(list_sim_missing$vec_missing_loci)
# vec_imputed = c()
Expand Down Expand Up @@ -267,23 +272,27 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra
fname_out_linkimpute = paste0("LINKIMPUTE_INPUT-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id,"-IMPUTED.csv")
vcf_for_linkimpute = vcfR::read.vcfR(list_sim_missing$fname_vcf)
mat_genotypes_for_linkimpute = t(fn_classify_allele_frequencies(fn_extract_allele_frequencies(vcf_for_linkimpute), ploidy=2)) * 2
mat_genotypes_for_linkimpute[is.na(mat_genotypes_for_linkimpute)] = -1
write.table(mat_genotypes_for_linkimpute, file=fname_for_linkimpute, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
time_ini = Sys.time()
system(paste0("java -jar linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute))
duration_linkimpute = difftime(Sys.time(), time_ini, units="mins")
mat_linkimputed = read.delim(output_for_linkimpute, header=FALSE)
rownames(mat_linkimputed) = rownames(mat_genotypes_for_linkimpute)
colnames(mat_linkimputed) = colnames(mat_genotypes_for_linkimpute)
mat_linkimputed = t(mat_linkimputed / 2)
list_loci_names = strsplit(rownames(mat_linkimputed), "_")
chr = unlist(lapply(list_loci_names, FUN=function(x){(paste(x[1:(length(x)-2)], collapse="_"))}))
pos = unlist(lapply(list_loci_names, FUN=function(x){as.numeric(x[(length(x)-1)])}))
allele = unlist(lapply(list_loci_names, FUN=function(x){x[length(x)]}))
df_linkimputed = data.frame(chr, pos, allele)
df_linkimputed = cbind(df_linkimputed, mat_linkimputed)
colnames(df_linkimputed)[1] = "#chr"
write.table(df_linkimputed, file=fname_out_linkimpute, sep=",", row.names=FALSE, col.names=TRUE, quote=FALSE)
bool_enough_data_to_simulate_10k_missing = prod(dim(mat_genotypes_for_linkimpute)) >= (11000)
if (bool_enough_data_to_simulate_10k_missing == TRUE) {
### LinkImpute stalls if it cannot mask 10,000 data points for optimising l and k, because the number of non-missing data points is not enough to reach the fixed 10,000 random data points.
mat_genotypes_for_linkimpute[is.na(mat_genotypes_for_linkimpute)] = -1
write.table(mat_genotypes_for_linkimpute, file=fname_for_linkimpute, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
time_ini = Sys.time()
system(paste0("java -jar linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute))
duration_linkimpute = difftime(Sys.time(), time_ini, units="mins")
mat_linkimputed = read.delim(output_for_linkimpute, header=FALSE)
rownames(mat_linkimputed) = rownames(mat_genotypes_for_linkimpute)
colnames(mat_linkimputed) = colnames(mat_genotypes_for_linkimpute)
mat_linkimputed = t(mat_linkimputed / 2)
list_loci_names = strsplit(rownames(mat_linkimputed), "_")
chr = unlist(lapply(list_loci_names, FUN=function(x){(paste(x[1:(length(x)-2)], collapse="_"))}))
pos = unlist(lapply(list_loci_names, FUN=function(x){as.numeric(x[(length(x)-1)])}))
allele = unlist(lapply(list_loci_names, FUN=function(x){x[length(x)]}))
df_linkimputed = data.frame(chr, pos, allele)
df_linkimputed = cbind(df_linkimputed, mat_linkimputed)
colnames(df_linkimputed)[1] = "#chr"
write.table(df_linkimputed, file=fname_out_linkimpute, sep=",", row.names=FALSE, col.names=TRUE, quote=FALSE)
}
### Validating imputation
metrics_mvi = fn_imputation_accuracy(fname_imputed=fname_out_mvi,
list_sim_missing=list_sim_missing,
Expand All @@ -305,11 +314,24 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra
ploidy=ploidy,
strict_boundaries=strict_boundaries,
n_threads=n_threads)
metrics_linkimpute = fn_imputation_accuracy(fname_imputed=fname_out_linkimpute,
list_sim_missing=list_sim_missing,
ploidy=2,
strict_boundaries=strict_boundaries,
n_threads=n_threads)
if (bool_enough_data_to_simulate_10k_missing == TRUE) {
metrics_linkimpute = fn_imputation_accuracy(fname_imputed=fname_out_linkimpute,
list_sim_missing=list_sim_missing,
ploidy=2,
strict_boundaries=strict_boundaries,
n_threads=n_threads)
} else {
metrics_linkimpute = list(
frac_imputed = 0.0,
mae_freqs = NA,
rmse_freqs = NA,
r2_freqs = NA,
mae_classes = NA,
rmse_classes = NA,
r2_classes = NA,
concordance_classes = NA
)
}
### Merge imputation accuracy metrics into the output data.frame
# string_metric_lists = c("metrics_mvi", "metrics_aldknni", "metrics_lukes")
string_metric_lists = c("metrics_mvi", "metrics_aldknni", "metrics_aldknni_optim_thresholds", "metrics_aldknni_optim_counts", "metrics_linkimpute")
Expand Down
168 changes: 87 additions & 81 deletions res/perf.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Assessing imputation accuracies

## Variables
## Variables and datasets

- 3 imputation algorithms:
+ **ALDKNNI**: adaptive LD-kNN imputation
Expand Down Expand Up @@ -34,6 +34,7 @@

```shell
#!/bin/bash
conda activate bcftools
DIR=/group/pasture/Jeff/imputef/misc
cd $DIR
### (2) pools of diploid *Glycine max*
Expand All @@ -55,6 +56,75 @@ time Rscript \
mv LinkImpute.data.grape.num.raw.txt.vcf grape.vcf
```

*poolify.R* - assumes each individual per pool is perfectly equally represented (best-case scenario in the real world)

```R
### poolify.R: simulate pool-seq allele-depth (AD) data from individual diploid
### genotypes by grouping the most closely-related samples into pools.
### Usage: Rscript poolify.R <vcf_fname> <min_pool_size> <depth> <out_fname>
###   vcf_fname     - input VCF with diploid GT calls
###   min_pool_size - minimum number of samples a cluster needs to become a pool
###   depth         - binomial size used to simulate per-allele read depths
###   out_fname     - output VCF path (written gzipped, then gunzipped in place)
args = commandArgs(trailingOnly=TRUE)
# args = c("/group/pasture/Jeff/imputef/misc/soybean-indi_temp.vcf", "42", "100", "/group/pasture/Jeff/imputef/misc/soybean.vcf")
vcf_fname = args[1]
min_pool_size = as.numeric(args[2])
depth = as.numeric(args[3])
out_fname = args[4]
print("###############################################################################################################")
print("Pooling individual diploid genotypes, i.e. each pool is comprised of {min_pool_size} most closely-related samples using k-means clustering using 1000 evenly indexes loci")
print("Note: assumes each individual per pool are perfectly equally represented (best-case scenario in the real-world)")
vcf = vcfR::read.vcfR(vcf_fname)
vec_loci_names = paste(vcfR::getCHROM(vcf), vcfR::getPOS(vcf), vcfR::getREF(vcf), sep="_")
vec_sample_names = colnames(vcf@gt)[-1]
### Extract biallelic diploid reference-allele frequencies per sample.
### Unrecognised or missing GT codes stay NA; note that NA frequencies
### propagate into rowMeans() below (no na.rm) - TODO confirm the input VCF
### has no missing genotypes, or add na.rm=TRUE when computing pool means.
G = matrix(NA, nrow=length(vec_loci_names), ncol=length(vec_sample_names))
GT = vcfR::extract.gt(vcf, element="GT")
G[(GT == "0/0") | (GT == "0|0")] = 1.0
G[(GT == "1/1") | (GT == "1|1")] = 0.0
G[(GT == "0/1") | (GT == "0|1") | (GT == "1|0")] = 0.5
rm(GT)
gc()
rownames(G) = vec_loci_names
colnames(G) = vec_sample_names
### Create n pools of the most related individuals
### (Fixed: removed a stray, unfinished `for (i in )` fragment and a dead
### `n = length(vec_sample_names)` assignment left here by a previous edit,
### which made the script unparseable.)
print("Clustering. This may take a while.")
### Cluster samples on 1,000 evenly-spaced loci (fractional indices from seq()
### are truncated by `[`), then keep only clusters with >= min_pool_size members.
p = length(vec_loci_names)
C = t(G[seq(from=1, to=p, length=1000), ])
clustering = kmeans(x=C, centers=200)
vec_clusters = table(clustering$cluster)
vec_clusters = vec_clusters[vec_clusters >= min_pool_size]
### Fixed: after filtering, the retained kmeans cluster IDs are an arbitrary
### subset of 1..200, so looping over 1:n and comparing `clustering$cluster == i`
### pooled the wrong (possibly discarded) clusters. Map loop index -> actual ID.
vec_retained_cluster_ids = as.numeric(names(vec_clusters))
n = length(vec_clusters)
### Simulate the AD field: for each pool and locus, draw reference and
### alternative depths from independent Binomial(depth, q) and Binomial(depth, 1-q).
### NOTE(review): the two draws are independent, so ref+alt != depth in general
### - confirm this is the intended depth model.
GT = matrix("AD", nrow=p, ncol=n+1)
pb = txtProgressBar(min=0, max=n, initial=0, style=3)
for (i in 1:n) {
    # i = 1
    # ini = ((i-1)*min_pool_size) + 1
    # fin = ((i-1)*min_pool_size) + min_pool_size
    # q = rowMeans(G[, ini:fin])
    idx = which(clustering$cluster == vec_retained_cluster_ids[i])
    ### drop=FALSE keeps a matrix even if a cluster ever has a single member
    q = rowMeans(G[, idx, drop=FALSE])
    for (j in 1:p) {
        # j = 1
        ref = rbinom(n=1, size=depth, prob=q[j])
        alt = rbinom(n=1, size=depth, prob=(1-q[j]))
        GT[j, (i+1)] = paste0(ref, ",", alt) ### skipping the first column containing the FORMAT field "AD"
    }
    setTxtProgressBar(pb, i)
}
close(pb)
colnames(GT) = c("FORMAT", paste0("Pool-", c(1:n)))
### Create the output vcfR object: reuse the input object's meta/fix slots and
### swap in the simulated pool genotype matrix.
vcf_out = vcf
str(vcf_out)
str(vcf_out@gt)
vcf_out@gt = GT
fname_vcf_gz = paste0(out_fname, ".gz")
vcfR::write.vcf(vcf_out, file=fname_vcf_gz)
system(paste0("gunzip -f ", fname_vcf_gz))
print(paste0("Output: ", out_fname))
```

*ssv2vcf.R* - converts space-delimited genotype data from the LinkImpute paper

```R
Expand All @@ -65,29 +135,28 @@ fname_geno_txt = args[1]
fname_geno_dummy_vcf = args[2]
dat = read.table(fname_geno_txt, header=TRUE, sep=" ")
vcf = vcfR::read.vcfR(fname_geno_dummy_vcf)
str(dat)
idx_col_start = 7
n = nrow(dat)
p = ncol(dat) - (idx_col_start-1)
vec_loci_names = colnames(dat)[idx_col_start:ncol(dat)]
vec_pool_names = dat$IID
G = dat[, idx_col_start:ncol(dat)]
### Missing data assessment
idx_row_without_missing = apply(G, MARGIN=1, FUN=function(x){sum(is.na(x))==0})
idx_col_without_missing = apply(G, MARGIN=2, FUN=function(x){sum(is.na(x))==0})
sum(idx_row_without_missing)
sum(idx_col_without_missing)
sum(is.na(G)) / prod(dim(G))
### Fill missing with mean (because LinkImpute fails to finish running at MAF=0.25 for reasons I do not know)
ploidy = 2
pb = txtProgressBar(min=0, max=p, style=3)
for (j in 1:p) {
# j = 1
idx_missing = which(is.na(G[, j]))
G[idx_missing, j] = round(mean(G[, j], na.rm=TRUE) * ploidy)
setTxtProgressBar(pb, j)
}
close(pb)
# ### Missing data assessment
# idx_row_without_missing = apply(G, MARGIN=1, FUN=function(x){sum(is.na(x))==0})
# idx_col_without_missing = apply(G, MARGIN=2, FUN=function(x){sum(is.na(x))==0})
# sum(idx_row_without_missing)
# sum(idx_col_without_missing)
# sum(is.na(G)) / prod(dim(G))
# ### Fill missing with mean (because LinkImpute fails to finish running at MAF=0.25 for reasons I do not know)
# ploidy = 2
# pb = txtProgressBar(min=0, max=p, style=3)
# for (j in 1:p) {
# # j = 1
# idx_missing = which(is.na(G[, j]))
# G[idx_missing, j] = round(mean(G[, j], na.rm=TRUE) * ploidy)
# setTxtProgressBar(pb, j)
# }
# close(pb)
### Create meta, fit and gt fields of the vcfR object
mat_loci_ids = matrix(gsub("^X", "chr_", unlist(strsplit(unlist(strsplit(vec_loci_names, "[.]")), "_"))), ncol=3, byrow=TRUE)
### Randomly choose the alternative alleles as the genotype data (*.raw file) do not have that information.
Expand Down Expand Up @@ -138,69 +207,6 @@ vcf_new_loaded = vcfR::read.vcfR(paste0(fname_geno_txt, ".vcf"))
print(vcf_new_loaded)
```

*poolify.R* - assumes each individual per pool is perfectly equally represented (best-case scenario in the real world)

```R
### poolify.R (earlier copy): simulate pool-seq allele-depth (AD) data from
### individual diploid genotypes by grouping related samples into pools.
### Usage: Rscript poolify.R <vcf_fname> <min_pool_size> <depth> <out_fname>
args = commandArgs(trailingOnly=TRUE)
# args = c("soybean-indi_temp.vcf", "42", "100", "soybean.vcf")
vcf_fname = args[1]
min_pool_size = as.numeric(args[2])
depth = as.numeric(args[3])
out_fname = args[4]
print("###############################################################################################################")
print("Pooling individual diploid genotypes, i.e. each pool is comprised of {min_pool_size} most closely-related samples using k-means clustering using 1000 evenly indexes loci")
print("Note: assumes each individual per pool are perfectly equally represented (best-case scenario in the real-world)")
vcf = vcfR::read.vcfR(vcf_fname)
vec_loci_names = paste(vcfR::getCHROM(vcf), vcfR::getPOS(vcf), vcfR::getREF(vcf), sep="_")
vec_pool_names = colnames(vcf@gt)[-1]
### Extract biallelic diploid allele frequencies
### This copy initialises every cell to 0.5, so heterozygotes AND any
### missing/unrecognised GT codes all end up at frequency 0.5 (the explicit
### heterozygote assignment below is commented out because it is redundant).
G = matrix(0.5, nrow=length(vec_loci_names), ncol=length(vec_pool_names))
GT = vcfR::extract.gt(vcf, element="GT")
G[(GT == "0/0") | (GT == "0|0")] = 1.0
G[(GT == "1/1") | (GT == "1|1")] = 0.0
# G[(GT == "0/1") | (GT == "0|1") | (GT == "1|0")] = 0.5
### GT is large; free it eagerly before clustering
rm(GT)
gc()
rownames(G) = vec_loci_names
colnames(G) = vec_pool_names
### Create n pools of the most related individuals
print("Clustering. This may take a while.")
### Cluster samples on 1,000 evenly-spaced loci, then keep only clusters with
### at least min_pool_size members as pools.
p = length(vec_loci_names)
C = t(G[seq(from=1, to=p, length=1000), ])
clustering = kmeans(x=C, centers=200)
vec_clusters = table(clustering$cluster)
vec_clusters = vec_clusters[vec_clusters >= min_pool_size]
n = length(vec_clusters)
### Simulate the per-pool AD strings: ref and alt depths are drawn from
### independent Binomial(depth, q) / Binomial(depth, 1-q), so ref+alt need
### not equal depth.
GT = matrix("AD", nrow=p, ncol=n+1)
pb = txtProgressBar(min=0, max=n, initial=0, style=3)
for (i in 1:n) {
# i = 1
# ini = ((i-1)*min_pool_size) + 1
# fin = ((i-1)*min_pool_size) + min_pool_size
# q = rowMeans(G[, ini:fin])
### NOTE(review): after filtering vec_clusters, the retained kmeans cluster
### IDs (names(vec_clusters)) are an arbitrary subset of 1..200, but this loop
### compares against the plain index i - it can select discarded or empty
### clusters. Verify against the intended pooling; likely should use
### as.numeric(names(vec_clusters))[i] here.
idx = which(clustering$cluster == i)
q = rowMeans(G[, idx])
for (j in 1:p) {
# j = 1
ref = rbinom(n=1, size=depth, prob=q[j])
alt = rbinom(n=1, size=depth, prob=(1-q[j]))
GT[j, (i+1)] = paste0(ref, ",", alt) ### skipping the first column containing the FORMAT field "AD"
}
setTxtProgressBar(pb, i)
}
close(pb)
colnames(GT) = c("FORMAT", paste0("Pool-", c(1:n)))
### Create the output vcfR object
### Reuse the input object's meta/fix slots and swap in the simulated pools.
vcf_out = vcf
str(vcf_out)
str(vcf_out@gt)
vcf_out@gt = GT
fname_vcf_gz = paste0(out_fname, ".gz")
vcfR::write.vcf(vcf_out, file=fname_vcf_gz)
### vcfR::write.vcf always gzips; decompress to leave a plain VCF at out_fname
system(paste0("gunzip -f ", fname_vcf_gz))
print(paste0("Output: ", out_fname))
```


## Assess the genetic relationships between samples per dataset

Expand Down
10 changes: 8 additions & 2 deletions res/rustenv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ dependencies:
- _r-mutex=1.0.0=anacondar_1
- alsa-lib=1.2.6.1=h7f98852_0
- bat=0.24.0=he8a937b_0
- bcftools=1.5=3
- binutils_impl_linux-64=2.40=hf600244_0
- blas=1.0=openblas
- bwidget=1.9.11=1
- bzip2=1.0.8=h7b6447c_0
- c-ares=1.19.1=h5eee18b_0
- ca-certificates=2023.11.17=hbcca054_0
- ca-certificates=2023.12.12=h06a4308_0
- cairo=1.18.0=h3faef2a_0
- curl=8.4.0=hdbd6064_1
- expat=2.5.0=h6a678d5_0
Expand Down Expand Up @@ -49,8 +50,10 @@ dependencies:
- libev=4.33=h7f8727e_1
- libevent=2.1.10=h28343ad_4
- libffi=3.4.4=h6a678d5_0
- libgcc=7.2.0=h69d50b8_2
- libgcc-devel_linux-64=13.2.0=ha9c7c90_103
- libgcc-ng=13.2.0=h807b86a_2
- libgfortran=3.0.0=1
- libgfortran-ng=11.2.0=h00389a5_1
- libgfortran5=11.2.0=h1234567_1
- libgit2=1.6.4=h2d74bed_0
Expand All @@ -77,6 +80,7 @@ dependencies:
- make=4.2.1=h1bed415_1
- micro=2.0.8=ha8f183a_1
- ncurses=6.4=h6a678d5_0
- openblas=0.3.4=ha44fe06_0
- openjdk=8.0.382=hd590300_0
- openssl=3.2.0=hd590300_1
- pandoc=2.12=h06a4308_3
Expand Down Expand Up @@ -217,8 +221,10 @@ dependencies:
- readline=8.2=h5eee18b_0
- rust=1.73.0=h70c747d_1
- rust-std-x86_64-unknown-linux-gnu=1.73.0=hc1431ca_1
- samtools=1.6=hc3601fc_10
- sed=4.8=he412f7d_0
- sysroot_linux-64=2.12=he073ed8_16
- tabix=0.2.6=ha92aebf_0
- tk=8.6.13=noxft_h4845f30_101
- tktable=2.10=h14c3975_0
- tmux=3.3=h385fc29_0
Expand All @@ -241,4 +247,4 @@ dependencies:
- xz=5.4.5=h5eee18b_0
- zlib=1.2.13=hd590300_5
- zstd=1.5.5=hc292b87_0
prefix: /home/jp3h/.conda/envs/rustenv

Loading

0 comments on commit 481dd5a

Please sign in to comment.