Skip to content

Commit

Permalink
fixing vcf parsing when there are dots in the alternative allele field, which can encode a fixed locus - keeping the dot as the alternative allele and adding additional allele depths of zero to represent them
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffersonfparil committed Jan 11, 2024
1 parent 3e65793 commit 481dd5a
Show file tree
Hide file tree
Showing 16 changed files with 159 additions and 38,695 deletions.
77 changes: 0 additions & 77 deletions res/LINKIMPUTE_INPUT-maf0.25-missing_rate0.8-55379007.tsv

This file was deleted.

77 changes: 0 additions & 77 deletions res/LINKIMPUTE_INPUT-maf0.25-missing_rate0.9-103760824.tsv

This file was deleted.

5,624 changes: 0 additions & 5,624 deletions res/grape-Coefficient_of_determination.svg

This file was deleted.

5,443 changes: 0 additions & 5,443 deletions res/grape-Concordance.svg

This file was deleted.

5,547 changes: 0 additions & 5,547 deletions res/grape-Mean_absolute_error.svg

This file was deleted.

4,007 changes: 0 additions & 4,007 deletions res/lucerne-Coefficient_of_determination.svg

This file was deleted.

3,861 changes: 0 additions & 3,861 deletions res/lucerne-Concordance.svg

This file was deleted.

3,799 changes: 0 additions & 3,799 deletions res/lucerne-Mean_absolute_error.svg

This file was deleted.

66 changes: 44 additions & 22 deletions res/perf.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,11 @@ fn_imputation_accuracy = function(fname_imputed, list_sim_missing, ploidy=4, str
# lukes = FALSE
# mat_genotypes = NULL
# n_threads = 32

### Pick high-depth missing data points to recompute accuracies

### Accuracies per range of allele freqs

### Extract imputed allele frequencies corresponding to the expected allele frequencies
n_missing = length(list_sim_missing$vec_missing_loci)
# vec_imputed = c()
Expand Down Expand Up @@ -267,23 +272,27 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra
fname_out_linkimpute = paste0("LINKIMPUTE_INPUT-maf", maf, "-missing_rate", missing_rate, "-", rand_number_id,"-IMPUTED.csv")
vcf_for_linkimpute = vcfR::read.vcfR(list_sim_missing$fname_vcf)
mat_genotypes_for_linkimpute = t(fn_classify_allele_frequencies(fn_extract_allele_frequencies(vcf_for_linkimpute), ploidy=2)) * 2
mat_genotypes_for_linkimpute[is.na(mat_genotypes_for_linkimpute)] = -1
write.table(mat_genotypes_for_linkimpute, file=fname_for_linkimpute, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
time_ini = Sys.time()
system(paste0("java -jar linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute))
duration_linkimpute = difftime(Sys.time(), time_ini, units="mins")
mat_linkimputed = read.delim(output_for_linkimpute, header=FALSE)
rownames(mat_linkimputed) = rownames(mat_genotypes_for_linkimpute)
colnames(mat_linkimputed) = colnames(mat_genotypes_for_linkimpute)
mat_linkimputed = t(mat_linkimputed / 2)
list_loci_names = strsplit(rownames(mat_linkimputed), "_")
chr = unlist(lapply(list_loci_names, FUN=function(x){(paste(x[1:(length(x)-2)], collapse="_"))}))
pos = unlist(lapply(list_loci_names, FUN=function(x){as.numeric(x[(length(x)-1)])}))
allele = unlist(lapply(list_loci_names, FUN=function(x){x[length(x)]}))
df_linkimputed = data.frame(chr, pos, allele)
df_linkimputed = cbind(df_linkimputed, mat_linkimputed)
colnames(df_linkimputed)[1] = "#chr"
write.table(df_linkimputed, file=fname_out_linkimpute, sep=",", row.names=FALSE, col.names=TRUE, quote=FALSE)
bool_enough_data_to_simulate_10k_missing = prod(dim(mat_genotypes_for_linkimpute)) >= (11000)
if (bool_enough_data_to_simulate_10k_missing == TRUE) {
### LinkImpute stalls if it cannot mask 10,000 data points for optimising l and k, because the number of non-missing data points is not enough to reach the fixed 10,000 random data points.
mat_genotypes_for_linkimpute[is.na(mat_genotypes_for_linkimpute)] = -1
write.table(mat_genotypes_for_linkimpute, file=fname_for_linkimpute, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
time_ini = Sys.time()
system(paste0("java -jar linkimpute/LinkImpute.jar --verbose -a ", fname_for_linkimpute, " ", output_for_linkimpute))
duration_linkimpute = difftime(Sys.time(), time_ini, units="mins")
mat_linkimputed = read.delim(output_for_linkimpute, header=FALSE)
rownames(mat_linkimputed) = rownames(mat_genotypes_for_linkimpute)
colnames(mat_linkimputed) = colnames(mat_genotypes_for_linkimpute)
mat_linkimputed = t(mat_linkimputed / 2)
list_loci_names = strsplit(rownames(mat_linkimputed), "_")
chr = unlist(lapply(list_loci_names, FUN=function(x){(paste(x[1:(length(x)-2)], collapse="_"))}))
pos = unlist(lapply(list_loci_names, FUN=function(x){as.numeric(x[(length(x)-1)])}))
allele = unlist(lapply(list_loci_names, FUN=function(x){x[length(x)]}))
df_linkimputed = data.frame(chr, pos, allele)
df_linkimputed = cbind(df_linkimputed, mat_linkimputed)
colnames(df_linkimputed)[1] = "#chr"
write.table(df_linkimputed, file=fname_out_linkimpute, sep=",", row.names=FALSE, col.names=TRUE, quote=FALSE)
}
### Validating imputation
metrics_mvi = fn_imputation_accuracy(fname_imputed=fname_out_mvi,
list_sim_missing=list_sim_missing,
Expand All @@ -305,11 +314,24 @@ fn_test_imputation = function(vcf, mat_genotypes, ploidy=4, maf=0.25, missing_ra
ploidy=ploidy,
strict_boundaries=strict_boundaries,
n_threads=n_threads)
metrics_linkimpute = fn_imputation_accuracy(fname_imputed=fname_out_linkimpute,
list_sim_missing=list_sim_missing,
ploidy=2,
strict_boundaries=strict_boundaries,
n_threads=n_threads)
if (bool_enough_data_to_simulate_10k_missing == TRUE) {
metrics_linkimpute = fn_imputation_accuracy(fname_imputed=fname_out_linkimpute,
list_sim_missing=list_sim_missing,
ploidy=2,
strict_boundaries=strict_boundaries,
n_threads=n_threads)
} else {
metrics_linkimpute = list(
frac_imputed = 0.0,
mae_freqs = NA,
rmse_freqs = NA,
r2_freqs = NA,
mae_classes = NA,
rmse_classes = NA,
r2_classes = NA,
concordance_classes = NA
)
}
### Merge imputation accuracy metrics into the output data.frame
# string_metric_lists = c("metrics_mvi", "metrics_aldknni", "metrics_lukes")
string_metric_lists = c("metrics_mvi", "metrics_aldknni", "metrics_aldknni_optim_thresholds", "metrics_aldknni_optim_counts", "metrics_linkimpute")
Expand Down
168 changes: 87 additions & 81 deletions res/perf.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Assessing imputation accuracies

## Variables
## Variables and datasets

- 3 imputation algorithms:
+ **ALDKNNI**: adaptive LD-kNN imputation
Expand Down Expand Up @@ -34,6 +34,7 @@

```shell
#!/bin/bash
conda activate bcftools
DIR=/group/pasture/Jeff/imputef/misc
cd $DIR
### (2) pools of diploid *Glycine max*
Expand All @@ -55,6 +56,75 @@ time Rscript \
mv LinkImpute.data.grape.num.raw.txt.vcf grape.vcf
```

*poolify.R* - assumes each individual per pool is perfectly equally represented (best-case scenario in the real world)

```R
### poolify.R: simulate pool-seq allele-depth (AD) data from individual diploid
### genotypes by grouping the most closely-related samples into pools.
### Usage: Rscript poolify.R <vcf_fname> <min_pool_size> <depth> <out_fname>
###   vcf_fname     - input VCF with diploid GT calls
###   min_pool_size - minimum number of samples a cluster needs to become a pool
###   depth         - binomial size used to simulate per-allele read depths
###   out_fname     - output VCF path (written gzipped, then gunzipped in place)
args = commandArgs(trailingOnly=TRUE)
# args = c("/group/pasture/Jeff/imputef/misc/soybean-indi_temp.vcf", "42", "100", "/group/pasture/Jeff/imputef/misc/soybean.vcf")
vcf_fname = args[1]
min_pool_size = as.numeric(args[2])
depth = as.numeric(args[3])
out_fname = args[4]
print("###############################################################################################################")
print("Pooling individual diploid genotypes, i.e. each pool is comprised of {min_pool_size} most closely-related samples using k-means clustering using 1000 evenly indexes loci")
print("Note: assumes each individual per pool are perfectly equally represented (best-case scenario in the real-world)")
vcf = vcfR::read.vcfR(vcf_fname)
vec_loci_names = paste(vcfR::getCHROM(vcf), vcfR::getPOS(vcf), vcfR::getREF(vcf), sep="_")
vec_sample_names = colnames(vcf@gt)[-1]
### Extract biallelic diploid reference-allele frequencies per sample.
### Unrecognised or missing GT codes stay NA; note that NA frequencies
### propagate into rowMeans() below (no na.rm) - TODO confirm the input VCF
### has no missing genotypes, or add na.rm=TRUE when computing pool means.
G = matrix(NA, nrow=length(vec_loci_names), ncol=length(vec_sample_names))
GT = vcfR::extract.gt(vcf, element="GT")
G[(GT == "0/0") | (GT == "0|0")] = 1.0
G[(GT == "1/1") | (GT == "1|1")] = 0.0
G[(GT == "0/1") | (GT == "0|1") | (GT == "1|0")] = 0.5
rm(GT)
gc()
rownames(G) = vec_loci_names
colnames(G) = vec_sample_names
### Create n pools of the most related individuals
### (Fixed: removed a stray, unfinished `for (i in )` fragment and a dead
### `n = length(vec_sample_names)` assignment left here by a previous edit,
### which made the script unparseable.)
print("Clustering. This may take a while.")
### Cluster samples on 1,000 evenly-spaced loci (fractional indices from seq()
### are truncated by `[`), then keep only clusters with >= min_pool_size members.
p = length(vec_loci_names)
C = t(G[seq(from=1, to=p, length=1000), ])
clustering = kmeans(x=C, centers=200)
vec_clusters = table(clustering$cluster)
vec_clusters = vec_clusters[vec_clusters >= min_pool_size]
### Fixed: after filtering, the retained kmeans cluster IDs are an arbitrary
### subset of 1..200, so looping over 1:n and comparing `clustering$cluster == i`
### pooled the wrong (possibly discarded) clusters. Map loop index -> actual ID.
vec_retained_cluster_ids = as.numeric(names(vec_clusters))
n = length(vec_clusters)
### Simulate the AD field: for each pool and locus, draw reference and
### alternative depths from independent Binomial(depth, q) and Binomial(depth, 1-q).
### NOTE(review): the two draws are independent, so ref+alt != depth in general
### - confirm this is the intended depth model.
GT = matrix("AD", nrow=p, ncol=n+1)
pb = txtProgressBar(min=0, max=n, initial=0, style=3)
for (i in 1:n) {
    # i = 1
    # ini = ((i-1)*min_pool_size) + 1
    # fin = ((i-1)*min_pool_size) + min_pool_size
    # q = rowMeans(G[, ini:fin])
    idx = which(clustering$cluster == vec_retained_cluster_ids[i])
    ### drop=FALSE keeps a matrix even if a cluster ever has a single member
    q = rowMeans(G[, idx, drop=FALSE])
    for (j in 1:p) {
        # j = 1
        ref = rbinom(n=1, size=depth, prob=q[j])
        alt = rbinom(n=1, size=depth, prob=(1-q[j]))
        GT[j, (i+1)] = paste0(ref, ",", alt) ### skipping the first column containing the FORMAT field "AD"
    }
    setTxtProgressBar(pb, i)
}
close(pb)
colnames(GT) = c("FORMAT", paste0("Pool-", c(1:n)))
### Create the output vcfR object: reuse the input object's meta/fix slots and
### swap in the simulated pool genotype matrix.
vcf_out = vcf
str(vcf_out)
str(vcf_out@gt)
vcf_out@gt = GT
fname_vcf_gz = paste0(out_fname, ".gz")
vcfR::write.vcf(vcf_out, file=fname_vcf_gz)
system(paste0("gunzip -f ", fname_vcf_gz))
print(paste0("Output: ", out_fname))
```

*ssv2vcf.R* - converts space-delimited genotype data from the LinkImpute paper

```R
Expand All @@ -65,29 +135,28 @@ fname_geno_txt = args[1]
fname_geno_dummy_vcf = args[2]
dat = read.table(fname_geno_txt, header=TRUE, sep=" ")
vcf = vcfR::read.vcfR(fname_geno_dummy_vcf)
str(dat)
idx_col_start = 7
n = nrow(dat)
p = ncol(dat) - (idx_col_start-1)
vec_loci_names = colnames(dat)[idx_col_start:ncol(dat)]
vec_pool_names = dat$IID
G = dat[, idx_col_start:ncol(dat)]
### Missing data assessment
idx_row_without_missing = apply(G, MARGIN=1, FUN=function(x){sum(is.na(x))==0})
idx_col_without_missing = apply(G, MARGIN=2, FUN=function(x){sum(is.na(x))==0})
sum(idx_row_without_missing)
sum(idx_col_without_missing)
sum(is.na(G)) / prod(dim(G))
### Fill missing with mean (because LinkImpute fails to finish running at MAF=0.25 for reasons I do not know)
ploidy = 2
pb = txtProgressBar(min=0, max=p, style=3)
for (j in 1:p) {
# j = 1
idx_missing = which(is.na(G[, j]))
G[idx_missing, j] = round(mean(G[, j], na.rm=TRUE) * ploidy)
setTxtProgressBar(pb, j)
}
close(pb)
# ### Missing data assessment
# idx_row_without_missing = apply(G, MARGIN=1, FUN=function(x){sum(is.na(x))==0})
# idx_col_without_missing = apply(G, MARGIN=2, FUN=function(x){sum(is.na(x))==0})
# sum(idx_row_without_missing)
# sum(idx_col_without_missing)
# sum(is.na(G)) / prod(dim(G))
# ### Fill missing with mean (because LinkImpute fails to finish running at MAF=0.25 for reasons I do not know)
# ploidy = 2
# pb = txtProgressBar(min=0, max=p, style=3)
# for (j in 1:p) {
# # j = 1
# idx_missing = which(is.na(G[, j]))
# G[idx_missing, j] = round(mean(G[, j], na.rm=TRUE) * ploidy)
# setTxtProgressBar(pb, j)
# }
# close(pb)
### Create meta, fit and gt fields of the vcfR object
mat_loci_ids = matrix(gsub("^X", "chr_", unlist(strsplit(unlist(strsplit(vec_loci_names, "[.]")), "_"))), ncol=3, byrow=TRUE)
### Randomly choose the alternative alleles as the genotype data (*.raw file) do not have that information.
Expand Down Expand Up @@ -138,69 +207,6 @@ vcf_new_loaded = vcfR::read.vcfR(paste0(fname_geno_txt, ".vcf"))
print(vcf_new_loaded)
```

*poolify.R* - assumes each individual per pool is perfectly equally represented (best-case scenario in the real world)

```R
### poolify.R (earlier copy): simulate pool-seq allele-depth (AD) data from
### individual diploid genotypes by grouping related samples into pools.
### Usage: Rscript poolify.R <vcf_fname> <min_pool_size> <depth> <out_fname>
args = commandArgs(trailingOnly=TRUE)
# args = c("soybean-indi_temp.vcf", "42", "100", "soybean.vcf")
vcf_fname = args[1]
min_pool_size = as.numeric(args[2])
depth = as.numeric(args[3])
out_fname = args[4]
print("###############################################################################################################")
print("Pooling individual diploid genotypes, i.e. each pool is comprised of {min_pool_size} most closely-related samples using k-means clustering using 1000 evenly indexes loci")
print("Note: assumes each individual per pool are perfectly equally represented (best-case scenario in the real-world)")
vcf = vcfR::read.vcfR(vcf_fname)
vec_loci_names = paste(vcfR::getCHROM(vcf), vcfR::getPOS(vcf), vcfR::getREF(vcf), sep="_")
vec_pool_names = colnames(vcf@gt)[-1]
### Extract biallelic diploid allele frequencies
### This copy initialises every cell to 0.5, so heterozygotes AND any
### missing/unrecognised GT codes all end up at frequency 0.5 (the explicit
### heterozygote assignment below is commented out because it is redundant).
G = matrix(0.5, nrow=length(vec_loci_names), ncol=length(vec_pool_names))
GT = vcfR::extract.gt(vcf, element="GT")
G[(GT == "0/0") | (GT == "0|0")] = 1.0
G[(GT == "1/1") | (GT == "1|1")] = 0.0
# G[(GT == "0/1") | (GT == "0|1") | (GT == "1|0")] = 0.5
### GT is large; free it eagerly before clustering
rm(GT)
gc()
rownames(G) = vec_loci_names
colnames(G) = vec_pool_names
### Create n pools of the most related individuals
print("Clustering. This may take a while.")
### Cluster samples on 1,000 evenly-spaced loci, then keep only clusters with
### at least min_pool_size members as pools.
p = length(vec_loci_names)
C = t(G[seq(from=1, to=p, length=1000), ])
clustering = kmeans(x=C, centers=200)
vec_clusters = table(clustering$cluster)
vec_clusters = vec_clusters[vec_clusters >= min_pool_size]
n = length(vec_clusters)
### Simulate the per-pool AD strings: ref and alt depths are drawn from
### independent Binomial(depth, q) / Binomial(depth, 1-q), so ref+alt need
### not equal depth.
GT = matrix("AD", nrow=p, ncol=n+1)
pb = txtProgressBar(min=0, max=n, initial=0, style=3)
for (i in 1:n) {
# i = 1
# ini = ((i-1)*min_pool_size) + 1
# fin = ((i-1)*min_pool_size) + min_pool_size
# q = rowMeans(G[, ini:fin])
### NOTE(review): after filtering vec_clusters, the retained kmeans cluster
### IDs (names(vec_clusters)) are an arbitrary subset of 1..200, but this loop
### compares against the plain index i - it can select discarded or empty
### clusters. Verify against the intended pooling; likely should use
### as.numeric(names(vec_clusters))[i] here.
idx = which(clustering$cluster == i)
q = rowMeans(G[, idx])
for (j in 1:p) {
# j = 1
ref = rbinom(n=1, size=depth, prob=q[j])
alt = rbinom(n=1, size=depth, prob=(1-q[j]))
GT[j, (i+1)] = paste0(ref, ",", alt) ### skipping the first column containing the FORMAT field "AD"
}
setTxtProgressBar(pb, i)
}
close(pb)
colnames(GT) = c("FORMAT", paste0("Pool-", c(1:n)))
### Create the output vcfR object
### Reuse the input object's meta/fix slots and swap in the simulated pools.
vcf_out = vcf
str(vcf_out)
str(vcf_out@gt)
vcf_out@gt = GT
fname_vcf_gz = paste0(out_fname, ".gz")
vcfR::write.vcf(vcf_out, file=fname_vcf_gz)
### vcfR::write.vcf always gzips; decompress to leave a plain VCF at out_fname
system(paste0("gunzip -f ", fname_vcf_gz))
print(paste0("Output: ", out_fname))
```


## Assess the genetic relationships between samples per dataset

Expand Down
10 changes: 8 additions & 2 deletions res/rustenv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ dependencies:
- _r-mutex=1.0.0=anacondar_1
- alsa-lib=1.2.6.1=h7f98852_0
- bat=0.24.0=he8a937b_0
- bcftools=1.5=3
- binutils_impl_linux-64=2.40=hf600244_0
- blas=1.0=openblas
- bwidget=1.9.11=1
- bzip2=1.0.8=h7b6447c_0
- c-ares=1.19.1=h5eee18b_0
- ca-certificates=2023.11.17=hbcca054_0
- ca-certificates=2023.12.12=h06a4308_0
- cairo=1.18.0=h3faef2a_0
- curl=8.4.0=hdbd6064_1
- expat=2.5.0=h6a678d5_0
Expand Down Expand Up @@ -49,8 +50,10 @@ dependencies:
- libev=4.33=h7f8727e_1
- libevent=2.1.10=h28343ad_4
- libffi=3.4.4=h6a678d5_0
- libgcc=7.2.0=h69d50b8_2
- libgcc-devel_linux-64=13.2.0=ha9c7c90_103
- libgcc-ng=13.2.0=h807b86a_2
- libgfortran=3.0.0=1
- libgfortran-ng=11.2.0=h00389a5_1
- libgfortran5=11.2.0=h1234567_1
- libgit2=1.6.4=h2d74bed_0
Expand All @@ -77,6 +80,7 @@ dependencies:
- make=4.2.1=h1bed415_1
- micro=2.0.8=ha8f183a_1
- ncurses=6.4=h6a678d5_0
- openblas=0.3.4=ha44fe06_0
- openjdk=8.0.382=hd590300_0
- openssl=3.2.0=hd590300_1
- pandoc=2.12=h06a4308_3
Expand Down Expand Up @@ -217,8 +221,10 @@ dependencies:
- readline=8.2=h5eee18b_0
- rust=1.73.0=h70c747d_1
- rust-std-x86_64-unknown-linux-gnu=1.73.0=hc1431ca_1
- samtools=1.6=hc3601fc_10
- sed=4.8=he412f7d_0
- sysroot_linux-64=2.12=he073ed8_16
- tabix=0.2.6=ha92aebf_0
- tk=8.6.13=noxft_h4845f30_101
- tktable=2.10=h14c3975_0
- tmux=3.3=h385fc29_0
Expand All @@ -241,4 +247,4 @@ dependencies:
- xz=5.4.5=h5eee18b_0
- zlib=1.2.13=hd590300_5
- zstd=1.5.5=hc292b87_0
prefix: /home/jp3h/.conda/envs/rustenv

Loading

0 comments on commit 481dd5a

Please sign in to comment.