Skip to content

Commit

Permalink
updated soybean pool data with hopefully better clustering + resolved…
Browse files Browse the repository at this point in the history
… issue #4
  • Loading branch information
jeffersonfparil committed Jan 11, 2024
1 parent 67a3c4c commit 211505a
Show file tree
Hide file tree
Showing 3 changed files with 326 additions and 24 deletions.
279 changes: 279 additions & 0 deletions res/bcftools.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
name: bcftools
channels:
- bioconda
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- _r-mutex=1.0.1=anacondar_1
- alsa-lib=1.2.8=h166bdaf_0
- arpack=3.7.0=hdefa2d7_2
- bat=0.23.0=h0f5d50b_0
- bcftools=1.16=hfe4b78e_1
- binutils_impl_linux-64=2.39=he00db2b_1
- biopython=1.80=py311hd4cff14_0
- bwa=0.7.17=h7132678_9
- bwidget=1.9.14=ha770c72_1
- bzip2=1.0.8=h7f98852_4
- c-ares=1.18.1=h7f98852_0
- ca-certificates=2023.11.17=hbcca054_0
- cairo=1.16.0=ha61ee94_1014
- coreutils=8.25=1
- curl=7.87.0=h6312ad2_0
- dbus=1.13.6=h5008d03_3
- expat=2.5.0=h27087fc_0
- fastqc-rs=0.3.2=he9f29cb_1
- font-ttf-dejavu-sans-mono=2.37=hab24e00_0
- font-ttf-inconsolata=3.000=h77eed37_0
- font-ttf-source-code-pro=2.038=h77eed37_0
- font-ttf-ubuntu=0.83=hab24e00_0
- fontconfig=2.14.1=hc2a2eb6_0
- fonts-conda-ecosystem=1=0
- fonts-conda-forge=1=0
- freetype=2.12.1=hca18f0e_1
- fribidi=1.0.10=h36c2ea0_0
- gatk4=4.3.0.0=py36hdfd78af_0
- gcc_impl_linux-64=12.2.0=hcc96c02_19
- gettext=0.21.1=h27087fc_0
- gfortran_impl_linux-64=12.2.0=h55be85b_19
- giflib=5.2.1=h36c2ea0_2
- git=2.39.1=pl5321ha3eba64_0
- glib=2.74.1=h6239696_1
- glib-tools=2.74.1=h6239696_1
- gmp=6.2.1=h58526e2_0
- graphite2=1.3.13=h58526e2_1001
- gsl=2.7=he838d99_0
- gxx_impl_linux-64=12.2.0=hcc96c02_19
- harfbuzz=6.0.0=h8e241bc_0
- htop=3.2.2=h8228510_0
- htslib=1.16=h6bc39ce_0
- icu=70.1=h27087fc_0
- java-jdk=7.0.91=1
- jpeg=9e=h166bdaf_2
- julia=1.8.5=h783901f_0
- kernel-headers_linux-64=2.6.32=he073ed8_15
- keyutils=1.6.1=h166bdaf_0
- krb5=1.20.1=hf9c8cef_0
- lcms2=2.14=h6ed2654_0
- ld_impl_linux-64=2.39=hcc3a1bd_1
- lerc=4.0.0=h27087fc_0
- libblas=3.9.0=16_linux64_openblas
- libcblas=3.9.0=16_linux64_openblas
- libcups=2.3.3=h36d4200_3
- libcurl=7.87.0=h6312ad2_0
- libdeflate=1.13=h166bdaf_0
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=h516909a_1
- libffi=3.4.2=h7f98852_5
- libgcc=7.2.0=h69d50b8_2
- libgcc-devel_linux-64=12.2.0=h3b97bd3_19
- libgcc-ng=12.2.0=h65d4601_19
- libgfortran-ng=12.2.0=h69a702a_19
- libgfortran5=12.2.0=h337968e_19
- libgit2=1.5.1=ha98c156_0
- libglib=2.74.1=h606061b_1
- libgomp=12.2.0=h65d4601_19
- libhwloc=2.8.0=h32351e8_1
- libiconv=1.17=h166bdaf_0
- liblapack=3.9.0=16_linux64_openblas
- libnghttp2=1.51.0=hdcd2b5c_0
- libnl=3.8.0=hd590300_0
- libnsl=2.0.0=h7f98852_0
- libopenblas=0.3.21=pthreads_h78a6416_3
- libopenblas-ilp64=0.3.21=pthreads_h44c56fb_3
- libpng=1.6.39=h753d276_0
- libsanitizer=12.2.0=h46fd767_19
- libsqlite=3.40.0=h753d276_0
- libssh2=1.10.0=haa6b8db_3
- libstdcxx-devel_linux-64=12.2.0=h3b97bd3_19
- libstdcxx-ng=12.2.0=h46fd767_19
- libtiff=4.4.0=h0e0dad5_3
- libunwind=1.6.2=h9c3ff4c_0
- libutf8proc=2.8.0=h166bdaf_0
- libuuid=2.32.1=h7f98852_1000
- libwebp-base=1.2.4=h166bdaf_0
- libxcb=1.13=h7f98852_1004
- libxml2=2.10.3=h7463322_0
- libzlib=1.2.13=h166bdaf_4
- make=4.3=hd18ef5c_1
- mbedtls=3.3.0=hcb278e6_0
- metis=5.1.0=h58526e2_1006
- mpfr=4.1.0=h9202a9a_1
- ncurses=6.3=h27087fc_1
- nextflow=22.10.4=h4a94de4_0
- numpy=1.24.1=py311hbde0eaa_0
- openblas-ilp64=0.3.21=pthreads_h3d04fff_3
- openjdk=11.0.13=h87a67e3_0
- openlibm=0.8.1=h7f98852_0
- openssl=1.1.1w=hd590300_0
- p7zip=16.02=h9c3ff4c_1001
- pandoc=3.1.3=h32600fe_0
- pango=1.50.12=hd33c08f_1
- parallel=20221122=ha770c72_0
- pcre2=10.40=hc3806b6_0
- perl=5.32.1=2_h7f98852_perl5
- picard=2.27.5=hdfd78af_0
- pip=22.3.1=pyhd8ed1ab_0
- pixman=0.40.0=h36c2ea0_0
- pthread-stubs=0.4=h36c2ea0_1001
- python=3.11.0=h10a6764_1_cpython
- python_abi=3.11=3_cp311
- r-ape=5.7_1=r42h08d816e_1
- r-argparse=2.2.2=r42hc72bb7e_1
- r-askpass=1.2.0=r42h57805ef_0
- r-assertthat=0.2.1=r42hc72bb7e_4
- r-backports=1.4.1=r42h57805ef_2
- r-base=4.2.2=h6b4767f_2
- r-base64enc=0.1_3=r42h57805ef_1006
- r-brew=1.0_8=r42hc72bb7e_2
- r-brio=1.1.3=r42h57805ef_2
- r-bslib=0.5.1=r42hc72bb7e_0
- r-cachem=1.0.8=r42h57805ef_1
- r-callr=3.7.3=r42hc72bb7e_1
- r-cli=3.6.1=r42ha503ecb_1
- r-clipr=0.8.0=r42hc72bb7e_2
- r-cluster=2.1.4=r42h8da6f51_0
- r-codetools=0.2_19=r42hc72bb7e_1
- r-commonmark=1.9.0=r42h57805ef_1
- r-cpp11=0.4.6=r42hc72bb7e_0
- r-crayon=1.5.2=r42hc72bb7e_2
- r-credentials=2.0.1=r42hc72bb7e_0
- r-curl=4.3.3=r42h06615bd_1
- r-desc=1.4.2=r42hc72bb7e_2
- r-devtools=2.4.5=r42hc72bb7e_2
- r-diffobj=0.3.5=r42h57805ef_2
- r-digest=0.6.33=r42ha503ecb_0
- r-doparallel=1.0.17=r42hc72bb7e_2
- r-downlit=0.4.3=r42hc72bb7e_0
- r-dplyr=1.1.3=r42ha503ecb_0
- r-ellipsis=0.3.2=r42h57805ef_2
- r-evaluate=0.22=r42hc72bb7e_0
- r-fansi=1.0.5=r42h57805ef_0
- r-fastmap=1.1.1=r42ha503ecb_1
- r-findpython=1.0.8=r42hc72bb7e_1
- r-fontawesome=0.5.2=r42hc72bb7e_0
- r-foreach=1.5.2=r42hc72bb7e_2
- r-fs=1.6.3=r42ha503ecb_0
- r-generics=0.1.3=r42hc72bb7e_2
- r-gert=1.9.2=r42hf3f2ec2_0
- r-gh=1.4.0=r42hc72bb7e_1
- r-gitcreds=0.1.2=r42hc72bb7e_2
- r-glue=1.6.2=r42h57805ef_2
- r-highr=0.10=r42hc72bb7e_1
- r-htmltools=0.5.7=r42ha503ecb_0
- r-htmlwidgets=1.6.2=r42hc72bb7e_1
- r-httpuv=1.6.12=r42ha503ecb_0
- r-httr=1.4.7=r42hc72bb7e_0
- r-httr2=0.2.3=r42hc72bb7e_1
- r-ini=0.3.1=r42hc72bb7e_1005
- r-iterators=1.0.14=r42hc72bb7e_2
- r-jquerylib=0.1.4=r42hc72bb7e_2
- r-jsonlite=1.8.7=r42h57805ef_0
- r-knitr=1.45=r42hc72bb7e_0
- r-later=1.3.1=r42ha503ecb_1
- r-lattice=0.21_9=r42h57805ef_0
- r-lifecycle=1.0.3=r42hc72bb7e_2
- r-magrittr=2.0.3=r42h57805ef_2
- r-mass=7.3_60=r42h57805ef_1
- r-matrix=1.6_1.1=r42h316c678_0
- r-memoise=2.0.1=r42hc72bb7e_2
- r-memuse=4.2_3=r42h57805ef_1
- r-mgcv=1.9_0=r42h316c678_0
- r-mime=0.12=r42h57805ef_2
- r-miniui=0.1.1.1=r42hc72bb7e_1004
- r-nlme=3.1_162=r42hac0b197_0
- r-openssl=2.0.5=r42hb1dc35e_0
- r-permute=0.9_7=r42hc72bb7e_2
- r-pillar=1.9.0=r42hc72bb7e_1
- r-pinfsc50=1.2.0=r42ha770c72_2
- r-pkgbuild=1.4.2=r42hc72bb7e_0
- r-pkgconfig=2.0.3=r42hc72bb7e_3
- r-pkgdown=2.0.7=r42hc72bb7e_1
- r-pkgload=1.3.3=r42hc72bb7e_0
- r-praise=1.0.0=r42hc72bb7e_1007
- r-prettyunits=1.2.0=r42hc72bb7e_0
- r-processx=3.8.2=r42h57805ef_0
- r-profvis=0.3.8=r42h57805ef_3
- r-promises=1.2.1=r42ha503ecb_0
- r-ps=1.7.5=r42h57805ef_1
- r-purrr=1.0.2=r42h57805ef_0
- r-r6=2.5.1=r42hc72bb7e_2
- r-ragg=1.2.4=r42hc1f6985_0
- r-rappdirs=0.3.3=r42h57805ef_2
- r-rcmdcheck=1.4.0=r42h785f33e_2
- r-rcpp=1.0.11=r42h7df8631_0
- r-rematch2=2.1.2=r42hc72bb7e_3
- r-remotes=2.4.2.1=r42hc72bb7e_0
- r-rlang=1.1.1=r42ha503ecb_1
- r-rmarkdown=2.25=r42hc72bb7e_0
- r-roxygen2=7.2.3=r42ha503ecb_1
- r-rprojroot=2.0.3=r42hc72bb7e_1
- r-rstudioapi=0.15.0=r42hc72bb7e_0
- r-rversions=2.1.2=r42hc72bb7e_2
- r-sass=0.4.7=r42ha503ecb_0
- r-sessioninfo=1.2.2=r42hc72bb7e_2
- r-shiny=1.7.5.1=r42h785f33e_0
- r-sourcetools=0.1.7_1=r42ha503ecb_1
- r-stringi=1.7.12=r42h1ae9187_0
- r-stringr=1.5.0=r42h785f33e_1
- r-sys=3.4.2=r42h57805ef_1
- r-systemfonts=1.0.5=r42haf97adc_0
- r-testthat=3.2.0=r42ha503ecb_0
- r-textshaping=0.3.6=r42hbb20487_4
- r-tibble=3.2.1=r42h57805ef_2
- r-tidyselect=1.2.0=r42hc72bb7e_1
- r-tinytex=0.48=r42hc72bb7e_1
- r-txtplot=1.0_4=r42h142f84f_0
- r-urlchecker=1.0.1=r42hc72bb7e_2
- r-usethis=2.2.2=r42hc72bb7e_0
- r-utf8=1.2.3=r42h57805ef_1
- r-vcfr=1.14.0=r42h33b523d_1
- r-vctrs=0.6.3=r42ha503ecb_0
- r-vegan=2.6_4=r42hb20cf53_0
- r-viridislite=0.4.2=r42hc72bb7e_1
- r-waldo=0.5.1=r42hc72bb7e_1
- r-whisker=0.4.1=r42hc72bb7e_1
- r-withr=2.5.1=r42hc72bb7e_0
- r-xfun=0.41=r42ha503ecb_0
- r-xml2=1.3.3=r42h044e5c7_2
- r-xopen=1.0.0=r42hc72bb7e_1005
- r-xtable=1.8_4=r42hc72bb7e_5
- r-yaml=2.3.7=r42h57805ef_1
- r-zip=2.3.0=r42h57805ef_1
- readline=8.1.2=h0f457ee_0
- rust=1.74.0=h70c747d_0
- rust-std-x86_64-unknown-linux-gnu=1.74.0=h2c6d0dc_0
- samtools=1.16.1=h6899075_1
- sed=4.8=he412f7d_0
- setuptools=66.1.0=pyhd8ed1ab_0
- suitesparse=5.10.1=h9e50725_1
- sysroot_linux-64=2.12=he073ed8_15
- tbb=2021.7.0=h924138e_1
- tk=8.6.12=h27826a3_0
- tktable=2.10=hb7b940f_3
- trimmomatic=0.39=hdfd78af_2
- tzdata=2022g=h191b570_0
- wheel=0.38.4=pyhd8ed1ab_0
- xorg-fixesproto=5.0=h7f98852_1002
- xorg-inputproto=2.3.2=h7f98852_1002
- xorg-kbproto=1.0.7=h7f98852_1002
- xorg-libice=1.0.10=h7f98852_0
- xorg-libsm=1.2.3=hd9c2040_1000
- xorg-libx11=1.7.2=h7f98852_0
- xorg-libxau=1.0.9=h7f98852_0
- xorg-libxdmcp=1.1.3=h7f98852_0
- xorg-libxext=1.3.4=h7f98852_1
- xorg-libxfixes=5.0.3=h7f98852_1004
- xorg-libxi=1.7.10=h7f98852_0
- xorg-libxrender=0.9.10=h7f98852_1003
- xorg-libxt=1.2.1=h7f98852_2
- xorg-libxtst=1.2.3=h7f98852_1002
- xorg-recordproto=1.14.2=h7f98852_1002
- xorg-renderproto=0.11.1=h7f98852_1002
- xorg-xextproto=7.3.0=h7f98852_1002
- xorg-xproto=7.0.31=h7f98852_1007
- xz=5.2.6=h166bdaf_0
- zlib=1.2.13=h166bdaf_4
- zstd=1.5.2=h3eb15da_6

44 changes: 37 additions & 7 deletions res/perf.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,19 +122,49 @@ fn_simulate_missing_data = function(vcf, mat_genotypes, maf=0.25, missing_rate=0
))
}
### Imputation accuracy metrics
fn_metrics = function(y_predicted, y_expected) {
deviation = y_predicted - y_expected
fn_metrics = function(q_predicted, q_expected) {
## Overall metrics
deviation = q_predicted - q_expected
mae = mean(abs(deviation), na.rm=TRUE)
mse = mean(deviation^2, na.rm=TRUE)
rmse = sqrt(mse)
r2 = 1.00 - (mean(mse, na.rm=TRUE) / mean((y_expected-mean(y_expected))^2, na.rm=TRUE))
concordance = mean(y_predicted == y_expected, na.rm=TRUE)
r2 = 1.00 - (mean(mse, na.rm=TRUE) / mean((q_expected-mean(q_expected))^2, na.rm=TRUE))
concordance = mean(q_predicted == q_expected, na.rm=TRUE)
### Metrics across the range of expected allele frequencies
vec_q_max = c(0.01, 0.05, c(1:9)/10, 0.95, 0.99, 1.00)
vec_n = rep(0, each=length(vec_q_max))
vec_mae = rep(NA, each=length(vec_q_max))
vec_mse = rep(NA, each=length(vec_q_max))
vec_rmse = rep(NA, each=length(vec_q_max))
vec_r2 = rep(NA, each=length(vec_q_max))
vec_concordance = rep(NA, each=length(vec_q_max))
for (i in 1:length(vec_q_max)) {
# i = 1
if (i==1) {
q_min = 0.0
} else {
q_min = vec_q_max[i-1]
}
q_max = vec_q_max[i]
idx = which((q_expected > q_min) & (q_expected <= q_max))
if (length(idx) > 0) {
vec_n[i] = length(idx)
vec_deviations = q_predicted[idx] - q_expected[idx]
vec_mae[i] = mean(abs(vec_deviations), na.rm=TRUE)
vec_mse[i] = mean(vec_deviations^2, na.rm=TRUE)
vec_rmse[i] = sqrt(vec_mse[i])
vec_r2[i] = 1.00 - (mean(vec_mse[i], na.rm=TRUE) / mean((q_expected[idx]-mean(q_expected[idx]))^2, na.rm=TRUE))
vec_concordance[i] = mean(q_predicted[idx] == q_expected[idx], na.rm=TRUE)
}
}
df_metrics_across_allele_freqs = data.frame(q=vec_q_max, n=vec_n, mae=vec_mae, mse=vec_mse, rmse=vec_rmse, r2=vec_r2, concordance=vec_concordance)
return(list(
mae=mae,
mse=mse,
rmse=rmse,
r2=r2,
concordance=concordance
concordance=concordance,
df_metrics_across_allele_freqs=df_metrics_across_allele_freqs
))
}
### Assess imputation accuracies
Expand Down Expand Up @@ -196,11 +226,11 @@ fn_imputation_accuracy = function(fname_imputed, list_sim_missing, ploidy=4, str
# print(paste0("Total number of simulated missing data = ", n_missing))
# print(paste0("Total number of imputed missing data = ", n_imputed))
### Metrics using allele frequencies
metrics_allele_frequencies = fn_metrics(y_predicted=vec_imputed, y_expected=list_sim_missing$expected_allele_frequencies)
metrics_allele_frequencies = fn_metrics(q_predicted=vec_imputed, q_expected=list_sim_missing$expected_allele_frequencies)
### Metrics using genotype classes
vec_expected_classes = fn_classify_allele_frequencies(mat_genotypes=list_sim_missing$expected_allele_frequencies, ploidy=ploidy, strict_boundaries=strict_boundaries)
vec_imputed_classes = fn_classify_allele_frequencies(mat_genotypes=vec_imputed, ploidy=ploidy, strict_boundaries=strict_boundaries)
metrics_genotype_classes = fn_metrics(y_predicted=vec_imputed_classes, y_expected=vec_expected_classes)
metrics_genotype_classes = fn_metrics(q_predicted=vec_imputed_classes, q_expected=vec_expected_classes)
return(list(
frac_imputed = n_imputed / n_missing,
mae_freqs = metrics_allele_frequencies$mae,
Expand Down
Loading

0 comments on commit 211505a

Please sign in to comment.