From 34638202b7db8328c6cd021bbb430768f58f089d Mon Sep 17 00:00:00 2001 From: jeffersonfparil Date: Fri, 16 Feb 2024 17:02:43 +1100 Subject: [PATCH 01/12] trying to make a binary package but not performing as well as before --- .github/workflows/r.yml | 22 - DESCRIPTION | 12 - NAMESPACE | 5 - R/extendr-wrappers.R | 17 - R/imputef.R | 224 - README.md | 62 +- man/aldknni.Rd | 97 - man/mvi.Rd | 75 - res/perf.R | 6 +- res/perf_functions.R | 35 +- src/.gitignore | 5 - src/Makevars | 30 - src/Makevars.ucrt | 5 - src/Makevars.win | 40 - src/entrypoint.c | 8 - src/rust/Cargo.toml | 23 - src/rust/src/aldknni.rs | 977 - src/rust/src/filter_missing.rs | 364 - src/rust/src/geno.rs | 387 - src/rust/src/helpers.rs | 257 - src/rust/src/lib.rs | 360 - src/rust/src/mvi.rs | 407 - src/rust/src/phen.rs | 256 - src/rust/src/structs_and_traits.rs | 222 - src/rust/src/sync.rs | 1580 - src/rust/src/vcf.rs | 696 - src/rust/tests/test.csv | 12669 ------ src/rust/tests/test.pileup | 64370 --------------------------- src/rust/tests/test.sync | 6675 --- src/rust/tests/test.txt | 6918 --- src/rust/tests/test.vcf | 406 - src/rust/tests/test_pheno.csv | 6 - tests/test.csv | 13424 +++++- tests/test.sync | 7052 ++- tests/test.vcf | 4 +- tests/tests.R | 48 - 36 files changed, 19363 insertions(+), 98381 deletions(-) delete mode 100644 .github/workflows/r.yml delete mode 100644 DESCRIPTION delete mode 100644 NAMESPACE delete mode 100644 R/extendr-wrappers.R delete mode 100644 R/imputef.R delete mode 100644 man/aldknni.Rd delete mode 100644 man/mvi.Rd delete mode 100644 src/.gitignore delete mode 100644 src/Makevars delete mode 100644 src/Makevars.ucrt delete mode 100644 src/Makevars.win delete mode 100644 src/entrypoint.c delete mode 100644 src/rust/Cargo.toml delete mode 100644 src/rust/src/aldknni.rs delete mode 100644 src/rust/src/filter_missing.rs delete mode 100644 src/rust/src/geno.rs delete mode 100644 src/rust/src/helpers.rs delete mode 100644 src/rust/src/lib.rs delete mode 100644 src/rust/src/mvi.rs delete mode 100644 src/rust/src/phen.rs delete mode 100644 src/rust/src/structs_and_traits.rs delete mode 100644 src/rust/src/sync.rs delete mode 100644 src/rust/src/vcf.rs delete mode 100644 src/rust/tests/test.csv delete mode 100644 src/rust/tests/test.pileup delete mode 100644 src/rust/tests/test.sync delete mode 100644 src/rust/tests/test.txt delete mode 100644 src/rust/tests/test.vcf delete mode 100644 src/rust/tests/test_pheno.csv delete mode 100644 tests/tests.R diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml deleted file mode 100644 index 190de9e..0000000 --- a/.github/workflows/r.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: 🚀 -on: - push: - branches: [ "main", "dev" ] - pull_request: - branches: [ "main" ] -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: r-lib/actions/setup-r@v2 - - name: Install dependencies on Ubuntu - run: | - sudo apt install -y libcurl4-openssl-dev libharfbuzz-dev libfribidi-dev - - name: Install dependencies - run: | - install.packages(c("devtools", "rextendr", "testthat")) - shell: Rscript {0} - - name: Tests - run: | - Rscript tests/tests.R diff --git a/DESCRIPTION b/DESCRIPTION deleted file mode 100644 index b4b1b59..0000000 --- a/DESCRIPTION +++ /dev/null @@ -1,12 +0,0 @@ -Package: imputef -Title: Imputing allele frequencies for individual polyploid genotype data and pools of individuals or population genotype data -Version: 0.0.1.0 -Authors@R: - person("Jeff", "Paril", email="jeffersonparil@gmail.com", role=c("aut", "cre"), - comment = c(ORCID = "0000-0002-5693-4123")) -Description: Imputation of genotype data from sequencing of more than 2 sets of genomes, i.e. polyploid individuals, population samples, or pools of individuals. This library can also perform simple genotype data filtering prior to imputation. Two imputation methods are available: (1) mean value imputation which uses the arithmentic mean of the locus across non-missing pools (`?imputef::mvi`); (2) adaptive linkage-informed k-nearest neighbour imputation (`?imputef::aldknni`). This is an attempt to extend the [LD-kNNi method of Money et al, 2015, i.e. LinkImpute](https://doi.org/10.1534/g3.115.021667), which was an extension of the [kNN imputation of Troyanskaya et al, 2001](https://doi.org/10.1093/bioinformatics/17.6.520). Similar to LD-kNNi, LD is estimated using Pearson's product moment correlation across loci per pair of samples. Mean absolute difference in allele frequencies is used to define genetic distance between samples, instead of taxicab or Manhattan distance in LD-kNNi. Four parameters can be set by the user, (1) minimum loci correlation threshold: dictates the minimum LD between the locus requiring imputation and other loci which will be used to estimate genetic distance between samples; (2) maximum genetic distance threshold: sets the maximum genetic distance between the sample requiring imputation and the samples (i.e. nearest neighbours) to be used in weighted mean imputation of missing allele frequencies; (3) minimum number of loci linked to the locus requiring imputation: overrides minimum loci correlation threshold if this minimum is not met; and (4) minimum k-nearest neighbours: overrides maximum genetic distance threshold if this minimum is not met. The first two parameters (minimum loci correlation and maximum genetic distance thresholds) can be optimised per locus requiring imputation using non-missing samples as replicates simulating missing data to minimum the mean absolute error in imputation. -License: `use_gpl3_license()` -Encoding: UTF-8 -Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 -Config/rextendr/version: 0.3.1 diff --git a/NAMESPACE b/NAMESPACE deleted file mode 100644 index c0be8fa..0000000 --- a/NAMESPACE +++ /dev/null @@ -1,5 +0,0 @@ -# Generated by roxygen2: do not edit by hand - -export(aldknni) -export(mvi) -useDynLib(imputef, .registration = TRUE) diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R deleted file mode 100644 index a358a12..0000000 --- a/R/extendr-wrappers.R +++ /dev/null @@ -1,17 +0,0 @@ -# Generated by extendr: Do not edit by hand - -# nolint start - -# -# This file was created with the following call: -# .Call("wrap__make_imputef_wrappers", use_symbols = TRUE, package_name = "imputef") - -#' @docType package -#' @usage NULL -#' @useDynLib imputef, .registration = TRUE -NULL - -impute <- function(fname, imputation_method, min_coverage, min_allele_frequency, max_missingness_rate_per_locus, pool_sizes, min_depth_below_which_are_missing, max_depth_above_which_are_missing, frac_top_missing_pools, frac_top_missing_loci, n_reps, min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, restrict_linked_loci_per_chromosome, n_threads, fname_out_prefix) .Call(wrap__impute, fname, imputation_method, min_coverage, min_allele_frequency, max_missingness_rate_per_locus, pool_sizes, min_depth_below_which_are_missing, max_depth_above_which_are_missing, frac_top_missing_pools, frac_top_missing_loci, n_reps, min_loci_corr, max_pool_dist, min_l_loci, min_k_neighbours, restrict_linked_loci_per_chromosome, n_threads, fname_out_prefix) - - -# nolint end diff --git a/R/imputef.R b/R/imputef.R deleted file mode 100644 index b35a777..0000000 --- a/R/imputef.R +++ /dev/null @@ -1,224 +0,0 @@ -#' @title -#' mvi -#' @description -#' Mean value imputation of allele frequencies -#' @usage -#' mvi(fname, -#' min_coverage=0, -#' min_allele_frequency=0.0001, -#' max_missingness_rate_per_locus=1.00, -#' pool_sizes=c(100), -#' min_depth_below_which_are_missing=1, -#' max_depth_above_which_are_missing=1000000, -#' frac_top_missing_pools=0.0, -#' frac_top_missing_loci=0.0, -#' n_threads=2, -#' fname_out_prefix="") -#' @param fname -#' name of the genotype file to be imputed in uncompressed vcf, sync or allele frequency table format. See genotype format details below. -#' @param min_coverage -#' minimum coverage per locus, i.e. if at a locus, a pool falls below this value (does not skip missing data, i.e. missing locus has a depth of zero), then the whole locus is omitted. Set this to zero if the vcf has been filtered and contains missing values, i.e. `./.` or `.|.`. [Default=0] -#' @param min_allele_frequency -#' minimum allele frequency per locus, i.e. if at a locus, a pool has all its alleles below this value and/or above the additive complement of this value (skipping missing data), then the entire locus is omitted. [Default=0.0001] -#' @param max_missingness_rate_per_locus -#' maximum fraction of pools missing per locus, i.e. if at a locus, there were more pools missing than the coverage dictated by this threshold, then the locus is omitted. [Default=1.00] -#' @param pool_sizes -#' vector of pool sizes, i.e. the number of individuals included in each pool, or can be set to an arbitrarily large value like 100 for individual polyploids or if allele frequency estimates are expected to be accurate. [Default=100] -#' @param min_depth_below_which_are_missing -#' minimum depth at which loci with depth below this threshold are set to missing. Set to one if the input vcf has already been filtered and the loci beyond the depth thresholds have been set to missing, otherwise set to an integer above zero. [Default=1] -#' @param max_depth_above_which_are_missing -#' maximum depth at which loci with depth above this threshold are set to missing. Set to some large arbitrarily large value (e.g. 1000000) if the input vcf has already been filtered and the loci beyond the depth thresholds have been set to missing, otherwise set to an integer above zero. [Default=1000000] -#' @param frac_top_missing_pools -#' fraction of pools with the highest number of missing loci to be omitted. Set to zero if the input vcf has already been filtered and the loci beyond the depth thresholds have been set to missing, otherwise set to a decimal number between zero and one. [Default=0.0] -#' @param frac_top_missing_loci -#' fraction of loci with the highest number of pools with missing data to be omitted. Set to zero if the input vcf has already been filtered and the loci beyond the depth thresholds have been set to missing, otherwise set to an decimal number between zero and one. [Default=0.0] -#' @param n_threads -#' number of computing threads or processor cores to use in the computations. [Default=2] -#' @param fname_out_prefix -#' prefix of the output files including the [imputed allele frequency table](#allele-frequency-table-csv) (`-