preparing first stable release
jeffersonfparil committed Feb 19, 2024
1 parent 232d760 commit b479dcd
Showing 10 changed files with 192 additions and 64,421 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
@@ -6,7 +6,9 @@ edition = '2021'
[dependencies]
clap = { version="3.2.23", features=["derive"]}
ndarray = { version = "0.15.6", features = ["rayon"]}
ndarray-linalg = { version = "0.16.0", features = ["intel-mkl-static"] } # ndarray-linalg = { version = "0.16.0", features = ["openblas"] }
ndarray-linalg = "0.16"
# ndarray-linalg = { version = "0.16.0", features = ["intel-mkl-static"] }
# ndarray-linalg = { version = "0.16.0", features = ["openblas"] }
statrs = "0.16.0"
rand = "0.8.5"
rayon = "1.7.0"
79 changes: 72 additions & 7 deletions README.md
@@ -10,35 +10,100 @@ Impute allele frequencies to reduce sparsity of genotype data from polyploids, p

### Compile from source

1. From this repo
1. Clone the repository

```shell
git clone https://jeffersonfparil:<API_KEY>@github.com/jeffersonfparil/imputef.git main
```

2. Download the repository and load the development environment
2. Load the Rust development environment via Conda (please see [Conda installation instructions](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) if you do not have Conda pre-installed)

```shell
git clone https://jeffersonfparil:<API_KEY>@github.com/jeffersonfparil/imputef.git main
cd imputef/
conda env create --file res/rustenv.yml
conda activate rustenv
```

3. Compile, then optionally create an alias, create a symbolic link, or add the binary to `$PATH`

```shell
cargo build --release
target/release/imputef -h
# Option 1: alias
echo "alias imputef='$(pwd)/target/release/imputef'" >> ~/.bashrc
source ~/.bashrc
# Option 2: symlink
sudo ln -s "$(pwd)/target/release/imputef" /usr/bin/imputef
# Option 3: add to $PATH (place this line in your ~/.bashrc or the appropriate shell initialisation file)
export PATH="${PATH}:$(pwd)/target/release"
### Check
type -a imputef
cd ~; imputef -h; cd -
```

### Pre-compiled binaries

1. Download the appropriate executable binary compatible with your system

- [GNU/Linux (x86 64-bit)](some/link)
- [macOS Catalina (x86 64-bit)](some/link)
- [Windows 10 (x86 64-bit)](some/link)

## Usage
2. Configure and execute

- In GNU/Linux and macOS:

```shell
unzip imputef.zip
chmod +x imputef
./imputef
```

### Functions
- In Windows, open the command prompt (press *Win + R*, type "cmd", and press Enter). Navigate to your download folder and execute the binary, e.g. `imputef-x86_64-windows.exe -h`.

- `mvi`: mean value imputation of allele frequencies

- `aldknni`: adaptive linkage-informed k-nearest neighbour imputation of allele frequencies
## Usage

### Quickstart

```shell
imputef -h
imputef -f tests/test.csv # allele frequency table as input
imputef -f tests/test.sync # synchronised pileup file as input
imputef -f tests/test.vcf # variant call format as input without missing data
imputef -f tests/test_2.vcf # variant call format as input
imputef -f tests/test_2.vcf --method mean # use mean value imputation
imputef -f tests/test_2.vcf --min-loci-corr=0.75 --max-pool-dist=0.25 # use fixed minimum loci correlation and maximum genetic distance thresholds
imputef -f tests/test_2.vcf --min-loci-corr=-1 --max-pool-dist=-1 # optimise for minimum loci correlation and maximum genetic distance thresholds per locus
### Grid search optimisation assuming a common set of optimal parameters across all loci
### (different from the built-in optimisation which optimises the minimum loci correlation and maximum genetic distance per locus)
echo 'l,k,corr,dist,mae' > grid_search.csv
for l in 5 10 15
do
for k in 1 2 3 4 5
do
for corr in 0.75 0.95 1.00
do
for dist in 0.0 0.10 0.25
do
echo "@@@@@@@@@@@@@@@@@@@@@@@@"
echo ${l},${k},${corr},${dist}
imputef -f tests/test_2.vcf \
--min-l-loci=${l} \
--min-k-neighbours=${k} \
--min-loci-corr=${corr} \
--max-pool-dist=${dist} \
--n-reps=3 > log.tmp
fname_out=$(tail -n1 log.tmp | cut -d':' -f2 | tr -d ' ')
mae=$(grep "Expected imputation accuracy in terms of mean absolute error:" log.tmp | cut -d':' -f2 | tr -d ' ')
echo ${l},${k},${corr},${dist},${mae} >> grid_search.csv
rm $fname_out log.tmp
done
done
done
done
awk -F',' 'NR == 2 || (NR > 2 && $5 < min) {min = $5; min_line = $0} END {print min_line}' grid_search.csv
```

### Input variables

15 changes: 15 additions & 0 deletions res/doc_header_for_maths.html
@@ -0,0 +1,15 @@
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-9eLZqc9ds8eNjO3TmqPeYcDj8n+Qfa4nuSiGYa6DjLNcv9BtN69ZIulL9+8CqC9Y" crossorigin="anonymous">
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-K3vbOmF2BtaVai+Qk37uypf7VrgBubhQreNQe9aGsz9lB63dIFiQVlJbr92dw2Lx" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-kmZOZB5ObwgQnS/DuDg6TScgOiWWBiVt0plIRkZCmE6rDZGrEOQeHM5PcHi+nyqe" crossorigin="anonymous"></script>
<script>
document.addEventListener("DOMContentLoaded", function() {
renderMathInElement(document.body, {
delimiters: [
{left: "$$", right: "$$", display: true},
{left: "\\(", right: "\\)", display: false},
{left: "$", right: "$", display: false},
{left: "\\[", right: "\\]", display: true}
]
});
});
</script>
40 changes: 40 additions & 0 deletions src/aldknni.rs
@@ -605,6 +605,12 @@ impl GenotypesAndPhenotypes {
let p = allele_frequencies.ncols();
assert_eq!(chromosome.len(), p, "Error, the number of chromosome names and the total number of loci are not equal.");
// Instantiate output file
println!(
"--> {}: Writing out intermediate file with expected MAE of {}: {}",
i,
sensible_round(sum_mae / n_missing, 4),
&fname_intermediate_file
);
let error_writing_file =
"Unable to create file: ".to_owned() + &fname_intermediate_file;
let mut file_out = OpenOptions::new()
@@ -675,6 +681,40 @@ impl GenotypesAndPhenotypes {
}
}

/// # `aldknni`: adaptive linkage disequilibrium (LD)-based k-nearest neighbour imputation of genotype data
///
/// This is an attempt to extend the [LD-kNNi method of Money et al, 2015, i.e. LinkImpute](https://doi.org/10.1534/g3.115.021667), which was itself an extension of the [kNN imputation of Troyanskaya et al, 2001](https://doi.org/10.1093/bioinformatics/17.6.520). As in LD-kNNi, linkage disequilibrium (LD) is estimated using Pearson's product-moment correlation per pair of loci, computed per chromosome by default but optionally across the entire genome. We use the mean absolute difference/error (MAE) between allele frequencies among linked loci as an estimate of genetic distance between samples. Fixed values can be defined for the minimum correlation used to identify the loci included in distance estimation and for the maximum genetic distance used to select the k-nearest neighbours. Additionally, the minimum number of loci to include in distance estimation and the minimum number of nearest neighbours can be set. Moreover, all four parameters can be optimised, i.e. the combination of minimum correlation, maximum distance, minimum number of loci, and minimum number of nearest neighbours which minimises the MAE between predicted and expected allele frequencies after simulating 10% missing data is identified.
///
/// The allele depth information (`AD`), i.e. the unfiltered allele depth which includes the reads that did not pass the variant caller filters, is used to calculate allele frequencies. If the `GT` field is present but the `AD` field is absent, then each sample is assumed to be a diploid individual, i.e. neither a polyploid nor a pool. Optional filtering steps based on minimum depth, minimum allele frequency, and maximum sparsity are available. Genotype data are not imported into R, LD estimation and imputation per se are multi-threaded, and the imputation output is written to disk as an [allele frequency table](#allele-frequency-table-csv). The structs, traits, methods, and functions defined in this library are subsets of [poolgen](https://github.com/jeffersonfparil/poolgen), and will eventually be merged.
///
/// The imputed allele frequency is computed as:
///
/// $$
/// \hat q_{r,j} = { \sum_{i \ne r}^{k} q_{i,j} (1 - \delta_{i,r}) }
/// $$
///
/// with:
///
/// $$
/// \delta_{i,r} = { {1 \over \sum d_{i,r}} d_{i,r} }
/// $$
///
/// and
///
/// $$
/// d_{i,r} = { {1 \over c} { \sum_{j=1}^{c} |q_{i,j} - q_{r,j}| } }
/// $$
///
/// where:
///
/// - $\hat q_{r,j}$ is the imputed allele frequency of sample $r$ at the $j^{\text {th}}$ locus,
/// - $n$ is the total number of samples,
/// - $m$ is the number of samples which are missing data at the $j^{\text {th}}$ locus,
/// - $q_{i,j}$ is the known allele frequency of the $i^{\text {th}}$ sample at the $j^{\text {th}}$ locus,
/// - $k$ is the number of nearest neighbours or the samples most closely related to the sample requiring imputation, i.e. sample $r$ at locus $j$, and
/// - $\delta_{i,r}$ is the scaled $d_{i,r}$, which is the genetic distance between the $i^{\text {th}}$ sample and sample $r$; this distance is the mean absolute difference in allele frequencies between the two samples across $c$ linked loci.
///
/// The variables $k$ and $c$ are proportional to the user inputs `max_pool_dist` (default=0.1) and `min_loci_corr` (default=0.9), respectively. The former defines the maximum distance of samples to be considered as one of the k-nearest neighbours, while the latter refers to the minimum correlation with the locus requiring imputation to be included in the estimation of the genetic distance.
pub fn impute_aldknni(
genotypes_and_phenotypes: GenotypesAndPhenotypes,
filter_stats: &FilterStats,
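
For readers who want to see the weighting scheme in action, the following is a minimal, self-contained Rust sketch of the formulas in the doc comment above. It is not the crate's internal implementation: the helper names `mean_abs_diff` and `impute_weighted` are hypothetical, and the $(1 - \delta_{i,r})$ weights are rescaled to sum to one, an added assumption that keeps the imputed frequency within $[0, 1]$ when $k > 2$ (for $k = 2$ it reduces exactly to the formula as written).

```rust
/// d_{i,r}: mean absolute difference in allele frequencies between two
/// samples across their c linked loci (a hypothetical helper, not crate API).
fn mean_abs_diff(a: &[f64], b: &[f64]) -> f64 {
    assert_eq!(a.len(), b.len(), "both samples must cover the same linked loci");
    let c = a.len() as f64;
    a.iter().zip(b).map(|(x, y)| (x - y).abs()).sum::<f64>() / c
}

/// q_hat_{r,j}: impute the allele frequency of sample r at locus j from the
/// frequencies of its k nearest neighbours and their distances d_{i,r}.
/// The (1 - delta) weights are rescaled to sum to one (an assumption for k > 2).
fn impute_weighted(neighbour_freqs: &[f64], neighbour_dists: &[f64]) -> Option<f64> {
    let k = neighbour_freqs.len();
    if k == 0 || k != neighbour_dists.len() {
        return None;
    }
    let sum_dist: f64 = neighbour_dists.iter().sum();
    if sum_dist <= f64::EPSILON {
        // All neighbours are at distance zero: fall back to a plain mean.
        return Some(neighbour_freqs.iter().sum::<f64>() / k as f64);
    }
    // delta_{i,r} = d_{i,r} / sum(d_{i,r}); weight each neighbour by (1 - delta).
    let weights: Vec<f64> = neighbour_dists.iter().map(|d| 1.0 - d / sum_dist).collect();
    let sum_weights: f64 = weights.iter().sum();
    if sum_weights <= f64::EPSILON {
        // Degenerate case (k = 1): the single neighbour gets full weight.
        return Some(neighbour_freqs[0]);
    }
    Some(
        neighbour_freqs
            .iter()
            .zip(&weights)
            .map(|(q, w)| q * (w / sum_weights))
            .sum(),
    )
}

fn main() {
    // Sample r is missing locus j; its three nearest neighbours have known
    // frequencies at locus j and distances computed across linked loci.
    let neighbour_freqs = [0.50, 0.55, 0.90];
    let neighbour_dists = [
        mean_abs_diff(&[0.1, 0.2, 0.3], &[0.12, 0.18, 0.33]), // close neighbour
        mean_abs_diff(&[0.1, 0.2, 0.3], &[0.15, 0.25, 0.28]), // close neighbour
        mean_abs_diff(&[0.1, 0.2, 0.3], &[0.60, 0.70, 0.05]), // distant neighbour
    ];
    println!("q_hat = {:?}", impute_weighted(&neighbour_freqs, &neighbour_dists));
}
```

Running `main` prints an imputed frequency pulled towards the two closer neighbours (roughly 0.55 here), illustrating how smaller genetic distances translate into larger weights.
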
10 changes: 7 additions & 3 deletions src/filter_missing.rs
@@ -43,9 +43,13 @@ impl GenotypesAndPhenotypes {

pub fn missing_rate(&mut self) -> io::Result<f64> {
let (n, l) = self.coverages.dim();
let sum = self
.coverages
.fold(0, |sum, &x| if x.is_nan() { sum + 1 } else { sum });
let sum = self.coverages.fold(0, |sum, &x| {
if (x.is_nan()) || (x == 0.0) {
sum + 1
} else {
sum
}
});
Ok(sensible_round(sum as f64 * 100.0 / ((n * l) as f64), 5))
}

9 changes: 6 additions & 3 deletions src/geno.rs
@@ -259,13 +259,16 @@ impl LoadAll for FileGeno {
loci_chr.push(chromosome.last().expect("Error push chromosome within the convert_into_genotypes_and_phenotypes() method for FileGeno struct.").to_owned());
loci_pos.push(position.last().expect("Error push position within the convert_into_genotypes_and_phenotypes() method for FileGeno struct.").to_owned());
// Add alternative alleles if the allele frequencies per locus do not add up to 1.00 (~or if only one allele per locus is present~)
// Count how many allele we have to add
// Create coverages matrix setting missing loci to f64::NAN, while counting how many alleles we have to add
let mut coverages: Array2<f64> = Array2::from_elem((n, l), 1_000.0);
for j in 0..l {
let idx_ini = loci_idx[j];
let idx_fin = loci_idx[j + 1];
let _n_alleles = idx_fin - idx_ini;
let mut freq_sum_less_than_one = false;
for i in 0..n {
if mat[(i, idx_ini)].is_nan() {
coverages[(i, j)] = f64::NAN;
}
if mat.slice(s![i, idx_ini..idx_fin]).sum() < 1.0 {
freq_sum_less_than_one = true;
break;
@@ -323,7 +326,7 @@ impl LoadAll for FileGeno {
intercept_and_allele_frequencies: mat_new,
phenotypes: Array2::from_shape_vec((n, 1), vec![f64::NAN; n]).expect("Error generating dummy phenotype data within the convert_into_genotypes_and_phenotypes() method for FileGeno struct."),
pool_names,
coverages: Array2::from_elem((n, l), 1_000.0),
coverages,
})
}
}
14 changes: 7 additions & 7 deletions src/helpers.rs
@@ -186,17 +186,17 @@ mod tests {
fn test_helpers() {
if cfg!(windows) {
assert_eq!(
find_start_of_next_line(&"./tests/test.pileup".to_owned(), 56),
58
); // line 1 has a total of 57 characters in windows and only 56 in unix, so the next line should start at the 58th character position
find_start_of_next_line(&"./tests/test.sync".to_owned(), 77),
79
); // line 1 has a total of 78 characters in windows and only 77 in unix, so the next line should start at the 79th character position
} else {
assert_eq!(
find_start_of_next_line(&"./tests/test.pileup".to_owned(), 56),
57
); // 56th character is the newline character as line 1 has a total of 56 characters, so the next line should start at the 57th character position
find_start_of_next_line(&"./tests/test.sync".to_owned(), 77),
78
); // 77th character is the newline character as line 1 has a total of 77 characters, so the next line should start at the 78th character position
}
assert_eq!(
find_file_splits(&"./tests/test.pileup".to_owned(), &2)
find_file_splits(&"./tests/test.sync".to_owned(), &2)
.unwrap()
.len(),
3
