preparing first stable release
jeffersonfparil committed Feb 19, 2024
1 parent 232d760 commit b479dcd
Showing 10 changed files with 192 additions and 64,421 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
@@ -6,7 +6,9 @@ edition = '2021'
[dependencies]
clap = { version="3.2.23", features=["derive"]}
ndarray = { version = "0.15.6", features = ["rayon"]}
ndarray-linalg = { version = "0.16.0", features = ["intel-mkl-static"] } # ndarray-linalg = { version = "0.16.0", features = ["openblas"] }
ndarray-linalg = "0.16"
# ndarray-linalg = { version = "0.16.0", features = ["intel-mkl-static"] }
# ndarray-linalg = { version = "0.16.0", features = ["openblas"] }
statrs = "0.16.0"
rand = "0.8.5"
rayon = "1.7.0"
79 changes: 72 additions & 7 deletions README.md
@@ -10,35 +10,100 @@ Impute allele frequencies to reduce sparsity of genotype data from polyploids, p

### Compile from source

1. From this repo
1. Clone the repository

```shell
git clone https://jeffersonfparil:<API_KEY>@github.com/jeffersonfparil/imputef.git main
```

2. Download the repository and load the development environment
2. Load the Rust development environment via Conda (please see [Conda installation instructions](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) if you do not have Conda pre-installed)

```shell
git clone https://jeffersonfparil:<API_KEY>@github.com/jeffersonfparil/imputef.git main
cd imputef/
conda env create --file res/rustenv.yml
conda activate rustenv
```

3. Compile, then optionally create an alias, create a symbolic link, or add the binary to `$PATH`

```shell
cargo build --release
target/release/imputef -h
# Option 1: alias
echo "alias imputef='$(pwd)/target/release/imputef'" >> ~/.bashrc
source ~/.bashrc
# Option 2: symlink
sudo ln -s "$(pwd)/target/release/imputef" /usr/bin/imputef
# Option 3: add to $PATH (place this line in your ~/.bashrc or the appropriate shell initialisation file)
export PATH="${PATH}:$(pwd)/target/release"
### Check
type -a imputef
cd ~; imputef -h; cd -
```

### Pre-compiled binaries

1. Download the appropriate executable binary compatible with your system

- [GNU/Linux (x86 64-bit)](some/link)
- [macOS Catalina (x86 64-bit)](some/link)
- [Windows 10 (x86 64-bit)](some/link)

## Usage
2. Configure and execute

- In GNU/Linux and macOS:

```shell
unzip imputef.zip
chmod +x imputef
./imputef
```

### Functions
- In Windows, open the command prompt (press *Win + R*, type "cmd", and press Enter). Navigate to your download folder and execute the binary, e.g. `imputef-x86_64-windows.exe -h`.

- `mvi`: mean value imputation of allele frequencies

- `aldknni`: adaptive linkage-informed k-nearest neighbour imputation of allele frequencies
## Usage

### Quickstart

```shell
imputef -h
imputef -f tests/test.csv # allele frequency table as input
imputef -f tests/test.sync # synchronised pileup file as input
imputef -f tests/test.vcf # variant call format as input without missing data
imputef -f tests/test_2.vcf # variant call format as input
imputef -f tests/test_2.vcf --method mean # use mean value imputation
imputef -f tests/test_2.vcf --min-loci-corr=0.75 --max-pool-dist=0.25 # use fixed minimum loci correlation and maximum genetic distance thresholds
imputef -f tests/test_2.vcf --min-loci-corr=-1 --max-pool-dist=-1 # optimise for minimum loci correlation and maximum genetic distance thresholds per locus
### Grid search optimisation assuming a common set of optimal parameters across all loci
### (different from the built-in optimisation which optimises the minimum loci correlation and maximum genetic distance per locus)
echo 'l,k,corr,dist,mae' > grid_search.csv
for l in 5 10 15
do
for k in 1 2 3 4 5
do
for corr in 0.75 0.95 1.00
do
for dist in 0.0 0.10 0.25
do
echo "@@@@@@@@@@@@@@@@@@@@@@@@"
echo ${l},${k},${corr},${dist}
imputef -f tests/test_2.vcf \
--min-l-loci=${l} \
--min-k-neighbours=${k} \
--min-loci-corr=${corr} \
--max-pool-dist=${dist} \
--n-reps=3 > log.tmp
fname_out=$(tail -n1 log.tmp | cut -d':' -f2 | tr -d ' ')
mae=$(grep "Expected imputation accuracy in terms of mean absolute error:" log.tmp | cut -d':' -f2 | tr -d ' ')
echo ${l},${k},${corr},${dist},${mae} >> grid_search.csv
rm $fname_out log.tmp
done
done
done
done
awk -F',' 'NR == 2 || (NR > 2 && $5 < min) {min = $5; min_line = $0} END {print min_line}' grid_search.csv
```

### Input variables

15 changes: 15 additions & 0 deletions res/doc_header_for_maths.html
@@ -0,0 +1,15 @@
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-9eLZqc9ds8eNjO3TmqPeYcDj8n+Qfa4nuSiGYa6DjLNcv9BtN69ZIulL9+8CqC9Y" crossorigin="anonymous">
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-K3vbOmF2BtaVai+Qk37uypf7VrgBubhQreNQe9aGsz9lB63dIFiQVlJbr92dw2Lx" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-kmZOZB5ObwgQnS/DuDg6TScgOiWWBiVt0plIRkZCmE6rDZGrEOQeHM5PcHi+nyqe" crossorigin="anonymous"></script>
<script>
document.addEventListener("DOMContentLoaded", function() {
renderMathInElement(document.body, {
delimiters: [
{left: "$$", right: "$$", display: true},
{left: "\\(", right: "\\)", display: false},
{left: "$", right: "$", display: false},
{left: "\\[", right: "\\]", display: true}
]
});
});
</script>
40 changes: 40 additions & 0 deletions src/aldknni.rs
@@ -605,6 +605,12 @@ impl GenotypesAndPhenotypes {
let p = allele_frequencies.ncols();
assert_eq!(chromosome.len(), p, "Error, the number of chromosome names and the total number of loci are not equal.");
// Instantiate output file
println!(
"--> {}: Writing out intermediate file with expected MAE of {}: {}",
i,
sensible_round(sum_mae / n_missing, 4),
&fname_intermediate_file
);
let error_writing_file =
"Unable to create file: ".to_owned() + &fname_intermediate_file;
let mut file_out = OpenOptions::new()
@@ -675,6 +681,40 @@ impl GenotypesAndPhenotypes {
}
}

/// # `aldknni`: adaptive linkage disequilibrium (LD)-based k-nearest neighbour imputation of genotype data
///
/// This is an attempt to extend the [LD-kNNi method of Money et al, 2015, i.e. LinkImpute](https://doi.org/10.1534/g3.115.021667), which was itself an extension of the [kNN imputation of Troyanskaya et al, 2001](https://doi.org/10.1093/bioinformatics/17.6.520). As in LD-kNNi, linkage disequilibrium (LD) is estimated using Pearson's product-moment correlation per pair of loci, computed per chromosome by default but optionally across the entire genome. We use the mean absolute difference/error (MAE) between allele frequencies among linked loci as an estimate of genetic distance between samples. Fixed values can be defined for the minimum correlation used to identify the loci included in distance estimation and for the maximum genetic distance used to select the k-nearest neighbours. Additionally, the minimum number of loci to include in distance estimation and the minimum number of nearest neighbours can be set. Moreover, all four parameters can be optimised, i.e. the combination of minimum correlation, maximum distance, minimum number of loci, and minimum number of nearest neighbours which minimises the MAE between predicted and expected allele frequencies after simulating 10% missing data is identified.
///
/// The allele depth information (`AD`), i.e. the unfiltered allele depth which includes the reads that did not pass the variant caller filters, is used to calculate allele frequencies. If the `GT` field is present but the `AD` field is absent, then each sample is assumed to be a diploid individual, i.e. neither a polyploid nor a pool. Optional filtering steps based on minimum depth, minimum allele frequency, and maximum sparsity are available. Genotype data are not imported into R, LD estimation and imputation per se are multi-threaded, and the imputation output is written to disk as an [allele frequency table](#allele-frequency-table-csv). The structs, traits, methods, and functions defined in this library are subsets of [poolgen](https://github.com/jeffersonfparil/poolgen), and will eventually be merged.
///
/// The imputed allele frequency is computed as:
///
/// $$
/// \hat q_{r,j} = { \sum_{i \ne r}^{k} q_{i,j} (1 - \delta_{i,r}) }
/// $$
///
/// with:
///
/// $$
/// \delta_{i,r} = { {1 \over \sum d_{i,r}} d_{i,r} }
/// $$
///
/// and
///
/// $$
/// d_{i,r} = { {1 \over c} { \sum_{j=1}^{c} |q_{i,j} - q_{r,j}| } }
/// $$
///
/// where:
///
/// - $\hat q_{r,j}$ is the imputed allele frequency of sample $r$ at the $j^{\text {th}}$ locus,
/// - $n$ is the total number of samples,
/// - $m$ is the number of samples which are missing data at the $j^{\text {th}}$ locus,
/// - $q_{i,j}$ is the known allele frequency of the $i^{\text {th}}$ sample at the $j^{\text {th}}$ locus,
/// - $k$ is the number of nearest neighbours or the samples most closely related to the sample requiring imputation, i.e. sample $r$ at locus $j$, and
/// - $\delta_{i,r}$ is the scaled $d_{i,r}$, which is the genetic distance between the $i^{\text {th}}$ sample and sample $r$; this distance is the mean absolute difference in allele frequencies between the two samples across $c$ linked loci.
///
/// The variables $k$ and $c$ are proportional to the user inputs `max_pool_dist` (default=0.1) and `min_loci_corr` (default=0.9), respectively. The former defines the maximum distance of samples to be considered as one of the k-nearest neighbours, while the latter refers to the minimum correlation with the locus requiring imputation to be included in the estimation of the genetic distance.
pub fn impute_aldknni(
genotypes_and_phenotypes: GenotypesAndPhenotypes,
filter_stats: &FilterStats,
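
For readers who want to see the weighting scheme in action, the following is a minimal, self-contained Rust sketch of the formulas in the doc comment above. It is not the crate's internal implementation: the helper names `mean_abs_diff` and `impute_weighted` are hypothetical, and the $(1 - \delta_{i,r})$ weights are rescaled to sum to one, an added assumption that keeps the imputed frequency within $[0, 1]$ when $k > 2$ (for $k = 2$ it reduces exactly to the formula as written).

```rust
/// d_{i,r}: mean absolute difference in allele frequencies between two
/// samples across their c linked loci (a hypothetical helper, not crate API).
fn mean_abs_diff(a: &[f64], b: &[f64]) -> f64 {
    assert_eq!(a.len(), b.len(), "both samples must cover the same linked loci");
    let c = a.len() as f64;
    a.iter().zip(b).map(|(x, y)| (x - y).abs()).sum::<f64>() / c
}

/// q_hat_{r,j}: impute the allele frequency of sample r at locus j from the
/// frequencies of its k nearest neighbours and their distances d_{i,r}.
/// The (1 - delta) weights are rescaled to sum to one (an assumption for k > 2).
fn impute_weighted(neighbour_freqs: &[f64], neighbour_dists: &[f64]) -> Option<f64> {
    let k = neighbour_freqs.len();
    if k == 0 || k != neighbour_dists.len() {
        return None;
    }
    let sum_dist: f64 = neighbour_dists.iter().sum();
    if sum_dist <= f64::EPSILON {
        // All neighbours are at distance zero: fall back to a plain mean.
        return Some(neighbour_freqs.iter().sum::<f64>() / k as f64);
    }
    // delta_{i,r} = d_{i,r} / sum(d_{i,r}); weight each neighbour by (1 - delta).
    let weights: Vec<f64> = neighbour_dists.iter().map(|d| 1.0 - d / sum_dist).collect();
    let sum_weights: f64 = weights.iter().sum();
    if sum_weights <= f64::EPSILON {
        // Degenerate case (k = 1): the single neighbour gets full weight.
        return Some(neighbour_freqs[0]);
    }
    Some(
        neighbour_freqs
            .iter()
            .zip(&weights)
            .map(|(q, w)| q * (w / sum_weights))
            .sum(),
    )
}

fn main() {
    // Sample r is missing locus j; its three nearest neighbours have known
    // frequencies at locus j and distances computed across linked loci.
    let neighbour_freqs = [0.50, 0.55, 0.90];
    let neighbour_dists = [
        mean_abs_diff(&[0.1, 0.2, 0.3], &[0.12, 0.18, 0.33]), // close neighbour
        mean_abs_diff(&[0.1, 0.2, 0.3], &[0.15, 0.25, 0.28]), // close neighbour
        mean_abs_diff(&[0.1, 0.2, 0.3], &[0.60, 0.70, 0.05]), // distant neighbour
    ];
    println!("q_hat = {:?}", impute_weighted(&neighbour_freqs, &neighbour_dists));
}
```

Running `main` prints an imputed frequency pulled towards the two closer neighbours (roughly 0.55 here), illustrating how smaller genetic distances translate into larger weights.
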
10 changes: 7 additions & 3 deletions src/filter_missing.rs
@@ -43,9 +43,13 @@ impl GenotypesAndPhenotypes {

pub fn missing_rate(&mut self) -> io::Result<f64> {
let (n, l) = self.coverages.dim();
let sum = self
.coverages
.fold(0, |sum, &x| if x.is_nan() { sum + 1 } else { sum });
let sum = self.coverages.fold(0, |sum, &x| {
if (x.is_nan()) || (x == 0.0) {
sum + 1
} else {
sum
}
});
Ok(sensible_round(sum as f64 * 100.0 / ((n * l) as f64), 5))
}

9 changes: 6 additions & 3 deletions src/geno.rs
@@ -259,13 +259,16 @@ impl LoadAll for FileGeno {
loci_chr.push(chromosome.last().expect("Error push chromosome within the convert_into_genotypes_and_phenotypes() method for FileGeno struct.").to_owned());
loci_pos.push(position.last().expect("Error push position within the convert_into_genotypes_and_phenotypes() method for FileGeno struct.").to_owned());
// Add alternative alleles if the allele frequencies per locus do not add up to 1.00 (~or if only one allele per locus is present~)
// Count how many allele we have to add
// Create coverages matrix setting missing loci to f64::NAN, while counting how many alleles we have to add
let mut coverages: Array2<f64> = Array2::from_elem((n, l), 1_000.0);
for j in 0..l {
let idx_ini = loci_idx[j];
let idx_fin = loci_idx[j + 1];
let _n_alleles = idx_fin - idx_ini;
let mut freq_sum_less_than_one = false;
for i in 0..n {
if mat[(i, idx_ini)].is_nan() {
coverages[(i, j)] = f64::NAN;
}
if mat.slice(s![i, idx_ini..idx_fin]).sum() < 1.0 {
freq_sum_less_than_one = true;
break;
@@ -323,7 +326,7 @@ impl LoadAll for FileGeno {
intercept_and_allele_frequencies: mat_new,
phenotypes: Array2::from_shape_vec((n, 1), vec![f64::NAN; n]).expect("Error generating dummy phenotype data within the convert_into_genotypes_and_phenotypes() method for FileGeno struct."),
pool_names,
coverages: Array2::from_elem((n, l), 1_000.0),
coverages,
})
}
}
14 changes: 7 additions & 7 deletions src/helpers.rs
@@ -186,17 +186,17 @@ mod tests {
fn test_helpers() {
if cfg!(windows) {
assert_eq!(
find_start_of_next_line(&"./tests/test.pileup".to_owned(), 56),
58
); // line 1 has a total of 57 characters in windows and only 56 in unix, so the next line should start at the 58th character position
find_start_of_next_line(&"./tests/test.sync".to_owned(), 77),
79
); // line 1 has a total of 78 characters in windows and only 77 in unix, so the next line should start at the 79th character position
} else {
assert_eq!(
find_start_of_next_line(&"./tests/test.pileup".to_owned(), 56),
57
); // 56th character is the newline character as line 1 has a total of 56 characters, so the next line should start at the 57th character position
find_start_of_next_line(&"./tests/test.sync".to_owned(), 77),
78
); // 77th character is the newline character as line 1 has a total of 77 characters, so the next line should start at the 78th character position
}
assert_eq!(
find_file_splits(&"./tests/test.pileup".to_owned(), &2)
find_file_splits(&"./tests/test.sync".to_owned(), &2)
.unwrap()
.len(),
3
