diff --git a/Cargo.toml b/Cargo.toml
index cf375d0..b2f4494 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"
 
 [lib]
 name = "rustfrc"
-crate-type = ["cdylib"]
+crate-type = ["cdylib", "lib"]
 
 [dependencies]
 numpy = "~0.21"
@@ -16,7 +16,14 @@ num-traits = "~0.2.18"
 
 [dependencies.pyo3]
 version = "~0.21.1"
-features = ["extension-module", "gil-refs"]
+features = ["extension-module"]
+
+[dev-dependencies]
+criterion = "0.3"
+
+[[bench]]
+name = "rustfrc_bench"
+harness = false
 
 [package.metadata.maturin]
 python-source = "python"
diff --git a/benches/rustfrc_bench.rs b/benches/rustfrc_bench.rs
new file mode 100644
index 0000000..5d1809a
--- /dev/null
+++ b/benches/rustfrc_bench.rs
@@ -0,0 +1,31 @@
+use criterion::{black_box, criterion_group, criterion_main, Criterion, BatchSize};
+use rustfrc::{binom_split, pois_gen, sqr_abs, to_i32};
+use num_complex::Complex;
+use ndarray_rand::rand::prelude::{Distribution, thread_rng};
+use ndarray_rand::rand_distr::Poisson;
+
+pub fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sample-10");
+    // Configure Criterion.rs to detect smaller differences and increase sample size to improve
+    // precision and counteract the resulting noise.
+    group.sample_size(10);
+    let size = [4000, 4000];
+    let arr = pois_gen(&size, 200f64).unwrap();
+
+    let arr_complex = arr.map(|e| {
+        let mut rng = thread_rng();
+        let a = Poisson::new(*e).unwrap().sample(&mut rng);
+        Complex::new(*e, a.to_owned())
+    });
+
+    let arr_i32 = to_i32(arr.view());
+    let pois_shape = [4000, 4000];
+
+    group.bench_function("binom", |b| b.iter_batched(|| arr_i32.to_owned(), |a| binom_split(black_box(a)), BatchSize::SmallInput));
+    group.bench_function("pois", |b| b.iter(|| pois_gen(black_box(&pois_shape), black_box(100f64))));
+    group.bench_function("sqr_abs", |b| b.iter_batched(|| arr_complex.to_owned(), |a| sqr_abs(black_box(a)), BatchSize::SmallInput));
+    group.finish()
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 0804331..125ff0f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,11 +4,14 @@ use std::fmt::{Display, Formatter};
 use std::marker::{Send, Sync};
 use std::sync::atomic::{AtomicBool, Ordering};
 
-use ndarray::{Array, ArrayD, Dimension};
+use ndarray::parallel::prelude::*;
+use ndarray::parallel::prelude::IntoParallelIterator;
+use ndarray::{Array, ArrayD, ArrayView, Dimension, Zip};
 use ndarray_rand::rand::prelude::{Distribution, thread_rng};
 use ndarray_rand::rand_distr::{Binomial, Poisson, PoissonError};
 use num_complex::{Complex, Complex32, Complex64};
 use num_traits::{Float, Zero};
+use num_traits::real::Real;
 use numpy::{IntoPyArray, PyArrayDyn, PyReadonlyArrayDyn, PyArrayMethods};
 use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
@@ -16,14 +19,14 @@ use pyo3::prelude::*;
 use pyo3::PyObject;
 
 #[pymodule]
-fn rustfrc(py: Python<'_>, m: &PyModule) -> PyResult<()> {
-    let internal = PyModule::new(py, "_internal")?;
-    internal.add_function(wrap_pyfunction!(binom_split_py, internal)?)?;
-    internal.add_function(wrap_pyfunction!(sqr_abs32_py, internal)?)?;
-    internal.add_function(wrap_pyfunction!(sqr_abs64_py, internal)?)?;
-    internal.add_function(wrap_pyfunction!(pois_gen_py, internal)?)?;
+fn rustfrc(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    let internal = PyModule::new_bound(m.py(), "_internal")?;
+    internal.add_function(wrap_pyfunction!(binom_split_py, &internal)?)?;
+    internal.add_function(wrap_pyfunction!(sqr_abs32_py, &internal)?)?;
+    internal.add_function(wrap_pyfunction!(sqr_abs64_py, &internal)?)?;
+    internal.add_function(wrap_pyfunction!(pois_gen_py, &internal)?)?;
 
-    m.add_submodule(internal)?;
+    m.add_submodule(&internal)?;
 
     Ok(())
 }
@@ -32,32 +35,32 @@ fn rustfrc(py: Python<'_>, m: &PyModule) -> PyResult<()> {
 /// binomial distribution (n, p) with n = pixel value and p = 0.5. Returns a single array.
 #[pyfunction]
 #[pyo3(text_signature = "a, /")]
-fn binom_split_py<'py>(py: Python<'py>, a: PyReadonlyArrayDyn<'py, i32>) -> PyResult<&'py PyArrayDyn<i32>> {
+fn binom_split_py<'py>(py: Python<'py>, a: PyReadonlyArrayDyn<'py, i32>) -> PyResult<Bound<'py, PyArrayDyn<i32>>> {
     let a = a.to_owned_array();
 
     binom_split(a)
              .map_err(|e| PyValueError::new_err(format!("{}", e)))
-             .map(|a| a.into_pyarray(py))
+             .map(|a| a.into_pyarray_bound(py))
 }
 
 /// Takes an array (np.ndarray with dtype complex64) and takes the absolute value and then squares
 /// it, element-wise.
 #[pyfunction]
 #[pyo3(text_signature = "a, /")]
-fn sqr_abs32_py<'py>(py: Python<'py>, a: PyReadonlyArrayDyn<'py, Complex32>) -> &'py PyArrayDyn<f32> {
+fn sqr_abs32_py<'py>(py: Python<'py>, a: PyReadonlyArrayDyn<'py, Complex32>) -> Bound<'py, PyArrayDyn<f32>> {
     let a = a.to_owned_array();
 
-    sqr_abs(a).into_pyarray(py)
+    sqr_abs(a).into_pyarray_bound(py)
 }
 
 /// Takes an array (np.ndarray with dtype complex128) and takes the absolute value and then squares
 /// it, element-wise.
 #[pyfunction]
 #[pyo3(text_signature = "a, /")]
-fn sqr_abs64_py<'py>(py: Python<'py>, a: PyReadonlyArrayDyn<'py, Complex64>) -> &'py PyArrayDyn<f64> {
+fn sqr_abs64_py<'py>(py: Python<'py>, a: PyReadonlyArrayDyn<'py, Complex64>) -> Bound<'py, PyArrayDyn<f64>> {
     let a = a.to_owned_array();
 
-    sqr_abs(a).into_pyarray(py)
+    sqr_abs(a).into_pyarray_bound(py)
 }
 
 /// Generates an array (np.ndarray with dtype float64) by sampling a Poisson distribution with
@@ -65,17 +68,17 @@ fn sqr_abs64_py<'py>(py: Python<'py>, a: PyReadonlyArrayDyn<'py, Complex64>) ->
 /// non-negative ints.
 #[pyfunction]
 #[pyo3(text_signature = "a, /")]
-fn pois_gen_py(py: Python, shape: PyObject, lambda: f64 ) -> PyResult<&PyArrayDyn<f64>> {
+fn pois_gen_py<'py>(py: Python<'py>, shape: PyObject, lambda: f64 ) -> PyResult<Bound<'py, PyArrayDyn<f64>>> {
     let shape_vec: Vec<usize> = shape.extract(py)?;
     let shape = shape_vec.as_slice();
 
     pois_gen(shape, lambda)
         .map_err(|e| PyValueError::new_err(format!("{}", e)))
-        .map(|a| a.into_pyarray(py))
+        .map(|a| a.into_pyarray_bound(py))
 }
 
 #[derive(Debug)]
-struct ToUsizeError {}
+pub struct ToUsizeError {}
 
 impl Display for ToUsizeError {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
@@ -88,12 +91,33 @@ impl Error for ToUsizeError {}
 
 /// Takes an ndarray (i32, dimension D) and splits every pixel value according to the
 /// binomial distribution (n, p) with n = element value and p = 0.5. Returns a single array.
-fn binom_split<D: Dimension>(mut a: Array<i32, D>) -> Result<Array<i32, D>, ToUsizeError> {
+pub fn binom_split<D: Dimension>(mut a: Array<i32, D>) -> Result<Array<i32, D>, ToUsizeError> {
     // AtomicBool is thread-safe, and allows for communicating an error state occurred across threads
     // We initialize it with the value false, since no error occurred
     let to_unsized_failed = AtomicBool::new(false);
     // We map all values in parallel, since they do not depend on each other
 
+    // let a = a.mapv(|i| {
+    //     if !to_unsized_failed.load(Ordering::Relaxed) {
+    //         // We use thread rng, which is fast
+    //         let mut rng = thread_rng();
+    //         // We try to convert i32 to u64 (which is required for Binomial)
+    //         // If it fails, we indicate a failure has occurred
+    //         // Unfortunately it is not possible to escape from the loop immediately
+    //         let n = u64::try_from(i).unwrap_or_else(|_| {
+    //             to_unsized_failed.store(true, Ordering::Relaxed);
+    //             0
+    //         });
+    //         // Since the input fit into an i32 and the binomial output is always less, we know the output
+    //         // will also fit into an i32
+    //         Binomial::new(n, 0.5).unwrap().sample(&mut rng) as i32
+    //     }
+    //     // We just keep the rest unchanged if a failure occurred
+    //     else {
+    //         i
+    //     }
+    // });
+
     a.par_mapv_inplace(|i| {
         // If no failure has occurred, we continue
         // We use Relaxed Ordering because the order in which stores and loads occur does not matter
@@ -108,6 +132,8 @@ fn binom_split<D: Dimension>(mut a: Array<i32, D>) -> Result<Array<i32, D>, ToUs
                 to_unsized_failed.store(true, Ordering::Relaxed);
                 0
             });
+            // Since the input fit into an i32 and the binomial output is always less, we know the output
+            // will also fit into an i32
             Binomial::new(n, 0.5).unwrap().sample(&mut rng) as i32
         }
         // We just keep the rest unchanged if a failure occurred
@@ -125,25 +151,35 @@ fn binom_split<D: Dimension>(mut a: Array<i32, D>) -> Result<Array<i32, D>, ToUs
 
 /// Takes an ndarray (dimension D and complex number of generic float F) and computes the absolute
 /// value and then square for each element.
-fn sqr_abs<D: Dimension, F: Float + Send + Sync>(mut a: Array<Complex<F>, D>) -> Array<F, D> {
-    // We map all values in parallel, since they do not depend on each other
-    // As there is no standard parallel map, we first map in place
-    a.par_mapv_inplace(|i| { let c: Complex<F> = Complex::from({
-        i.norm_sqr()
-    }); c });
-    // Then we get the real part
-    a.mapv(|i| {
-        i.re
-    })
+pub fn sqr_abs<D: Dimension, F: Float + Send + Sync>(a: Array<Complex<F>, D>) -> Array<F, D> {
+    let v: Vec<F> = a.par_iter().map(|i| i.norm_sqr()).collect();
+    let v_arr = Array::from_shape_vec(a.raw_dim(), v).unwrap();
+
+    v_arr
+
+    // a.mapv(|i| {
+    //     i.norm_sqr()
+    // })
 }
 
 /// Generates an ndarray (dynamic dimension) by sampling a Poisson distribution with parameter
 /// lambda for each element. Takes a lambda parameter (positive f64) and a shape slice.
-fn pois_gen(shape: &[usize], lambda: f64) -> Result<ArrayD<f64>, PoissonError> {
+pub fn pois_gen(shape: &[usize], lambda: f64) -> Result<ArrayD<f64>, PoissonError> {
     if lambda.is_sign_negative() || lambda.is_infinite() || lambda.is_nan() || lambda.is_zero() {
         return Err(PoissonError::ShapeTooSmall);
     }
 
+    // let size = shape.iter().product();
+    // let mut v = Vec::with_capacity(size);
+
+    // (0..size).into_par_iter().map(|_i| {
+    //     let mut rng = thread_rng();
+
+    //     Poisson::new(lambda).unwrap().sample(&mut rng)
+    // }).collect_into_vec(&mut v);
+
+    // let a = Array::from_shape_vec(shape, v).unwrap();
+
     let mut a = ArrayD::<f64>::from_elem(shape, lambda);
     a.par_mapv_inplace(|l| {
         let mut rng = thread_rng();
@@ -153,6 +189,13 @@ fn pois_gen(shape: &[usize], lambda: f64) -> Result<ArrayD<f64>, PoissonError> {
     Ok(a)
 }
 
+/// Panics if the input numbers cannot be cast to i32, use with caution!
+pub fn to_i32<D: Dimension, R: Real + Send + Sync>(a: ArrayView<R, D>) -> Array<i32, D> {
+    a.map(|l| {
+        l.to_i32().unwrap()
+    })
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;