Data pre-processing steps

mrc-ide · Oct 31, 2024 · 2c6a47e · 2c6a47e
1 parent 613fd94
commit 2c6a47e
Show file tree

Hide file tree

Showing 5 changed files with 201 additions and 5 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,3 +10,6 @@ License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.2
+Suggests: 
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,124 @@
+data_complete <- function(data, ...){
+  site_names <- rlang::enquos(...)
+
+  # Complete all site x time combinations
+  data <- data |>
+    tidyr::complete(!!!site_names, t, fill = list(n = NA)) |>
+    dplyr::group_by(!!!site_names)|>
+    dplyr::mutate(
+      lat = dplyr::first(lat, na_rm = TRUE),
+      lon = dplyr::first(lon, na_rm = TRUE)
+    ) |>
+    dplyr::ungroup()
+
+  return(data)
+}
+
+data_missing <- function(data, ...){
+  site_names <- rlang::enquos(...)
+
+  sites_to_drop_n <- data |>
+    dplyr::group_by(!!!site_names) |>
+    dplyr::filter(
+      all(is.na(n))
+    ) |>
+    dplyr::distinct(!!!site_names)
+
+  sites_to_drop_lat_lon <- data |>
+    dplyr::group_by(!!!site_names) |>
+    dplyr::filter(
+      all(is.na(lat)) | all(is.na(lon))
+    ) |>
+    dplyr::distinct(!!!site_names)
+
+  sites_to_drop_no_cases <- data |>
+    dplyr::group_by(!!!site_names) |>
+    dplyr::filter(
+      sum(n, na.rm = TRUE) == 0
+    ) |>
+    dplyr::distinct(!!!site_names)
+
+  sites_to_drop <-
+    sites_to_drop_n |>
+    dplyr::bind_rows(sites_to_drop_lat_lon) |>
+    dplyr::bind_rows(sites_to_drop_no_cases) |>
+    dplyr::distinct()
+
+
+  cat("Sites dropped as all data missing, or all counts = 0: ")
+  knitr::kable(sites_to_drop, format = "pipe", align = "c") |>
+    print()
+
+  data <- data |>
+    dplyr::anti_join(
+      sites_to_drop,
+      by = dplyr::join_by(...)
+    )
+
+  return(data)
+}
+
+data_observed_summmary <- function(data, ...){
+  site_names <- rlang::enquos(...)
+
+  data <- data |>
+    dplyr::group_by(!!!site_names) |>
+    dplyr::mutate(
+      observed_mu = log(mean(n, na.rm = T)),
+      observed_sigmasq = log((sd(n, na.rm = T) / mean(n, na.rm = T))^2 + 1)
+    ) |>
+    dplyr::ungroup() |>
+    dplyr::mutate(
+      observed_mu = ifelse(is.na(observed_mu), mean(observed_mu, na.rm = T), observed_mu),
+      observed_sigmasq = ifelse(is.na(observed_sigmasq), mean(observed_sigmasq, na.rm = T), observed_sigmasq)
+    )
+
+  return(data)
+}
+
+data_initial_par <- function(data){
+  data <- data |>
+    dplyr::mutate(
+      start_par = log(n + 1),
+      start_par = ifelse(is.na(n), observed_mu, start_par)
+    )
+
+  return(data)
+}
+
+data_order_index <- function(data, ...){
+  site_names <- rlang::enquos(...)
+
+  data <- data |>
+    dplyr::arrange(
+      !!!site_names,
+      t
+    ) |>
+    dplyr::group_by(!!!site_names)|>
+    dplyr::mutate(id = dplyr::cur_group_id()) |>
+    dplyr::ungroup()
+
+  return(data)
+}
+
+
+data_process <- function(data, ...){
+  if(!all(c("t", "n", "lat", "lon") %in% colnames(data))){
+    stop("Input data must include the following columns: t, n, lat and lon"
+    )
+  }
+
+  if("id" %in% colnames(data)){
+    stop("The column name 'id' is protected and cannot be used in the input data")
+  }
+
+  data <- data |>
+    data_complete(...) |>
+    data_missing(...) |>
+    data_observed_summmary(...) |>
+    data_initial_par() |>
+    data_order_index() |>
+    data_order_index(...)
+
+  return(data)
+}
diff --git a/R/matrix.R b/R/matrix.R
@@ -1,4 +1,5 @@
-create_time_matrix <- function(times, periodic_scale, long_term_scale, period, epsilon = 0.01){
+create_time_matrix <- function(data, periodic_scale, long_term_scale, period, epsilon = 0.01){
+  times <- unique(data$t)
   time_distance <- outer(times,  times, "-")
   time <- time_distance |>
     periodic_kernel(periodic_scale, long_term_scale, period)
@@ -19,20 +20,25 @@ create_time_matrix <- function(times, periodic_scale, long_term_scale, period, e
   )
 }
 
-create_spatial_matrix <- function(coords, sigma_sq, space_sigma, epsilon = 0.01){
-  spatial_distance <- coords |>
+create_spatial_matrix <- function(data, space_sigma, epsilon = 0.01){
+  coordinates <- data |>
+    dplyr::select(lat, lon) |>
+    dplyr::distinct()
+
+  spatial_distance <- coordinates |>
     dist() |>
     as.matrix()
+
   space <- spatial_distance |>
     rbf_kernel(space_sigma)
 
   # Create D_space
-  d_space <- diag(sqrt(sigma_sq))
+  d_space <- diag(sqrt(unique(data$observed_sigmasq)))
   # Compute Sigma_space
   space <- d_space %*% space %*% d_space
   space <- Matrix::nearPD(space)
   space <- as.matrix(space$mat)
-  space_reg <- space + epsilon * diag(nrow(coords))
+  space_reg <- space + epsilon * diag(nrow(coordinates))
   space_inv_reg <- MASS::ginv(space_reg)
   space_chol <- chol(space)
 

diff --git a/tests/testthat.R b/tests/testthat.R
@@ -0,0 +1,12 @@
+# This file is part of the standard setup for testthat.
+# It is recommended that you do not modify it.
+#
+# Where should you do additional test configuration?
+# Learn more about the roles of various files in:
+# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
+# * https://testthat.r-lib.org/articles/special-files.html
+
+library(testthat)
+library(weave)
+
+test_check("weave")
diff --git a/tests/testthat/test-data-processing.R b/tests/testthat/test-data-processing.R
@@ -0,0 +1,51 @@
+test_that("data completed correctly", {
+  input_data <-
+    data.frame(
+      admin1 = rep("A", 6),
+      admin2 = c(rep("A", 4), rep("B", 2)),
+      t = c(1:4, 1:2),
+      n = c(1, 2, 3, NA, 4, NA),
+      lat = c(1, NA, NA, NA, 2, NA),
+      lon = c(1, NA, NA, NA, 2, NA)
+    )
+
+  output_data <- data_complete(input_data, admin1, admin2)
+
+  expect_identical(
+    as.data.frame(output_data),
+    data.frame(
+      admin1 = rep("A", 8),
+      admin2 = c(rep("A", 4), rep("B", 4)),
+      t = rep(1:4, 2),
+      n = c(1, 2, 3, NA, 4, NA, NA, NA),
+      lat = c(rep(1, 4), rep(2, 4)),
+      lon = c(rep(1, 4), rep(2, 4))
+    )
+  )
+})
+
+test_that("missing data dropped correctly", {
+  input_data <-
+    data.frame(
+      admin1 = rep("A", 6),
+      admin2 = c(rep("A", 4), rep("B", 2)),
+      t = c(1:4, 1:2),
+      n = c(1, 2, 3, NA, NA, NA),
+      lat = c(1, NA, NA, NA, 2, NA),
+      lon = c(1, NA, NA, NA, 2, NA)
+    )
+
+  output_data <- data_missing(input_data, admin1, admin2)
+
+  expect_identical(
+    as.data.frame(output_data),
+    data.frame(
+      admin1 = rep("A", 4),
+      admin2 = rep("A", 4),
+      t = 1:4,
+      n = c(1, 2, 3, NA),
+      lat = c(1, NA, NA, NA),
+      lon = c(1, NA, NA, NA)
+    )
+  )
+})