Skip to content

Commit

Permalink
Data pre-processing steps
Browse files Browse the repository at this point in the history
  • Loading branch information
pwinskill committed Oct 31, 2024
1 parent 613fd94 commit 2c6a47e
Show file tree
Hide file tree
Showing 5 changed files with 201 additions and 5 deletions.
3 changes: 3 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.2
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
124 changes: 124 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
data_complete <- function(data, ...){
site_names <- rlang::enquos(...)

# Complete all site x time combinations
data <- data |>
tidyr::complete(!!!site_names, t, fill = list(n = NA)) |>
dplyr::group_by(!!!site_names)|>
dplyr::mutate(
lat = dplyr::first(lat, na_rm = TRUE),
lon = dplyr::first(lon, na_rm = TRUE)
) |>
dplyr::ungroup()

return(data)
}

data_missing <- function(data, ...){
site_names <- rlang::enquos(...)

sites_to_drop_n <- data |>
dplyr::group_by(!!!site_names) |>
dplyr::filter(
all(is.na(n))
) |>
dplyr::distinct(!!!site_names)

sites_to_drop_lat_lon <- data |>
dplyr::group_by(!!!site_names) |>
dplyr::filter(
all(is.na(lat)) | all(is.na(lon))
) |>
dplyr::distinct(!!!site_names)

sites_to_drop_no_cases <- data |>
dplyr::group_by(!!!site_names) |>
dplyr::filter(
sum(n, na.rm = TRUE) == 0
) |>
dplyr::distinct(!!!site_names)

sites_to_drop <-
sites_to_drop_n |>
dplyr::bind_rows(sites_to_drop_lat_lon) |>
dplyr::bind_rows(sites_to_drop_no_cases) |>
dplyr::distinct()


cat("Sites dropped as all data missing, or all counts = 0: ")
knitr::kable(sites_to_drop, format = "pipe", align = "c") |>
print()

data <- data |>
dplyr::anti_join(
sites_to_drop,
by = dplyr::join_by(...)
)

return(data)
}

data_observed_summmary <- function(data, ...){
site_names <- rlang::enquos(...)

data <- data |>
dplyr::group_by(!!!site_names) |>
dplyr::mutate(
observed_mu = log(mean(n, na.rm = T)),
observed_sigmasq = log((sd(n, na.rm = T) / mean(n, na.rm = T))^2 + 1)
) |>
dplyr::ungroup() |>
dplyr::mutate(
observed_mu = ifelse(is.na(observed_mu), mean(observed_mu, na.rm = T), observed_mu),
observed_sigmasq = ifelse(is.na(observed_sigmasq), mean(observed_sigmasq, na.rm = T), observed_sigmasq)
)

return(data)
}

data_initial_par <- function(data){
data <- data |>
dplyr::mutate(
start_par = log(n + 1),
start_par = ifelse(is.na(n), observed_mu, start_par)
)

return(data)
}

data_order_index <- function(data, ...){
site_names <- rlang::enquos(...)

data <- data |>
dplyr::arrange(
!!!site_names,
t
) |>
dplyr::group_by(!!!site_names)|>
dplyr::mutate(id = dplyr::cur_group_id()) |>
dplyr::ungroup()

return(data)
}


data_process <- function(data, ...){
if(!all(c("t", "n", "lat", "lon") %in% colnames(data))){
stop("Input data must include the following columns: t, n, lat and lon"
)
}

if("id" %in% colnames(data)){
stop("The column name 'id' is protected and cannot be used in the input data")
}

data <- data |>
data_complete(...) |>
data_missing(...) |>
data_observed_summmary(...) |>
data_initial_par() |>
data_order_index() |>
data_order_index(...)

return(data)
}
16 changes: 11 additions & 5 deletions R/matrix.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
create_time_matrix <- function(times, periodic_scale, long_term_scale, period, epsilon = 0.01){
create_time_matrix <- function(data, periodic_scale, long_term_scale, period, epsilon = 0.01){
times <- unique(data$t)
time_distance <- outer(times, times, "-")
time <- time_distance |>
periodic_kernel(periodic_scale, long_term_scale, period)
Expand All @@ -19,20 +20,25 @@ create_time_matrix <- function(times, periodic_scale, long_term_scale, period, e
)
}

create_spatial_matrix <- function(coords, sigma_sq, space_sigma, epsilon = 0.01){
spatial_distance <- coords |>
create_spatial_matrix <- function(data, space_sigma, epsilon = 0.01){
coordinates <- data |>
dplyr::select(lat, lon) |>
dplyr::distinct()

spatial_distance <- coordinates |>
dist() |>
as.matrix()

space <- spatial_distance |>
rbf_kernel(space_sigma)

# Create D_space
d_space <- diag(sqrt(sigma_sq))
d_space <- diag(sqrt(unique(data$observed_sigmasq)))
# Compute Sigma_space
space <- d_space %*% space %*% d_space
space <- Matrix::nearPD(space)
space <- as.matrix(space$mat)
space_reg <- space + epsilon * diag(nrow(coords))
space_reg <- space + epsilon * diag(nrow(coordinates))
space_inv_reg <- MASS::ginv(space_reg)
space_chol <- chol(space)

Expand Down
12 changes: 12 additions & 0 deletions tests/testthat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
# * https://testthat.r-lib.org/articles/special-files.html

library(testthat)
library(weave)

test_check("weave")
51 changes: 51 additions & 0 deletions tests/testthat/test-data-processing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
test_that("data completed correctly", {
input_data <-
data.frame(
admin1 = rep("A", 6),
admin2 = c(rep("A", 4), rep("B", 2)),
t = c(1:4, 1:2),
n = c(1, 2, 3, NA, 4, NA),
lat = c(1, NA, NA, NA, 2, NA),
lon = c(1, NA, NA, NA, 2, NA)
)

output_data <- data_complete(input_data, admin1, admin2)

expect_identical(
as.data.frame(output_data),
data.frame(
admin1 = rep("A", 8),
admin2 = c(rep("A", 4), rep("B", 4)),
t = rep(1:4, 2),
n = c(1, 2, 3, NA, 4, NA, NA, NA),
lat = c(rep(1, 4), rep(2, 4)),
lon = c(rep(1, 4), rep(2, 4))
)
)
})

test_that("missing data dropped correctly", {
input_data <-
data.frame(
admin1 = rep("A", 6),
admin2 = c(rep("A", 4), rep("B", 2)),
t = c(1:4, 1:2),
n = c(1, 2, 3, NA, NA, NA),
lat = c(1, NA, NA, NA, 2, NA),
lon = c(1, NA, NA, NA, 2, NA)
)

output_data <- data_missing(input_data, admin1, admin2)

expect_identical(
as.data.frame(output_data),
data.frame(
admin1 = rep("A", 4),
admin2 = rep("A", 4),
t = 1:4,
n = c(1, 2, 3, NA),
lat = c(1, NA, NA, NA),
lon = c(1, NA, NA, NA)
)
)
})

0 comments on commit 2c6a47e

Please sign in to comment.