Merge pull request #11 from robboyd/master

Pull in Rob B's updates to reporting rate to allow coarser time period

AugustT authored Oct 11, 2022
2 parents 84594ee + 8898a78 commit 02eed2a
Showing 10 changed files with 166 additions and 54 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
 Package: sparta
 Type: Package
 Title: Trend Analysis for Unstructured Data
-Version: 0.2.14
-Date: 2020-02-15
+Version: 0.2.18
+Date: 2020-03-27
 Authors@R: c(person("Tom", "August", role = c("aut", "cre"), email = "[email protected]"),
              person("Gary", "Powney", role = c("aut")),
              person("Charlie", "Outhwaite", role = c("aut")),
8 changes: 6 additions & 2 deletions R/dataGaps.r
@@ -4,6 +4,10 @@ dataGaps<-function(years, minmod, maxmod, mindat, maxdat){
   actualyears <- (years-1) + minmod
   middleyears <- subset(actualyears,actualyears %in% seq(mindat,maxdat,1))
   middleyears <- sort(unique(middleyears))
-  gap_middle <- max(diff(middleyears)-1)
+  if(length(middleyears)>1){
+    gap_middle <- max(diff(middleyears)-1)
+  } else {
+    gap_middle <- NA
+  }
   return(list(gap_start=gap_start,gap_end=gap_end,gap_middle=gap_middle))
 }
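The guard added here matters because R's diff() and max() behave badly on very short vectors. A minimal sketch of the failure mode the old line hit when only one "middle" year was present:

    middleyears <- c(2005)
    diff(middleyears)           # numeric(0): no pairs to difference
    max(diff(middleyears) - 1)  # -Inf, with a "no non-missing arguments to max" warning

Returning NA instead makes the no-gap-measurable case explicit for downstream code.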
67 changes: 54 additions & 13 deletions R/occDetFunc.r
@@ -50,6 +50,8 @@
 #' Other options are `EqualWt` or `HighSpec`, which define the application of "rules of thumb" defined in Pocock et al 2019.
 #' Defaults to 1, in which case the model is applied so long as there is a single record of the focal species.
 #' @param provenance An optional text string allowing the user to identify the dataset.
+#' @param rem_aggs_with_missing_regions An option which, if TRUE, will remove all aggregates which contain at least one region with no data.
+#' If FALSE, only aggregates where ALL regions in that aggregate contain no data are dropped. Defaults to TRUE.
 #'
 #' @details \code{modeltype} is used to choose the model as well as the associated initial values,
 #' and parameters to monitor. Elements to choose from can be separated into the following components:
@@ -179,7 +181,8 @@ occDetFunc <- function (taxa_name, occDetdata, spp_vis, n_iterations = 5000, nyr
                         seed = NULL, model.function = NULL, regional_codes = NULL,
                         region_aggs = NULL, additional.parameters = NULL,
                         additional.BUGS.elements = NULL, additional.init.values = NULL,
-                        return_data = FALSE, criterion = 1, provenance = NULL, saveMatrix = FALSE){
+                        return_data = FALSE, criterion = 1, provenance = NULL, saveMatrix = FALSE,
+                        rem_aggs_with_missing_regions = TRUE){
 
   ################## BASIC CHECKS
   # first run the error checks
@@ -192,8 +195,8 @@ occDetFunc <- function (taxa_name, occDetdata, spp_vis, n_iterations = 5000, nyr
   # Check the taxa_name is one of my species
   if(!taxa_name %in% colnames(spp_vis)) stop('taxa_name is not the name of a taxa in spp_vis')
   ##################
-  min_year <- min(occDetdata$TP)
 
+  min_year_original <- min_year <- min(occDetdata$TP)
   # only include sites which have more than nyr of records
   yps <- rowSums(acast(occDetdata, site ~ TP, length, value.var = 'L') > 0)
   sites_to_include <- names(yps[yps >= nyr])
@@ -357,6 +360,13 @@ occDetFunc <- function (taxa_name, occDetdata, spp_vis, n_iterations = 5000, nyr
   # need to get a measure of whether the species was on that site in that year, unequivocally, in zst
   zst <- acast(occDetdata, id ~ factor(TP), value.var = 'focal', max, fill = 0) # initial values for the latent state = observed state
 
+  # Calculate min year. We're doing it now as it's fine if we've dropped the first year(s) of data and nothing in the middle
+  min_year <- min(occDetdata$TP)
+  if(min_year != min_year_original){
+    warning(paste0('\nThe first year of the data has changed, as the first years of data were dropped.\n',
+                   'The original first year was ', min_year_original, '. It is now ', min_year, '\n'))
+  }
+
   # if the max_year is not null, edit the zst table to add the additional years required
   if(!is.null(max_year)){
 
@@ -493,16 +503,47 @@ occDetFunc <- function (taxa_name, occDetdata, spp_vis, n_iterations = 5000, nyr
     region_names <- setdiff(region_names, zero_sites)
 
     # remove region aggregates
-    rem_aggs <- unlist(lapply(region_aggs, FUN = function(x) any(zero_sites %in% x)))
-    rem_aggs_names <- names(region_aggs)[rem_aggs]
-
-    # remove aggs if you need to
-    if(length(rem_aggs_names) > 0){
-      warning(paste('The following region aggregates have to be removed as they contain a region with no data:',
-                    paste(rem_aggs_names, collapse = ', '),
-                    '- These region aggregates will not be included in the model'))
-      region_aggs <- region_aggs[!names(region_aggs) %in% rem_aggs_names]
-      parameters <- parameters[!parameters %in% paste0('psi.fs.r_', rem_aggs_names)]
-    }
+    if(rem_aggs_with_missing_regions){
+      rem_aggs <- unlist(lapply(region_aggs, FUN = function(x) any(zero_sites %in% x)))
+      rem_aggs_names <- names(region_aggs)[rem_aggs]
+
+      # remove aggs if you need to
+      if(length(rem_aggs_names) > 0){
+        warning(paste('The following region aggregates have to be removed as they contain a region with no data:',
+                      paste(rem_aggs_names, collapse = ', '),
+                      '- These region aggregates will not be included in the model\n',
+                      'If you want to keep aggregates with one or more missing regions,',
+                      'set rem_aggs_with_missing_regions=FALSE'))
+        region_aggs <- region_aggs[!names(region_aggs) %in% rem_aggs_names]
+        parameters <- parameters[!parameters %in% paste0('psi.fs.r_', rem_aggs_names)]
+      }
+    } else {
+      rem_aggs <- unlist(lapply(region_aggs, FUN = function(x) all(x %in% zero_sites)))
+      rem_aggs_names <- names(region_aggs)[rem_aggs]
+      edit_aggs <- unlist(lapply(region_aggs, FUN = function(x) any(zero_sites %in% x)))
+      edit_aggs_names <- names(region_aggs)[edit_aggs & !(rem_aggs)]
+
+      # edit aggs if you need to
+      if(length(edit_aggs_names) > 0){
+        warning(paste('The following region aggregates have to be edited as they contain regions with no data:',
+                      paste(edit_aggs_names, collapse = ', '),
+                      '\n- These region aggregates will still be included in the model\n'))
+        # Recreate aggs, removing regions without data
+        region_aggs_new <- lapply(region_aggs, FUN = function(agg){
+          agg[!(agg %in% zero_sites)]
+        })
+        names(region_aggs_new) <- names(region_aggs)
+        region_aggs <- region_aggs_new
+      }
+
+      # remove aggs completely if you need to
+      if(length(rem_aggs_names) > 0){
+        warning(paste('The following region aggregates have to be removed as all regions within them have no data:',
+                      paste(rem_aggs_names, collapse = ', '),
+                      '- These region aggregates will not be included in the model'))
+        region_aggs <- region_aggs[!names(region_aggs) %in% rem_aggs_names]
+        parameters <- parameters[!parameters %in% paste0('psi.fs.r_', rem_aggs_names)]
+      }
+    }
   }
 
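To illustrate the two behaviours of the new rem_aggs_with_missing_regions switch, here is a minimal sketch using hypothetical region names (not from the package's test data):

    zero_sites  <- c('region2')                          # regions with no data
    region_aggs <- list(agg1 = c('region1', 'region2'),  # partly empty aggregate
                        agg2 = c('region2'))             # entirely empty aggregate

    # TRUE (default): any aggregate touching an empty region is dropped
    unlist(lapply(region_aggs, function(x) any(zero_sites %in% x)))  # agg1 TRUE, agg2 TRUE

    # FALSE: only fully empty aggregates are dropped; the rest are trimmed
    unlist(lapply(region_aggs, function(x) all(x %in% zero_sites)))  # agg1 FALSE, agg2 TRUE

Under the FALSE setting, agg1 would be kept but reduced to c('region1').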
23 changes: 19 additions & 4 deletions R/occDetModel.r
@@ -10,6 +10,13 @@
 #' @param site A character vector of site names, as long as the number of observations.
 #' @param survey A vector as long as the number of observations.
 #' This must be a Date if includeJDay = \code{TRUE}
+#' @param replicate An optional vector to identify replicate samples (visits) per survey. Need not be globally unique (e.g. can be 1, 2, ..., n within surveys).
+#' @param closure_period An optional vector of integers specifying the closure period.
+#' If \code{NULL} then closure_period will be extracted as the year from the survey.
+#' @param criterion Determines whether the model should be run. If an integer then this defines the threshold number of records (50 in Outhwaite et al 2019).
+#' Other options are `EqualWt` or `HighSpec`, which define the application of "rules of thumb" defined in Pocock et al 2019.
+#' Defaults to 1, in which case the model is applied so long as there is a single record of the focal species.
+#' @param provenance An optional text string allowing the user to identify the dataset.
 #' @param species_list A character vector of taxa names for which models should be run. This is
 #' optional and by default models will be run for all taxa
 #' @param write_results logical, should results be saved to \code{output_dir}. This is
@@ -168,11 +175,15 @@
 #'   region_aggs = list(agg1 = c('region1', 'region2')))
 #' }
 #' @export
-#' @references Roy, H.E., Adriaens, T., Isaac, N.J.B. et al. (2012) Invasive alien predator
-#' causes rapid declines of native European ladybirds. Diversity & Distributions,
-#' 18, 717-725.
+#' @references Isaac, N.J.B., van Strien, A.J., August, T.A., de Zeeuw, M.P. and Roy, D.B. (2014).
+#' Statistics for citizen science: extracting signals of change from noisy ecological data.
+#' \emph{Methods in Ecology and Evolution}, 5: 1052-1060.
+#' @references Outhwaite, C.L., Chandler, R.E., Powney, G.D., Collen, B., Gregory, R.D. & Isaac, N.J.B. (2018).
+#' Prior specification in Bayesian occupancy modelling improves analysis of species occurrence data.
+#' \emph{Ecological Indicators}, 93: 333-343.
 
-occDetModel <- function(taxa, site, survey,
+occDetModel <- function(taxa, site, survey,
+                        replicate = NULL, closure_period = NULL, criterion = 1, provenance = NULL,
                         species_list = unique(taxa), write_results = TRUE,
                         output_dir = getwd(), nyr = 2, n_iterations = 5000,
                         burnin = 1500, thinning = 3, n_chains = 3,
@@ -202,6 +213,8 @@ occDetModel <- function(taxa, site, survey,
   visitData <- formatOccData(taxa = taxa,
                              site = site,
                              survey = survey,
+                             replicate = replicate,
+                             closure_period = closure_period,
                              includeJDay = includeJDay)
 
   ### loop through the species list running the Bayesian occupancy model function ###
@@ -228,6 +241,8 @@ occDetModel <- function(taxa, site, survey,
                          additional.parameters = additional.parameters,
                          additional.BUGS.elements = additional.BUGS.elements,
                          additional.init.values = additional.init.values,
+                         provenance = provenance,
+                         criterion = criterion,
                          return_data = return_data)
   }
 
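A hedged usage sketch of the extended occDetModel() signature; myData and its column names are illustrative only, not from the package:

    results <- occDetModel(taxa = myData$species,
                           site = myData$site,
                           survey = myData$date,
                           replicate = myData$visit,        # visit IDs, unique within surveys
                           closure_period = myData$period,  # e.g. 1, 1, 2, 2, ... for multi-year periods
                           criterion = 1,
                           provenance = 'my recording scheme',
                           write_results = FALSE)

When closure_period is left NULL the year is extracted from the survey dates, reproducing the previous annual behaviour.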
33 changes: 23 additions & 10 deletions R/plot_DetectionOverTime.R
@@ -21,8 +21,18 @@
 #' @export
 
 
-plot_DetectionOverTime <- function(model, spname = NULL, min.yr = NULL){
+plot_DetectionOverTime <- function(model, spname = NULL, min.yr = NULL, CI = 95){
+
+  # convert the CI into quantiles
+  # first check that CI is sensible
+  if((CI > 100) | (CI <= 0)) stop("Credible intervals must be between 0 and 100")
+
+  CI2q <- function(CI) {
+    q <- (1 - CI/100)/2
+    return(c(q, 1-q))
+  }
+  quant <- CI2q(CI)
+
   sims_list <- model$BUGSoutput$sims.list
 
   # the base: alpha.p is common to all models:
@@ -47,11 +57,14 @@ plot_DetectionOverTime <- function(model, spname = NULL, min.yr = NULL, CI = 95){
     # the model was fitted with categorical list length
     pDet2 <- pDet1 + sims_list$dtype2.p[,1]
     pDet4 <- pDet1 + sims_list$dtype3.p[,1]
-  }
-  # there is also an option to ignore list length,
-  # in which case the probability of detection is assumed to be constant across surveys
-  # i.e. if the survey was systematic
-
+  } else {
+    # there is also an option to ignore list length,
+    # in which case the probability of detection is assumed to be constant across surveys
+    # i.e. if the survey was systematic
+    # in this case there are no estimates for the other pDet values.
+    pDet2 <- pDet4 <- NA
+  }
 
   pDet <- melt(list(pDet1, pDet2, pDet4))
   names(pDet) <- c("it", "year", "lgt_pDet", "ListLength")
   pDet$ListLength[pDet$ListLength==3] <- 4 # the "third" category is for a list of length 4
@@ -61,9 +74,9 @@ plot_DetectionOverTime <- function(model, spname = NULL, min.yr = NULL, CI = 95){
   # now summarize these posterior distributions
   pDet_summary <- ddply(
     pDet, .(year, ListLength), summarise,
-    mean_pDet = mean(pDet),
-    lower95CI = quantile(pDet, 0.025),
-    upper95CI = quantile(pDet, 0.975))
+    mean_pDet = mean(pDet, na.rm = TRUE),
+    lower95CI = quantile(pDet, quant[1], na.rm = TRUE),
+    upper95CI = quantile(pDet, quant[2], na.rm = TRUE))
 
   # if the user has supplied a year then switch the x axis to start at that minimum
   if(!is.null(min.yr)) pDet_summary$year <- pDet_summary$year + min.yr - 1
@@ -75,7 +88,7 @@ plot_DetectionOverTime <- function(model, spname = NULL, min.yr = NULL, CI = 95){
     ylab("Detection probability") +
     ggtitle(spname) +
     theme_bw()
-  gp
+  gp
 
 }
 
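For the new CI argument, the conversion to quantiles works as in CI2q() above: an 80% credible interval maps to the 0.1 and 0.9 quantiles, since (1 - 80/100)/2 = 0.1. A hedged example call, assuming model is an object returned by occDetFunc:

    plot_DetectionOverTime(model, spname = 'Example species', min.yr = 1990, CI = 80)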
31 changes: 18 additions & 13 deletions R/reportingRateModel.r
@@ -5,9 +5,12 @@
 #' @param taxa A character vector of taxon names, as long as the number of observations.
 #' @param site A character vector of site names, as long as the number of observations.
 #' @param time_period A numeric vector of user defined time periods, or a date vector,
-#' as long as the number of observations.
+#' as long as the number of observations. Dates should be used where one wants to fit models at the "visit"
+#' (typically one day) level. To fit models at coarser temporal resolutions, \code{time_period}
+#' should be numeric. Note that duplicate site x time_period x species records are dropped, so
+#' your data will be substantially degraded if broad time periods are used.
 #' @param list_length Logical, if \code{TRUE} then list length is added to the models as a fixed
-#' effect. Note that since list_length is a property of each visit the model will run as
+#' effect. Note that since list_length is a property of each visit (or coarser period) the model will run as
 #' a binomial model rather than as a Bernoulli model.
 #' @param site_effect Logical, if \code{TRUE} then site is added to the models as a random
 #' effect.
@@ -91,7 +94,7 @@

 reportingRateModel <- function(taxa, site, time_period, list_length = FALSE, site_effect = FALSE,
                                species_to_include = unique(taxa), overdispersion = FALSE,
-                               verbose = FALSE, family = 'Binomial', print_progress = FALSE) {
+                               verbose = FALSE, family = 'Binomial', print_progress = TRUE) {
 
   # Do error checks
   errorChecks(taxa = taxa, site = site, time_period = time_period, list_length = list_length,
@@ -100,12 +103,12 @@ reportingRateModel <- function(taxa, site, time_period, list_length = FALSE, sit

   # Create dataframe from vectors
   taxa_data <- distinct(data.frame(taxa, site, time_period))
 
   # Reshape the data so that it is suitable for model
   space_time <- dcast(taxa_data, time_period + site ~ ., value.var='taxa',
                       fun.aggregate = function(x) length(unique(x)))
   names(space_time)[ncol(space_time)] <- 'listLength'
 
   # time_period could be a numeric or a date. If it is a date extract the year
   if('POSIXct' %in% class(time_period) | 'Date' %in% class(time_period)){
 
@@ -114,20 +117,22 @@ reportingRateModel <- function(taxa, site, time_period, list_length = FALSE, sit
     intercept_year <- median(unique(as.numeric(space_time$year)))
     space_time$year <- space_time$year - intercept_year
 
-  } else{
-
-    stop('non-dates not yet supported for time_period')
-
+  } else {
+    # note that the term "year" is used loosely; may be a time period of any length
+    space_time$year <- space_time$time_period
+    intercept_year <- median(unique(space_time$year))
+    space_time$year <- space_time$year - intercept_year
+
   }
 
   model_formula <- formulaBuilder(family,
                                   list_length,
                                   site_effect,
                                   overdispersion)
 
   counter <- data.frame(species = species_to_include,
                         count = 1:length(species_to_include))
 
   # Run an apply across all species which undertakes the modelling
   all_coefs <- lapply(species_to_include, function(species_name){ # the sort ensures species are done in order
 
@@ -156,10 +161,10 @@ reportingRateModel <- function(taxa, site, time_period, list_length = FALSE, sit
     }
 
   })
 
   # Bind the results of the apply into a data.frame
   coef_df <- do.call(rbind.fill, all_coefs)
 
   # Get some attributes of the dataset
   attributes(coef_df) <- c(attributes(coef_df), list(intercept_year = intercept_year,
                                                      min_year = min(unique(as.numeric(space_time$year))),
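A hedged sketch of the headline feature of this PR, fitting reporting-rate models at a coarser temporal resolution by passing numeric time periods; taxa_vec and site_vec are illustrative placeholders:

    # five closure periods instead of visit-level dates
    period <- rep(1:5, length.out = length(taxa_vec))
    rr <- reportingRateModel(taxa = taxa_vec,
                             site = site_vec,
                             time_period = period,  # numeric => coarser than visit level
                             list_length = TRUE,
                             site_effect = TRUE)
    # note: duplicate site x time_period x species records are dropped internally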
6 changes: 5 additions & 1 deletion man/occDetFunc.Rd

Some generated files are not rendered by default.