diff --git a/.gitignore b/.gitignore index 44c4ccb..39d43e9 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,7 @@ # etc frescalo.exe sparta.Rproj - misc/ /doc/ /Meta/ +create_unicorn.r \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 3e9a36b..1ce2e5b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -52,6 +52,6 @@ VignetteBuilder: knitr Encoding: UTF-8 LazyData: TRUE -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 SystemRequirements: JAGS (https://sourceforge.net/projects/mcmc-jags/files/JAGS/) diff --git a/NAMESPACE b/NAMESPACE index ce0b547..82a296a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,3 +52,4 @@ importFrom(plyr,rbind.fill) importFrom(reshape2,acast) importFrom(reshape2,dcast) importFrom(reshape2,melt) +importFrom(runjags,testjags) diff --git a/R/detect_jags.r b/R/detect_jags.r new file mode 100644 index 0000000..e6ae445 --- /dev/null +++ b/R/detect_jags.r @@ -0,0 +1,5 @@ +# Internal function to detect JAGS installation. +#' @importFrom runjags testjags +detect_jags <- function(){ + return(suppressWarnings(runjags::testjags(silent = TRUE)$JAGS.found)) +} \ No newline at end of file diff --git a/R/formatOccData.r b/R/formatOccData.r index 6704264..7649118 100644 --- a/R/formatOccData.r +++ b/R/formatOccData.r @@ -87,13 +87,11 @@ #' survey = survey, #' closure_period = closure_period) #' -#' # format the unicorns data +#' # OR format the unicorns data +#' formatted_data <- formatOccData(taxa = unicorns$species, +#' survey = unicorns$start_date, +#' site = unicorns$site) #' -#' unicorns <- unicorns[complete.cases(unicorns$kmsq), ] -#' -#' formatted_data <- formatOccData(taxa = unicorns$CONCEPT, -#' survey = unicorns$Date, -#' site = unicorns$kmsq) #'} #' #' @export diff --git a/R/frescalo.R b/R/frescalo.R index a6c95d4..7014e8f 100644 --- a/R/frescalo.R +++ b/R/frescalo.R @@ -151,17 +151,17 @@ #' # Load data #' data(unicorns) #' -#' # Run frescalo (data is save to the working directory as sinkdir is not given) #' fres_out <- frescalo(Data = unicorns, +#' frespath = file.path(getwd(), "Frescalo.exe"), #' time_periods = data.frame(start=c(1980,1990),end=c(1989,1999)), -#' site_col = 'hectad', -#' sp_col = 'CONCEPT', -#' start_col = 'TO_STARTDATE', -#' end_col = 'Date') +#' site_col = 'site', +#' sp_col = 'species', +#' start_col = 'start_date', +#' end_col = 'end_date') #'} frescalo <- - function(Data,#your Data (.rdata files) as a file path (or list of file paths) + function(Data, #your Data as a dataframe object frespath, #path to the exe time_periods, #a list of vector pairs used in frescalo (ie 'c((1990,1995),(1996,2000))') site_col, # name of site column diff --git a/R/frescalo_checks.r b/R/frescalo_checks.r index 816a640..79d567a 100644 --- a/R/frescalo_checks.r +++ b/R/frescalo_checks.r @@ -8,7 +8,7 @@ frescalo_checks <- function(site_col, sp_col, year_col, start_col, end_col, # Check column names are in the data new.colnames <- c(site_col,sp_col,year_col,start_col,end_col) missingColNames <- new.colnames[!new.colnames %in% names(Data)] - if(length(missingColNames) > 0) stop(paste(unlist(missingColNames),'is not the name of a column in data')) + if(length(missingColNames) > 0) stop(paste(unlist(missingColNames),'is not the name of a column in data\n\n')) # Remove columns that are not needed Data <- Data[,names(Data) %in% new.colnames] diff --git a/R/unicorns-data.r b/R/unicorns-data.r index 85ae5f0..8d67a7c 100644 --- a/R/unicorns-data.r +++ b/R/unicorns-data.r @@ -1,6 +1,6 @@ #' @name unicorns #' @title A fictional dataset of unicorn sightings -#' @description This is a fictional occurrence dataset of 70 species of unicorn in the UK. +#' @description This is a fictional occurrence dataset of 20 species of unicorn in the UK. The dataset has column names, start_date, end_date, site and species respectively, corresponding to the start datetime of a occurence, end datetime of the occurence, site ID, and species ID. #' @docType data #' @usage unicorns #' @author Tom August, 2015-07-01 diff --git a/data/unicorns.rda b/data/unicorns.rda index 9ba0589..5c65d70 100644 Binary files a/data/unicorns.rda and b/data/unicorns.rda differ diff --git a/man/formatOccData.Rd b/man/formatOccData.Rd index bc1972c..bf07025 100644 --- a/man/formatOccData.Rd +++ b/man/formatOccData.Rd @@ -100,13 +100,11 @@ formatted_data <- formatOccData(taxa = taxa, survey = survey, closure_period = closure_period) -# format the unicorns data +# OR format the unicorns data +formatted_data <- formatOccData(taxa = unicorns$species, + survey = unicorns$start_date, + site = unicorns$site) -unicorns <- unicorns[complete.cases(unicorns$kmsq), ] - -formatted_data <- formatOccData(taxa = unicorns$CONCEPT, - survey = unicorns$Date, - site = unicorns$kmsq) } } diff --git a/man/frescalo.Rd b/man/frescalo.Rd index 027c112..a9bea8e 100644 --- a/man/frescalo.Rd +++ b/man/frescalo.Rd @@ -193,13 +193,13 @@ returned object gives a useful summary. # Load data data(unicorns) -# Run frescalo (data is save to the working directory as sinkdir is not given) fres_out <- frescalo(Data = unicorns, + frespath = file.path(getwd(), "Frescalo.exe"), time_periods = data.frame(start=c(1980,1990),end=c(1989,1999)), - site_col = 'hectad', - sp_col = 'CONCEPT', - start_col = 'TO_STARTDATE', - end_col = 'Date') + site_col = 'site', + sp_col = 'species', + start_col = 'start_date', + end_col = 'end_date') } } \references{ diff --git a/man/sparta.Rd b/man/sparta.Rd index 7ec9583..5a1b20b 100644 --- a/man/sparta.Rd +++ b/man/sparta.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/sparta.r \docType{package} \name{sparta} +\alias{sparta-package} \alias{sparta} \title{\pkg{sparta} Trend Analysis for Unstructured Data} \description{ @@ -11,3 +12,27 @@ include Frescalo, Telfer's change index, Reporting rate models and Bayesian Occupancy models. These methods are reviewed in Issac et al (2014), available at \url{http://onlinelibrary.wiley.com/doi/10.1111/2041-210X.12254/abstract} } +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/BiologicalRecordsCentre/sparta} + \item Report bugs at \url{https://github.com/BiologicalRecordsCentre/sparta/issues} +} + +} +\author{ +\strong{Maintainer}: Dylan Carbone \email{dylcar@ceh.ac.uk} (\href{https://orcid.org/0009-0003-5290-786X}{ORCID}) + +Authors: +\itemize{ + \item Tom August \email{tomaug@ceh.ac.uk} (\href{https://orcid.org/0000-0003-1116-3385}{ORCID}) + \item Gary Powney (\href{https://orcid.org/0000-0003-3313-7786}{ORCID}) + \item Charlie Outhwaite (\href{https://orcid.org/0000-0001-9997-6780}{ORCID}) + \item Jack Hatfield (\href{https://orcid.org/0000-0002-6361-0629}{ORCID}) + \item Nick Isaac (\href{https://orcid.org/0000-0002-4869-8052}{ORCID}) + \item Francesca Mancini (\href{https://orcid.org/0000-0003-4085-4978}{ORCID}) + \item Mark Hill (\href{https://orcid.org/0000-0002-3323-4458}{ORCID}) + \item Colin Harrower (\href{https://orcid.org/0000-0001-5070-5293}{ORCID}) +} + +} diff --git a/man/unicorns.Rd b/man/unicorns.Rd index cea4da9..57a08c4 100644 --- a/man/unicorns.Rd +++ b/man/unicorns.Rd @@ -8,7 +8,7 @@ unicorns } \description{ -This is a fictional occurrence dataset of 70 species of unicorn in the UK. +This is a fictional occurrence dataset of 20 species of unicorn in the UK. The dataset has column names, start_date, end_date, site and species respectively, corresponding to the start datetime of a occurence, end datetime of the occurence, site ID, and species ID. } \author{ Tom August, 2015-07-01 diff --git a/pre_vignette/sparta_vignette.Rmd b/pre_vignette/sparta_vignette.Rmd index 2761dca..e84f51f 100644 --- a/pre_vignette/sparta_vignette.Rmd +++ b/pre_vignette/sparta_vignette.Rmd @@ -189,7 +189,7 @@ head(telfer_results) The reporting rates models in sparta are all either GLMs or GLMMs with year as a continuous covariate but are flexible, giving the user a number of options for their analyses. These options include the addition of covariates to account for biases in the data including a random site effect and fixed effect of list length. -In [Isaac et al (2014)](http://onlinelibrary.wiley.com/doi/10.1111/2041-210X.12254/abstract) it was shown that reporting rate models can be susceptible to type 1 errors under certain scenarios and that with site and list length covariates the models performed better when the data were bias. These methods were found to out perform simple methods like Telfer. +In [Isaac et al (2014)](http://onlinelibrary.wiley.com/doi/10.1111/2041-210X.12254/abstract) it was shown that reporting rate models can be susceptible to type 1 errors under certain scenarios and that with site and list length covariates the models performed better when the data were biased. These methods were found to out perform simple methods like Telfer. The common feature among these models is that the quantity under consideration is the 'probability of being recorded'. When binomial models are used (as is the default), it's the 'probability for an average visit' for the Bernoulli version it is the probability of being recorded per time period. @@ -281,7 +281,7 @@ nrow(myDataSubset) ### Running Reporting Rate Models -Once you have subset your data using the above functions (or perhaps not at all) the reporting rate models can be applied using the function `reportingRateModel`. This function offers flexibility in the model you wish to fit, allowing the user to specify whether list length and site should be used as covariates, whether over-dispersion should be used, and whether the family should be binomial or Bernoulli. A number of these variants are presented in [Isaac et al (2014)](http://onlinelibrary.wiley.com/doi/10.1111/2041-210X.12254/abstract). While multi-species data is required it is not nessecary to model all species. In fact you can save a significant amount of time by only modelling hte species you are interested in. +Once you have subset your data using the above functions (or perhaps not at all) the reporting rate models can be applied using the function `reportingRateModel`. This function offers flexibility in the model you wish to fit, allowing the user to specify whether list length and site should be used as covariates, whether over-dispersion should be used, and whether the family should be binomial or Bernoulli. A number of these variants are presented in [Isaac et al (2014)](http://onlinelibrary.wiley.com/doi/10.1111/2041-210X.12254/abstract). While multi-species data is required it is not nessecary to model all species. In fact you can save a significant amount of time by only modelling the species you are interested in. ```{r,cache = TRUE} # Run the reporting rate model using list length as a fixed effect and @@ -321,7 +321,7 @@ with(RR_out, The returned object is a data frame with one row per species. Each column gives information on an element of the model output including covariate estimates, standard errors and p-values. This object also has some attributes giving the year that was chosen as the intercept, the number of visits in the dataset and the model formula used. -These models can take a long time to run when your data set is larg or you have a large number of species to model. To make this faster it is possible to parallelise this process across species which can significantly improve your run times. Here is an example of how we would parallelise the above example using hte R package snowfall. +These models can take a long time to run when your data set is large or you have a large number of species to model. To make this faster it is possible to parallelise this process across species which can significantly improve your run times. Here is an example of how we would parallelise the above example using hte R package snowfall. ```{r,cache = TRUE} # Load in snowfall @@ -411,13 +411,31 @@ This function works in a very similar fashion to that of the previous functions # Here is our data str(myData) +################################################ +# Create new entries for the year 1986 +new_dates <- seq(as.Date("1986-01-01"), as.Date("1986-12-31"), by = "1 month") + +# Create a dataframe with similar values for the new dates +new_data <- data.frame(taxa = rep("x", length(new_dates)), + site = rep("A99", length(new_dates)), + time_period = new_dates, + tp = NA) + +# Append the new data to the existing dataframe +myData <- rbind(myData, new_data) + +# View the updated dataframe +head(myData) + +################################################ + # Run an occupancy model for three species # Here we use very small number of iterations # to avoid a long run time system.time({ occ_out <- occDetModel(taxa = myData$taxa, site = myData$site, - time_period = myData$time_period, + survey = myData$time_period, species_list = c('a','b','c','d'), write_results = FALSE, n_iterations = 200, @@ -447,7 +465,7 @@ head(occ_out$a$BUGSoutput$summary) plot(occ_out$a) ``` -He we have run a small example but in reality these models are usually run for many thousands of iterations, making the analysis of more than a handful of species impractical. For those with access to the necessary facilities it is possible to parallelise across species. To do this we use a pair of functions that are used internally by `occDetModel`. These are `formatOccData` which is used to format our occurrence data into the format needed by JAGS, and `occDetFunc`, the function which undertakes the modelling. +Here we have run a small example but in reality these models are usually run for many thousands of iterations, making the analysis of more than a handful of species impractical. For those with access to the necessary facilities it is possible to parallelise across species. To do this we use a pair of functions that are used internally by `occDetModel`. These are `formatOccData` which is used to format our occurrence data into the format needed by JAGS, and `occDetFunc`, the function which undertakes the modelling. ```{r,cache = TRUE} # First format our data diff --git a/tests/testthat/testerrorChecks.R b/tests/testthat/testerrorChecks.R new file mode 100644 index 0000000..4b80a67 --- /dev/null +++ b/tests/testthat/testerrorChecks.R @@ -0,0 +1,171 @@ +context("Test errorChecks") + +set.seed(seed = 128) + +# Create data +n <- 3000 #size of dataset +nyr <- 10 # number of years in data +nSamples <- 30 # set number of dates +nSites <- 15 # set number of sites + +# Create somes dates +first <- as.POSIXct(strptime("2010/01/01", "%Y/%m/%d")) +last <- as.POSIXct(strptime(paste(2010+(nyr-1),"/12/31", sep=''), "%Y/%m/%d")) +dt <- last-first +rDates <- first + (runif(nSamples)*dt) + +# taxa are set as random letters +taxa <- sample(letters, size = n, TRUE) + +# three sites are visited randomly +site <- sample(paste('A', 1:nSites, sep=''), size = n, TRUE) + +# the date of visit is selected at random from those created earlier +time_period <- sample(rDates, size = n, TRUE) +time_period_missing <- sample(c(rDates, NA), size = n, TRUE) + +dist_sub <- rnorm(n, 10, 1) +sim_sub <- rnorm(n, 10, 1) + +dist_sub_fac <- as.factor(rnorm(n, 10, 1)) +sim_sub_fac <- as.factor(rnorm(n, 10, 1)) + + +dist_sub_chr <- as.character(rnorm(n, 10, 1)) +sim_sub_chr <- as.character(rnorm(n, 10, 1)) + +dist_sub_fac <- as.factor(rnorm(n, 10, 1)) +sim_sub_fac <- as.factor(rnorm(n, 10, 1)) + + +# combine this to a dataframe +df <- data.frame(taxa = taxa, + site = site, + time_period = as.character(time_period), + time_period_missing = time_period_missing, + dist_sub = dist_sub, + sim_sub = sim_sub, + dist_sub_chr = dist_sub_chr, + sim_sub_chr = sim_sub_chr, + dist_sub_fac = dist_sub_fac, + sim_sub_fac = sim_sub_fac) + + +useIterations_num <- 1 +useIterations_chr <- "TRUE" + + +temp <- tempfile() + +dir.create(temp) + +test_that("Test errors", { + expect_error(errorChecks(startDate = df$time_period), + 'startDate is not in a date format. This should be of class "Date" or "POSIXct"') + + expect_error(errorChecks(startDate = df$time_period_missing), + 'startDate must not contain NAs') + + expect_error(errorChecks(Date = df$time_period), + 'Date must be a data.frame or date vector') + + expect_error(errorChecks(Date = df$time_period_missing), + 'Date must not contain NAs') + + expect_error(errorChecks(endDate = df$time_period), + 'endDate is not in a date format. This should be of class "Date" or "POSIXct"') + + expect_error(errorChecks(endDate = df$time_period_missing), + 'endDate must not contain NAs') + + expect_error(errorChecks(dist_sub = df$dist_sub_chr, sim_sub = df$sim_sub), + 'dist_sub must be integer or numeric') + + expect_error(errorChecks(dist_sub = df$dist_sub, sim_sub = df$sim_sub_chr), + 'sim_sub must be integer or numeric') + + expect_error(errorChecks(dist_sub = df$dist_sub_fac, sim_sub = df$sim_sub), + 'dist_sub must be integer or numeric') + + expect_error(errorChecks(dist_sub = df$dist_sub, sim_sub = df$sim_sub_fac), + 'sim_sub must be integer or numeric') + + expect_error(errorChecks(useIterations = useIterations_chr), + 'useIterations must be logical') + + expect_error(errorChecks(useIterations = useIterations_num), + 'useIterations must be logical') + + expect_error(errorChecks(iterations = "1000"), + 'iterations must be numeric or integer') + + expect_error(errorChecks(family = "Poisson"), + 'family must be either Binomial or Bernoulli') + + expect_error(errorChecks(n_iterations = 1000, burnin = 500, thinning = 1100, n_chains = 3), + 'thinning must not be larger that the number of iteration (n_iterations)', + fixed = TRUE) + + expect_error(errorChecks(n_iterations = "1000", burnin = 500, thinning = 5, n_chains = 3), + 'n_iterations should be numeric') + + expect_error(errorChecks(n_iterations = 1000, burnin = "500", thinning = 5, n_chains = 3), + 'burnin should be numeric') + + expect_error(errorChecks(n_iterations = 1000, burnin = 500, thinning = "5", n_chains = 3), + 'thinning should be numeric') + + expect_error(errorChecks(n_iterations = 1000, burnin = 500, thinning = 5, n_chains = "3"), + 'n_chains should be numeric') + + expect_error(errorChecks(seed = "1"), + 'seed muct be numeric') + + expect_error(errorChecks(year_col = NA, start_col = NA, end_col = time_period[1]), + 'year_col or start_col and end_col must be given') + + expect_error(errorChecks(year_col = NA, start_col = df$time_period[1], end_col = NA), + 'year_col or start_col and end_col must be given') + + expect_error(errorChecks(year_col = NA, start_col = df$time_period[1], end_col = df$time_period[1]), + 'year_col cannot be used at the same time as start_col and end_col') + + expect_error(errorChecks(phi = 0.1), + "phi is outside permitted range of 0.50 to 0.95") + + expect_error(errorChecks(phi = 0.99), + "phi is outside permitted range of 0.50 to 0.95") + + expect_error(errorChecks(alpha = 0.05), + "alpha is outside permitted range of 0.08 to 0.50") + + expect_error(errorChecks(alpha = 0.99), + "alpha is outside permitted range of 0.08 to 0.50") + + expect_error(errorChecks(non_benchmark_sp = c(1,5,10,12)), + 'non_benchmark_sp must be a character vector') + + expect_error(errorChecks(non_benchmark_sp = 12), + 'non_benchmark_sp must be a character vector') + + expect_error(errorChecks(fres_site_filter = c(1,5,10,12)), + 'fres_site_filter must be a character vector') + + expect_error(errorChecks(fres_site_filter = 12), + 'fres_site_filter must be a character vector') + + expect_error(errorChecks(time_periods = as.matrix(df$time_period)), + 'time_periods should be a data.frame. e.g. "data.frame(start=c(1980,1990),end=c(1989,1999))"', + fixed = TRUE) + + expect_error(errorChecks(frespath = temp), + "filepath is not the path to a '.exe' file") + + expect_error(errorChecks(frespath = "file.exe"), + 'file.exe does not exist') + + expect_error(errorChecks(site = c('a', 'b', '')), + "site must not contain empty values (i.e. '')", + fixed = TRUE) + +})