diff --git a/.Rbuildignore b/.Rbuildignore index 606eeffb0..946bc32ad 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,4 +11,7 @@ ^doc$ ^Meta$ ^_pkgdown\.yml$ -^\.devcontainer$ \ No newline at end of file +^inst/manuscript/manuscript_cache$ +^\.lintr$ +^docs$ +^\.devcontainer$ diff --git a/.Rinstignore b/.Rinstignore new file mode 100644 index 000000000..2033ccc0c --- /dev/null +++ b/.Rinstignore @@ -0,0 +1,5 @@ +inst/manuscript/manuscript_cache/ +inst/manuscript/manuscript.log +inst/manuscript/manuscript.pdf +inst/manuscript/manuscript.tex +inst/manuscript/manuscript_files/ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 000000000..2d19fc766 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 000000000..051331820 --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,45 @@ +# Contributing to scoringutils + +This outlines how to propose a change to scoringutils. + +## Fixing typos + +You can fix typos, spelling mistakes, or grammatical errors in the documentation directly using the GitHub web interface, as long as the changes are made in the _source_ file. +This generally means you'll need to edit [roxygen2 comments](https://roxygen2.r-lib.org/articles/roxygen2.html) in an `.R`, not a `.Rd` file. +You can find the `.R` file that generates the `.Rd` by reading the comment in the first line. + +## Bigger changes + +If you want to make a bigger change, it's a good idea to first file an issue and make sure someone from the team agrees that it’s needed. +If you’ve found a bug, please file an issue that illustrates the bug with a minimal +[reprex](https://www.tidyverse.org/help/#reprex) (this will also help you write a unit test, if needed). + +### Pull request process + +* Fork the package and clone onto your computer. If you haven't done this before, we recommend using `usethis::create_from_github("epiforecasts/scoringutils", fork = TRUE)`. + +* Install all development dependencies with `devtools::install_dev_deps()`, and then make sure the package passes R CMD check by running `devtools::check()`. + If R CMD check doesn't pass cleanly, it's a good idea to ask for help before continuing. +* Create a Git branch for your pull request (PR). We recommend using `usethis::pr_init("brief-description-of-change")`. + +* Make your changes, commit to git, and then create a PR by running `usethis::pr_push()`, and following the prompts in your browser. + The title of your PR should briefly describe the change. + The body of your PR should contain `Fixes #issue-number`. + +* For user-facing changes, add a bullet to the top of `NEWS.md` (i.e. just below the first header). Follow the style described in the tidyverse [style guide](https://style.tidyverse.org). + +### Code style + +* New code should follow the tidyverse [style guide](https://style.tidyverse.org). + You can use the [styler](https://CRAN.R-project.org/package=styler) package to apply these styles, but please don't restyle code that has nothing to do with your PR. + +* We use [roxygen2](https://cran.r-project.org/package=roxygen2), with [Markdown syntax](https://cran.r-project.org/web/packages/roxygen2/vignettes/rd-formatting.html), for documentation. + +* We use [testthat](https://cran.r-project.org/package=testthat) for unit tests. + Contributions with test cases included are easier to accept. + +## Code of Conduct + +Please note that the scoringutils project is released with a +[Contributor Code of Conduct](CODE_OF_CONDUCT.md). 
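The "Pull request process" steps in the CONTRIBUTING.md file added above can be summarised as a short sequence of R calls. This is an illustrative sketch only, not part of the diff; the repository slug and branch name are taken from the text above, and the `usethis`/`devtools` calls are standard.

```r
# Fork and clone the repository (one-time setup, as recommended above)
usethis::create_from_github("epiforecasts/scoringutils", fork = TRUE)

# Install development dependencies and confirm R CMD check passes
devtools::install_dev_deps()
devtools::check()

# Create a branch for the change, then open a PR once changes are committed
usethis::pr_init("brief-description-of-change")
# ... edit, commit ...
usethis::pr_push()
```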
By contributing to this +project you agree to abide by its terms. diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 000000000..041ee3c88 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,49 @@ +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +name: lint + +jobs: + lint: + runs-on: macOS-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v1 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install dependencies + run: | + install.packages(c("remotes")) + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("lintr") + shell: Rscript {0} + + - name: Install package + run: R CMD INSTALL . + + - name: Lint + run: lintr::lint_package() + shell: Rscript {0} \ No newline at end of file diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 7dc9ad409..fa69d08c8 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -11,6 +11,8 @@ name: test-coverage jobs: test-coverage: runs-on: macOS-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v2 diff --git a/.gitignore b/.gitignore index 250f09584..a515f56da 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,9 @@ inst/doc R/sandbox.R doc Meta +inst/manuscript/manuscript_cache/ +inst/manuscript/manuscript.log +inst/manuscript/manuscript.pdf +inst/manuscript/manuscript.tex +inst/manuscript/manuscript_files/ +docs \ No newline at end of file diff --git a/.lintr b/.lintr new file mode 100644 index 000000000..c9061db51 --- /dev/null +++ b/.lintr @@ -0,0 +1,6 @@ +linters: with_defaults( + line_length_linter = line_length_linter(120), + cyclocomp_linter = cyclocomp_linter(complexity_limit = 20L), + object_usage_linter = NULL) +exclusions: c(list.files(path = "tests/", recursive = T, full.names = T)) +exclude: "# nolint" diff --git a/DESCRIPTION b/DESCRIPTION index 1ac107c00..73dc84c0d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: scoringutils Title: Utilities for Scoring and Assessing Predictions -Version: 0.1.8 +Version: 1.0.0 Language: en-GB Authors@R: c( person(given = "Nikos", @@ -21,22 +21,9 @@ Authors@R: c( role = c("ctb"), email = "johannes.bracher@kit.edu", comment = c(ORCID = "0000-0002-3777-1410")), - person("Joel", "Hellewell", - email = "joel.hellewell@lshtm.ac.uk", - role = c("ctb"), - comment = c(ORCID = "0000-0003-2683-0849")), - person(given = "Sophie Meakins", - role = c("ctb"), - email = "sophie.meakins@lshtm.ac.uk"), - person("James", "Munday", - email = "james.munday@lshtm.ac.uk", - role = c("ctb")), - person("Katharine", "Sherratt", - email = "katharine.sherratt@lshtm.ac.uk", - role = c("ctb")), person("Sebastian", "Funk", email = "sebastian.funk@lshtm.ac.uk", - role = c("aut"))) + role = c("ctb"))) Description: Combines a collection of metrics and proper scoring rules (Tilmann Gneiting & Adrian E 
Raftery (2007) @@ -61,21 +48,26 @@ Encoding: UTF-8 LazyData: true Imports: data.table, - forcats, + ggdist (>= 3.1.0), ggplot2, - goftest, scoringRules, stats, methods Suggests: testthat, + kableExtra, + magrittr, knitr, rmarkdown, vdiffr -RoxygenNote: 7.1.1 +Config/Needs/website: + r-lib/pkgdown, + amirmasoudabdol/preferably +Config/testthat/edition: 3 +RoxygenNote: 7.1.2 URL: https://github.com/epiforecasts/scoringutils, https://epiforecasts.io/scoringutils/ BugReports: https://github.com/epiforecasts/scoringutils/issues VignetteBuilder: knitr Depends: - R (>= 3.1) + R (>= 3.5) Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 7c37d46dc..9cab7f73e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,51 +2,50 @@ S3method(print,scoringutils_check) export(abs_error) +export(add_coverage) export(ae_median_quantile) export(ae_median_sample) +export(avail_forecasts) export(available_metrics) -export(bias) +export(bias_quantile) +export(bias_range) +export(bias_sample) export(brier_score) export(check_forecasts) -export(correlation_plot) -export(crps) -export(dss) -export(eval_forecasts) -export(interval_coverage) +export(correlation) +export(crps_sample) +export(dss_sample) +export(find_duplicates) export(interval_score) -export(logs) +export(logs_binary) +export(logs_sample) +export(mad_sample) export(merge_pred_and_obs) -export(mse) export(pairwise_comparison) export(pit) -export(pit_df) -export(pit_df_fast) +export(pit_sample) +export(plot_avail_forecasts) +export(plot_correlation) +export(plot_heatmap) +export(plot_interval_coverage) export(plot_pairwise_comparison) +export(plot_pit) export(plot_predictions) -export(quantile_bias) -export(quantile_coverage) -export(quantile_to_long) -export(quantile_to_range) -export(quantile_to_range_long) -export(quantile_to_wide) -export(range_long_to_quantile) -export(range_long_to_wide) -export(range_plot) -export(range_to_quantile) -export(range_wide_to_long) +export(plot_quantile_coverage) +export(plot_ranges) +export(plot_score_table) +export(plot_wis) +export(quantile_score) export(sample_to_quantile) -export(sample_to_range) -export(sample_to_range_long) -export(score_heatmap) -export(score_table) -export(sharpness) -export(show_avail_forecasts) -export(wis_components) +export(score) +export(se_mean_sample) +export(squared_error) +export(summarise_scores) +export(theme_scoringutils) importFrom(data.table,"%like%") importFrom(data.table,':=') importFrom(data.table,.I) importFrom(data.table,.N) -importFrom(data.table,`%like%`) importFrom(data.table,`:=`) importFrom(data.table,as.data.table) importFrom(data.table,copy) @@ -57,39 +56,48 @@ importFrom(data.table,melt) importFrom(data.table,rbindlist) importFrom(data.table,setDT) importFrom(data.table,setnames) -importFrom(forcats,fct_relevel) -importFrom(forcats,fct_rev) +importFrom(ggdist,geom_lineribbon) importFrom(ggplot2,aes) importFrom(ggplot2,aes_string) importFrom(ggplot2,coord_cartesian) +importFrom(ggplot2,coord_flip) importFrom(ggplot2,element_blank) +importFrom(ggplot2,element_line) importFrom(ggplot2,element_text) importFrom(ggplot2,expand_limits) importFrom(ggplot2,facet_grid) importFrom(ggplot2,facet_wrap) +importFrom(ggplot2,geom_col) importFrom(ggplot2,geom_histogram) importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_linerange) importFrom(ggplot2,geom_point) +importFrom(ggplot2,geom_polygon) importFrom(ggplot2,geom_text) importFrom(ggplot2,geom_tile) importFrom(ggplot2,ggplot) +importFrom(ggplot2,ggtitle) +importFrom(ggplot2,guide_legend) +importFrom(ggplot2,guides) 
importFrom(ggplot2,labs) importFrom(ggplot2,scale_color_continuous) importFrom(ggplot2,scale_colour_manual) +importFrom(ggplot2,scale_fill_gradient) importFrom(ggplot2,scale_fill_gradient2) importFrom(ggplot2,scale_fill_manual) +importFrom(ggplot2,scale_y_continuous) importFrom(ggplot2,stat) importFrom(ggplot2,theme) importFrom(ggplot2,theme_light) +importFrom(ggplot2,theme_minimal) importFrom(ggplot2,unit) importFrom(ggplot2,xlab) importFrom(ggplot2,ylab) -importFrom(goftest,ad.test) importFrom(methods,hasArg) importFrom(scoringRules,crps_sample) importFrom(scoringRules,dss_sample) importFrom(scoringRules,logs_sample) +importFrom(stats,as.formula) importFrom(stats,cor) importFrom(stats,mad) importFrom(stats,median) diff --git a/NEWS.md b/NEWS.md index e3d090c50..b7216c8d6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,49 @@ +# scoringutils 1.0.0 + +Major update to the package and most package functions with lots of breaking changes. + +## Feature updates +- new and updated Readme and vignette +- the proposed scoring workflow was reworked. Functions were changed so they +can easily be piped and have simplified arguments and outputs. + +### new functions and function changes +- the function `eval_forecasts()` was replaced by a function [score()] with a +much reduced set of function arguments. +- Functionality to summarise scores and to add relative skill scores was moved +to a function [summarise_scores()] +- new function [check_forecasts()] to analyse input data before scoring +- new function [correlation()] to compute correlations between different metrics +- new function [add_coverage()] to add coverage for specific central prediction +intervals +- new function [avail_forecasts()] allows to visualise the number of available +forecasts +- new function [find_duplicates()] to find duplicate forecasts which cause an +error +- all plotting functions were renamed to begin with `plot_`. Arguments were +simplified +- the function [pit()] now works based on data.frames. The old `pit` function +was renamed to [pit_sample()]. PIT p-values were removed entirely. +- the function [plot_pit()] now works directly with input as produced by [pit()] +- many data-handling functions were removed and input types for [score()] were +restricted to sample-based, quantile-based or binary forecasts. +- the function [brier_score()] now returns all brier scores, rather than taking +the mean before returning an output. +- `crps`, `dss` and `logs` were renamed to [crps_sample()], [dss_sample()], and +[logs_sample()] + +### package data updated +- package data is now based on forecasts submitted to the European Forecast Hub +(https://covid19forecasthub.eu/). +- all example data files were renamed to begin with `example_` +- a new data set, `summary_metrics` was included that contains a summary of the +metrics implemented in `scoringutils` + +## Other breaking changes +- The 'sharpness' component of the weighted interval score was renamed to +dispersion. This was done to make it more clear what the component represents +and to maintain consistency with what is used in other places. + # scoringutils 0.1.8 ## Feature updates @@ -24,6 +70,8 @@ between models on the output of `eval_forecasts()` - The WIS definition change introduced in version 0.1.5 was partly corrected such that the difference in weighting is only introduced when summarising over scores from different interval ranges +- "sharpness" was renamed to 'mad' in the output of [score()] for sample-based +forecasts. # scoringutils 0.1. 
@@ -52,12 +100,12 @@ with the argument `count_median_twice = FALSE`. ## Feature updates - we added basic plotting functionality to visualise scores. You can now -easily obtain diagnostic plots based on scores as produced by `eval_forecasts`. +easily obtain diagnostic plots based on scores as produced by `score`. - `correlation_plot` shows correlation between metrics -- `range_plot` shows contribution of different prediction intervals to some +- `plot_ranges` shows contribution of different prediction intervals to some chosen metric -- `score_heatmap` visualises scores as heatmap -- `score_table` shows a coloured summary table of scores +- `plot_heatmap` visualises scores as heatmap +- `plot_score_table` shows a coloured summary table of scores ## package updates - renamed "calibration" to "coverage" @@ -69,39 +117,39 @@ chosen metric # scoringutils 0.1.3 ## (Potentially) Breaking changes -- the by argument in `eval_forecasts` now has a slightly changed meaning. It +- the by argument in `score` now has a slightly changed meaning. It now denotes the lowest possible grouping unit, i.e. the unit of one observation and needs to be specified explicitly. The default is now `NULL`. The reason for this change is that most metrics need scoring on the observation level and this the most consistent implementation of this principle. The pit function receives its grouping now from `summarise_by`. In a similar spirit, `summarise_by` has to -be specificed explicitly and e.g. doesn't assume anymore that you want 'range' +be specified explicitly and e.g. doesn't assume anymore that you want 'range' to be included. - for the interval score, `weigh = TRUE` is now the default option. - (potentially planned) rename true_values to true_value and predictions to prediction. ## Feature updates -- updated quantile evaluation metrics in `eval_forecasts`. Bias as well as +- updated quantile evaluation metrics in `score`. Bias as well as calibration now take all quantiles into account - Included option to summarise scores according to a `summarise_by` argument in -`eval_forecasts` The summary can return the mean, the standard deviation as well +`score` The summary can return the mean, the standard deviation as well as an arbitrary set of quantiles. -- `eval_forecasts` can now return pit histograms. +- `score` can now return pit histograms. - switched to ggplot2 for plotting # scoringutils 0.1.2 ## (Potentially) Breaking changes -- all scores in eval_forecasts were consistently renamed to lower case. +- all scores in score were consistently renamed to lower case. Interval_score is now interval_score, CRPS is now crps etc. 
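The NEWS entries above record the renaming of `eval_forecasts()` to `score()` and the move of summary functionality into `summarise_scores()`. As an illustration of the reworked, pipeable workflow (a sketch, not part of the diff, assuming the 1.0.0 API introduced elsewhere in this changeset):

```r
library(scoringutils)

# inspect the input before scoring (new check_forecasts() function)
check_forecasts(example_quantile)

# score() replaces eval_forecasts(); summarising is now a separate step
scores <- score(example_quantile)
summarise_scores(scores, by = c("model", "target_type"))
```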
## Feature updates - included support for grouping scores according to a vector of column names -in `eval_forecasts` +in `score` - included support for passing down arguments to lower-level functions in -`eval_forecasts` +`score` - included support for three new metrics to score quantiles with -`eval_forecasts`: bias, sharpness and calibration +`score`: bias, sharpness and calibration ## Package updates - example data now has a horizon column to illustrate the use of grouping @@ -111,9 +159,9 @@ in `eval_forecasts` ## Feature updates - included support for a long as well as wide input formats for -quantile forecasts that are scored with `eval_forecasts` +quantile forecasts that are scored with `score` ## Package updates -- updated documentation for the `eval_forecasts` +- updated documentation for the `score` - added badges to the Readme diff --git a/R/absolute_error.R b/R/absolute_error.R deleted file mode 100644 index 1025c1601..000000000 --- a/R/absolute_error.R +++ /dev/null @@ -1,105 +0,0 @@ - - -#' @title Absolute Error of the Median (Sample-based Version) -#' -#' @description -#' Absolute error of the median calculated as -#' -#' \deqn{ -#' abs(true_value - median_prediction) -#' } -#' -#' @param true_values A vector with the true observed values of size n -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the -#' number of Monte Carlo samples. Alternatively, predictions can just be a vector -#' of size n -#' @return vector with the scoring values -#' @importFrom stats median -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' ae_median_sample(true_values, predicted_values) -#' @export - - -ae_median_sample <- function(true_values, predictions) { - - median_predictions <- apply(as.matrix(predictions), - MARGIN = 1, # rowwise - FUN = median) - - ae_median <- abs(true_values - median_predictions) - - return(ae_median) -} - - - - - -#' @title Absolute Error of the Median (Quantile-based Version) -#' -#' @description -#' Absolute error of the median calculated as -#' -#' \deqn{ -#' abs(true_value - median_prediction) -#' } -#' -#' @param true_values A vector with the true observed values of size n -#' @param predictions numeric vector with predictions, corresponding to the -#' quantiles in a second vector, `quantiles`. -#' @param quantiles numeric vector that denotes the quantile for the values -#' in `predictions`. Only those predictions where `quantiles == 0.5` will -#' be kept. If `quantiles` is `NULL`, then all `predictions` and -#' `true_values` will be used (this is then the same as [abs_error()]) -#' @param verbose logical, return a warning is something unexpected happens -#' @return vector with the scoring values -#' @importFrom stats median -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' ae_median_quantile(true_values, predicted_values, quantiles = 0.5) -#' @export -ae_median_quantile <- function(true_values, predictions, quantiles = NULL, - verbose = TRUE) { - if (!is.null(quantiles)) { - if (!any(quantiles == 0.5) && !any(is.na(quantiles))) { - return(NA_real_) - if (verbose) { - warning("in order to compute the absolute error of the median, `0.5` must be among the quantiles given. 
Maybe you want to use `abs_error()`?") - } - } - true_values <- true_values[quantiles == 0.5] - predictions <- predictions[quantiles == 0.5] - } - abs_error_median <- abs(true_values - predictions) - return(abs_error_median) -} - - - -#' @title Absolute Error -#' -#' @description -#' Caclulate absolute error as -#' -#' \deqn{ -#' abs(true_value - prediction) -#' } -#' -#' @param true_values A vector with the true observed values of size n -#' @param predictions numeric vector with predictions, corresponding to the -#' quantiles in a second vector, `quantiles`. -#' @return vector with the absolute error -#' @examples -#' true_values <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' abs_error(true_values, predicted_values) -#' @export - - -abs_error <- function(true_values, predictions) { - return(abs(true_values - predictions)) -} diff --git a/R/avail_forecasts.R b/R/avail_forecasts.R new file mode 100644 index 000000000..629136f1e --- /dev/null +++ b/R/avail_forecasts.R @@ -0,0 +1,61 @@ +#' @title Display Number of Forecasts Available +#' +#' @description +#' +#' Given a data set with forecasts, count the number of available forecasts +#' for arbitrary grouping (e.g. the number of forecasts per model, or the +#' number of forecasts per model and location). +#' This is useful to determine whether there are any missing forecasts. +#' +#' @param data data.frame with predictions in the same format required for +#' [score()]. +#' @param by character vector or `NULL` (the default) that denotes the +#' categories over which the number of forecasts should be counted. +#' By default (`by = NULL`) this will be the unit of a single forecast (i.e. +#' all available columns (apart from a few "protected" columns such as +#' 'prediction' and 'true value') plus "quantile" or "sample" where present). +#' @param collapse character vector (default is `c("quantile", "sample"`) with +#' names of categories for which the number of rows should be collapsed to one +#' when counting. For example, a single forecast is usually represented by a +#' set of several quantiles or samples and collapsing these to one makes sure +#' that a single forecast only gets counted once. +#' @return A data.table with columns as specified in `by` and an additional +#' column with the number of forecasts. +#' @importFrom data.table .I .N +#' @export +#' @keywords check-forecasts +#' @examples +#' avail_forecasts(example_quantile, +#' collapse = c("quantile"), +#' by = c("model", "target_type") +#' ) +avail_forecasts <- function(data, + by = NULL, + collapse = c("quantile", "sample")) { + + check_data <- check_forecasts(data) + + + data <- check_data$cleaned_data + forecast_unit <- check_data$forecast_unit + + if (is.null(by)) { + by <- forecast_unit + } + + # collapse several rows to 1, e.g. treat a set of 10 quantiles as one, + # because they all belong to one single forecast that should be counted once + collapse_by <- setdiff( + c(forecast_unit, "quantile", "sample"), + collapse + ) + # filter out "quantile" or "sample" if present in collapse_by, but not data + collapse_by <- intersect(collapse_by, names(data)) + + data <- data[data[, .I[1], by = collapse_by]$V1] + + # count number of rows = number of forecasts + out <- data[, .(`Number forecasts` = .N), by = by] + + return(out[]) +} diff --git a/R/bias.R b/R/bias.R index e3237746f..04d165e7d 100644 --- a/R/bias.R +++ b/R/bias.R @@ -28,26 +28,21 @@ #' In both cases, Bias can assume values between #' -1 and 1 and is 0 ideally. 
#' -#' @param true_values A vector with the true observed values of size n -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the -#' number of Monte Carlo samples #' @return vector of length n with the biases of the predictive samples with #' respect to the true values. +#' @inheritParams ae_median_sample #' @author Nikos Bosse \email{nikosbosse@@gmail.com} #' @examples #' #' ## integer valued forecasts #' true_values <- rpois(30, lambda = 1:30) #' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' bias(true_values, predictions) +#' bias_sample(true_values, predictions) #' #' ## continuous forecasts #' true_values <- rnorm(30, mean = 1:30) #' predictions <- replicate(200, rnorm(30, mean = 1:30)) -#' bias(true_values, predictions) -#' -#' +#' bias_sample(true_values, predictions) #' @export #' @references #' The integer valued Bias function is discussed in @@ -57,61 +52,31 @@ #' real-time epidemic forecasts: A case study of Ebola in the Western Area #' region of Sierra Leone, 2014-15. PLOS Computational Biology 15(2): e1006785. #' +#' @keywords metric +bias_sample <- function(true_values, predictions) { -bias <- function(true_values, predictions) { - - # ============== Error handling ============== - - if (missing(true_values) | missing(predictions)) { - stop("true_values or predictions argument missing") - } - - n <- length(true_values) - - if (is.data.frame(predictions)) { - predictions <- as.matrix(predictions) - } - if (!is.matrix(predictions)) { - msg <- sprintf("'predictions' should be a matrix. Instead `%s` was found", - class(predictions[1])) - stop(msg) - } - if (nrow(predictions) != n) { - - msg <- sprintf("Mismatch: The true values provided have length `%s`, but 'predictions' has `%s` rows.", - n, nrow(predictions)) - stop(msg) - } - - # ============================================ - - ## check whether continuous or integer - if (!isTRUE(all.equal(as.vector(predictions), as.integer(predictions)))) { - continuous_predictions <- TRUE - } else { - continuous_predictions <- FALSE - } - - n_pred <- ncol(predictions) + # check inputs + check_true_values(true_values) + check_predictions(predictions, true_values, class = "matrix") + prediction_type <- get_prediction_type(predictions) # empirical cdf - P_x <- rowSums(predictions <= true_values) / n_pred + n_pred <- ncol(predictions) + p_x <- rowSums(predictions <= true_values) / n_pred - if (continuous_predictions) { - res <- 1 - 2 * P_x + if (prediction_type == "continuous") { + res <- 1 - 2 * p_x return(res) } else { # for integer case also calculate empirical cdf for (y-1) - P_xm1 <- rowSums(predictions <= (true_values - 1)) / n_pred + p_xm1 <- rowSums(predictions <= (true_values - 1)) / n_pred - res <- 1 - (P_x + P_xm1) + res <- 1 - (p_x + p_xm1) return(res) } } - - #' @title Determines Bias of Quantile Forecasts #' #' @description @@ -123,15 +88,23 @@ bias <- function(true_values, predictions) { #' For quantile forecasts, bias is measured as #' #' \deqn{ -#' B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) 1( x_t \leq q_{t, 0.5}) \\ -#' + (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) 1( x_t \geq q_{t, 0.5}),} +#' B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) +#' \mathbf{1}( x_t \leq q_{t, 0.5}) \\ +#' + (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) +#' \mathbf{1}( x_t \geq q_{t, 0.5}), +#' }{ +#' B_t = (1 - 2 * max(i | q_{t,i} in Q_t and q_{t,i} 
<= x_t\)) +#' 1( x_t <= q_{t, 0.5}) + (1 - 2 * min(i | q_{t,i} in Q_t and q_{t,i} >= x_t)) +#' 1( x_t >= q_{t, 0.5}), +#' } #' #' where \eqn{Q_t} is the set of quantiles that form the predictive #' distribution at time \eqn{t}. They represent our -#' belief about what the true value $x_t$ will be. For consistency, we define +#' belief about what the true value \eqn{x_t} will be. For consistency, we +#' define #' \eqn{Q_t} such that it always includes the element -#' \eqn{q_{t, 0} = - \infty$ and $q_{t,1} = \infty}. -#' \eqn{1()} is the indicator function that is \eqn{1} if the +#' \eqn{q_{t, 0} = - \infty} and \eqn{q_{t,1} = \infty}. +#' \eqn{\mathbf{1}()}{1()} is the indicator function that is \eqn{1} if the #' condition is satisfied and $0$ otherwise. In clearer terms, \eqn{B_t} is #' defined as the maximum percentile rank for which the corresponding quantile #' is still below the true value, if the true value is smaller than the @@ -159,41 +132,46 @@ bias <- function(true_values, predictions) { #' @author Nikos Bosse \email{nikosbosse@@gmail.com} #' @examples #' -#' lower <- c(6341.000, 6329.500, 6087.014, 5703.500, -#' 5451.000, 5340.500, 4821.996, 4709.000, -#' 4341.500, 4006.250, 1127.000, 705.500) +#' lower <- c( +#' 6341.000, 6329.500, 6087.014, 5703.500, +#' 5451.000, 5340.500, 4821.996, 4709.000, +#' 4341.500, 4006.250, 1127.000, 705.500 +#' ) #' -#' upper <- c(6341.000, 6352.500, 6594.986, 6978.500, -#' 7231.000, 7341.500, 7860.004, 7973.000, -#' 8340.500, 8675.750, 11555.000, 11976.500) +#' upper <- c( +#' 6341.000, 6352.500, 6594.986, 6978.500, +#' 7231.000, 7341.500, 7860.004, 7973.000, +#' 8340.500, 8675.750, 11555.000, 11976.500 +#' ) #' #' range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) #' #' true_value <- 8062 #' -#' quantile_bias(lower = lower, upper = upper, -#' range = range, true_value = true_value) -#' +#' bias_range( +#' lower = lower, upper = upper, +#' range = range, true_value = true_value +#' ) #' @export -#' - -quantile_bias <- function(range, lower, upper, - true_value) { +#' @keywords metric +bias_range <- function(range, lower, upper, + true_value) { lower_predictions <- lower upper_predictions <- upper - if(any(is.na(upper)) | any(is.na(lower))) { - + if (any(is.na(upper)) | any(is.na(lower))) { range <- range[!is.na(upper) & !is.na(lower)] lower_predictions <- lower[!is.na(lower) & !is.na(upper)] upper_predictions <- upper[!is.na(lower) & !is.na(upper)] # deal with the point forecast case where inputs may be NA - if (length(range) == 0 | length(lower_predictions) == 0 | length(upper_predictions) == 0) { + if (length(range) == 0 | + length(lower_predictions) == 0 | + length(upper_predictions) == 0 + ) { return(NA_real_) } - } # convert range to quantiles @@ -226,10 +204,114 @@ quantile_bias <- function(range, lower, upper, lower <- max(lower_quantiles[lower_predictions <= true_value]) bias <- 1 - 2 * lower return(bias) - } else if (any(upper_predictions <= true_value)){ + } else if (any(upper_predictions <= true_value)) { upper <- min(upper_quantiles[upper_predictions >= true_value]) bias <- 1 - 2 * upper return(bias) } } +#' @title Determines Bias of Quantile Forecasts +#' +#' @description +#' Determines bias from quantile forecasts. For an increasing number of +#' quantiles this measure converges against the sample based bias version +#' for integer and continuous forecasts. 
+#' +#' @details +#' For quantile forecasts, bias is measured as +#' +#' \deqn{ +#' B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) +#' 1( x_t \leq q_{t, 0.5}) \\ +#' + (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) +#' 1( x_t \geq q_{t, 0.5}),} +#' +#' where \eqn{Q_t} is the set of quantiles that form the predictive +#' distribution at time \eqn{t}. They represent our +#' belief about what the true value $x_t$ will be. For consistency, we define +#' \eqn{Q_t} such that it always includes the element +#' \eqn{q_{t, 0} = - \infty$ and $q_{t,1} = \infty}. +#' \eqn{1()} is the indicator function that is \eqn{1} if the +#' condition is satisfied and $0$ otherwise. In clearer terms, \eqn{B_t} is +#' defined as the maximum percentile rank for which the corresponding quantile +#' is still below the true value, if the true value is smaller than the +#' median of the predictive distribution. If the true value is above the +#' median of the predictive distribution, then $B_t$ is the minimum percentile +#' rank for which the corresponding quantile is still larger than the true +#' value. If the true value is exactly the median, both terms cancel out and +#' \eqn{B_t} is zero. For a large enough number of quantiles, the +#' percentile rank will equal the proportion of predictive samples below the +#' observed true value, and this metric coincides with the one for +#' continuous forecasts. +#' +#' Bias can assume values between +#' -1 and 1 and is 0 ideally. +#' @param predictions vector of length corresponding to the number of quantiles +#' that holds predictions +#' @param quantiles vector of corresponding size with the quantiles for which +#' predictions were made +#' @inheritParams bias_range +#' @return scalar with the quantile bias for a single quantile prediction +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @examples +#' +#' predictions <- c( +#' 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996, +#' 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000, +#' 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004, +#' 7973.000, 8340.500, 8675.750, 11555.000, 11976.500 +#' ) +#' +#' quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) +#' +#' true_value <- 8062 +#' +#' bias_quantile(predictions, quantiles, true_value = true_value) +#' @export +#' @keywords metric + +bias_quantile <- function(predictions, quantiles, true_value) { + # check that predictions and quantiles have the same length + if (!length(predictions) == length(quantiles)) { + stop("predictions and quantiles must have the same length") + } + + if (any(is.na(predictions))) { + quantiles <- quantiles[!is.na(predictions)] + predictions <- predictions[!is.na(predictions)] + } + # if there is no input, return NA + if (length(quantiles) == 0 | length(predictions) == 0) { + return(NA_real_) + } + + if (0.5 %in% quantiles) { + median_prediction <- predictions[quantiles == 0.5] + } else { + # if median is not available, compute as mean of two innermost quantiles + median_prediction <- + 0.5 * predictions[quantiles == max(quantiles[quantiles < 0.5])] + + 0.5 * predictions[quantiles == min(quantiles[quantiles > 0.5])] + } + + if (true_value == median_prediction) { + bias <- 0 + return(bias) + } else if (true_value < median_prediction) { + if (true_value < min(predictions)) { + bias <- 1 + } else { + q <- max(quantiles[predictions <= true_value]) + bias <- 1 - 2 * q + } + } else if (true_value > median_prediction) { + if (true_value > max(predictions)) { + bias 
<- -1 + } else { + q <- min(quantiles[predictions >= true_value]) + bias <- 1 - 2 * q + } + } + return(bias) +} diff --git a/R/brier_score.R b/R/brier_score.R index f4adb8017..46736910e 100644 --- a/R/brier_score.R +++ b/R/brier_score.R @@ -12,51 +12,31 @@ #' probabilistic prediction and the true outcome. #' #' \deqn{ -#' Brier_Score = \frac{1}{N} \sum_{t = 1}^{n} (prediction_t - outcome_t)^2 +#' \text{Brier_Score} = \frac{1}{N} \sum_{t = 1}^{n} (\text{prediction_t} - +#' \text{outcome_t})^2 +#' }{ +#' Brier_Score = 1/N \sum_{t = 1}^{n} (prediction_t - outcome_t)² #' } #' -#' @param true_values A vector with the true observed values of size n +#' @param true_values A vector with the true observed values of size n with +#' all values equal to either 0 or 1 #' @param predictions A vector with a predicted probability #' that true_value = 1. #' @return A numeric value with the Brier Score, i.e. the mean squared #' error of the given probability forecasts -#' @importFrom methods hasArg #' @export #' #' @examples -#' true_values <- sample(c(0,1), size = 30, replace = TRUE) +#' true_values <- sample(c(0, 1), size = 30, replace = TRUE) #' predictions <- runif(n = 30, min = 0, max = 1) #' #' brier_score(true_values, predictions) -#' - -brier_score <- function (true_values, predictions) { - - # ============== Error handling ============== - - if (!all(c(methods::hasArg("true_values"), methods::hasArg("predictions")))) { - stop("true_values or predictions argument missing") - } - - if (!all(true_values %in% c(0,1))) { - stop("elements of true_values should be either zero or one") - } +#' @keywords metric - n <- length(true_values) +brier_score <- function(true_values, predictions) { + check_true_values(true_values, type = "binary") + check_predictions(predictions, true_values, type = "binary") - if (length(predictions) != n) { - msg <- sprintf("Mismatch: 'true_values' has length `%s`, but 'predictions' has length `%s`.", - n, length(predictions)) - stop(msg) - } - - if (max(predictions) > 1 | min(predictions) < 0) { - stop("elements of 'predictions' should be probabilites between zero and one") - } - # ============================================ - - brierscore <- (sum((true_values - predictions)^2) ) / n + brierscore <- (true_values - predictions)^2 return(brierscore) } - - diff --git a/R/check_forecasts.R b/R/check_forecasts.R index 2890c2ed6..327b66e83 100644 --- a/R/check_forecasts.R +++ b/R/check_forecasts.R @@ -1,24 +1,24 @@ #' @title Check forecasts #' #' @description Function to check the input data before running -#' [eval_forecasts()]. +#' [score()]. #' #' The data should come in one of three different formats: -#' - A format for binary predictions (see [binary_example_data]) +#' - A format for binary predictions (see [example_binary]) #' - A sample-based format for discrete or continuous predictions -#' (see [continuous_example_data] and [integer_example_data]) -#' - A quantile-based format (see [quantile_example_data]) -#' -#' @seealso Functions to move between different formats: -#' [range_long_to_quantile()], [range_wide_to_long()] -#' @param data A data.frame or similar as would be used for [eval_forecasts()] +#' (see [example_continuous] and [example_integer]) +#' - A quantile-based format (see [example_quantile]) #' +#' @seealso Function to move from sample-based to quantile format: +#' [sample_to_quantile()] +#' @inheritParams avail_forecasts #' @return A list with elements that give information about what `scoringutils` #' thinks you are trying to do and potential issues. 
#' #' - `target_type` the type of the prediction target as inferred from the -#' input: 'binary', if all values in `true_value` are either 0 or 1 and values in -#' `prediction` are between 0 and 1, 'discrete' if all true values are integers +#' input: 'binary', if all values in `true_value` are either 0 or 1 and values +#' in `prediction` are between 0 and 1, 'discrete' if all true values are +#' integers. #' and 'continuous' if not. #' - `prediction_type` inferred type of the prediction. 'quantile', if there is #' a column called 'quantile', else 'discrete' if all values in `prediction` @@ -38,151 +38,180 @@ #' - `warnings` A vector with warnings. These can be ignored if you know what #' you are doing. #' - `errors` A vector with issues that will cause an error when running -#' [eval_forecasts()]. +#' [score()]. #' - `messages` A verbal explanation of the information provided above. #' #' @importFrom data.table ':=' is.data.table -#' -#' @examples -#' library(scoringutils) -#' check <- check_forecasts(quantile_example_data) -#' print(check) -#' check_forecasts(binary_example_data) #' @author Nikos Bosse \email{nikosbosse@@gmail.com} #' @export - +#' @keywords check-forecasts +#' @examples +#' check <- check_forecasts(example_quantile) +#' print(check) +#' check_forecasts(example_binary) check_forecasts <- function(data) { - check <- list() - msg <- list() + + # create lists to store results ---------------------------------------------- + out <- list() warnings <- list() errors <- list() + messages <- list() - # check data looks ok and remove columns with no prediction or no true value - - data <- withCallingHandlers( - tryCatch( - check_clean_data(data), - error = function(e) { - errors <<- c(errors, e$message) - } - ), - warning = function(w) { - warnings <<- c(warnings, w$message) - tryInvokeRestart("muffleWarning") - } - ) - if (length(errors) > 0 | !is.data.table(data)) { - stop( - "Can't check input. The following error has been produced:\n", - paste(errors, collapse = "\n") + # check data columns --------------------------------------------------------- + if (!is.data.frame(data)) { + stop("Input should be a data.frame or similar") + } + data <- data.table::as.data.table(data) + + # make sure true_value and prediction are present + if (!all(c("true_value", "prediction") %in% colnames(data))) { + stop("Data needs to have columns called `true_value` and `prediction`") + } + + # check whether any column name is a scoringutils metric + if (any(colnames(data) %in% available_metrics())) { + warnings <- c( + warnings, + "At least one column in the data corresponds to the name of a metric that will be computed by scoringutils. Please check `available_metrics()`" # nolint ) } - # obtain truth type + # check whether there is a model column present + if (!("model" %in% colnames(data))) { + messages <- c( + messages, + paste( + "There is no column called `model` in the data.", + "scoringutils therefore thinks that all forecasts come from the same model" # nolint + ) + ) + data[, model := "Unspecified model"] + } - check[["target_type"]] <- get_target_type(data) - check[["prediction_type"]] <- get_prediction_type(data) - msg <- c( - msg, - paste0( - "Forecasts are for a `", check[["target_type"]], "` target ", - "using a `", check[["prediction_type"]], "` prediction format." 
+ # remove rows where prediction or true value are NA -------------------------- + if (anyNA(data$true_value)) { + messages <- c( + messages, + "Some values for `true_value` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected." # nolint ) - ) - - # obtain unit of a single forecast - protected_columns <- c( - "prediction", "true_value", "sample", "quantile", - "range", "boundary" - ) - obs_unit <- setdiff(colnames(data), protected_columns) - check[["forecast_unit"]] <- obs_unit - msg <- c( - msg, - paste0( - "The unit of a single forecast is defined by `", - paste(check[["forecast_unit"]], collapse = "`, `"), "`. ", - "If this is not as intended, please DELETE UNNECESSARY columns or add new ones." + } + if (anyNA(data$prediction)) { + messages <- c( + messages, + "Some values for `prediction` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected." # nolint ) - ) + } + data <- data[!is.na(true_value) & !is.na(prediction)] + + if (nrow(data) == 0) { + stop("After removing all NA true values and predictions, there were no observations left") + } + + + # get information about the forecasts ---------------------------------------- + forecast_unit <- get_forecast_unit(data) + target_type <- get_target_type(data) + prediction_type <- get_prediction_type(data) - # check what format is has right now and tell user to convert it. + + # check whether a column called 'quantile' or 'sample' is present ------------ if (!any(c("quantile", "sample") %in% colnames(data))) { - if ("range" %in% colnames(data) | any(grepl("lower_", colnames(data)))) { - errors <- c( - errors, - "It seems like you have a format based on forecast intervals (see `example_data_long`, `example_data_semi_wide`, `example_data_wide`). You need to convert this to a quantile-based format first using `range_wide_to_long()` and `range_long_to_quantile()`" - ) - } else if (!check[["target_type"]] == "binary") { + if (!target_type == "binary") { errors <- c( errors, - "This forecast does not seem to be for a binary prediction target, so we need a column called quantile or sample" + "This forecast does not seem to be for a binary prediction target, so we need a column called quantile or sample" # nolint ) } } + + # check duplicate forecasts -------------------------------------------------- # check whether there is more than one prediction for the same target, i.e. # the length of prediction is greater 1 for a sample / quantile for # a single forecast - type <- c("sample", "quantile")[c("sample", "quantile") %in% colnames(data)] - data[, InternalDuplicateCheck := .N, by = c(obs_unit, type)] - if (any(data$InternalDuplicateCheck > 1)) { + check_duplicates <- find_duplicates(data) + + if (nrow(check_duplicates) > 0) { errors <- c( errors, paste( - "There are instances with more than one forecast for the same target.", - "This can't be right and needs to be resolved. Maybe you need to check", - "the unit of a single forecast and add missing columns?" + "There are instances with more than one forecast for the same target. This can't be right and needs to be resolved. Maybe you need to check the unit of a single forecast and add missing columns? Use the function find_duplicates() to identify duplicate rows." ) ) - check[["duplicate_forecasts"]] <- data[InternalDuplicateCheck > 1] } - data[, InternalDuplicateCheck := NULL] - # check whether there is a model column present. 
And if not, state what that means - if (!("model" %in% colnames(data))) { - msg <- c( - msg, - paste( - "There is no column called `model` in the data.", - "scoringutils therefore thinks that all forecasts come from the same model" - ) - ) - data[, model := "Unspecified model"] - } - - # some checks whether there are the same number of quantiles, samples - data[, InternalNumCheck := length(prediction), by = obs_unit] + # check whether there are the same number of quantiles, samples -------------- + data[, InternalNumCheck := length(prediction), by = forecast_unit] n <- unique(data$InternalNumCheck) if (length(n) > 1) { warnings <- c( warnings, paste0( - "Some forecasts have different numbers of rows (e.g. quantiles or samples). ", + "Some forecasts have different numbers of rows (e.g. quantiles or samples). ", # nolint "scoringutils found: ", paste(n, collapse = ", "), ". This is not necessarily a problem, but make sure this is intended." ) ) } - check[["rows_per_forecast"]] <- - data[, .(rows_per_forecast = unique(InternalNumCheck)), by = model] data[, InternalNumCheck := NULL] - # get available unique values per model for the different columns - cols <- obs_unit[obs_unit != "model"] - check[["unique_values"]] <- - data[, vapply(.SD, FUN = function(x) length(unique(x)), integer(1)), by = "model"] - check[["messages"]] <- unlist(msg) - check[["warnings"]] <- unlist(warnings) - check[["errors"]] <- unlist(errors) + # store info so it can be accessed by the user ------------------------------- + out[["cleaned_data"]] <- data + + # available unique values per model for the different columns + cols <- forecast_unit[forecast_unit != "model"] + out[["unique_values"]] <- + data[, lapply(.SD, FUN = function(x) length(unique(x))), by = "model"] + + # forecast infos + out[["forecast_unit"]] <- forecast_unit + out[["target_type"]] <- target_type + out[["prediction_type"]] <- prediction_type + + out[["messages"]] <- unlist(messages) + out[["warnings"]] <- unlist(warnings) + out[["errors"]] <- unlist(errors) - class(check) <- c("scoringutils_check", "list") - return(check) + # generate messages, warnings, errors ---------------------------------------- + if (length(messages) > 0) { + msg <- collapse_messages(type = "messages", messages) + message(msg) + } + if (length(warnings) > 0) { + msg <- collapse_messages(type = "warnings", warnings) + warning(msg) + } + if (length(errors) > 0) { + msg <- collapse_messages(type = "errors", errors) + stop(msg) + } + + # return check results + class(out) <- c("scoringutils_check", "list") + return(out) +} + + +#' @title Collapse several messages to one +#' +#' @description Internal helper function to facilitate generating messages +#' and warnings in [check_forecasts()] +#' +#' @param type character, should be either "messages", "warnings" or "errors" +#' @param messages the messages or warnings to collapse +#' +#' @return string with the message or warning +#' @keywords internal +collapse_messages <- function(type = "messages", messages) { + paste0( + "The following ", type, " were produced when checking inputs:\n", + paste(paste0(seq_along(messages), ". "), + messages, collapse = "\n")) } @@ -197,128 +226,57 @@ check_forecasts <- function(data) { #' #' @return NULL #' @export - +#' @keywords check-forecasts +#' @examples +#' check <- check_forecasts(example_quantile) +#' print(check) print.scoringutils_check <- function(x, ...) 
{ - print_elements <- names(x)[!(names(x) %in% c("messages"))] - print.default(x[print_elements]) - - cat(paste0( - "\nBased on your input, scoringutils thinks:\n", - paste(x$messages, collapse = "\n") - )) - cat("\n$rows_per_forecast shows how many rows (usually quantiles or samples are available per forecast.") - cat( - "\n$unique_values shows how many unique values there are per column per model", - "(across the entire data)." - ) - - if (length(x$warnings) > 0) { - cat(paste0( - "\n\n", - "You should be aware of the following warnings:\n", - paste(x$warnings, collapse = "\n") - )) - } - - if (length(x$errors) > 0) { - cat(paste0( - "\n\n", - "The following things will likely result in an error:", - paste(x$errors, collapse = "\n") - )) - } - return(invisible(x)) -} + cat("Your forecasts seem to be for a target of the following type:\n") + print(x["target_type"]) + cat("and in the following format:\n") + print(x["prediction_type"]) + cat("The unit of a single forecast is defined by:\n") + print(x["forecast_unit"]) + cat("Cleaned data, rows with NA values in prediction or true_value removed:\n") + print.default(x["cleaned_data"]) -#' @title Get prediction type of a forecast -#' -#' @description Internal helper function to get the prediction type of a -#' forecast. That is inferred based on the properties of the values in the -#' `prediction` column. -#' -#' @inheritParams check_forecasts -#' -#' @return Character vector of length one with either "quantile", "integer", or -#' "continuous". -#' -#' @keywords internal + cat("Number of unique values per column per model:\n") + print.default(x["unique_values"]) -get_prediction_type <- function(data) { - if ("quantile" %in% names(data)) { - return("quantile") - } else if (all.equal(data$prediction, as.integer(data$prediction)) == TRUE) { - return("integer") - } else { - return("continuous") + colnames <- names(x)[names(x) %in% c("messages", "warnings", "errors")] + if (length(colnames) > 0) { + print.default(x[colnames]) } -} - - -#' @title Get type of the target true values of a forecast -#' -#' @description Internal helper function to get the type of the target -#' true values of a forecast. That is inferred based on the which columns -#' are present in the data. -#' -#' @inheritParams check_forecasts -#' -#' @return Character vector of length one with either "binary", "integer", or -#' "continous" -#' -#' @keywords internal -get_target_type <- function(data) { - if (isTRUE(all.equal(data$true_value, as.integer(data$true_value)))) { - if (all(data$true_value %in% c(0, 1)) && - all(data$prediction >= 0) && all(data$prediction <= 1)) { - return("binary") - } else { - return("integer") - } - } else { - return("continuous") - } + return(invisible(x)) } -#' @title Clean forecast data +#' @title Find duplicate forecasts #' -#' @description Helper function to check that the input is in fact a data.frame -#' or similar and remove rows with no value for `prediction` or `true_value` +#' @description Helper function to identify duplicate forecasts, i.e. +#' instances where there is more than one forecast for the same prediction +#' target. #' -#' @param data A data.frame or similar as it gets passed to [eval_forecasts()]. +#' @param data A data.frame as used for [score()] #' -#' @return A data.table with NA values in `true_value` or `prediction` removed. 
-#' -#' @importFrom data.table as.data.table -#' -#' @keywords internal - -check_clean_data <- function(data) { - if (!is.data.frame(data)) { - stop("Input should be a data.frame or similar") - } - data <- as.data.table(data) - - # make sure necessary columns are present - if (!all(c("true_value", "prediction") %in% colnames(data))) { - stop("Data needs to have columns called `true_value` and `prediction`") - } +#' @return A data.frame with all rows for which a duplicate forecast was found +#' @export +#' @keywords check-forecasts +#' @examples +#' example <- rbind(example_quantile, example_quantile[1000:1010]) +#' find_duplicates(example) - # remove rows where prediction or true value are NA - if (anyNA(data$true_value)) { - warning("Some values for `true_value` are NA in the data provided") - } - data <- data[!is.na(true_value)] +find_duplicates <- function(data) { + type <- c("sample", "quantile")[c("sample", "quantile") %in% colnames(data)] + forecast_unit <- get_forecast_unit(data) - if (anyNA(data$prediction)) { - warning("Some values for `prediction` are NA in the data provided") - } - data <- data[!is.na(prediction)] - if (nrow(data) == 0) { - stop("After removing all NA true values and predictions, there were no observations left") - } - return(data) + data <- as.data.table(data) + data[, InternalDuplicateCheck := .N, by = c(forecast_unit, type)] + out <- data[InternalDuplicateCheck > 1] + out[, InternalDuplicateCheck := NULL] + return(out[]) } + diff --git a/R/correlations.R b/R/correlations.R new file mode 100644 index 000000000..e08417ead --- /dev/null +++ b/R/correlations.R @@ -0,0 +1,118 @@ +#' @title Correlation Between Metrics +#' +#' @description +#' Calculate the correlation between different metrics for a data.frame of +#' scores as produced by [score()]. +#' +#' @param metrics A character vector with the metrics to show. If set to +#' `NULL` (default), all metrics present in `scores` will +#' be shown +#' @inheritParams avail_forecasts +#' @inheritParams pairwise_comparison +#' @return A data.table with correlations for the different metrics +#' @importFrom data.table setDT +#' @importFrom stats cor na.omit +#' @export +#' @keywords scoring +#' @examples +#' scores <- score(example_quantile) +#' correlation(scores) +correlation <- function(scores, + metrics = NULL) { + metrics <- check_metrics(metrics) + + # check metrics are present + metrics <- names(scores)[names(scores) %in% metrics] + + # if quantile column is present, throw a warning + if ("quantile" %in% names(scores)) { + warning("There is a column called 'quantile' in the scores. Usually, you should call 'summarise_scores()' to summarise over quantiles and obtain one score per forecast before calculating correlations. 
You can ignore this warning if you know what you're doing.") + } + + # remove all non metrics and non-numeric columns + df <- scores[, .SD, .SDcols = sapply( + scores, + function(x) { + (all(is.numeric(x))) && all(is.finite(x)) + } + )] + df <- df[, .SD, .SDcols = names(df) %in% metrics] + + # define correlation matrix + cor_mat <- round(cor(as.matrix(df)), 2) + + correlations <- setDT(as.data.frame((cor_mat)), + keep.rownames = TRUE + )[, metric := rn][, rn := NULL] + + return(correlations[]) +} + +# define function to obtain upper triangle of matrix +get_lower_tri <- function(cormat) { + cormat[lower.tri(cormat)] <- NA + return(cormat) +} + +#' @title Plot Correlation Between Metrics +#' +#' @description +#' Plots a heatmap of correlations between different metrics +#' +#' @param correlations A data.table of correlations between scores as produced +#' by [correlation()]. +#' @return A ggplot2 object showing a coloured matrix of correlations +#' between metrics +#' @importFrom ggplot2 ggplot geom_tile geom_text aes scale_fill_gradient2 +#' element_text labs coord_cartesian theme element_blank +#' @importFrom data.table setDT melt +#' @export +#' @examples +#' scores <- score(example_quantile) +#' correlations <- correlation(scores) +#' plot_correlation(correlations) +plot_correlation <- function(correlations) { + + metrics <- names(correlations)[names(correlations) %in% available_metrics()] + + lower_triangle <- get_lower_tri(correlations[, .SD, .SDcols = metrics]) + rownames(lower_triangle) <- colnames(lower_triangle) + + # get plot data.frame + plot_df <- data.table::as.data.table(lower_triangle)[, metric := metrics] + plot_df <- na.omit(data.table::melt(plot_df, id.vars = "metric")) + + # refactor levels according to the metrics + plot_df[, metric := factor(metric, levels = metrics)] + plot_df[, variable := factor(variable, rev(metrics))] + + plot <- ggplot(plot_df, aes( + x = variable, y = metric, + fill = value + )) + + geom_tile( + color = "white", + width = 0.97, height = 0.97 + ) + + geom_text(aes(y = metric, label = value)) + + scale_fill_gradient2( + low = "steelblue", mid = "white", + high = "salmon", + name = "Correlation", + breaks = c(-1, -0.5, 0, 0.5, 1) + ) + + theme_scoringutils() + + theme( + axis.text.x = element_text( + angle = 90, vjust = 1, + hjust = 1 + ), + panel.grid.major.y = element_blank(), + panel.grid.minor.y = element_blank(), + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank() + ) + + labs(x = "", y = "") + + coord_cartesian(expand = FALSE) + return(plot) +} diff --git a/R/data.R b/R/data.R index 1d11cfef1..aed5b6c27 100644 --- a/R/data.R +++ b/R/data.R @@ -1,155 +1,72 @@ #' Quantile Example Data #' -#' A data set with predictions for different quantities relevant in the -#' 2020 UK Covid-19 epidemic. +#' A data set with predictions for COVID-19 cases and deaths submitted to the +#' European Forecast Hub. 
#' #' @format A data frame with #' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{value_desc}{long form description of the prediction target} +#' \item{location}{the country for which a prediction was made} +#' \item{target_end_date}{the date for which a prediction was made} +#' \item{target_type}{the target to be predicted (cases or deaths)} #' \item{true_value}{true observed values} -#' \item{model}{name of the model that generated the forecasts} -#' \item{creation_date}{date on which the forecast was made} +#' \item{location_name}{name of the country for which a prediction was made} +#' \item{forecast_date}{the date on which a prediction was made} #' \item{quantile}{quantile of the corresponding prediction} -#' \item{prediction}{quantile predictions} -#' \item{horizon}{forecast horizon in days} -#' -#' } -"quantile_example_data" - - -#' Range Forecast Example Data (Long Format) -#' -#' A data set with predictions with different interval ranges relevant in the -#' 2020 UK Covid-19 epidemic. -#' -#' @format A data frame with: -#' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{value_desc}{long form description of the prediction target} -#' \item{true_value}{true observed values} -#' \item{model}{name of the model that generated the forecasts} -#' \item{creation_date}{date on which the forecast was made} -#' \item{prediction}{value for the lower or upper bound of the given prediction interval} -#' \item{horizon}{forecast horizon in days} -#' \item{boundary}{indicate lower or upper bound of prediction interval} -#' \item{range}{range of the corresponding prediction interval} -#' } -"range_example_data_long" - - - -#' Range Forecast Example Data (Wide Format) -#' -#' A data set with predictions with different interval ranges relevant in the -#' 2020 UK Covid-19 epidemic. 
-#' -#' @format A data frame with: -#' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{value_desc}{long form description of the prediction target} -#' \item{true_value}{true observed values} -#' \item{model}{name of the model that generated the forecasts} -#' \item{creation_date}{date on which the forecast was made} -#' \item{horizon}{forecast horizon in days} -#' \item{lower_0}{prediction for the lower bound of the 0% interval range (median)} -#' \item{lower_10}{prediction for the lower bound of the 10% interval range} -#' \item{lower_20}{prediction for the lower bound of the 20% interval range} -#' \item{lower_30}{prediction for the lower bound of the 30% interval range} -#' \item{lower_40}{prediction for the lower bound of the 40% interval range} -#' \item{lower_50}{prediction for the lower bound of the 50% interval range} -#' \item{lower_60}{prediction for the lower bound of the 60% interval range} -#' \item{lower_70}{prediction for the lower bound of the 70% interval range} -#' \item{lower_80}{prediction for the lower bound of the 80% interval range} -#' \item{lower_90}{prediction for the lower bound of the 90% interval range} -#' \item{upper_0}{prediction for the upper bound of the 0% interval range} -#' \item{upper_10}{prediction for the upper bound of the 1% interval range} -#' \item{upper_20}{prediction for the upper bound of the 20% interval range} -#' \item{upper_30}{prediction for the upper bound of the 30% interval range} -#' \item{upper_40}{prediction for the upper bound of the 40% interval range} -#' \item{upper_50}{prediction for the upper bound of the 50% interval range} -#' \item{upper_60}{prediction for the upper bound of the 60% interval range} -#' \item{upper_70}{prediction for the upper bound of the 70% interval range} -#' \item{upper_80}{prediction for the upper bound of the 80% interval range} -#' \item{upper_90}{prediction for the upper bound of the 90% interval range} -#' } -"range_example_data_wide" - - -#' Range Forecast Example Data (Semi-Wide Format) -#' -#' A data set with predictions with different interval ranges relevant in the -#' 2020 UK Covid-19 epidemic. -#' -#' @format A data frame with 5,419 rows and 12 columns: -#' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{value_desc}{long form description of the prediction target} -#' \item{true_value}{true observed values} +#' \item{prediction}{predicted value} #' \item{model}{name of the model that generated the forecasts} -#' \item{creation_date}{date on which the forecast was made} -#' \item{horizon}{forecast horizon in days} -#' \item{range}{range of the corresponding prediction interval} -#' \item{lower}{prediction for the lower bound of the corresponding interval} -#' \item{upper}{prediction for the upper bound of the corresponding interval} +#' \item{horizon}{forecast horizon in weeks} #' } -"range_example_data_semi_wide" +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +"example_quantile" #' Continuous Forecast Example Data #' -#' A data set with continuous predictions in a sample-based format relevant in the -#' 2020 UK Covid-19 epidemic. 
+#' A data set with continuous predictions for COVID-19 cases and deaths +#' constructed from data submitted to the European Forecast Hub. #' #' @format A data frame with 13,429 rows and 10 columns: #' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{value_desc}{long form description of the prediction target} +#' \item{location}{the country for which a prediction was made} +#' \item{target_end_date}{the date for which a prediction was made} +#' \item{target_type}{the target to be predicted (cases or deaths)} +#' \item{true_value}{true observed values} +#' \item{location_name}{name of the country for which a prediction was made} +#' \item{forecast_date}{the date on which a prediction was made} #' \item{model}{name of the model that generated the forecasts} -#' \item{creation_date}{date on which the forecast was made} -#' \item{horizon}{forecast horizon in days} -#' \item{prediction}{prediction value for the corresponding sample} +#' \item{horizon}{forecast horizon in weeks} +#' \item{prediction}{predicted value} #' \item{sample}{id for the corresponding sample} -#' \item{true_value}{true observed values} #' } -"continuous_example_data" +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +"example_continuous" #' Integer Forecast Example Data #' -#' A data set with integer predictions in a sample-based format relevant in the -#' 2020 UK Covid-19 epidemic. +#' A data set with integer predictions for COVID-19 cases and deaths +#' constructed from data submitted to the European Forecast Hub. #' #' @format A data frame with 13,429 rows and 10 columns: #' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{value_desc}{long form description of the prediction target} +#' \item{location}{the country for which a prediction was made} +#' \item{target_end_date}{the date for which a prediction was made} +#' \item{target_type}{the target to be predicted (cases or deaths)} +#' \item{true_value}{true observed values} +#' \item{location_name}{name of the country for which a prediction was made} +#' \item{forecast_date}{the date on which a prediction was made} #' \item{model}{name of the model that generated the forecasts} -#' \item{creation_date}{date on which the forecast was made} -#' \item{horizon}{forecast horizon in days} -#' \item{prediction}{prediction value for the corresponding sample} +#' \item{horizon}{forecast horizon in weeks} +#' \item{prediction}{predicted value} #' \item{sample}{id for the corresponding sample} -#' \item{true_value}{true observed values} #' } -"integer_example_data" +"example_integer" #' Binary Forecast Example Data #' -#' A data set with (constructed) binary predictions relevant in the -#' 2020 UK Covid-19 epidemic. +#' A data set with binary predictions for COVID-19 cases and deaths constructed +#' from data submitted to the European Forecast Hub. #' #' Predictions in the data set were constructed based on the continuous example #' data by looking at the number of samples below the mean prediction. 
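As a rough illustration of that construction (a hedged sketch with made-up column names and values, not the code actually used to build the example data): a binary probability forecast can be obtained from sample-based forecasts by taking, for each forecast unit, the proportion of predictive samples that fall below the mean prediction.

# minimal sketch, assuming a data.table of predictive samples where the
# forecast unit is given by illustrative columns `model` and `target_type`
library(data.table)
samples <- data.table(
  model = "example-model",
  target_type = "Cases",
  sample = 1:100,
  prediction = rnorm(100, mean = 50, sd = 10)
)
# proportion of samples below the mean prediction, per forecast unit
binary <- samples[
  , .(prediction = mean(prediction < mean(prediction))),
  by = .(model, target_type)
]
binary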
@@ -160,56 +77,59 @@ #' #' @format A data frame with 346 rows and 10 columns: #' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{value_desc}{long form description of the prediction target} -#' \item{model}{name of the model that generated the forecasts} -#' \item{creation_date}{date on which the forecast was made} -#' \item{horizon}{forecast horizon in days} -#' \item{prediction}{probability prediction that true value would be 1} +#' \item{location}{the country for which a prediction was made} +#' \item{location_name}{name of the country for which a prediction was made} +#' \item{target_end_date}{the date for which a prediction was made} +#' \item{target_type}{the target to be predicted (cases or deaths)} #' \item{true_value}{true observed values} +#' \item{forecast_date}{the date on which a prediction was made} +#' \item{model}{name of the model that generated the forecasts} +#' \item{horizon}{forecast horizon in weeks} +#' \item{prediction}{predicted value} #' } -"binary_example_data" +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +"example_binary" #' Quantile Example Data - Forecasts only #' -#' A data set with predictions for different quantities relevant in the -#' 2020 UK Covid-19 epidemic, but no true_values +#' A data set with quantile predictions for COVID-19 cases and deaths +#' submitted to the European Forecast Hub. #' #' @format A data frame with 7,581 rows and 9 columns: #' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{model}{name of the model that generated the forecasts} -#' \item{creation_date}{date on which the forecast was made} +#' \item{location}{the country for which a prediction was made} +#' \item{target_end_date}{the date for which a prediction was made} +#' \item{target_type}{the target to be predicted (cases or deaths)} +#' \item{forecast_date}{the date on which a prediction was made} #' \item{quantile}{quantile of the corresponding prediction} -#' \item{prediction}{quantile predictions} -#' \item{value_desc}{long form description of the prediction target} -#' \item{horizon}{forecast horizon in days} -#' +#' \item{prediction}{predicted value} +#' \item{model}{name of the model that generated the forecasts} +#' \item{horizon}{forecast horizon in weeks} #' } +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_quantile_forecasts_only" #' Truth data only #' -#' A data set with truth data for different quantities relevant in the -#' 2020 UK Covid-19 epidemic, but no predictions +#' A data set with truth values for COVID-19 cases and deaths +#' submitted to the European Forecast Hub. 
#' #' @format A data frame with 140 rows and 5 columns: #' \describe{ -#' \item{value_date}{the date for which a prediction was made} -#' \item{value_type}{the target to be predicted (short form)} -#' \item{geography}{the region for which a prediction was made} -#' \item{value_desc}{long form description of the prediction target} +#' \item{location}{the country for which a prediction was made} +#' \item{target_end_date}{the date for which a prediction was made} +#' \item{target_type}{the target to be predicted (cases or deaths)} #' \item{true_value}{true observed values} -#' +#' \item{location_name}{name of the country for which a prediction was made} #' } -"example_truth_data_only" - - - +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +"example_truth_only" +#' Summary information for selected metrics +#' +#' A data set with summary information on selected metrics implemented in +#' \pkg{scoringutils} +#' @keywords info +"metrics_summary" diff --git a/R/eval_forecasts.R b/R/eval_forecasts.R deleted file mode 100644 index 0390573c3..000000000 --- a/R/eval_forecasts.R +++ /dev/null @@ -1,404 +0,0 @@ -#' @title Evaluate forecasts -#' -#' @description The function `eval_forecasts` is an easy to use wrapper -#' of the lower level functions in the \pkg{scoringutils} package. -#' It can be used to score probabilistic or quantile forecasts of -#' continuous, integer-valued or binary variables. -#' -#' @details the following metrics are used where appropriate: -#' \itemize{ -#' \item {Interval Score} for quantile forecasts. Smaller is better. See -#' [interval_score()] for more information. By default, the -#' weighted interval score is used. -#' \item {Brier Score} for a probability forecast of a binary outcome. -#' Smaller is better. See [brier_score()] for more information. -#' \item {aem} Absolute error of the median prediction -#' \item {Bias} 0 is good, 1 and -1 are bad. -#' See [bias()] for more information. -#' \item {Sharpness} Smaller is better. See [sharpness()] for more -#' information. -#' \item {Calibration} represented through the p-value of the -#' Anderson-Darling test for the uniformity of the Probability Integral -#' Transformation (PIT). For integer valued forecasts, this p-value also -#' has a standard deviation. Larger is better. -#' See [pit()] for more information. -#' \item {DSS} Dawid-Sebastiani-Score. Smaller is better. -#' See [dss()] for more information. -#' \item {CRPS} Continuous Ranked Probability Score. Smaller is better. -#' See [crps()] for more information. -#' \item {Log Score} Smaller is better. Only for continuous forecasts. -#' See [logs()] for more information. -#' } -#' -#' @param data A data.frame or data.table with the predictions and observations. -#' Note: it is easiest to have a look at the example files provided in the -#' package and in the examples below. -#' The following columns need to be present: -#' \itemize{ -#' \item `true_value` - the true observed values -#' \item `prediction` - predictions or predictive samples for one -#' true value. (You only don't need to provide a prediction column if -#' you want to score quantile forecasts in a wide range format.)} -#' For integer and continuous forecasts a `sample` column is needed: -#' \itemize{ -#' \item `sample` - an index to identify the predictive samples in the -#' prediction column generated by one model for one true value. 
Only -#' necessary for continuous and integer forecasts, not for -#' binary predictions.} -#' For quantile forecasts the data can be provided in variety of formats. You -#' can either use a range-based format or a quantile-based format. (You can -#' convert between formats using [quantile_to_range_long()], -#' [range_long_to_quantile()], -#' [sample_to_range_long()], -#' [sample_to_quantile()]) -#' For a quantile-format forecast you should provide: -#' - `prediction`: prediction to the corresponding quantile -#' - `quantile`: quantile to which the prediction corresponds -#' For a range format (long) forecast you need -#' - `prediction`: the quantile forecasts -#' - `boundary`: values should be either "lower" or "upper", depending -#' on whether the prediction is for the lower or upper bound of a given range -#' - `range` the range for which a forecast was made. For a 50% interval -#' the range should be 50. The forecast for the 25% quantile should have -#' the value in the `prediction` column, the value of `range` -#' should be 50 and the value of `boundary` should be "lower". -#' If you want to score the median (i.e. `range = 0`), you still -#' need to include a lower and an upper estimate, so the median has to -#' appear twice. -#' Alternatively you can also provide the format in a wide range format. -#' This format needs: -#' - pairs of columns called something like 'upper_90' and 'lower_90', -#' or 'upper_50' and 'lower_50', where the number denotes the interval range. -#' For the median, you need to provide columns called 'upper_0' and 'lower_0' -#' @param by character vector of columns to group scoring by. This should be the -#' lowest level of grouping possible, i.e. the unit of the individual -#' observation. This is important as many functions work on individual -#' observations. If you want a different level of aggregation, you should use -#' `summarise_by` to aggregate the individual scores. -#' Also not that the pit will be computed using `summarise_by` -#' instead of `by` -#' @param summarise_by character vector of columns to group the summary by. By -#' default, this is equal to `by` and no summary takes place. -#' But sometimes you may want to to summarise -#' over categories different from the scoring. -#' `summarise_by` is also the grouping level used to compute -#' (and possibly plot) the probability integral transform(pit). -#' @param metrics the metrics you want to have in the output. If `NULL` (the -#' default), all available metrics will be computed. For a list of available -#' metrics see [available_metrics()] -#' @param quantiles numeric vector of quantiles to be returned when summarising. -#' Instead of just returning a mean, quantiles will be returned for the -#' groups specified through `summarise_by`. By default, no quantiles are -#' returned. -#' @param sd if `TRUE` (the default is `FALSE`) the standard deviation of all -#' metrics will be returned when summarising. -#' @param pit_plots if `TRUE` (not the default), pit plots will be returned. For -#' details see [pit()]. -#' @param interval_score_arguments list with arguments for the calculation of -#' the interval score. These arguments get passed down to -#' `interval_score`, except for the argument `count_median_twice` that -#' controls how the interval scores for different intervals are summed up. This -#' should be a logical (default is `FALSE`) that indicates whether or not -#' to count the median twice when summarising. 
This would conceptually treat the -#' median as a 0% prediction interval, where the median is the lower as well as -#' the upper bound. The alternative is to treat the median as a single quantile -#' forecast instead of an interval. The interval score would then -#' be better understood as an average of quantile scores.) -#' @param summarised Summarise arguments (i.e. take the mean per group -#' specified in group_by. Default is `TRUE.` -#' @param verbose print out additional helpful messages (default is `TRUE`) -#' @param forecasts data.frame with forecasts, that should follow the same -#' general guidelines as the `data` input. Argument can be used to supply -#' forecasts and truth data independently. Default is `NULL`. -#' @param truth_data data.frame with a column called `true_value` to be merged -#' with `forecasts` -#' @param merge_by character vector with column names that `forecasts` and -#' `truth_data` should be merged on. Default is `NULL` and merge will be -#' attempted automatically. -#' @param compute_relative_skill logical, whether or not to compute relative -#' performance between models. If `TRUE` (default is `FALSE`), then a column called -#' 'model' must be present in the input data. For more information on -#' the computation of relative skill, see [pairwise_comparison()]. -#' Relative skill will be calculated for the aggregation level specified in -#' `summarise_by`. -#' @param rel_skill_metric character string with the name of the metric for which -#' a relative skill shall be computed. If equal to 'auto' (the default), then -#' one of interval score, crps or brier score will be used where appropriate -#' @param baseline character string with the name of a model. If a baseline is -#' given, then a scaled relative skill with respect to the baseline will be -#' returned. By default (`NULL`), relative skill will not be scaled with -#' respect to a baseline model. -#' -#' @return A data.table with appropriate scores. For binary predictions, -#' the Brier Score will be returned, for quantile predictions the interval -#' score, as well as adapted metrics for calibration, sharpness and bias. -#' For integer forecasts, Sharpness, Bias, DSS, CRPS, LogS, and -#' pit_p_val (as an indicator of calibration) are returned. For integer -#' forecasts, pit_sd is returned (to account for the randomised PIT), -#' but no Log Score is returned (the internal estimation relies on a -#' kernel density estimate which is difficult for integer-valued forecasts). -#' If `summarise_by` is specified differently from `by`, -#' the average score per summary unit is returned. -#' If specified, quantiles and standard deviation of scores can also be returned -#' when summarising. 
-#' -#' @importFrom data.table ':=' as.data.table -#' @importFrom methods hasArg -#' -#' @examples -#' ## Probability Forecast for Binary Target -#' binary_example <- data.table::setDT(scoringutils::binary_example_data) -#' eval <- scoringutils::eval_forecasts(binary_example, -#' summarise_by = c("model"), -#' quantiles = c(0.5), sd = TRUE, -#' verbose = FALSE) -#' -#' ## Quantile Forecasts -#' # wide format example (this examples shows usage of both wide formats) -#' range_example_wide <- data.table::setDT(scoringutils::range_example_data_wide) -#' range_example <- scoringutils::range_wide_to_long(range_example_wide) -#' wide2 <- data.table::setDT(scoringutils::range_example_data_semi_wide) -#' range_example <- scoringutils::range_wide_to_long(wide2) -#' example <- scoringutils::range_long_to_quantile(range_example) -#' eval <- scoringutils::eval_forecasts(example, -#' summarise_by = "model", -#' quantiles = c(0.05, 0.95), -#' sd = TRUE) -#' eval <- scoringutils::eval_forecasts(example) -#' -#' -#' ## Integer Forecasts -#' integer_example <- data.table::setDT(scoringutils::integer_example_data) -#' eval <- scoringutils::eval_forecasts(integer_example, -#' summarise_by = c("model"), -#' quantiles = c(0.1, 0.9), -#' sd = TRUE, -#' pit_plots = TRUE) -#' eval <- scoringutils::eval_forecasts(integer_example) -#' -#' ## Continuous Forecasts -#' continuous_example <- data.table::setDT(scoringutils::continuous_example_data) -#' eval <- scoringutils::eval_forecasts(continuous_example) -#' eval <- scoringutils::eval_forecasts(continuous_example, -#' quantiles = c(0.5, 0.9), -#' sd = TRUE, -#' summarise_by = c("model")) -#' -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @references Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ -#' (2019) Assessing the performance of real-time epidemic forecasts: A -#' case study of Ebola in the Western Area region of Sierra Leone, 2014-15. -#' PLoS Comput Biol 15(2): e1006785. -#' @export - -eval_forecasts <- function(data = NULL, - by = NULL, - summarise_by = by, - metrics = NULL, - quantiles = c(), - sd = FALSE, - interval_score_arguments = list(weigh = TRUE, - count_median_twice = FALSE, - separate_results = TRUE), - pit_plots = FALSE, - summarised = TRUE, - verbose = TRUE, - forecasts = NULL, - truth_data = NULL, - merge_by = NULL, - compute_relative_skill = FALSE, - rel_skill_metric = "auto", - baseline = NULL) { - - - # preparations --------------------------------------------------------------- - # check data argument is provided - if (is.null(data) && (is.null(truth_data) | is.null(forecasts))) { - stop("need arguments 'data' in function 'eval_forecasts()', or alternatively 'forecasts' and 'truth_data'") - } - if (is.null(data)) { - data <- merge_pred_and_obs(forecasts, truth_data, by = merge_by) - if (nrow(data) == 0) { - if (verbose) { - warning("After attempting to merge, only an empty data.table was left") - } - return(data) - } - } - - # do a copy to avoid that the input may be altered in any way. - data <- data.table::as.data.table(data) - - # error handling for relative skill computation - # should probably wrap this in a function warn_if_verbose(warning, verbose) - if (compute_relative_skill) { - if (!("model" %in% colnames(data))) { - warning("to compute relative skills, there must column present called 'model'. 
Relative skill will not be computed") - compute_relative_skill <- FALSE - } - models <- unique(data$model) - if (length(models) < 2 + (!is.null(baseline))) { - if (verbose) { - warning("you need more than one model non-baseline model to make model comparisons. Relative skill will not be computed") - } - compute_relative_skill <- FALSE - } - if (!is.null(baseline) && !(baseline %in% models)) { - if (verbose){ - warning("The baseline you provided for the relative skill is not one of the models in the data. Relative skill will not be computed") - } - compute_relative_skill <- FALSE - } - if (rel_skill_metric != "auto" && !(rel_skill_metric %in% available_metrics())) { - warning("argument 'rel_skill_metric' must either be 'auto' or one of the metrics that can be computed. Relative skill will not be computed") - compute_relative_skill <- FALSE - } - } - - # check that everything is unique - unique_data <- unique(data) - if (nrow(unique_data) != nrow(data)) { - data <- unique_data - if(verbose) { - warning("There are duplicate rows in data. These were removed") - } - } - - # check and remove any rows where the true value is missing - if (any(is.na(data$true_value))) { - if(verbose) { - warning("There are NA values in the true values provided. These will be removed") - } - } - data <- data[!is.na(true_value)] - - # obtain a value for by if nothing was provided by the user - if (is.null(by)) { - protected_columns <- c("prediction", "true_value", "sample", "quantile", - "range", "boundary") - by <- setdiff(colnames(data), protected_columns) - - if (is.null(summarise_by)) { - summarise_by <- by - } - } - - # check that the arguments in by and summarise_by are actually present - if (!all(c(by, summarise_by) %in% c(colnames(data), "range", "quantile"))) { - not_present <- setdiff(unique(c(by, summarise_by)), - c(colnames(data), "range", "quantile")) - msg <- paste0("The following items in `by` or `summarise_by` are not", - "valid column names of the data: '", - paste(not_present, collapse = ", "), - "'. 
Check and run `eval_forecasts()` again") - stop(msg) - } - - # check metrics to be computed - available_metrics <- available_metrics() - if (is.null(metrics)) { - metrics <- available_metrics - } else { - if (!all(metrics %in% available_metrics)) { - if (verbose) { - msg <- paste("The following metrics are not currently implemented and", - "will not be computed:", - paste(setdiff(metrics, available_metrics), collapse = ", ")) - } - warning(msg) - } - } - - - # check prediction and target type ------------------------------------------- - if (any(grepl("lower", names(data))) | "boundary" %in% names(data) | - "quantile" %in% names(data) | "range" %in% names(data)) { - prediction_type <- "quantile" - } else if (isTRUE(all.equal(data$prediction, as.integer(data$prediction)))) { - prediction_type <- "integer" - } else { - prediction_type <- "continuous" - } - - if (isTRUE(all.equal(data$true_value, as.integer(data$true_value)))) { - if (all(data$true_value %in% c(0,1)) && all(data$prediction >= 0) && all(data$prediction <= 1)) { - target_type = "binary" - } else { - target_type = "integer" - } - } else { - target_type = "continuous" - } - - # remove any rows where the prediction is missing ---------------------------- - data <- data[!is.na(prediction)] - if (nrow(data) == 0) { - if (verbose) { - message("After removing all NA true values and predictions, there were no observations left") - } - return(data) - } - - - # Score binary predictions --------------------------------------------------- - if (target_type == "binary") { - res <- eval_forecasts_binary(data = data, - by = by, - summarise_by = summarise_by, - metrics = metrics, - quantiles = quantiles, - sd = sd, - summarised = summarised, - verbose = verbose) - return(res) - } - - # Score quantile predictions ------------------------------------------------- - if (prediction_type == "quantile") { - res <- eval_forecasts_quantile(data = data, - by = by, - summarise_by = summarise_by, - metrics = metrics, - quantiles = quantiles, - sd = sd, - pit_plots = pit_plots, - interval_score_arguments = interval_score_arguments, - summarised = summarised, - verbose = verbose, - compute_relative_skill = compute_relative_skill, - rel_skill_metric = rel_skill_metric, - baseline = baseline) - return(res) - } - - - # Score integer or continuous predictions ------------------------------------ - if (prediction_type %in% c("integer", "continuous")) { - - # compute scores ----------------------------------------------------------- - res <- eval_forecasts_sample(data = data, - by = by, - summarise_by = summarise_by, - metrics = metrics, - prediction_type = prediction_type, - quantiles = quantiles, - sd = sd, - pit_plots = pit_plots, - summarised = summarised, - verbose = verbose) - return(res) - } -} - - - - - - - - - - - - - diff --git a/R/eval_forecasts_binary.R b/R/eval_forecasts_binary.R deleted file mode 100644 index 28584906d..000000000 --- a/R/eval_forecasts_binary.R +++ /dev/null @@ -1,52 +0,0 @@ -#' @title Evaluate forecasts in a Binary Format -#' -#' @inheritParams eval_forecasts -#' @return A data.table with appropriate scores. 
For more information see -#' [eval_forecasts()] -#' -#' @importFrom data.table ':=' -#' -#' @examples -#' # Probability Forecast for Binary Target -#' binary_example <- data.table::setDT(scoringutils::binary_example_data) -#' eval <- scoringutils::eval_forecasts(data = binary_example, -#' summarise_by = c("model"), -#' quantiles = c(0.5), sd = TRUE, -#' verbose = FALSE) -#' -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} - -eval_forecasts_binary <- function(data, - by, - summarise_by, - metrics, - quantiles, - sd, - summarised, - verbose){ - - res <- data[, "brier_score" := scoringutils::brier_score(true_value, prediction), - by = by] - - if (summarised) { - # add quantiles - if (!is.null(quantiles)) { - res <- add_quantiles(res, "brier_score", quantiles, by = summarise_by) - } - - # add standard deviation - if (sd) { - res <- add_sd(res, "brier_score", by = c(summarise_by)) - } - - # summarise by taking the mean over all relevant columns - res <- res[, lapply(.SD, mean, na.rm = TRUE), - .SDcols = colnames(res) %like% "brier", - by = summarise_by] - - } - - return(res[]) -} - - diff --git a/R/eval_forecasts_continuous_integer.R b/R/eval_forecasts_continuous_integer.R deleted file mode 100644 index 5ba76b5f4..000000000 --- a/R/eval_forecasts_continuous_integer.R +++ /dev/null @@ -1,170 +0,0 @@ -#' @title Evaluate forecasts in a Sample-Based Format (Integer or Continuous) -#' -#' -#' @inheritParams eval_forecasts -#' @param prediction_type character, should be either "continuous" or "integer" -#' -#' @return A data.table with appropriate scores. For more information see -#' [eval_forecasts()] -#' -#' @importFrom data.table ':=' as.data.table rbindlist %like% -#' -#' -#' @examples -#' -#' ## Integer Forecasts -#' integer_example <- data.table::setDT(scoringutils::integer_example_data) -#' eval <- scoringutils::eval_forecasts(integer_example, -#' summarise_by = c("model"), -#' quantiles = c(0.1, 0.9), -#' sd = TRUE, -#' pit_plots = TRUE) -#' eval <- scoringutils::eval_forecasts(integer_example) -#' -#' ## Continuous Forecasts -#' continuous_example <- data.table::setDT(scoringutils::continuous_example_data) -#' eval <- scoringutils::eval_forecasts(continuous_example)#' -#' -#' eval <- scoringutils::eval_forecasts(continuous_example, -#' quantiles = c(0.5, 0.9), -#' sd = TRUE, -#' summarise_by = c("model")) -#' -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @inherit eval_forecasts references - - -eval_forecasts_sample <- function(data, - by, - summarise_by, - metrics, - prediction_type, - quantiles, - sd, - pit_plots, - summarised, - verbose) { - - if (missing(prediction_type)) { - if (isTRUE(all.equal(data$prediction, as.integer(data$prediction)))) { - prediction_type <- "integer" - } else { - prediction_type <- "continuous" - } - } - - # calculate scores ----------------------------------------------------------- - # sharpness - if ("sharpness" %in% metrics) { - data[, sharpness := scoringutils::sharpness(t(prediction)), by = c(by)] - } - # bias - if ("bias" %in% metrics) { - data[, bias := scoringutils::bias(unique(true_value), - t(prediction)), by = c(by)] - } - # DSS - if ("dss" %in% metrics) { - data[, dss := scoringutils::dss(unique(true_value), - t(prediction)), by = c(by)] - } - # CRPS - if ("crps" %in% metrics) { - data[, crps := scoringutils::crps(unique(true_value), - t(prediction)), by = c(by)] - } - # Log Score - if ("log_score" %in% metrics) { - # only compute if prediction type is continuous - if (prediction_type == "continuous") { - data[, log_score := 
scoringutils::logs(unique(true_value), - t(prediction)), by = c(by)] - } - } - # coverage - if ("coverage" %in% metrics) { - } - - - # Compute PIT if specified --------------------------------------------------- - if (any(grepl("pit", metrics)) || pit_plots) { - - # check if by == summarise_by - in that case no pit values can be computed - if (identical(by, summarise_by)) { - data[, c("pit_p_val", "pit_sd") := NA] - if (verbose) { - message("In order to compute PIT values, 'summarise_by' must be different from 'by'") - } - } - - # if they are not identical, pit p-values can be computed - if (!identical(by, summarise_by)) { - # if plots are not desired, a quick way to do computation can be chosen - if (!pit_plots) { - data <- pit_df_fast(data, by = summarise_by) - } else { - # split data into chunks as determined by summarise_by, since we need to - # get one PIT per element of summarise_by - split_data <- split(data, by = summarise_by) - - # calculate pit for every element of the split data.frame - pits <- lapply(split_data, - FUN = pit_df, plot = pit_plots) - - # extract data frames with added p-values. Bind data together again - data_with_pit_values <- extract_from_list(pits, "data") - data <- data.table::rbindlist(data_with_pit_values) - - if (pit_plots) { - # extract pit histograms if plots are desired - pit_histograms <- extract_from_list(pits, "hist_PIT") - - # add another histogram for the entire data set - pit_histograms[["overall_pit"]] <- scoringutils::pit_df(data)$hist_PIT - } - - } - } - - # remove sd if not asked for - if (!sd) { - data[, "pit_sd" := NULL] - } - } - - res <- data.table::copy(data) - - # make scores unique to avoid redundancy. - res <- res[, lapply(.SD, unique), - .SDcols = colnames(res) %like% "pit_|bias|sharpness|dss|crps|log_score|pit", - by = c(by)] - - # summarise output if desired ------------------------------------------------ - if (summarised) { - # add quantiles - if (!is.null(quantiles)) { - quantile_vars <- c("crps", "dss", "log_score", "pit_p_val", "bias", "sharpness") - res <- add_quantiles(res, quantile_vars, quantiles, by = c(summarise_by)) - } - - if (sd) { - # add standard deviations - sd_vars <- c("crps", "dss", "log_score", "bias", "sharpness") - res <- add_sd(res, sd_vars, by = c(summarise_by)) - } - - # take mean - res <- res[, lapply(.SD, mean, na.rm = TRUE), - .SDcols = colnames(res) %like% "pit_|bias|sharpness|dss|crps|log_score", - by = summarise_by] - } - - - # if pit_plots is TRUE, add the plots as an output --------------------------- - if (pit_plots) { - res <- list(scores = res, - pit_plots = pit_histograms) - } - - return(res[]) -} diff --git a/R/eval_forecasts_helper.R b/R/eval_forecasts_helper.R deleted file mode 100644 index e359ad89e..000000000 --- a/R/eval_forecasts_helper.R +++ /dev/null @@ -1,45 +0,0 @@ - -#' @title Add Quantiles to Predictions When Summarising -#' -#' @description -#' Helper function used within eval_forecasts -#' @param dt the data.table operated on -#' @param varnames names of the variables for which to calculate quantiles -#' @param quantiles the desired quantiles -#' @param by grouping variable in [eval_forecasts()] -#' -#' @return `data.table` with quantiles added -#' -#' @keywords internal -add_quantiles <- function(dt, varnames, quantiles, by) { - # make sure that the desired varnames are actually present - varnames <- intersect(varnames, colnames(dt)) - for (varname in varnames) { - dt[, paste0(varname, "_", quantiles) := as.list(quantile(get(varname), - probs = quantiles, - na.rm = TRUE)), - by = 
c(by)] - } - return(dt[]) -} - - -#' @title Add Standard Deviation to Predictions When Summarising -#' -#' @description -#' Helper function used within eval_forecasts -#' @param dt the data.table operated on -#' @param varnames names of the variables for which to calculate the sd -#' @param by grouping variable in [eval_forecasts()] -#' @importFrom data.table `%like%` -#' @return `data.table` with sd added -#' -#' @keywords internal -add_sd <- function(dt, varnames, by) { - # make sure that the desired varnames are actually present - varnames <- intersect(varnames, colnames(dt)) - for (varname in varnames) { - dt[, paste0(varname, "_sd") := sd(get(varname), na.rm = TRUE), by = by] - } - return(dt[]) -} diff --git a/R/eval_forecasts_quantile.R b/R/eval_forecasts_quantile.R deleted file mode 100644 index 5e7d25410..000000000 --- a/R/eval_forecasts_quantile.R +++ /dev/null @@ -1,189 +0,0 @@ -eval_forecasts_quantile <- function(data, - by, - summarise_by, - metrics, - quantiles, - sd, - pit_plots, - interval_score_arguments, - summarised, - verbose, - compute_relative_skill, - rel_skill_metric, - baseline) { - - # make sure to have both quantile as well as range format -------------------- - range_data <- scoringutils::quantile_to_range_long(data, - keep_quantile_col = FALSE) - # adds the range column to the quantile data set - quantile_data <- scoringutils::range_long_to_quantile(range_data, - keep_range_col = TRUE) - - - # to deal with point forecasts in a quantile format. This in effect adds - # a third column next to lower and upper after pivoting - range_data[is.na(range), boundary := "point"] - - range_data <- data.table::dcast(range_data, ... ~ boundary, - value.var = "prediction") - - # if we only score point forecasts, it may be true that there are no columns - # upper and lower in the data.frame. 
If so, these need to be added - if (!all(c("upper", "lower") %in% colnames(range_data))) { - range_data[, c("upper", "lower") := NA] - } - - # update interval_score arguments based on what was provided by user - interval_score_arguments <- update_list(defaults = list(weigh = TRUE, - count_median_twice = FALSE, - separate_results = TRUE), - optional = interval_score_arguments) - - # store separately, as this doesn't get passed down to interval_score() - count_median_twice <- interval_score_arguments$count_median_twice - interval_score_arguments$count_median_twice <- NULL - - # set up results data.table that will then be modified throughout ------------ - res <- data.table::copy(range_data) - - # calculate scores on range format ------------------------------------------- - if ("interval_score" %in% metrics) { - # compute separate results if desired - if (interval_score_arguments$separate_results) { - res <- res[, c("interval_score", - "sharpness", - "underprediction", - "overprediction") := do.call(scoringutils::interval_score, - c(list(true_value, - lower, - upper, - range), - interval_score_arguments))] - } else { - res <- res[, c("interval_score") := do.call(scoringutils::interval_score, - c(list(true_value, - lower, - upper, - range), - interval_score_arguments))] - } - # res[, .(unlist(interval_score)), by = setdiff(colnames(res), "interval_score")] - } - - # compute coverage for every single observation - if ("coverage" %in% metrics) { - res[, coverage := ifelse(true_value <= upper & true_value >= lower, 1, 0)] - res[, coverage_deviation := coverage - range/100] - } - - # compute bias - if ("bias" %in% metrics) { - res[, bias := quantile_bias(range = range, lower = lower, upper = upper, - true_value = unique(true_value)), - by = by] - } - - # score absolute error for point forecasts - # these are marked by an NA in range, and a numeric value for point - if (any(c("ae_point", "aem") %in% metrics)) { - if ("point" %in% colnames(res)) { - res[is.na(range) & is.numeric(point), - ae_point := abs_error(predictions = point, true_value)] - } - } - - - # calculate scores on quantile format ---------------------------------------- - # compute absolute error of the median - if ("aem" %in% metrics) { - quantile_data[, aem := ae_median_quantile(true_value, - prediction, - quantile, - verbose = verbose), - by = by] - } - - # compute quantile coverage based on quantile version - if ("quantile_coverage" %in% metrics) { - quantile_data[, quantile_coverage := (true_value <= prediction)] - } - - # merge metrics computed on quantile data (i.e. aem, quantile_coverage) back - # into metrics computed on range data. One important side effect of this is - # that it controls whether we count the median twice for the interval score - # (row is then duplicated) or only once. 
However, merge only needs to happen - # if we computed either the interval score or the aem or quantile coverage - if (any(c("aem", "interval_score", "quantile_coverage") %in% metrics)) { - # delete unnecessary columns before merging back - keep_cols <- unique(c(by, "quantile", "aem", "quantile_coverage", - "boundary", "range")) - delete_cols <- names(quantile_data)[!(names(quantile_data) %in% keep_cols)] - quantile_data[, eval(delete_cols) := NULL] - - # duplicate median column before merging if median is too be counted twice - # if this is false, then the res will have one entry for every quantile, - # which translates to two rows for every interval, but only one for the median - if (count_median_twice) { - median <- quantile_data[quantile == 0.5, ][, boundary := "upper"] - quantile_data <- data.table::rbindlist(list(quantile_data, median)) - } - - # merge back with other metrics - merge_cols <- setdiff(keep_cols, c("aem", "quantile_coverage", "quantile", - "boundary")) - # specify all.x = TRUE as the point forecasts got deleted when - # going from range to quantile above - res <- merge(res, quantile_data, by = merge_cols, all.x = TRUE) - } - - - if (compute_relative_skill) { - - relative_res <- add_rel_skill_to_eval_forecasts(unsummarised_scores = res, - rel_skill_metric = rel_skill_metric, - baseline = baseline, - by = by, - summarise_by = summarise_by, - verbose = verbose) - res <- merge(res, relative_res, by = by) - } - - # summarise scores if desired ------------------------------------------------ - if (summarised) { - # add quantiles for the scores - if (!is.null(quantiles)) { - res <- add_quantiles(res, - c("interval_score", "coverage", - "overprediction", "underprediction", - "coverage_deviation", "bias", "sharpness", "aem", - "ae_point"), - quantiles, - by = c(summarise_by)) - } - # add standard deviation - if (sd) { - res <- add_sd(res, - varnames = c("interval_score", "bias", "coverage", - "overprediction", "underprediction", - "coverage_deviation", "sharpness", "aem", - "ae_point"), - by = c(summarise_by)) - } - - # summarise by taking the mean and omitting unnecessary columns - res <- res[, lapply(.SD, mean, na.rm = TRUE), - by = c(summarise_by), - .SDcols = colnames(res) %like% - "coverage|bias|sharpness|coverage_deviation|interval_score|overprediction|underprediction|aem|ae_point|relative_skill|scaled_rel_skill"] - } - - # if neither quantile nor range are in summarise_by, remove coverage and quantile_coverage - if (!("range" %in% summarise_by) & ("coverage" %in% colnames(res))) { - res[, c("coverage") := NULL] - } - if (!("quantile" %in% summarise_by) & "quantile_coverage" %in% names(res)) { - res[, c("quantile_coverage") := NULL] - } - - return(res[]) -} diff --git a/R/input-check-helpers.R b/R/input-check-helpers.R new file mode 100644 index 000000000..6a9fc1c8f --- /dev/null +++ b/R/input-check-helpers.R @@ -0,0 +1,207 @@ +#' @title Check Prediction Input For Lower-level Scoring Functions +#' +#' @description +#' Helper function to check inputs for lower-level score functions. +#' @param predictions an object with predictions. 
Depending on whether +#' `class = "vector"` or `class = "matrix"` this can be either a vector of length +#' n (corresponding to the length of the true_values) or an nxN matrix of +#' predictive samples, n (number of rows) being the number of data points and +#' N (number of columns) the number of Monte Carlo samples +#' @param type character, one of "continuous" (default), "integer" or "binary" that +#' defines the type of the forecast +#' @param class character, either "vector" (default) or "matrix" that determines the +#' class the input has to correspond to +#' @inheritParams ae_median_sample +#' @return NULL +#' @keywords internal + +check_predictions <- function(predictions, + true_values = NULL, + type = c("continuous", "integer", "binary"), + class = c("vector", "matrix")) { + type <- match.arg(type) + class <- match.arg(class) + + if (missing(predictions)) { + stop("argument 'predictions' missing") + } + + if (class == "vector") { + if (!is.vector(predictions)) { + msg <- sprintf( + "'predictions' should be a vector. Instead `%s` was found", + class(predictions)[1] + ) + stop(msg) + } + if (!is.null(true_values) && length(predictions) != length(true_values)) { + msg <- sprintf( + "Mismatch: 'true_values' has length `%s`, but 'predictions' has length `%s`.", # nolint + length(true_values), length(predictions) + ) + stop(msg) + } + } + + if (class == "matrix") { + if (!is.matrix(predictions)) { + msg <- sprintf( + "'predictions' should be a matrix. Instead `%s` was found", + class(predictions)[1] + ) + stop(msg) + } + if (!is.null(true_values) && nrow(predictions) != length(true_values)) { + msg <- sprintf( + "Mismatch: 'true_values' has length `%s`, but 'predictions' has `%s` rows.", + length(true_values), nrow(predictions) + ) + stop(msg) + } + } + + if (type == "integer") { + if (isFALSE(all.equal(predictions, as.integer(predictions)))) { + warning("Prediction type should be 'integer', but some of the predictions are not integers") + } + } + + if (type == "binary") { + if (isFALSE(all(predictions >= 0) & all(predictions <= 1))) { + stop("For a binary forecast, all predictions should be probabilities between 0 and 1.") + } + } + + return(NULL) +} + + +#' @title Check Observed Value Input For Lower-level Scoring Functions +#' +#' @description +#' Helper function to check inputs for lower-level score functions. +#' @inheritParams check_predictions +#' @return NULL +#' @keywords internal + +check_true_values <- function(true_values, + type = c("continuous", "integer", "binary")) { + type <- match.arg(type) + if (missing(true_values)) { + stop("true_values argument is missing") + } + + if (type == "integer") { + if (isFALSE(all.equal(true_values, as.integer(true_values)))) { + stop("Some of the true_values are not integers") + } + } + + if (type == "binary") { + if (isFALSE(all(true_values %in% c(0, 1)))) { + stop("For a binary forecast, all true_values should be either 0 or 1.") + } + } +} + + +#' @title Check Variable is not NULL +#' +#' @description +#' Check whether a certain variable is not `NULL` and return the name of that +#' variable and the function call where the variable is missing. This function +#' is a helper function that should only be called within other functions. +#' @param ... The variables to check +#' @return The function returns `NULL`, but throws an error if the variable is +#' missing. +#' +#' @keywords internal +check_not_null <- function(...) { + vars <- list(...) 
+ varnames <- names(vars) + + for (i in seq_along(vars)) { + varname <- varnames[i] + if (is.null(vars[[i]])) { + calling_function <- deparse1(sys.calls()[[sys.nframe() - 1]]) + stop(paste0( + "variable '", varname, + "' is `NULL` in the following function call: '", + calling_function, "'" + )) + } + } + return(invisible(NULL)) +} + + +#' @title Check Length +#' +#' @description +#' Check whether variables all have the same length +#' @param ... The variables to check +#' @param one_allowed logical, allow arguments of length one that can be +#' recycled +#' +#' @return The function returns `NULL`, but throws an error if variable lengths +#' differ +#' +#' @keywords internal +check_equal_length <- function(..., + one_allowed = TRUE) { + vars <- list(...) + lengths <- sapply(vars, + FUN = function(x) { + length(x) + } + ) + + lengths <- unique(lengths) + + if (one_allowed) { + lengths <- lengths[lengths != 1] + } + + if (length(unique(lengths)) != 1) { + calling_function <- deparse1(sys.calls()[[sys.nframe() - 1]]) + stop(paste0( + "Arguments passed to the following function call: '", + calling_function, + "' should have the same length (or length one). Arguments have the following lengths: ", + paste0(lengths, collapse = ", ") + )) + } + return(invisible(NULL)) +} + + + +#' @title Check whether the desired metrics are available in scoringutils +#' +#' @description Helper function to check whether desired metrics are +#' available. If the input is `NULL`, all metrics will be returned. +#' +#' @param metrics character vector with desired metrics +#' +#' @return A character vector with metrics that can be used for downstream +#' computation +#' +#' @keywords internal + +check_metrics <- function(metrics) { + # use all available metrics if none are given + if (is.null(metrics)) { + metrics <- available_metrics() + } + + # check desired metrics are actually available in scoringutils + available_metrics <- available_metrics() + if (!all(metrics %in% available_metrics)) { + msg <- paste( + "The following metrics are not available:", + paste(setdiff(metrics, available_metrics), collapse = ", ") + ) + warning(msg) + } + return(metrics) +} diff --git a/R/interval_score.R b/R/interval_score.R index 077e3aae4..fee45772d 100644 --- a/R/interval_score.R +++ b/R/interval_score.R @@ -7,12 +7,21 @@ #' The score is computed as #' #' \deqn{ +#' \text{score} = (\text{upper} - \text{lower}) + \frac{2}{\alpha}(\text{lower} +#' - \text{true_value}) * +#' \mathbf{1}(\text{true_value} < \text{lower}) + +#' \frac{2}{\alpha}(\text{true_value} - \text{upper}) * +#' \mathbf{1}(\text{true_value} > \text{upper}) +#' }{ #' score = (upper - lower) + 2/alpha * (lower - true_value) * #' 1(true_values < lower) + 2/alpha * (true_value - upper) * #' 1(true_value > upper) #' } -#' where $1()$ is the indicator function and alpha is the decimal value that +#' where \eqn{\mathbf{1}()}{1()} is the indicator function and #' indicates how much is outside the prediction interval. +#' \eqn{\alpha}{alpha} is the decimal value that indicates how much is outside +#' the prediction interval. +#' #' To improve usability, the user is asked to provide an interval range in #' percentage terms, i.e. interval_range = 90 (percent) for a 90 percent #' prediction interval. Correspondingly, the user would have to provide the @@ -20,56 +29,63 @@ #' No specific distribution is assumed, #' but the range has to be symmetric (i.e you can't use the 0.1 quantile #' as the lower bound and the 0.7 quantile as the upper). 
+#' Non-symmetric quantiles can be scored using the function [quantile_score()]. #' -#' The interval score is a proper scoring rule that scores a quantile forecast -#' -#' @param true_values A vector with the true observed values of size n -#' @param lower vector of size n with the lower quantile of the given range -#' @param upper vector of size n with the upper quantile of the given range +#' @param lower vector of size n with the prediction for the lower quantile +#' of the given range +#' @param upper vector of size n with the prediction for the upper quantile +#' of the given range #' @param interval_range the range of the prediction intervals. i.e. if you're #' forecasting the 0.05 and 0.95 quantile, the interval_range would be 90. #' Can be either a single number or a vector of size n, if the range changes #' for different forecasts to be scored. This corresponds to (100-alpha)/100 #' in Gneiting and Raftery (2007). Internally, the range will be transformed #' to alpha. -#' @param weigh if TRUE, weigh the score by alpha / 4, so it can be averaged -#' into an interval score that, in the limit, corresponds to CRPS. Default: -#' `FALSE`. +#' @param weigh if TRUE, weigh the score by alpha / 2, so it can be averaged +#' into an interval score that, in the limit, corresponds to CRPS. Alpha is the +#' decimal value that represents how much is outside a central prediction +#' interval (e.g. for a 90 percent central prediction interval, alpha is 0.1) +#' Default: `TRUE`. #' @param separate_results if `TRUE` (default is `FALSE`), then the separate -#' parts of the interval score (sharpness, penalties for over- and +#' parts of the interval score (dispersion penalty, penalties for over- and #' under-prediction get returned as separate elements of a list). If you want a #' `data.frame` instead, simply call [as.data.frame()] on the output. #' @return vector with the scoring values, or a list with separate entries if #' `separate_results` is `TRUE`. +#' @inheritParams ae_median_sample #' @examples #' true_values <- rnorm(30, mean = 1:30) -#' interval_range = rep(90, 30) -#' alpha = (100 - interval_range) / 100 -#' lower = qnorm(alpha/2, rnorm(30, mean = 1:30)) -#' upper = qnorm((1- alpha/2), rnorm(30, mean = 1:30)) +#' interval_range <- rep(90, 30) +#' alpha <- (100 - interval_range) / 100 +#' lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) +#' upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30)) #' -#' interval_score(true_values = true_values, -#' lower = lower, -#' upper = upper, -#' interval_range = interval_range) +#' interval_score( +#' true_values = true_values, +#' lower = lower, +#' upper = upper, +#' interval_range = interval_range +#' ) #' #' # example with missing values and separate results -#' interval_score(true_values = c(true_values, NA), -#' lower = c(lower, NA), -#' upper = c(NA, upper), -#' separate_results = TRUE, -#' interval_range = 90) +#' interval_score( +#' true_values = c(true_values, NA), +#' lower = c(lower, NA), +#' upper = c(NA, upper), +#' separate_results = TRUE, +#' interval_range = 90 +#' ) #' @export +#' @keywords metric #' @references Strictly Proper Scoring Rules, Prediction,and Estimation, #' Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American #' Statistical Association, Volume 102, 2007 - Issue 477 #' #' Evaluating epidemic forecasts in an interval format, #' Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. 
Reich, -#' +#' # nolint #' - interval_score <- function(true_values, lower, upper, @@ -78,39 +94,118 @@ interval_score <- function(true_values, separate_results = FALSE) { # error handling - not sure how I can make this better - present <- c(methods::hasArg("true_values"), methods::hasArg("lower"), - methods::hasArg("upper"), methods::hasArg("interval_range")) + present <- c( + methods::hasArg("true_values"), methods::hasArg("lower"), + methods::hasArg("upper"), methods::hasArg("interval_range") + ) if (!all(present)) { - stop("need all arguments 'true_values', 'lower', 'upper' and 'interval_range' in function 'interval_score()'") + stop( + "need all arguments 'true_values', 'lower', 'upper' and 'interval_range' in function 'interval_score()'" # nolint + ) } - check_not_null(true_values = true_values, lower = lower, upper = upper, - interval_range = interval_range) + check_not_null( + true_values = true_values, lower = lower, upper = upper, + interval_range = interval_range + ) check_equal_length(true_values, lower, interval_range, upper) # calculate alpha from the interval range alpha <- (100 - interval_range) / 100 # calculate three components of WIS - sharpness <- (upper - lower) - overprediction <- 2/alpha * (lower - true_values) * (true_values < lower) - underprediction <- 2/alpha * (true_values - upper) * (true_values > upper) + dispersion <- (upper - lower) + overprediction <- 2 / alpha * (lower - true_values) * (true_values < lower) + underprediction <- 2 / alpha * (true_values - upper) * (true_values > upper) if (weigh) { - sharpness <- sharpness * alpha / 2 + dispersion <- dispersion * alpha / 2 underprediction <- underprediction * alpha / 2 overprediction <- overprediction * alpha / 2 } - score <- sharpness + underprediction + overprediction + score <- dispersion + underprediction + overprediction if (separate_results) { - return(list(interval_score = score, - sharpness = sharpness, - underprediction = underprediction, - overprediction = overprediction)) + return(list( + interval_score = score, + dispersion = dispersion, + underprediction = underprediction, + overprediction = overprediction + )) } else { return(score) } } +#' @title Quantile Score +#' +#' @description +#' Proper Scoring Rule to score quantile predictions. Smaller values are better. +#' The quantile score is +#' closely related to the Interval score (see [interval_score()]) and is +#' the quantile equivalent that works with single quantiles instead of +#' central prediction intervals. +#' +#' @param quantiles vector of size n with the quantile values of the +#' corresponding predictions. +#' @param weigh if TRUE, weigh the score by alpha / 2, so it can be averaged +#' into an interval score that, in the limit, corresponds to CRPS. Alpha is the +#' value that corresponds to the (alpha/2) or (1 - alpha/2) quantiles provided +#' and will be computed from the quantile. Alpha is the decimal value that +#' represents how much is outside a central prediction interval (E.g. for a +#' 90 percent central prediction interval, alpha is 0.1). Default: `TRUE`. 
+#' @return vector with the scoring values +#' @inheritParams interval_score +#' @inheritParams ae_median_sample +#' @examples +#' true_values <- rnorm(10, mean = 1:10) +#' alpha <- 0.5 +#' +#' lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) +#' upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) +#' +#' qs_lower <- quantile_score(true_values, +#' predictions = lower, +#' quantiles = alpha / 2 +#' ) +#' qs_upper <- quantile_score(true_values, +#' predictions = upper, +#' quantiles = 1 - alpha / 2 +#' ) +#' interval_score <- (qs_lower + qs_upper) / 2 +#' @export +#' @keywords metric +#' @references Strictly Proper Scoring Rules, Prediction,and Estimation, +#' Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American +#' Statistical Association, Volume 102, 2007 - Issue 477 +#' +#' Evaluating epidemic forecasts in an interval format, +#' Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich, +#' +# + +quantile_score <- function(true_values, + predictions, + quantiles, + weigh = TRUE) { + + # get central prediction interval which corresponds to given quantiles + central_interval <- abs(0.5 - quantiles) * 2 + alpha <- 1 - central_interval + # compute score - this is the version explained in the SI of Bracher et. al. + error <- abs(predictions - true_values) + score <- 2 * ifelse( + true_values <= predictions, 1 - quantiles, quantiles + ) * error + + # adapt score such that mean of unweighted quantile scores corresponds to + # unweighted interval score of the corresponding prediction interval + score <- 2 * score / alpha + + if (weigh) { + score <- score * alpha / 2 + } + + return(score) +} diff --git a/R/log_score.R b/R/log_score.R new file mode 100644 index 000000000..517623fb3 --- /dev/null +++ b/R/log_score.R @@ -0,0 +1,33 @@ +#' Log Score for Binary outcomes +#' +#' @description +#' Computes the Log Score for probabilistic forecasts of binary outcomes. +#' +#' @details +#' The Log Score is a proper score rule suited to assessing the accuracy of +#' probabilistic binary predictions. The outcomes can be either 0 or 1, +#' the predictions must be a probability that the true outcome will be 1. +#' +#' The Log Score is then computed as the negative logarithm of the probability +#' assigned to the true outcome. Reporting the negative logarithm means that +#' smaller values are better. +#' +#' @inheritParams brier_score +#' @return A numeric value with the Log Score, i.e. the mean squared +#' error of the given probability forecasts +#' @importFrom methods hasArg +#' @export +#' @keywords metric +#' +#' @examples +#' true_values <- sample(c(0, 1), size = 30, replace = TRUE) +#' predictions <- runif(n = 30, min = 0, max = 1) + +#' logs_binary(true_values, predictions) +logs_binary <- function(true_values, predictions) { + check_true_values(true_values, type = "binary") + check_predictions(predictions, true_values, type = "binary") + + logs <- -log(ifelse(true_values == 1, predictions, 1 - predictions)) + return(logs) +} diff --git a/R/metrics_point_forecasts.R b/R/metrics_point_forecasts.R index dd9e8561d..fe8e3cc23 100644 --- a/R/metrics_point_forecasts.R +++ b/R/metrics_point_forecasts.R @@ -1,26 +1,167 @@ -#' @title Mean Squared Error +#' @title Absolute Error of the Median (Sample-based Version) #' #' @description -#' Mean Squared Error MSE of point forecasts. 
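A quick numeric check for the binary log score added in R/log_score.R above: the score is -log(p) when the outcome is 1 and -log(1 - p) when it is 0. Minimal sketch, assuming this version of scoringutils and made-up probabilities.

library(scoringutils)

true_values <- c(1, 0, 1)
predictions <- c(0.9, 0.1, 0.2)

logs_binary(true_values, predictions)
# identical to scoring the probability assigned to the realised outcome
-log(c(0.9, 1 - 0.1, 0.2))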
-#' Calculated as +#' Absolute error of the median calculated as #' #' \deqn{ -#' mean((true_values - predicted_values)^2) +#' \text{abs}(\text{true_value} - \text{median_prediction}) +#' }{ +#' abs(true_value - median_prediction) #' } #' #' @param true_values A vector with the true observed values of size n -#' @param predictions A vector with predicted values of size n +#' @param predictions nxN matrix of predictive samples, n (number of rows) being +#' the number of data points and N (number of columns) the number of Monte +#' Carlo samples. Alternatively, predictions can just be a vector of size n. #' @return vector with the scoring values +#' @seealso [ae_median_quantile()], [abs_error()] +#' @importFrom stats median #' @examples #' true_values <- rnorm(30, mean = 1:30) #' predicted_values <- rnorm(30, mean = 1:30) -#' mse(true_values, predicted_values) +#' ae_median_sample(true_values, predicted_values) #' @export +#' @keywords metric + +ae_median_sample <- function(true_values, predictions) { + median_predictions <- apply(as.matrix(predictions), + MARGIN = 1, # rowwise + FUN = median + ) + + ae_median <- abs(true_values - median_predictions) -mse <- function(true_values, predictions) { - mse <- mean((true_values - predictions)^2) - return(mse) + return(ae_median) } +#' @title Squared Error of the Mean (Sample-based Version) +#' +#' @description +#' Squared error of the mean calculated as +#' +#' \deqn{ +#' \text{mean}(\text{true_value} - \text{prediction})^2 +#' }{ +#' mean(true_value - mean_prediction)^2 +#' } +#' +#' @param true_values A vector with the true observed values of size n +#' @param predictions nxN matrix of predictive samples, n (number of rows) being +#' the number of data points and N (number of columns) the number of Monte +#' Carlo samples. Alternatively, predictions can just be a vector of size n. +#' @return vector with the scoring values +#' @seealso [squared_error()] +#' @examples +#' true_values <- rnorm(30, mean = 1:30) +#' predicted_values <- rnorm(30, mean = 1:30) +#' se_mean_sample(true_values, predicted_values) +#' @export +#' @keywords metric +se_mean_sample <- function(true_values, predictions) { + mean_predictions <- rowMeans(as.matrix(predictions)) + se_mean <- (true_values - mean_predictions)^2 + + return(se_mean) +} + + +#' @title Absolute Error of the Median (Quantile-based Version) +#' +#' @description +#' Absolute error of the median calculated as +#' +#' \deqn{ +#' \text{abs}(\text{true_value} - \text{prediction}) +#' }{ +#' abs(true_value - median_prediction) +#' } +#' +#' The function was created for internal use within [score()], but can also +#' used as a standalone function. +#' +#' @param predictions numeric vector with predictions, corresponding to the +#' quantiles in a second vector, `quantiles`. +#' @param quantiles numeric vector that denotes the quantile for the values +#' in `predictions`. Only those predictions where `quantiles == 0.5` will +#' be kept. 
If `quantiles` is `NULL`, then all `predictions` and +#' `true_values` will be used (this is then the same as [abs_error()]) +#' @return vector with the scoring values +#' @seealso [ae_median_sample()], [abs_error()] +#' @importFrom stats median +#' @inheritParams ae_median_sample +#' @examples +#' true_values <- rnorm(30, mean = 1:30) +#' predicted_values <- rnorm(30, mean = 1:30) +#' ae_median_quantile(true_values, predicted_values, quantiles = 0.5) +#' @export +#' @keywords metric + +ae_median_quantile <- function(true_values, predictions, quantiles = NULL) { + if (!is.null(quantiles)) { + if (!any(quantiles == 0.5) && !any(is.na(quantiles))) { + return(NA_real_) + warning( + "in order to compute the absolute error of the median, `0.5` must be among the quantiles given. Maybe you want to use `abs_error()`?" # nolint + ) + } + true_values <- true_values[quantiles == 0.5] + predictions <- predictions[quantiles == 0.5] + } + abs_error_median <- abs(true_values - predictions) + return(abs_error_median) +} + + + + +#' @title Absolute Error +#' +#' @description +#' Calculate absolute error as +#' +#' \deqn{ +#' \text{abs}(\text{true_value} - \text{median_prediction}) +#' }{ +#' abs(true_value - prediction) +#' } +#' +#' @return vector with the absolute error +#' @inheritParams ae_median_quantile +#' @seealso [ae_median_sample()], [ae_median_quantile()] +#' @examples +#' true_values <- rnorm(30, mean = 1:30) +#' predicted_values <- rnorm(30, mean = 1:30) +#' abs_error(true_values, predicted_values) +#' @export +#' @keywords metric + +abs_error <- function(true_values, predictions) { + return(abs(true_values - predictions)) +} + + +#' @title Squared Error +#' +#' @description +#' Squared Error SE calculated as +#' +#' \deqn{ +#' (true_values - predicted_values)^2 +#' } +#' +#' @param predictions A vector with predicted values of size n +#' @return vector with the scoring values +#' @inheritParams ae_median_sample +#' @export +#' @keywords metric +#' @examples +#' true_values <- rnorm(30, mean = 1:30) +#' predicted_values <- rnorm(30, mean = 1:30) +#' squared_error(true_values, predicted_values) + +squared_error <- function(true_values, predictions) { + se <- (true_values - predictions)^2 + return(se) +} diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index 7d4d12382..aa5d46a04 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -9,173 +9,113 @@ #' `permutationTest` from the `surveillance` package by Michael Höhle, #' Andrea Riebler and Michaela Paul. #' -#' @param scores A data.frame of unsummarised scores as produced by -#' [eval_forecasts()] -#' @param metric A character vector of length one with the metric to do -#' the comparison on. -#' @param by character vector of columns to group scoring by. This should be the -#' lowest level of grouping possible, i.e. the unit of the individual -#' observation. This is important as many functions work on individual -#' observations. If you want a different level of aggregation, you should use -#' `summarise_by` to aggregate the individual scores. -#' Also not that the pit will be computed using `summarise_by` instead of `by` -#' @param summarise_by character vector of columns to group the summary by. By -#' default, this is equal to `by` and no summary takes place. But sometimes you -#' may want to to summarise over categories different from the scoring. -#' `summarise_by` is also the grouping level used to compute (and possibly plot) -#' the probability integral transform(pit). 
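The point-forecast metrics introduced in R/metrics_point_forecasts.R above are closely related. The sketch below (made-up numbers, this package version assumed) shows that ae_median_quantile() with quantile 0.5 reduces to abs_error(), and that squared_error() is simply the squared difference.

library(scoringutils)

true_values <- c(3, 7, 12)
median_preds <- c(2.5, 8, 10)  # predictions for the 0.5 quantile

all.equal(
  abs_error(true_values, median_preds),
  ae_median_quantile(true_values, median_preds, quantiles = rep(0.5, 3))
)

squared_error(true_values, median_preds)  # (3 - 2.5)^2, (7 - 8)^2, (12 - 10)^2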
-#' @param test_options list with options to pass down to [compare_two_models()]. -#' To change only one of the default options, just pass a list as input with -#' the name of the argument you want to change. All elements not included in the -#' list will be set to the default (so passing an empty list would result in the -#' default options). +#' @param scores A data.table of scores as produced by [score()]. +#' @param metric A character vector of length one with the metric to do the +#' comparison on. The default is "auto", meaning that either "interval_score", +#' "crps", or "brier_score" will be selected where available. +#' See [available_metrics()] for available metrics. +#' @param by character vector with names of columns present in the input +#' data.frame. `by` determines how pairwise comparisons will be computed. +#' You will get a relative skill score for every grouping level determined in +#' `by`. If, for example, `by = c("model", "location")`. Then you will get a +#' separate relative skill score for every model in every location. Internally, +#' the data.frame will be split according `by` (but removing "model" before +#' splitting) and the pairwise comparisons will be computed separately for the +#' split data.frames. #' @param baseline character vector of length one that denotes the baseline #' model against which to compare other models. +#' @param ... additional arguments, such as test options that can get passed +#' down to lower level functions. The following options are available: +#' `one_sided` (Boolean, default is `FALSE`, whether two conduct a one-sided +#' instead of a two-sided test), `test_type` (character, either "non_parametric" +#' or "permutation" determining which kind of test shall be conducted to +#' determine p-values. Default is "non-parametric), `n_permutations` (number of +#' permutations for a permutation test. Default is 999). See +#' [compare_two_models()] for more information. 
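As a usage sketch for the test options now passed through `...` (in place of the removed `test_options` list), the call below requests a permutation test. It reuses the toy data frame from the @examples block further down and assumes this version of scoringutils.

library(scoringutils)

df <- data.frame(
  model = rep(c("model1", "model2", "model3"), each = 10),
  date = as.Date("2020-01-01") + rep(1:5, each = 2),
  location = c(1, 2),
  interval_score = abs(rnorm(30))
)

pairwise_comparison(
  df,
  by = "model",
  metric = "interval_score",
  baseline = "model1",
  test_type = "permutation",
  n_permutations = 999
)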
#' @return A ggplot2 object with a coloured table of summarised scores #' @importFrom data.table as.data.table data.table setnames copy #' @importFrom stats sd rbinom wilcox.test p.adjust #' @importFrom utils combn #' @export -#' @author Johannes Bracher, https://jbracher.github.io/ -#' @author Nikos Bosse +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @author Johannes Bracher, \email{johannes.bracher@@kit.edu} +#' @keywords scoring #' @examples -#' df <- data.frame(model = rep(c("model1", "model2", "model3"), each = 10), -#' date = as.Date("2020-01-01") + rep(1:5, each = 2), -#' location = c(1, 2), -#' interval_score = (abs(rnorm(30))), -#' aem = (abs(rnorm(30)))) +#' df <- data.frame( +#' model = rep(c("model1", "model2", "model3"), each = 10), +#' date = as.Date("2020-01-01") + rep(1:5, each = 2), +#' location = c(1, 2), +#' interval_score = (abs(rnorm(30))), +#' ae_median = (abs(rnorm(30))) +#' ) #' -#' res <- scoringutils::pairwise_comparison(df, -#' baseline = "model1") -#' scoringutils::plot_pairwise_comparison(res) +#' res <- pairwise_comparison(df, +#' baseline = "model1" +#' ) +#' plot_pairwise_comparison(res) #' -#' eval <- scoringutils::eval_forecasts(scoringutils::quantile_example_data) -#' pairwise <- pairwise_comparison(eval, summarise_by = c("model")) -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @author Johannes Bracher, \email{johannes.bracher@@kit.edu} +#' eval <- score(example_quantile) +#' pairwise_comparison(eval, by = c("model")) pairwise_comparison <- function(scores, - metric = "interval_score", # maybe the default can happen automatically, - test_options = list(oneSided = FALSE, - test_type = c("non_parametric", "permuation"), - n_permutations = 999), + by = c("model"), + metric = "auto", baseline = NULL, - by = NULL, - summarise_by = c("model")) { - + ...) { scores <- data.table::as.data.table(scores) - # update test options - test_options <- update_list(defaults = list(oneSided = FALSE, - test_type = c("non_parametric", "permuation"), - n_permutations = 999), - optional = test_options) - - # identify unit of single observation if it is not given. - # usually, by = NULL should be fine and only needs to be specified if there - # are additional columns that are not metrics and not related to the unit of observation - if (is.null(by)) { - all_metrics <- available_metrics() - by <- setdiff(names(scores), c(all_metrics, "model")) + # determine metric automatically + if (metric == "auto") { + metric <- infer_rel_skill_metric(scores) } - split_by <- setdiff(summarise_by, "model") - - split_scores <- split(scores, by = split_by) - - - results <- lapply(split_scores, - FUN = function(scores) { - out <- pairwise_comparison_one_group(scores = scores, - metric = metric, - test_options = test_options, - baseline = baseline, - by = by, - summarise_by = summarise_by) - }) - - out <- data.table::rbindlist(results) -} - + # check that all values of the chosen metric are positive + if (any(sign(scores[[metric]]) < 0)) { + if (any(sign(scores) > 0)) { + msg <- paste("To compute pairwise comparisons, all values of", metric, + "must have the same sign.") + stop(msg) + } + } -#' @title Add relative skill to eval_forecasts() -#' -#' @description -#' -#' This function will only be called within [eval_forecasts()] and serves to -#' make pairwise comparisons from within that function. It uses the -#' `summarise_by` argument as well as the data from [eval_forecasts()]. 
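The default `metric = "auto"` defers to infer_rel_skill_metric() (defined later in this file), which picks "interval_score", then "crps", then "brier_score", whichever is present in the scores. A minimal sketch using the packaged example data, assuming this version of scoringutils:

library(scoringutils)

scores <- score(example_quantile)

# the two calls are equivalent: "interval_score" is present and gets inferred
pairwise_comparison(scores, by = "model")
pairwise_comparison(scores, by = "model", metric = "interval_score")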
-#' Essentially, it wraps [pairwise_comparison()] and deals with the specifics -#' necessary to work with [eval_forecasts()]. -#' @inheritParams eval_forecasts -#' @param unsummarised_scores unsummarised scores to be passed from -#' [eval_forecasts()] -#' -#' @keywords internal + # identify unit of single observation. + forecast_unit <- get_forecast_unit(scores) -add_rel_skill_to_eval_forecasts <- function(unsummarised_scores, - rel_skill_metric, - baseline, - by, - summarise_by, - verbose) { - - # infer the correct relative skill if only "auto" is given - if (rel_skill_metric == "auto") { - if ("interval_score" %in% colnames(unsummarised_scores)) { - rel_skill_metric <- "interval_score" - } else if ("crps" %in% colnames(unsummarised_scores)) { - rel_skill_metric <- "crps" - } else if ("brier_score" %in% colnames(unsummarised_scores)) { - rel_skill_metric <- "brier_score" - } else { - stop("automatically assign a metric to add relative skill failed. Please provide a metric.") - } + # if by is equal to forecast_unit, then pairwise comparisons don't make sense + if (identical(sort(by), sort(forecast_unit))) { + by <- "model" + message("relative skill can only be computed if `by` is different from the unit of a single forecast. `by` was set to 'model'") } - # summarise scores over all quantiles, ranges or samples in order to not - # include them in the calculation of relative scores - scores <- unsummarised_scores[, lapply(.SD, mean, na.rm = TRUE), - by = c(by), - .SDcols = colnames(unsummarised_scores) %in% c(rel_skill_metric)] + # summarise scores over everything (e.g. quantiles, ranges or samples) in + # order to not to include those in the calculation of relative scores. Also + # gets rid of all unnecessary columns and keep only metric and forecast unit + scores <- scores[, lapply(.SD, mean, na.rm = TRUE), + by = forecast_unit, + .SDcols = metric + ] - # remove range and quantile from summarise_by if they are present - summarise_by <- setdiff(summarise_by, c("range", "quantile", "sample")) + # split data set into groups determined by 'by' + split_by <- setdiff(by, "model") + split_scores <- split(scores, by = split_by) - # if summarise_by is equal to by, then pairwise comparisons don't make sense - if (identical(sort(summarise_by), sort(by))) { - summarise_by <- "model" - if (verbose) { - message("relative skill can only be computed if `summarise_by` is different from `by`. `summarise_by` was set to 'model'") + results <- lapply(split_scores, + FUN = function(scores) { + out <- pairwise_comparison_one_group( + scores = scores, + metric = metric, + baseline = baseline, + by = by, + ... 
+ ) } - } + ) - # do pairwise comparison - pairwise <- pairwise_comparison(scores = scores, - metric = rel_skill_metric, - baseline = baseline, - by = by, - summarise_by = summarise_by) - - # delete unnecessary columns from the output - cols_to_delete <- setdiff(colnames(pairwise), - unique(c(summarise_by, "model", "relative_skill", "scaled_rel_skill"))) - if (length(cols_to_delete > 1)) { - pairwise[, eval(cols_to_delete) := NULL] - } - pairwise <- unique(pairwise) - out <- merge(scores, pairwise, all.x = TRUE, - by = unique(c("model", summarise_by))) - - # also delete skill metric from output - out[, eval(rel_skill_metric) := NULL] + out <- data.table::rbindlist(results) return(out[]) } - - - #' @title Do Pairwise Comparison for one Set of Forecasts #' #' @description @@ -189,16 +129,13 @@ add_rel_skill_to_eval_forecasts <- function(unsummarised_scores, #' actually do the comparison between two models over a subset of common #' forecasts it calls [compare_two_models()]. #' @inheritParams pairwise_comparison +#' @keywords internal pairwise_comparison_one_group <- function(scores, metric, - test_options, baseline, by, - summarise_by) { - - - + ...) { if (!("model" %in% names(scores))) { stop("pairwise compairons require a column called 'model'") } @@ -211,31 +148,24 @@ pairwise_comparison_one_group <- function(scores, return(NULL) } - # the overlap is obtained by merging the available data for one model with - # the avaialble data from the other model. - # for efficiency when merging, remove everything that is not in c(by, var) - cols_to_remove <- setdiff(names(scores), c(by, "model", metric)) - if (length(cols_to_remove > 0)) { - scores[, eval(cols_to_remove) := NULL] - scores <- unique(scores) - } - # create a data.frame with results # we only need to do the calculation once, because for the ratio that - # should just be the inverse and for the permuation the result should + # should just be the inverse and for the permutation the result should # be the same # set up initial data.frame with all possible pairwise comparisons combinations <- data.table::as.data.table(t(combn(models, m = 2))) colnames(combinations) <- c("model", "compare_against") - combinations[, c("ratio", "pval") := compare_two_models(scores = scores, - name_model1 = model, - name_model2 = compare_against, - metric = metric, - test_options = test_options, - by = by), - by = seq_len(NROW(combinations))] + combinations[, c("ratio", "pval") := compare_two_models( + scores = scores, + name_model1 = model, + name_model2 = compare_against, + metric = metric, + ... 
+ ), + by = seq_len(NROW(combinations)) + ] combinations <- combinations[order(ratio)] combinations[, adj_pval := p.adjust(pval)] @@ -243,26 +173,33 @@ pairwise_comparison_one_group <- function(scores, # mirror computations combinations_mirrored <- data.table::copy(combinations) data.table::setnames(combinations_mirrored, - old = c("model", "compare_against"), - new = c("compare_against", "model")) + old = c("model", "compare_against"), + new = c("compare_against", "model") + ) combinations_mirrored[, ratio := 1 / ratio] # add a one for those that are the same - combinations_equal <- data.table::data.table(model = models, - compare_against = models, - ratio = 1, - pval = 1, - adj_pval = 1) - - result <- data.table::rbindlist(list(combinations, - combinations_mirrored, - combinations_equal), - use.names = TRUE) + combinations_equal <- data.table::data.table( + model = models, + compare_against = models, + ratio = 1, + pval = 1, + adj_pval = 1 + ) + + result <- data.table::rbindlist(list( + combinations, + combinations_mirrored, + combinations_equal + ), + use.names = TRUE + ) # make result character instead of factor - result[, `:=`("model" = as.character(model), - "compare_against" = as.character(compare_against))] - + result[, `:=`( + "model" = as.character(model), + "compare_against" = as.character(compare_against) + )] # calculate relative skill as geometric mean # small theta is again better. If a baseline is given, exclude it @@ -271,9 +208,12 @@ pairwise_comparison_one_group <- function(scores, if (!is.null(baseline)) { result_without_baseline <- data.table::copy(result) # filter out all ratios where compare_against is the baseline - result_without_baseline <- result_without_baseline[compare_against != baseline, ] - result_without_baseline[, `:=` (theta = geom_mean_helper(ratio)), - by = "model"] + result_without_baseline <- result_without_baseline[ + compare_against != baseline + ] + result_without_baseline[, `:=`(theta = geom_mean_helper(ratio)), + by = "model" + ] # merge back to retain the ratios even for comparisons with the baseline result <- merge(result, result_without_baseline, all.x = TRUE) # avoid mixture of NA and NaN which can cause problems downstream @@ -281,18 +221,21 @@ pairwise_comparison_one_group <- function(scores, # remove NAs form merge in the thetas result[, theta := unique(na.omit(theta)), by = "model"] } else { - result[, `:=` (theta = geom_mean_helper(ratio), - rel_to_baseline = NA_real_), - by = "model"] + result[, `:=`( + theta = geom_mean_helper(ratio), + rel_to_baseline = NA_real_ + ), + by = "model" + ] } - if(!is.null(baseline)) { + if (!is.null(baseline)) { baseline_theta <- unique(result[model == baseline, ]$theta) result[, rel_to_baseline := theta / baseline_theta] } - # remove all the rows that are not present in summarise_by before merging - cols_to_keep <- unique(c(summarise_by, "model")) + # remove all the rows that are not present in by before merging + cols_to_keep <- unique(c(by, "model")) cols_to_remove <- colnames(scores)[!(colnames(scores) %in% cols_to_keep)] scores[, eval(cols_to_remove) := NULL] scores <- unique(scores) @@ -300,18 +243,14 @@ pairwise_comparison_one_group <- function(scores, out <- merge(scores, result, by = "model", all = TRUE) # rename ratio to mean_scores_ratio - data.table::setnames(out, old = c("ratio", "theta", "rel_to_baseline"), - new = c("mean_scores_ratio", "relative_skill", "scaled_rel_skill")) + data.table::setnames(out, + old = c("ratio", "theta", "rel_to_baseline"), + new = c("mean_scores_ratio", 
"relative_skill", "scaled_rel_skill") + ) return(out[]) } - - - - - - #' @title Compare Two Models Based on Subset of Common Forecasts #' #' @description @@ -328,31 +267,42 @@ pairwise_comparison_one_group <- function(scores, #' @inheritParams pairwise_comparison #' @param name_model1 character, name of the first model #' @param name_model2 character, name of the model to compare against +#' @param one_sided Boolean, default is `FALSE`, whether two conduct a one-sided +#' instead of a two-sided test to determine significance in a pairwise +#' comparison. +#' @param test_type character, either "non_parametric" (the default) or +#' "permutation". This determines which kind of test shall be conducted to +#' determine p-values. +#' @param n_permutations numeric, the number of permutations for a +#' permutation test. Default is 999. #' @author Johannes Bracher, \email{johannes.bracher@@kit.edu} #' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @keywords internal compare_two_models <- function(scores, name_model1, name_model2, metric, - test_options, - by) { - + one_sided = FALSE, + test_type = c("non_parametric", "permutation"), + n_permutations = 999) { scores <- data.table::as.data.table(scores) + forecast_unit <- get_forecast_unit(scores) + if (!("model" %in% names(scores))) { stop("pairwise comparisons require a column called 'model'") } # select only columns in c(by, var) - a <- scores[model == name_model1, ] - b <- scores[model == name_model2, ] + a <- scores[model == name_model1] + b <- scores[model == name_model2] # remove "model" from 'by' before merging - by <- setdiff(by, "model") + merge_by <- setdiff(forecast_unit, "model") - overlap <- merge(a, b, by = by, allow.cartesian = TRUE) -unique(overlap) + overlap <- merge(a, b, by = merge_by, allow.cartesian = TRUE) + unique(overlap) if (nrow(overlap) == 0) { return(list(ratio = NA_real_, pval = NA_real_)) @@ -370,290 +320,47 @@ unique(overlap) # test whether the ratio is significantly different from one # equivalently, one can test whether the difference between the two values # is significantly different from zero. - if (test_options$test_type[1] == "permutation") { + test_type <- match.arg(test_type) + if (test_type == "permutation") { # adapted from the surveillance package pval <- permutation_test(values_x, values_y, - nPermutation = test_options$n_permutations, - oneSided = test_options$oneSided, - comparison_mode = "difference") + n_permutation = n_permutations, + one_sided = one_sided, + comparison_mode = "difference" + ) } else { # this probably needs some more thought # alternative: do a paired t-test on ranks? pval <- wilcox.test(values_x, values_y, paired = TRUE)$p.value } - return(list(mean_scores_ratio = ratio, - pval = pval)) + return(list( + mean_scores_ratio = ratio, + pval = pval + )) } - - - - - - - - - - - - - - - - - - -#' @title Plot Heatmap of Pairwise Comparisons +#' @title Infer metric for pairwise comparisons #' #' @description -#' Creates a heatmap of the ratios or pvalues from a pairwise comparison -#' between models -#' -#' @param comparison_result A data.frame as produced by -#' [pairwise_comparison()] -#' @param type character vector of length one that is either "mean_scores_ratio" or "pval". -#' This denotes whether to visualise the ratio or the p-value of the -#' pairwise comparison. 
Default is "mean_scores_ratio" -#' @param smaller_is_good logical (default is `TRUE`) that indicates whether -#' smaller or larger values are to be interpreted as 'good' (as you could just -#' invert the mean scores ratio) -#' @param facet_formula facetting formula passed down to ggplot. Default is -#' `NULL` -#' @param scales scales argument that gets passed down to ggplot. Only necessary -#' if you make use of facetting. Default is "free_y" -#' @param facet_wrap_or_grid Use ggplot2's `facet_wrap` or -#' `facet_grid`? Anything other than "facet_wrap" will be interpreted as -#' `facet_grid`. This only takes effect if `facet_formula` is not -#' `NULL` -#' @param ncol Number of columns for facet wrap. Only relevant if -#' `facet_formula` is given and `facet_wrap_or_grid == "facet_wrap"` -#' @importFrom ggplot2 ggplot aes geom_tile geom_text labs coord_cartesian -#' scale_fill_gradient2 theme_light element_text -#' @importFrom data.table as.data.table setnames rbindlist -#' @importFrom stats reorder -#' @importFrom ggplot2 labs coord_cartesian facet_wrap facet_grid theme -#' element_text element_blank -#' @export -#' -#' @examples -#' df <- data.frame(model = rep(c("model1", "model2", "model3"), each = 10), -#' id = rep(1:10), -#' interval_score = abs(rnorm(30, mean = rep(c(1, 1.3, 2), each = 10))), -#' aem = (abs(rnorm(30)))) -#' -#' data <- scoringutils::quantile_example_data -#' scores <- scoringutils::eval_forecasts(data) -#' pairwise <- pairwise_comparison(scores, -#' summarise_by = "value_desc") -#' scoringutils::plot_pairwise_comparison(pairwise, -#' facet_formula = ~ value_desc, -#' scales = "fixed") - - -plot_pairwise_comparison <- function(comparison_result, - type = c("mean_scores_ratio", "pval", "together"), - smaller_is_good = TRUE, - facet_formula = NULL, - scales = "free_y", - ncol = NULL, - facet_wrap_or_grid = "facet_wrap") { - - comparison_result <- data.table::as.data.table(comparison_result) - - comparison_result[, model := reorder(model, -relative_skill)] - levels <- levels(comparison_result$model) - - - get_fill_scale <- function(values, breaks, plot_scales) { - values[is.na(values)] <- 1 # this would be either ratio = 1 or pval = 1 - scale <- cut(values, breaks = breaks, - include.lowest = TRUE, - right = FALSE, - labels = plot_scales) - # scale[is.na(scale)] <- 0 - return(as.numeric(as.character(scale))) - } - - if (type[1] == "together") { - # obtain only the upper triangle of the comparison - # that is used for showing ratios - # need to change the order if larger is good - if (smaller_is_good) { - unique_comb <- as.data.frame(t(combn(rev(levels), 2))) - } else { - unique_comb <- as.data.frame(t(combn((levels), 2))) - } - - colnames(unique_comb) <- c("model", "compare_against") - upper_triangle <- merge(comparison_result, unique_comb) - - # change levels for plotting order - upper_triangle[, `:=` (model = factor(model, levels), - compare_against = factor(compare_against, levels))] - - # reverse y and x if larger is better - if (!smaller_is_good) { - data.table::setnames(upper_triangle, - c("model", "compare_against"), - c("compare_against", "model")) - } - - # modify upper triangle ------------------------------------------------------ - # add columns where a model is compared with itself. 
make adj_pval NA - # to plot it as grey later on - equal <- data.table::data.table(model = levels, - compare_against = levels, - mean_scores_ratio = 1, - pval = NA, - adj_pval = NA) - upper_triangle_complete <- data.table::rbindlist(list(upper_triangle, - equal), fill = TRUE) - - # define interest variable - upper_triangle_complete[, var_of_interest := round(mean_scores_ratio, 2)] - - # implemnt breaks for colour heatmap - breaks <- c(0, 0.1, 0.5, 0.75, 1, 1.33, 2, 10, Inf) - plot_scales <- c(-1, -0.5, -0.25, 0, 0, 0.25, 0.5, 1) - if (!smaller_is_good) { - plot_scales <- rev(plot_scales) - } - upper_triangle_complete[, fill_col := get_fill_scale(var_of_interest, - breaks, plot_scales)] - - # create mean_scores_ratios in plot - plot <- ggplot2::ggplot(upper_triangle_complete, - ggplot2::aes(x = compare_against, - y = model, - fill = fill_col)) + - ggplot2::geom_tile(width = 0.98, height = 0.98) + - ggplot2::geom_text(ggplot2::aes(label = var_of_interest), - na.rm = TRUE) + - ggplot2::scale_fill_gradient2(low = "skyblue", mid = "grey95", - high = "brown1", - na.value = "lightgrey", - midpoint = 0, - limits = c(-1,1), - name = NULL) + - ggplot2::theme_light() + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, - hjust=1, color = "brown4"), - axis.text.y = ggplot2::element_text(color = "steelblue4"), - panel.grid.major = ggplot2::element_blank(), - panel.grid.minor = ggplot2::element_blank(), - # panel.background = ggplot2::element_rect(fill = "grey90"), - # axis.line.y = ggplot2::element_line(color = "steelblue4", size = 4), - # axis.line.x = ggplot2::element_line(color = "brown3", size = 4), - legend.position = "none") + - ggplot2::labs(x = "", y = "", - title = "Pairwise comparisons - mean_scores_ratio (upper) and pval (lower)") + - ggplot2::coord_cartesian(expand = FALSE) - - # add pvalues to plot -------------------------------------------------------- - # obtain lower triangle for the pvalues - lower_triangle <- data.table::copy(upper_triangle) - data.table::setnames(lower_triangle, - c("model", "compare_against"), - c("compare_against", "model")) - - lower_triangle[, var_of_interest := round(adj_pval, 3)] - # implemnt breaks for colour heatmap - breaks <- c(0, 0.01, 0.05, 0.1, 1) - plot_scales <- c(0.8, 0.5, 0.1, 0.000001) - lower_triangle[, fill_col := get_fill_scale(var_of_interest, - breaks, plot_scales)] - - fill_rule <- ifelse(lower_triangle$fill_col == 0.000001, "grey95", "palegreen3") - lower_triangle[, var_of_interest := as.character(var_of_interest)] - lower_triangle[, var_of_interest := ifelse(var_of_interest == "0", - "< 0.001", var_of_interest)] - - plot <- plot + - ggplot2::geom_tile(data = lower_triangle, - ggplot2::aes(alpha = fill_col), - fill = fill_rule, - color = "white", - width = 0.97, height = 0.97) + - ggplot2::geom_text(data = lower_triangle, - ggplot2::aes(label = var_of_interest), - na.rm = TRUE) - - } else if (type[1] == "mean_scores_ratio") { - comparison_result[, var_of_interest := round(mean_scores_ratio, 2)] - - # implemnt breaks for colour heatmap - breaks <- c(0, 0.1, 0.5, 0.75, 1, 1.33, 2, 10, Inf) - plot_scales <- c(-1, -0.5, -0.25, 0, 0, 0.25, 0.5, 1) - comparison_result[, fill_col := get_fill_scale(var_of_interest, - breaks, plot_scales)] - - high_col = "brown1" +#' Helper function to infer the metric for which pairwise comparisons shall +#' be made. The function simply checks the names of the available columns and +#' chooses the most widely used metric. 
+#' @inheritParams pairwise_comparison +#' @keywords internal +infer_rel_skill_metric <- function(scores) { + if ("interval_score" %in% colnames(scores)) { + rel_skill_metric <- "interval_score" + } else if ("crps" %in% colnames(scores)) { + rel_skill_metric <- "crps" + } else if ("brier_score" %in% colnames(scores)) { + rel_skill_metric <- "brier_score" } else { - comparison_result[, var_of_interest := round(pval, 3)] - # implemnt breaks for colour heatmap - breaks <- c(0, 0.01, 0.05, 0.1, 1) - plot_scales <- c(1, 0.5, 0.1, 0) - comparison_result[, fill_col := get_fill_scale(var_of_interest, - breaks, plot_scales)] - - high_col = "palegreen3" - comparison_result[, var_of_interest := as.character(var_of_interest)] - comparison_result[, var_of_interest := ifelse(var_of_interest == "0", - "< 0.001", var_of_interest)] - } - - plot <- ggplot2::ggplot(comparison_result, - ggplot2::aes(y = reorder(model, 1 / mean_scores_ratio, FUN = geom_mean_helper), - x = reorder(compare_against, mean_scores_ratio, FUN = geom_mean_helper), - fill = fill_col)) + - ggplot2::geom_tile(color = "white", - width = 0.97, height = 0.97) + - ggplot2::geom_text(ggplot2::aes(label = var_of_interest), - na.rm = TRUE) + - ggplot2::scale_fill_gradient2(low = "skyblue", mid = "grey95", - high = high_col, - na.value = "lightgrey", - midpoint = 0, - limits = c(-1,1), - name = NULL) + - ggplot2::theme_light() + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, - hjust=1), - legend.position = "none") + - ggplot2::labs(x = "", y = "", - title = "Pairwise comparisons - p-value whether mean scores ratio equal to 1") + - ggplot2::coord_cartesian(expand = FALSE) - - if (type[1] == "mean_scores_ratio") { - plot <- plot + - ggplot2::theme(panel.grid.major = ggplot2::element_blank(), - panel.grid.minor = ggplot2::element_blank(), - axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, - hjust=1, color = "brown4"), - axis.text.y = ggplot2::element_text(color = "steelblue4")) + - ggplot2::ggtitle("Pairwise comparisons - ratio of mean scores (for overlapping forecast sets)") + stop( + "automatically assigning a metric to compute relative skills on failed. ", + "Please provide a metric." + ) } - if (!is.null(facet_formula)) { - if (facet_wrap_or_grid == "facet_wrap") { - plot <- plot + - ggplot2::facet_wrap(facet_formula, ncol = ncol, - scales = scales) - } else { - plot <- plot + - ggplot2::facet_grid(facet_formula, scales = scales) - } - } - - return(plot) + return(rel_skill_metric) } - - - - - - - - - diff --git a/R/pit.R b/R/pit.R index 8b8b7dd70..32988c84a 100644 --- a/R/pit.R +++ b/R/pit.R @@ -1,4 +1,4 @@ -#' @title Probability Integral Transformation +#' @title Probability Integral Transformation (sample-based version) #' #' @description Uses a Probability Integral Transformation (PIT) (or a #' randomised PIT for integer forecasts) to @@ -19,12 +19,12 @@ #' u_t = F_t (x_t) #' } #' -#' where \eqn{x_t} is the observed data point at time \eqn{t in t_1, …, t_n}, -#' n being the number of forecasts, and $F_t$ is the (continuous) predictive -#' cumulative probability distribution at time t. If the true probability -#' distribution of outcomes at time t is \eqn{G_t} then the forecasts eqn{F_t} are -#' said to be ideal if eqn{F_t = G_t} at all times t. In that case, the -#' probabilities ut are distributed uniformly. 
+#' where \eqn{x_t} is the observed data point at time \eqn{t in t_1, …, t_n}{t +#' \text{ in } t_1, …, t_n}, n being the number of forecasts, and \eqn{F_t} is +#' the (continuous) predictive cumulative probability distribution at time t. If +#' the true probability distribution of outcomes at time t is \eqn{G_t} then the +#' forecasts \eqn{F_t} are said to be ideal if \eqn{F_t = G_t} at all times t. +#' In that case, the probabilities \eqn{u_t} are distributed uniformly. #' #' In the case of discrete outcomes such as incidence counts, #' the PIT is no longer uniform even when forecasts are ideal. @@ -35,7 +35,7 @@ #' #' where \eqn{k_t} is the observed count, \eqn{P_t(x)} is the predictive #' cumulative probability of observing incidence k at time t, -#' eqn{P_t (-1) = 0} by definition and v is standard uniform and independent +#' \eqn{P_t (-1) = 0} by definition and v is standard uniform and independent #' of k. If \eqn{P_t} is the true cumulative #' probability distribution, then \eqn{u_t} is standard uniform. #' @@ -52,86 +52,54 @@ #' In this context it should be noted, though, that uniformity of the #' PIT is a necessary but not sufficient condition of calibration. #' -#' @param true_values A vector with the true observed values of size n -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the -#' number of Monte Carlo samples -#' @param plot logical. If `TRUE`, a histogram of the PIT values will be -#' returned as well -#' @param num_bins the number of bins in the PIT histogram (if `plot = TRUE`) -#' If not given, the square root of n will be used -#' @param n_replicates the number of tests to perform, -#' each time re-randomising the PIT -#' @param full_output return all individual p_values and computed u_t values -#' for the randomised PIT. Usually not needed. -#' @param verbose if `TRUE` (default is `FALSE`) more error messages are printed. -#' Usually, this should not be needed, but may help with debugging. -#' @return a list with the following components: -#' \itemize{ -#' \item `p_value`: p-value of the Anderson-Darling test on the -#' PIT values. In case of integer forecasts, this will be the mean p_value -#' from the `n_replicates` replicates -#' \item `sd`: standard deviation of the p_value returned. In case of -#' continuous forecasts, this will be NA as there is only one p_value returned. -#' \item `hist_PIT` a plot object with the PIT histogram. Only returned -#' if `plot = TRUE`. Call `plot(PIT(...)$hist_PIT)` to display the histogram. -#' \item `p_values`: all p_values generated from the Anderson-Darling tests -#' on the (randomised) PIT. Only returned if `full_output = TRUE` -#' \item `u`: the u_t values internally computed. Only returned if -#' `full_output = TRUE` -#' } -#' @importFrom goftest ad.test -#' @importFrom stats runif sd +#' @param n_replicates the number of draws for the randomised PIT for +#' integer predictions. +#' @inheritParams ae_median_sample +#' @return A vector with PIT-values. For continuous forecasts, the vector will +#' correspond to the length of `true_values`. 
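As a quick calibration check in line with the documentation above, PIT values of an ideal continuous forecaster should be roughly uniform. The sketch below (random toy data, this package version assumed) draws them with pit_sample() and inspects a simple histogram.

library(scoringutils)

set.seed(123)
true_values <- rnorm(100, mean = 1:100)
predictions <- replicate(1000, rnorm(100, mean = 1:100))

pit_values <- pit_sample(true_values, predictions)
hist(pit_values, breaks = seq(0, 1, by = 0.1), main = "PIT histogram")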
For integer forecasts, a +#' randomised PIT will be returned of length +#' `length(true_values) * n_replicates` +#' @seealso [pit()] +#' @importFrom stats runif #' @examples -#' #' ## continuous predictions #' true_values <- rnorm(30, mean = 1:30) #' predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) -#' pit(true_values, predictions) +#' pit <- pit_sample(true_values, predictions) +#' plot_pit(pit) #' #' ## integer predictions #' true_values <- rpois(100, lambda = 1:100) #' predictions <- replicate(5000, rpois(n = 100, lambda = 1:100)) -#' pit(true_values, predictions, n_replicates = 5) -#' +#' pit <- pit_sample(true_values, predictions, n_replicates = 50) +#' plot_pit(pit) #' @export #' @references #' Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, #' Rosalind M. Eggo, W. John Edmunds (2019) Assessing the performance of #' real-time epidemic forecasts: A case study of Ebola in the Western Area #' region of Sierra Leone, 2014-15, +#' @keywords metric - -pit <- function(true_values, - predictions, - plot = TRUE, - full_output = FALSE, - n_replicates = 50, - num_bins = NULL, - verbose = FALSE) { - - +pit_sample <- function(true_values, + predictions, + n_replicates = 100) { # error handling-------------------------------------------------------------- # check al arguments are provided - if (!all(c(methods::hasArg("true_values"), methods::hasArg("predictions")))) { - stop("`true_values` or `predictions` missing in function 'pit()'") + # this could be integrated into check_not_null + if (missing("true_values") | missing("predictions")) { + stop("`true_values` or `predictions` missing in function 'pit_sample()'") } check_not_null(true_values = true_values, predictions = predictions) # check if there is more than one observation n <- length(true_values) if (n == 1) { - if (verbose) { - message("you need more than one observation to assess uniformity of the PIT") - } - out <- list(p_value = NA, - sd = NA) - if (full_output) { - out <- list(p_values = NA, - calibration = NA, - u = NA) - } + message( + "you need more than one observation to assess uniformity of the PIT" + ) + return(NA) } # check and handle format of predictions @@ -139,18 +107,20 @@ pit <- function(true_values, predictions <- as.matrix(predictions) } if (!is.matrix(predictions)) { - msg <- sprintf("'predictions' should be a matrix. Instead `%s` was found", - class(predictions[1])) + msg <- sprintf( + "'predictions' should be a matrix. 
Instead `%s` was found", + class(predictions)[1] + ) stop(msg) } if (nrow(predictions) != n) { - - msg <- sprintf("Mismatch: 'true_values' has length `%s`, but 'predictions' has `%s` rows.", - n, nrow(predictions)) + msg <- sprintf( + "Mismatch: 'true_values' has length `%s`, but 'predictions' has `%s` rows.", + n, nrow(predictions) + ) stop(msg) } - # check data type ------------------------------------------------------------ # check whether continuous or integer if (!isTRUE(all.equal(as.vector(predictions), as.integer(predictions)))) { @@ -159,67 +129,25 @@ pit <- function(true_values, continuous_predictions <- FALSE } - - # calculate PIT -------------------------------------------------------------- + # calculate PIT-values ------------------------------------------------------- n_pred <- ncol(predictions) # calculate emipirical cumulative distribution function as # Portion of (y_true <= y_predicted) - P_x <- rowSums(predictions <= true_values) / n_pred + p_x <- rowSums(predictions <= true_values) / n_pred # calculate PIT for continuous predictions case if (continuous_predictions) { - p_value <- goftest::ad.test(P_x)$p.value - out <- list(p_value = p_value, - sd = NA) - if (plot) { - hist_PIT <- hist_PIT(P_x, num_bins = num_bins, caption = p_value) - out$hist_PIT = hist_PIT - } - if(full_output) { - out$u <- P_x - out$p_values <- p_value - } - } - - # calculate PIT for integer predictions case - if (!continuous_predictions) { - # empirical cdf for (y-1) for integer-valued predictions - P_xm1 <- rowSums(predictions <= (true_values - 1)) / n_pred - # do n_replicates times for randomised PIT - u <- replicate(n_replicates, P_xm1 + stats::runif(n) * (P_x - P_xm1)) - # apply Anderson Darling test on u values - p_values <- apply( - u, - MARGIN = 2, - FUN = function (x) { - goftest::ad.test(x)$p.value - } + pit_values <- p_x + } else { + p_xm1 <- rowSums(predictions <= (true_values - 1)) / n_pred + pit_values <- as.vector( + replicate(n_replicates, p_xm1 + runif(1) * (p_x - p_xm1)) ) - out <- list(p_value = mean(p_values), - sd = stats::sd(p_values)) - # add additional output if desired - if (full_output) { - out$u <- u - out$p_values <- p_values - } - # make plot if desired - if (plot) { - hist_PIT <- hist_PIT(rowMeans(u), num_bins = num_bins, - caption = mean(p_values)) - out$hist_PIT = hist_PIT - } } - - return(out) + return(pit_values) } - - - - - - #' @title Probability Integral Transformation (data.frame Format) #' #' @description Wrapper around `pit()` for use in data.frames @@ -228,227 +156,72 @@ pit <- function(true_values, #' see [pit()] #' #' @param data a data.frame with the following columns: `true_value`, -#' `prediction`, `sample` -#' @inheritParams pit -#' @return a list with the following components: -#' \itemize{ -#' \item `data`: the input data.frame (not including rows where prediction is `NA`), -#' with added columns `pit_p_val` and `pit_sd` -#' \item `hist_PIT` a plot object with the PIT histogram. Only returned -#' if `plot = TRUE`. Call -#' `plot(PIT(...)$hist_PIT)` to display the histogram. -#' \item `p_values`: all p_values generated from the Anderson-Darling tests on the -#' (randomised) PIT. Only returned if `full_output = TRUE` -#' \item `u`: the u_t values internally computed. Only returned if -#' `full_output = TRUE` -#' } -#' @importFrom goftest ad.test -#' @importFrom stats runif sd +#' `prediction`, `sample`. +#' @param by Character vector with the columns according to which the +#' PIT values shall be grouped. If you e.g. 
have the columns 'model' and +#' 'location' in the data and want to have a PIT histogram for +#' every model and location, specify `by = c("model", "location")`. +#' @inheritParams pit_sample +#' @return a data.table with PIT values according to the grouping specified in +#' `by` #' @examples -#' example <- scoringutils::continuous_example_data -#' result <- pit_df(example, full_output = TRUE) +#' result <- pit(example_continuous, by = "model") +#' plot_pit(result) #' +#' # example with quantile data +#' result <- pit(example_quantile, by = "model") +#' plot_pit(result) #' @export #' @references #' Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, #' Rosalind M. Eggo, W. John Edmunds (2019) Assessing the performance of #' real-time epidemic forecasts: A case study of Ebola in the Western Area #' region of Sierra Leone, 2014-15, +#' @keywords scoring -pit_df <- function(data, - plot = TRUE, - full_output = FALSE, - n_replicates = 100, - num_bins = NULL, - verbose = FALSE) { - - data <- data.table::as.data.table(data) - - # filter out instances where prediction is NA - data <- data[!is.na(prediction)] - - # reformat data.table to wide format for PIT - data_wide <- data.table::dcast(data, ... ~ paste("sampl_", sample, sep = ""), - value.var = "prediction") - - samples <- as.matrix(data_wide[, grepl("sampl_", colnames(data_wide)), - with = FALSE]) - # extract true values - true_values <- data_wide$true_value - - pit_arguments = list(true_values = true_values, - predictions = samples, - plot = plot, - full_output = full_output, - n_replicates = n_replicates, - num_bins = num_bins, - verbose = verbose) - - # call pit with samples and true values - res <- do.call(pit, pit_arguments) - - # add results back to the data.frame - data[, `:=` (pit_p_val = res$p_value, - pit_sd = res$sd)] - - out <- list(data = data, - hist_PIT = res$hist_PIT) - - if (full_output) { - out$p_values <- res$p_values - out$u <- res$u - } - - return(out) -} - +pit <- function(data, + by, + n_replicates = 100) { + check_data <- check_forecasts(data) + data <- check_data$cleaned_data + prediction_type <- check_data$prediction_type + # if prediction type is quantile, simply extract coverage values from + # score and returned a list with named vectors + if (prediction_type == "quantile") { + coverage <- + score(data, metrics = "quantile_coverage") -#' @title Probability Integral Transformation (data.frame Format, fast version) -#' -#' @description Wrapper around `pit()` for fast use in data.frames. This version -#' of the pit does not do allow any plotting, but can iterate over categories -#' in a data.frame as specified in the `by` argument. -#' -#' @details -#' see [pit()] -#' -#' @param data a data.frame with the following columns: `true_value`, -#' `prediction`, `sample` -#' @inheritParams pit -#' @param by character vector with categories to iterate over -#' @return the input data.frame (not including rows where prediction is `NA`), -#' with added columns `pit_p_val` and `pit_sd` -#' @importFrom goftest ad.test -#' @importFrom stats runif sd -#' @examples -#' example <- scoringutils::continuous_example_data -#' result <- pit_df(example, full_output = TRUE) -#' -#' @export -#' @references -#' Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, -#' Rosalind M. Eggo, W. 
John Edmunds (2019) Assessing the performance of -#' real-time epidemic forecasts: A case study of Ebola in the Western Area -#' region of Sierra Leone, 2014-15, - -pit_df_fast <- function(data, - n_replicates = 100, - by = by) { - - data <- data.table::as.data.table(data) - - # filter out instances where prediction is NA - data <- data[!is.na(prediction)] - - # define arguments for call to PIT function - pit_arguments = list(plot = FALSE, - full_output = FALSE, - n_replicates = n_replicates, - num_bins = 1, - verbose = FALSE) - - # reformat data.table to wide format for PIT - data_wide <- data.table::dcast(data, ... ~ paste("sampl_", sample, sep = ""), - value.var = "prediction") - - # calculate PIT values - data_wide[, c("pit_p_val", "pit_sd") := do.call(pit, c(list(true_value, - as.matrix(.SD)), - pit_arguments)), - .SDcols = names(data_wide)[grepl("sampl_", names(data_wide))], by = by] - - # melt data back - sample_names <- names(data_wide)[grepl("sampl_", names(data_wide))] - data <- data.table::melt(data_wide, - measure.vars = sample_names, - variable.name = "sample", - value.name = "prediction") - - - return(data) -} - - - - - - - - - -#' @title PIT Histogram -#' -#' @description -#' Make a simple histogram of the probability integral transformed values to -#' visually check whether a uniform distribution seems likely. -#' -#' @param PIT_samples A vector with the PIT values of size n -#' @param num_bins the number of bins in the PIT histogram. -#' @param caption provide a caption that gets passed to the plot -#' If not given, the square root of n will be used -#' @return vector with the scoring values -#' @importFrom ggplot2 ggplot aes xlab ylab geom_histogram stat - - -hist_PIT <- function(PIT_samples, - num_bins = NULL, - caption = NULL) { - - if (is.null(num_bins)) { - n <- length(PIT_samples) - num_bins = round(sqrt(n)) - } - - hist_PIT <- ggplot2::ggplot(data = data.frame(x = PIT_samples), - ggplot2::aes(x = x)) + - ggplot2::geom_histogram(ggplot2::aes(y = stat(count) / sum(count)), - breaks = seq(0, 1, length.out = num_bins + 1), - colour = "grey") + - ggplot2::xlab("PIT") + - ggplot2::ylab("Frequency") + - ggplot2::labs(caption = paste0("p-value of Andersen-Darling test for uniformity: ", - round(caption, 3))) - - return(hist_PIT) -} - - - -#' @title PIT Histogram Quantile -#' -#' @description -#' Make a simple histogram of the probability integral transformed values to -#' visually check whether a uniform distribution seems likely. -#' -#' @param PIT_samples A vector with the PIT values of size n -#' @param num_bins the number of bins in the PIT histogram. 
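Usage sketch for the new data.frame interface pit(), which takes over from the pit_df() and pit_df_fast() helpers removed above, with plot_pit() used for the histogram (see the @examples earlier in this file). It assumes this version of scoringutils; the "location" grouping follows the wording of the `by` documentation and presumes such a column exists in the example data.

library(scoringutils)

# sample-based forecasts, grouped by model
pit_values <- pit(example_continuous, by = "model")
plot_pit(pit_values)

# finer grouping, e.g. one PIT histogram per model and location
pit(example_continuous, by = c("model", "location"))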
-#' @param caption provide a caption that gets passed to the plot -#' If not given, the square root of n will be used -#' @return vector with the scoring values -#' @importFrom ggplot2 ggplot aes xlab ylab geom_histogram stat - + coverage <- summarise_scores(coverage, + by = unique(c(by, "quantile")) + ) -hist_PIT_quantile <- function(PIT_samples, - num_bins = NULL, - caption = NULL) { + coverage <- coverage[order(quantile), + .( + quantile = c(quantile, 1), + pit_value = diff(c(0, quantile_coverage, 1)) + ), + by = c(get_forecast_unit(coverage)) + ] - if (is.null(num_bins)) { - n <- length(PIT_samples) - num_bins = round(sqrt(n)) + return(coverage[]) } - hist_PIT <- ggplot2::ggplot(data = data.frame(x = PIT_samples), - ggplot2::aes(x = x)) + - ggplot2::geom_histogram(ggplot2::aes(y = stat(count) / sum(count)), - breaks = seq(0, 1, length.out = num_bins + 1), - colour = "grey") + - ggplot2::xlab("PIT") + - ggplot2::ylab("Frequency") + - ggplot2::labs() - - return(hist_PIT) + # if prediction type is not quantile, calculate PIT values based on samples + data_wide <- data.table::dcast(data, + ... ~ paste("InternalSampl_", sample, sep = ""), + value.var = "prediction" + ) + + pit <- data_wide[, .("pit_value" = pit_sample( + true_values = true_value, + predictions = as.matrix(.SD) + )), + by = by, + .SDcols = grepl("InternalSampl_", names(data_wide)) + ] + + return(pit[]) } - diff --git a/R/plot.R b/R/plot.R index a840c0a73..07c6dc604 100644 --- a/R/plot.R +++ b/R/plot.R @@ -2,79 +2,56 @@ #' #' @description #' Plots a coloured table of summarised scores obtained using -#' [eval_forecasts()] +#' [score()]. #' -#' @param summarised_scores A data.frame of summarised scores as produced by -#' [eval_forecasts()] -#' @param y the variable to be shown on the y-axis. If `NULL` (default), -#' all columns that are not scoring metrics will be used. Alternatively, -#' you can specify a vector with column names, e.g. +#' @param y the variable to be shown on the y-axis. Instead of a single character string, +#' you can also specify a vector with column names, e.g. #' `y = c("model", "location")`. These column names will be concatenated -#' to create a unique row identifier (e.g. "model1_location1") -#' @param select_metrics A character vector with the metrics to show. If set to -#' `NULL` (default), all metrics present in `summarised_scores` will -#' be shown -#' @param facet_formula formula for facetting in ggplot. If this is `NULL` -#' (the default), no facetting will take place -#' @param facet_wrap_or_grid Use ggplot2's `facet_wrap` or -#' `facet_grid`? Anything other than "facet_wrap" will be interpreted as -#' `facet_grid`. This only takes effect if `facet_formula` is not -#' `NULL` -#' @param ncol Number of columns for facet wrap. Only relevant if -#' `facet_formula` is given and `facet_wrap_or_grid == "facet_wrap"` +#' to create a unique row identifier (e.g. "model1_location1"). +#' @param by A character vector that determines how the colour shading for the +#' plot gets computed. By default (`NULL`), shading will be determined per +#' metric, but you can provide additional column names (see examples). +#' @param metrics A character vector with the metrics to show. If set to +#' `NULL` (default), all metrics present in `scores` will be shown. 
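A minimal usage sketch for plot_score_table() with the arguments documented above, mirroring the @examples block further down (assumes this version of scoringutils plus magrittr for the pipe):

library(scoringutils)
library(magrittr)

scores <- score(example_quantile) %>%
  summarise_scores(by = "model") %>%
  summarise_scores(fun = signif, digits = 2)  # round values for display

plot_score_table(scores, y = "model")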
+#' #' @return A ggplot2 object with a coloured table of summarised scores -#' @importFrom ggplot2 ggplot aes element_blank element_text labs coord_cartesian +#' @inheritParams pairwise_comparison +#' @importFrom ggplot2 ggplot aes element_blank element_text labs coord_cartesian coord_flip #' @importFrom data.table setDT melt #' @importFrom stats sd #' @export #' #' @examples -#' scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, -#' summarise_by = c("model", "value_desc")) -#' scoringutils::score_table(scores, y = "model", facet_formula = ~ value_desc, -#' ncol = 1) -#' -#' # can also put target description on the y-axis -#' scoringutils::score_table(scores, y = c("model", "value_desc")) -#' -#' # yields the same result in this case -#' scoringutils::score_table(scores) +#' library(ggplot2) +#' library(magrittr) # pipe operator #' +#' scores <- score(example_quantile) %>% +#' summarise_scores(by = c("model", "target_type")) %>% +#' summarise_scores(fun = signif, digits = 2) #' -#' scores <- scoringutils::eval_forecasts(scoringutils::integer_example_data, -#' summarise_by = c("model", "value_desc")) -#' scoringutils::score_table(scores, y = "model", facet_formula = ~ value_desc, -#' ncol = 1) +#' plot_score_table(scores, y = "model", by = "target_type") + +#' facet_wrap(~target_type, ncol = 1) #' -#' # only show selected metrics -#' scoringutils::score_table(scores, y = "model", facet_formula = ~ value_desc, -#' ncol = 1, select_metrics = c("crps", "bias")) - -score_table <- function(summarised_scores, - y = NULL, - select_metrics = NULL, - facet_formula = NULL, - ncol = NULL, - facet_wrap_or_grid = "facet_wrap") { - +#' # can also put target description on the y-axis +#' plot_score_table(scores, +#' y = c("model", "target_type"), +#' by = "target_type") +plot_score_table <- function(scores, + y = "model", + by = NULL, + metrics = NULL) { # identify metrics ----------------------------------------------------------- - # identify metrics by looking at which of the available column names - # are metrics. 
All other variables are treated as identifier variables - all_metrics <- available_metrics() - - metrics <- names(summarised_scores)[names(summarised_scores) %in% all_metrics] - id_vars <- names(summarised_scores)[!(names(summarised_scores) %in% all_metrics)] - - - # metrics to delete - summarised_scores <- data.table::as.data.table(summarised_scores) - - if (!is.null(select_metrics)) { - to_delete <- setdiff(metrics, select_metrics) - summarised_scores[, (to_delete) := NULL] + id_vars <- get_forecast_unit(scores) + if (is.null(metrics)) { + metrics <- names(scores)[names(scores) %in% available_metrics()] } + scores <- delete_columns( + scores, + names(scores)[!(names(scores) %in% c(metrics, id_vars))] + ) + # compute scaled values ------------------------------------------------------ # scaling is done in order to colour the different scores # for most metrics larger is worse, but others like bias are better if they @@ -84,9 +61,10 @@ score_table <- function(summarised_scores, # which not (metrics like bias where deviations in both directions are bad) metrics_zero_good <- c("bias", "coverage_deviation") metrics_no_color <- c("coverage") - metrics_p_val <- c("pit_p_val") - metrics_min_good <- setdiff(metrics, c(metrics_zero_good, metrics_p_val, - metrics_no_color)) + + metrics_min_good <- setdiff(metrics, c( + metrics_zero_good, metrics_no_color + )) # write scale functions that can be used in data.table scale <- function(x) { @@ -97,264 +75,144 @@ score_table <- function(summarised_scores, scaled <- (x - min(x)) / sd(x, na.rm = TRUE) return(scaled) } - scale_p_val <- function(x) { - out <- rep(0, length(x)) - out[x < 0.1] <- 0.2 - out[x < 0.05] <- 0.5 - out[x < 0.01] <- 1 - return(out) - } # pivot longer and add scaled values - df <- data.table::melt(summarised_scores, value.vars = metrics, - id.vars = id_vars, - variable.name = "metric") + df <- data.table::melt(scores, + value.vars = metrics, + id.vars = id_vars, + variable.name = "metric" + ) df[metric %in% metrics_min_good, value_scaled := scale_min_good(value), - by = metric] + by = c("metric", by) + ] df[metric %in% metrics_zero_good, value_scaled := scale(value), - by = metric] + by = c("metric", by) + ] df[metric %in% metrics_no_color, value_scaled := 0, - by = metric] - df[metric %in% metrics_p_val, value_scaled := scale_p_val(value), - by = metric] + by = c("metric", by) + ] - # create identifier column for plot if not given ----------------------------- - if (is.null(y)) { - # create an identifier column by concatinating all columns that - # are not a metric - identifier_columns <- names(df)[!names(df) %in% - c("metric", "value", "value_scaled")] + # create identifier column for plot ------------------------------------------ + # if there is only one column, leave column as is. 
Reason to do that is that + # users can then pass in a factor and keep the ordering of that column intact + if (length(y) > 1) { + df[, identifCol := do.call(paste, c(.SD, sep = "_")), + .SDcols = y[y %in% names(df)] + ] } else { - identifier_columns <- y + setnames(df, old = eval(y), new = "identifCol") } - df[, identif := do.call(paste, c(.SD, sep = "_")), - .SDcols = identifier_columns] - - # plot ----------------------------------------------------------------------- # make plot with all metrics that are not NA - plot <- ggplot2::ggplot(df[!is.na(value), ], - ggplot2::aes(y = identif, x = metric)) + - #ggplot2::geom_tile(fill = "blue") + - ggplot2::geom_tile(ggplot2::aes(fill = value_scaled), colour = "white") + - ggplot2::geom_text(ggplot2::aes(y = identif, label = round(value, 2))) + - ggplot2::scale_fill_gradient2(low = "steelblue", high = "salmon") + - ggplot2::theme_light() + - ggplot2::theme(legend.title = ggplot2::element_blank(), - legend.position = "none", - axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, - hjust=1)) + - ggplot2::labs(x = "", y = "") + - ggplot2::coord_cartesian(expand=FALSE) - - if (!is.null(facet_formula)) { - if (facet_wrap_or_grid == "facet_wrap") { - plot <- plot + - ggplot2::facet_wrap(facet_formula, ncol = ncol) - } else { - plot <- plot + - ggplot2::facet_grid(facet_formula) - } - } + plot <- ggplot( + df[!is.na(value), ], + aes(y = identifCol, x = metric) + ) + + # geom_tile(fill = "blue") + + geom_tile(aes(fill = value_scaled), colour = "white", show.legend = FALSE) + + geom_text(aes(y = identifCol, label = value)) + + scale_fill_gradient2(low = "steelblue", high = "salmon") + + theme_scoringutils() + + theme( + legend.title = element_blank(), + legend.position = "none", + axis.text.x = element_text( + angle = 90, vjust = 1, + hjust = 1 + ) + ) + + labs(x = "", y = "") + + coord_cartesian(expand = FALSE) return(plot) - } - -#' @title Plot Correlation Between Metrics -#' -#' @description -#' Plots a coloured table of scores obtained using -#' [eval_forecasts()] -#' -#' @param scores A data.frame of scores as produced by -#' [eval_forecasts()] -#' @param select_metrics A character vector with the metrics to show. 
If set to -#' `NULL` (default), all metrics present in `summarised_scores` will -#' be shown -#' @return A ggplot2 object showing a coloured matrix of correlations -#' between metrics -#' @importFrom ggplot2 ggplot geom_tile geom_text aes scale_fill_gradient2 -#' element_text labs coord_cartesian theme theme_light -#' @importFrom stats cor na.omit -#' @importFrom data.table setDT melt -#' @importFrom forcats fct_relevel fct_rev -#' @export -#' -#' @examples -#' scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data) -#' scoringutils::correlation_plot(scores) - - -correlation_plot <- function(scores, - select_metrics = NULL) { - - # define possible metrics - all_metrics <- available_metrics() - - # find metrics present - metrics <- names(scores)[names(scores) %in% all_metrics] - - # restrict to selected metrics - if (!is.null(select_metrics)) { - metrics <- metrics[metrics %in% metrics_select] - } - - # remove all non metrics and non-numeric columns - df <- scores[, .SD, .SDcols = sapply(scores, - function(x) { - (all(is.numeric(x))) && all(is.finite(x)) - })] - df <- df[, .SD, .SDcols = names(df) %in% metrics] - - # define correlation matrix - cor_mat <- round(cor(as.matrix(df)), 2) - - # define function to obtain upper triangle of matrix - get_lower_tri <- function(cormat){ - cormat[lower.tri(cormat)] <- NA - return(cormat) - } - - # get plot data.frame - plot_df <- data.table::setDT(as.data.frame(get_lower_tri(cor_mat)), - keep.rownames = TRUE)[, metric := rn][, rn := NULL] - plot_df <- na.omit(data.table::melt(plot_df, id.vars = "metric")) - - # refactor levels according to the metrics - metrics <- unique(plot_df$metric) - plot_df[, metric := forcats::fct_relevel(metric, metrics)] - plot_df[, variable := forcats::fct_relevel(variable, metrics)] - plot_df[, variable := forcats::fct_rev(variable)] - - plot <- ggplot2::ggplot(plot_df, ggplot2::aes(x = variable, y = metric, - fill = value)) + - ggplot2::geom_tile(color = "white", - width = 0.97, height = 0.97) + - ggplot2::geom_text(ggplot2::aes(y = metric, label = value)) + - ggplot2::scale_fill_gradient2(low = "steelblue", mid = "white", - high = "salmon", - name = "Correlation", - breaks = c(-1, -0.5, 0, 0.5, 1)) + - ggplot2::theme_light() + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, - hjust=1), - panel.grid.major.y = ggplot2::element_blank(), - panel.grid.minor.y = ggplot2::element_blank(), - panel.grid.major.x = ggplot2::element_blank(), - panel.grid.minor.x = ggplot2::element_blank()) + - ggplot2::labs(x = "", y = "") + - ggplot2::coord_cartesian(expand = FALSE) + - ggplot2::labs(title = "Correlation between metrics") - - return(plot) -} - - - - - - #' @title Plot Contributions to the Weighted Interval Score #' #' @description #' Visualise the components of the weighted interval score: penalties for -#' over-prediction, under-prediction and for a lack of sharpness +#' over-prediction, under-prediction and for high dispersion (lack of sharpness) #' #' @param scores A data.frame of scores based on quantile forecasts as -#' produced by [eval_forecasts()] +#' produced by [score()] and summarised using [summarise_scores()] #' @param x The variable from the scores you want to show on the x-Axis. -#' Usually this will be "model" -#' @param group Choose a grouping variable for the plot that gets directly -#' passed down to ggplot. Default is `NULL` +#' Usually this will be "model". #' @param relative_contributions show relative contributions instead of absolute #' contributions. 
Default is FALSE and this functionality is not available yet. -#' @param facet_formula facetting formula passed down to ggplot. Default is -#' `NULL` -#' @param scales scales argument that gets passed down to ggplot. Only necessary -#' if you make use of facetting. Default is "free_y" -#' @param facet_wrap_or_grid Use ggplot2's `facet_wrap` or -#' `facet_grid`? Anything other than "facet_wrap" will be interpreted as -#' `facet_grid`. This only takes effect if `facet_formula` is not -#' `NULL` -#' @param ncol Number of columns for facet wrap. Only relevant if -#' `facet_formula` is given and `facet_wrap_or_grid == "facet_wrap"` -#' @param x_text_angle Angle for the text on the x-axis. Default is 90 -#' @param xlab Label for the x-axis. Default is the variable name on the x-axis -#' @param ylab Label for the y-axis. Default is "WIS contributions" +#' @param flip boolean (default is `FALSE`), whether or not to flip the axes. #' @return A ggplot2 object showing a contributions from the three components of #' the weighted interval score #' @importFrom ggplot2 ggplot aes_string aes geom_linerange facet_wrap labs -#' theme theme_light unit +#' theme theme_light unit guides guide_legend #' @export #' #' @examples -#' scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, -#' summarise_by = c("model", "value_desc")) -#' scoringutils::wis_components(scores, x = "model", facet_formula = ~ value_desc, -#' relative_contributions = TRUE) -#' scoringutils::wis_components(scores, x = "model", facet_formula = ~ value_desc, -#' relative_contributions = FALSE) +#' library("scoringutils") +#' library(ggplot2) +#' scores <- score(example_quantile) +#' scores <- summarise_scores(scores, by = c("model", "target_type")) +#' +#' plot_wis(scores, +#' x = "model", +#' relative_contributions = TRUE +#' ) + +#' facet_wrap(~target_type) +#' plot_wis(scores, +#' x = "model", +#' relative_contributions = FALSE +#' ) + +#' facet_wrap(~target_type, scales = "free_x") #' @references #' Bracher J, Ray E, Gneiting T, Reich, N (2020) Evaluating epidemic forecasts -#' in an interval format. - - -wis_components <- function(scores, - x = "model", - group = NULL, - relative_contributions = FALSE, - facet_formula = NULL, - scales = "free_y", - ncol = NULL, - facet_wrap_or_grid = "facet_wrap", - x_text_angle = 90, - xlab = x, - ylab = "WIS contributions") { +#' in an interval format. 
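A quick illustrative sketch, not part of the patch: plot_wis() (defined just below) can offer both position = "stack" and position = "fill" because the three plotted components add up to the weighted interval score. Assuming the example data shipped with the package:

library(scoringutils)
scores <- summarise_scores(score(example_quantile), by = "model")
# the decomposition is exact, so this is expected to return TRUE
all.equal(
  scores$interval_score,
  scores$overprediction + scores$underprediction + scores$dispersion
)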
+plot_wis <- function(scores, + x = "model", + relative_contributions = FALSE, + flip = FALSE) { scores <- data.table::as.data.table(scores) scores <- data.table::melt(scores, - measure.vars = c("overprediction", - "underprediction", - "sharpness"), - variable.name = "wis_component_name", - value.name = "component_value") + measure.vars = c( + "overprediction", + "underprediction", + "dispersion" + ), + variable.name = "wis_component_name", + value.name = "component_value" + ) # stack or fill the geom_col position col_position <- ifelse(relative_contributions, "fill", "stack") - plot <- ggplot2::ggplot(scores, ggplot2::aes_string(x = x, group = group)) + - ggplot2::geom_col(position = col_position, - ggplot2::aes(y = component_value, fill = wis_component_name)) + - ggplot2::facet_wrap(facet_formula, ncol = ncol, - scales = scales) + - ggplot2::labs(x = xlab, y = ylab) + - ggplot2::theme_light() + - ggplot2::theme(panel.spacing = ggplot2::unit(4, "mm"), - axis.text.x = ggplot2::element_text(angle = x_text_angle, - vjust = 1, - hjust=1)) + plot <- ggplot(scores, aes_string(y = x)) + + geom_col( + position = col_position, + aes(x = component_value, fill = wis_component_name) + ) + + theme_scoringutils() + + guides(fill = guide_legend(title = "WIS component")) + + xlab("WIS contributions") - if (!is.null(facet_formula)) { - if (facet_wrap_or_grid == "facet_wrap") { - plot <- plot + - ggplot2::facet_wrap(facet_formula, ncol = ncol, - scales = scales) - } else { - plot <- plot + - ggplot2::facet_grid(facet_formula, scales = scales) - } + if (flip) { + plot <- plot + + theme( + panel.spacing = unit(4, "mm"), + axis.text.x = element_text( + angle = 90, + vjust = 1, + hjust = 1 + ) + ) + + coord_flip() } return(plot) - } @@ -363,30 +221,18 @@ wis_components <- function(scores, #' #' @description #' Visualise the metrics by range, e.g. if you are interested how different -#' interval ranges contribute to the overall interval score, or how sharpness -#' changes by range. +#' interval ranges contribute to the overall interval score, or how +#' sharpness / dispersion changes by range. #' #' @param scores A data.frame of scores based on quantile forecasts as -#' produced by [eval_forecasts()]. Note that "range" must be included -#' in the `summarise_by` argument when running `eval_forecasts` +#' produced by [score()] or [summarise_scores()]. Note that "range" must be included +#' in the `by` argument when running [summarise_scores()] #' @param y The variable from the scores you want to show on the y-Axis. -#' This could be something like "interval_score" (the default) or "sharpness" +#' This could be something like "interval_score" (the default) or "dispersion" #' @param x The variable from the scores you want to show on the x-Axis. #' Usually this will be "model" #' @param colour Character vector of length one used to determine a variable #' for colouring dots. The Default is "range". -#' @param facet_formula facetting formula passed down to ggplot. Default is -#' `NULL` -#' @param scales scales argument that gets passed down to ggplot. Only necessary -#' if you make use of facetting. Default is "free_y" -#' @param facet_wrap_or_grid Use ggplot2's `facet_wrap` or -#' `facet_grid`? Anything other than "facet_wrap" will be interpreted as -#' `facet_grid`. This only takes effect if `facet_formula` is not -#' `NULL` -#' @param ncol Number of columns for facet wrap. Only relevant if -#' `facet_formula` is given and `facet_wrap_or_grid == "facet_wrap"` -#' @param xlab Label for the x-axis. 
Default is the variable name on the x-axis -#' @param ylab Label for the y-axis. Default is "WIS contributions" #' @return A ggplot2 object showing a contributions from the three components of #' the weighted interval score #' @importFrom ggplot2 ggplot aes_string aes geom_point geom_line @@ -394,61 +240,44 @@ wis_components <- function(scores, #' @export #' #' @examples -#' scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, -#' summarise_by = c("model", "value_desc", "range")) +#' library("scoringutils") +#' library(ggplot2) +#' scores <- score(example_quantile) +#' scores <- summarise_scores(scores, by = c("model", "target_type", "range")) #' -#' scoringutils::range_plot(scores, x = "model", facet_formula = ~ value_desc) +#' plot_ranges(scores, x = "model") + +#' facet_wrap(~target_type, scales = "free") #' -#' # visualise sharpness instead of interval score -#' scoringutils::range_plot(scores, y = "sharpness", x = "model", -#' facet_formula = ~value_desc) -#' -#' # we saw above that sharpness values crossed. Let's look at the unweighted WIS -#' scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, -#' interval_score_arguments = list(weigh = FALSE), -#' summarise_by = c("model", "value_desc", "range")) -#' scoringutils::range_plot(scores, y = "sharpness", x = "model", -#' facet_formula = ~value_desc) - - -range_plot <- function(scores, - y = "interval_score", - x = "model", - colour = "range", - facet_formula = NULL, - scales = "free_y", - ncol = NULL, - facet_wrap_or_grid = "facet_wrap", - xlab = x, - ylab = y) { - - plot <- ggplot2::ggplot(scores, - ggplot2::aes_string(x = x, - y = y, - colour = colour)) + - ggplot2::geom_point(size = 2) + - ggplot2::geom_line(ggplot2::aes(group = range), - colour = "black", - size = 0.01) + - ggplot2::theme_light() + - ggplot2::expand_limits(y = 0) + - ggplot2::scale_color_continuous(low = "steelblue", high = "salmon") + - ggplot2::theme(legend.position = "right", - axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, - hjust=1)) + - ggplot2::labs(y = ylab, - x = xlab) - - if (!is.null(facet_formula)) { - if (facet_wrap_or_grid == "facet_wrap") { - plot <- plot + - ggplot2::facet_wrap(facet_formula, ncol = ncol, - scales = scales) - } else { - plot <- plot + - ggplot2::facet_grid(facet_formula, scales = scales) - } - } +#' # visualise dispersion instead of interval score +#' plot_ranges(scores, y = "dispersion", x = "model") + +#' facet_wrap(~target_type) +plot_ranges <- function(scores, + y = "interval_score", + x = "model", + colour = "range") { + plot <- ggplot( + scores, + aes_string( + x = x, + y = y, + colour = colour + ) + ) + + geom_point(size = 2) + + geom_line(aes(group = range), + colour = "black", + size = 0.01 + ) + + theme_scoringutils() + + expand_limits(y = 0) + + scale_color_continuous(low = "steelblue", high = "salmon") + + theme( + legend.position = "right", + axis.text.x = element_text( + angle = 90, vjust = 1, + hjust = 1 + ) + ) return(plot) } @@ -463,25 +292,13 @@ range_plot <- function(scores, #' different locations. #' #' @param scores A data.frame of scores based on quantile forecasts as -#' produced by [eval_forecasts()]. +#' produced by [score()]. #' @param y The variable from the scores you want to show on the y-Axis. The #' default for this is "model" #' @param x The variable from the scores you want to show on the x-Axis. 
This #' could be something like "horizon", or "location" #' @param metric the metric that determines the value and colour shown in the #' tiles of the heatmap -#' @param xlab Label for the x-axis. Default is the variable name on the x-axis -#' @param ylab Label for the y-axis. Default is the variable name on the y-axis -#' @param facet_formula facetting formula passed down to ggplot. Default is -#' `NULL` -#' @param scales scales argument that gets passed down to ggplot. Only necessary -#' if you make use of facetting. Default is "free_y" -#' @param facet_wrap_or_grid Use ggplot2's `facet_wrap` or -#' `facet_grid`? Anything other than "facet_wrap" will be interpreted as -#' `facet_grid`. This only takes effect if `facet_formula` is not -#' `NULL` -#' @param ncol Number of columns for facet wrap. Only relevant if -#' `facet_formula` is given and `facet_wrap_or_grid == "facet_wrap"` #' @return A ggplot2 object showing a heatmap of the desired metric #' @importFrom data.table setDT `:=` #' @importFrom ggplot2 ggplot aes_string aes geom_tile geom_text @@ -489,52 +306,37 @@ range_plot <- function(scores, #' @export #' #' @examples -#' scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, -#' summarise_by = c("model", "value_desc", "range")) -#' -#' scoringutils::score_heatmap(scores, x = "value_desc", metric = "bias") +#' library("scoringutils") +#' scores <- score(example_quantile) +#' scores <- summarise_scores(scores, by = c("model", "target_type", "range")) #' - - - -score_heatmap <- function(scores, - y = "model", - x, - metric, - facet_formula = NULL, - scales = "free_y", - ncol = NULL, - facet_wrap_or_grid = "facet_wrap", - ylab = y, - xlab = x) { - - +#' plot_heatmap(scores, x = "target_type", metric = "bias") +plot_heatmap <- function(scores, + y = "model", + x, + metric) { data.table::setDT(scores) scores[, eval(metric) := round(get(metric), 2)] - plot <- ggplot2::ggplot(scores, - ggplot2::aes_string(y = y, - x = x, - fill = metric)) + - ggplot2::geom_tile() + - ggplot2::geom_text(ggplot2::aes_string(label = metric)) + - ggplot2::scale_fill_gradient2(low = "skyblue", high = "red") + - ggplot2::labs(y = ylab, x = xlab) + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, - hjust=1)) + - ggplot2::coord_cartesian(expand = FALSE) + plot <- ggplot( + scores, + aes_string( + y = y, + x = x, + fill = metric + ) + ) + + geom_tile() + + geom_text(aes_string(label = metric)) + + scale_fill_gradient2(low = "skyblue", high = "red") + + theme_scoringutils() + + theme(axis.text.x = element_text( + angle = 90, vjust = 1, + hjust = 1 + )) + + coord_cartesian(expand = FALSE) - if (!is.null(facet_formula)) { - if (facet_wrap_or_grid == "facet_wrap") { - plot <- plot + - ggplot2::facet_wrap(facet_formula, ncol = ncol, - scales = scales) - } else { - plot <- plot + - ggplot2::facet_grid(facet_formula, scales = scales) - } - } return(plot) } @@ -547,7 +349,7 @@ score_heatmap <- function(scores, #' Make a plot of observed and predicted values #' #' @param data a data.frame that follows the same specifications outlined in -#' [eval_forecasts()]. The data.frame needs to have columns called +#' [score()]. The data.frame needs to have columns called #' "true_value", "prediction" and then either a column called sample, or one #' called "quantile" or two columns called "range" and "boundary". 
Internally, #' these will be separated into a truth and forecast data set in order to be @@ -593,34 +395,30 @@ score_heatmap <- function(scores, #' @param allow_truth_without_pred logical, whether or not #' to allow instances where there is truth data, but no forecast. If `FALSE` #' (the default), these get filtered out. -#' @param xlab Label for the x-axis. Default is the variable name on the x-axis -#' @param ylab Label for the y-axis. Default is "True and predicted values" -#' @param verbose print out additional helpful messages (default is TRUE) #' @return ggplot object with a plot of true vs predicted values -#' @importFrom ggplot2 ggplot scale_colour_manual scale_fill_manual -#' facet_wrap facet_grid +#' @importFrom ggplot2 ggplot scale_colour_manual scale_fill_manual theme_light +#' @importFrom ggplot2 facet_wrap facet_grid aes_string geom_line #' @importFrom data.table dcast +#' @importFrom ggdist geom_lineribbon #' @export #' #' @examples -#' example1 <- scoringutils::continuous_example_data -#' example2 <- scoringutils::range_example_data_long -#' -#' scoringutils::plot_predictions(example1, x = "value_date", -#' filter_truth = list('value_date <= "2020-06-22"', -#' 'value_date > "2020-05-01"'), -#' filter_forecasts = list("model == 'SIRCOVID'", -#' 'creation_date == "2020-06-22"'), -#' facet_formula = geography ~ value_desc) +#' example1 <- scoringutils::example_continuous #' -#' scoringutils::plot_predictions(example2, x = "value_date", -#' filter_truth = list('value_date <= "2020-06-22"', -#' 'value_date > "2020-05-01"'), -#' filter_forecasts = list("model == 'SIRCOVID'", -#' 'creation_date == "2020-06-22"'), -#' allow_truth_without_pred = TRUE, -#' facet_formula = geography ~ value_desc) - +#' plot_predictions( +#' example1, +#' x = "target_end_date", +#' filter_truth = list( +#' 'target_end_date <= "2021-07-22"', +#' 'target_end_date > "2021-05-01"' +#' ), +#' filter_forecasts = list( +#' "model == 'EuroCOVIDhub-ensemble'", +#' 'forecast_date == "2021-06-07"' +#' ), +#' facet_formula = location ~ target_type, +#' range = c(0, 50, 90, 95) +#' ) plot_predictions <- function(data = NULL, forecasts = NULL, truth_data = NULL, @@ -635,23 +433,18 @@ plot_predictions <- function(data = NULL, ncol = NULL, scales = "free_y", allow_truth_without_pred = FALSE, - remove_from_truth = c("model", "forecaster", "quantile", "prediction", "sample", "interval"), - xlab = x, - ylab = "True and predicted values", - verbose = TRUE) { + remove_from_truth = c("model", "forecaster", "quantile", "prediction", "sample", "interval")) { # preparations --------------------------------------------------------------- # check data argument is provided if (is.null(data) && (is.null(truth_data) | is.null(forecasts))) { - stop("need arguments 'data' in function 'eval_forecasts()', or alternatively 'forecasts' and 'truth_data'") + stop("need arguments 'data' in function 'score()', or alternatively 'forecasts' and 'truth_data'") } if (is.null(data)) { data <- merge_pred_and_obs(forecasts, truth_data, by = merge_by, join = "full") if (nrow(data) == 0) { - if (verbose) { - warning("After attempting to merge, only an empty data.table was left") - } + warning("After attempting to merge, only an empty data.table was left") return(data) } } @@ -674,7 +467,7 @@ plot_predictions <- function(data = NULL, truth_data <- filter_df(truth_data, c(filter_both, filter_truth)) forecasts <- filter_df(forecasts, c(filter_both, filter_forecasts)) - # if specificed, get all combinations of the facet variables present in the + # if 
specified, get all combinations of the facet variables present in the # forecasts and filter the truth_data accordingly if (!allow_truth_without_pred && !is.null(facet_formula)) { facet_vars <- all.vars(facet_formula) @@ -688,22 +481,25 @@ plot_predictions <- function(data = NULL, truth_data <- merge(truth_data, combinations_forecasts) # add back together truth_data <- data.table::rbindlist(list(truth_without_pred, truth_data), - use.names = TRUE) + use.names = TRUE + ) } # delete certain columns that denominate the forecaster from the truth data - truth_data <- delete_columns(truth_data, remove_from_truth) + truth_data <- delete_columns(truth_data, remove_from_truth, make_unique = TRUE) # find out what type of predictions we have. convert sample based to # range data colnames <- colnames(forecasts) if ("sample" %in% colnames) { - forecasts <- scoringutils::sample_to_range_long(forecasts, - range = range, - keep_quantile_col = FALSE) + forecasts <- sample_to_range_long(forecasts, + range = range, + keep_quantile_col = FALSE + ) } else if ("quantile" %in% colnames) { - forecasts <- scoringutils::quantile_to_range_long(forecasts, - keep_quantile_col = FALSE) + forecasts <- quantile_to_range_long(forecasts, + keep_quantile_col = FALSE + ) } # select appropriate boundaries and pivot wider @@ -716,63 +512,76 @@ plot_predictions <- function(data = NULL, intervals[, quantile := NULL] } - pal <- grDevices::colorRampPalette(c("lightskyblue1", "steelblue3")) - - plot <- ggplot2::ggplot(data = data, aes(x = !!ggplot2::sym(x))) + - ggplot2::scale_colour_manual("",values = c("black", "steelblue4")) + - ggplot2::scale_fill_manual(name = "range", values = pal(length(range))) + - ggplot2::theme_light() + plot <- ggplot(data = data, aes_string(x = x)) + + scale_colour_manual("", values = c("black", "steelblue4")) + + theme_scoringutils() if (nrow(intervals) != 0) { # pivot wider and convert range to a factor intervals <- data.table::dcast(intervals, ... 
~ boundary, value.var = "prediction") - intervals[, range := factor(range, - levels = sort(unique(range), decreasing = TRUE), - ordered = TRUE)] - # plot prediction ranges plot <- plot + - ggplot2::geom_ribbon(data = intervals, - ggplot2::aes(ymin = lower, ymax = upper, - group = range, fill = range)) + ggdist::geom_lineribbon( + data = intervals, + aes( + ymin = lower, ymax = upper, + # We use the fill_ramp aesthetic for this instead of the default fill + # because we want to keep fill to be able to use it for other + # variables + fill_ramp = factor(range, levels = sort(unique(range), decreasing = TRUE)) + ), + lwd = 0.4 + ) + + ggdist::scale_fill_ramp_discrete( + name = "range" + ) + } - # add median in a different colour + # We could treat this step as part of ggdist::geom_lineribbon() but we treat + # it separately here to deal with the case when only the median is provided + # (in which case ggdist::geom_lineribbon() will fail) if (0 %in% range) { select_median <- (forecasts$range %in% 0 & forecasts$boundary == "lower") median <- forecasts[select_median] if (nrow(median) > 0) { plot <- plot + - ggplot2::geom_line(data = median, - mapping = ggplot2::aes(y = prediction, colour = "median"), - lwd = 0.4) - } + geom_line( + data = median, + mapping = aes(y = prediction, colour = "median"), + lwd = 0.4 + ) + } } # add true_values if (nrow(truth_data) > 0) { plot <- plot + - ggplot2::geom_point(data = truth_data, - ggplot2::aes(y = true_value, colour = "actual"), - size = 0.5) + - ggplot2::geom_line(data = truth_data, - ggplot2::aes(y = true_value, colour = "actual"), - lwd = 0.2) + geom_point( + data = truth_data, + aes(y = true_value, colour = "actual"), + size = 0.5 + ) + + geom_line( + data = truth_data, + aes(y = true_value, colour = "actual"), + lwd = 0.2 + ) } plot <- plot + - ggplot2::labs(x = xlab, y = ylab) + ylab("True and predicted values") # facet if specified by the user if (!is.null(facet_formula)) { if (facet_wrap_or_grid == "facet_wrap") { plot <- plot + - ggplot2::facet_wrap(facet_formula, scales = scales, ncol = ncol) + facet_wrap(facet_formula, scales = scales, ncol = ncol) } else { plot <- plot + - ggplot2::facet_grid(facet_formula, scales = scales) + facet_grid(facet_formula, scales = scales) } } @@ -790,66 +599,53 @@ plot_predictions <- function(data = NULL, #' @description #' Plot interval coverage #' -#' @param summarised_scores Summarised scores as produced by -#' [eval_forecasts()]. Make sure that "range" is included in -#' `summarise_by` when producing the summarised scores +#' @param scores A data.frame of scores based on quantile forecasts as +#' produced by [score()] or [summarise_scores()]. Note that "range" must be included +#' in the `by` argument when running [summarise_scores()] #' @param colour According to which variable shall the graphs be coloured? #' Default is "model". -#' @param facet_formula formula for facetting in ggplot. If this is `NULL` -#' (the default), no facetting will take place -#' @param facet_wrap_or_grid Use ggplot2's `facet_wrap` or -#' `facet_grid`? Anything other than "facet_wrap" will be interpreted as -#' `facet_grid`. This only takes effect if `facet_formula` is not -#' `NULL` -#' @param scales scales argument that gets passed down to ggplot. Only necessary -#' if you make use of facetting. 
Default is "free_y" #' @return ggplot object with a plot of interval coverage #' @importFrom ggplot2 ggplot scale_colour_manual scale_fill_manual -#' facet_wrap facet_grid +#' facet_wrap facet_grid geom_polygon #' @importFrom data.table dcast #' @export #' #' @examples -#' example1 <- scoringutils::range_example_data_long -#' example1 <- scoringutils::range_long_to_quantile(example1) -#' scores <- scoringutils::eval_forecasts(example1, -#' summarise_by = c("model", "range")) -#' interval_coverage(scores) - -interval_coverage <- function(summarised_scores, - colour = "model", - facet_formula = NULL, - facet_wrap_or_grid = "facet_wrap", - scales = "free_y") { +#' library("scoringutils") +#' scores <- score(example_quantile) +#' scores <- summarise_scores(scores, by = c("model", "range")) +#' plot_interval_coverage(scores) +plot_interval_coverage <- function(scores, + colour = "model") { ## overall model calibration - empirical interval coverage - p1 <- ggplot2::ggplot(summarised_scores, ggplot2::aes_string(x = "range", - colour = colour)) + - ggplot2::geom_polygon(data = data.frame(x = c(0, 0, 100), - y = c(0, 100, 100), - g = c("o", "o", "o")), - ggplot2::aes(x = x, y = y, group = g, - fill = g), - alpha = 0.05, - colour = "white", - fill = "olivedrab3") + - ggplot2::geom_line(ggplot2::aes(y = range), colour = "grey", - linetype = "dashed") + - ggplot2::geom_line(ggplot2::aes(y = coverage * 100)) + - ggplot2::theme_light() + - ggplot2::theme(legend.position = "bottom") + - ggplot2::ylab("% Obs inside interval") + - ggplot2::xlab("Interval range") + - ggplot2::coord_cartesian(expand = FALSE) - - if (!is.null(facet_formula)) { - if (facet_wrap_or_grid == "facet_wrap") { - p1 <- p1 + - ggplot2::facet_wrap(facet_formula, scales = scales) - } else { - p1 <- p1 + - ggplot2::facet_grid(facet_formula, scales = scales) - } - } + p1 <- ggplot(scores, aes_string( + x = "range", + colour = colour + )) + + geom_polygon( + data = data.frame( + x = c(0, 0, 100), + y = c(0, 100, 100), + g = c("o", "o", "o") + ), + aes( + x = x, y = y, group = g, + fill = g + ), + alpha = 0.05, + colour = "white", + fill = "olivedrab3" + ) + + geom_line(aes(y = range), + colour = "grey", + linetype = "dashed" + ) + + geom_line(aes(y = coverage * 100)) + + theme_scoringutils() + + theme(legend.position = "bottom") + + ylab("% Obs inside interval") + + xlab("Nominal interval coverage") + + coord_cartesian(expand = FALSE) return(p1) } @@ -863,81 +659,473 @@ interval_coverage <- function(summarised_scores, #' @description #' Plot quantile coverage #' -#' @param summarised_scores Summarised scores as produced by -#' [eval_forecasts()]. Make sure that "quantile" is included in -#' `summarise_by` when producing the summarised scores +#' @param scores A data.frame of scores based on quantile forecasts as +#' produced by [score()] or [summarise_scores()]. Note that "range" must be included +#' in the `by` argument when running [summarise_scores()] #' @param colour According to which variable shall the graphs be coloured? #' Default is "model". -#' @param facet_formula formula for facetting in ggplot. If this is `NULL` -#' (the default), no facetting will take place -#' @param facet_wrap_or_grid Use ggplot2's `facet_wrap` or -#' `facet_grid`? Anything other than "facet_wrap" will be interpreted as -#' `facet_grid`. This only takes effect if `facet_formula` is not -#' `NULL` -#' @param scales scales argument that gets passed down to ggplot. Only necessary -#' if you make use of facetting. 
Default is "free_y" #' @return ggplot object with a plot of interval coverage #' @importFrom ggplot2 ggplot scale_colour_manual scale_fill_manual -#' facet_wrap facet_grid +#' scale_y_continuous #' @importFrom data.table dcast #' @export #' #' @examples -#' example1 <- scoringutils::quantile_example_data -#' scores <- scoringutils::eval_forecasts(example1, -#' summarise_by = c("model", "quantile")) -#' quantile_coverage(scores) - -quantile_coverage <- function(summarised_scores, - colour = "model", - facet_formula = NULL, - facet_wrap_or_grid = "facet_wrap", - scales = "free_y") { - - p2 <- ggplot2::ggplot(data = summarised_scores, - ggplot2::aes_string(x = "quantile", colour = colour)) + - ggplot2::geom_polygon(data = data.frame(x = c(0, 0.5, 0.5, - 0.5, 0.5, 1), - y = c(0, 0, 0.5, - 0.5, 1, 1), - g = c("o", "o", "o")), - ggplot2::aes(x = x, y = y, group = g, - fill = g), - alpha = 0.05, - colour = "white", - fill = "olivedrab3") + - ggplot2::geom_line(ggplot2::aes(y = quantile), colour = "grey", - linetype = "dashed") + - ggplot2::geom_line(ggplot2::aes(y = quantile_coverage)) + - ggplot2::theme_light() + - ggplot2::theme(legend.position = "bottom") + - ggplot2::xlab("Quantile") + - ggplot2::ylab("% obs below quantile") + - ggplot2::coord_cartesian(expand = FALSE) +#' library("scoringutils") +#' scores <- score(example_quantile) +#' scores <- summarise_scores(scores, by = c("model", "quantile")) +#' plot_quantile_coverage(scores) +plot_quantile_coverage <- function(scores, + colour = "model") { + p2 <- ggplot( + data = scores, + aes_string(x = "quantile", colour = colour) + ) + + geom_polygon( + data = data.frame( + x = c( + 0, 0.5, 0.5, + 0.5, 0.5, 1 + ), + y = c( + 0, 0, 0.5, + 0.5, 1, 1 + ), + g = c("o", "o", "o") + ), + aes( + x = x, y = y, group = g, + fill = g + ), + alpha = 0.05, + colour = "white", + fill = "olivedrab3" + ) + + geom_line(aes(y = quantile), + colour = "grey", + linetype = "dashed" + ) + + geom_line(aes(y = quantile_coverage)) + + theme_scoringutils() + + theme(legend.position = "bottom") + + xlab("Quantile") + + ylab("% Obs below quantile") + + scale_y_continuous(labels = function(x) {paste(100 * x)}) + + coord_cartesian(expand = FALSE) - if (!is.null(facet_formula)) { - if (facet_wrap_or_grid == "facet_wrap") { - p2 <- p2 + - ggplot2::facet_wrap(facet_formula, scales = scales) + return(p2) +} + + + + + + + +#' @title Plot Heatmap of Pairwise Comparisons +#' +#' @description +#' Creates a heatmap of the ratios or pvalues from a pairwise comparison +#' between models +#' +#' @param comparison_result A data.frame as produced by +#' [pairwise_comparison()] +#' @param type character vector of length one that is either "mean_scores_ratio" or "pval". +#' This denotes whether to visualise the ratio or the p-value of the +#' pairwise comparison. 
Default is "mean_scores_ratio" +#' @param smaller_is_good logical (default is `TRUE`) that indicates whether +#' smaller or larger values are to be interpreted as 'good' (as you could just +#' invert the mean scores ratio) +#' @importFrom ggplot2 ggplot aes geom_tile geom_text labs coord_cartesian +#' scale_fill_gradient2 theme_light element_text +#' @importFrom data.table as.data.table setnames rbindlist +#' @importFrom stats reorder +#' @importFrom ggplot2 labs coord_cartesian facet_wrap facet_grid theme +#' element_text element_blank ggtitle +#' @export +#' +#' @examples +#' library(ggplot2) +#' library(scoringutils) +#' df <- data.frame( +#' model = rep(c("model1", "model2", "model3"), each = 10), +#' id = rep(1:10), +#' interval_score = abs(rnorm(30, mean = rep(c(1, 1.3, 2), each = 10))), +#' ae_median = (abs(rnorm(30))) +#' ) +#' +#' scores <- score(example_quantile) +#' pairwise <- pairwise_comparison(scores, by = "target_type") +#' plot_pairwise_comparison(pairwise) + +#' facet_wrap(~target_type) +plot_pairwise_comparison <- function(comparison_result, + type = c("mean_scores_ratio", "pval", "together"), + smaller_is_good = TRUE) { + comparison_result <- data.table::as.data.table(comparison_result) + + comparison_result[, model := reorder(model, -relative_skill)] + levels <- levels(comparison_result$model) + + + get_fill_scale <- function(values, breaks, plot_scales) { + values[is.na(values)] <- 1 # this would be either ratio = 1 or pval = 1 + scale <- cut(values, + breaks = breaks, + include.lowest = TRUE, + right = FALSE, + labels = plot_scales + ) + # scale[is.na(scale)] <- 0 + return(as.numeric(as.character(scale))) + } + + type <- match.arg(type) + + if (type == "together") { + # obtain only the upper triangle of the comparison + # that is used for showing ratios + # need to change the order if larger is good + if (smaller_is_good) { + unique_comb <- as.data.frame(t(combn(rev(levels), 2))) } else { - p2 <- p2 + - ggplot2::facet_grid(facet_formula, scales = scales) + unique_comb <- as.data.frame(t(combn((levels), 2))) + } + + colnames(unique_comb) <- c("model", "compare_against") + upper_triangle <- merge(comparison_result, unique_comb) + + # change levels for plotting order + upper_triangle[, `:=`( + model = factor(model, levels), + compare_against = factor(compare_against, levels) + )] + + # reverse y and x if larger is better + if (!smaller_is_good) { + data.table::setnames( + upper_triangle, + c("model", "compare_against"), + c("compare_against", "model") + ) } + + # modify upper triangle ------------------------------------------------------ + # add columns where a model is compared with itself. 
make adj_pval NA + # to plot it as grey later on + equal <- data.table::data.table( + model = levels, + compare_against = levels, + mean_scores_ratio = 1, + pval = NA, + adj_pval = NA + ) + upper_triangle_complete <- data.table::rbindlist(list( + upper_triangle, + equal + ), fill = TRUE) + + # define interest variable + upper_triangle_complete[, var_of_interest := round(mean_scores_ratio, 2)] + + # implemnt breaks for colour heatmap + breaks <- c(0, 0.1, 0.5, 0.75, 1, 1.33, 2, 10, Inf) + plot_scales <- c(-1, -0.5, -0.25, 0, 0, 0.25, 0.5, 1) + if (!smaller_is_good) { + plot_scales <- rev(plot_scales) + } + upper_triangle_complete[, fill_col := get_fill_scale( + var_of_interest, + breaks, plot_scales + )] + + # create mean_scores_ratios in plot + plot <- ggplot( + upper_triangle_complete, + aes( + x = compare_against, + y = model, + fill = fill_col + ) + ) + + geom_tile(width = 0.98, height = 0.98) + + geom_text(aes(label = var_of_interest), + na.rm = TRUE + ) + + scale_fill_gradient2( + low = "skyblue", mid = "grey95", + high = "brown1", + na.value = "lightgrey", + midpoint = 0, + limits = c(-1, 1), + name = NULL + ) + + theme_scoringutils() + + theme( + axis.text.x = element_text( + angle = 90, vjust = 1, + hjust = 1, color = "brown4" + ), + axis.text.y = element_text(color = "steelblue4"), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + # panel.background = element_rect(fill = "grey90"), + # axis.line.y = element_line(color = "steelblue4", size = 4), + # axis.line.x = element_line(color = "brown3", size = 4), + legend.position = "none" + ) + + labs( + x = "", y = "", + title = "Pairwise comparisons - mean_scores_ratio (upper) and pval (lower)" + ) + + coord_cartesian(expand = FALSE) + + # add pvalues to plot -------------------------------------------------------- + # obtain lower triangle for the pvalues + lower_triangle <- data.table::copy(upper_triangle) + data.table::setnames( + lower_triangle, + c("model", "compare_against"), + c("compare_against", "model") + ) + + lower_triangle[, var_of_interest := round(adj_pval, 3)] + # implemnt breaks for colour heatmap + breaks <- c(0, 0.01, 0.05, 0.1, 1) + plot_scales <- c(0.8, 0.5, 0.1, 0.000001) + lower_triangle[, fill_col := get_fill_scale( + var_of_interest, + breaks, plot_scales + )] + + fill_rule <- ifelse(lower_triangle$fill_col == 0.000001, "grey95", "palegreen3") + lower_triangle[, var_of_interest := as.character(var_of_interest)] + lower_triangle[, var_of_interest := ifelse(var_of_interest == "0", + "< 0.001", var_of_interest + )] + + plot <- plot + + geom_tile( + data = lower_triangle, + aes(alpha = fill_col), + fill = fill_rule, + color = "white", + width = 0.97, height = 0.97 + ) + + geom_text( + data = lower_triangle, + aes(label = var_of_interest), + na.rm = TRUE + ) + } else if (type == "mean_scores_ratio") { + comparison_result[, var_of_interest := round(mean_scores_ratio, 2)] + + # implemnt breaks for colour heatmap + breaks <- c(0, 0.1, 0.5, 0.75, 1, 1.33, 2, 10, Inf) + plot_scales <- c(-1, -0.5, -0.25, 0, 0, 0.25, 0.5, 1) + comparison_result[, fill_col := get_fill_scale( + var_of_interest, + breaks, plot_scales + )] + + high_col <- "brown1" + } else { + comparison_result[, var_of_interest := round(pval, 3)] + # implemnt breaks for colour heatmap + breaks <- c(0, 0.01, 0.05, 0.1, 1) + plot_scales <- c(1, 0.5, 0.1, 0) + comparison_result[, fill_col := get_fill_scale( + var_of_interest, + breaks, plot_scales + )] + + high_col <- "palegreen3" + comparison_result[, var_of_interest := 
as.character(var_of_interest)] + comparison_result[, var_of_interest := ifelse(var_of_interest == "0", + "< 0.001", var_of_interest + )] } - return(p2) + plot <- ggplot( + comparison_result, + aes( + y = reorder(model, 1 / mean_scores_ratio, FUN = geom_mean_helper), + x = reorder(compare_against, mean_scores_ratio, FUN = geom_mean_helper), + fill = fill_col + ) + ) + + geom_tile( + color = "white", + width = 0.97, height = 0.97 + ) + + geom_text(aes(label = var_of_interest), + na.rm = TRUE + ) + + scale_fill_gradient2( + low = "skyblue", mid = "grey95", + high = high_col, + na.value = "lightgrey", + midpoint = 0, + limits = c(-1, 1), + name = NULL + ) + + theme_scoringutils() + + theme( + axis.text.x = element_text( + angle = 90, vjust = 1, + hjust = 1 + ), + legend.position = "none" + ) + + labs( + x = "", y = "", + title = "Pairwise comparisons - p-value whether mean scores ratio equal to 1" + ) + + coord_cartesian(expand = FALSE) + + if (type == "mean_scores_ratio") { + plot <- plot + + theme( + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.text.x = element_text( + angle = 90, vjust = 1, + hjust = 1, color = "brown4" + ), + axis.text.y = element_text(color = "steelblue4") + ) + + ggtitle("Pairwise comparisons - ratio of mean scores (for overlapping forecast sets)") + } + return(plot) } +#' @title PIT Histogram +#' +#' @description +#' Make a simple histogram of the probability integral transformed values to +#' visually check whether a uniform distribution seems likely. +#' +#' @param pit either a vector with the PIT values of size n, or a data.frame as +#' produced by [pit()] +#' @param num_bins the number of bins in the PIT histogram, default is "auto". +#' When `num_bins == "auto"`, [plot_pit()] will either display 10 bins, or it +#' will display a bin for each available quantile in case you passed in data in +#' a quantile-based format. +#' You can control the number of bins by supplying a number. This is fine for +#' sample-based pit histograms, but may fail for quantile-based formats. In this +#' case it is preferred to supply explicit breaks points using the `breaks` +#' argument. +#' @param breaks numeric vector with the break points for the bins in the +#' PIT histogram. This is preferred when creating a PIT histogram based on +#' quantile-based data. Default is `NULL` and breaks will be determined by +#' `num_bins`. 
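A quick illustrative sketch, not part of the patch: this is how num_bins is turned into break points inside the new plot_pit() (see the implementation further down); an explicit breaks vector simply takes precedence over this computed sequence.

num_bins <- 10                          # the automatic choice for sample-based PIT values
width <- 1 / num_bins
plot_quantiles <- seq(width, 1, width)  # 0.1, 0.2, ..., 1.0, i.e. ten equally sized bins
# for quantile-based PIT values it is safer to pass the available quantiles
# explicitly, e.g. breaks = seq(0.1, 1, 0.1)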
+#' @importFrom stats as.formula +#' @importFrom ggplot2 geom_col +#' @return vector with the scoring values +#' @examples +#' library(scoringutils) +#' +#' # PIT histogram in vector based format +#' true_values <- rnorm(30, mean = 1:30) +#' predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) +#' pit <- pit_sample(true_values, predictions) +#' plot_pit(pit) +#' +#' # quantile-based pit +#' pit <- pit(example_quantile, by = c("model")) +#' plot_pit(pit, breaks = seq(0.1, 1, 0.1)) +#' +#' # sample-based pit +#' pit <- pit(example_integer, by = c("model")) +#' plot_pit(pit) +#' @importFrom ggplot2 ggplot aes xlab ylab geom_histogram stat theme_light +#' @export +plot_pit <- function(pit, + num_bins = "auto", + breaks = NULL) { + if ("quantile" %in% names(pit)) { + type <- "quantile-based" + } else { + type <- "sample-based" + } + + # use breaks if explicitly given, otherwise assign based on number of bins + if (!is.null(breaks)) { + plot_quantiles <- breaks + } else if (is.null(num_bins) | num_bins == "auto") { + # automatically set number of bins + if (type == "sample-based") { + num_bins <- 10 + width <- 1 / num_bins + plot_quantiles <- seq(width, 1, width) + } + if (type == "quantile-based") { + plot_quantiles <- unique(pit$quantile) + } + } else { + # if num_bins is explicitly given + width <- 1 / num_bins + plot_quantiles <- seq(width, 1, width) + } + # function for data.frames + if (is.data.frame(pit)) { + facet_cols <- get_forecast_unit(pit) + formula <- as.formula(paste("~", paste(facet_cols, collapse = "+"))) + + # quantile version + if (type == "quantile-based") { + if (num_bins == "auto") { + } else { + width <- 1 / num_bins + plot_quantiles <- seq(width, 1, width) + } + if (!is.null(breaks)) { + plot_quantiles <- breaks + } + hist <- ggplot( + data = pit[quantile %in% plot_quantiles], + aes(x = quantile, y = pit_value) + ) + + geom_col(position = "dodge") + + facet_wrap(formula) + } + if (type == "sample-based") { + hist <- ggplot( + data = pit, + aes(x = pit_value) + ) + + geom_histogram(aes(y = stat(count) / sum(count)), + breaks = plot_quantiles, + colour = "grey" + ) + + facet_wrap(formula) + } + } else { + # non data.frame version + hist <- ggplot( + data = data.frame(x = pit), + aes(x = x) + ) + + geom_histogram(aes(y = stat(count) / sum(count)), + breaks = plot_quantiles, + colour = "grey" + ) + } + hist <- hist + + xlab("PIT") + + ylab("Frequency") + + theme_scoringutils() + return(hist) +} #' @title Visualise Where Forecasts Are Available @@ -945,123 +1133,95 @@ quantile_coverage <- function(summarised_scores, #' @description #' Visualise Where Forecasts Are Available #' -#' @param data data.frame with predictions in the same format required for -#' [eval_forecasts()] +#' @param avail_forecasts data.frame with a column called `Number forecasts` as +#' produced by [avail_forecasts()] #' @param y character vector of length one that denotes the name of the column -#' to appear on the y-axis of the plot +#' to appear on the y-axis of the plot. Default is "model". #' @param x character vector of length one that denotes the name of the column -#' to appear on the x-axis of the plot +#' to appear on the x-axis of the plot. Default is "forecast_date". #' @param make_x_factor logical (default is TRUE). Whether or not to convert #' the variable on the x-axis to a factor. This has an effect e.g. if dates #' are shown on the x-axis. 
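A quick illustrative sketch, not part of the patch: the expected input for plot_avail_forecasts() is simply a table of forecast counts. Using the example data and a grouping similar to the one in the documentation above (the exact columns in `by` are an arbitrary choice here):

library(scoringutils)
af <- avail_forecasts(example_quantile, by = c("model", "target_type"))
head(af)
# the result contains the grouping columns plus a column called `Number forecasts`,
# which plot_avail_forecasts() maps to the tile fill (and to the text labels
# when show_numbers = TRUE)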
-#' @param summarise_by character vector or `NULL` (the default) that -#' denotes the categories over which the number of forecasts should be summed -#' up. By default (i.e. `summarise_by = NULL`) this will be all the -#' columns that appear in either x, y, or the facetting formula. -#' @param collapse_to_one logical. If `TRUE`) (the default), everything -#' not included in `by` will be counted only once. This is useful, for -#' example, if you don't want to count every single sample or quantile, but -#' instead treat one set of samples or quantiles as one forecast. -#' @param by character vector or `NULL` (the default) that denotes the -#' unit of an individual forecast. This argument behaves similarly to the -#' `by` argument in \code{link{eval_forecasts}}. By default, all columns -#' are used that are not part of any internally protected columns like "sample" -#' or "prediction" or similar. The `by` argument is only necessary if -#' `collapse_to_one = TRUE` to indicate which rows not to collapse to one. #' @param show_numbers logical (default is `TRUE`) that indicates whether #' or not to show the actual count numbers on the plot -#' @param facet_formula formula for facetting in ggplot. If this is `NULL` -#' (the default), no facetting will take place -#' @param facet_wrap_or_grid character. Use ggplot2's `facet_wrap` or -#' `facet_grid`? Anything other than "facet_wrap" will be interpreted as -#' `facet_grid`. This only takes effect if `facet_formula` is not -#' `NULL` -#' @param scales character. The scales argument gets passed down to ggplot. -#' Only necessary -#' if you make use of facetting. Default is "fixed" -#' @param legend_position character that indicates where to put the legend. -#' The argument gets passed to ggplot2. By default ("none"), no legend is shown. #' @return ggplot object with a plot of interval coverage #' @importFrom ggplot2 ggplot scale_colour_manual scale_fill_manual -#' facet_wrap facet_grid +#' geom_tile scale_fill_gradient aes_string #' @importFrom data.table dcast .I .N #' @export #' #' @examples -#' example1 <- scoringutils::range_example_data_long -#' show_avail_forecasts(example1, x = "value_date", facet_formula = ~ value_desc) - -show_avail_forecasts <- function(data, +#' library(scoringutils) +#' library(ggplot2) +#' avail_forecasts <- avail_forecasts(example_quantile, +#' by = c( +#' "model", "target_type", +#' "target_end_date" +#' ) +#' ) +#' plot_avail_forecasts(avail_forecasts, +#' x = "target_end_date", +#' show_numbers = FALSE +#' ) + +#' facet_wrap("target_type") +plot_avail_forecasts <- function(avail_forecasts, y = "model", x = "forecast_date", make_x_factor = TRUE, - summarise_by = NULL, - collapse_to_one = TRUE, - by = NULL, - show_numbers = TRUE, - facet_formula = NULL, - facet_wrap_or_grid = "facet_wrap", - scales = "fixed", - legend_position = "none") { - - data <- data.table::as.data.table(data) - - if (is.null(summarise_by)) { - facet_vars <- all.vars(facet_formula) - summarise_by <- unique(c(x, y, facet_vars)) - } - - data <- data[!is.na(prediction),] - - if (collapse_to_one) { - # only count one forecast per group in by - # this e.g. 
makes sure that quantiles and samples are not counted - # multiple times - if (is.null(by)) { - protected_columns <- c("prediction", "true_value", "sample", "quantile", - "range", "boundary") - by <- setdiff(colnames(data), protected_columns) - } - data <- data[data[, .I[1], by = by]$V1] - } - - # count items per group in summarise_by - df <- data[, .(n_obs = .N), by = summarise_by] + show_numbers = TRUE) { + avail_forecasts <- as.data.table(avail_forecasts) if (make_x_factor) { - df[, eval(x) := as.factor(get(x))] + avail_forecasts[, eval(x) := as.factor(get(x))] } - plot <- ggplot2::ggplot(df, - ggplot2::aes_string(y = y, x = x)) + - ggplot2::geom_tile(ggplot2::aes(fill = n_obs), - width = 0.97, height = 0.97) + - ggplot2::scale_fill_gradient(low = "grey95", high = "steelblue", - na.value = "lightgrey") + - ggplot2::theme_light() + - ggplot2::theme(panel.grid.major.x = ggplot2::element_blank(), - panel.grid.minor.x = ggplot2::element_blank(), - legend.position = legend_position, - axis.text.x = ggplot2::element_text(angle = 90, vjust = 1, - hjust=1)) + - ggplot2::theme(panel.spacing = ggplot2::unit(2, "lines")) + plot <- ggplot( + avail_forecasts, + aes_string(y = y, x = x) + ) + + geom_tile(aes(fill = `Number forecasts`), + width = 0.97, height = 0.97 + ) + + scale_fill_gradient( + low = "grey95", high = "steelblue", + na.value = "lightgrey" + ) + + theme_scoringutils() + + theme( + panel.grid.major.x = element_blank(), + panel.grid.minor.x = element_blank(), + axis.text.x = element_text( + angle = 90, vjust = 1, + hjust = 1 + ) + ) + + theme(panel.spacing = unit(2, "lines")) if (show_numbers) { plot <- plot + - ggplot2::geom_text(ggplot2::aes(label = n_obs)) - } - - if (!is.null(facet_formula)) { - if (facet_wrap_or_grid == "facet_wrap") { - plot <- plot + - ggplot2::facet_wrap(facet_formula, scales = scales) - } else { - plot <- plot + - ggplot2::facet_grid(facet_formula, scales = scales) - } + geom_text(aes(label = `Number forecasts`)) } return(plot) } + + +#' @title Scoringutils ggplot2 theme +#' +#' @description +#' A theme for ggplot2 plots used in scoringutils +#' @return A ggplot2 theme +#' @importFrom ggplot2 theme theme_minimal element_line +#' @export +theme_scoringutils <- function() { + theme_minimal() + + theme(axis.line = element_line(colour = "grey80"), + axis.ticks = element_line(colour = "grey80"), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.border = element_blank(), + panel.background = element_blank(), + legend.position = "bottom") +} diff --git a/R/score.R b/R/score.R new file mode 100644 index 000000000..d1a8b7865 --- /dev/null +++ b/R/score.R @@ -0,0 +1,118 @@ +#' @title Evaluate forecasts +#' +#' @description The function `score` allows automatic scoring of forecasts and +#' wraps the lower level functions in the \pkg{scoringutils} package. +#' +#' It can be used to score forecasts in a quantile-based, sample-based, or +#' binary format. To obtain an overview of what input is expected, have a look +#' at the [example_quantile], [example_continuous], [example_integer], and +#' [example_binary] data sets. +#' +#' You can (and should) check your input using the function [check_forecasts()] +#' before scoring. +#' +#' To obtain a quick overview of the evaluation metrics used, have a look at the +#' [metrics_summary] data included in the package. +#' +#' @param data A data.frame or data.table with the predictions and observations. 
+#' The following columns need to be present: +#' \itemize{ +#' \item `true_value` - the true observed values +#' \item `prediction` - predictions or predictive samples for one +#' true value. (You only don't need to provide a prediction column if +#' you want to score quantile forecasts in a wide range format.)} +#' For integer and continuous forecasts a `sample` column is needed: +#' \itemize{ +#' \item `sample` - an index to identify the predictive samples in the +#' prediction column generated by one model for one true value. Only +#' necessary for continuous and integer forecasts, not for +#' binary predictions.} +#' For a quantile-format forecast you should provide a column called `quantile`: +#' - `quantile`: quantile to which the prediction corresponds +#' @param metrics the metrics you want to have in the output. If `NULL` (the +#' default), all available metrics will be computed. For a list of available +#' metrics see [available_metrics()] +#' @param ... additional parameters passed down to lower-level functions. +#' For example, the following arguments can change how weighted interval +#' scores are computed: +#' - `count_median_twice` that controls how the interval scores for different +#' intervals are summed up. This should be a logical (default is `FALSE`) that +#' indicates whether or not to count the median twice when summarising. +#' This would conceptually treat the +#' median as a 0% prediction interval, where the median is the lower as well as +#' the upper bound. The alternative is to treat the median as a single quantile +#' forecast instead of an interval. The interval score would then +#' be better understood as an average of quantile scores.) +#' +#' @return A data.table with unsummarised scores. There will be one score per +#' quantile or sample, which is usually not desired, so you should always run +#' [summarise_scores()] on the unsummarised scores. +#' +#' @importFrom data.table ':=' as.data.table +#' +#' @examples +#' library(magrittr) # pipe operator +#' +#' check_forecasts(example_quantile) +#' score(example_quantile) %>% +#' add_coverage(by = c("model", "target_type")) %>% +#' summarise_scores(by = c("model", "target_type")) +#' +#' # forecast formats with different metrics +#' score(example_binary) +#' score(example_quantile) +#' score(example_integer) +#' score(example_continuous) +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @references Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ +#' (2019) Assessing the performance of real-time epidemic forecasts: A +#' case study of Ebola in the Western Area region of Sierra Leone, 2014-15. +#' PLoS Comput Biol 15(2): e1006785. +#' @export + +score <- function(data, + metrics = NULL, + ...) 
{ + + # preparations --------------------------------------------------------------- + check_data <- check_forecasts(data) + + data <- check_data$cleaned_data + prediction_type <- check_data$prediction_type + forecast_unit <- check_data$forecast_unit + target_type <- check_data$target_type + + # check metrics are available or set to all metrics -------------------------- + metrics <- check_metrics(metrics) + + # Score binary predictions --------------------------------------------------- + if (target_type == "binary") { + scores <- score_binary( + data = data, + forecast_unit = forecast_unit, + metrics = metrics + ) + } + + # Score quantile predictions ------------------------------------------------- + if (prediction_type == "quantile") { + scores <- score_quantile( + data = data, + forecast_unit = forecast_unit, + metrics = metrics, + ... + ) + } + + # Score integer or continuous predictions ------------------------------------ + if (prediction_type %in% c("integer", "continuous") && (target_type != "binary")) { + scores <- score_sample( + data = data, + forecast_unit = forecast_unit, + metrics = metrics, + prediction_type = prediction_type + ) + } + + return(scores[]) +} diff --git a/R/score_binary.R b/R/score_binary.R new file mode 100644 index 000000000..fee34f308 --- /dev/null +++ b/R/score_binary.R @@ -0,0 +1,32 @@ +#' @title Evaluate forecasts in a Binary Format +#' +#' @inheritParams score +#' @param forecast_unit A character vector with the column names that define +#' the unit of a single forecast, i.e. a forecast was made for a combination +#' of the values in `forecast_unit`. +#' +#' @return A data.table with appropriate scores. For more information see +#' [score()]. +#' +#' @importFrom data.table ':=' +#' +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @keywords internal + +score_binary <- function(data, + forecast_unit, + metrics) { + if ("brier_score" %in% metrics) { + data[, "brier_score" := brier_score(true_value, prediction), + by = forecast_unit + ] + } + + if ("log_score" %in% metrics) { + data[, "log_score" := logs_binary(true_value, prediction), + by = forecast_unit + ] + } + + return(data[]) +} diff --git a/R/score_continuous_integer.R b/R/score_continuous_integer.R new file mode 100644 index 000000000..6aeb35b76 --- /dev/null +++ b/R/score_continuous_integer.R @@ -0,0 +1,86 @@ +#' @title Evaluate forecasts in a Sample-Based Format (Integer or Continuous) +#' +#' @inheritParams score +#' @param prediction_type character, should be either "continuous" or "integer" +#' @param forecast_unit A character vector with the column names that define +#' the unit of a single forecast, i.e. a forecast was made for a combination +#' of the values in `forecast_unit` +#' +#' @return A data.table with appropriate scores. 
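# Editor's sketch of the dispatch logic shown in score() above: the data are
# first passed through check_forecasts(), and scoring then branches on the
# detected target / prediction type. Assumes the list elements used above
# (cleaned_data, prediction_type, target_type, forecast_unit) are part of the
# object returned by check_forecasts().
library(scoringutils)

checked <- check_forecasts(example_binary)
checked$target_type    # "binary" -> handled by the score_binary() branch
checked$forecast_unit  # columns that identify one individual forecast

score(example_binary)  # unsummarised Brier score (and log score) per forecast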
For more information see +#' [score()] +#' +#' @importFrom data.table ':=' as.data.table rbindlist %like% +#' +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @inherit score references +#' @keywords internal + +score_sample <- function(data, + forecast_unit, + metrics, + prediction_type) { + if (missing(prediction_type)) { + if (isTRUE(all.equal(data$prediction, as.integer(data$prediction)))) { + prediction_type <- "integer" + } else { + prediction_type <- "continuous" + } + } + + # calculate scores ----------------------------------------------------------- + # sharpness + if (any(c("sharpness", "mad") %in% metrics)) { + data[, mad := mad_sample(t(prediction)), by = forecast_unit] + } + # bias + if ("bias" %in% metrics) { + data[, bias := bias_sample( + unique(true_value), + t(prediction) + ), by = forecast_unit] + } + # DSS + if ("dss" %in% metrics) { + data[, dss := scoringutils::dss_sample( + unique(true_value), + t(prediction) + ), by = forecast_unit] + } + # CRPS + if ("crps" %in% metrics) { + data[, crps := scoringutils::crps_sample( + unique(true_value), + t(prediction) + ), by = forecast_unit] + } + # Log Score + if ("log_score" %in% metrics) { + # only compute if prediction type is continuous + if (prediction_type == "continuous") { + data[, log_score := scoringutils::logs_sample( + unique(true_value), + t(prediction) + ), by = forecast_unit] + } + } + # absolute error + if (any(c("ae_median", "abs_error", "ae_point") %in% metrics)) { + data[, ae_median := abs(unique(true_value) - median(prediction)), + by = forecast_unit] + } + # squared error + if (any(c("se_mean", "squared_error", "se_point") %in% metrics)) { + data[, se_mean := (unique(true_value) - mean(prediction))^2, + by = forecast_unit] + } + + res <- data.table::copy(data) + + # make scores unique to avoid redundancy. + res <- res[, lapply(.SD, unique), + .SDcols = colnames(res) %like% paste(metrics, collapse = "|"), + by = forecast_unit + ] + + return(res[]) +} diff --git a/R/score_quantile.R b/R/score_quantile.R new file mode 100644 index 000000000..5616eb095 --- /dev/null +++ b/R/score_quantile.R @@ -0,0 +1,160 @@ +#' @title Evaluate forecasts in a Quantile-Based Format +#' +#' @inheritParams score +#' @inheritParams interval_score +#' @param count_median_twice logical that controls whether or not to count the +#' median twice when summarising (default is \code{FALSE}). Counting the +#' median twice would conceptually treat it as a 0\% prediction interval, where +#' the median is the lower as well as the upper bound. The alternative is to +#' treat the median as a single quantile forecast instead of an interval. The +#' interval score would then be better understood as an average of quantile +#' scores. +#' @param forecast_unit A character vector with the column names that define +#' the unit of a single forecast, i.e. a forecast was made for a combination +#' of the values in `forecast_unit` +#' +#' @return A data.table with appropriate scores. 
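# Editor's sketch of the lower-level sample-based metrics that score_sample()
# computes per forecast unit, mirroring the examples of the renamed wrappers
# elsewhere in this diff: one element of true_values per row of the prediction
# matrix, one column per Monte Carlo sample.
library(scoringutils)

true_values <- rpois(30, lambda = 1:30)
predictions <- replicate(200, rpois(n = 30, lambda = 1:30))

crps_sample(true_values, predictions)
dss_sample(true_values, predictions)
logs_sample(true_values, predictions)
mad_sample(predictions)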
For more information see +#' [score()] +#' +#' @importFrom data.table ':=' as.data.table rbindlist %like% +#' +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @inherit score references +#' @keywords internal + +score_quantile <- function(data, + forecast_unit, + metrics, + weigh = TRUE, + count_median_twice = FALSE, + separate_results = TRUE) { + + # make sure to have both quantile as well as range format -------------------- + range_data <- quantile_to_range_long(data, + keep_quantile_col = FALSE + ) + # adds the range column to the quantile data set + quantile_data <- range_long_to_quantile(range_data, + keep_range_col = TRUE + ) + + # to deal with point forecasts in a quantile format. This in effect adds + # a third column next to lower and upper after pivoting + range_data[is.na(range), boundary := "point"] + + range_data <- data.table::dcast(range_data, ... ~ boundary, + value.var = "prediction" + ) + + # if we only score point forecasts, it may be true that there are no columns + # upper and lower in the data.frame. If so, these need to be added + if (!all(c("upper", "lower") %in% colnames(range_data))) { + range_data[, c("upper", "lower") := NA] + } + + # set up results data.table that will then be modified throughout ------------ + res <- data.table::copy(range_data) + + # calculate scores on range format ------------------------------------------- + if ("interval_score" %in% metrics) { + # compute separate results if desired + if (separate_results) { + outcols <- c( + "interval_score", "dispersion", + "underprediction", "overprediction" + ) + } else { + outcols <- "interval_score" + } + res <- res[, eval(outcols) := do.call( + scoringutils::interval_score, + list(true_value, lower, + upper, range, + weigh, + separate_results = TRUE + ) + )] + } + + # compute coverage for every single observation + if ("coverage" %in% metrics) { + res[, coverage := ifelse(true_value <= upper & true_value >= lower, 1, 0)] + res[, coverage_deviation := coverage - range / 100] + } + + # compute bias + if ("bias" %in% metrics) { + res[, bias := bias_range( + range = range, lower = lower, upper = upper, + true_value = unique(true_value) + ), + by = forecast_unit + ] + } + + # compute absolute and squared error for point forecasts + # these are marked by an NA in range, and a numeric value for point + if (any(c("se_point, se_mean, ae_point", "ae_median", "absolute_error") %in% metrics)) { + if ("point" %in% colnames(res)) { + res[ + is.na(range) & is.numeric(point), + `:=` (ae_point = abs_error(predictions = point, true_value), + se_point = squared_error(predictions = point, true_value)) + ] + } + } + + # calculate scores on quantile format ---------------------------------------- + # compute absolute error of the median + if ("ae_median" %in% metrics) { + quantile_data[, ae_median := ae_median_quantile( + true_value, + prediction, + quantile + ), + by = forecast_unit + ] + } + + # compute quantile coverage based on quantile version + if ("quantile_coverage" %in% metrics) { + quantile_data[, quantile_coverage := (true_value <= prediction)] + } + + # merge metrics computed on quantile data (i.e. ae_median, quantile_coverage) back + # into metrics computed on range data. One important side effect of this is + # that it controls whether we count the median twice for the interval score + # (row is then duplicated) or only once. 
However, merge only needs to happen + # if we computed either the interval score or the ae_median or quantile coverage + if (any(c("ae_median", "interval_score", "quantile_coverage") %in% metrics)) { + # delete unnecessary columns before merging back + keep_cols <- unique(c( + forecast_unit, "quantile", "ae_median", "quantile_coverage", + "boundary", "range" + )) + delete_cols <- names(quantile_data)[!(names(quantile_data) %in% keep_cols)] + quantile_data[, eval(delete_cols) := NULL] + + # duplicate median column before merging if median is to be counted twice + # if this is false, then the res will have one entry for every quantile, + # which translates to two rows for every interval, but only one for the median + if (count_median_twice) { + median <- quantile_data[quantile == 0.5, ][, boundary := "upper"] + quantile_data <- data.table::rbindlist(list(quantile_data, median)) + } + + # merge back with other metrics + merge_cols <- setdiff(keep_cols, c( + "ae_median", "quantile_coverage", "quantile", + "boundary" + )) + # specify all.x = TRUE as the point forecasts got deleted when + # going from range to quantile above + res <- merge(res, quantile_data, by = merge_cols, all.x = TRUE) + } + + # delete internal columns before returning result + res <- delete_columns(res, c("upper", "lower", "boundary", "point", "true_value")) + + return(res[]) +} diff --git a/R/scoringRules_wrappers.R b/R/scoringRules_wrappers.R index 06d77e257..8d6800b83 100644 --- a/R/scoringRules_wrappers.R +++ b/R/scoringRules_wrappers.R @@ -1,4 +1,4 @@ -#' @title LogS +#' @title Logarithmic score #' #' @description #' Wrapper around the [`logs_sample()`][scoringRules::scores_sample_univ] @@ -10,108 +10,62 @@ #' integer-valued Monte Carlo Samples. The Log Score can be used for specific #' integer valued probability distributions. See the scoringRules package for #' more details. -#' @param true_values A vector with the true observed values of size n -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the -#' number of Monte Carlo samples +#' @inheritParams ae_median_sample #' @return vector with the scoring values #' @importFrom scoringRules logs_sample #' @examples #' true_values <- rpois(30, lambda = 1:30) #' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' logs(true_values, predictions) +#' logs_sample(true_values, predictions) #' @export #' @references #' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic #' Forecasts with scoringRules, - - -logs <- function(true_values, predictions) { - - # ============== Error handling ============== - - if (missing(true_values) | missing(predictions)) { - stop("true_values or predictions argument missing") - } - - n <- length(true_values) - - if (is.data.frame(predictions)) { - predictions <- as.matrix(predictions) - } - if (!is.matrix(predictions)) { - msg <- sprintf("'predictions' should be a matrix. 
Instead `%s` was found", - class(predictions[1])) - stop(msg) - } - if (nrow(predictions) != n) { - msg <- sprintf("Mismatch: 'true_values' has length `%s`, but 'predictions' has `%s` rows.", - n, nrow(predictions)) - stop(msg) - } - - # ============================================ - - scoringRules::logs_sample(y = true_values, - dat = predictions) +#' @keywords metric + +logs_sample <- function(true_values, predictions) { + check_true_values(true_values) + check_predictions(predictions, true_values, + class = "matrix" + ) + + scoringRules::logs_sample( + y = true_values, + dat = predictions + ) } - - #' @title Dawid-Sebastiani Score #' #' @description #' Wrapper around the [`dss_sample()`][scoringRules::scores_sample_univ] #' function from the #' \pkg{scoringRules} package. -#' @param true_values A vector with the true observed values of size n -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the -#' number of Monte Carlo samples +#' @inheritParams logs_sample #' @return vector with scoring values #' @importFrom scoringRules dss_sample #' @examples #' true_values <- rpois(30, lambda = 1:30) #' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' dss(true_values, predictions) +#' dss_sample(true_values, predictions) #' @export #' @references #' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic #' Forecasts with scoringRules, - -dss <- function(true_values, predictions) { - - # ============== Error handling ============== - if (missing(true_values) | missing(predictions)) { - stop("true_values or predictions argument missing") - } - - n <- length(true_values) - - if (is.data.frame(predictions)) { - predictions <- as.matrix(predictions) - } - if (!is.matrix(predictions)) { - msg <- sprintf("'predictions' should be a matrix. Instead `%s` was found", - class(predictions[1])) - stop(msg) - } - if (nrow(predictions) != n) { - msg <- sprintf("Mismatch: 'true_values' has length `%s`, but 'predictions' has `%s` rows.", - n, nrow(predictions)) - stop(msg) - } - # ============================================ - - scoringRules::dss_sample(y = true_values, - dat = predictions) +#' @keywords metric + +dss_sample <- function(true_values, predictions) { + check_true_values(true_values) + check_predictions(predictions, true_values, + class = "matrix" + ) + + scoringRules::dss_sample( + y = true_values, + dat = predictions + ) } - - - - - #' @title Ranked Probability Score #' #' @description @@ -119,45 +73,29 @@ dss <- function(true_values, predictions) { #' function from the #' \pkg{scoringRules} package. 
Can be used for continuous as well as integer #' valued forecasts -#' @param true_values A vector with the true observed values of size n -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the -#' number of Monte Carlo samples +#' @inheritParams logs_sample #' @return vector with the scoring values #' @importFrom scoringRules crps_sample #' @examples #' true_values <- rpois(30, lambda = 1:30) #' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' crps(true_values, predictions) +#' crps_sample(true_values, predictions) #' @export #' @references #' Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic #' Forecasts with scoringRules, +#' @keywords metric -crps <- function(true_values, predictions) { - - # ============== Error handling ============== - if (missing(true_values) | missing(predictions)) { - stop("true_values or predictions argument missing") - } - - n <- length(true_values) +crps_sample <- function(true_values, predictions) { - if (is.data.frame(predictions)) { - predictions <- as.matrix(predictions) - } - if (!is.matrix(predictions)) { - msg <- sprintf("'predictions' should be a matrix. Instead `%s` was found", - class(predictions[1])) - stop(msg) - } - if (nrow(predictions) != n) { - msg <- sprintf("Mismatch: 'true_values' has length `%s`, but 'predictions' has `%s` rows.", - n, nrow(predictions)) - stop(msg) - } - # ============================================ + # check inputs + check_true_values(true_values) + check_predictions(predictions, true_values, + class = "matrix" + ) - scoringRules::crps_sample(y = true_values, - dat = predictions) + scoringRules::crps_sample( + y = true_values, + dat = predictions + ) } diff --git a/R/scoringutils.R b/R/scoringutils.R index 69c5903df..99e338ead 100644 --- a/R/scoringutils.R +++ b/R/scoringutils.R @@ -12,42 +12,40 @@ #' integer, or binary. #' #' A collection of different metrics and scoring rules can be accessed through -#' the function [eval_forecasts()]. Given a data.frame of the +#' the function [score()]. Given a data.frame of the #' correct form the function will automatically figure out the type of #' prediction and true values and return appropriate scoring metrics. #' #' The package also has a lot of default visualisation based on the output -#' created by [eval_forecasts()]. +#' created by [score()]. #' -#' - [score_table()] -#' - [correlation_plot()] -#' - [wis_components()] -#' - [range_plot()] -#' - [score_heatmap()] +#' - [plot_score_table()] +#' - [plot_correlation()] +#' - [plot_wis()] +#' - [plot_ranges()] +#' - [plot_heatmap()] #' - [plot_predictions()] -#' - [interval_coverage()] -#' - [quantile_coverage()] +#' - [plot_interval_coverage()] +#' - [plot_quantile_coverage()] #' #' Alternatively, the following functions can be accessed directly: #' #' - [brier_score()] #' - [pit()] -#' - [bias()] -#' - [quantile_bias()] -#' - [sharpness()] -#' - [crps()] -#' - [logs()] -#' - [dss()] +#' - [bias_sample()] +#' - [bias_quantile()] +#' - [bias_range()] +#' - [mad_sample()] +#' - [crps_sample()] +#' - [logs_sample()] +#' - [dss_sample()] #' - [ae_median_sample()] #' #' Predictions can be evaluated in a lot of different formats. 
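# Editor's sketch (not part of the diff): converting a sample-based forecast to
# the quantile format with sample_to_quantile(), the format-conversion helper
# referenced in the listing that follows. The quantiles shown are the function's
# defaults; the converted data should then score like any quantile forecast.
library(scoringutils)

quantile_data <- sample_to_quantile(
  example_integer,
  quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95)
)
score(quantile_data)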
If you want to #' convert from one format to the other, the following helper functions can #' do that for you: #' -#' - [sample_to_range_long()] #' - [sample_to_quantile()] -#' - [quantile_to_range_long()] -#' - [range_long_to_quantile()] #' #' @docType package #' @name scoringutils diff --git a/R/sharpness.R b/R/sharpness.R index 79e044de6..bd52845a2 100644 --- a/R/sharpness.R +++ b/R/sharpness.R @@ -1,18 +1,18 @@ -#' @title Determines sharpness of a probabilistic forecast +#' @title Determine dispersion of a probabilistic forecast #' @details #' Sharpness is the ability of the model to generate predictions within a -#' narrow range. It is a data-independent measure, and is purely a feature +#' narrow range and dispersion is the lack thereof. +#' It is a data-independent measure, and is purely a feature #' of the forecasts themselves. #' -#' Sharpness of predictive samples corresponding to one single true value is +#' Dispersion of predictive samples corresponding to one single true value is #' measured as the normalised median of the absolute deviation from #' the median of the predictive samples. For details, see [mad()][stats::mad()] +#' and the explanations given in Funk et al. (2019) #' -#' @param predictions nxN matrix of predictive samples, n (number of rows) being -#' the number of data points and N (number of columns) the -#' number of Monte Carlo samples +#' @inheritParams ae_median_sample #' @importFrom stats mad -#' @return vector with sharpness values +#' @return vector with dispersion values #' #' @references #' Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ (2019) @@ -23,25 +23,11 @@ #' @export #' @examples #' predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -#' sharpness(predictions) +#' mad_sample(predictions) +#' @keywords metric -sharpness <- function (predictions) { - - # ============== Error handling ============== - - if (missing(predictions)) { - stop("predictions argument missing") - } - - if (is.data.frame(predictions)) { - predictions <- as.matrix(predictions) - } - - if (!is.matrix(predictions)) { - stop("'predictions' should be a matrix") - } - - # ============================================ +mad_sample <- function(predictions) { + check_predictions(predictions, class = "matrix") sharpness <- apply(predictions, MARGIN = 1, mad) return(sharpness) diff --git a/R/summarise_scores.R b/R/summarise_scores.R new file mode 100644 index 000000000..facc0cbfe --- /dev/null +++ b/R/summarise_scores.R @@ -0,0 +1,244 @@ +#' @title Summarise scores as produced by [score()] +#' +#' @description Summarise scores as produced by [score()]- +#' +#' @inheritParams pairwise_comparison +#' @inheritParams score +#' @param by character vector with column names to summarise scores by. Default +#' is `NULL`, meaning that the only summary that takes is place is summarising +#' over quantiles (in case of quantile-based forecasts), such that there is one +#' score per forecast as defined by the unit of a single forecast (rather than +#' one score for every quantile). +#' @param fun a function used for summarising scores. Default is `mean`. +#' @param relative_skill logical, whether or not to compute relative +#' performance between models based on pairwise comparisons. +#' If `TRUE` (default is `FALSE`), then a column called +#' 'model' must be present in the input data. For more information on +#' the computation of relative skill, see [pairwise_comparison()]. +#' Relative skill will be calculated for the aggregation level specified in +#' `by`. 
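# Editor's sketch of the relative_skill option documented above: pairwise
# comparisons are computed at the aggregation level given in `by` and merged
# back into the summarised scores; a 'model' column must be present.
library(scoringutils)

scores <- score(example_quantile)
summarise_scores(scores, by = "model", relative_skill = TRUE)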
+#' @param metric character with the name of the metric for which +#' a relative skill shall be computed. If equal to 'auto' (the default), then +#' this will be either interval score, crps or brier score (depending on which +#' of these is available in the input data) +#' @param baseline character string with the name of a model. If a baseline is +#' given, then a scaled relative skill with respect to the baseline will be +#' returned. By default (`NULL`), relative skill will not be scaled with +#' respect to a baseline model. +#' @examples +#' library(magrittr) # pipe operator +#' +#' # summarise over samples or quantiles to get one score per forecast +#' scores <- score(example_quantile) +#' summarise_scores(scores) +#' +#' # get scores by model +#' summarise_scores(scores, by = c("model")) +#' +#' # get scores by model and target type +#' summarise_scores(scores, by = c("model", "target_type")) +#' +#' # get standard deviation +#' summarise_scores(scores, by = "model", fun = sd) +#' +#' # round digits +#' summarise_scores(scores, by = c("model")) %>% +#' summarise_scores(fun = signif, digits = 2) +#' +#' # get quantiles of scores +#' # make sure to aggregate over ranges first +#' summarise_scores(scores, +#' by = "model", fun = quantile, +#' probs = c(0.25, 0.5, 0.75) +#' ) +#' +#' # get ranges +#' # summarise_scores(scores, by = "range") +#' @export +#' @keywords scoring + +summarise_scores <- function(scores, + by = NULL, + fun = mean, + relative_skill = FALSE, + metric = "auto", + baseline = NULL, + ...) { + + # preparations --------------------------------------------------------------- + # get unit of a single forecast + forecast_unit <- get_forecast_unit(scores) + + # if by is not provided, set to the unit of a single forecast + if (is.null(by)) { + by <- forecast_unit + } + + # check input arguments and check whether relative skill can be computed + relative_skill <- check_summary_params( + scores = scores, + by = by, + relative_skill = relative_skill, + baseline = baseline, + metric = metric + ) + + # get all available metrics to determine names of columns to summarise over + cols_to_summarise <- paste0(available_metrics(), collapse = "|") + + # takes the mean over ranges and quantiles first, if neither range nor + # quantile are in `by`. 
Reason to do this is that summaries may be + # inaccurate if we treat individual quantiles as independent forecasts + scores <- scores[, lapply(.SD, mean, ...), + by = c(unique(c(forecast_unit, by))), + .SDcols = colnames(scores) %like% cols_to_summarise + ] + + # do pairwise comparisons ---------------------------------------------------- + if (relative_skill) { + pairwise <- pairwise_comparison( + scores = scores, + metric = metric, + baseline = baseline, + by = by + ) + + # delete unnecessary columns + pairwise[, c( + "compare_against", "mean_scores_ratio", + "pval", "adj_pval" + ) := NULL] + pairwise <- unique(pairwise) + + # merge back + scores <- merge(scores, pairwise, + all.x = TRUE, + by = get_forecast_unit(pairwise) + ) + } + + # summarise scores ----------------------------------------------------------- + scores <- scores[, lapply(.SD, fun, ...), + by = c(by), + .SDcols = colnames(scores) %like% cols_to_summarise + ] + + # remove unnecessary columns ------------------------------------------------- + # if neither quantile nor range are in by, remove coverage and + # quantile_coverage because averaging does not make sense + if (!("range" %in% by) & ("coverage" %in% colnames(scores))) { + scores[, c("coverage") := NULL] + } + if (!("quantile" %in% by) & "quantile_coverage" %in% names(scores)) { + scores[, c("quantile_coverage") := NULL] + } + + return(scores[]) +} + +#' @title Check input parameters for [summarise_scores()] +#' +#' @description A helper function to check the input parameters for +#' [score()]. +#' +#' @inheritParams summarise_scores +#' +#' @keywords internal +check_summary_params <- function(scores, + by, + relative_skill, + baseline, + metric) { + + # check that columns in 'by' are actually present ---------------------------- + if (!all(by %in% c(colnames(scores), "range", "quantile"))) { + not_present <- setdiff(by, c(colnames(scores), "range", "quantile")) + msg <- paste0( + "The following items in `by` are not", + "valid column names of the data: '", + paste(not_present, collapse = ", "), + "'. Check and run `summarise_scores()` again" + ) + stop(msg) + } + + # error handling for relative skill computation ------------------------------ + if (relative_skill) { + if (!("model" %in% colnames(scores))) { + warning("to compute relative skills, there must column present called 'model'. Relative skill will not be computed") + relative_skill <- FALSE + } + models <- unique(scores$model) + if (length(models) < 2 + (!is.null(baseline))) { + warning("you need more than one model non-baseline model to make model comparisons. Relative skill will not be computed") + relative_skill <- FALSE + } + if (!is.null(baseline) && !(baseline %in% models)) { + warning("The baseline you provided for the relative skill is not one of the models in the data. Relative skill will not be computed") + relative_skill <- FALSE + } + if (metric != "auto" && !(metric %in% available_metrics())) { + warning("argument 'metric' must either be 'auto' or one of the metrics that can be computed. Relative skill will not be computed") + relative_skill <- FALSE + } + } + return(relative_skill) +} + + + +#' @title Add coverage of central prediction intervals +#' +#' @description Adds a column with the coverage of central prediction intervals +#' to unsummarised scores as produced by [score()] +#' +#' @details +#' The coverage values that are added are computed according to the values +#' specified in `by`. 
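# Editor's sketch of the two-step summary implemented in summarise_scores()
# above: scores are first collapsed to one value per forecast unit (averaging
# over quantiles or samples), and only then aggregated according to `by`.
library(scoringutils)

scores <- score(example_quantile)
summarise_scores(scores)                # one row per individual forecast
summarise_scores(scores, by = "model")  # aggregated further, one row per model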
If, for example, `by = "model"`, then there will be one +#' coverage value for every model and [add_coverage()] will compute the coverage +#' for every model across the values present in all other columns which define +#' the unit of a single forecast. +#' +#' @inheritParams summarise_scores +#' @param by character vector with column names to add the coverage for. +#' @param ranges numeric vector of the ranges of the central prediction intervals +#' for which coverage values shall be added. +#' @return a data.table with unsummarised scores with columns added for the +#' coverage of the central prediction intervals. While the overall data.table +#' is still unsummarised, note that for the coverage columns some level of +#' summary is present according to the value specified in `by`. +#' @examples +#' library(magrittr) # pipe operator +#' score(example_quantile) %>% +#' add_coverage(by = c("model", "target_type")) %>% +#' summarise_scores(by = c("model", "target_type")) %>% +#' summarise_scores(fun = signif, digits = 2) +#' @export +#' @keywords scoring + +add_coverage <- function(scores, + by, + ranges = c(50, 90)) { + summarised_scores <- summarise_scores( + scores, + by = c(by, "range") + )[range %in% ranges] + + + # create cast formula + cast_formula <- + paste( + paste(by, collapse = "+"), + "~", + "paste0('coverage_', range)" + ) + + coverages <- dcast( + summarised_scores, + value.var = "coverage", + formula = cast_formula + ) + + scores_with_coverage <- merge(scores, coverages, by = by) + return(scores_with_coverage[]) +} diff --git a/R/utils.R b/R/utils.R index 1db72c84e..125a81493 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,69 +1,3 @@ -#' @title Check Variable is not NULL -#' -#' @description -#' Check whether a certain variable is not `NULL` and return the name of that -#' variable and the function call where the variable is missing. This function -#' is a helper function that should only be called within other functions -#' @param ... The variables to check -#' @return The function returns `NULL`, but throws an error if the variable is -#' missing. -#' -#' @keywords internal -check_not_null <- function(...) { - vars <- list(...) - varnames <- names(vars) - - for (i in 1:length(vars)) { - varname = varnames[i] - if (is.null(vars[[i]])) { - calling_function <- deparse1(sys.calls()[[sys.nframe()-1]]) - stop(paste0("variable '", varname, - "' is `NULL` in the following function call: '", - calling_function, "'")) - } - } - return(invisible(NULL)) -} - - - - - -#' @title Check Length -#' -#' @description -#' Check whether variables all have the same length -#' @param ... The variables to check -#' @param one_allowed logical, allow arguments of length one that can be recycled -#' -#' @return The function returns `NULL`, but throws an error if variable lengths -#' differ -#' -#' @keywords internal -check_equal_length <- function(..., - one_allowed = TRUE) { - vars <- list(...) - lengths <- sapply(vars, - FUN = function(x) { - length(x) - }) - - lengths <- unique(lengths) - - if (one_allowed) { - lengths <- lengths[lengths != 1] - } - - if (length(unique(lengths)) != 1) { - calling_function <- deparse1(sys.calls()[[sys.nframe()-1]]) - stop(paste0("Arguments passed to the following function call: '", - calling_function, - "' should have the same length (or length one). 
Arguments have the following lengths: ", - paste0(lengths, collapse = ", "))) - } - return(invisible(NULL)) -} - #' @title Calculate Geometric Mean #' #' @param x numeric vector of values for which to calculate the geometric mean @@ -76,161 +10,125 @@ geom_mean_helper <- function(x) { } -globalVariables(c("..index", - ".", - ".SD", - "adj_pval", - "ae_point", - "aem", - "boundary", - "brier_score", - "component_value", - "..colnames_x", - "..colnames_y", - "compare_against", - "count", - "coverage_deviation", - "CRPS", - "DSS", - "fill_col", - "identif", - "Interval_Score", - "overprediction", - "underprediction", - "quantile_coverage", - "LogS", - "calibration", - "coverage", - "hist", - "id", - "InternalDuplicateCheck", - "InternalNumCheck", - "log_score", - "lower", - "mean_scores_ratio", - "metric", - "metrics_select", - "model", - "n_obs", - "n_obs wis_component_name", - "pit_p_val", - "point", - "prediction", - "pval", - "quantile", - "ratio", - "rel_to_baseline", - "relative_skill", - "rn", - "theta", - "true_value", - "type", - "upper", - "value", - "value_scaled", - "var_of_interest", - "variable", - "wis_component_name", - "x", - "y", - "g")) +globalVariables(c( + "..index", + ".", + ".SD", + "adj_pval", + "ae_point", + "ae_median", + "boundary", + "bias", + "brier_score", + "component_value", + "..colnames_x", + "..colnames_y", + "..samplecols", + "compare_against", + "count", + "coverage_deviation", + "CRPS", + "crps", + "DSS", + "dss", + "fill_col", + "identifCol", + "Interval_Score", + "overprediction", + "underprediction", + "quantile_coverage", + "LogS", + "calibration", + "coverage", + "hist", + "InternalDuplicateCheck", + "InternalNumCheck", + "log_score", + "lower", + "mad", + "mean_scores_ratio", + "metric", + "metrics_select", + "metrics_summary", + "model", + "n_obs", + "n_obs wis_component_name", + "Number forecasts", + "pit_p_val", + "pit_value", + "point", + "prediction", + "pval", + "quantile", + "ratio", + "rel_to_baseline", + "relative_skill", + "rn", + "se_mean", + "sharpness", + "theta", + "true_value", + "type", + "upper", + "value", + "value_scaled", + "var_of_interest", + "variable", + "wis_component_name", + "x", + "y", + "g" +)) #' @title Available metrics in scoringutils #' #' @return A vector with the name of all available metrics #' @export +#' @keywords info available_metrics <- function() { - available_metrics <- c("ae_point", "aem", "log_score", "sharpness", "bias", "dss", "crps", - "coverage", "coverage_deviation", "quantile_coverage", - "pit_p_val", "pit_sd","interval_score", - "underprediction", "overprediction", "relative_skill", - "scaled_rel_skill") - - return(available_metrics) + return(unique(metrics_summary$Name)) } - - -#' @title Extract Elements From a List of Lists -#' -#' @description -#' Extract corresponding elements from a list of lists. -#' @param list the list of lists -#' @param what character with the name of the element to extract from every -#' individual list element of `list` -#' @return A list with the extracted element from every sublist -#' missing. -#' -#' @keywords internal -extract_from_list <- function(list, what) { - out <- lapply(list, - FUN = function(list_element) { - return(list_element[[what]]) - }) - return(out) -} - - - - - -#' Update a List +#' @title Simple permutation test #' -#' @description `r lifecycle::badge("stable")` -#' Used to handle updating settings in a list. 
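# Editor's sketch: the rewritten available_metrics() above now derives the
# metric names from the metrics_summary data set, so it can be used to check
# a name before passing it to score() via the `metrics` argument.
library(scoringutils)

available_metrics()
"crps" %in% available_metrics()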
For example when making -#' changes to `interval_score_arguments` in `eval_forecasts()` -#' @param defaults A list of default settings -#' @param optional A list of optional settings to override defaults -#' @return A list +#' @description #' The implementation of the permutation test follows the +#' function +#' `permutationTest` from the `surveillance` package by Michael Höhle, +#' Andrea Riebler and Michaela Paul. #' +#' @return p-value of the permutation test #' @keywords internal -update_list <- function(defaults = list(), optional = list()) { - if (length(optional) != 0) { - defaults <- defaults[setdiff(names(defaults), names(optional))] - updated <- c(defaults, optional) - } else { - updated <- defaults - } - return(updated) -} - - - - - - - - permutation_test <- function(scores1, scores2, - nPermutation = 999, - oneSided = FALSE, + n_permutation = 999, + one_sided = FALSE, comparison_mode = c("difference", "ratio")) { - nTime = length(scores1) + nTime <- length(scores1) meanscores1 <- mean(scores1) meanscores2 <- mean(scores2) - if (comparison_mode[1] == "ratio") { + comparison_mode <- match.arg(comparison_mode) + if (comparison_mode == "ratio") { # distinguish between on-sided and two-sided: - testStat_observed <- ifelse(oneSided, - meanscores1 / meanscores2, - max(meanscores1 / meanscores2, meanscores2 / meanscores1)) + testStat_observed <- ifelse(one_sided, + meanscores1 / meanscores2, + max(meanscores1 / meanscores2, meanscores2 / meanscores1) + ) } else { - testStat_observed <- ifelse(oneSided, meanscores1 - meanscores2, abs(meanscores1 - meanscores2)) + testStat_observed <- ifelse(one_sided, meanscores1 - meanscores2, abs(meanscores1 - meanscores2)) } - testStat_permuted <- replicate(nPermutation, { + testStat_permuted <- replicate(n_permutation, { sel <- rbinom(nTime, size = 1, prob = 0.5) - g1 <- (sum(scores1[sel == 0]) + sum(scores2[sel == 1]))/nTime - g2 <- (sum(scores1[sel == 1]) + sum(scores2[sel == 0]))/nTime - if (comparison_mode[1] == "ratio") { - ifelse(oneSided, g1 / g2, max(g1 / g2, g2/g1)) + g1 <- (sum(scores1[sel == 0]) + sum(scores2[sel == 1])) / nTime + g2 <- (sum(scores1[sel == 1]) + sum(scores2[sel == 0])) / nTime + if (comparison_mode == "ratio") { + ifelse(one_sided, g1 / g2, max(g1 / g2, g2 / g1)) } else { - ifelse(oneSided, g1 - g2, abs(g1 - g2)) + ifelse(one_sided, g1 - g2, abs(g1 - g2)) } }) - # abs needs to be removed here (messes with one sided vs two-sided) - pVal <- (1 + sum(testStat_permuted >= testStat_observed))/(nPermutation + 1) + pVal <- (1 + sum(testStat_permuted >= testStat_observed)) / (n_permutation + 1) # plus ones to make sure p-val is never 0? 
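  # (editor's note) adding 1 to both numerator and denominator is the usual
  # Monte-Carlo correction: the observed statistic is counted as one of the
  # permutations, so the p-value is valid and can never be exactly zero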
return(pVal) } @@ -242,16 +140,108 @@ permutation_test <- function(scores1, #' are present in the data.table #' @param df A data.table or data.frame from which columns shall be deleted #' @param cols_to_delete character vector with names of columns to be deleted +#' @param make_unique whether to make the data set unique after removing columns #' @importFrom data.table as.data.table #' @return A data.table #' #' @keywords internal -delete_columns <- function(df, cols_to_delete) { +#' +delete_columns <- function(df, cols_to_delete, make_unique = FALSE) { df <- data.table::as.data.table(df) delete_columns <- names(df)[names(df) %in% cols_to_delete] if (length(delete_columns) > 0) { - df <- unique(df[, eval(delete_columns) := NULL]) + if (make_unique) { + df <- unique(df[, eval(delete_columns) := NULL]) + } else { + df <- df[, eval(delete_columns) := NULL] + } } return(df) } + + +#' @title Get prediction type of a forecast +#' +#' @description Internal helper function to get the prediction type of a +#' forecast. That is inferred based on the properties of the values in the +#' `prediction` column. +#' +#' @inheritParams check_forecasts +#' +#' @return Character vector of length one with either "quantile", "integer", or +#' "continuous". +#' +#' @keywords internal + +get_prediction_type <- function(data) { + if (is.data.frame(data)) { + if ("quantile" %in% names(data)) { + return("quantile") + } else if (isTRUE( + all.equal(data$prediction, as.integer(data$prediction))) + ) { + return("integer") + } else { + return("continuous") + } + } else { + if (isTRUE(all.equal(data, as.integer(data)))) { + return("integer") + } else { + return("continuous") + } + } +} + + +#' @title Get type of the target true values of a forecast +#' +#' @description Internal helper function to get the type of the target +#' true values of a forecast. That is inferred based on the which columns +#' are present in the data. +#' +#' @inheritParams check_forecasts +#' +#' @return Character vector of length one with either "binary", "integer", or +#' "continuous" +#' +#' @keywords internal + +get_target_type <- function(data) { + if (isTRUE(all.equal(data$true_value, as.integer(data$true_value)))) { + if (all(data$true_value %in% c(0, 1)) && + all(data$prediction >= 0) && all(data$prediction <= 1)) { + return("binary") + } else { + return("integer") + } + } else { + return("continuous") + } +} + + + +#' @title Get unit of a single forecast +#' +#' @description Helper function to get the unit of a single forecast, i.e. 
+#' the column names that define where a single forecast was made for +#' +#' @inheritParams check_forecasts +#' +#' @return A character vector with the column names that define the unit of +#' a single forecast +#' +#' @keywords internal + +get_forecast_unit <- function(data) { + protected_columns <- c( + "prediction", "true_value", "sample", "quantile", "upper", "lower", + "pit_value", + "range", "boundary", available_metrics(), + names(data)[grepl("coverage_", names(data))] + ) + forecast_unit <- setdiff(colnames(data), protected_columns) + return(forecast_unit) +} diff --git a/R/utils_data_handling.R b/R/utils_data_handling.R index 09b6269f3..7457afbd0 100644 --- a/R/utils_data_handling.R +++ b/R/utils_data_handling.R @@ -1,129 +1,122 @@ -#' @title Pivot Range Format Forecasts From Long to Wide Format +#' @title Merge Forecast Data And Observations #' #' @description -#' Given a data.frame that follows the structure shown in -#' [range_example_data_long()], the function outputs the same -#' data in a long format as (as shown in -#' [range_example_data_wide()]). This can be useful e.g. for -#' plotting. #' -#' @param data a data.frame following the specifications from -#' [eval_forecasts()]) for quantile forecasts. For an example, see -#' [range_example_data_long()]) -#' @return a data.frame in wide format -#' @importFrom data.table dcast -#' @export -#' @examples -#' long <- scoringutils::range_example_data_long -#' wide <- scoringutils::range_long_to_wide(long) +#' The function more or less provides a wrapper around `merge` that +#' aims to handle the merging well if additional columns are present +#' in one or both data sets. If in doubt, you should probably merge the +#' data sets manually. #' +#' @param forecasts data.frame with the forecast data (as can be passed to +#' [score()]). +#' @param observations data.frame with the observations +#' @param join character, one of `c("left", "full", "right")`. Determines the +#' type of the join. Usually, a left join is appropriate, but sometimes you +#' may want to do a full join to keep dates for which there is a forecast, but +#' no ground truth data. +#' @param by character vector that denotes the columns by which to merge. Any +#' value that is not a column in observations will be removed. +#' @return a data.frame with forecasts and observations +#' @examples +#' forecasts <- example_quantile_forecasts_only +#' observations <- example_truth_only +#' merge_pred_and_obs(forecasts, observations) +#' @keywords data-handling +#' @export -range_long_to_wide <- function(data) { - data <- data.table::as.data.table(data) +merge_pred_and_obs <- function(forecasts, observations, + join = c("left", "full", "right"), + by = NULL) { + forecasts <- data.table::as.data.table(forecasts) + observations <- data.table::as.data.table(observations) - # remove quantile column if one is present - if ("quantile" %in% colnames(data)) { - data[, quantile := NULL] + if (is.null(by)) { + protected_columns <- c( + "prediction", "true_value", "sample", "quantile", + "range", "boundary" + ) + by <- setdiff(colnames(forecasts), protected_columns) } - out <- data.table::dcast(data, ... ~ boundary + range, - value.var = "prediction") - return(out[]) -} + obs_cols <- colnames(observations) + by <- intersect(by, obs_cols) + join <- match.arg(join) + if (join == "left") { + # do a left_join, where all data in the observations are kept. + combined <- merge(observations, forecasts, by = by, all.x = TRUE) + } else if (join == "full") { + # do a full, where all data is kept. 
+ combined <- merge(observations, forecasts, by = by, all = TRUE) + } else { + combined <- merge(observations, forecasts, by = by, all.y = TRUE) + } -#' @title Pivot Range Format Forecasts From Long to Wide Format -#' -#' @description -#' Legacy function that will not be supported in future updates. -#' @inheritParams range_long_to_wide -#' @return a data.frame in wide format -#' @importFrom data.table dcast -#' @export -quantile_to_wide <- function(data) { - warning("This function will be deprecated. Please use `range_long_to_wide()` in the future") - out <- scoringutils::range_long_to_wide(data) - return(out[]) -} + # get colnames that are the same for x and y + colnames <- colnames(combined) + colnames_x <- colnames[endsWith(colnames, ".x")] + colnames_y <- colnames[endsWith(colnames, ".y")] + + # extract basenames + basenames_x <- sub(".x$", "", colnames_x) + basenames_y <- sub(".y$", "", colnames_y) + + # see whether the column name as well as the content is the same + overlapping <- (as.list(combined[, ..colnames_x]) %in% as.list(combined[, ..colnames_y])) & basenames_x == basenames_y + overlap_names <- colnames_x[overlapping] + basenames_overlap <- sub(".x$", "", overlap_names) + # delete overlapping columns + if (length(basenames_overlap > 0)) { + combined[, paste0(basenames_overlap, ".x") := NULL] + combined[, paste0(basenames_overlap, ".y") := NULL] + } + return(combined[]) +} -#' @title Pivot Range Format Forecasts From Wide to Long Format +#' @title Change Data from a Sample Based Format to a Quantile Format #' #' @description -#' Given a data.frame that follows the structure shown in -#' [range_example_data_wide()], the function outputs the same -#' data in a long format as (as shown in -#' [range_example_data_long()]). This can be useful e.g. for -#' plotting. #' -#' @param data a data.frame following the specifications from -#' [eval_forecasts()]) for quantile forecasts. For an example, see -#' [range_example_data_wide()]) -#' @return a data.frame in long format -#' @importFrom data.table melt +#' Transform data from a format that is based on predictive samples to a format +#' based on plain quantiles. +#' +#' +#' @param data a data.frame with samples +#' @param quantiles a numeric vector of quantiles to extract +#' @param type type argument passed down to the quantile function. 
For more +#' information, see [quantile()] +#' @return a data.frame in a long interval range format +#' @importFrom data.table as.data.table +#' @importFrom stats quantile +#' @importFrom methods hasArg +#' @keywords data-handling #' @export #' @examples -#' wide <- scoringutils::range_example_data_wide -#' long <- scoringutils::range_wide_to_long(wide) -#' - -range_wide_to_long <- function(data) { - +#' sample_to_quantile(example_integer) +sample_to_quantile <- function(data, + quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95), + type = 7) { data <- data.table::as.data.table(data) - colnames <- colnames(data) - - # semi-wide format where only lower and upper are given independently - if (all(c("lower", "upper") %in% colnames)) { - id_vars <- colnames[!(colnames %in% c("lower", "upper"))] - - # need to remove quantile column if present - if ("quantile" %in% colnames) { - data[, "quantile" := NULL] - } + reserved_columns <- c("prediction", "sample") + by <- setdiff(colnames(data), reserved_columns) - data <- data.table::melt(data, - id.vars = id_vars, - measure.vars = c("lower", "upper"), - variable.name = "boundary", - value.name = "prediction") - } else { - # alternative is super-wide format where every range has its own column - ranges <- colnames[grepl("lower", colnames) | grepl("upper", colnames)] - - id_vars <- colnames[!(colnames %in% ranges)] - - data <- data.table::melt(data, - id.vars = id_vars, - measure.vars = ranges, - variable.name = "range", - value.name = "prediction") - data[, boundary := gsub("_.*", "", range)] - data[, range := as.numeric(gsub("^.*?_","", range))] - } + data <- data[, .(quantile = quantiles, + prediction = quantile(x = prediction, prob = quantiles, + type = type, na.rm = TRUE)), + by = by] return(data[]) } -#' @title Pivot Range Format Forecasts From Wide to Long Format -#' -#' @description -#' Legacy function that will not be supported in future updates. -#' @inheritParams range_long_to_wide -#' @return a data.frame in long format -#' @export -quantile_to_long <- function(data) { - warning("This function will be deprecated. Please use `range_wide_to_long()` in the future") - out <- scoringutils::range_wide_to_long(data) - return(out[]) -} - - +# ==================== Functions internally used for scoring =================== +# These functions would ideally be replaced in the future #' @title Change Data from a Range Format to a Quantile Format #' @@ -132,34 +125,17 @@ quantile_to_long <- function(data) { #' Transform data from a format that uses interval ranges to denote quantiles #' to a format that uses quantiles only. #' -#' Given a data.frame that follows the structure shown in -#' [range_example_data_long()], the function outputs the same -#' data in a long format as (as shown in -#' [range_example_data_long()]). This can be useful e.g. for -#' plotting. If you're data.frame is in a different format, consider running -#' [range_long_to_wide()] first. -#' #' @param data a data.frame following the specifications from -#' [eval_forecasts()]) for quantile forecasts. For an example, see -#' [range_example_data_long()]) +#' [score()]) for quantile forecasts. 
#' @param keep_range_col keep the range and boundary columns after #' transformation (default is FALSE) #' @return a data.frame in a plain quantile format #' @importFrom data.table copy -#' @export -#' @examples -#' wide <- range_example_data_wide -#' semiwide <- range_example_data_semi_wide -#' -#' long <- range_wide_to_long(wide) -#' long2 <- range_wide_to_long(semiwide) -#' -#' plain_quantile <- range_long_to_quantile(long2) -#' +#' @keywords internal range_long_to_quantile <- function(data, - keep_range_col = FALSE) { + keep_range_col = FALSE) { data <- data.table::as.data.table(data) # filter out duplicated median @@ -169,7 +145,8 @@ range_long_to_quantile <- function(data, data[, quantile := ifelse(boundary == "lower", round((100 - range) / 200, 10), - round((1 - (100 - range) / 200), 10))] + round((1 - (100 - range) / 200), 10) + )] if (!keep_range_col) { data[, c("range", "boundary") := NULL] @@ -180,24 +157,6 @@ range_long_to_quantile <- function(data, } -#' @title Pivot Change Data from a Range Format to a Quantile Format -#' -#' @description -#' Legacy function that will not be supported in future updates. -#' @inheritParams range_long_to_quantile -#' @return a data.frame in long format -#' @export -range_to_quantile <- function(data, - keep_range_col = FALSE) { - warning("This function will be deprecated. Please use `range_long_to_quantile()` in the future") - out <- scoringutils::range_long_to_quantile(data, keep_range_col) - return(out[]) -} - - - - - #' @title Change Data from a Plain Quantile Format to a Long Range Format #' #' @description @@ -205,24 +164,12 @@ range_to_quantile <- function(data, #' Transform data from a format that uses quantiles only to one that uses #' interval ranges to denote quantiles. #' -#' Given a data.frame that follows the structure shown in -#' [quantile_example_data()], the function outputs the same -#' data in a long format as (as shown in -#' [range_example_data_long()]). -#' -#' @param data a data.frame following the specifications shown in the example -#' [range_example_data_long()]) +#' @param data a data.frame in quantile format #' @param keep_quantile_col keep the quantile column in the final #' output after transformation (default is FALSE) #' @return a data.frame in a long interval range format #' @importFrom data.table copy -#' @export -#' -#' @examples -#' quantile <- scoringutils::quantile_example_data -#' -#' long <- scoringutils::quantile_to_range_long(quantile) -#' +#' @keywords internal quantile_to_range_long <- function(data, keep_quantile_col = TRUE) { @@ -231,7 +178,8 @@ quantile_to_range_long <- function(data, data[, boundary := ifelse(quantile <= 0.5, "lower", "upper")] data[, range := ifelse(boundary == "lower", round((1 - 2 * quantile) * 100, 10), - round((2 * quantile - 1) * 100, 10))] + round((2 * quantile - 1) * 100, 10) + )] # add median quantile median <- data[quantile == 0.5, ] @@ -246,74 +194,15 @@ quantile_to_range_long <- function(data, # if only point forecasts are scored, we only have NA values for range and # boundary. In that instance we need to set the type of the columns # explicitly to avoid future collisions. - data[, `:=`(boundary = as.character(boundary), - range = as.numeric(range))] - - return(data[]) -} - - -#' @title Change Data from a Plain Quantile Format to a Long Range Format -#' -#' @description -#' Legacy function that will not be supported in future updates. 
-#' @inheritParams quantile_to_range_long -#' @return a data.frame in long format -#' @export -quantile_to_range <- function(data, - keep_quantile_col = FALSE) { - warning("This function will be deprecated. Please use `quantile_to_range_long()` in the future") - out <- scoringutils::quantile_to_range_long(data, keep_quantile_col) - return(out[]) -} - - - - -#' @title Change Data from a Sample Based Format to a Quantile Format -#' -#' @description -#' -#' Transform data from a format that is based on predictive samples to a format -#' based on plain quantiles. -#' -#' -#' @param data a data.frame with samples -#' @param quantiles a numeric vector of quantiles to extract -#' @param type type argument passed down to the quantile function. For more -#' information, see [quantile()] -#' @return a data.frame in a long interval range format -#' @importFrom data.table as.data.table -#' @importFrom stats quantile -#' @export -#' -#' @examples -#' example_data <- scoringutils::integer_example_data -#' -#' quantile_data <- scoringutils::sample_to_quantile(example_data) -#' - - - -sample_to_quantile <- function(data, - quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95), - type = 7) { - - data <- data.table::as.data.table(data) - - reserved_columns <- c("prediction", "sample") - by <- setdiff(colnames(data), reserved_columns) - - data <- data[, .(quantile = quantiles, - prediction = quantile(prediction, prob = quantiles, - type = type, na.rm = TRUE)), - by = by] + data[, `:=`( + boundary = as.character(boundary), + range = as.numeric(range) + )] return(data[]) } - #' @title Change Data from a Sample Based Format to a Long Interval Range Format #' #' @description @@ -331,132 +220,26 @@ sample_to_quantile <- function(data, #' @return a data.frame in a long interval range format #' @importFrom data.table as.data.table #' @importFrom stats quantile -#' @export -#' -#' @examples -#' example_data <- scoringutils::integer_example_data -#' -#' quantile_data <- scoringutils::sample_to_range_long(example_data) -#' +#' @keywords internal sample_to_range_long <- function(data, range = c(0, 50, 90), type = 7, keep_quantile_col = TRUE) { - data <- data.table::as.data.table(data) lower_quantiles <- (100 - range) / 200 upper_quantiles <- 1 - lower_quantiles quantiles <- sort(unique(c(lower_quantiles, upper_quantiles))) - data <- scoringutils::sample_to_quantile(data, - quantiles = quantiles, - type = type) + data <- sample_to_quantile(data, + quantiles = quantiles, + type = type + ) - data <- scoringutils::quantile_to_range_long(data, - keep_quantile_col = keep_quantile_col) + data <- quantile_to_range_long(data, + keep_quantile_col = keep_quantile_col + ) return(data[]) } - - - -#' @title Change Data from a Sample Based Format to a Long Interval Range Format -#' -#' @description -#' Legacy function that will not be supported in future updates. -#' @inheritParams sample_to_range_long -#' @return a data.frame in long format -#' @export -sample_to_range <- function(data, - range = c(0, 50, 90), - type = 7, - keep_quantile_col = TRUE) { - warning("This function will be deprecated. Please use `sample_to_range-long()` in the future") - out <- scoringutils::sample_to_range_long(data, range, type, keep_quantile_col) - return(out[]) -} - - - - - -#' @title Merge Forecast Data And Observations -#' -#' @description -#' -#' The function more or less provides a wrapper around `merge` that -#' aims to handle the merging well if additional columns are present -#' in one or both data sets. 
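# Editor's sketch of the internal quantile <-> range format round trip that
# score_quantile() relies on; both helpers are now tagged @keywords internal,
# hence the ::: access here.
library(scoringutils)

quantile_data <- sample_to_quantile(example_integer)
range_data <- scoringutils:::quantile_to_range_long(quantile_data,
                                                    keep_quantile_col = FALSE)
scoringutils:::range_long_to_quantile(range_data, keep_range_col = TRUE)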
If in doubt, you should probably merge the -#' data sets manually. -#' -#' -#' @param forecasts data.frame with the forecast data (as can be passed to -#' [eval_forecasts()]). -#' @param observations data.frame with the observations -#' @param join character, one of `c("left", "full", "right")`. Determines the -#' type of the join. Usually, a left join is appropriate, but sometimes you -#' may want to do a full join to keep dates for which there is a forecast, but -#' no ground truth data. -#' @param by character vector that denotes the columns by which to merge. Any -#' value that is not a column in observations will be removed. -#' @return a data.frame with forecasts and observations -#' @export - - -merge_pred_and_obs <- function(forecasts, observations, - join = c("left", "full", "right"), - by = NULL) { - - forecasts <- data.table::as.data.table(forecasts) - observations <- data.table::as.data.table(observations) - - if (is.null(by)) { - protected_columns <- c("prediction", "true_value", "sample", "quantile", - "range", "boundary") - by <- setdiff(colnames(forecasts), protected_columns) - } - - - obs_cols <- colnames(observations) - by <- intersect(by, obs_cols) - - if (join[1] == "left") { - # do a left_join, where all data in the observations are kept. - combined <- merge(observations, forecasts, by = by, all.x = TRUE) - } else if (join[1] == "full") { - # do a full, where all data is kept. - combined <- merge(observations, forecasts, by = by, all = TRUE) - } else { - combined <- merge(observations, forecasts, by = by, all.y = TRUE) - } - - - # get colnames that are the same for x and y - colnames <- colnames(combined) - colnames_x <- colnames[endsWith(colnames, ".x")] - colnames_y <- colnames[endsWith(colnames, ".y")] - - # extract basenames - basenames_x <- sub(".x$", "", colnames_x) - basenames_y <- sub(".y$", "", colnames_y) - - # see whether the column name as well as the content is the same - overlapping <- (as.list(combined[, ..colnames_x]) %in% as.list(combined[, ..colnames_y])) & basenames_x == basenames_y - overlap_names <- colnames_x[overlapping] - basenames_overlap <- sub(".x$", "", overlap_names) - - # delete overlapping columns - if (length(basenames_overlap > 0)) { - combined[, paste0(basenames_overlap, ".x") := NULL] - combined[, paste0(basenames_overlap, ".y") := NULL] - } - - return(combined[]) -} - - - - - diff --git a/R/zzz.R b/R/zzz.R index 762053008..0ef6aff7b 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,3 +1,3 @@ .onAttach <- function(libname, pkgname) { - packageStartupMessage("Note: The definition of the weighted interval score has slightly changed in version 0.1.5. If you want to use the old definition, use the argument `count_median_twice = TRUE` in the function `eval_forecasts()`") + packageStartupMessage('Note: scoringutils 1.0.0 introduces a lot of breaking changes and we apologise for any inconvenience. 
If you prefer the old interface, please download version 0.1.8 using remotes::install_github("epiforecasts/scoringutils@v0.1.8")') } diff --git a/README.Rmd b/README.Rmd index 711bde75f..beea2750d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,13 +1,14 @@ --- title: 'scoringutils: Utilities for Scoring and Assessing Predictions' output: - github_document + github_document: + toc: false --- [![R-CMD-check](https://github.com/epiforecasts/scoringutils/workflows/R-CMD-check/badge.svg)](https://github.com/epiforecasts/scoringutils/actions) [![codecov](https://codecov.io/gh/epiforecasts/scoringutils/branch/master/graphs/badge.svg)](https://codecov.io/gh/epiforecasts/scoringutils/) [![CRAN\_Release\_Badge](https://www.r-pkg.org/badges/version-ago/scoringutils)](https://CRAN.R-project.org/package=scoringutils) -[![develVersion](https://img.shields.io/badge/devel%20version-0.1.7-green.svg?style=flat)](https://github.com/epiforecasts/scoringutils) +[![develVersion](https://img.shields.io/badge/devel%20version-1.0.0-green.svg?style=flat)](https://github.com/epiforecasts/scoringutils) [![metacran downloads](http://cranlogs.r-pkg.org/badges/grand-total/scoringutils)](https://cran.r-project.org/package=scoringutils) @@ -19,341 +20,101 @@ knitr::opts_chunk$set(echo = TRUE, collapse = TRUE, comment = "#>", fig.path = "man/figures/") -library(magrittr) -``` - -# Introduction and Overview of Functionality - -The `scoringutils` package provides a collection of metrics and proper scoring rules -that make it simple to score forecasts against the true observed values. -Predictions can either be automatically scored from a `data.frame` using the function `eval_forecasts`. Alternatively, evaluation metrics can be accessed directly using lower level functions within a vector/matrix framework. - -Predictions can be handled in various formats: `scoringutils` can handle probabilistic forecasts in either a sample based or a quantile based format. For more detail on the expected input formats please see below. True values can be integer, continuous or binary. - -In addition to automatic scoring, `scoringutils` offers a variety of plots and visualisations. - -# Scoring Forecasts Automatically - -Most of the time, the `eval_forecasts` function will be able to do the entire evaluation for you. The idea is simple, yet flexible. - -All you need to do is to pass in a `data.frame` that has a column called `prediction` and one called `true_value`. Depending on the exact input format, additional columns like `sample`, `quantile` or `range` and `boundary` are needed. Additional columns may be present to indicate a grouping of forecasts. For example, we could have forecasts made by different models in various locations at different time points, each for several weeks into the future. In this case, we would have additional columns called for example `model`, `date`, `forecast_date`, `forecast_horizon` and `location`. - -Using the `by` argument you need to specify the *unit of a single forecast*. In this example here we would set `by = c("model", "date", "forecast_date", "forecast_horizon", "location")` (note: if we want to be pedantic, there is a small duplication as the information of "date" is already included in the combination of "forecast_date" and "forecast_horizon". But as long as there isn't some weird shift, this doesn't matter for the purpose of grouping our observations). If you don't specify `by` (i.e. `by = NULL`), `scoringutils` will automatically use all appropriate present columns. 
Note that you don't need to include columns such as `quantile` or `sample` in the `by` argument, as several quantiles / samples make up one forecast. - -Using the `summarise_by` argument you can now choose categories to aggregate over. If you were only interested in scores for the different models, you would specify `summarise_by = c("model")`. If you wanted to have scores for every model in every location, you would need to specify `summarise_by = c("model", "location")`. If you wanted to have one score per quantile or one per prediction interval range, you could specify something like `summarise_by = c("model", "quantile")` or `summarise_by = c("model", "quantile", "range")` (note again that some information is duplicated in quantile and range, but this doesn't really matter for grouping purposes). When aggregating, `eval_forecasts` takes the mean according to the group defined in `summarise_by` (i.e. in this example, if `summarise_by = c("model", "location")`, scores will be averaged over all forecast dates, forecast horizons and quantiles to yield one score per model and location). In addition to the mean, you can also obtain the standard deviation of the scores over which you average or any desired quantile (e.g. the median in addition to the mean) by specifying `sd = TRUE` and `quantiles = c(0.5)`. - - -## Example Evaluation - -Here is an example of an evaluation using the example data included in the package. The data comes from a set of [Covid-19 short-term forecasts in the UK](https://github.com/epiforecasts/covid19.forecasts.uk). - -```{r} library(scoringutils) +library(magrittr) library(data.table) +library(ggplot2) +library(knitr) ``` -```{r} -data <- scoringutils::quantile_example_data -print(data, 3, 3) - -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "quantile", "range")) -print(scores, 3, 3) -``` - - +The `scoringutils` package provides a collection of metrics and proper scoring rules that make it simple to score probabilistic forecasts against the true observed values. The `scoringutils` package offers convenient automated forecast evaluation in a `data.table` format (using the function `score()`), but also provides experienced users with a set of reliable lower-level scoring metrics operating on vectors/matriced they can build upon in other applications. In addition it implements a wide range of flexible plots that are able to cover many use cases. -```{r eval = FALSE, echo = FALSE} -# -# filtered_data <- data[geography == "England" & -# creation_date <= "2020-06-29" & -# value_desc == "Deaths"] - -scoringutils::plot_predictions(data = data, - filter_both = list("geography == 'England'"), - filter_forecasts = list("creation_date == '2020-06-29'", - "value_desc == 'Deaths'", - "model %in% c('SIRCOVID', 'EpiSoon')"), - filter_truth = list("as.Date(value_date) <= '2020-07-06'"), - x = "value_date", - range = c(0, 50, 90), - scale = "free", - facet_formula = value_desc ~ model) - -``` - - -```{r} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model")) -scoringutils::score_table(scores) -``` - -Given this level of aggregation, not all metrics may make sense. In this case, for example, averaging over different quantiles to compute quantile coverage does not make much sense. If you like, you can select specific metrics for the visualisation. 
- -Let us look at calibration: - -```{r out.width="50%", fig.show="hold"} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "range", "quantile")) -scoringutils::interval_coverage(scores) + - ggplot2::ggtitle("Interval Coverage") - -scoringutils::quantile_coverage(scores) + - ggplot2::ggtitle("Quantile Coverage") -``` - -Let us look at the individual components of the weighted interval score: - -```{r} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "value_desc")) -scoringutils::wis_components(scores, facet_formula = ~ value_desc) -``` - -We can also look at contributions to different metrics by range: - -```{r} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "range", "value_desc")) -scoringutils::range_plot(scores, y = "interval_score", - facet_formula = ~ value_desc) -``` - -We can also visualise metrics using a heatmap: - -```{r} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "horizon")) -scores <- scores[, horizon := as.factor(horizon)] -scoringutils::score_heatmap(scores, - x = "horizon", metric = "bias") -``` +The goal of this package is to provide a tested and reliable collection of metrics that can be used for scoring probabilistic forecasts (forecasts with a full predictive distribution, rather than point forecasts). It has a much stronger focus on convenience than e.g. the `scoringRules` package, which provides a comprehensive collection of proper scoring rules (also used in `scoringutils`). In contrast to other packages, `scoringutils` offers functionality to automatically evaluate forecasts, to visualise scores and to obtain relative scores between models. +Predictions can be handled in various formats: `scoringutils` can handle probabilistic forecasts in either a sample based or a quantile based format. For more detail on the expected input formats please see below. True values can be integer, continuous or binary. -### Expected Input Formats -The `eval_forecasts` function is designed to work with various different input formats. The following formats are currently supported: +## Installation -quantile forecasts in either a plain quantile format or in a format that specifies interval ranges and the boundary of a given interval range. +Install the CRAN version of this package using: -``` {r} -print(scoringutils::quantile_example_data, 3, 3) -print(scoringutils::range_example_data_long, 3, 3) -print(scoringutils::range_example_data_wide, 3, 3) +```{r, eval = FALSE} +install.packages("scoringutils") ``` -sample based format with either continuous or integer values +Install the stable development version of the package with: -``` {r} -print(scoringutils::integer_example_data, 3, 3) -print(scoringutils::continuous_example_data, 3, 3) +```{r, eval = FALSE} +install.packages("scoringutils", repos = "https://epiforecasts.r-universe.dev") ``` -forecasts in a binary format: - -``` {r} -print(scoringutils::binary_example_data, 3, 3) -``` +Install the unstable development from GitHub using the following, -It also offers functionality to convert between these formats. 
For more information have a look at the documentation of the following functions: -``` {r eval=FALSE} -scoringutils::sample_to_quantile() # convert from sample based to quantile format -scoringutils::range_long_to_quantile() # convert from range format to plain quantile -scoringutils::quantile_to_range_long() # convert the other way round -scoringutils::range_wide_to_long() # convert range based format from wide to long -scoringutils::range_long_to_wide() # convert the other way round +```{r, eval = FALSE} +remotes::install_github("epiforecasts/scoringutils", dependencies = TRUE) ``` +## Quick start -# Scoring Forecasts Directly - -A variety of metrics and scoring rules can also be accessed directly through -the `scoringutils` package. - -The following gives an overview of (most of) the implemented metrics. - -## Bias - -The function `bias` determines bias from predictive Monte-Carlo samples, -automatically recognising whether forecasts are continuous or -integer valued. - -For continuous forecasts, Bias is measured as -$$B_t (P_t, x_t) = 1 - 2 \cdot (P_t (x_t))$$ - -where $P_t$ is the empirical cumulative distribution function of the -prediction for the true value $x_t$. Computationally, $P_t (x_t)$ is -just calculated as the fraction of predictive samples for $x_t$ -that are smaller than $x_t$. - -For integer valued forecasts, Bias is measured as +In this quick start guide we explore some of the functionality of the `scoringutils` package using quantile forecasts from the [ECDC forecasting hub](https://covid19forecasthub.eu/) as an example. For more detailed documentation please see the package vignettes, and individual function documentation. -$$B_t (P_t, x_t) = 1 - (P_t (x_t) + P_t (x_t + 1))$$ +### Plotting forecasts -to adjust for the integer nature of the forecasts. In both cases, Bias can -assume values between -1 and 1 and is 0 ideally. +As a first step to evaluating the forecasts we visualise them. For the purposes of this example here we make use of `plot_predictions()` to filter the available forecasts for a single model, and forecast date. -```{r} -## integer valued forecasts -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -bias(true_values, predictions) - -## continuous forecasts -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(30, mean = 1:30)) -bias(true_values, predictions) +```{r, fig.width = 9, fig.height = 6} +example_quantile %>% + plot_predictions( + x = "target_end_date", + filter_truth = list( + 'target_end_date <= "2021-07-15"', 'target_end_date > "2021-05-22"' + ), + filter_forecasts = list( + "model == 'EuroCOVIDhub-ensemble'", 'forecast_date == "2021-06-28"' + ) + ) + + facet_wrap(target_type ~ location, ncol = 4, scales = "free") + + theme(legend.position = "bottom") ``` +### Scoring forecasts -## Sharpness -Sharpness is the ability of the model to generate predictions within a -narrow range. It is a data-independent measure, and is purely a feature -of the forecasts themselves. - -Shaprness of predictive samples corresponding to one single true value is -measured as the normalised median of the absolute deviation from -the median of the predictive samples. For details, see `?stats::mad` +Forecasts can be easily and quickly scored using the `score()` function. This function returns unsumarised scores, which in most cases is not what the user wants. 
Here we make use of additional functions from `scoringutils` to add empirical coverage-levels (`add_coverage()`), and scores relative to a baseline model (here chosen to be the EuroCOVIDhub-ensemble model). See the getting started vignette for more details. Finally we summarise these scores by model and target type. -```{r} -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -sharpness(predictions) +```{r score-example} +example_quantile %>% + score() %>% + add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% + summarise_scores( + by = c("model", "target_type"), + relative_skill = TRUE, + baseline = "EuroCOVIDhub-ensemble" + ) %>% + summarise_scores( + fun = signif, + digits = 2 + ) %>% + kable() ``` -## Calibration - -Calibration or reliability of forecasts is the ability of a model to -correctly identify its own uncertainty in making predictions. In a model -with perfect calibration, the observed data at each time point look as if -they came from the predictive probability distribution at that time. - -Equivalently, one can inspect the probability integral transform of the -predictive distribution at time t, - -$$u_t = F_t (x_t)$$ - -where $x_t$ is the observed data point at time $t \text{ in } t_1, …, t_n$, -n being the number of forecasts, and $F_t$ is the (continuous) predictive -cumulative probability distribution at time t. If the true probability -distribution of outcomes at time t is $G_t$ then the forecasts $F_t$ are -said to be ideal if $F_t = G_t$ at all times $t$. In that case, the -probabilities ut are distributed uniformly. - -In the case of discrete outcomes such as incidence counts, -the PIT is no longer uniform even when forecasts are ideal. -In that case a randomised PIT can be used instead: - -$$u_t = P_t(k_t) + v \cdot (P_t(k_t) - P_t(k_t - 1) )$$ - -where $k_t$ is the observed count, $P_t(x)$ is the predictive -cumulative probability of observing incidence $k$ at time $t$, -$P_t (-1) = 0$ by definition and $v$ is standard uniform and independent -of $k$. If $P_t$ is the true cumulative -probability distribution, then $u_t$ is standard uniform. - -The function checks whether integer or continuous forecasts were provided. -It then applies the (randomised) probability integral and tests -the values $u_t$ for uniformity using the -Anderson-Darling test. +`scoringutils` contains additional functionality to summarise these scores at different levels, to visualise them, and to explore the forecasts themselves. See the package vignettes and function documentation for more information. -As a rule of thumb, there is no evidence to suggest a forecasting model is -miscalibrated if the p-value found was greater than a threshold of $p >= 0.1$, -some evidence that it was miscalibrated if $0.01 < p < 0.1$, and good -evidence that it was miscalibrated if $p <= 0.01$. -In this context it should be noted, though, that uniformity of the -PIT is a necessary but not sufficient condition of calibration. It should -als be noted that the test only works given sufficient samples, otherwise the -Null hypothesis will often be rejected outright. +## Citation +If using `scoringutils` in your work please consider citing it using the following, -## Continuous Ranked Probability Score (CRPS) -Wrapper around the `crps_sample` function from the -`scoringRules` package. For more information look at the manuals from the -`scoringRules` package. The function can be used for continuous as well as -integer valued forecasts. Smaller values are better. 
- -```{r} -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -crps(true_values, predictions) -``` - - - -## Dawid-Sebastiani Score (DSS) -Wrapper around the `dss_sample` function from the -`scoringRules` package. For more information look at the manuals from the -`scoringRules` package. The function can be used for continuous as well as -integer valued forecasts. Smaller values are better. - -```{r} -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -dss(true_values, predictions) +```{r, echo = FALSE} +citation("scoringutils") ``` -## Log Score -Wrapper around the `log_sample` function from the -`scoringRules` package. For more information look at the manuals from the -`scoringRules` package. The function should not be used for integer valued -forecasts. While Log Scores are in principle possible for integer valued -forecasts they require a kernel density estimate which is not well defined -for discrete values. Smaller values are better. - -```{r} -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) -logs(true_values, predictions) -``` - -## Brier Score -The Brier score is a proper score rule that assesses the accuracy of -probabilistic binary predictions. The outcomes can be either 0 or 1, -the predictions must be a probability that the true outcome will be 1. - -The Brier Score is then computed as the mean squared error between the -probabilistic prediction and the true outcome. +## How to make a bug report or feature request +Please briefly describe your problem and what output you expect in an [issue](https://github.com/epiforecasts/scoringutils/issues). If you have a question, please don't open an issue. Instead, ask on our [Q and A page](https://github.com/epiforecasts/scoringutils/discussions/categories/q-a). -```{r} -true_values <- sample(c(0,1), size = 30, replace = TRUE) -predictions <- runif(n = 30, min = 0, max = 1) +## Contributing -brier_score(true_values, predictions) -``` - -## Interval Score -The Interval Score is a Proper Scoring Rule to score quantile predictions, -following Gneiting and Raftery (2007). Smaller values are better. - -The score is computed as - -![interval_score](man/figures/interval_score.png) - -where $1()$ is the indicator function and $\alpha$ is the decimal value that -indicates how much is outside the prediction interval. -To improve usability, the user is asked to provide an interval range in -percentage terms, i.e. interval_range = 90 (percent) for a 90 percent -prediction interval. Correspondingly, the user would have to provide the -5\% and 95\% quantiles (the corresponding alpha would then be 0.1). -No specific distribution is assumed, -but the range has to be symmetric (i.e you can't use the 0.1 quantile -as the lower bound and the 0.7 quantile as the upper). -Setting `weigh = TRUE` will weigh the score by $\frac{\alpha}{2}$ such that -the Interval Score converges to the CRPS for increasing number of quantiles. - - -```{r} -true_values <- rnorm(30, mean = 1:30) -interval_range <- 90 -alpha <- (100 - interval_range) / 100 -lower <- qnorm(alpha/2, rnorm(30, mean = 1:30)) -upper <- qnorm((1- alpha/2), rnorm(30, mean = 1:30)) - -interval_score(true_values = true_values, - lower = lower, - upper = upper, - interval_range = interval_range) -``` +We welcome contributions and new contributors! 
We particularly appreciate help on priority problems in the [issues](https://github.com/epiforecasts/scoringutils/issues). Please check and add to the issues, and/or add a [pull request](https://github.com/epiforecasts/scoringutils/pulls). +## Code of Conduct + +Please note that the `scoringutils` project is released with a [Contributor Code of Conduct](https://epiforecasts.io/scoringutils/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. diff --git a/README.md b/README.md index 80d77794e..234909f6d 100644 --- a/README.md +++ b/README.md @@ -4,576 +4,171 @@ scoringutils: Utilities for Scoring and Assessing Predictions [![R-CMD-check](https://github.com/epiforecasts/scoringutils/workflows/R-CMD-check/badge.svg)](https://github.com/epiforecasts/scoringutils/actions) [![codecov](https://codecov.io/gh/epiforecasts/scoringutils/branch/master/graphs/badge.svg)](https://codecov.io/gh/epiforecasts/scoringutils/) [![CRAN_Release_Badge](https://www.r-pkg.org/badges/version-ago/scoringutils)](https://CRAN.R-project.org/package=scoringutils) -[![develVersion](https://img.shields.io/badge/devel%20version-0.1.7-green.svg?style=flat)](https://github.com/epiforecasts/scoringutils) +[![develVersion](https://img.shields.io/badge/devel%20version-1.0.0-green.svg?style=flat)](https://github.com/epiforecasts/scoringutils) [![metacran downloads](http://cranlogs.r-pkg.org/badges/grand-total/scoringutils)](https://cran.r-project.org/package=scoringutils) -# Introduction and Overview of Functionality - The `scoringutils` package provides a collection of metrics and proper -scoring rules that make it simple to score forecasts against the true -observed values. Predictions can either be automatically scored from a -`data.frame` using the function `eval_forecasts`. Alternatively, -evaluation metrics can be accessed directly using lower level functions -within a vector/matrix framework. +scoring rules that make it simple to score probabilistic forecasts +against the true observed values. The `scoringutils` package offers +convenient automated forecast evaluation in a `data.table` format (using +the function `score()`), but also provides experienced users with a set +of reliable lower-level scoring metrics operating on vectors/matriced +they can build upon in other applications. In addition it implements a +wide range of flexible plots that are able to cover many use cases. + +The goal of this package is to provide a tested and reliable collection +of metrics that can be used for scoring probabilistic forecasts +(forecasts with a full predictive distribution, rather than point +forecasts). It has a much stronger focus on convenience than e.g. the +`scoringRules` package, which provides a comprehensive collection of +proper scoring rules (also used in `scoringutils`). In contrast to other +packages, `scoringutils` offers functionality to automatically evaluate +forecasts, to visualise scores and to obtain relative scores between +models. Predictions can be handled in various formats: `scoringutils` can handle probabilistic forecasts in either a sample based or a quantile based format. For more detail on the expected input formats please see below. True values can be integer, continuous or binary. -In addition to automatic scoring, `scoringutils` offers a variety of -plots and visualisations. - -# Scoring Forecasts Automatically - -Most of the time, the `eval_forecasts` function will be able to do the -entire evaluation for you. The idea is simple, yet flexible. 
- -All you need to do is to pass in a `data.frame` that has a column called -`prediction` and one called `true_value`. Depending on the exact input -format, additional columns like `sample`, `quantile` or `range` and -`boundary` are needed. Additional columns may be present to indicate a -grouping of forecasts. For example, we could have forecasts made by -different models in various locations at different time points, each for -several weeks into the future. In this case, we would have additional -columns called for example `model`, `date`, `forecast_date`, -`forecast_horizon` and `location`. - -Using the `by` argument you need to specify the *unit of a single -forecast*. In this example here we would set -`by = c("model", "date", "forecast_date", "forecast_horizon", "location")` -(note: if we want to be pedantic, there is a small duplication as the -information of “date” is already included in the combination of -“forecast_date” and “forecast_horizon”. But as long as there isn’t some -weird shift, this doesn’t matter for the purpose of grouping our -observations). If you don’t specify `by` (i.e. `by = NULL`), -`scoringutils` will automatically use all appropriate present columns. -Note that you don’t need to include columns such as `quantile` or -`sample` in the `by` argument, as several quantiles / samples make up -one forecast. - -Using the `summarise_by` argument you can now choose categories to -aggregate over. If you were only interested in scores for the different -models, you would specify `summarise_by = c("model")`. If you wanted to -have scores for every model in every location, you would need to specify -`summarise_by = c("model", "location")`. If you wanted to have one score -per quantile or one per prediction interval range, you could specify -something like `summarise_by = c("model", "quantile")` or -`summarise_by = c("model", "quantile", "range")` (note again that some -information is duplicated in quantile and range, but this doesn’t really -matter for grouping purposes). When aggregating, `eval_forecasts` takes -the mean according to the group defined in `summarise_by` (i.e. in this -example, if `summarise_by = c("model", "location")`, scores will be -averaged over all forecast dates, forecast horizons and quantiles to -yield one score per model and location). In addition to the mean, you -can also obtain the standard deviation of the scores over which you -average or any desired quantile (e.g. the median in addition to the -mean) by specifying `sd = TRUE` and `quantiles = c(0.5)`. - -## Example Evaluation - -Here is an example of an evaluation using the example data included in -the package. The data comes from a set of [Covid-19 short-term forecasts -in the UK](https://github.com/epiforecasts/covid19.forecasts.uk). - -``` r -library(scoringutils) -#> Note: The definition of the weighted interval score has slightly changed in version 0.1.5. 
If you want to use the old definition, use the argument `count_median_twice = TRUE` in the function `eval_forecasts()` -library(data.table) -``` - -``` r -data <- scoringutils::quantile_example_data -print(data, 3, 3) -#> value_date value_type geography value_desc true_value -#> 1: 2020-05-04 hospital_inc England Hospital admissions 1043 -#> 2: 2020-05-04 hospital_prev England Total beds occupied 10648 -#> 3: 2020-05-11 hospital_inc England Hospital admissions 743 -#> --- -#> 5150: 2020-08-03 death_inc_line Wales Deaths 1 -#> 5151: 2020-08-03 death_inc_line Wales Deaths 1 -#> 5152: 2020-08-03 death_inc_line Wales Deaths 1 -#> model creation_date quantile prediction horizon -#> 1: NA NA NA -#> 2: NA NA NA -#> 3: NA NA NA -#> --- -#> 5150: SIRCOVID 2020-07-13 0.85 4 21 -#> 5151: DetSEIRwithNB MCMC 2020-07-13 0.90 2 21 -#> 5152: SIRCOVID 2020-07-13 0.90 6 21 - -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "quantile", "range")) -print(scores, 3, 3) -#> model quantile range interval_score sharpness underprediction -#> 1: DetSEIRwithNB MCMC 0.50 0 54.45528 0.000000 54.16260163 -#> 2: DetSEIRwithNB MCMC 0.45 10 53.96138 6.310976 47.42276423 -#> 3: DetSEIRwithNB MCMC 0.55 10 53.96138 6.310976 47.42276423 -#> --- -#> 55: SIRCOVID 0.90 80 18.18000 17.368889 0.15555556 -#> 56: SIRCOVID 0.05 90 11.69444 11.661111 0.03333333 -#> 57: SIRCOVID 0.95 90 11.69444 11.661111 0.03333333 -#> overprediction coverage coverage_deviation bias aem -#> 1: 0.2926829 0.3170732 0.31707317 -0.2333333 54.45528 -#> 2: 0.2276423 0.4308943 0.33089431 -0.2333333 54.45528 -#> 3: 0.2276423 0.4308943 0.33089431 -0.2333333 54.45528 -#> --- -#> 55: 0.6555556 0.9333333 0.13333333 0.2255556 42.90000 -#> 56: 0.0000000 0.9888889 0.08888889 0.2255556 42.90000 -#> 57: 0.0000000 0.9888889 0.08888889 0.2255556 42.90000 -#> quantile_coverage -#> 1: 0.4959350 -#> 2: 0.4308943 -#> 3: 0.5691057 -#> --- -#> 55: 0.9777778 -#> 56: 0.2666667 -#> 57: 0.9888889 -``` - - - -``` r -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model")) -scoringutils::score_table(scores) -``` - -![](man/figures/unnamed-chunk-4-1.png) - -Given this level of aggregation, not all metrics may make sense. In this -case, for example, averaging over different quantiles to compute -quantile coverage does not make much sense. If you like, you can select -specific metrics for the visualisation. 
- -Let us look at calibration: - -``` r -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "range", "quantile")) -scoringutils::interval_coverage(scores) + - ggplot2::ggtitle("Interval Coverage") - -scoringutils::quantile_coverage(scores) + - ggplot2::ggtitle("Quantile Coverage") -``` - - - -Let us look at the individual components of the weighted interval score: - -``` r -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "value_desc")) -scoringutils::wis_components(scores, facet_formula = ~ value_desc) -``` - -![](man/figures/unnamed-chunk-6-1.png) - -We can also look at contributions to different metrics by range: - -``` r -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "range", "value_desc")) -scoringutils::range_plot(scores, y = "interval_score", - facet_formula = ~ value_desc) -``` - -![](man/figures/unnamed-chunk-7-1.png) - -We can also visualise metrics using a heatmap: - -``` r -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "horizon")) -scores <- scores[, horizon := as.factor(horizon)] -scoringutils::score_heatmap(scores, - x = "horizon", metric = "bias") -``` - -![](man/figures/unnamed-chunk-8-1.png) - -### Expected Input Formats - -The `eval_forecasts` function is designed to work with various different -input formats. The following formats are currently supported: - -quantile forecasts in either a plain quantile format or in a format that -specifies interval ranges and the boundary of a given interval range. - -``` r -print(scoringutils::quantile_example_data, 3, 3) -#> value_date value_type geography value_desc true_value -#> 1: 2020-05-04 hospital_inc England Hospital admissions 1043 -#> 2: 2020-05-04 hospital_prev England Total beds occupied 10648 -#> 3: 2020-05-11 hospital_inc England Hospital admissions 743 -#> --- -#> 5150: 2020-08-03 death_inc_line Wales Deaths 1 -#> 5151: 2020-08-03 death_inc_line Wales Deaths 1 -#> 5152: 2020-08-03 death_inc_line Wales Deaths 1 -#> model creation_date quantile prediction horizon -#> 1: NA NA NA -#> 2: NA NA NA -#> 3: NA NA NA -#> --- -#> 5150: SIRCOVID 2020-07-13 0.85 4 21 -#> 5151: DetSEIRwithNB MCMC 2020-07-13 0.90 2 21 -#> 5152: SIRCOVID 2020-07-13 0.90 6 21 -print(scoringutils::range_example_data_long, 3, 3) -#> value_date value_type geography value_desc true_value -#> 1: 2020-05-04 hospital_inc England Hospital admissions 1043 -#> 2: 2020-05-04 hospital_prev England Total beds occupied 10648 -#> 3: 2020-05-11 hospital_inc England Hospital admissions 743 -#> --- -#> 5417: 2020-07-27 death_inc_line Wales Deaths 1 -#> 5418: 2020-08-03 death_inc_line Wales Deaths 1 -#> 5419: 2020-08-03 death_inc_line Wales Deaths 1 -#> model creation_date prediction horizon boundary range -#> 1: NA NA NA -#> 2: NA NA NA -#> 3: NA NA NA -#> --- -#> 5417: SIRCOVID 2020-07-13 1 14 upper 0 -#> 5418: DetSEIRwithNB MCMC 2020-07-13 0 21 upper 0 -#> 5419: SIRCOVID 2020-07-13 1 21 upper 0 -print(scoringutils::range_example_data_wide, 3, 3) -#> value_date value_type geography value_desc true_value -#> 1: 2020-05-04 death_inc_line England Deaths 448 -#> 2: 2020-05-04 death_inc_line Northern Ireland Deaths 9 -#> 3: 2020-05-04 death_inc_line Scotland Deaths 40 -#> --- -#> 344: 2020-08-03 hospital_prev England Total beds occupied 784 -#> 345: 2020-08-03 hospital_prev Scotland Total beds occupied 265 -#> 346: 2020-08-03 icu_prev Scotland ICU beds occupied 3 -#> model creation_date horizon lower_0 lower_10 lower_20 -#> 1: NA NA NA NA -#> 2: NA NA NA NA -#> 3: NA NA NA NA 
-#> --- -#> 344: NA NA NA NA -#> 345: NA NA NA NA -#> 346: DetSEIRwithNB MCMC 2020-07-13 21 2 2 2 -#> lower_30 lower_40 lower_50 lower_60 lower_70 lower_80 lower_90 upper_0 -#> 1: NA NA NA NA NA NA NA NA -#> 2: NA NA NA NA NA NA NA NA -#> 3: NA NA NA NA NA NA NA NA -#> --- -#> 344: NA NA NA NA NA NA NA NA -#> 345: NA NA NA NA NA NA NA NA -#> 346: 2 2 1 1 1 1 0 2 -#> upper_10 upper_20 upper_30 upper_40 upper_50 upper_60 upper_70 upper_80 -#> 1: NA NA NA NA NA NA NA NA -#> 2: NA NA NA NA NA NA NA NA -#> 3: NA NA NA NA NA NA NA NA -#> --- -#> 344: NA NA NA NA NA NA NA NA -#> 345: NA NA NA NA NA NA NA NA -#> 346: 3 3 3 3 4 4 4 5 -#> upper_90 -#> 1: NA -#> 2: NA -#> 3: NA -#> --- -#> 344: NA -#> 345: NA -#> 346: 6 -``` - -sample based format with either continuous or integer values - -``` r -print(scoringutils::integer_example_data, 3, 3) -#> value_date value_type geography value_desc model -#> 1: 2020-05-04 hospital_inc England Hospital admissions -#> 2: 2020-05-04 hospital_prev England Total beds occupied -#> 3: 2020-05-11 hospital_inc England Hospital admissions -#> --- -#> 13427: 2020-08-03 death_inc_line Wales Deaths SIRCOVID -#> 13428: 2020-08-03 death_inc_line Wales Deaths SIRCOVID -#> 13429: 2020-08-03 death_inc_line Wales Deaths SIRCOVID -#> creation_date horizon prediction sample true_value -#> 1: NA NA NA 1043 -#> 2: NA NA NA 10648 -#> 3: NA NA NA 743 -#> --- -#> 13427: 2020-07-13 21 0 48 1 -#> 13428: 2020-07-13 21 0 49 1 -#> 13429: 2020-07-13 21 0 50 1 -print(scoringutils::continuous_example_data, 3, 3) -#> value_date value_type geography value_desc model -#> 1: 2020-05-04 hospital_inc England Hospital admissions -#> 2: 2020-05-04 hospital_prev England Total beds occupied -#> 3: 2020-05-11 hospital_inc England Hospital admissions -#> --- -#> 13427: 2020-08-03 death_inc_line Wales Deaths SIRCOVID -#> 13428: 2020-08-03 death_inc_line Wales Deaths SIRCOVID -#> 13429: 2020-08-03 death_inc_line Wales Deaths SIRCOVID -#> creation_date horizon prediction sample true_value -#> 1: NA NA NA 1043 -#> 2: NA NA NA 10648 -#> 3: NA NA NA 743 -#> --- -#> 13427: 2020-07-13 21 0.3340917507 48 1 -#> 13428: 2020-07-13 21 0.3540187438 49 1 -#> 13429: 2020-07-13 21 0.0001998965 50 1 -``` +## Installation -forecasts in a binary format: +Install the CRAN version of this package using: ``` r -print(scoringutils::binary_example_data, 3, 3) -#> value_date value_type geography value_desc model -#> 1: 2020-05-04 hospital_inc England Hospital admissions -#> 2: 2020-05-04 hospital_prev England Total beds occupied -#> 3: 2020-05-11 hospital_inc England Hospital admissions -#> --- -#> 344: 2020-07-27 death_inc_line Wales Deaths SIRCOVID -#> 345: 2020-08-03 death_inc_line Wales Deaths DetSEIRwithNB MCMC -#> 346: 2020-08-03 death_inc_line Wales Deaths SIRCOVID -#> creation_date horizon prediction true_value -#> 1: NA NA NA -#> 2: NA NA NA -#> 3: NA NA NA -#> --- -#> 344: 2020-07-13 14 0.34 0 -#> 345: 2020-07-13 21 0.22 1 -#> 346: 2020-07-13 21 0.26 0 +install.packages("scoringutils") ``` -It also offers functionality to convert between these formats. 
For more -information have a look at the documentation of the following functions: - -``` r -scoringutils::sample_to_quantile() # convert from sample based to quantile format -scoringutils::range_long_to_quantile() # convert from range format to plain quantile -scoringutils::quantile_to_range_long() # convert the other way round -scoringutils::range_wide_to_long() # convert range based format from wide to long -scoringutils::range_long_to_wide() # convert the other way round -``` - -# Scoring Forecasts Directly - -A variety of metrics and scoring rules can also be accessed directly -through the `scoringutils` package. - -The following gives an overview of (most of) the implemented metrics. - -## Bias - -The function `bias` determines bias from predictive Monte-Carlo samples, -automatically recognising whether forecasts are continuous or integer -valued. - -For continuous forecasts, Bias is measured as -*B**t*(*P**t*,*x**t*) = 1 − 2 ⋅ (*P**t*(*x**t*)) - -where *P**t* is the empirical cumulative distribution -function of the prediction for the true value *x**t*. -Computationally, *P**t*(*x**t*) is just calculated -as the fraction of predictive samples for *x**t* that are -smaller than *x**t*. - -For integer valued forecasts, Bias is measured as - -*B**t*(*P**t*,*x**t*) = 1 − (*P**t*(*x**t*)+*P**t*(*x**t*+1)) - -to adjust for the integer nature of the forecasts. In both cases, Bias -can assume values between -1 and 1 and is 0 ideally. +Install the stable development version of the package with: ``` r -## integer valued forecasts -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -bias(true_values, predictions) -#> [1] -0.700 -0.520 -0.460 -0.665 0.305 0.780 0.750 -0.600 -0.935 0.975 -#> [11] -0.410 -0.380 -0.135 0.080 0.580 -0.310 -0.380 -0.195 0.375 0.700 -#> [21] 0.545 -0.010 0.080 -0.440 0.885 0.310 0.530 -0.945 -0.045 -0.635 - -## continuous forecasts -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(30, mean = 1:30)) -bias(true_values, predictions) -#> [1] -0.17 0.42 0.53 -0.70 0.71 -0.49 -0.71 0.28 -0.51 0.53 0.92 0.52 -#> [13] 0.31 -0.92 0.03 0.75 -0.66 0.59 0.48 -0.83 -0.99 -0.66 -0.57 0.54 -#> [25] 0.43 -0.33 -0.27 -0.39 0.58 -0.17 +install.packages("scoringutils", repos = "https://epiforecasts.r-universe.dev") ``` -## Sharpness - -Sharpness is the ability of the model to generate predictions within a -narrow range. It is a data-independent measure, and is purely a feature -of the forecasts themselves. - -Shaprness of predictive samples corresponding to one single true value -is measured as the normalised median of the absolute deviation from the -median of the predictive samples. For details, see `?stats::mad` +Install the unstable development from GitHub using the following, ``` r -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -sharpness(predictions) -#> [1] 1.4826 1.4826 1.4826 1.4826 1.4826 2.9652 2.9652 2.9652 2.9652 2.9652 -#> [11] 4.4478 2.9652 2.9652 2.9652 4.4478 4.4478 3.7065 4.4478 4.4478 4.4478 -#> [21] 5.9304 4.4478 4.4478 4.4478 4.4478 4.4478 4.4478 5.9304 4.4478 5.9304 +remotes::install_github("epiforecasts/scoringutils", dependencies = TRUE) ``` -## Calibration - -Calibration or reliability of forecasts is the ability of a model to -correctly identify its own uncertainty in making predictions. In a model -with perfect calibration, the observed data at each time point look as -if they came from the predictive probability distribution at that time. 
- -Equivalently, one can inspect the probability integral transform of the -predictive distribution at time t, - -*u**t* = *F**t*(*x**t*) - -where *x**t* is the observed data point at time -*t* in *t*1, …, *t**n*, n being the number of -forecasts, and *F**t* is the (continuous) predictive -cumulative probability distribution at time t. If the true probability -distribution of outcomes at time t is *G**t* then the -forecasts *F**t* are said to be ideal if -*F**t* = *G**t* at all times *t*. In that case, -the probabilities ut are distributed uniformly. - -In the case of discrete outcomes such as incidence counts, the PIT is no -longer uniform even when forecasts are ideal. In that case a randomised -PIT can be used instead: - -*u**t* = *P**t*(*k**t*) + *v* ⋅ (*P**t*(*k**t*)−*P**t*(*k**t*−1)) - -where *k**t* is the observed count, *P**t*(*x*) is -the predictive cumulative probability of observing incidence *k* at time -*t*, *P**t*(−1) = 0 by definition and *v* is standard uniform -and independent of *k*. If *P**t* is the true cumulative -probability distribution, then *u**t* is standard uniform. - -The function checks whether integer or continuous forecasts were -provided. It then applies the (randomised) probability integral and -tests the values *u**t* for uniformity using the -Anderson-Darling test. +## Quick start -As a rule of thumb, there is no evidence to suggest a forecasting model -is miscalibrated if the p-value found was greater than a threshold of -*p* \>  = 0.1, some evidence that it was miscalibrated if -0.01 \< *p* \< 0.1, and good evidence that it was miscalibrated if -*p* \<  = 0.01. In this context it should be noted, though, that -uniformity of the PIT is a necessary but not sufficient condition of -calibration. It should als be noted that the test only works given -sufficient samples, otherwise the Null hypothesis will often be rejected -outright. +In this quick start guide we explore some of the functionality of the +`scoringutils` package using quantile forecasts from the [ECDC +forecasting hub](https://covid19forecasthub.eu/) as an example. For more +detailed documentation please see the package vignettes, and individual +function documentation. -## Continuous Ranked Probability Score (CRPS) +### Plotting forecasts -Wrapper around the `crps_sample` function from the `scoringRules` -package. For more information look at the manuals from the -`scoringRules` package. The function can be used for continuous as well -as integer valued forecasts. Smaller values are better. +As a first step to evaluating the forecasts we visualise them. For the +purposes of this example here we make use of `plot_predictions()` to +filter the available forecasts for a single model, and forecast date. 
``` r -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -crps(true_values, predictions) -#> [1] 0.633225 0.522550 0.396400 2.935500 0.646975 1.092100 1.673725 1.208975 -#> [9] 2.349250 1.457050 1.492775 3.949725 1.190300 3.185700 5.178925 0.968050 -#> [17] 0.979150 1.072325 2.183100 4.927400 1.109600 1.740250 1.685275 1.841800 -#> [25] 1.937725 1.789300 2.097750 1.441450 7.593550 8.196575 +example_quantile %>% + plot_predictions( + x = "target_end_date", + filter_truth = list( + 'target_end_date <= "2021-07-15"', 'target_end_date > "2021-05-22"' + ), + filter_forecasts = list( + "model == 'EuroCOVIDhub-ensemble'", 'forecast_date == "2021-06-28"' + ) + ) + + facet_wrap(target_type ~ location, ncol = 4, scales = "free") + + theme(legend.position = "bottom") ``` -## Dawid-Sebastiani Score (DSS) - -Wrapper around the `dss_sample` function from the `scoringRules` -package. For more information look at the manuals from the -`scoringRules` package. The function can be used for continuous as well -as integer valued forecasts. Smaller values are better. - -``` r -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -dss(true_values, predictions) -#> [1] -0.1800783 1.0913706 1.0079771 3.4442881 1.3031143 1.8876494 -#> [7] 2.3585774 2.2110750 2.3802473 2.6640302 3.0230465 7.3333000 -#> [13] 2.7466030 2.8338357 2.8143720 2.8633432 2.8711060 3.3624147 -#> [19] 3.0109214 3.0643456 3.1705285 3.9281835 3.3284476 16.2662208 -#> [25] 3.2660274 3.5280882 3.9380496 4.4371102 3.4380811 3.6981317 -``` - -## Log Score - -Wrapper around the `log_sample` function from the `scoringRules` -package. For more information look at the manuals from the -`scoringRules` package. The function should not be used for integer -valued forecasts. While Log Scores are in principle possible for integer -valued forecasts they require a kernel density estimate which is not -well defined for discrete values. Smaller values are better. - -``` r -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) -logs(true_values, predictions) -#> [1] 1.2611949 2.0085114 0.9560384 1.0689759 1.0129317 1.4904568 2.9205285 -#> [8] 0.8460895 1.4281170 0.9316350 0.9953321 1.0236646 0.9766526 1.8764237 -#> [15] 1.6515511 1.0918181 2.1622704 1.5427550 0.8694894 1.5452538 0.9868115 -#> [22] 0.9044315 1.3446104 1.9701726 1.1217898 1.2175854 1.4064990 1.0417268 -#> [29] 1.1115333 0.8998055 -``` - -## Brier Score - -The Brier score is a proper score rule that assesses the accuracy of -probabilistic binary predictions. The outcomes can be either 0 or 1, the -predictions must be a probability that the true outcome will be 1. - -The Brier Score is then computed as the mean squared error between the -probabilistic prediction and the true outcome. - -``` r -true_values <- sample(c(0,1), size = 30, replace = TRUE) -predictions <- runif(n = 30, min = 0, max = 1) - -brier_score(true_values, predictions) -#> [1] 0.3554911 -``` - -## Interval Score - -The Interval Score is a Proper Scoring Rule to score quantile -predictions, following Gneiting and Raftery (2007). Smaller values are -better. - -The score is computed as - -![interval_score](man/figures/interval_score.png) - -where 1() is the indicator function and *α* is the decimal value that -indicates how much is outside the prediction interval. To improve -usability, the user is asked to provide an interval range in percentage -terms, i.e. 
interval_range = 90 (percent) for a 90 percent prediction -interval. Correspondingly, the user would have to provide the 5% and 95% -quantiles (the corresponding alpha would then be 0.1). No specific -distribution is assumed, but the range has to be symmetric (i.e you -can’t use the 0.1 quantile as the lower bound and the 0.7 quantile as -the upper). Setting `weigh = TRUE` will weigh the score by -$\\frac{\\alpha}{2}$ such that the Interval Score converges to the CRPS -for increasing number of quantiles. - -``` r -true_values <- rnorm(30, mean = 1:30) -interval_range <- 90 -alpha <- (100 - interval_range) / 100 -lower <- qnorm(alpha/2, rnorm(30, mean = 1:30)) -upper <- qnorm((1- alpha/2), rnorm(30, mean = 1:30)) +![](man/figures/unnamed-chunk-4-1.png) -interval_score(true_values = true_values, - lower = lower, - upper = upper, - interval_range = interval_range) -#> [1] 0.1435602 0.1363273 0.1214869 0.3309745 0.1909975 0.1015329 0.1686463 -#> [8] 0.1605127 0.2402594 0.1066244 0.7533624 0.5321044 0.1682648 0.2013887 -#> [15] 0.3317460 0.2161267 0.3294493 0.1764297 0.9422938 0.2414125 2.0005948 -#> [22] 0.1053030 0.1262531 0.2189997 0.1860936 0.1452726 0.1516566 0.2706347 -#> [29] 0.1062714 0.2404143 -``` +### Scoring forecasts + +Forecasts can be easily and quickly scored using the `score()` function. +This function returns unsumarised scores, which in most cases is not +what the user wants. Here we make use of additional functions from +`scoringutils` to add empirical coverage-levels (`add_coverage()`), and +scores relative to a baseline model (here chosen to be the +EuroCOVIDhub-ensemble model). See the getting started vignette for more +details. Finally we summarise these scores by model and target type. + +``` r +example_quantile %>% + score() %>% + add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% + summarise_scores( + by = c("model", "target_type"), + relative_skill = TRUE, + baseline = "EuroCOVIDhub-ensemble" + ) %>% + summarise_scores( + fun = signif, + digits = 2 + ) %>% + kable() +#> The following messages were produced when checking inputs: +#> 1. Some values for `prediction` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected. 
+#> Warning in any(sign(scores[[metric]] < 0)): coercing argument of type 'double' +#> to logical +``` + +| model | target_type | interval_score | dispersion | underprediction | overprediction | coverage_deviation | bias | ae_median | coverage_50 | coverage_90 | relative_skill | scaled_rel_skill | +|:----------------------|:------------|---------------:|-----------:|----------------:|---------------:|-------------------:|--------:|----------:|------------:|------------:|---------------:|-----------------:| +| EuroCOVIDhub-baseline | Cases | 28000 | 4100 | 10000.0 | 14000.0 | -0.110 | 0.0980 | 38000 | 0.33 | 0.82 | 1.20 | 1.6 | +| EuroCOVIDhub-baseline | Deaths | 160 | 91 | 2.1 | 66.0 | 0.120 | 0.3400 | 230 | 0.66 | 1.00 | 1.90 | 3.8 | +| EuroCOVIDhub-ensemble | Cases | 18000 | 3700 | 4200.0 | 10000.0 | -0.098 | -0.0560 | 24000 | 0.39 | 0.80 | 0.74 | 1.0 | +| EuroCOVIDhub-ensemble | Deaths | 41 | 30 | 4.1 | 7.1 | 0.200 | 0.0730 | 53 | 0.88 | 1.00 | 0.50 | 1.0 | +| UMass-MechBayes | Deaths | 53 | 27 | 17.0 | 9.0 | -0.023 | -0.0220 | 78 | 0.46 | 0.88 | 0.63 | 1.2 | +| epiforecasts-EpiNow2 | Cases | 21000 | 5700 | 3300.0 | 12000.0 | -0.067 | -0.0790 | 28000 | 0.47 | 0.79 | 0.86 | 1.2 | +| epiforecasts-EpiNow2 | Deaths | 67 | 32 | 16.0 | 19.0 | -0.043 | -0.0051 | 100 | 0.42 | 0.91 | 0.83 | 1.6 | + +`scoringutils` contains additional functionality to summarise these +scores at different levels, to visualise them, and to explore the +forecasts themselves. See the package vignettes and function +documentation for more information. + +## Citation + +If using `scoringutils` in your work please consider citing it using the +following, + + #> + #> To cite scoringutils in publications use: + #> + #> Nikos I. Bosse, Sam Abbott, EpiForecasts, and Sebastian Funk (2020). + #> scoringutils: Utilities for Scoring and Assessing Predictions, DOI: + #> 10.5281/zenodo.4618017 + #> + #> A BibTeX entry for LaTeX users is + #> + #> @Manual{, + #> title = {scoringutils: Utilities for Scoring and Assessing Predictions}, + #> author = {Nikos I. Bosse and Sam Abbott and {EpiForecasts} and Sebastian Funk}, + #> year = {2020}, + #> doi = {10.5281/zenodo.4618017}, + #> } + +## How to make a bug report or feature request + +Please briefly describe your problem and what output you expect in an +[issue](https://github.com/epiforecasts/scoringutils/issues). If you +have a question, please don’t open an issue. Instead, ask on our [Q and +A +page](https://github.com/epiforecasts/scoringutils/discussions/categories/q-a). + +## Contributing + +We welcome contributions and new contributors! We particularly +appreciate help on priority problems in the +[issues](https://github.com/epiforecasts/scoringutils/issues). Please +check and add to the issues, and/or add a [pull +request](https://github.com/epiforecasts/scoringutils/pulls). + +## Code of Conduct + +Please note that the `scoringutils` project is released with a +[Contributor Code of +Conduct](https://epiforecasts.io/scoringutils/CODE_OF_CONDUCT.html). By +contributing to this project, you agree to abide by its terms. 
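As a further illustration of the quick start above, here is a minimal sketch of summarising the same scores at a different level. It only uses `score()` and `summarise_scores()` as shown there, plus `check_forecasts()`, the package's helper for validating input data (listed in the reference index further down); the choice of `by = "model"` is purely illustrative.

``` r
library(scoringutils)
library(magrittr)

# check that the example data is in a format that scoringutils can score
check_forecasts(example_quantile)

# score the forecasts, then average each metric over everything that is
# not part of `by`, giving one row per model
example_quantile %>%
  score() %>%
  summarise_scores(by = "model")
```

The `by` argument of `summarise_scores()` sets the level of aggregation, so any combination of columns present in the data (as in `by = c("model", "target_type")` above) can be used in place of `"model"`.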
diff --git a/_pkgdown.yml b/_pkgdown.yml index 4adb25251..187327235 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,61 +1,48 @@ +url: https://epiforecasts.io/scoringutils/ +template: + bootstrap: 5 + package: preferably + params: + toggle: manual + twitter: + creator: "@nikosbosse" + card: summary_large_image + +development: + mode: release + +authors: + Nikos Bosse: + href: "https://followtheargument.org/" + Sam Abbott: + href: "https://www.samabbott.co.uk/" + + reference: - title: Package documentation contents: - - scoringutils - - title: Check input functions - contents: - - check_forecasts - - print.scoringutils_check - - title: Evaluation functions - contents: - - available_metrics - - compare_two_models - - starts_with("eval_") - - eval_forecasts - - eval_forecasts_binary - - eval_forecasts_sample - - pairwise_comparison - - pairwise_comparison_one_group - - title: Scoring functions - contents: - - abs_error - - ae_median_quantile - - ae_median_sample - - bias - - brier_score - - crps - - dss - - interval_score - - logs - - mse - - pit - - pit_df - - pit_df_fast - - quantile_bias - - sharpness + - scoring-forecasts-directly + - metric-details + - title: Functions to check and analyse inputs + contents: + - has_keyword("check-forecasts") + - title: Functions for convenient forecast evaluation + contents: + - has_keyword("scoring") + - starts_with("score_") + - title: Lower-level scoring functions + contents: + - has_keyword("metric") - title: Data wrangling helpers contents: - - ends_with("to_long") - - ends_with("to_wide") - - ends_with("to_quantile") - - ends_with("to_range") - - ends_with("to_range_long") - - merge_pred_and_obs - - title: Plotting helpers + - has_keyword("data-handling") + - title: Functions for plotting and data visualisation contents: - starts_with("plot_") - - ends_with("_plot") - - hist_PIT - - hist_PIT_quantile - - interval_coverage - - quantile_coverage - - score_table - - score_heatmap - - show_avail_forecasts - - wis_components - title: Internal functions contents: - has_keyword("internal") - - title: Example data + - title: Example data and information contents: - has_keyword("datasets") + - has_keyword("info") diff --git a/data/binary_example_data.rda b/data/binary_example_data.rda deleted file mode 100644 index 6276797da..000000000 Binary files a/data/binary_example_data.rda and /dev/null differ diff --git a/data/continuous_example_data.rda b/data/continuous_example_data.rda deleted file mode 100644 index d9d83e95d..000000000 Binary files a/data/continuous_example_data.rda and /dev/null differ diff --git a/data/example_binary.rda b/data/example_binary.rda new file mode 100644 index 000000000..1536b8ca2 Binary files /dev/null and b/data/example_binary.rda differ diff --git a/data/example_continuous.rda b/data/example_continuous.rda new file mode 100644 index 000000000..af2a07ccf Binary files /dev/null and b/data/example_continuous.rda differ diff --git a/data/example_integer.rda b/data/example_integer.rda new file mode 100644 index 000000000..27d80e352 Binary files /dev/null and b/data/example_integer.rda differ diff --git a/data/example_quantile.rda b/data/example_quantile.rda new file mode 100644 index 000000000..e4a0a2579 Binary files /dev/null and b/data/example_quantile.rda differ diff --git a/data/example_quantile_forecasts_only.rda b/data/example_quantile_forecasts_only.rda index 78b5d457a..cd3cbdd49 100644 Binary files a/data/example_quantile_forecasts_only.rda and b/data/example_quantile_forecasts_only.rda differ diff --git 
a/data/example_truth_data_only.rda b/data/example_truth_data_only.rda deleted file mode 100644 index 805fde2f8..000000000 Binary files a/data/example_truth_data_only.rda and /dev/null differ diff --git a/data/example_truth_only.rda b/data/example_truth_only.rda new file mode 100644 index 000000000..ae829a5b3 Binary files /dev/null and b/data/example_truth_only.rda differ diff --git a/data/integer_example_data.rda b/data/integer_example_data.rda deleted file mode 100644 index d9699e668..000000000 Binary files a/data/integer_example_data.rda and /dev/null differ diff --git a/data/metrics_summary.rda b/data/metrics_summary.rda new file mode 100644 index 000000000..cc96c38b9 Binary files /dev/null and b/data/metrics_summary.rda differ diff --git a/data/quantile_example_data.rda b/data/quantile_example_data.rda deleted file mode 100644 index 9b405ef5c..000000000 Binary files a/data/quantile_example_data.rda and /dev/null differ diff --git a/data/range_example_data_long.rda b/data/range_example_data_long.rda deleted file mode 100644 index a5ac559fb..000000000 Binary files a/data/range_example_data_long.rda and /dev/null differ diff --git a/data/range_example_data_semi_wide.rda b/data/range_example_data_semi_wide.rda deleted file mode 100644 index 9f3a715be..000000000 Binary files a/data/range_example_data_semi_wide.rda and /dev/null differ diff --git a/data/range_example_data_wide.rda b/data/range_example_data_wide.rda deleted file mode 100644 index e0e6e1cbc..000000000 Binary files a/data/range_example_data_wide.rda and /dev/null differ diff --git a/inst/create_example_data.R b/inst/create_example_data.R index 9c2c5567a..77734c527 100644 --- a/inst/create_example_data.R +++ b/inst/create_example_data.R @@ -1,80 +1,102 @@ library(data.table) library(dplyr) library(devtools) +library(here) +library(covidHubUtils) # devtools::install_github("reichlab/covidHubUtils") +library(purrr) +library(data.table) +library(stringr) +library(scoringutils) + + +# download data from the European Forecast Hub Github Repository using +# subversion. You can also download the folders manually instead. 
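+# (GitHub also exposes repositories via the Subversion protocol, so checking
+# out "trunk/<folder>" fetches only that folder of the default branch rather
+# than cloning the whole repository. This assumes a local `svn` client is
+# installed and on the PATH, and that the GitHub SVN bridge is still
+# available, as it was at the time of writing.)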
+ +system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/EuroCOVIDhub-ensemble") +system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/EuroCOVIDhub-baseline") +system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/UMass-MechBayes") +system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/epiforecasts-EpiNow2") + +# load truth data using the covidHubutils package ------------------------------ +truth <- covidHubUtils::load_truth(hub = "ECDC") |> + filter(target_variable %in% c("inc case", "inc death")) |> + mutate(target_variable = ifelse(target_variable == "inc case", + "Cases", "Deaths")) |> + rename(target_type = target_variable, + true_value = value) |> + select(-model) + +# get the correct file paths to all forecasts ---------------------------------- +folders <- here(c("EuroCOVIDhub-ensemble", "EuroCOVIDhub-baseline", "UMass-MechBayes", "epiforecasts-EpiNow2")) + +file_paths <- purrr::map(folders, + .f = function(folder) { + files <- list.files(folder) + out <- here::here(folder, files) + return(out)}) %>% + unlist() +file_paths <- file_paths[grepl(".csv", file_paths)] + +# load all past forecasts ------------------------------------------------------ +# ceate a helper function to get model name from a file path +get_model_name <- function(file_path) { + split <- str_split(file_path, pattern = "/")[[1]] + model <- split[length(split) - 1] + return(model) +} -# install package from github repository -# devtools::install_github("epiforecasts/covid19.forecasts.uk") - - - - - -# create quantile example ------------------------------------------------------ -# load forecasts and do some filtering -data <- covid19.forecasts.uk::uk_forecasts %>% - dplyr::mutate(horizon = as.numeric(value_date - creation_date), - quantile = round(quantile, 3)) %>% - dplyr::filter(model %in% c("EpiSoon", "SIRCOVID", "DetSEIRwithNB MCMC"), - creation_date > "2020-06-01", - geography %in% c("England", "Scotland", "Wales", "Northern Ireland"), - horizon %in% c(7, 14, 21)) %>% - dplyr::rename(prediction = value) - -# get available dates -dates <- data$value_date %>% - unique() - -# load observations and keep a couple of weeks before any forecasts were made -obs <- covid19.forecasts.uk::covid_uk_data %>% - dplyr::filter(value_date %in% c(as.Date(c("2020-06-08", "2020-06-01", "2020-05-25", - "2020-05-18", "2020-05-11", "2020-05-04")), - dates), - geography %in% c("England", "Scotland", "Wales", "Northern Ireland")) %>% - dplyr::rename(true_value = value) %>% - dplyr::select(-truncation) +# load forecasts +prediction_data <- map_dfr(file_paths, + .f = function(file_path) { + data <- fread(file_path) + data[, `:=`( + target_end_date = as.Date(target_end_date), + quantile = as.numeric(quantile), + forecast_date = as.Date(forecast_date), + model = get_model_name(file_path) + )] + return(data) + }) %>% + filter(grepl("case", target) | grepl("death", target)) %>% + mutate(target_type = ifelse(grepl("death", target), + "Deaths", "Cases"), + horizon = as.numeric(substr(target, 1, 1))) %>% + rename(prediction = value) %>% + filter(type == "quantile", + grepl("inc", target)) %>% + select(location, forecast_date, quantile, prediction, + model, target_end_date, target, target_type, horizon) + +# harmonise forecast dates to be the date a submission was made +hub_data <- mutate(prediction_data, + forecast_date = 
calc_submission_due_date(forecast_date)) + +hub_data <- hub_data |> + filter(horizon <= 3, + forecast_date > "2021-05-01", + forecast_date < "2021-07-15", + # quantile %in% c(seq(0.05, 0.45, 0.1), 0.5, seq(0.55, 0.95, 0.1)), + location %in% c("DE", "GB", "FR", "IT")) |> + select(-target) + +truth <- truth |> + filter(target_end_date > "2021-01-01", + target_end_date < max(hub_data$target_end_date), + location %in% c("DE", "GB", "FR", "IT")) |> + select(-population) # save example data with forecasts only -example_quantile_forecasts_only <- data +example_quantile_forecasts_only <- hub_data usethis::use_data(example_quantile_forecasts_only, overwrite = TRUE) -example_truth_data_only <- obs -usethis::use_data(example_truth_data_only, overwrite = TRUE) +example_truth_only <- truth +usethis::use_data(example_truth_only, overwrite = TRUE) - -# join -quantile_example_data <- dplyr::left_join(obs, data) %>% - dplyr::mutate(model = as.character(model)) -data.table::setDT(quantile_example_data) +# merge forecast data and truth data and save +example_quantile <- merge_pred_and_obs(hub_data, truth) +data.table::setDT(example_quantile) # make model a character instead of a factor -usethis::use_data(quantile_example_data, overwrite = TRUE) - - - - -# create long range example ---------------------------------------------------- -range_example_data_long <- quantile_to_range_long(quantile_example_data, - keep_quantile_col = FALSE) -usethis::use_data(range_example_data_long, overwrite = TRUE) - - - -# create wide range example ---------------------------------------------------- -range_example_data_wide <- range_long_to_wide(range_example_data_long) -range_example_data_wide[, NA_NA := NULL] -usethis::use_data(range_example_data_wide, overwrite = TRUE) - - - - -#create semi-wide range example ------------------------------------------------ -range_example_data_semi_wide <- data.table::copy(range_example_data_long) -range_example_data_semi_wide <- data.table::dcast(range_example_data_semi_wide, - ... 
~ boundary, - value.var = "prediction") -range_example_data_semi_wide[, "NA" := NULL] -usethis::use_data(range_example_data_semi_wide, overwrite = TRUE) - - +usethis::use_data(example_quantile, overwrite = TRUE) # get continuous sample data --------------------------------------------------- # define gamma function @@ -114,26 +136,26 @@ get_samples <- function(values, quantiles, n_samples = 1000) { } # calculate samples -setDT(quantile_example_data) -n_samples <- 50 -continuous_example_data <- quantile_example_data[, .(prediction = get_samples(prediction, - quantile, - n_samples = n_samples), - sample = 1:n_samples, - true_value = unique(true_value)), - by = c("value_date", "value_type", "geography", - "value_desc", "model", "creation_date", - "horizon")] +setDT(example_quantile) +n_samples <- 40 +example_continuous <- example_quantile[, .(prediction = get_samples(prediction, + quantile, + n_samples = n_samples), + sample = 1:n_samples, + true_value = unique(true_value)), + by = c("location", "location_name", + "target_end_date", "target_type", + "forecast_date", "model", "horizon")] # remove unnecessary rows where no predictions are available -continuous_example_data[is.na(prediction), sample := NA] -continuous_example_data <- unique(continuous_example_data) -usethis::use_data(continuous_example_data, overwrite = TRUE) +example_continuous[is.na(prediction), sample := NA] +example_continuous <- unique(example_continuous) +usethis::use_data(example_continuous, overwrite = TRUE) # get integer sample data ------------------------------------------------------ -integer_example_data <- data.table::copy(continuous_example_data) -integer_example_data <- integer_example_data[, prediction := round(prediction)] -usethis::use_data(integer_example_data, overwrite = TRUE) +example_integer <- data.table::copy(example_continuous) +example_integer <- example_integer[, prediction := round(prediction)] +usethis::use_data(example_integer, overwrite = TRUE) @@ -145,26 +167,26 @@ usethis::use_data(integer_example_data, overwrite = TRUE) # observed value was below or above that mean prediction. 
# Take this as a way to create example data, not as sound statistical practice -binary_example_data <- data.table::copy(continuous_example_data) +example_binary <- data.table::copy(example_continuous) # store grouping variable -by <- c("value_date", "value_type", "geography", "value_desc", - "model", "creation_date", "horizon") +by = c("location", "target_end_date", "target_type", + "forecast_date", "model", "horizon") # calculate mean value -binary_example_data[, mean_val := mean(prediction), - by = by] +example_binary[, mean_val := mean(prediction), + by = by] # calculate binary prediction as percentage above mean -binary_example_data[, prediction := mean(prediction > mean_val), - by = by] +example_binary[, prediction := mean(prediction > mean_val), + by = by] # calculate true value as whether or not observed was above mean -binary_example_data[, true_value := true_value > mean_val] +example_binary[, true_value := true_value > mean_val] # delete unnecessary columns and take unique values -binary_example_data[, `:=`(sample = NULL, mean_val = NULL, - true_value = as.numeric(true_value))] -binary_example_data <- unique(binary_example_data) -usethis::use_data(binary_example_data, overwrite = TRUE) +example_binary[, `:=`(sample = NULL, mean_val = NULL, + true_value = as.numeric(true_value))] +example_binary <- unique(example_binary) +usethis::use_data(example_binary, overwrite = TRUE) diff --git a/inst/manuscript/R/illustration-relation-to-scale.R b/inst/manuscript/R/illustration-relation-to-scale.R new file mode 100644 index 000000000..3bc0b70bd --- /dev/null +++ b/inst/manuscript/R/illustration-relation-to-scale.R @@ -0,0 +1,219 @@ +library(scoringutils) +library(dplyr) +library(ggplot2) +library(data.table) + +library(data.table) +library(dplyr) +library(scoringutils) +library(ggplot2) +library(tidyr) +library(patchwork) +# +# sizes_nbinom <- c(0.1, 1, 1e4) +# n = 1000 +# mus <- c(1, 1e1, 1e2, 1e3, 1e4, 1e5) +# +# df <- expand.grid("mu" = mus, +# "size" = sizes_nbinom) +# setDT(df) +# +# df[, `Log score` := mean(scoringRules::logs(y = rnbinom(n, size = size, mu = mu), +# family = "negative-binomial", +# size = size, mu = mu)), +# by = c("mu", "size")] +# df[, DSS := mean(scoringRules::dss_nbinom(y = rnbinom(n, size = size, mu = mu), +# size = size, mu = mu)), +# by = c("mu", "size")] +# df[, CRPS := mean(scoringRules::crps(y = rnbinom(n, size = size, mu = mu), +# family = "negative-binomial", +# size = size, mu = mu)), +# by = c("mu", "size")] +# +# +# df |> +# melt(measure.vars = c("Log score", "DSS"), +# variable.name = "Scoring rule", +# value.name = "Score") |> +# ggplot(aes(y = `Score`, x = mu, color = `Scoring rule`, group = `Scoring rule`)) + +# geom_line() + +# facet_wrap(~ size) +# +# +# +# make_plot <- function(scores, summary_fct = mean) { +# p1 <- scores |> +# group_by(state_size, scale, Theta) |> +# summarise(interval_score = summary_fct(interval_score)) |> +# group_by(Theta, scale) |> +# mutate(interval_score = interval_score / mean(interval_score), +# Theta = ifelse(Theta == "1e+09", "1b", Theta)) |> +# ggplot(aes(y = interval_score, x = state_size, colour = Theta)) + +# geom_point(size = 0.4) + +# labs(y = "WIS", x = "Size of state") + +# theme_minimal() + +# facet_wrap(~ scale, scales = "free_y") +# +# p2 <- p1 + +# scale_x_continuous(trans = "log10") + +# scale_y_continuous(trans = "log10") +# +# p1 / p2 +# } +# + + + + + + + +## Real Data + +ex <- example_continuous |> + filter(model == "EuroCOVIDhub-ensemble") + +scores <- ex |> + score() + +setnames(scores, old = 
c("dss", "crps", "log_score"), + new = c("DSS", "CRPS", "Log score")) + +df <- ex[sample == 1] |> + merge(scores) |> + melt(measure.vars = c("DSS", "CRPS", "Log score"), + variable.name = "Scoring rule", value.name = "Score") + +df[, `Scoring rule` := factor(`Scoring rule`, levels = c("CRPS", "DSS", "Log score"))] + +p_true <- df |> + filter(horizon == 3, location == "DE") |> + ggplot(aes(x = true_value, y = Score, ,group = `Scoring rule`, + colour = `Scoring rule`)) + + geom_line() + + scale_y_log10() + + scale_x_log10() + + labs(x = "Observed value") + + theme_scoringutils() + + theme(legend.position = "bottom") + +ggsave("inst/manuscript/plots/illustration-effect-scale.png", + width = 8, height = 3) + + + + + + + +# ------------------------------------------------------------------------------ +# different illustration: +# in this we see that the mean as well as the variance of the scores scale +# for crps, while the variance stays constant for dss and log score + + +library(scoringutils) +library(dplyr) +library(data.table) +library(tidyr) +library(ggplot2) + +simulate <- function(n_samples = 5e3, + n_replicates = 1e3, + true_value = 1, + scale_mean = 1, + scale_sd = scale_mean) { + pred <- rnorm(n_replicates * n_samples, + mean = true_value * scale_mean, + sd = true_value * scale_sd) + + df <- data.table( + true_value = true_value * scale_mean, + prediction = pred, + sample = 1:n_samples, + id = paste0("id", rep(1:n_replicates, each = n_samples)) + ) + + scores <- score_simulation(df, scale_mean = scale_mean, scale_sd = scale_sd) + return(scores) +} + +score_simulation <- function(df, scale_mean = 1, scale_sd = scale_mean) { + scores <- score(df) + m <- summarise_scores(scores, by = "model", fun = mean) |> + melt(id.vars = "model", value.name = "mean", variable.name = "score") + + s <- summarise_scores(scores, by = "model", fun = stats::sd) |> + melt(id.vars = "model", value.name = "sd", variable.name = "score") + + out <- merge(m, s, by = c("model", "score")) |> + melt(id.vars = c("model", "score"), variable.name = "type") + + return(out[]) +} + +scales_mean <- scales_sd <- c(1, 2, 5, 10, 20, 50) + +grid <- expand.grid( + scale_mean = scales_mean, + scale_sd = scales_sd +) |> + setDT() + +res <- readRDS("inst/manuscript/plots/relation-to-scale-example.Rda") + +# res <- grid |> +# rowwise() |> +# mutate(simulation := list(simulate(scale_mean = scale_mean, scale_sd = scale_sd))) +# +# saveRDS(res, file = "inst/manuscript/plots/relation-to-scale-example.Rda") + +df <- res |> + tidyr::unnest(cols = "simulation") + +df <- df |> + filter(score != "bias") |> + rename(`Scoring rule` = score) |> + mutate(type = ifelse(type == "mean", "Mean score", "Sd score")) |> + mutate(`Scoring rule` = ifelse(`Scoring rule` == "dss", + "DSS", + ifelse(`Scoring rule` == "crps", "CRPS", "Log score"))) + + +p1 <- df |> + filter(scale_mean == 1, + scale_sd < 20) |> + ggplot(aes(y = value, x = scale_sd, + group = `Scoring rule`, color = `Scoring rule`)) + + geom_line() + + facet_wrap(~ type, scales = "free") + + scale_y_log10() + + scale_x_log10() + + theme_scoringutils() + + labs(y = "Score", x = "Sd of F and G (mean constant)") + + +p2 <- df |> + filter(scale_sd == 1, + scale_mean < 20) |> + ggplot(aes(y = value, x = scale_mean, + group = `Scoring rule`, color = `Scoring rule`)) + + geom_line() + + facet_wrap(~ type, scales = "free") + + scale_y_log10() + + scale_x_log10() + + theme_scoringutils() + + labs(y = "Score", x = "Mean of F and G (sd constant)") + +layout <- " +AAACC +BBBCC +" + +p2 + p1 + p_true + 
+ plot_layout(guides = "collect", design = layout) & + theme(legend.position = "bottom") & + plot_annotation(tag_levels = 'A') + +ggsave("inst/manuscript/plots/illustration-effect-scale.png", + height = 5, width = 8) diff --git a/inst/manuscript/R/illustration-sharpness-calibration.R b/inst/manuscript/R/illustration-sharpness-calibration.R new file mode 100644 index 000000000..3b449b5ff --- /dev/null +++ b/inst/manuscript/R/illustration-sharpness-calibration.R @@ -0,0 +1,74 @@ +library(ggplot2) +library(patchwork) +library(scoringutils) + +p1 <- + ggplot(data.frame(x = seq(-8, 8, 0.01), + x_example = rnorm(n = 1601, mean = 0, sd = 0.45)), + aes(x = x)) + + # geom_histogram(aes(x = x_example, y = ..density..), + # colour = "white", fill = "grey50", bins = 50) + + geom_function(fun = dnorm, colour = "black", + args = list(sd = 0.45)) + + expand_limits(y = c(0, 1.0), x = c(-3, 3)) + + scale_y_continuous(breaks = seq(0, 1, 0.25)) + + ggtitle("More sharp") + + theme_scoringutils() + +p2 <- + ggplot(data.frame(x = seq(-8, 8, 0.01), + x_example = rnorm(n = 1601, mean = 0, sd = 1.25)), + aes(x = x)) + + # geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_function(fun = dnorm, colour = "black", + args = list(sd = 1.25)) + + expand_limits(y = c(0, 1.0), x = c(-3, 3)) + + scale_y_continuous(breaks = seq(0, 1, 0.25)) + + ggtitle("Less sharp") + + theme_scoringutils() + +p1 + p2 + +ggsave("inst/manuscript/plots/sharpness-illustration.png", + width = 10, height = 4) + +p21 <- ggplot(data.frame(x = seq(-8, 8, 0.01), + x_example = rnorm(n = 1601, mean = 0, sd = 1.05)), + aes(x = x)) + + geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_function(fun = dnorm, colour = "black", + args = list(sd = 1)) + + ggtitle("Well calibrated") + + labs(y = "Density") + + theme_scoringutils() + +p22 <- ggplot(data.frame(x = seq(-8, 8, 0.01), + x_example = rnorm(n = 1601, mean = 1, sd = 1.05)), + aes(x = x)) + + geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_function(fun = dnorm, colour = "black", + args = list(mean = 2, sd = 1)) + + ggtitle("Badly calibrated") + + labs(y = "Density") + + theme_scoringutils() + +p23 <- ggplot(data.frame(x = seq(-8, 8, 0.01), + x_example = rnorm(n = 1601, mean = 0, sd = 1.05)), + aes(x = x)) + + geom_histogram(aes(x = x_example, y = ..density..), colour = "white", fill = "grey50") + + geom_function(fun = dnorm, colour = "black", + args = list(mean = 0, sd = 2.05)) + + ggtitle("Badly calibrated") + + labs(y = "Density") + + theme_scoringutils() + + +p21 + p22 + p23 + +ggsave("inst/manuscript/plots/calibration-illustration.png", + width = 10, height = 4) + +(p1 + p2) / + (p21 + p22 + p23) +ggsave("inst/manuscript/plots/calibration-sharpness-illustration.png", + width = 8, height = 4.5) diff --git a/inst/manuscript/R/toy-example-calibration.R b/inst/manuscript/R/toy-example-calibration.R new file mode 100644 index 000000000..001efe22b --- /dev/null +++ b/inst/manuscript/R/toy-example-calibration.R @@ -0,0 +1,145 @@ +library(scoringutils) +library(patchwork) +library(ggplot2) +library(data.table) +library(dplyr) + +# generate predictions data.table +n_truth = 1000 +n_samples = 1000 +predictions <- rnorm(n_truth * n_samples, 0, 1) +true_values1 <- rnorm(n_samples) +true_values2 <- rnorm(n_samples, mean = 0.5) +true_values3 <- rnorm(n_samples, sd = 1.4) +true_values4 <- rnorm(n_samples, sd = 0.7) + +df <- data.table(prediction = rep(predictions, each = 4), + 
id = rep(1:n_truth, each = n_samples), + true_value = rep(c(true_values1, true_values2, + true_values3, true_values4), each = n_samples), + sample = 1:n_samples, + `true_distr` = rep(c("Truth: N(0, 1)", "Truth: N(0.5, 1)", + "Truth: N(0, 1.4)", "Truth: N(0, 0.7)"), + each = n_truth * n_samples)) + +df[, true_distr := factor(`true_distr`, + levels = c("Truth: N(0, 1)", "Truth: N(0.5, 1)", + "Truth: N(0, 1.4)", "Truth: N(0, 0.7)"))] + +# obtain scores and create a table based on scores ----------------------------- +res <- score(df) +res_summarised <- summarise_scores(res, by = c("true_distr")) + +scores_table_plot <- plot_score_table(res_summarised, y = "true_distr") + + coord_flip() + + theme_scoringutils() + + theme(axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5)) + + theme(legend.position = "none") + + +# create histogram true vs. predicted ------------------------------------------ +pred_hist <- df |> + ggplot(aes(x = true_value)) + + facet_wrap(~ true_distr, nrow = 1) + + geom_histogram(aes(y=..density..), + fill = "grey", + colour = "dark grey") + + geom_function(fun = dnorm, colour = "black") + + theme_scoringutils() + + labs(y = "Density", x = "Value") + + +# create pit plots ------------------------------------------------------------- +pit <- pit(df, by = "true_distr") +pit_plots <- plot_pit(pit) + + facet_wrap(~ true_distr, nrow = 1) + + theme_scoringutils() + +# create interval and quantile coverage plots ---------------------------------- +# create coverage plots by transforming to quantile format first +quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) +df_quantile <- sample_to_quantile(df, + quantiles = quantiles) + +res_quantile <- score(df_quantile) +res_quantile <- summarise_scores(res_quantile, + by = c("true_distr", "range", "quantile")) + +res_quantile[, true_distr := factor(true_distr, + levels = c("Truth: N(0, 1)", "Truth: N(0.5, 1)", + "Truth: N(0, 1.4)", "Truth: N(0, 0.7)"))] + +res_quantile[, model := true_distr] + +interval_coverage <- plot_interval_coverage(res_quantile) + + facet_wrap(~ true_distr, nrow = 1) + + theme_scoringutils() + +quantile_coverage <- plot_quantile_coverage(res_quantile) + + facet_wrap(~ model, nrow = 1) + + theme_scoringutils() + + +# bring plot together ---------------------------------------------------------- +p <- pred_hist / + pit_plots / + interval_coverage / + quantile_coverage / + scores_table_plot + + plot_layout(guides = 'collect') & + theme(legend.position = "none") & + theme(panel.spacing = unit(2, "lines")) + +ggsave("inst/manuscript/plots/calibration-diagnostic-examples.png", width = 11.5, height = 11) + + + + + + + + + + + + + +# +# # plot with observations +# true_value_plot <- ggplot(data = data.frame(x = true_values), +# aes(x = x)) + +# geom_histogram(aes(y = ..density..), +# fill = "grey", +# colour = "dark grey") + +# theme_minimal() + +# labs(x = "True values", +# y = "Density") + +# theme(legend.position = "bottom") +# +# # plot with standard normal distribution +# standard_normal <- true_value_plot + +# geom_function(fun = dnorm, colour = "black") + +# ggtitle("Normal(0, 1)") +# +# # plot with shifted mean +# shifted_mean <- true_value_plot + +# geom_function(fun = dnorm, colour = "black", args = list(mean = 0.5)) + +# ggtitle("Normal(0.5, 1)") +# +# # plot with overdispersion +# overdispersion <- true_value_plot + +# geom_function(fun = dnorm, colour = "black", args = list(sd = 1.4)) + +# ggtitle("Normal(0, 1.4)") +# +# # plot with underdispersion +# underdispersion <- true_value_plot + 
+#   geom_function(fun = dnorm, colour = "black", args = list(sd = 0.7)) + +#   ggtitle("Normal(0, 0.7)") +# +# (standard_normal | shifted_mean | overdispersion | underdispersion) / +#   pit_plots / +#   interval_coverage / +#   quantile_coverage +#   # / +#   # gridExtra::tableGrob(scores_table) +# diff --git a/inst/manuscript/R/toy-example-locality.R b/inst/manuscript/R/toy-example-locality.R new file mode 100644 index 000000000..4d57b6a7c --- /dev/null +++ b/inst/manuscript/R/toy-example-locality.R @@ -0,0 +1,60 @@ +library(scoringutils) +library(ggplot2) +library(data.table) + +quantiles <- seq(0.1, 1, 0.1) +forecast_a <- c(0.3, 0.35, 0.25, 0.04, 0.02, 0.01, 0.01, 0.01, 0.005, 0.005) +forecast_b <- c(0.1, 0.35, 0.05, 0.02, 0.01, 0.01, 0.05, 0.07, 0.2, 0.14) +true_value <- 2 + +df <- data.table( +  forecaster = rep(c("Forecaster A", "Forecaster B"), each = 10), +  outcome = rep(1:10, 2), +  prob = c(forecast_a, forecast_b), +  true_value = true_value +) + +df[, crps := sum((cumsum(prob) - (outcome >= true_value))^2), +   by = c("forecaster")] +df[, log_score := -log(prob[outcome == true_value]), +   by = c("forecaster")] +df[, mean_pred := sum(prob * outcome) / sum(prob), +   by = c("forecaster")] +df[, sd_pred := sqrt(sum(prob * (outcome - mean_pred)^2)), +   by = c("forecaster")] +df[, dss := ((true_value - mean_pred)^2) / sd_pred^2 + 2 * log(sd_pred), +   by = c("forecaster")] + +# sense-check: compute crps using samples +sample_a <- sample(x=1:10, size = 1e5, replace = TRUE, prob = forecast_a) +sample_b <- sample(x=1:10, size = 1e5, replace = TRUE, prob = forecast_b) + +crps_a <- scoringutils::crps_sample(2, t(as.matrix(sample_a))) +crps_b <- scoringutils::crps_sample(2, t(as.matrix(sample_b))) + +annotation <- df[, .(forecaster, crps, log_score, dss)] |> unique() + + +ggplot(df, aes(x = factor(outcome), y = prob)) + +  geom_col() + +  geom_text(data = annotation, x = 4, y = 0.3, hjust = "left", +            aes(label = paste("CRPS: ", round(crps, 3)))) + +  geom_text(data = annotation, x = 4, y = 0.27, hjust = "left", +            aes(label = paste("Log score: ", round(log_score, 3)))) + +  geom_text(data = annotation, x = 4, y = 0.24, hjust = "left", +            aes(label = paste("DSS: ", round(dss, 3)))) + +  facet_wrap(~ forecaster) + +  geom_vline(aes(xintercept = 2), linetype = "dashed") + +  theme_scoringutils() + +  labs(y = "Probability assigned", x = "Possible outcomes") + +ggsave("inst/manuscript/plots/score-locality.png", height = 3, width = 8) + + +# test with WIS.
Problem: that doesn't work at the moment as intervals are +# not symmetric +# dt <- copy(df) +# dt[, quantile := cumsum(prob_b)] +# dt[, prediction := outcome] +# score(dt[, .(true_value, prediction, quantile)]) diff --git a/inst/manuscript/R/toy-example-mean-sd-deviation.R b/inst/manuscript/R/toy-example-mean-sd-deviation.R new file mode 100644 index 000000000..d11aaed03 --- /dev/null +++ b/inst/manuscript/R/toy-example-mean-sd-deviation.R @@ -0,0 +1,96 @@ +library(data.table) +library(scoringutils) +library(ggplot2) +library(scoringRules) +library(dplyr) +library(patchwork) + +# define simulation parameters +n_steps = 500 +n_rep <- 5000 +true_mean = 0 +true_sd = 5 +true_values <- rnorm(n = n_rep, mean = true_mean, sd = true_sd) +sd <- 10^(seq(-1, 1.6, length.out = n_steps)) +mu <- seq(0, 100, length.out = n_steps) + + +# look at effect of change in sd on score +res_sd <- data.table(sd = sd, + mu = true_mean) + +res_sd[, `:=` (CRPS = mean(scoringRules::crps(y = true_values, family = "normal", mean = mu, sd = sd)), + `Log score` = mean(scoringRules::logs(y = true_values, family = "normal", mean = mu, sd = sd)), + DSS = mean(scoringRules::dss_norm(y = true_values, mean = mu, sd = sd))), + by = "sd"] + +deviation_sd <- res_sd |> + melt(id.vars = c("sd", "mu"), value.name = "value", variable.name = "Score") |> + ggplot(aes(x = sd, y = value, color = Score)) + + geom_line() + + theme_scoringutils() + + geom_vline(aes(xintercept = 5), linetype = "dashed") + + coord_cartesian(ylim=c(0, 20)) + + annotate(geom="text", x=6, y=17, label="Sd of true \ndata-generating \ndistribution: 5", + color="black", hjust = "left") + + labs(y = "Score", x = "Standard deviation of predictive distribution") + +# +# # look at effect of change in mean on score +# res_mu <- data.table(sd = true_sd, +# mu = mu, +# crps = NA_real_, +# dss = NA_real_, +# logs = NA_real_) +# +# res_mu[, `:=` (crps = mean(crps_sample(y = true_values, family = "normal", mean = mu, sd = sd)), +# logs = mean(logs_sample(y = true_values, family = "normal", mean = mu, sd = sd)), +# dss = mean(dss_norm(y = true_values, mean = mu, sd = sd))), +# by = "mu"] +# +# deviation_mu <- res_mu |> +# melt(id.vars = c("sd", "mu"), value.name = "value", variable.name = "Score") |> +# ggplot(aes(x = mu, y = value, color = Score)) + +# geom_line() + +# theme_minimal() + +# labs(y = "Score", x = "Mean of predictive distribution") + +# geom_vline(aes(xintercept = 0), linetype = "dashed") + +# coord_cartesian(ylim=c(0, 150)) + + + +# define simulation parameters +true_values <- seq(0, 4, length.out = 1000) +true_sd = 1 +true_mu = 0 + +# look at effect of change in sd on score +res_mu2 <- data.table(true_value = true_values) + +res_mu2[, `:=` (CRPS = scoringRules::crps(y = true_value, family = "normal", mean = true_mu, sd = true_sd) / 10, + `Log score` = scoringRules::logs(y = true_value, family = "normal", mean = true_mu, sd = true_sd) / 10, + DSS = scoringRules::dss_norm(y = true_value, mean = true_mu, sd = true_sd) / 10)] + +label_fn <- function(x) { + paste(10*x) +} + +outlier <- res_mu2 |> + melt(id.vars = c("true_value"), value.name = "value", variable.name = "Score") |> + ggplot(aes(x = true_value, y = value, color = Score)) + + geom_line() + + theme_scoringutils() + + annotate(geom="text", x=0, y=.8, label="Predictive distribution: \nN(0,1)", + color="black", hjust = "left") + + labs(y = "Score", x = "Observed value") + + # geom_vline(aes(xintercept = 0), linetype = "dashed") + + geom_area(stat = "function", fun = dnorm, color = "grey", fill = "grey", 
             alpha = 0.5, xlim = c(0, 4)) + +  scale_y_continuous(label = label_fn) + + +deviation_sd + outlier + +  plot_layout(guides = "collect") & +  theme(legend.position = "bottom") + +ggsave("inst/manuscript/plots/score-deviation-sd-mu.png", +       height = 3, width = 8) diff --git a/inst/manuscript/R/toy-example-sample-convergence.R b/inst/manuscript/R/toy-example-sample-convergence.R new file mode 100644 index 000000000..6c8040d60 --- /dev/null +++ b/inst/manuscript/R/toy-example-sample-convergence.R @@ -0,0 +1,75 @@ +library(data.table) +library(scoringutils) +library(ggplot2) + +sample_sizes <- seq(50, 5000, 50) +sample_sizes <- round(1 * 10^(seq(1, 5, 0.1))) +n_rep <- 500 + +true_value = 0 +sd <- 3 +mu <- 2 + +# analytical scores +true_crps <- scoringRules::crps(y = 0, family = "normal", mean = mu, sd = sd) +true_logs <- scoringRules::logs(y = 0, family = "normal", mean = mu, sd = sd) +true_dss <- scoringRules::dss_norm(y = 0, mean = mu, sd = sd) + +# +# results <- list() +# for (i in sample_sizes) { +#   samples <- as.data.table( +#     replicate(n_rep, +#               rnorm(n = i, mean = mu, sd = sd)) +#   ) +#   setnames(samples, as.character(1:n_rep)) +#   samples[, sample := 1:i] +#   samples <- melt(samples, id.vars = "sample", +#                   variable.name = "repetition", +#                   value.name = "prediction") +#   samples[, true_value := true_value] +#   results[[paste(i)]] <- score( +#     samples, metrics = c("crps", "log_score", "dss") +#   )[, n_samples := i] +# } +# saveRDS(results, "inst/manuscript/plots/sample-convergence.Rda") + +results <- readRDS("inst/manuscript/plots/sample-convergence.Rda") +results2 <- rbindlist(results) +results2 <- melt(results2, id.vars = c("n_samples", "repetition", "model"), +                 variable.name = "score") + +label_fn <- function(x) { +  ifelse (x >= 1000, +          paste0(x / 1000, "k"), +          x) +} + +df <- results2[, .(mean = mean(value), +                   quantile_0.05 = quantile(value, 0.05), +                   quantile_0.25 = quantile(value, 0.25), +                   quantile_0.75 = quantile(value, 0.75), +                   quantile_0.95 = quantile(value, 0.95)), +               by = c("n_samples", "score")] +df[score == "crps", true_score := true_crps] +df[score == "log_score", true_score := true_logs] +df[score == "dss", true_score := true_dss] + +df[, score := ifelse(score == "dss", "DSS", +                     ifelse(score == "crps", "CRPS", +                            "Log score"))] + +ggplot(df, aes(x = n_samples)) + +  geom_line(aes(y = mean)) + +  geom_ribbon(aes(ymax = quantile_0.95, ymin = quantile_0.05), +              alpha = 0.1) + +  geom_ribbon(aes(ymax = quantile_0.75, ymin = quantile_0.25), +              alpha = 0.3) + +  geom_hline(aes(yintercept = true_score), linetype = "dashed") + +  facet_wrap(~ score, scales = "free") + +  scale_x_continuous(trans = "log10", labels = label_fn) + +  theme_scoringutils() + +  labs(x = "Number of samples", +       y = "Score based on samples") + +ggsave("inst/manuscript/plots/sample-convergence.png", height = 3, width = 8) diff --git a/inst/manuscript/apa.csl b/inst/manuscript/apa.csl new file mode 100644 index 000000000..bbb7fdb73 --- /dev/null +++ b/inst/manuscript/apa.csl @@ -0,0 +1,1914 @@ + + diff --git a/inst/manuscript/jss.bst b/inst/manuscript/jss.bst new file mode 100644 index 000000000..ae7541547 --- /dev/null +++ b/inst/manuscript/jss.bst @@ -0,0 +1,1653 @@ +%% +%% This is file `jss.bst', +%% generated with the docstrip utility.
+%% +%% The original source files were: +%% +%% merlin.mbs (with options: `ay,nat,nm-rvx,keyxyr,dt-beg,yr-par,note-yr,tit-qq,atit-u,trnum-it,vol-bf,volp-com,num-xser,pre-edn,isbn,issn,edpar,pp,ed,xedn,xand,etal-it,revdata,eprint,url,url-blk,doi,nfss') +%% +%% ** BibTeX style file for JSS publications (http://www.jstatsoft.org/) +%% +%% License: GPL-2 | GPL-3 + % =============================================================== + % IMPORTANT NOTICE: + % This bibliographic style (bst) file has been generated from one or + % more master bibliographic style (mbs) files, listed above, provided + % with kind permission of Patrick W Daly. + % + % This generated file can be redistributed and/or modified under the terms + % of the General Public License (Version 2 or 3). + % =============================================================== + % Name and version information of the main mbs file: + % \ProvidesFile{merlin.mbs}[2011/11/18 4.33 (PWD, AO, DPC)] + % For use with BibTeX version 0.99a or later + %------------------------------------------------------------------- + % This bibliography style file is intended for texts in ENGLISH + % This is an author-year citation style bibliography. As such, it is + % non-standard LaTeX, and requires a special package file to function properly. + % Such a package is natbib.sty by Patrick W. Daly + % The form of the \bibitem entries is + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}... + % The essential feature is that the label (the part in brackets) consists + % of the author names, as they should appear in the citation, with the year + % in parentheses following. There must be no space before the opening + % parenthesis! + % With natbib v5.3, a full list of authors may also follow the year. + % In natbib.sty, it is possible to define the type of enclosures that is + % really wanted (brackets or parentheses), but in either case, there must + % be parentheses in the label. + % The \cite command functions as follows: + % \citet{key} ==>> Jones et al. (1990) + % \citet*{key} ==>> Jones, Baker, and Smith (1990) + % \citep{key} ==>> (Jones et al., 1990) + % \citep*{key} ==>> (Jones, Baker, and Smith, 1990) + % \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \citep[e.g.][]{key} ==>> (e.g. Jones et al., 1990) + % \citep[e.g.][p. 32]{key} ==>> (e.g. Jones et al., 1990, p. 32) + % \citeauthor{key} ==>> Jones et al. 
+ % \citeauthor*{key} ==>> Jones, Baker, and Smith + % \citeyear{key} ==>> 1990 + %--------------------------------------------------------------------- + +ENTRY + { address + archive + author + booktitle + chapter + collaboration + doi + edition + editor + eid + eprint + howpublished + institution + isbn + issn + journal + key + month + note + number + numpages + organization + pages + publisher + school + series + title + type + url + volume + year + } + {} + { label extra.label sort.label short.list } +INTEGERS { output.state before.all mid.sentence after.sentence after.block } +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} +STRINGS { s t} +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} +FUNCTION {add.blank} +{ " " * before.all 'output.state := +} + +FUNCTION {date.block} +{ + new.block +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} +FUNCTION {non.stop} +{ duplicate$ + "}" * add.period$ + #-1 #1 substring$ "." = +} + +STRINGS {z} + +FUNCTION {remove.dots} +{ 'z := + "" + { z empty$ not } + { z #1 #2 substring$ + duplicate$ "\." = + { z #3 global.max$ substring$ 'z := * } + { pop$ + z #1 #1 substring$ + z #2 global.max$ substring$ 'z := + duplicate$ "." = 'pop$ + { * } + if$ + } + if$ + } + while$ +} +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "\emph{" swap$ * "}" * } + if$ +} +FUNCTION {bolden} +{ duplicate$ empty$ + { pop$ "" } + { "\textbf{" swap$ * "}" * } + if$ +} +FUNCTION {tie.or.space.prefix} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ +} + +FUNCTION {capitalize} +{ "u" change.case$ "t" change.case$ } + +FUNCTION {space.word} +{ " " swap$ * " " * } + % Here are the language-specific definitions for explicit words. + % Each function has a name bbl.xxx where xxx is the English word. + % The language selected here is ENGLISH +FUNCTION {bbl.and} +{ "and"} + +FUNCTION {bbl.etal} +{ "et~al." } + +FUNCTION {bbl.editors} +{ "eds." } + +FUNCTION {bbl.editor} +{ "ed." } + +FUNCTION {bbl.edby} +{ "edited by" } + +FUNCTION {bbl.edition} +{ "edition" } + +FUNCTION {bbl.volume} +{ "volume" } + +FUNCTION {bbl.of} +{ "of" } + +FUNCTION {bbl.number} +{ "number" } + +FUNCTION {bbl.nr} +{ "no." } + +FUNCTION {bbl.in} +{ "in" } + +FUNCTION {bbl.pages} +{ "pp." } + +FUNCTION {bbl.page} +{ "p." 
} + +FUNCTION {bbl.eidpp} +{ "pages" } + +FUNCTION {bbl.chapter} +{ "chapter" } + +FUNCTION {bbl.techrep} +{ "Technical Report" } + +FUNCTION {bbl.mthesis} +{ "Master's thesis" } + +FUNCTION {bbl.phdthesis} +{ "Ph.D. thesis" } + +MACRO {jan} {"January"} + +MACRO {feb} {"February"} + +MACRO {mar} {"March"} + +MACRO {apr} {"April"} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"August"} + +MACRO {sep} {"September"} + +MACRO {oct} {"October"} + +MACRO {nov} {"November"} + +MACRO {dec} {"December"} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications of the ACM"} + +MACRO {ibmjrd} {"IBM Journal of Research and Development"} + +MACRO {ibmsj} {"IBM Systems Journal"} + +MACRO {ieeese} {"IEEE Transactions on Software Engineering"} + +MACRO {ieeetc} {"IEEE Transactions on Computers"} + +MACRO {ieeetcad} + {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"Journal of the ACM"} + +MACRO {jcss} {"Journal of Computer and System Sciences"} + +MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM Journal on Computing"} + +MACRO {tocs} {"ACM Transactions on Computer Systems"} + +MACRO {tods} {"ACM Transactions on Database Systems"} + +MACRO {tog} {"ACM Transactions on Graphics"} + +MACRO {toms} {"ACM Transactions on Mathematical Software"} + +MACRO {toois} {"ACM Transactions on Office Information Systems"} + +MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} +FUNCTION {bibinfo.check} +{ swap$ + duplicate$ missing$ + { + pop$ pop$ + "" + } + { duplicate$ empty$ + { + swap$ pop$ + } + { swap$ + pop$ + } + if$ + } + if$ +} +FUNCTION {bibinfo.warn} +{ swap$ + duplicate$ missing$ + { + swap$ "missing " swap$ * " in " * cite$ * warning$ pop$ + "" + } + { duplicate$ empty$ + { + swap$ "empty " swap$ * " in " * cite$ * warning$ + } + { swap$ + pop$ + } + if$ + } + if$ +} +FUNCTION {format.eprint} +{ eprint duplicate$ empty$ + 'skip$ + { "\eprint" + archive empty$ + 'skip$ + { "[" * archive * "]" * } + if$ + "{" * swap$ * "}" * + } + if$ +} +FUNCTION {format.url} +{ + url + duplicate$ empty$ + { pop$ "" } + { "\urlprefix\url{" swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + + +STRINGS { bibinfo} + +FUNCTION {format.names} +{ 'bibinfo := + duplicate$ empty$ 'skip$ { + 's := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}{ jj}{ f{}}" + format.name$ + remove.dots + bibinfo bibinfo.check + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + "," * + t "others" = + { + " " * bbl.etal emphasize * + } + { " " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ + } if$ +} +FUNCTION {format.names.ed} +{ + 'bibinfo := + duplicate$ empty$ 'skip$ { + 's := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{f{}~}{vv~}{ll}{ jj}" + format.name$ + remove.dots + bibinfo bibinfo.check + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + "," * + t "others" = + { + + " " * bbl.etal emphasize * + } + { " " * t * } + if$ + } + if$ + } + 't + if$ + 
nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ + } if$ +} +FUNCTION {format.key} +{ empty$ + { key field.or.null } + { "" } + if$ +} + +FUNCTION {format.authors} +{ author "author" format.names + duplicate$ empty$ 'skip$ + { collaboration "collaboration" bibinfo.check + duplicate$ empty$ 'skip$ + { " (" swap$ * ")" * } + if$ + * + } + if$ +} +FUNCTION {get.bbl.editor} +{ editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ } + +FUNCTION {format.editors} +{ editor "editor" format.names duplicate$ empty$ 'skip$ + { + " " * + get.bbl.editor + "(" swap$ * ")" * + * + } + if$ +} +FUNCTION {format.isbn} +{ isbn "isbn" bibinfo.check + duplicate$ empty$ 'skip$ + { + new.block + "ISBN " swap$ * + } + if$ +} + +FUNCTION {format.issn} +{ issn "issn" bibinfo.check + duplicate$ empty$ 'skip$ + { + new.block + "ISSN " swap$ * + } + if$ +} + +FUNCTION {format.doi} +{ doi empty$ + { "" } + { + new.block + "\doi{" doi * "}" * + } + if$ +} +FUNCTION {format.note} +{ + note empty$ + { "" } + { note #1 #1 substring$ + duplicate$ "{" = + 'skip$ + { output.state mid.sentence = + { "l" } + { "u" } + if$ + change.case$ + } + if$ + note #2 global.max$ substring$ * "note" bibinfo.check + } + if$ +} + +FUNCTION {format.title} +{ title + "title" bibinfo.check + duplicate$ empty$ 'skip$ + { + "\enquote{" swap$ * + add.period$ "}" * + } + if$ +} +FUNCTION {format.full.names} +{'s := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}" format.name$ + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + t "others" = + { + " " * bbl.etal emphasize * + } + { + numnames #2 > + { "," * } + 'skip$ + if$ + bbl.and + space.word * t * + } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {author.editor.key.full} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {author.key.full} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {editor.key.full} +{ editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ +} + +FUNCTION {make.full.names} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.full + { type$ "proceedings" = + 'editor.key.full + 'author.key.full + if$ + } + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem[{" write$ + label write$ + ")" make.full.names duplicate$ short.list = + { pop$ } + { * } + if$ + "}]{" * write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {n.dashify} +{ + 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {word.in} +{ bbl.in capitalize + " " * } + +FUNCTION {format.date} +{ year "year" bibinfo.check duplicate$ empty$ + { + "empty year in " cite$ * "; set to ????" * warning$ + pop$ "????" 
+ } + 'skip$ + if$ + extra.label * + before.all 'output.state := + " (" swap$ * ")" * +} +FUNCTION {format.btitle} +{ title "title" bibinfo.check + duplicate$ empty$ 'skip$ + { + emphasize + } + if$ +} +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { bbl.volume volume tie.or.space.prefix + "volume" bibinfo.check * * + series "series" bibinfo.check + duplicate$ empty$ 'pop$ + { swap$ bbl.of space.word * swap$ + emphasize * } + if$ + "volume and number" number either.or.check + } + if$ +} +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { series empty$ + { number "number" bibinfo.check } + { output.state mid.sentence = + { bbl.number } + { bbl.number capitalize } + if$ + number tie.or.space.prefix "number" bibinfo.check * * + bbl.in space.word * + series "series" bibinfo.check * + } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition duplicate$ empty$ 'skip$ + { + output.state mid.sentence = + { "l" } + { "t" } + if$ change.case$ + "edition" bibinfo.check + " " * bbl.edition * + } + if$ +} +INTEGERS { multiresult } +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} +FUNCTION {format.pages} +{ pages duplicate$ empty$ 'skip$ + { duplicate$ multi.page.check + { + bbl.pages swap$ + n.dashify + } + { + bbl.page swap$ + } + if$ + tie.or.space.prefix + "pages" bibinfo.check + * * + } + if$ +} +FUNCTION {format.journal.pages} +{ pages duplicate$ empty$ 'pop$ + { swap$ duplicate$ empty$ + { pop$ pop$ format.pages } + { + ", " * + swap$ + n.dashify + "pages" bibinfo.check + * + } + if$ + } + if$ +} +FUNCTION {format.journal.eid} +{ eid "eid" bibinfo.check + duplicate$ empty$ 'pop$ + { swap$ duplicate$ empty$ 'skip$ + { + ", " * + } + if$ + swap$ * + numpages empty$ 'skip$ + { bbl.eidpp numpages tie.or.space.prefix + "numpages" bibinfo.check * * + " (" swap$ * ")" * * + } + if$ + } + if$ +} +FUNCTION {format.vol.num.pages} +{ volume field.or.null + duplicate$ empty$ 'skip$ + { + "volume" bibinfo.check + } + if$ + bolden + number "number" bibinfo.check duplicate$ empty$ 'skip$ + { + swap$ duplicate$ empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + swap$ + "(" swap$ * ")" * + } + if$ * + eid empty$ + { format.journal.pages } + { format.journal.eid } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { bbl.chapter } + { type "l" change.case$ + "type" bibinfo.check + } + if$ + chapter tie.or.space.prefix + "chapter" bibinfo.check + * * + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.booktitle} +{ + booktitle "booktitle" bibinfo.check + emphasize +} +FUNCTION {format.in.ed.booktitle} +{ format.booktitle duplicate$ empty$ 'skip$ + { + editor "editor" format.names.ed duplicate$ empty$ 'pop$ + { + " " * + get.bbl.editor + "(" swap$ * "), " * + * swap$ + * } + if$ + word.in swap$ * + } + if$ +} +FUNCTION {format.thesis.type} +{ type duplicate$ empty$ + 'pop$ + { swap$ pop$ + "t" change.case$ "type" bibinfo.check + } + if$ +} +FUNCTION {format.tr.number} +{ number "number" bibinfo.check + type duplicate$ empty$ + { pop$ bbl.techrep } + 'skip$ + if$ + 
"type" bibinfo.check + swap$ duplicate$ empty$ + { pop$ "t" change.case$ } + { tie.or.space.prefix * * } + if$ +} +FUNCTION {format.article.crossref} +{ + word.in + " \cite{" * crossref * "}" * +} +FUNCTION {format.book.crossref} +{ volume duplicate$ empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + pop$ word.in + } + { bbl.volume + capitalize + swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word * + } + if$ + " \cite{" * crossref * "}" * +} +FUNCTION {format.incoll.inproc.crossref} +{ + word.in + " \cite{" * crossref * "}" * +} +FUNCTION {format.org.or.pub} +{ 't := + "" + address empty$ t empty$ and + 'skip$ + { + t empty$ + { address "address" bibinfo.check * + } + { t * + address empty$ + 'skip$ + { ", " * address "address" bibinfo.check * } + if$ + } + if$ + } + if$ +} +FUNCTION {format.publisher.address} +{ publisher "publisher" bibinfo.warn format.org.or.pub +} + +FUNCTION {format.organization.address} +{ organization "organization" bibinfo.check format.org.or.pub +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + crossref missing$ + { + journal + "journal" bibinfo.check + emphasize + "journal" output.check + format.vol.num.pages output + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + format.issn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + date.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + format.edition output + new.sentence + format.publisher.address output + } + { + new.block + format.book.crossref output.nonnull + } + if$ + format.isbn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {booklet} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + howpublished "howpublished" bibinfo.check output + address "address" bibinfo.check output + format.isbn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + date.block + format.btitle "title" output.check + crossref missing$ + { + format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + format.edition output + new.sentence + format.publisher.address output + } + { + format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + crossref missing$ + { format.isbn output } + 'skip$ + if$ + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry 
+} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.edition output + format.chapter.pages output + new.sentence + format.publisher.address output + format.isbn output + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + new.sentence + publisher empty$ + { format.organization.address output } + { organization "organization" bibinfo.check output + format.publisher.address output + } + if$ + format.isbn output + format.issn output + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {conference} { inproceedings } +FUNCTION {manual} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + date.block + format.btitle "title" output.check + organization address new.block.checkb + organization "organization" bibinfo.check output + address "address" bibinfo.check output + format.edition output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.btitle + "title" output.check + new.block + bbl.mthesis format.thesis.type output.nonnull + school "school" bibinfo.warn output + address "address" bibinfo.check output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + date.block + format.title output + new.block + howpublished "howpublished" bibinfo.check output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.btitle + "title" output.check + new.block + bbl.phdthesis format.thesis.type output.nonnull + school "school" bibinfo.warn output + address "address" bibinfo.check output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + format.editors output + editor format.key output + format.date "year" output.check + date.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + new.sentence + publisher empty$ + { format.organization.address output } + { organization "organization" bibinfo.check output + format.publisher.address output + } + if$ + format.isbn output 
+ format.issn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title + "title" output.check + new.block + format.tr.number emphasize output.nonnull + institution "institution" bibinfo.warn output + address "address" bibinfo.check output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + format.doi output + new.block + format.note "note" output.check + format.eprint output + format.url output + fin.entry +} + +FUNCTION {default.type} { misc } +READ +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} +INTEGERS { len } +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} +FUNCTION {format.lab.names} +{ 's := + "" 't := + s #1 "{vv~}{ll}" format.name$ + s num.names$ duplicate$ + #2 > + { pop$ + " " * bbl.etal emphasize * + } + { #2 < + 'skip$ + { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { + " " * bbl.etal emphasize * + } + { bbl.and space.word * s #2 "{vv~}{ll}" format.name$ + * } + if$ + } + if$ + } + if$ +} + +FUNCTION {author.key.label} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.editor.key.label} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.lab.names } + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {editor.key.label} +{ editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.lab.names } + if$ +} + +FUNCTION {calc.short.authors} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.label + { type$ "proceedings" = + 'editor.key.label + 'author.key.label + if$ + } + if$ + 'short.list := +} + +FUNCTION {calc.label} +{ calc.short.authors + short.list + "(" + * + year duplicate$ empty$ + short.list key field.or.null = or + { pop$ "" } + 'skip$ + if$ + * + 'label := +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" + format.name$ 't := + nameptr #1 > + { + " " * + namesleft #1 = t "others" = and + { "zzzzz" 't := } + 'skip$ + if$ + t sortify * + } + { t sortify * } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + chop.word + sortify + #1 global.max$ substring$ +} +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} +FUNCTION {editor.sort} +{ editor empty$ + { key empty$ + { "to sort, need editor or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { 
editor sort.format.names } + if$ +} +FUNCTION {presort} +{ calc.label + label sortify + " " + * + type$ "book" = + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.sort + 'author.sort + if$ + } + if$ + #1 entry.max$ substring$ + 'sort.label := + sort.label + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} +SORT +STRINGS { last.label next.extra } +INTEGERS { last.extra.num last.extra.num.extended last.extra.num.blank number.label } +FUNCTION {initialize.extra.label.stuff} +{ #0 int.to.chr$ 'last.label := + "" 'next.extra := + #0 'last.extra.num := + "a" chr.to.int$ #1 - 'last.extra.num.blank := + last.extra.num.blank 'last.extra.num.extended := + #0 'number.label := +} +FUNCTION {forward.pass} +{ last.label label = + { last.extra.num #1 + 'last.extra.num := + last.extra.num "z" chr.to.int$ > + { "a" chr.to.int$ 'last.extra.num := + last.extra.num.extended #1 + 'last.extra.num.extended := + } + 'skip$ + if$ + last.extra.num.extended last.extra.num.blank > + { last.extra.num.extended int.to.chr$ + last.extra.num int.to.chr$ + * 'extra.label := } + { last.extra.num int.to.chr$ 'extra.label := } + if$ + } + { "a" chr.to.int$ 'last.extra.num := + "" 'extra.label := + label 'last.label := + } + if$ + number.label #1 + 'number.label := +} +FUNCTION {reverse.pass} +{ next.extra "b" = + { "a" 'extra.label := } + 'skip$ + if$ + extra.label 'next.extra := + extra.label + duplicate$ empty$ + 'skip$ + { "{\natexlab{" swap$ * "}}" * } + if$ + 'extra.label := + label extra.label * 'label := +} +EXECUTE {initialize.extra.label.stuff} +ITERATE {forward.pass} +REVERSE {reverse.pass} +FUNCTION {bib.sort.order} +{ sort.label + " " + * + year field.or.null sortify + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} +ITERATE {bib.sort.order} +SORT +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" number.label int.to.str$ * "}" * + write$ newline$ + "\newcommand{\enquote}[1]{``#1''}" + write$ newline$ + "\providecommand{\natexlab}[1]{#1}" + write$ newline$ + "\providecommand{\url}[1]{\texttt{#1}}" + write$ newline$ + "\providecommand{\urlprefix}{URL }" + write$ newline$ + "\expandafter\ifx\csname urlstyle\endcsname\relax" + write$ newline$ + " \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else" + write$ newline$ + " \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup \urlstyle{rm}\Url}\fi" + write$ newline$ + "\providecommand{\eprint}[2][]{\url{#2}}" + write$ newline$ +} +EXECUTE {begin.bib} +EXECUTE {init.state.consts} +ITERATE {call.type$} +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} +EXECUTE {end.bib} +%% End of customized bst file +%% +%% End of file `jss.bst'. 
diff --git a/inst/manuscript/jss.cls b/inst/manuscript/jss.cls new file mode 100644 index 000000000..eb33e7f2c --- /dev/null +++ b/inst/manuscript/jss.cls @@ -0,0 +1,495 @@ +%% My own packages added here +\usepackage{colortbl} +\usepackage{xcolor} +\usepackage{multirow} + +%% +%% This is file `jss.cls' +\def\fileversion{3.2} +\def\filename{jss} +\def\filedate{2020/12/09} +%% +%% Package `jss' to use with LaTeX2e for JSS publications (http://www.jstatsoft.org/) +%% License: GPL-2 | GPL-3 +%% Copyright: (C) Achim Zeileis +%% Please report errors to Achim.Zeileis@R-project.org +%% +\NeedsTeXFormat{LaTeX2e} +\ProvidesClass{jss}[\filedate\space\fileversion\space jss class by Achim Zeileis] +%% options +\newif\if@article +\newif\if@codesnippet +\newif\if@bookreview +\newif\if@softwarereview +\newif\if@review +\newif\if@shortnames +\newif\if@nojss +\newif\if@notitle +\newif\if@noheadings +\newif\if@nofooter + +\@articletrue +\@codesnippetfalse +\@bookreviewfalse +\@softwarereviewfalse +\@reviewfalse +\@shortnamesfalse +\@nojssfalse +\@notitlefalse +\@noheadingsfalse +\@nofooterfalse + +\DeclareOption{article}{\@articletrue% + \@codesnippetfalse \@bookreviewfalse \@softwarereviewfalse} +\DeclareOption{codesnippet}{\@articlefalse% + \@codesnippettrue \@bookreviewfalse \@softwarereviewfalse} +\DeclareOption{bookreview}{\@articlefalse% + \@codesnippetfalse \@bookreviewtrue \@softwarereviewfalse} +\DeclareOption{softwarereview}{\@articlefalse% + \@codesnippetfalse \@bookreviewfalse \@softwarereviewtrue} +\DeclareOption{shortnames}{\@shortnamestrue} +\DeclareOption{nojss}{\@nojsstrue} +\DeclareOption{notitle}{\@notitletrue} +\DeclareOption{noheadings}{\@noheadingstrue} +\DeclareOption{nofooter}{\@nofootertrue} + +\ProcessOptions +\LoadClass[11pt,a4paper,twoside]{article} +%% required packages +\RequirePackage{graphicx,color,ae,fancyvrb} +\RequirePackage[T1]{fontenc} +\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} +\IfFileExists{lmodern.sty}{\RequirePackage{lmodern}}{} +%% bibliography +\if@shortnames + \usepackage[authoryear,round]{natbib} +\else + \usepackage[authoryear,round,longnamesfirst]{natbib} +\fi +\bibpunct{(}{)}{;}{a}{}{,} +\bibliographystyle{jss} +%% page layout +\topmargin 0pt +\textheight 46\baselineskip +\advance\textheight by \topskip +\oddsidemargin 0.1in +\evensidemargin 0.15in +\marginparwidth 1in +\oddsidemargin 0.125in +\evensidemargin 0.125in +\marginparwidth 0.75in +\textwidth 6.125in +%% paragraphs +\setlength{\parskip}{0.7ex plus0.1ex minus0.1ex} +\setlength{\parindent}{0em} +%% for all publications +\newcommand{\Address}[1]{\def\@Address{#1}} +\newcommand{\Plaintitle}[1]{\def\@Plaintitle{#1}} +\newcommand{\Shorttitle}[1]{\def\@Shorttitle{#1}} +\newcommand{\Plainauthor}[1]{\def\@Plainauthor{#1}} +\newcommand{\Volume}[1]{\def\@Volume{#1}} +\newcommand{\Year}[1]{\def\@Year{#1}} +\newcommand{\Month}[1]{\def\@Month{#1}} +\newcommand{\Issue}[1]{\def\@Issue{#1}} +\newcommand{\Submitdate}[1]{\def\@Submitdate{#1}} +%% for articles and code snippets +\newcommand{\Acceptdate}[1]{\def\@Acceptdate{#1}} +\newcommand{\Abstract}[1]{\def\@Abstract{#1}} +\newcommand{\Keywords}[1]{\def\@Keywords{#1}} +\newcommand{\Plainkeywords}[1]{\def\@Plainkeywords{#1}} +%% for book and software reviews +\newcommand{\Reviewer}[1]{\def\@Reviewer{#1}} +\newcommand{\Booktitle}[1]{\def\@Booktitle{#1}} +\newcommand{\Bookauthor}[1]{\def\@Bookauthor{#1}} +\newcommand{\Publisher}[1]{\def\@Publisher{#1}} +\newcommand{\Pubaddress}[1]{\def\@Pubaddress{#1}} +\newcommand{\Pubyear}[1]{\def\@Pubyear{#1}} 
+\newcommand{\ISBN}[1]{\def\@ISBN{#1}} +\newcommand{\Pages}[1]{\def\@Pages{#1}} +\newcommand{\Price}[1]{\def\@Price{#1}} +\newcommand{\Plainreviewer}[1]{\def\@Plainreviewer{#1}} +\newcommand{\Softwaretitle}[1]{\def\@Softwaretitle{#1}} +\newcommand{\URL}[1]{\def\@URL{#1}} +\newcommand{\DOI}[1]{\def\@DOI{#1}} +%% for internal use +\newcommand{\Seriesname}[1]{\def\@Seriesname{#1}} +\newcommand{\Hypersubject}[1]{\def\@Hypersubject{#1}} +\newcommand{\Hyperauthor}[1]{\def\@Hyperauthor{#1}} +\newcommand{\Footername}[1]{\def\@Footername{#1}} +\newcommand{\Firstdate}[1]{\def\@Firstdate{#1}} +\newcommand{\Seconddate}[1]{\def\@Seconddate{#1}} +\newcommand{\Reviewauthor}[1]{\def\@Reviewauthor{#1}} +%% defaults +\author{Firstname Lastname\\Affiliation} +\title{Title} +\Abstract{---!!!---an abstract is required---!!!---} +\Plainauthor{\@author} +\Volume{VV} +\Year{YYYY} +\Month{MMMMMM} +\Issue{II} +\Submitdate{yyyy-mm-dd} +\Acceptdate{yyyy-mm-dd} +\Address{ + Firstname Lastname\\ + Affiliation\\ + Address, Country\\ + E-mail: \email{name@address}\\ + URL: \url{http://link/to/webpage/} +} + +\Reviewer{Firstname Lastname\\Affiliation} +\Plainreviewer{Firstname Lastname} +\Booktitle{Book Title} +\Bookauthor{Book Author} +\Publisher{Publisher} +\Pubaddress{Publisher's Address} +\Pubyear{YYY} +\ISBN{x-xxxxx-xxx-x} +\Pages{xv + 123} +\Price{USD 69.95 (P)} +\URL{http://link/to/webpage/} +\DOI{10.18637/jss.v000.i00} +\if@article + \Seriesname{Issue} + \Hypersubject{Journal of Statistical Software} + \Plaintitle{\@title} + \Shorttitle{\@title} + \Plainkeywords{\@Keywords} +\fi + +\if@codesnippet + \Seriesname{Code Snippet} + \Hypersubject{Journal of Statistical Software -- Code Snippets} + \Plaintitle{\@title} + \Shorttitle{\@title} + \Plainkeywords{\@Keywords} +\fi + +\if@bookreview + \Seriesname{Book Review} + \Hypersubject{Journal of Statistical Software -- Book Reviews} + \Plaintitle{\@Booktitle} + \Shorttitle{\@Booktitle} + \Reviewauthor{\@Bookauthor\\ + \@Publisher, \@Pubaddress, \@Pubyear.\\ + ISBN~\@ISBN. \@Pages~pp. \@Price.\\ + \url{\@URL}} + \Plainkeywords{} + \@reviewtrue +\fi + +\if@softwarereview + \Seriesname{Software Review} + \Hypersubject{Journal of Statistical Software -- Software Reviews} + \Plaintitle{\@Softwaretitle} + \Shorttitle{\@Softwaretitle} + \Booktitle{\@Softwaretitle} + \Reviewauthor{\@Publisher, \@Pubaddress. 
\@Price.\\ + \url{\@URL}} + \Plainkeywords{} + \@reviewtrue +\fi + +\if@review + \Hyperauthor{\@Plainreviewer} + \Keywords{} + \Footername{Reviewer} + \Firstdate{\textit{Published:} \@Submitdate} + \Seconddate{} +\else + \Hyperauthor{\@Plainauthor} + \Keywords{---!!!---at least one keyword is required---!!!---} + \Footername{Affiliation} + \Firstdate{\textit{Submitted:} \@Submitdate} + \Seconddate{\textit{Accepted:} \@Acceptdate} +\fi +%% Sweave(-like) +\DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{fontshape=sl} +\newenvironment{Schunk}{}{} +\DefineVerbatimEnvironment{Code}{Verbatim}{} +\DefineVerbatimEnvironment{CodeInput}{Verbatim}{fontshape=sl} +\DefineVerbatimEnvironment{CodeOutput}{Verbatim}{} +\newenvironment{CodeChunk}{}{} +\setkeys{Gin}{width=0.8\textwidth} +%% footer +\newlength{\footerskip} +\setlength{\footerskip}{2.5\baselineskip plus 2ex minus 0.5ex} + +\newcommand{\makefooter}{% + \vspace{\footerskip} + + \if@nojss + \begin{samepage} + \textbf{\large \@Footername: \nopagebreak}\\[.3\baselineskip] \nopagebreak + \@Address \nopagebreak + \end{samepage} + \else + \begin{samepage} + \textbf{\large \@Footername: \nopagebreak}\\[.3\baselineskip] \nopagebreak + \@Address \nopagebreak + \vfill + \hrule \nopagebreak + \vspace{.1\baselineskip} + {\fontfamily{pzc} \fontsize{13}{15} \selectfont Journal of Statistical Software} + \hfill + \url{http://www.jstatsoft.org/}\\ \nopagebreak + published by the Foundation for Open Access Statistics + \hfill + \url{http://www.foastat.org/}\\[.3\baselineskip] \nopagebreak + {\@Month{} \@Year, Volume~\@Volume, \@Seriesname~\@Issue} + \hfill + \@Firstdate\\ \nopagebreak + {\href{https://doi.org/\@DOI}{\tt doi:\@DOI}} + \hfill + \@Seconddate \nopagebreak + \vspace{.3\baselineskip} + \hrule + \end{samepage} + \fi +} +\if@nofooter + %% \AtEndDocument{\makefooter} +\else + \AtEndDocument{\makefooter} +\fi +%% required packages +\RequirePackage{hyperref} +%% new \maketitle +\def\@myoddhead{ + {\color{white} JSS}\\[-1.42cm] + \hspace{-2em} \includegraphics[height=23mm,keepaspectratio]{jsslogo} \hfill + \parbox[b][23mm]{118mm}{\hrule height 3pt + \center{ + {\fontfamily{pzc} \fontsize{28}{32} \selectfont Journal of Statistical Software} + \vfill + {\it \small \@Month{} \@Year, Volume~\@Volume, \@Seriesname~\@Issue.% + \hfill \href{https://doi.org/\@DOI}{doi:\,\@DOI}}}\\[0.1cm] + \hrule height 3pt}} +\if@review + \renewcommand{\maketitle}{ + \if@nojss + %% \@oddhead{\@myoddhead}\\[3\baselineskip] + \else + \@oddhead{\@myoddhead}\\[3\baselineskip] + \fi + {\large + \noindent + Reviewer: \@Reviewer + \vspace{\baselineskip} + \hrule + \vspace{\baselineskip} + \textbf{\@Booktitle} + \begin{quotation} \noindent + \@Reviewauthor + \end{quotation} + \vspace{0.7\baselineskip} + \hrule + \vspace{1.3\baselineskip} + } + + \thispagestyle{empty} + \if@nojss + \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hyperauthor}} + \else + \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} + \fi + \pagestyle{myheadings} + } +\else + \def\maketitle{ + \if@nojss + %% \@oddhead{\@myoddhead} \par + \else + \@oddhead{\@myoddhead} \par + \fi + \begingroup + \def\thefootnote{\fnsymbol{footnote}} + \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} + \long\def\@makefntext##1{\parindent 1em\noindent + \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1} + \@maketitle \@thanks + \endgroup + \setcounter{footnote}{0} + + \if@noheadings + %% 
\markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} + \else + \thispagestyle{empty} + \if@nojss + \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hyperauthor}} + \else + \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} + \fi + \pagestyle{myheadings} + \fi + + \let\maketitle\relax \let\@maketitle\relax + \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax + } + + \def\@maketitle{\vbox{\hsize\textwidth \linewidth\hsize + \if@nojss + %% \vskip 1in + \else + \vskip 1in + \fi + {\centering + {\LARGE\bf \@title\par} + \vskip 0.2in plus 1fil minus 0.1in + { + \def\and{\unskip\enspace{\rm and}\enspace}% + \def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil + \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\ignorespaces}% + \def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup + \vskip 0.1in plus 1fil minus 0.05in + \hbox to \linewidth\bgroup\rule{\z@}{10pt} \hfil\hfil + \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\ignorespaces} + \hbox to \linewidth\bgroup\rule{\z@}{10pt} \hfil\hfil + \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\@author + \end{tabular}\hss\egroup + \hfil\hfil\egroup} + \vskip 0.3in minus 0.1in + \hrule + \begin{abstract} + \@Abstract + \end{abstract}} + \textit{Keywords}:~\@Keywords. + \vskip 0.1in minus 0.05in + \hrule + \vskip 0.2in minus 0.1in + }} +\fi +%% sections, subsections, and subsubsections +\newlength{\preXLskip} +\newlength{\preLskip} +\newlength{\preMskip} +\newlength{\preSskip} +\newlength{\postMskip} +\newlength{\postSskip} +\setlength{\preXLskip}{1.8\baselineskip plus 0.5ex minus 0ex} +\setlength{\preLskip}{1.5\baselineskip plus 0.3ex minus 0ex} +\setlength{\preMskip}{1\baselineskip plus 0.2ex minus 0ex} +\setlength{\preSskip}{.8\baselineskip plus 0.2ex minus 0ex} +\setlength{\postMskip}{.5\baselineskip plus 0ex minus 0.1ex} +\setlength{\postSskip}{.3\baselineskip plus 0ex minus 0.1ex} + +\newcommand{\jsssec}[2][default]{\vskip \preXLskip% + \pdfbookmark[1]{#1}{Section.\thesection.#1}% + \refstepcounter{section}% + \centerline{\textbf{\Large \thesection. #2}} \nopagebreak + \vskip \postMskip \nopagebreak} +\newcommand{\jsssecnn}[1]{\vskip \preXLskip% + \centerline{\textbf{\Large #1}} \nopagebreak + \vskip \postMskip \nopagebreak} + +\newcommand{\jsssubsec}[2][default]{\vskip \preMskip% + \pdfbookmark[2]{#1}{Subsection.\thesubsection.#1}% + \refstepcounter{subsection}% + \textbf{\large \thesubsection. 
#2} \nopagebreak + \vskip \postSskip \nopagebreak} +\newcommand{\jsssubsecnn}[1]{\vskip \preMskip% + \textbf{\large #1} \nopagebreak + \vskip \postSskip \nopagebreak} + +\newcommand{\jsssubsubsec}[2][default]{\vskip \preSskip% + \pdfbookmark[3]{#1}{Subsubsection.\thesubsubsection.#1}% + \refstepcounter{subsubsection}% + {\large \textit{#2}} \nopagebreak + \vskip \postSskip \nopagebreak} +\newcommand{\jsssubsubsecnn}[1]{\vskip \preSskip% + {\textit{\large #1}} \nopagebreak + \vskip \postSskip \nopagebreak} + +\newcommand{\jsssimplesec}[2][default]{\vskip \preLskip% +%% \pdfbookmark[1]{#1}{Section.\thesection.#1}% + \refstepcounter{section}% + \textbf{\large #1} \nopagebreak + \vskip \postSskip \nopagebreak} +\newcommand{\jsssimplesecnn}[1]{\vskip \preLskip% + \textbf{\large #1} \nopagebreak + \vskip \postSskip \nopagebreak} + +\if@review + \renewcommand{\section}{\secdef \jsssimplesec \jsssimplesecnn} + \renewcommand{\subsection}{\secdef \jsssimplesec \jsssimplesecnn} + \renewcommand{\subsubsection}{\secdef \jsssimplesec \jsssimplesecnn} +\else + \renewcommand{\section}{\secdef \jsssec \jsssecnn} + \renewcommand{\subsection}{\secdef \jsssubsec \jsssubsecnn} + \renewcommand{\subsubsection}{\secdef \jsssubsubsec \jsssubsubsecnn} +\fi +%% colors +\definecolor{Red}{rgb}{0.5,0,0} +\definecolor{Blue}{rgb}{0,0,0.5} +\if@review + \hypersetup{% + hyperindex = {true}, + colorlinks = {true}, + linktocpage = {true}, + plainpages = {false}, + linkcolor = {Blue}, + citecolor = {Blue}, + urlcolor = {Red}, + pdfstartview = {Fit}, + pdfpagemode = {None}, + pdfview = {XYZ null null null} + } +\else + \hypersetup{% + hyperindex = {true}, + colorlinks = {true}, + linktocpage = {true}, + plainpages = {false}, + linkcolor = {Blue}, + citecolor = {Blue}, + urlcolor = {Red}, + pdfstartview = {Fit}, + pdfpagemode = {UseOutlines}, + pdfview = {XYZ null null null} + } +\fi +\if@nojss + \AtBeginDocument{ + \hypersetup{% + pdfauthor = {\@Hyperauthor}, + pdftitle = {\@Plaintitle}, + pdfkeywords = {\@Plainkeywords} + } + } +\else + \AtBeginDocument{ + \hypersetup{% + pdfauthor = {\@Hyperauthor}, + pdftitle = {\@Plaintitle}, + pdfsubject = {\@Hypersubject}, + pdfkeywords = {\@Plainkeywords} + } + } +\fi +\if@notitle + %% \AtBeginDocument{\maketitle} +\else + \@ifundefined{AddToHook}{\AtBeginDocument{\maketitle}}{\AddToHook{begindocument}[maketitle]{\maketitle}} +\fi +%% commands +\newcommand\code{\bgroup\@makeother\_\@makeother\~\@makeother\$\@codex} +\def\@codex#1{{\normalfont\ttfamily\hyphenchar\font=-1 #1}\egroup} +%%\let\code=\texttt +\let\proglang=\textsf +\newcommand{\pkg}[1]{{\fontseries{m}\fontseries{b}\selectfont #1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} +\ifx\csname urlstyle\endcsname\relax + \newcommand\@doi[1]{doi:\discretionary{}{}{}#1}\else + \newcommand\@doi{doi:\discretionary{}{}{}\begingroup +\urlstyle{tt}\Url}\fi +\newcommand{\doi}[1]{\href{https://doi.org/#1}{\normalfont\texttt{\@doi{#1}}}} +\newcommand{\E}{\mathsf{E}} +\newcommand{\VAR}{\mathsf{VAR}} +\newcommand{\COV}{\mathsf{COV}} +\newcommand{\Prob}{\mathsf{P}} +\endinput +%% +%% End of file `jss.cls'. 
diff --git a/inst/manuscript/jsslogo.jpg b/inst/manuscript/jsslogo.jpg new file mode 100644 index 000000000..4751aef9d Binary files /dev/null and b/inst/manuscript/jsslogo.jpg differ diff --git a/inst/manuscript/manuscript.Rmd b/inst/manuscript/manuscript.Rmd new file mode 100644 index 000000000..90bc7bc41 --- /dev/null +++ b/inst/manuscript/manuscript.Rmd @@ -0,0 +1,655 @@ +--- +documentclass: jss +author: + - name: Nikos I. Bosse + affiliation: London School of Hygiene & Tropical Medicine (LSHTM) \AND + address: | + | Centre for Mathematical Modelling of Infectious Diseases + | London School of Hygiene & Tropical Medicine + | Keppel Street + | London WC1E 7HT + email: \email{nikos.bosse@lshtm.ac.uk} + url: https://lshtm.ac.uk + - name: Hugo Gruson + affiliation: LSHTM + address: | + | Centre for Mathematical Modelling of Infectious Diseases + | London School of Hygiene & Tropical Medicine + | Keppel Street + | London WC1E 7HT + email: \email{hugo.gruson@lshtm.ac.uk} + - name: Anne Cori + affiliation: Imperial College London + address: | + | Department + | Imperial College London + | Street + | City and Post code + email: \email{a.cori@imperial.ac.uk} + - name: Edwin van Leeuwen + affiliation: Public Health England \AND + address: | + | Department + | Institution + | Street + | City and Post code + email: \email{Edwin.VanLeeuwen@phe.gov.uk} + - name: Johannes Bracher + affiliation: Karlsruhe Institute of Technology + address: | + | Department + | Institution + | Street + | City and Post code + email: \email{johannes.bracher@kit.edu} + - name: Sebastian Funk + affiliation: LSHTM + # use this syntax to add text on several lines + address: | + | Centre for Mathematical Modelling of Infectious Diseases + | London School of Hygiene & Tropical Medicine + | Keppel Street + | London WC1E 7HT + email: \email{sebastian.funk@lshtm.ac.uk} + - name: Sam Abbott + affiliation: LSHTM + # use this syntax to add text on several lines + address: | + | Centre for Mathematical Modelling of Infectious Diseases + | London School of Hygiene & Tropical Medicine + | Keppel Street + | London WC1E 7HT + email: \email{sam.abbott@lshtm.ac.uk} +title: + formatted: "Evaluating Forecasts using \\pkg{scoringutils} in \\proglang{R}" + # If you use tex in the formatted title, also supply version without + plain: "Evaluating Forecasts using scoringutils in R" + # For running headers, if needed + short: "Evaluating Forecasts with \\pkg{scoringutils} in \\proglang{R}" +abstract: > + Forecasts play an important role in a variety of fields. Their role in informing public policy has attracted increased attention from the general public with the emergence of the Covid-19 pandemic. Much theoretical work has been done on the development of proper scoring rules and other scoring metrics that can help evaluate these forecasts. However, there is a vast choice of scoring rules available for different types of data, and there has been less of a focus on facilitating their use by those without expertise in forecast evaluation. In this paper we introduce \pkg{scoringutils}, an \proglang{R} package that, given a set of forecasts and truth data, automatically chooses, applies and visualises a set of appropriate scores. It gives the user access to a wide range of scoring metrics for various types of forecasts as well as a variety of ways to visualise the evaluation. 
We give an overview of the evaluation process and the metrics implemented in \pkg{scoringutils} and show an example evaluation of forecasts for COVID-19 cases and deaths submitted to the European Forecast Hub between May and September 2021. +keywords: + # at least one keyword must be supplied + formatted: [keywords, not capitalized, "\\proglang{R}"] + plain: [keywords, not capitalized, R] +preamble: > + \usepackage{amsmath} + \shortcites{reichCollaborativeMultiyearMultimodel2019, kukkonenReviewOperationalRegionalscale2012, funkShorttermForecastsInform2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021, bracherNationalSubnationalShortterm2021} + \usepackage{amssymb} + \usepackage{caption} + \captionsetup[table]{skip=10pt} + \newcommand{\class}[1]{`\code{#1}'} + \newcommand{\fct}[1]{\code{#1()}} + +bibliography: + - references.bib + - scoringutils-paper.bib +# doesn't work... +# csl: apa.csl +biblio-style: "apalike" +output: + rticles::jss_article: + citation_package: natbib + toc: true + +--- + +```{r, setup, include=FALSE} +options(prompt = 'R> ', continue = '+ ', width = 70) + +library(knitr) +library(dplyr) +library(magrittr) +library(kableExtra) +library(formatR) +options(width = 70) +opts_chunk$set( + cache = TRUE, + warning = FALSE, + message = FALSE, + out.width = "100%" +) +``` + +```{r eval = FALSE, include=FALSE} +# trackdown::update_file("inst/manuscript/manuscript.Rmd", gfile = "scoringutils-paper", hide_code = FALSE) +trackdown::download_file("inst/manuscript/manuscript.Rmd", gfile = "scoringutils-paper") +``` + +# Introduction + +Good forecasts are of great interest to decision makers in various fields like finance \citep{timmermannForecastingMethodsFinance2018, elliottForecastingEconomicsFinance2016}, weather prediction \citep{gneitingWeatherForecastingEnsemble2005, kukkonenReviewOperationalRegionalscale2012} or infectious disease modelling \citep{reichCollaborativeMultiyearMultimodel2019, funkShorttermForecastsInform2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021}. Throughout the COVID-19 pandemic, forecasts from different research institutions on COVID-19 targets like reported cases and deaths have been systematically collated by Forecast Hubs in the US, Germany and Poland, and Europe. An integral part of assessing and improving their usefulness is forecast evaluation. For decades, researchers have developed and refined an arsenal of techniques not only to forecast, but also to evaluate these forecasts (see e.g. \cite{bracherEvaluatingEpidemicForecasts2021}, \cite{funkAssessingPerformanceRealtime2019}, \cite{gneitingProbabilisticForecastsCalibration2007}, and \cite{gneitingStrictlyProperScoring2007}). Yet even with this rich body of research available, implementing a complete forecast evaluation is not trivial. + +There already exist a few \proglang{R} \citep{R} packages which implement a wide variety of scoring metrics. The \pkg{scoringRules} package \citep{scoringRules}, for example, offers a very extensive collection of functions with efficient implementations of different proper scoring rules (some of which are directly reused in \pkg{scoringutils}). However, it focuses on proper scoring rules only and does not implement other evaluation metrics or provide functionality to compare forecast performance visually.
It also does not provide functionality to score predictive distributions that are represented by a set of quantiles and does not handle situations with missing data. The \pkg{topmodels} package \citep{topmodels} provides users with various graphical tools to visually evaluate and compare different forecasts. However, the package is, as of today, not on CRAN and the visualisations are only available for forecasts based on the model classes `lm`, `glm`, `crch` and `disttree`. The \pkg{tscount} package \citep{tscount} offers functionality to fit flexible time series models and compare the quality of the generated forecasts using different proper scoring rules. The application of these rules, however, is confined to forecasts of class `tsglm`. Other packages like \pkg{Metrics} \citep{Metrics} and \pkg{MLmetrics} \citep{MLmetrics} provide a collection of metrics geared towards machine learning problems, but also lack plotting functionality as well as support for a variety of metrics and tools commonly used to evaluate and compare probabilistic forecasts. In contrast to the above, \pkg{scoringutils} not only provides metrics to score individual forecasts, but attempts to simplify the process of comparing different forecasts against each other. It accepts arbitrary forecasts regardless of how they were created and automatically returns a variety of suitable metrics, depending on the type and format of the input forecast. It also provides functionality to facilitate comparing forecasters even when individual forecasts are missing and offers a range of plotting functions to visualise different aspects of forecast performance. The \pkg{scoringutils} package is also unique in its extensive support for forecasts in a quantile format like the one used in various COVID-19 Forecast Hubs \citep{cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021, bracherNationalSubnationalShortterm2021}. + +The remainder of this section will provide an overview of the fundamental ideas behind forecast evaluation. Section \ref{metrics} will give a detailed theoretical explanation of the evaluation metrics in \pkg{scoringutils} and when to use them. Section \ref{evaluation-example} will demonstrate how to conduct an evaluation in \pkg{scoringutils} using forecasts of COVID-19 submitted to the European Forecast Hub \citep{europeancovid-19forecasthubEuropeanCovid19Forecast2021} as a case study. In the following we will use the words “model” and “forecaster” interchangeably, regardless of how forecasts were actually generated. + + + +## Forecast types and forecast formats + +In its most general sense, a forecast is the forecaster’s stated belief about the future \citep{gneitingStrictlyProperScoring2007} that can come in many different forms. Quantitative forecasts are either point forecasts or probabilistic in nature and can make statements about continuous, discrete or binary outcome variables. Point forecasts only give one single number for the expected or most likely outcome. Probabilistic forecasts, in contrast, by definition provide a full predictive probability distribution. This makes them much more useful in any applied setting, as we learn about the forecaster's uncertainty and their belief about all aspects of the underlying data-generating distribution.
+ +The \pkg{scoringutils} package focuses on probabilistic forecasts, and specifically on forecasts that are represented through either predictive samples or through quantiles of the predictive distributions, making it possible to evaluate arbitrary forecasts even if a closed form (i.e. parametric) distribution is not available. A variety of parametric distributions can be scored directly using \pkg{scoringRules}, but this is not yet supported in \pkg{scoringutils}. + +Predictive samples offer a lot of flexibility. However, the number of samples necessary to store in order to represent the predictive distribution satisfactorily may be high, and any finite set of samples can only represent the underlying distribution with limited precision. This loss of precision is usually especially pronounced in the tails of the predictive distribution. For that reason, often quantiles or central prediction intervals are reported instead. Recent examples of this are the COVID-19 Forecast Hubs \citep{cramerCOVID19ForecastHub2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, bracherNationalSubnationalShortterm2021, europeancovid-19forecasthubEuropeanCovid19Forecast2021}. For binary or multinomial prediction targets, common in many classification problems, a probabilistic forecast is represented by the probability that an outcome will come true. Table \ref{tab:forecast-types} summarises the different forecast types and formats. +\begin{table}[] +\centering +\caption{Forecast and forecast target types. Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.} +\label{tab:forecast-types} +\begin{tabular}{@{}lll@{}} +\toprule +Forecast type & Target type & Representation of the predictive distribution \\ \midrule +\multicolumn{1}{l}{Point forecast} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}continuous\\ discrete\\ binary\end{tabular}} & one single number for the predicted outcome \\ \midrule +\multicolumn{1}{l}{\multirow{2}{*}{Probabilistic forecast}} & \begin{tabular}[c]{@{}l@{}}continuous\\ discrete\end{tabular} & \begin{tabular}[c]{@{}l@{}}predictive samples, \\ closed analytical form, \\ or quantiles \end{tabular} \\ \cmidrule(lr){2-3} +\multicolumn{1}{l}{} & binary & binary probabilities \\ \bottomrule +\end{tabular} +\end{table}
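+
+As a toy illustration of these formats (not using any package functions; the numbers and quantile levels are chosen arbitrarily), the same predictive distribution can be represented either by a set of predictive samples or, more compactly, by a set of quantiles derived from them:
+
+```{r forecast-format-illustration}
+set.seed(1)
+
+# a probabilistic forecast represented by 1000 predictive samples
+predictive_samples <- rnorm(1000, mean = 45, sd = 10)
+
+# the same forecast condensed into a quantile-based format
+quantile(predictive_samples, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
+```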
+ +## The forecasting paradigm + +Any forecaster should aim to provide a predictive distribution $F$ that is equal to the unknown true data-generating distribution $G$ \citep{gneitingProbabilisticForecastsCalibration2007}. For an ideal forecast, we therefore have + +$$ F = G, $$ + +where $F$ and $G$ are both cumulative distribution functions. As we don't know the true data-generating distribution $G$, we cannot assess the similarity between the two distributions directly. \cite{gneitingProbabilisticForecastsCalibration2007} instead suggest focusing on two central aspects of the predictive distribution: calibration and sharpness (illustrated in Figure \ref{fig:forecast-paradigm}). Calibration refers to the statistical consistency (i.e. absence of systematic deviations) between the predictive distribution and the observations. One can distinguish several forms of calibration which are discussed in detail by \cite{gneitingProbabilisticForecastsCalibration2007}. Sharpness is a feature of the forecast only and describes how concentrated the predictive distribution is, i.e. how informative the forecasts are. The general forecasting paradigm states that a forecaster should maximise sharpness of the predictive distribution subject to calibration. + +```{r forecast-paradigm, echo = FALSE, fig.cap= "Schematic illustration of sharpness (top row) and calibration (bottom row). Sharpness is a property of the forecast (black distributions) only, while calibration is the consistency between the forecasts and the observations drawn from the true data-generating distribution (grey histograms). For illustrative purposes, the probability density function (PDF) rather than the cumulative distribution function (CDF) is shown.", fig.show="hold"} + +include_graphics("plots/calibration-sharpness-illustration.png") +``` + +# Scoring metrics implemented in \pkg{scoringutils} {short-title="Scoring metrics implemented in scoringutils" #metrics} + +An overview of the metrics implemented in \pkg{scoringutils} can be found in Table \ref{tab:metrics-summary}, while Table \ref{tab:score-table-detailed} in the Appendix provides mathematical definitions as well as more thorough explanations. Some of the metrics in \pkg{scoringutils} focus on sharpness or calibration alone, while others are so-called proper scoring rules \citep{gneitingStrictlyProperScoring2007}, which combine both aspects into a single number. A scoring rule is proper if the ideal forecaster (i.e. one using the data-generating distribution) receives the lowest score in expectation. The scoring rule is called strictly proper if its optimum is unique. This makes sure that a forecaster evaluated by a strictly proper scoring rule is always incentivised to state their best estimate. Looking at calibration and sharpness independently can be helpful to learn about specific aspects of the forecasts and improve them. Proper scoring rules are especially useful to assess and rank predictive performance of forecasters. + + + +\newpage + +```{r metrics-summary, echo = FALSE, cache=FALSE} +# load data file from inst directory of the package +data <- readRDS(system.file("metrics-overview/metrics-summary.Rda", package = "scoringutils")) + +cap <- "Summary table of scores available in \\pkg{scoringutils}. A version of this table which includes corresponding function names can be accessed in \\proglang{R} by calling \\code{scoringutils::metrics\\_summary}. Not all metrics are implemented for all types of forecasts and forecasting formats, as indicated by tickmarks, 'x', or '$\\sim$' (depends). D (discrete forecasts based on predictive samples), C (continuous, sample-based forecasts), B (binary forecasts), and Q (any forecasts in a quantile-based format) refer to different forecast formats. While the distinction is not clear-cut (e.g. binary is a special case of discrete), it is useful in the context of the package as available functions and functionality may differ. For a more detailed description of the terms used in this table see the corresponding paper sections (e.g. for 'global' and 'local' see section \\ref{localglobal}). For mathematical definitions of the metrics see Table \\ref{tab:score-table-detailed}."
+ +data[, 1:6] |> + kableExtra::kbl(format = "latex", booktabs = TRUE, + escape = FALSE, + longtable = TRUE, + caption = cap, + align = c("lccccl"), + linesep = c('\\addlinespace')) |> + kableExtra::column_spec(1, width = "2.9cm") |> + kableExtra::column_spec(6, width = "9.3cm") |> + kableExtra::kable_styling(latex_options = c("striped", + "repeat_header", "scale_down"), + # full_width = TRUE, + font_size = 7.5) +``` + +\newpage + +## Assessing calibration + +There are many ways in which a forecast can be miscalibrated, i.e. systematically deviate from the observations. We also discuss metrics measuring bias, as this is an especially common form of miscalibration. + +### Probabilistic calibration {#probabilistic-calibration} + +The form of calibration most commonly focused on is called probabilistic calibration (for other forms of calibration, see \cite{gneitingProbabilisticForecastsCalibration2007}). Probabilistic calibration means that the forecast distributions are consistent with the true data-generating distributions in the sense that on average, $\tau$\% of true observations will be below the corresponding $\tau$\% quantiles of the cumulative forecast distributions. This also implies that nominal coverage of the central prediction intervals (the proportion of observations that should ideally be covered by the prediction intervals) corresponds to empirical coverage (the proportion of observations actually covered). For example, the central 50\% prediction intervals of all forecasts should contain around 50\% of the observed values, the 90\% central intervals should contain around 90\% of observations, etc. Forecasts that are too narrow and do not cover the required proportion of observations are called overconfident or under-dispersed, while predictive distributions that are too wide are often called underconfident, over-dispersed or conservative. + +One can visualise probabilistic calibration in different ways and \pkg{scoringutils} offers three options. *Interval coverage plots* (see row 3 in Figure \ref{fig:calibration-plots}) show nominal coverage of the central prediction intervals against the percentage of observed values that fall inside the corresponding prediction intervals. Ideally, forecasters should lie on the diagonal line. A shift to the left means a forecaster is too conservative and issues a predictive distribution that is too wide and covers more of the observed values than needed. A shift to the right means a forecaster is overconfident and the forecast distribution is too narrow. Similarly, *quantile coverage plots* (row 4 in Figure \ref{fig:calibration-plots}) show the quantiles of the predictive distribution against the percentage of observed values below the corresponding predictive quantiles. For quantiles below the median, a line to the right of the diagonal (predictive quantiles lower than the quantiles of the data-generating distribution) means a forecaster is too conservative, while for quantiles above the median, a line to the left of the diagonal line (predictive quantiles higher than the quantiles of the data-generating distribution) implies conservative predictions.
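+
+The following minimal base \proglang{R} sketch (not a \pkg{scoringutils} function) illustrates how empirical coverage can be computed from a hypothetical `data.frame` `df` with one row per forecast and columns `lower_50` and `upper_50` (the bounds of the central 50\% prediction interval), `q_75` (the predictive 0.75 quantile) and `true_value` (the observed value):
+
+```{r empirical-coverage-sketch, eval = FALSE}
+# proportion of observations falling inside the central 50% prediction interval
+mean(df$true_value >= df$lower_50 & df$true_value <= df$upper_50)
+
+# proportion of observations falling below the predictive 0.75 quantile
+mean(df$true_value <= df$q_75)
+```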
+ +A similar way to visualise the same information is the probability integral transform (PIT) histogram \citep{dawidPresentPositionPotential1984}. The PIT is equal to $F(x_t)$, the cumulative predictive distribution evaluated at the observed value $x_t$ (see more details in Table \ref{tab:score-table-detailed}). If forecasts are probabilistically calibrated, then the transformed values will be uniformly distributed (for a proof see e.g. @angusProbabilityIntegralTransform1994). When plotting a histogram of PIT values (see row 2 in Figure \ref{fig:calibration-plots}), bias usually leads to a triangular shape, a U-shaped histogram corresponds to forecasts that are under-dispersed (too sharp), and a hump shape appears when forecasts are over-dispersed (too wide). + +It is in principle possible to formally test probabilistic calibration, for example by employing a test on the uniformity of PIT values (e.g. the Anderson-Darling test \citep{andersonAsymptoticTheoryCertain1952}). In practice this can be difficult as forecasts and therefore also PIT values are often correlated. We therefore advise against using formal tests in most applied settings. It is also important to note that uniformity of the PIT histogram (or a diagonal on quantile and interval coverage plots) indicates probabilistic calibration, but does not guarantee that forecasts are indeed calibrated in every relevant sense. \cite{gneitingProbabilisticForecastsCalibration2007, hamillInterpretationRankHistograms2001a} provide examples with different forecasters who are clearly mis-calibrated, but have uniform PIT histograms. + +```{r calibration-plots, echo = FALSE, fig.pos = "!h", out.extra = "", fig.cap= "Top row: Standard normal forecasting distribution (black, constant across all examples) against observations sampled from different data-generating distributions (grey histograms based on 1000 samples). Second row: PIT histograms based on the standard normal predictive distribution and the sampled observations shown in the first row. Third row: Empirical vs. nominal coverage of the central prediction intervals for simulated observations and predictions. Areas shaded in green indicate that the forecasts are too wide (i.e. underconfident), covering more true values than they actually should, while areas in white indicate that the model generates too narrow predictions and fails to cover the desired proportion of true values with its prediction intervals. Fourth row: Quantile coverage values, with green areas indicating too wide (i.e. conservative) forecasts. Last row: Scores for the standard normal predictive distribution and the observations drawn from different data-generating distributions.", cache = FALSE} +include_graphics("plots/calibration-diagnostic-examples.png") + +# readRDS("plots/calibration-diagnostic-examples.Rda") |> +# print() +# +# kableExtra::kbl(format = "latex", booktabs = TRUE, +# escape = FALSE, +# linesep = c('\\addlinespace')) + +```
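+
+As a minimal sketch of the idea behind the PIT (using base \proglang{R} rather than package functions), the PIT value of a sample-based forecast can be approximated by the proportion of predictive samples that fall below the corresponding observed value; for calibrated forecasts these values are approximately uniform. The simulated example below, with both observations and predictive samples drawn from a standard normal distribution, is purely illustrative:
+
+```{r pit-illustration-sketch}
+set.seed(123)
+n_forecasts <- 1000
+n_samples <- 500
+
+# observations drawn from the true data-generating distribution, here N(0, 1)
+observed <- rnorm(n_forecasts)
+
+# calibrated forecasts: predictive samples drawn from the same distribution
+predictive_samples <- matrix(rnorm(n_forecasts * n_samples), nrow = n_forecasts)
+
+# empirical PIT: predictive CDF evaluated at the observed value
+pit_values <- vapply(
+  seq_len(n_forecasts),
+  function(i) mean(predictive_samples[i, ] <= observed[i]),
+  numeric(1)
+)
+
+hist(pit_values, main = "", xlab = "PIT") # roughly uniform for calibrated forecasts
+```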
+ +### Bias +Biased forecasts systematically over- or under-predict the observed values. The bias metric implemented in \pkg{scoringutils} follows \cite{funkAssessingPerformanceRealtime2019}, with slight adaptations for different forecast formats. It captures how much probability mass of the forecast was above or below the true value (mapped to values between -1 and 1, with 0 being ideal) and therefore represents a general tendency to over- or under-predict in relative terms. A value of -1 implies that the entire probability mass of the predictive distribution was below the observed value (and analogously above it for a value of 1). + +For forecasts in a quantile format, bias is also reflected in the over- and under-prediction components of the weighted interval score (a proper scoring rule explained in more detail in section \ref{wis}). These measure over- and under-prediction on an absolute scale (analogous to the absolute error of a point forecast), rather than a relative scale. However, it is not clear what the decomposition 'should' look like, and a forecast can be well calibrated and still have different amounts of over- and under-prediction. High over-prediction or under-prediction values can therefore not immediately be interpreted as systematic bias. + +## Assessing sharpness + +Sharpness is the ability to produce narrow forecasts. In contrast to calibration, it does not depend on the actual observations and is a quality of the forecast only \citep{gneitingProbabilisticForecastsCalibration2007}. Sharpness is therefore only useful subject to calibration, as exemplified in Figure \ref{fig:forecast-paradigm}. For forecasts provided as samples from the predictive distribution, \pkg{scoringutils} calculates dispersion (the inverse of sharpness) as the normalised median absolute deviation about the median (MAD), following \cite{funkAssessingPerformanceRealtime2019} (for details see Table \ref{tab:metrics-summary}). For quantile forecasts, we instead report the dispersion component of the weighted interval score (see details in section \ref{wis} and Table \ref{tab:score-table-detailed}), which corresponds to a weighted average of the individual interval widths. + +## Proper scoring rules for sample-based forecasts (CRPS, log score and DSS) + +For forecasts in a sample format, the \pkg{scoringutils} package implements the following proper scoring rules by providing wrappers to the corresponding functions in the \pkg{scoringRules} package: the (continuous) ranked probability score (CRPS) \citep{epsteinScoringSystemProbability1969, murphyNoteRankedProbability1971a, mathesonScoringRulesContinuous1976, gneitingStrictlyProperScoring2007}, the logarithmic score (log score) \citep{goodRationalDecisions1952}, and the Dawid-Sebastiani score (DSS) \citep{dawidCoherentDispersionCriteria1999} (formal definitions are given in Table \ref{tab:score-table-detailed}). Compared to the implementations in the \pkg{scoringRules} package, these are exposed to the user through a slightly adapted interface. Other, closed-form variants of the CRPS, log score and DSS are available in the \pkg{scoringRules} package. + +When scoring forecasts in a sample-based format, the choice is usually between the log score and the CRPS. The DSS is much less commonly used. It is easier to compute, but apart from that does not have immediate advantages over the former two. DSS, CRPS and log score differ in several important aspects: ease of estimation and speed of convergence, treatment of over- and underconfidence, sensitivity to distance \citep{winklerScoringRulesEvaluation1996}, sensitivity to outlier predictions, and sensitivity to the order of magnitude of the forecast quantity.
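+
+To make the sample-based scoring rules more concrete, the following sketch scores a single made-up forecast, represented by 1000 predictive samples from a normal distribution, directly through the \pkg{scoringRules} interface that \pkg{scoringutils} wraps (the observed value and the predictive distribution are chosen arbitrarily for illustration):
+
+```{r sample-scores-sketch}
+library(scoringRules)
+
+set.seed(42)
+observed <- 27
+predictive_samples <- rnorm(1000, mean = 30, sd = 5) # made-up predictive distribution
+
+crps_sample(y = observed, dat = predictive_samples)
+dss_sample(y = observed, dat = predictive_samples)
+logs_sample(y = observed, dat = predictive_samples) # based on a kernel density estimate
+```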
+ +### Estimation details and the number of samples required for accurate scoring + +The CRPS, DSS and log score are in principle all applicable to continuous as well as discrete forecasts. However, they differ in how easily and accurately scores can be computed based on predictive samples. This is an issue for the log score in particular, which equals the negative log density of the predictive distribution evaluated at the observed value and therefore requires a density estimation. The kernel density estimation used in \pkg{scoringutils} (through the function \fct{logs\_sample} from the \pkg{scoringRules} package) may be particularly inappropriate for discrete values (see also Table \ref{tab:score-table-detailed}). The log score is therefore not computed for discrete predictions in \pkg{scoringutils}. For a small number of samples, estimated scores may deviate considerably from the exact scores computed based on closed-form predictive functions. This is especially pronounced for the log score, as illustrated in Figure \ref{fig:score-convergence} (adapted from \citep{jordanEvaluatingProbabilisticForecasts2019}). + +```{r score-convergence, echo = FALSE, fig.cap="Top: Estimation of scores from predictive samples (adapted from \\citep{jordanEvaluatingProbabilisticForecasts2019}). Scores were computed based on samples of differing size (from 10 to 100,000). This was repeated 500 times for each sample size. The black line is the mean score across the 500 repetitions, shaded areas represent 50\\% and 90\\% intervals, and the dashed line represents the true calculated score. Bottom left: Change in score when the uncertainty of the predictive distribution is changed. The true distribution is N(0,5) with the true standard deviation marked with a dashed line, while the standard deviation of the predictive distribution is varied along the x-axis. Log score and DSS clearly punish overconfidence much more severely than underconfidence. Bottom right: Score achieved for a standard normal predictive distribution (illustrated in grey) and different true observed values. Log score and DSS punish instances more harshly where the observed value is far away from the predictive distribution.", fig.show="hold"} +include_graphics("plots/sample-convergence.png") +include_graphics("plots/score-deviation-sd-mu.png") +``` + +### Overconfidence, underconfidence and outliers + +Proper scoring rules differ in how they penalise over- or underconfident forecasts. The log score and the DSS penalise overconfidence much more severely than underconfidence, while the CRPS does not distinguish between over- and underconfidence and penalises both rather leniently \citep{macheteContrastingProbabilisticScoring2012} (see Figure \ref{fig:score-convergence}B, left panel). Similarly, the CRPS is relatively lenient with regard to outlier predictions compared to the log score and the DSS (see Figure \ref{fig:score-convergence}B, right panel). The CRPS, which can be thought of as a generalisation of the absolute error to a predictive distribution, scales linearly with the distance between forecast distribution and true value. The log score, on the other hand, as the negative logarithm of the predictive density evaluated at the observed value, can quickly tend to infinity if the probability assigned to the observed outcome is close to zero. Whether harsh penalisation of overconfidence and bad predictions is desirable depends, of course, on the setting. If, for example, one wanted to forecast hospital bed capacity, it may be prudent to score forecasts using the log score, as one might prefer to be too cautious rather than too confident. + +### Sensitivity to distance - local vs\. global scores {#localglobal} + +The CRPS and the DSS are so-called global scoring rules, which means that the score is sensitive to the distance of the entire predictive distribution from the observed value.
The log score, on the other hand, is local and the resulting score depends only on the probability density assigned to the actual outcome, ignoring the rest of the predictive distribution (see Figure \ref{fig:score-locality}). +Sensitivity to distance (taking the entire predictive distribution into account) may be a desirable property in most settings that involve decision making. A prediction which assigns high probability to results far away from the observed value is arguably less useful than a forecast which assigns a lot of probability mass to values closer to the observed outcome (the probability assigned to the actual outcome being equal for both forecasts). The log score is only implicitly sensitive to distance in expectation if we assume that values close to the observed value are actually more likely to occur. The fact that the log score only depends on the outcome that actually realised, however, may make it more appropriate for inferential purposes (see \citep{winklerScoringRulesEvaluation1996}) and it is commonly used in Bayesian statistics \citep{gelmanUnderstandingPredictiveInformation2014}. + +```{r score-locality, echo = FALSE, fig.cap="Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match. The true number later observed, 2, is marked with a dashed line. Both forecasters assign a probability of 0.35 to the observed outcome, 2. Forecaster A's prediction is centred around the observed value, while Forecaster B assigns significant probability to outcomes far away from the observed value. Judged by a local score like the Log Score, both forecasters receive the same score. A global score like the CRPS and the DSS penalises forecaster B more severely."} + +include_graphics("plots/score-locality.png") +``` + +### Sensitivity to the order of magnitude of the forecast quantity + +Average scores usually scale with the order of magnitude of the quantity we try to forecast (as the variance of the data-generating distribution usually increases with the mean). The effect is illustrated in Figure \ref{fig:score-scale}. This makes it harder to compare forecasts for very different targets, or assess average performance if the quantity of interest varies substantially over time. Average scores tend to be dominated by forecasts for targets with high absolute numbers. This is especially the case for the CRPS (as a generalisation of the absolute error), for which average scores tend to increase strongly with the order of magnitude of the quantity to forecast. The log score and the DSS tend to be more robust against this effect, depending on the exact relationship between mean and variance of the data-generating distribution. + +```{r score-scale, echo = FALSE, fig.cap="Scores depend on the variability of the data and therefore implicitly on the order of magnitude of the observed value. A: Mean and standard deviation of scores from a simulation of perfect forecasts with predictive distribution $F$ equal to the true data-generating distribution $G$. The standard deviation of the two distributions was held constant at $\\sigma$, and for different mean values $\\mu$ 100 pairs of forecasts and observations were simulated. Every simulated forecast consisted of 1000 draws from the data-generating distribution $G$ and 5000 draws from the (same) predictive distribution $F$. For all three scoring rules, mean and sd of the calculated scores stay constant regardless of the mean $\\mu$ of $F$ and $G$. 
B: Same setup, but now the mean of $F$ and $G$ was held constant at $\\mu = 1$ and the standard deviation $\\sigma$ was varied. Average scores increase for all three scoring rules, but most strongly for the CRPS. Standard deviations of the estimated scores stay roughly constant for the DSS and log score, but also increase for the CRPS. C: Scores for forecasts of COVID-19 cases and deaths from the European Forecast Hub ensemble based on the example data provided in the package."} + +include_graphics("plots/illustration-effect-scale.png") +``` + + +## Proper scoring rule for quantile-based forecasts (WIS) {#wis} +For forecasts in an interval or quantile format, \pkg{scoringutils} offers the weighted interval score (WIS) \citep{bracherEvaluatingEpidemicForecasts2021}. The WIS has very similar properties to the CRPS and can be thought of as a quantile-based approximation of it. For an increasing number of equally-spaced prediction intervals, the WIS converges to the CRPS. One additional benefit of the WIS is that it can easily be decomposed into three additive components: an uncertainty penalty (called dispersion or sharpness penalty) for the width of a prediction interval, and penalties for over- and under-prediction (if a value falls outside of a prediction interval). This can be very helpful in diagnosing issues with forecasts. It may even be useful to estimate quantiles from predictive samples and score them using the WIS in addition to the CRPS in order to make use of this decomposition (with the caveat that estimating quantiles from samples may be biased if the number of samples is small).
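+
+As a rough standalone sketch of the main building block of the WIS (written in base \proglang{R} rather than using the package's own implementation, with made-up numbers), the interval score of a single central $(1 - \alpha) \cdot 100$\% prediction interval $[l, u]$ for an observed value $y$ combines the width of the interval with penalties for observations falling outside of it:
+
+```{r interval-score-sketch}
+# interval score for a single central prediction interval,
+# here a 90% interval (alpha = 0.1) with made-up bounds and observation
+interval_score_single <- function(y, lower, upper, alpha) {
+  (upper - lower) +
+    2 / alpha * (lower - y) * (y < lower) +
+    2 / alpha * (y - upper) * (y > upper)
+}
+
+interval_score_single(y = 28, lower = 15, upper = 25, alpha = 0.1)
+```
+
+The WIS then corresponds to a weighted sum of such interval scores over a set of central prediction intervals, plus the absolute error of the median \citep{bracherEvaluatingEpidemicForecasts2021}.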
+ +## Proper scoring rules for binary outcomes (BS and log score) + +Binary forecasts can be scored using the Brier score (BS) or the log score. The Brier score \citep{brierVERIFICATIONFORECASTSEXPRESSED1950} corresponds to the squared difference between the given probability and the outcome (either 0 or 1) and equals the ranked probability score for the case of only two possible outcomes \citep{epsteinScoringSystemProbability1969, murphyNoteRankedProbability1971a}. The log score corresponds to the negative log of the probability assigned to the observed outcome. Just as with continuous forecasts, the log score penalises overconfidence much more harshly than underconfidence. The Brier score, on the other hand, does not distinguish between over- and underconfidence \citep{macheteContrastingProbabilisticScoring2012} and is therefore more forgiving of outlier predictions. + +## Pairwise comparisons + +In order to compare the performance of different models fairly even if forecasts are missing, we can compute relative skill scores based on pairwise comparisons \citep{cramerEvaluationIndividualEnsemble2021}. Models enter a 'pairwise tournament', where all possible pairs of models are compared based on the overlapping set of available forecasts common to both models (omitting comparisons where there is no overlapping set of forecasts). For every pair, the ratio of the mean scores of both models is computed. The relative skill score of a model is then the geometric mean of all mean score ratios which involve that model. This gives us an indicator of performance relative to all other models, with the orientation depending on the score used. The method is able to account for missing forecasts, but it is nevertheless advisable to only compare forecasts that are at least 50\% complete. Furthermore, pairwise comparisons are only possible if all scores have the same sign. For negatively oriented scores, a relative skill score smaller than 1 then indicates that a model is performing better than the average model. One can also compute a scaled relative skill score by providing a baseline model. In that case, those mean score ratios which include the baseline are excluded when taking the geometric mean to obtain relative skill scores for individual models (which therefore differ slightly from relative scores without a baseline). All individual relative skill scores are then scaled by (i.e. divided by) the relative score of the baseline model. + +It is in principle possible to compute p-values to determine whether two models perform significantly differently. \pkg{scoringutils} allows users to compute these using either the Wilcoxon rank sum test (also known as the Mann-Whitney U test) \citep{mannTestWhetherOne1947} or a permutation test. In practice, this is complicated by the fact that both tests assume independent observations. In reality, however, forecasts by a model may be correlated across time or another dimension (e.g. if a forecaster has a bad day, they might perform badly across different targets for a given forecast date). P-values may therefore be too liberal in suggesting significant differences where there aren't any. One way to mitigate this is to aggregate observations over a category where one suspects correlation (for example averaging across all forecasts made on a given date) before making pairwise comparisons. A test that is performed on aggregate scores will likely be more conservative. + +# Evaluating forecasts using scoringutils {#evaluation-example} + +The \pkg{scoringutils} package offers comprehensive functionality to conduct a forecast evaluation and allows users to check inputs, score forecasts and visualise results. Most functions operate on a `data.frame`-based format, but the package also provides a set of reliable lower-level scoring metrics operating on vectors/matrices, which experienced users can use in their own evaluation pipelines. These will not be discussed in this paper; we refer the reader to the vignettes and the package documentation for further information. Some helper functions for data-handling, as well as example data sets and tables with additional information about available scoring metrics, are also included in the package. Internally, operations are handled using \pkg{data.table} to allow for fast and efficient computation. + +## Example data + +The example data included in the package and used in this paper consists of one to three week ahead forecasts made between May and September 2021 for COVID-19 cases and deaths from four different forecasting models. It represents a small subset of short-term predictions for COVID-19 cases and deaths submitted to the European Forecast Hub \citep{europeancovid-19forecasthubEuropeanCovid19Forecast2021}. The European Forecast Hub each week collates, aggregates and evaluates one to four week ahead predictions of different COVID-19 related targets submitted by different research groups. Forecasts are submitted in a quantile-based format with a set of 22 quantiles plus the median ($0.01, 0.025, 0.05, ..., 0.5, ..., 0.95, 0.975, 0.99$). The full official hub evaluations, which also use \pkg{scoringutils}, can be seen at \url{https://covid19forecasthub.eu/}. + +In the following, we will use the \fct{glimpse} function from the package \pkg{tibble} to display outputs more concisely.
After loading the \pkg{scoringutils} package, we can directly inspect the example data: + +```{r, cache = FALSE} +library(scoringutils) +library(tibble) + +example_quantile |> + na.omit() |> + glimpse() +``` + +## Expected input formats and data checking + +Depending on the format of the forecast, most functions require a `data.frame` (or similar) with column names as shown in Table \ref{tab:column-requirements}. +```{r, column-requirements, echo=FALSE} +library(data.table) +requirements <- + data.table( + "Format" = c("quantile-based", "sample-based", "binary", "pairwise-comparisons"), + `Required columns` = c("'true_value', 'prediction', 'quantile'", + "'true_value', 'prediction', 'sample'", + "'true_value', 'prediction'", + "additionally a column 'model'"), + "Example data" = c("example_quantile", "example_integer, \n example_continuous", "example_binary", "~") + ) + +requirements |> + kbl(format = "latex", + booktabs = TRUE, + linesep = c('\\addlinespace'), + caption = "Overview of the columns required for different input formats.") |> + kableExtra::column_spec(3, width = "3.7cm") |> + kableExtra::kable_styling(latex_options = c("striped", + "repeat_header", "scale_down")) + +``` +Additional columns may be present to indicate a grouping of forecasts. A combination of the other columns should uniquely define the unit of a single forecast, i.e. every individual forecast is identified by the combination of values in these columns. For example, a single forecast could be uniquely defined by a model name, a location, a forecast date and a forecast horizon. + +The function \fct{check\_forecasts} allows users to check whether the input data conform to these requirements and returns a list with entries that provide information on what \pkg{scoringutils} infers from the data. + +```{r,tidy=TRUE, width = 70, tidy.opts=list(width.cutoff=I(70)), echo=TRUE, cache = FALSE} +check_forecasts(example_quantile) +``` + +The values stored in the list elements \code{target_type} and \code{prediction_type} refer to the type of the forecast and the target variable. \code{forecast_unit} contains a vector of the columns which \pkg{scoringutils} thinks denote the unit of a single forecast. This means that in this instance a single forecast (with a set of 23 quantiles) can uniquely be identified by the values in the columns "location", "target\_end\_date", "target\_type", "location\_name", "forecast\_date", "model", "horizon". In this example, having "location" as well as "location\_name" included does not make a difference, as they contain duplicated information. In general, however, it is strongly advised to remove all unnecessary columns that do not help identify a single forecast. \code{unique_values} gives an overview of the number of unique values per column across the entire data set, providing a first hint as to whether the forecast set is complete. \code{warnings} shows potential warnings about the data. In this example, \pkg{scoringutils} warns that there are observed values present for which there is no corresponding forecast. These warnings can often be ignored, but may provide important information. If there are errors that cannot be ignored, a list entry \code{errors} will appear. + +## Visualising forecast data + +It is helpful to start the evaluation process by examining forecast availability, as missing forecasts can impact the evaluation if missingness correlates with performance.
The function \fct{avail\_forecasts} returns information about the number of available forecasts, given a level of summary that can be specified through the \code{by} argument. For example, to see how many forecasts there are per model and target type, we can run
+
+```{r avail-forecasts, echo=TRUE, fig.width = 10, eval = TRUE, fig.cap="Overview of the number of available forecasts."}
+avail_forecasts(data = example_integer,
+                by = c("model", "target_type"))
+```
+
+and visualise results using the function \fct{plot\_avail\_forecasts}. The plot resulting from running the following code is displayed in Figure \ref{fig:avail-forecasts-plot}.
+
+```{r avail-forecasts-plot, echo=TRUE, fig.pos = "!h", fig.width = 8, fig.height = 3, eval = TRUE, fig.cap="Overview of the number of available forecasts."}
+library(ggplot2)
+
+avail_forecasts(data = example_integer,
+                by = c("model", "target_type", "forecast_date")) |>
+  plot_avail_forecasts(x = "forecast_date",
+                       show_numbers = FALSE) +
+  facet_wrap(~ target_type) +
+  labs(y = "Model", x = "Forecast date") +
+  theme(legend.position = "bottom")
+```
+
+
+
+The forecasts and observed values themselves can be visualised using the \fct{plot\_predictions} function. It offers the user an optional ad hoc way to filter both forecasts and observed values. Forecasts and observed values can be passed in separately (and are merged internally) or as a single data.frame. Conditions to filter on need to be provided as a list of strings, where each string represents an expression that can be evaluated to filter the data. To display, for example, short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021 as well as five weeks of prior data, we can call the following. The resulting plot is shown in Figure \ref{fig:forecast-visualisation}.
+
+```{r forecast-visualisation, fig.pos = "!h", fig.width = 10, fig.height = 5, fig.cap = "Short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021."}
+plot_predictions(data = example_quantile,
+                 x = "target_end_date",
+                 filter_truth = list('target_end_date <= "2021-07-15"',
+                                     'target_end_date > "2021-05-22"'),
+                 filter_forecasts = list("model == 'EuroCOVIDhub-ensemble'",
+                                         'forecast_date == "2021-06-28"')) +
+  facet_wrap(target_type ~ location, ncol = 4, scales = "free_y") +
+  theme(legend.position = "bottom") +
+  labs(x = "Target end date")
+```
+
+## Scoring forecasts with \fct{score} {short-title="Scoring forecasts with score()" #scoring}
+
+The function \fct{score} evaluates predictions against observed values and automatically applies the appropriate scoring metrics depending on the input data. We can simply call:
+
+```{r}
+score(example_quantile) |>
+  glimpse()
+```
+
+The above produces one score for every forecast. However, we usually want to summarise scores to learn about average performance across certain categories. This can be done using the function \fct{summarise\_scores}, which returns one summarised score per category (column name) specified in the argument \code{by}. To return, for example, one score per model and forecast target, we can run the following:
+
+```{r}
+score(example_quantile) |>
+  summarise_scores(by = c("model", "target_type")) |>
+  glimpse()
+```
+
+Summarised scores can then be visualised using the function \fct{plot\_score\_table}. In order to display scores it is often useful to round the output to e.g. two significant digits, which can be achieved through another call of \fct{summarise\_scores}. 
The output of the following is shown in Figure \ref{fig:score-table}:
+
+```{r score-table, fig.width = 11, fig.cap="Coloured table to visualise the computed scores. Red colours indicate that a value is higher than ideal, blue indicates that it is lower than ideal, and the opacity indicates the strength of the deviation from the ideal."}
+score(example_quantile) |>
+  summarise_scores(by = c("model", "target_type")) |>
+  summarise_scores(fun = signif, digits = 2) |>
+  plot_score_table(y = "model", by = "target_type") +
+  facet_wrap(~ target_type)
+```
+
+While \fct{summarise\_scores} accepts arbitrary summary functions, care has to be taken when using anything other than \fct{mean}, because scores may lose their propriety under other summary functions. For example, the median of several individual scores (each individually based on a proper scoring rule) is usually not proper. A forecaster judged by the median of several scores may be incentivised to misrepresent their true belief in a way that is not true for the mean score.
+
+The user must exercise additional caution and should usually avoid aggregating scores across categories which differ considerably in the magnitude of the quantity to forecast, as forecast errors usually increase with the order of magnitude of the forecast target. In the given example, looking at one score per model (i.e. specifying \code{by = c("model")}) is problematic, as overall aggregate scores would be dominated by case forecasts, while performance on deaths would have little influence. Similarly, aggregating over different forecast horizons is often ill-advised, as the mean will be dominated by scores from the longer forecast horizons.
+
+As a proxy for calibration, we are often interested in the empirical coverage levels of certain central prediction intervals, for example the percentage of true values which fall inside the 50% or 90% prediction intervals. For any quantile-based forecast, we can simply add this information using the function \fct{add\_coverage}. The function has a \code{by} argument which accepts a vector of column names defining the level of grouping for which empirical coverage is computed. Note that these column names should be equal to those passed to \code{by} in subsequent calls of \fct{summarise\_scores}.
+
+For sample-based forecasts, calculating coverage requires an extra step, namely estimating quantiles of the predictive distribution from samples. The function \fct{sample\_to\_quantile} takes a \code{data.frame} in a sample-based format and outputs one in a quantile-based format, which can then be passed to \fct{score} and \fct{add\_coverage}:
+
+```{r}
+q <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
+
+example_integer |>
+  sample_to_quantile(quantiles = q) |>
+  score() |>
+  add_coverage(ranges = c(50, 90), by = c("model", "target_type")) |>
+  summarise_scores(by = c("model", "target_type")) |>
+  glimpse()
+```
+
+The process is designed to require conscious action by the user, because the estimation of quantiles from predictive samples may be biased if the number of available samples is not sufficiently large. Switching to a quantile-based format can also be useful for model diagnostics, as the weighted interval score can be decomposed into dispersion, over-prediction and under-prediction, a decomposition that is not readily available for the CRPS.
+
+## Pairwise comparisons
+
+In order to obtain a model ranking, we recommend looking at the relative skill in terms of an appropriate proper scoring rule instead of the raw score. 
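+
+To build intuition for how relative skill scores are constructed (see section \ref{pairwise-comparisons}), the following toy sketch computes pairwise mean score ratios and their geometric mean by hand for three hypothetical models that all forecast the same targets. It is an illustration only and not the implementation used by the package, which additionally restricts each comparison to the set of overlapping forecasts and can exclude ratios involving a baseline model:
+
+```{r, eval = FALSE}
+# Toy illustration: mean scores of three hypothetical models
+mean_scores <- c("model_a" = 10, "model_b" = 20, "model_c" = 40)
+
+# Pairwise mean score ratios: entry [i, j] is the mean score of model i
+# divided by the mean score of model j
+score_ratios <- outer(mean_scores, mean_scores, "/")
+diag(score_ratios) <- NA # ignore comparisons of a model with itself
+
+# Relative skill: geometric mean of a model's ratios against all other models
+relative_skill <- exp(rowMeans(log(score_ratios), na.rm = TRUE))
+relative_skill
+
+# Scaled relative skill, using model_a as the baseline
+relative_skill / relative_skill["model_a"]
+```
+
+In this toy example, \code{model_a} obtains a relative skill score below one (better than the average model) and \code{model_c} a score above one (worse than the average model).
+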
Relative skill scores can be aggregated more easily across different forecast targets as they are less influenced by the order of magnitude of the quantity to forecast than e.g. the WIS or the CRPS.
+
+
+
+Relative skill scores can either be obtained by specifying \code{relative_skill = TRUE} in the function \fct{summarise\_scores}, or by calling the function \fct{pairwise\_comparison}. In both cases, pairwise comparisons are computed according to the grouping specified in the argument \code{by}: internally, the \code{data.frame} with all scores is split into different \code{data.frame}s according to the values specified in \code{by} (excluding the column 'model'). Relative scores are then computed separately for every individual group. In the example below we specify \code{by = c("model", "target_type")}, which means that there is one relative skill score per model, calculated separately for the different forecasting targets. Using the argument \code{baseline}, we can compute relative skill with respect to a baseline model.
+
+```{r}
+score(example_quantile) |>
+  pairwise_comparison(by = c("model", "target_type"),
+                      baseline = "EuroCOVIDhub-baseline") |>
+  glimpse()
+```
+
+When a baseline is provided, pairwise comparisons involving that baseline model are excluded from the computation of relative scores for all non-baseline models. Relative skill scores with a baseline included therefore differ slightly from relative scores without a baseline. Scaled relative skill scores are then computed by scaling (i.e. dividing) all relative scores by the relative score of the baseline model.
+
+
+Pairwise comparisons should usually be made based on unsummarised scores (\fct{pairwise\_comparison} internally summarises over samples and quantiles automatically, but nothing else), as summarising can change the set of overlapping forecasts between two models and thereby distort relative skill scores. When relative skill scores are computed via \fct{summarise\_scores}, comparisons are based on the unsummarised scores by default. When using \fct{pairwise\_comparison}, the function \fct{summarise\_scores} should usually not be called beforehand. One potential exception is when one is interested in the p-values obtained from pairwise comparisons. As forecasts are usually highly correlated (which the calculation of p-values does not account for), it may be sensible to summarise over a few categories to reduce correlation and obtain more conservative p-values.
+
+Using the function \fct{plot\_pairwise\_comparison} we can visualise the mean score ratios between all models. The output of the following code is shown in Figure \ref{fig:pairwise-plot}.
+
+```{r pairwise-plot, echo=TRUE, fig.width = 8, fig.cap="Ratios of mean scores based on overlapping forecast sets. When interpreting the plot, the model of interest is shown on the y-axis and the model it is compared against on the x-axis. If a tile is blue, the model on the y-axis performed better; if it is red, the model on the x-axis performed better in the direct comparison. In the example above, the EuroCOVIDhub-ensemble performs best (it only has values smaller than one), while the EuroCOVIDhub-baseline performs worst (and only has values larger than one). 
For cases, the UMass-MechBayes model is of course excluded as there are no case forecasts available and therefore the set of overlapping forecasts is empty."}
+score(example_quantile) |>
+  pairwise_comparison(by = c("model", "target_type"),
+                      baseline = "EuroCOVIDhub-baseline") |>
+  plot_pairwise_comparison() +
+  facet_wrap(~ target_type)
+```
+
+## Model diagnostics
+
+The \pkg{scoringutils} package offers a variety of functions to aid the user in diagnosing issues with models.
+
+For example, to detect systematic patterns it may be useful to visualise a single metric across several dimensions. The following produces a heatmap of bias values across different locations and forecast targets (output shown in Figure \ref{fig:score-heatmap}).
+
+```{r score-heatmap, fig.pos = "!h", fig.width = 8, fig.cap = "Heatmap of bias values for different models across different locations and forecast targets. Bias values are bound between -1 (under-prediction) and 1 (over-prediction) and should ideally be 0. Red tiles indicate an upwards bias (over-prediction), while blue tiles indicate a downwards bias (under-prediction)."}
+score(example_continuous) |>
+  summarise_scores(by = c("model", "location", "target_type")) |>
+  plot_heatmap(x = "location", metric = "bias") +
+  facet_wrap(~ target_type) +
+  theme(legend.position = "bottom")
+
+```
+
+For quantile-based forecasts, it is helpful to visualise the decomposition of the weighted interval score into its components: dispersion, over-prediction and under-prediction. This can be achieved using the function \fct{plot\_wis}, as shown in Figure \ref{fig:wis-components}.
+
+As described in section \ref{scoring}, it is possible to convert from a sample-based to a quantile-based forecast format to make use of this decomposition even for sample-based forecasts.
+
+```{r wis-components, fig.pos = "!h", fig.width = 8, fig.cap = "Decomposition of the weighted interval score (WIS) into dispersion, over-prediction and under-prediction. The WIS components measure over- and under-prediction in absolute, rather than relative terms."}
+score(example_quantile) |>
+  summarise_scores(by = c("model", "target_type")) |>
+  plot_wis(relative_contributions = FALSE) +
+  facet_wrap(~ target_type,
+             scales = "free_x") +
+  theme(legend.position = "bottom")
+```
+
+Special attention should be given to calibration. The most common way of assessing calibration (more precisely: probabilistic calibration) is the PIT histogram, as explained in section \ref{probabilistic-calibration}. Ideally, PIT values should be uniformly distributed after the transformation.
+
+We can compute PIT values in the following way
+
+```{r}
+example_continuous |>
+  pit(by = "model")
+```
+
+and create PIT histograms using the function \fct{plot\_pit}. The output of the following is shown in Figure \ref{fig:pit-plots}:
+
+
+```{r pit-plots, fig.pos = "!h", fig.cap="PIT histograms of all models stratified by forecast target. Histograms should ideally be uniform. A u-shape usually indicates overconfidence (forecasts are too narrow), a hump-shaped form indicates underconfidence (forecasts are too uncertain) and a triangle-shape indicates bias.", fig.width = 8, fig.height=4}
+example_continuous |>
+  pit(by = c("model", "target_type")) |>
+  plot_pit() +
+  facet_grid(target_type ~ model)
+```
+
+We can also look at interval and quantile coverage plots (explained in more detail in section \ref{probabilistic-calibration}) using the functions \fct{plot\_interval\_coverage} and \fct{plot\_quantile\_coverage}. 
These plots require that the columns "range" and "quantile", respectively, be present in the scores to plot, and therefore need to be included in the `by` argument when summarising scores. The output of the following is shown in Figure \ref{fig:coverage}.
+
+```{r coverage, fig.width = 10, fig.pos = "!h", fig.show='hold', fig.cap = "Interval coverage and quantile coverage plots. Areas shaded in green indicate that the forecasts are too wide (i.e. underconfident), while areas in white indicate that the model is overconfident and generates too narrow prediction intervals."}
+cov_scores <- score(example_quantile) |>
+  summarise_scores(by = c("model", "target_type", "range", "quantile"))
+
+plot_interval_coverage(cov_scores) +
+  facet_wrap(~ target_type)
+
+plot_quantile_coverage(cov_scores) +
+  facet_wrap(~ target_type)
+```
+
+It may sometimes be interesting to see how different scores correlate with each other. We can examine this using the function \fct{correlation}. When dealing with quantile-based forecasts, it is important to call \fct{summarise\_scores} before \fct{correlation} in order to summarise over quantiles before computing correlations. The plot resulting from the following code is shown in Figure \ref{fig:correlation-plot}.
+
+```{r correlation-plot, fig.pos = "!h", fig.width=8, fig.height=4, fig.cap = "Correlation between different scores"}
+correlations <- example_quantile |>
+  score() |>
+  summarise_scores() |>
+  correlation()
+
+correlations |>
+  glimpse()
+
+correlations |>
+  plot_correlation()
+```
+
+## Summary and discussion
+
+Forecast evaluation is invaluable for understanding and improving current forecasts. The \pkg{scoringutils} package aims to facilitate this process and to make it accessible, even for less experienced users. It provides a fast, flexible and convenient evaluation framework based on `data.frame`s, but also makes a set of scoring functions available to more experienced users for use in other packages or pipelines. A set of visualisations and plotting functions helps with diagnosing issues with models and allows for thorough comparisons between different forecasting approaches.
+
+The package is still under active development and we warmly welcome contributions to \pkg{scoringutils}. In the future we hope to extend the number of scoring metrics supported. This includes spherical scoring rules \citep{gneitingStrictlyProperScoring2007, joseCharacterizationSphericalScoring2009, macheteContrastingProbabilisticScoring2012}, the evaluation of multinomial prediction tasks, as well as a broader range of scoring metrics for point forecasts. We also plan to expand the plotting functionality and hope to make templates available for automated scoring reports.
+ + + + + + + + +## Acknowledgments + +Funding statements + + + + + + + + + + + + + + + + + + +\newpage + +\appendix + +# (APPENDIX) Supplementary information {-} + +```{r score-table-detailed, echo=FALSE, cache = FALSE} + +data <- readRDS(system.file("metrics-overview/metrics-detailed.Rda", package = "scoringutils")) + +data[, 1:2] |> + kableExtra::kbl(format = "latex", booktabs = TRUE, + escape = FALSE, + caption = "Detailed explanation of all the metrics,", + longtable = TRUE, + linesep = c('\\addlinespace')) |> + kableExtra::column_spec(1, width = "1.1in") |> + kableExtra::column_spec(2, width = "4.625in") |> + kableExtra::kable_styling(latex_options = c("striped", "repeat_header")) + +``` + + + +```{r wis-components-relative, eval = FALSE, echo = FALSE, include = FALSE, fig.width = 10, fig.cap = "Components of the weighted interval score normalised to one to show relative contribution of different penalties."} +score(example_quantile) |> + summarise_scores(by = c("model", "target_type")) |> + plot_wis(relative_contributions = TRUE) + + facet_wrap(~ target_type, + scales = "free_x") + + coord_flip() + + theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) + + theme(legend.position = "bottom") +``` + + + +```{r pit-plots-quantile, eval = FALSE, echo = FALSE, include = FALSE, fig.cap="PIT histograms based on a subset of ten equally-spaced quantiles of all models stratified by forecast target. Histograms should ideally be uniform. A u-shape usually indicates overconfidence (forecasts are too narrow), a hump-shaped form indicates underconfidence (forecasts are too uncertain) and a triangle-shape indicates bias."} +subset(example_quantile, example_quantile$quantile %in% seq(0.1, 0.9, 0.1)) |> + pit(by = c("model", "target_type")) |> + plot_pit() + + facet_grid(model ~ target_type) +``` + + + + + + +```{r, eval = FALSE, echo = FALSE, include = FALSE, fig.pos = "!h", fig.width=8, fig.height=4, fig.pos="!h"} +example_quantile |> + score() |> + summarise_scores(by = c("model", "range", "target_type")) |> + plot_ranges() + + facet_wrap(~ target_type, scales = "free") +``` + + + +\newpage + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/inst/manuscript/manuscript.aux b/inst/manuscript/manuscript.aux new file mode 100644 index 000000000..827598ec9 --- /dev/null +++ b/inst/manuscript/manuscript.aux @@ -0,0 +1,179 @@ +\relax +\providecommand\hyper@newdestlabel[2]{} +\bibstyle{jss} +\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} +\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined +\global\let\oldcontentsline\contentsline +\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} +\global\let\oldnewlabel\newlabel +\gdef\newlabel#1#2{\newlabelxx{#1}#2} +\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} +\AtEndDocument{\ifx\hyper@anchor\@undefined +\let\contentsline\oldcontentsline +\let\newlabel\oldnewlabel +\fi} +\fi} +\global\let\hyper@last\relax +\gdef\HyperFirstAtBeginDocument#1{#1} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\citation{timmermannForecastingMethodsFinance2018,elliottForecastingEconomicsFinance2016} +\citation{gneitingWeatherForecastingEnsemble2005,kukkonenReviewOperationalRegionalscale2012} +\citation{reichCollaborativeMultiyearMultimodel2019,funkShorttermForecastsInform2020,cramerEvaluationIndividualEnsemble2021,bracherShorttermForecastingCOVID192021,europeancovid-19forecasthubEuropeanCovid19Forecast2021} +\citation{bracherEvaluatingEpidemicForecasts2021} 
+\citation{funkAssessingPerformanceRealtime2019} +\citation{gneitingProbabilisticForecastsCalibration2007} +\citation{gneitingStrictlyProperScoring2007} +\newlabel{introduction}{{1}{1}{}{section.1}{}} +\citation{R} +\citation{scoringRules} +\citation{topmodels} +\citation{tscount} +\citation{Metrics} +\citation{MLmetrics} +\citation{cramerEvaluationIndividualEnsemble2021,bracherShorttermForecastingCOVID192021,europeancovid-19forecasthubEuropeanCovid19Forecast2021,bracherNationalSubnationalShortterm2021} +\citation{europeancovid-19forecasthubEuropeanCovid19Forecast2021} +\citation{gneitingStrictlyProperScoring2007} +\newlabel{forecast-types-and-forecast-formats}{{1.1}{2}{}{subsection.1.1}{}} +\citation{cramerCOVID19ForecastHub2020,cramerEvaluationIndividualEnsemble2021,bracherShorttermForecastingCOVID192021,bracherNationalSubnationalShortterm2021,europeancovid-19forecasthubEuropeanCovid19Forecast2021} +\citation{gneitingProbabilisticForecastsCalibration2007} +\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Forecast and forecast target types. Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.\relax }}{3}{table.caption.1}\protected@file@percent } +\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} +\newlabel{tab:forecast-types}{{1}{3}{Forecast and forecast target types. Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.\relax }{table.caption.1}{}} +\newlabel{the-forecasting-paradigm}{{1.2}{3}{}{subsection.1.2}{}} +\citation{gneitingProbabilisticForecastsCalibration2007} +\citation{gneitingProbabilisticForecastsCalibration2007} +\citation{gneitingStrictlyProperScoring2007} +\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Schematic illustration of sharpness (top row) and calibration (bottom row)}}{4}{figure.caption.2}\protected@file@percent } +\newlabel{fig:forecast-paradigm}{{1}{4}{Schematic illustration of sharpness (top row) and calibration (bottom row)}{figure.caption.2}{}} +\newlabel{metrics}{{2}{4}{}{section.2}{}} +\gdef \LT@i {\LT@entry + {1}{94.51282pt}\LT@entry + {1}{19.15776pt}\LT@entry + {1}{19.15776pt}\LT@entry + {1}{19.15776pt}\LT@entry + {1}{19.15776pt}\LT@entry + {1}{276.6107pt}} +\newlabel{tab:metrics-summary}{{2}{6}{}{table.2}{}} +\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Summary table of scores available in {\fontseries {m}\fontseries {b}\selectfont scoringutils}. A version of this table which includes corresponding function names can be accessed in \textsf {R} by calling \bgroup \catcode `\_12\relax \catcode `\~12\relax \catcode `\$12\relax {\normalfont \ttfamily \hyphenchar \font =-1 scoringutils::metrics\_summary}\egroup . Not all metrics are implemented for all types of forecasts and forecasting formats, as indicated by tickmarks, 'x', or '$\sim $' (depends). D (discrete forecasts based on predictive samples), C (continuous, sample-based forecasts), B (binary forecasts), and Q (any forecasts in a quantile-based format) refer to different forecast formats. While the distinction is not clear-cut (e.g. 
binary is a special case of discrete), it is useful in the context of the package as available functions and functionality may differ. For a more detailed description of the terms used in this table see the corresponding paper sections (e.g. for 'global' and 'local' see section \ref {localglobal}). For mathematical defintions of the metrics see Table \ref {tab:score-table-detailed}.\relax }}{6}{table.2}\protected@file@percent } +\citation{gneitingProbabilisticForecastsCalibration2007} +\citation{dawidPresentPositionPotential1984} +\citation{angusProbabilityIntegralTransform1994} +\citation{andersonAsymptoticTheoryCertain1952} +\citation{gneitingProbabilisticForecastsCalibration2007,hamillInterpretationRankHistograms2001a} +\newlabel{assessing-calibration}{{2.1}{7}{}{subsection.2.1}{}} +\newlabel{probabilistic-calibration}{{2.1.1}{7}{}{subsubsection.2.1.1}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Top row}}{8}{figure.caption.3}\protected@file@percent } +\newlabel{fig:calibration-plots}{{2}{8}{Top row}{figure.caption.3}{}} +\citation{funkAssessingPerformanceRealtime2019} +\citation{gneitingProbabilisticForecastsCalibration2007} +\citation{funkAssessingPerformanceRealtime2019} +\citation{epsteinScoringSystemProbability1969,murphyNoteRankedProbability1971a,mathesonScoringRulesContinuous1976,gneitingStrictlyProperScoring2007} +\citation{goodRationalDecisions1952} +\citation{dawidCoherentDispersionCriteria1999} +\citation{winklerScoringRulesEvaluation1996} +\newlabel{bias}{{2.1.2}{9}{}{subsubsection.2.1.2}{}} +\newlabel{assessing-sharpness}{{2.2}{9}{}{subsection.2.2}{}} +\newlabel{proper-scoring-rules-for-sample-based-forecasts-crps-log-score-and-dss}{{2.3}{9}{}{subsection.2.3}{}} +\citation{jordanEvaluatingProbabilisticForecasts2019} +\citation{jordanEvaluatingProbabilisticForecasts2019} +\citation{jordanEvaluatingProbabilisticForecasts2019} +\citation{macheteContrastingProbabilisticScoring2012} +\citation{winklerScoringRulesEvaluation1996} +\citation{gelmanUnderstandingPredictiveInformation2014} +\newlabel{estimation-details-and-the-number-of-samples-required-for-accurate-scoring}{{2.3.1}{10}{}{subsubsection.2.3.1}{}} +\newlabel{overconfidence-underconfidence-and-outliers}{{2.3.2}{10}{}{subsubsection.2.3.2}{}} +\newlabel{localglobal}{{2.3.3}{10}{}{subsubsection.2.3.3}{}} +\newlabel{sensitivity-to-the-order-of-magnitude-of-the-forecast-quantity}{{2.3.4}{10}{}{subsubsection.2.3.4}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Top}}{11}{figure.caption.4}\protected@file@percent } +\newlabel{fig:score-convergence}{{3}{11}{Top}{figure.caption.4}{}} +\citation{bracherEvaluatingEpidemicForecasts2021} +\citation{brierVERIFICATIONFORECASTSEXPRESSED1950} +\citation{epsteinScoringSystemProbability1969,murphyNoteRankedProbability1971a} +\citation{macheteContrastingProbabilisticScoring2012} +\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match}}{12}{figure.caption.5}\protected@file@percent } +\newlabel{fig:score-locality}{{4}{12}{Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match}{figure.caption.5}{}} +\newlabel{wis}{{2.4}{12}{}{subsection.2.4}{}} +\newlabel{proper-scoring-rules-for-binary-outcomes-bs-and-log-score}{{2.5}{12}{}{subsection.2.5}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Scores depend on the variability of 
the data and therefore implicitly on the order of magnitude of the observed value}}{13}{figure.caption.6}\protected@file@percent } +\newlabel{fig:score-scale}{{5}{13}{Scores depend on the variability of the data and therefore implicitly on the order of magnitude of the observed value}{figure.caption.6}{}} +\citation{cramerEvaluationIndividualEnsemble2021} +\citation{mannTestWhetherOne1947} +\newlabel{pairwise-comparisons}{{2.6}{14}{}{subsection.2.6}{}} +\newlabel{evaluation-example}{{3}{14}{}{section.3}{}} +\citation{europeancovid-19forecasthubEuropeanCovid19Forecast2021} +\newlabel{example-data}{{3.1}{15}{}{subsection.3.1}{}} +\newlabel{expected-input-formats-and-data-checking}{{3.2}{15}{}{subsection.3.2}{}} +\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Overview of the columns required for different input formats.\relax }}{16}{table.caption.7}\protected@file@percent } +\newlabel{tab:column-requirements}{{3}{16}{Overview of the columns required for different input formats.\relax }{table.caption.7}{}} +\newlabel{visualising-forecast-data}{{3.3}{18}{}{subsection.3.3}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Overview of the number of available forecasts}}{19}{figure.caption.8}\protected@file@percent } +\newlabel{fig:avail-forecasts-plot}{{6}{19}{Overview of the number of available forecasts}{figure.caption.8}{}} +\newlabel{scoring}{{3.4}{19}{}{subsection.3.4}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021}}{20}{figure.caption.9}\protected@file@percent } +\newlabel{fig:forecast-visualisation}{{7}{20}{Short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021}{figure.caption.9}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Coloured table to visualise the computed scores}}{21}{figure.caption.10}\protected@file@percent } +\newlabel{fig:score-table}{{8}{21}{Coloured table to visualise the computed scores}{figure.caption.10}{}} +\newlabel{pairwise-comparisons-1}{{3.5}{22}{}{subsection.3.5}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Ratios of mean scores based on overlapping forecast sets}}{24}{figure.caption.11}\protected@file@percent } +\newlabel{fig:pairwise-plot}{{9}{24}{Ratios of mean scores based on overlapping forecast sets}{figure.caption.11}{}} +\newlabel{model-diagnostics}{{3.6}{24}{}{subsection.3.6}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Heatmap of bias values for different models across different locations and forecast targets}}{25}{figure.caption.12}\protected@file@percent } +\newlabel{fig:score-heatmap}{{10}{25}{Heatmap of bias values for different models across different locations and forecast targets}{figure.caption.12}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces Decomposition of the weighted interval score (WIS) into dispersion, over-prediction and under-prediction}}{25}{figure.caption.13}\protected@file@percent } +\newlabel{fig:wis-components}{{11}{25}{Decomposition of the weighted interval score (WIS) into dispersion, over-prediction and under-prediction}{figure.caption.13}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces PIT histograms of all models stratified by forecast target}}{27}{figure.caption.14}\protected@file@percent } +\newlabel{fig:pit-plots}{{12}{27}{PIT histograms of all models 
stratified by forecast target}{figure.caption.14}{}} +\newlabel{summary-and-discussion}{{3.7}{27}{}{subsection.3.7}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces Interval coverage and quantile coverage plots}}{28}{figure.caption.15}\protected@file@percent } +\newlabel{fig:coverage}{{13}{28}{Interval coverage and quantile coverage plots}{figure.caption.15}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Correlation between different scores}}{28}{figure.caption.16}\protected@file@percent } +\newlabel{fig:correlation-plot}{{14}{28}{Correlation between different scores}{figure.caption.16}{}} +\citation{gneitingStrictlyProperScoring2007,joseCharacterizationSphericalScoring2009,macheteContrastingProbabilisticScoring2012} +\newlabel{acknowledgments}{{3.8}{29}{}{subsection.3.8}{}} +\gdef \LT@ii {\LT@entry + {1}{91.49744pt}\LT@entry + {1}{346.24875pt}} +\newlabel{appendix-supplementary-information}{{3.8}{30}{}{subsection.3.8}{}} +\@writefile{toc}{\contentsline {section}{(APPENDIX) Supplementary information}{30}{subsection.3.8}\protected@file@percent } +\newlabel{tab:score-table-detailed}{{4}{30}{}{table.4}{}} +\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Detailed explanation of all the metrics,\relax }}{30}{table.4}\protected@file@percent } +\bibdata{references.bib,scoringutils-paper.bib} +\bibcite{andersonAsymptoticTheoryCertain1952}{{1}{1952}{{Anderson and Darling}}{{}}} +\bibcite{angusProbabilityIntegralTransform1994}{{2}{1994}{{Angus}}{{}}} +\bibcite{bracherEvaluatingEpidemicForecasts2021}{{3}{2021{a}}{{Bracher \emph {et~al.}}}{{Bracher, Ray, Gneiting, and Reich}}} +\bibcite{bracherShorttermForecastingCOVID192021}{{4}{2021{b}}{{Bracher \emph {et~al.}}}{{Bracher, Wolffram, Deuschel, G{\"o}rgen, Ketterer, Ullrich, Abbott, Barbarossa, Bertsimas, Bhatia, Bodych, Bosse, Burgard, Castro, Fairchild, Fuhrmann, Funk, Gogolewski, Gu, Heyder, Hotz, Kheifetz, Kirsten, Krueger, Krymova, Li, Meinke, Michaud, Niedzielewski, O{\.z}a{\'n}ski, Rakowski, Scholz, Soni, Srivastava, Zieli{\'n}ski, Zou, Gneiting, and Schienle}}} +\bibcite{bracherNationalSubnationalShortterm2021}{{5}{2021{c}}{{Bracher \emph {et~al.}}}{{Bracher, Wolffram, Deuschel, G{\"o}rgen, Ketterer, Ullrich, Abbott, Barbarossa, Bertsimas, Bhatia, Bodych, Bosse, Burgard, Fiedler, Fuhrmann, Funk, Gambin, Gogolewski, Heyder, Hotz, Kheifetz, Kirsten, Krueger, Krymova, Leith{\"a}user, Li, Meinke, Miasojedow, Mohring, Nouvellet, Nowosielski, Ozanski, Radwan, Rakowski, Scholz, Soni, Srivastava, Gneiting, and Schienle}}} +\bibcite{brierVERIFICATIONFORECASTSEXPRESSED1950}{{6}{1950}{{Brier}}{{}}} +\bibcite{cramerEvaluationIndividualEnsemble2021}{{7}{2021}{{Cramer \emph {et~al.}}}{{Cramer, Ray, Lopez, Bracher, Brennen, Rivadeneira, Gerding, Gneiting, House, Huang, Jayawardena, Kanji, Khandelwal, Le, M{\"u}hlemann, Niemi, Shah, Stark, Wang, Wattanachit, Zorn, Gu, Jain, Bannur, Deva, Kulkarni, Merugu, Raval, Shingi, Tiwari, White, Woody, Dahan, Fox, Gaither, Lachmann, Meyers, Scott, Tec, Srivastava, George, Cegan, Dettwiller, England, Farthing, Hunter, Lafferty, Linkov, Mayo, Parno, Rowland, Trump, Corsetti, Baer, Eisenberg, Falb, Huang, Martin, McCauley, Myers, Schwarz, Sheldon, Gibson, Yu, Gao, Ma, Wu, Yan, Jin, Wang, Chen, Guo, Zhao, Gu, Chen, Wang, Xu, Zhang, Zou, Biegel, Lega, Snyder, Wilson, McConnell, Walraven, Shi, Ban, Hong, Kong, Turtle, {Ben-Nun}, Riley, Riley, Koyluoglu, DesRoches, Hamory, Kyriakides, Leis, Milliken, Moloney, Morgan, Ozcan, Schrader, Shakhnovich, Siegel, 
Spatz, Stiefeling, Wilkinson, Wong, Gao, Bian, Cao, Ferres, Li, Liu, Xie, Zhang, Zheng, Vespignani, Chinazzi, Davis, Mu, y~Piontti, Xiong, Zheng, Baek, Farias, Georgescu, Levi, Sinha, Wilde, Penna, Celi, Sundar, Cavany, Espa{\~n}a, Moore, Oidtman, Perkins, Osthus, Castro, Fairchild, Michaud, Karlen, Lee, Dent, Grantz, Kaminsky, Kaminsky, Keegan, Lauer, Lemaitre, Lessler, Meredith, {Perez-Saez}, Shah, Smith, Truelove, Wills, Kinsey, Obrecht, Tallaksen, Burant, Wang, Gao, Gu, Kim, Li, Wang, Wang, Yu, Reiner, Barber, Gaikedu, Hay, Lim, Murray, Pigott, Prakash, Adhikari, Cui, Rodr{\'i}guez, Tabassum, Xie, Keskinocak, Asplund, Baxter, Oruc, Serban, Arik, Dusenberry, Epshteyn, Kanal, Le, Li, Pfister, Sava, Sinha, Tsai, Yoder, Yoon, Zhang, Abbott, Bosse, Funk, Hellewel, Meakin, Munday, Sherratt, Zhou, Kalantari, Yamana, Pei, Shaman, Ayer, Adee, Chhatwal, Dalgic, Ladd, Linas, Mueller, Xiao, Li, Bertsimas, Lami, Soni, Bouardi, Wang, Wang, Xie, Zeng, Green, Bien, Hu, Jahja, Narasimhan, Rajanala, Rumack, Simon, Tibshirani, Tibshirani, Ventura, Wasserman, O'Dea, Drake, Pagano, Walker, Slayton, Johansson, Biggerstaff, and Reich}}} +\bibcite{cramerCOVID19ForecastHub2020}{{8}{2020}{{Cramer \emph {et~al.}}}{{Cramer, Reich, Wang, Niemi, Hannan, House, Gu, Xie, Horstman, {aniruddhadiga}, Walraven, {starkari}, Li, Gibson, Castro, Karlen, Wattanachit, {jinghuichen}, {zyt9lsb}, {aagarwal1996}, Woody, Ray, Xu, Biegel, GuidoEspana, X, Bracher, Lee, {har96}, and {leyouz}}}} +\bibcite{dawidPresentPositionPotential1984}{{9}{1984}{{Dawid}}{{}}} +\bibcite{dawidCoherentDispersionCriteria1999}{{10}{1999}{{Dawid and Sebastiani}}{{}}} +\bibcite{elliottForecastingEconomicsFinance2016}{{11}{2016}{{Elliott and Timmermann}}{{}}} +\bibcite{epsteinScoringSystemProbability1969}{{12}{1969}{{Epstein}}{{}}} +\bibcite{europeancovid-19forecasthubEuropeanCovid19Forecast2021}{{13}{2021}{{European Covid-19 Forecast Hub}}{{}}} +\bibcite{funkShorttermForecastsInform2020}{{14}{2020}{{Funk \emph {et~al.}}}{{Funk, Abbott, Atkins, Baguelin, Baillie, Birrell, Blake, Bosse, Burton, Carruthers, Davies, Angelis, Dyson, Edmunds, Eggo, Ferguson, Gaythorpe, Gorsich, {Guyver-Fletcher}, Hellewell, Hill, Holmes, House, Jewell, Jit, Jombart, Joshi, Keeling, Kendall, Knock, Kucharski, Lythgoe, Meakin, Munday, Openshaw, Overton, Pagani, Pearson, {Perez-Guzman}, Pellis, Scarabel, Semple, Sherratt, Tang, Tildesley, Leeuwen, Whittles, Group, Team, and Investigators}}} +\bibcite{funkAssessingPerformanceRealtime2019}{{15}{2019}{{Funk \emph {et~al.}}}{{Funk, Camacho, Kucharski, Lowe, Eggo, and Edmunds}}} +\bibcite{gelmanUnderstandingPredictiveInformation2014}{{16}{2014}{{Gelman \emph {et~al.}}}{{Gelman, Hwang, and Vehtari}}} +\bibcite{gneitingProbabilisticForecastsCalibration2007}{{17}{2007}{{Gneiting \emph {et~al.}}}{{Gneiting, Balabdaoui, and Raftery}}} +\bibcite{gneitingWeatherForecastingEnsemble2005}{{18}{2005}{{Gneiting and Raftery}}{{}}} +\bibcite{gneitingStrictlyProperScoring2007}{{19}{2007}{{Gneiting and Raftery}}{{}}} +\bibcite{goodRationalDecisions1952}{{20}{1952}{{Good}}{{}}} +\bibcite{hamillInterpretationRankHistograms2001a}{{21}{2001}{{Hamill}}{{}}} +\bibcite{Metrics}{{22}{2018}{{Hamner and Frasco}}{{}}} +\bibcite{scoringRules}{{23}{2019{a}}{{Jordan \emph {et~al.}}}{{Jordan, Kr\"uger, and Lerch}}} +\bibcite{jordanEvaluatingProbabilisticForecasts2019}{{24}{2019{b}}{{Jordan \emph {et~al.}}}{{Jordan, Kr{\"u}ger, and Lerch}}} +\bibcite{joseCharacterizationSphericalScoring2009}{{25}{2009}{{Jose}}{{}}} 
+\bibcite{kukkonenReviewOperationalRegionalscale2012}{{26}{2012}{{Kukkonen \emph {et~al.}}}{{Kukkonen, Olsson, Schultz, Baklanov, Klein, Miranda, Monteiro, Hirtl, Tarvainen, Boy, Peuch, Poupkou, Kioutsioukis, Finardi, Sofiev, Sokhi, Lehtinen, Karatzas, San~Jos{\'e}, Astitha, Kallos, Schaap, Reimer, Jakobs, and Eben}}} +\bibcite{tscount}{{27}{2017}{{Liboschik \emph {et~al.}}}{{Liboschik, Fokianos, and Fried}}} +\bibcite{macheteContrastingProbabilisticScoring2012}{{28}{2012}{{Machete}}{{}}} +\bibcite{mannTestWhetherOne1947}{{29}{1947}{{Mann and Whitney}}{{}}} +\bibcite{mathesonScoringRulesContinuous1976}{{30}{1976}{{Matheson and Winkler}}{{}}} +\bibcite{murphyNoteRankedProbability1971a}{{31}{1971}{{Murphy}}{{}}} +\bibcite{R}{{32}{2021}{{R Core Team}}{{}}} +\bibcite{reichCollaborativeMultiyearMultimodel2019}{{33}{2019}{{Reich \emph {et~al.}}}{{Reich, Brooks, Fox, Kandula, McGowan, Moore, Osthus, Ray, Tushar, Yamana, Biggerstaff, Johansson, Rosenfeld, and Shaman}}} +\bibcite{timmermannForecastingMethodsFinance2018}{{34}{2018}{{Timmermann}}{{}}} +\bibcite{winklerScoringRulesEvaluation1996}{{35}{1996}{{Winkler \emph {et~al.}}}{{Winkler, Mu{\~n}oz, Cervera, Bernardo, Blattenberger, Kadane, Lindley, Murphy, Oliver, and {R{\'i}os-Insua}}}} +\bibcite{MLmetrics}{{36}{2016}{{Yan}}{{}}} +\bibcite{topmodels}{{37}{2022}{{Zeileis and Lang}}{{}}} +\gdef \@abspage@last{43} diff --git a/inst/manuscript/manuscript.bbl b/inst/manuscript/manuscript.bbl new file mode 100644 index 000000000..9f14d5172 --- /dev/null +++ b/inst/manuscript/manuscript.bbl @@ -0,0 +1,407 @@ +\begin{thebibliography}{37} +\newcommand{\enquote}[1]{``#1''} +\providecommand{\natexlab}[1]{#1} +\providecommand{\url}[1]{\texttt{#1}} +\providecommand{\urlprefix}{URL } +\expandafter\ifx\csname urlstyle\endcsname\relax + \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else + \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup + \urlstyle{rm}\Url}\fi +\providecommand{\eprint}[2][]{\url{#2}} + +\bibitem[{Anderson and Darling(1952)}]{andersonAsymptoticTheoryCertain1952} +Anderson TW, Darling DA (1952). +\newblock \enquote{Asymptotic {{Theory}} of {{Certain}} "{{Goodness}} of + {{Fit}}" {{Criteria Based}} on {{Stochastic Processes}}.} +\newblock \emph{The Annals of Mathematical Statistics}, \textbf{23}(2), + 193--212. +\newblock ISSN 0003-4851. + +\bibitem[{Angus(1994)}]{angusProbabilityIntegralTransform1994} +Angus JE (1994). +\newblock \enquote{The {{Probability Integral Transform}} and {{Related + Results}}.} +\newblock \emph{SIAM Review}, \textbf{36}(4), 652--654. +\newblock ISSN 0036-1445. +\newblock \doi{10.1137/1036146}. + +\bibitem[{Bracher \emph{et~al.}(2021{\natexlab{a}})Bracher, Ray, Gneiting, and + Reich}]{bracherEvaluatingEpidemicForecasts2021} +Bracher J, Ray EL, Gneiting T, Reich NG (2021{\natexlab{a}}). +\newblock \enquote{Evaluating Epidemic Forecasts in an Interval Format.} +\newblock \emph{PLoS computational biology}, \textbf{17}(2), e1008618. +\newblock ISSN 1553-7358. +\newblock \doi{10.1371/journal.pcbi.1008618}. 
+ +\bibitem[{Bracher \emph{et~al.}(2021{\natexlab{b}})Bracher, Wolffram, Deuschel, + G{\"o}rgen, Ketterer, Ullrich, Abbott, Barbarossa, Bertsimas, Bhatia, Bodych, + Bosse, Burgard, Castro, Fairchild, Fuhrmann, Funk, Gogolewski, Gu, Heyder, + Hotz, Kheifetz, Kirsten, Krueger, Krymova, Li, Meinke, Michaud, + Niedzielewski, O{\.z}a{\'n}ski, Rakowski, Scholz, Soni, Srivastava, + Zieli{\'n}ski, Zou, Gneiting, and + Schienle}]{bracherShorttermForecastingCOVID192021} +Bracher J, Wolffram D, Deuschel J, G{\"o}rgen K, Ketterer JL, Ullrich A, Abbott + S, Barbarossa MV, Bertsimas D, Bhatia S, Bodych M, Bosse NI, Burgard JP, + Castro L, Fairchild G, Fuhrmann J, Funk S, Gogolewski K, Gu Q, Heyder S, Hotz + T, Kheifetz Y, Kirsten H, Krueger T, Krymova E, Li ML, Meinke JH, Michaud IJ, + Niedzielewski K, O{\.z}a{\'n}ski T, Rakowski F, Scholz M, Soni S, Srivastava + A, Zieli{\'n}ski J, Zou D, Gneiting T, Schienle M (2021{\natexlab{b}}). +\newblock \enquote{Short-Term Forecasting of {{COVID-19}} in {{Germany}} and + {{Poland}} during the Second Wave \textendash{} a Preregistered Study.} +\newblock \emph{medRxiv}, p. 2020.12.24.20248826. +\newblock \doi{10.1101/2020.12.24.20248826}. + +\bibitem[{Bracher \emph{et~al.}(2021{\natexlab{c}})Bracher, Wolffram, Deuschel, + G{\"o}rgen, Ketterer, Ullrich, Abbott, Barbarossa, Bertsimas, Bhatia, Bodych, + Bosse, Burgard, Fiedler, Fuhrmann, Funk, Gambin, Gogolewski, Heyder, Hotz, + Kheifetz, Kirsten, Krueger, Krymova, Leith{\"a}user, Li, Meinke, Miasojedow, + Mohring, Nouvellet, Nowosielski, Ozanski, Radwan, Rakowski, Scholz, Soni, + Srivastava, Gneiting, and Schienle}]{bracherNationalSubnationalShortterm2021} +Bracher J, Wolffram D, Deuschel J, G{\"o}rgen K, Ketterer JL, Ullrich A, Abbott + S, Barbarossa MV, Bertsimas D, Bhatia S, Bodych M, Bosse NI, Burgard JP, + Fiedler J, Fuhrmann J, Funk S, Gambin A, Gogolewski K, Heyder S, Hotz T, + Kheifetz Y, Kirsten H, Krueger T, Krymova E, Leith{\"a}user N, Li ML, Meinke + JH, Miasojedow B, Mohring J, Nouvellet P, Nowosielski JM, Ozanski T, Radwan + M, Rakowski F, Scholz M, Soni S, Srivastava A, Gneiting T, Schienle M + (2021{\natexlab{c}}). +\newblock \enquote{National and Subnational Short-Term Forecasting of + {{COVID-19}} in {{Germany}} and {{Poland}}, Early 2021.} +\newblock \doi{10.1101/2021.11.05.21265810}. + +\bibitem[{Brier(1950)}]{brierVERIFICATIONFORECASTSEXPRESSED1950} +Brier GW (1950). +\newblock \enquote{{{VERIFICATION OF FORECASTS EXPRESSED IN TERMS OF + PROBABILITY}}.} +\newblock \emph{Monthly Weather Review}, \textbf{78}(1), 1--3. +\newblock ISSN 1520-0493, 0027-0644. +\newblock \doi{10.1175/1520-0493(1950)078<0001:VOFEIT>2.0.CO;2}. 
+ +\bibitem[{Cramer \emph{et~al.}(2021)Cramer, Ray, Lopez, Bracher, Brennen, + Rivadeneira, Gerding, Gneiting, House, Huang, Jayawardena, Kanji, Khandelwal, + Le, M{\"u}hlemann, Niemi, Shah, Stark, Wang, Wattanachit, Zorn, Gu, Jain, + Bannur, Deva, Kulkarni, Merugu, Raval, Shingi, Tiwari, White, Woody, Dahan, + Fox, Gaither, Lachmann, Meyers, Scott, Tec, Srivastava, George, Cegan, + Dettwiller, England, Farthing, Hunter, Lafferty, Linkov, Mayo, Parno, + Rowland, Trump, Corsetti, Baer, Eisenberg, Falb, Huang, Martin, McCauley, + Myers, Schwarz, Sheldon, Gibson, Yu, Gao, Ma, Wu, Yan, Jin, Wang, Chen, Guo, + Zhao, Gu, Chen, Wang, Xu, Zhang, Zou, Biegel, Lega, Snyder, Wilson, + McConnell, Walraven, Shi, Ban, Hong, Kong, Turtle, {Ben-Nun}, Riley, Riley, + Koyluoglu, DesRoches, Hamory, Kyriakides, Leis, Milliken, Moloney, Morgan, + Ozcan, Schrader, Shakhnovich, Siegel, Spatz, Stiefeling, Wilkinson, Wong, + Gao, Bian, Cao, Ferres, Li, Liu, Xie, Zhang, Zheng, Vespignani, Chinazzi, + Davis, Mu, y~Piontti, Xiong, Zheng, Baek, Farias, Georgescu, Levi, Sinha, + Wilde, Penna, Celi, Sundar, Cavany, Espa{\~n}a, Moore, Oidtman, Perkins, + Osthus, Castro, Fairchild, Michaud, Karlen, Lee, Dent, Grantz, Kaminsky, + Kaminsky, Keegan, Lauer, Lemaitre, Lessler, Meredith, {Perez-Saez}, Shah, + Smith, Truelove, Wills, Kinsey, Obrecht, Tallaksen, Burant, Wang, Gao, Gu, + Kim, Li, Wang, Wang, Yu, Reiner, Barber, Gaikedu, Hay, Lim, Murray, Pigott, + Prakash, Adhikari, Cui, Rodr{\'i}guez, Tabassum, Xie, Keskinocak, Asplund, + Baxter, Oruc, Serban, Arik, Dusenberry, Epshteyn, Kanal, Le, Li, Pfister, + Sava, Sinha, Tsai, Yoder, Yoon, Zhang, Abbott, Bosse, Funk, Hellewel, Meakin, + Munday, Sherratt, Zhou, Kalantari, Yamana, Pei, Shaman, Ayer, Adee, Chhatwal, + Dalgic, Ladd, Linas, Mueller, Xiao, Li, Bertsimas, Lami, Soni, Bouardi, Wang, + Wang, Xie, Zeng, Green, Bien, Hu, Jahja, Narasimhan, Rajanala, Rumack, Simon, + Tibshirani, Tibshirani, Ventura, Wasserman, O'Dea, Drake, Pagano, Walker, + Slayton, Johansson, Biggerstaff, and + Reich}]{cramerEvaluationIndividualEnsemble2021} +Cramer E, Ray EL, Lopez VK, Bracher J, Brennen A, Rivadeneira AJC, Gerding A, + Gneiting T, House KH, Huang Y, Jayawardena D, Kanji AH, Khandelwal A, Le K, + M{\"u}hlemann A, Niemi J, Shah A, Stark A, Wang Y, Wattanachit N, Zorn MW, Gu + Y, Jain S, Bannur N, Deva A, Kulkarni M, Merugu S, Raval A, Shingi S, Tiwari + A, White J, Woody S, Dahan M, Fox S, Gaither K, Lachmann M, Meyers LA, Scott + JG, Tec M, Srivastava A, George GE, Cegan JC, Dettwiller ID, England WP, + Farthing MW, Hunter RH, Lafferty B, Linkov I, Mayo ML, Parno MD, Rowland MA, + Trump BD, Corsetti SM, Baer TM, Eisenberg MC, Falb K, Huang Y, Martin ET, + McCauley E, Myers RL, Schwarz T, Sheldon D, Gibson GC, Yu R, Gao L, Ma Y, Wu + D, Yan X, Jin X, Wang YX, Chen Y, Guo L, Zhao Y, Gu Q, Chen J, Wang L, Xu P, + Zhang W, Zou D, Biegel H, Lega J, Snyder TL, Wilson DD, McConnell S, Walraven + R, Shi Y, Ban X, Hong QJ, Kong S, Turtle JA, {Ben-Nun} M, Riley P, Riley S, + Koyluoglu U, DesRoches D, Hamory B, Kyriakides C, Leis H, Milliken J, Moloney + M, Morgan J, Ozcan G, Schrader C, Shakhnovich E, Siegel D, Spatz R, + Stiefeling C, Wilkinson B, Wong A, Gao Z, Bian J, Cao W, Ferres JL, Li C, Liu + TY, Xie X, Zhang S, Zheng S, Vespignani A, Chinazzi M, Davis JT, Mu K, + y~Piontti AP, Xiong X, Zheng A, Baek J, Farias V, Georgescu A, Levi R, Sinha + D, Wilde J, Penna ND, Celi LA, Sundar S, Cavany S, Espa{\~n}a G, Moore S, + Oidtman R, Perkins A, Osthus D, Castro L, Fairchild G, Michaud I, 
Karlen D, + Lee EC, Dent J, Grantz KH, Kaminsky J, Kaminsky K, Keegan LT, Lauer SA, + Lemaitre JC, Lessler J, Meredith HR, {Perez-Saez} J, Shah S, Smith CP, + Truelove SA, Wills J, Kinsey M, Obrecht RF, Tallaksen K, Burant JC, Wang L, + Gao L, Gu Z, Kim M, Li X, Wang G, Wang Y, Yu S, Reiner RC, Barber R, Gaikedu + E, Hay S, Lim S, Murray C, Pigott D, Prakash BA, Adhikari B, Cui J, + Rodr{\'i}guez A, Tabassum A, Xie J, Keskinocak P, Asplund J, Baxter A, Oruc + BE, Serban N, Arik SO, Dusenberry M, Epshteyn A, Kanal E, Le LT, Li CL, + Pfister T, Sava D, Sinha R, Tsai T, Yoder N, Yoon J, Zhang L, Abbott S, Bosse + NI, Funk S, Hellewel J, Meakin SR, Munday JD, Sherratt K, Zhou M, Kalantari + R, Yamana TK, Pei S, Shaman J, Ayer T, Adee M, Chhatwal J, Dalgic OO, Ladd + MA, Linas BP, Mueller P, Xiao J, Li ML, Bertsimas D, Lami OS, Soni S, Bouardi + HT, Wang Y, Wang Q, Xie S, Zeng D, Green A, Bien J, Hu AJ, Jahja M, + Narasimhan B, Rajanala S, Rumack A, Simon N, Tibshirani R, Tibshirani R, + Ventura V, Wasserman L, O'Dea EB, Drake JM, Pagano R, Walker JW, Slayton RB, + Johansson M, Biggerstaff M, Reich NG (2021). +\newblock \enquote{Evaluation of Individual and Ensemble Probabilistic + Forecasts of {{COVID-19}} Mortality in the {{US}}.} +\newblock \emph{medRxiv}, p. 2021.02.03.21250974. +\newblock \doi{10.1101/2021.02.03.21250974}. + +\bibitem[{Cramer \emph{et~al.}(2020)Cramer, Reich, Wang, Niemi, Hannan, House, + Gu, Xie, Horstman, {aniruddhadiga}, Walraven, {starkari}, Li, Gibson, Castro, + Karlen, Wattanachit, {jinghuichen}, {zyt9lsb}, {aagarwal1996}, Woody, Ray, + Xu, Biegel, GuidoEspana, X, Bracher, Lee, {har96}, and + {leyouz}}]{cramerCOVID19ForecastHub2020} +Cramer E, Reich NG, Wang SY, Niemi J, Hannan A, House K, Gu Y, Xie S, Horstman + S, {aniruddhadiga}, Walraven R, {starkari}, Li ML, Gibson G, Castro L, Karlen + D, Wattanachit N, {jinghuichen}, {zyt9lsb}, {aagarwal1996}, Woody S, Ray E, + Xu FT, Biegel H, GuidoEspana, X X, Bracher J, Lee E, {har96}, {leyouz} + (2020). +\newblock \enquote{{{COVID-19 Forecast Hub}}: 4 {{December}} 2020 Snapshot.} +\newblock \doi{10.5281/zenodo.3963371}. + +\bibitem[{Dawid(1984)}]{dawidPresentPositionPotential1984} +Dawid AP (1984). +\newblock \enquote{Present {{Position}} and {{Potential Developments}}: {{Some + Personal Views Statistical Theory}} the {{Prequential Approach}}.} +\newblock \emph{Journal of the Royal Statistical Society: Series A (General)}, + \textbf{147}(2), 278--290. +\newblock ISSN 2397-2327. +\newblock \doi{10.2307/2981683}. + +\bibitem[{Dawid and Sebastiani(1999)}]{dawidCoherentDispersionCriteria1999} +Dawid AP, Sebastiani P (1999). +\newblock \enquote{Coherent Dispersion Criteria for Optimal Experimental + Design.} +\newblock \emph{The Annals of Statistics}, \textbf{27}(1), 65--81. +\newblock ISSN 0090-5364, 2168-8966. +\newblock \doi{10.1214/aos/1018031101}. + +\bibitem[{Elliott and + Timmermann(2016)}]{elliottForecastingEconomicsFinance2016} +Elliott G, Timmermann A (2016). +\newblock \enquote{Forecasting in {{Economics}} and {{Finance}}.} +\newblock \emph{Annual Review of Economics}, \textbf{8}(1), 81--110. +\newblock \doi{10.1146/annurev-economics-080315-015346}. + +\bibitem[{Epstein(1969)}]{epsteinScoringSystemProbability1969} +Epstein ES (1969). +\newblock \enquote{A {{Scoring System}} for {{Probability Forecasts}} of + {{Ranked Categories}}.} +\newblock \emph{Journal of Applied Meteorology}, \textbf{8}(6), 985--987. +\newblock ISSN 0021-8952. +\newblock \doi{10.1175/1520-0450(1969)008<0985:ASSFPF>2.0.CO;2}. 
+ +\bibitem[{{European Covid-19 Forecast + Hub}(2021)}]{europeancovid-19forecasthubEuropeanCovid19Forecast2021} +{European Covid-19 Forecast Hub} (2021). +\newblock \enquote{European {{Covid-19 Forecast Hub}}.} +\newblock https://covid19forecasthub.eu/. + +\bibitem[{Funk \emph{et~al.}(2020)Funk, Abbott, Atkins, Baguelin, Baillie, + Birrell, Blake, Bosse, Burton, Carruthers, Davies, Angelis, Dyson, Edmunds, + Eggo, Ferguson, Gaythorpe, Gorsich, {Guyver-Fletcher}, Hellewell, Hill, + Holmes, House, Jewell, Jit, Jombart, Joshi, Keeling, Kendall, Knock, + Kucharski, Lythgoe, Meakin, Munday, Openshaw, Overton, Pagani, Pearson, + {Perez-Guzman}, Pellis, Scarabel, Semple, Sherratt, Tang, Tildesley, Leeuwen, + Whittles, Group, Team, and Investigators}]{funkShorttermForecastsInform2020} +Funk S, Abbott S, Atkins BD, Baguelin M, Baillie JK, Birrell P, Blake J, Bosse + NI, Burton J, Carruthers J, Davies NG, Angelis DD, Dyson L, Edmunds WJ, Eggo + RM, Ferguson NM, Gaythorpe K, Gorsich E, {Guyver-Fletcher} G, Hellewell J, + Hill EM, Holmes A, House TA, Jewell C, Jit M, Jombart T, Joshi I, Keeling MJ, + Kendall E, Knock ES, Kucharski AJ, Lythgoe KA, Meakin SR, Munday JD, Openshaw + PJM, Overton CE, Pagani F, Pearson J, {Perez-Guzman} PN, Pellis L, Scarabel + F, Semple MG, Sherratt K, Tang M, Tildesley MJ, Leeuwen EV, Whittles LK, + Group CCW, Team ICCR, Investigators I (2020). +\newblock \enquote{Short-Term Forecasts to Inform the Response to the + {{Covid-19}} Epidemic in the {{UK}}.} +\newblock \emph{medRxiv}, p. 2020.11.11.20220962. +\newblock \doi{10.1101/2020.11.11.20220962}. + +\bibitem[{Funk \emph{et~al.}(2019)Funk, Camacho, Kucharski, Lowe, Eggo, and + Edmunds}]{funkAssessingPerformanceRealtime2019} +Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ (2019). +\newblock \enquote{Assessing the Performance of Real-Time Epidemic Forecasts: + {{A}} Case Study of {{Ebola}} in the {{Western Area}} Region of {{Sierra + Leone}}, 2014-15.} +\newblock \emph{PLOS Computational Biology}, \textbf{15}(2), e1006785. +\newblock ISSN 1553-7358. +\newblock \doi{10.1371/journal.pcbi.1006785}. + +\bibitem[{Gelman \emph{et~al.}(2014)Gelman, Hwang, and + Vehtari}]{gelmanUnderstandingPredictiveInformation2014} +Gelman A, Hwang J, Vehtari A (2014). +\newblock \enquote{Understanding Predictive Information Criteria for + {{Bayesian}} Models.} +\newblock \emph{Statistics and Computing}, \textbf{24}(6), 997--1016. +\newblock ISSN 1573-1375. +\newblock \doi{10.1007/s11222-013-9416-2}. + +\bibitem[{Gneiting \emph{et~al.}(2007)Gneiting, Balabdaoui, and + Raftery}]{gneitingProbabilisticForecastsCalibration2007} +Gneiting T, Balabdaoui F, Raftery AE (2007). +\newblock \enquote{Probabilistic Forecasts, Calibration and Sharpness.} +\newblock \emph{Journal of the Royal Statistical Society: Series B (Statistical + Methodology)}, \textbf{69}(2), 243--268. +\newblock ISSN 1467-9868. +\newblock \doi{10.1111/j.1467-9868.2007.00587.x}. + +\bibitem[{Gneiting and Raftery(2005)}]{gneitingWeatherForecastingEnsemble2005} +Gneiting T, Raftery AE (2005). +\newblock \enquote{Weather {{Forecasting}} with {{Ensemble Methods}}.} +\newblock \emph{Science}, \textbf{310}(5746), 248--249. +\newblock ISSN 0036-8075, 1095-9203. +\newblock \doi{10.1126/science.1115255}. + +\bibitem[{Gneiting and Raftery(2007)}]{gneitingStrictlyProperScoring2007} +Gneiting T, Raftery AE (2007). 
+\newblock \enquote{Strictly {{Proper Scoring Rules}}, {{Prediction}}, and + {{Estimation}}.} +\newblock \emph{Journal of the American Statistical Association}, + \textbf{102}(477), 359--378. +\newblock ISSN 0162-1459, 1537-274X. +\newblock \doi{10.1198/016214506000001437}. + +\bibitem[{Good(1952)}]{goodRationalDecisions1952} +Good IJ (1952). +\newblock \enquote{Rational {{Decisions}}.} +\newblock \emph{Journal of the Royal Statistical Society. Series B + (Methodological)}, \textbf{14}(1), 107--114. +\newblock ISSN 0035-9246. + +\bibitem[{Hamill(2001)}]{hamillInterpretationRankHistograms2001a} +Hamill TM (2001). +\newblock \enquote{Interpretation of {{Rank Histograms}} for {{Verifying + Ensemble Forecasts}}.} +\newblock \emph{Monthly Weather Review}, \textbf{129}(3), 550--560. +\newblock ISSN 1520-0493, 0027-0644. +\newblock \doi{10.1175/1520-0493(2001)129<0550:IORHFV>2.0.CO;2}. + +\bibitem[{Hamner and Frasco(2018)}]{Metrics} +Hamner B, Frasco M (2018). +\newblock \emph{Metrics: Evaluation Metrics for Machine Learning}. +\newblock R package version 0.1.4, + \urlprefix\url{https://CRAN.R-project.org/package=Metrics}. + +\bibitem[{Jordan \emph{et~al.}(2019{\natexlab{a}})Jordan, Kr\"uger, and + Lerch}]{scoringRules} +Jordan A, Kr\"uger F, Lerch S (2019{\natexlab{a}}). +\newblock \enquote{Evaluating Probabilistic Forecasts with {scoringRules}.} +\newblock \emph{Journal of Statistical Software}, \textbf{90}(12), 1--37. +\newblock \doi{10.18637/jss.v090.i12}. + +\bibitem[{Jordan \emph{et~al.}(2019{\natexlab{b}})Jordan, Kr{\"u}ger, and + Lerch}]{jordanEvaluatingProbabilisticForecasts2019} +Jordan A, Kr{\"u}ger F, Lerch S (2019{\natexlab{b}}). +\newblock \enquote{Evaluating {{Probabilistic Forecasts}} with + {{{\textbf{scoringRules}}}}.} +\newblock \emph{Journal of Statistical Software}, \textbf{90}(12). +\newblock ISSN 1548-7660. +\newblock \doi{10.18637/jss.v090.i12}. + +\bibitem[{Jose(2009)}]{joseCharacterizationSphericalScoring2009} +Jose VR (2009). +\newblock \enquote{A {{Characterization}} for the {{Spherical Scoring Rule}}.} +\newblock \emph{Theory and Decision}, \textbf{66}(3), 263--281. +\newblock ISSN 1573-7187. +\newblock \doi{10.1007/s11238-007-9067-x}. + +\bibitem[{Kukkonen \emph{et~al.}(2012)Kukkonen, Olsson, Schultz, Baklanov, + Klein, Miranda, Monteiro, Hirtl, Tarvainen, Boy, Peuch, Poupkou, + Kioutsioukis, Finardi, Sofiev, Sokhi, Lehtinen, Karatzas, San~Jos{\'e}, + Astitha, Kallos, Schaap, Reimer, Jakobs, and + Eben}]{kukkonenReviewOperationalRegionalscale2012} +Kukkonen J, Olsson T, Schultz DM, Baklanov A, Klein T, Miranda AI, Monteiro A, + Hirtl M, Tarvainen V, Boy M, Peuch VH, Poupkou A, Kioutsioukis I, Finardi S, + Sofiev M, Sokhi R, Lehtinen KEJ, Karatzas K, San~Jos{\'e} R, Astitha M, + Kallos G, Schaap M, Reimer E, Jakobs H, Eben K (2012). +\newblock \enquote{A Review of Operational, Regional-Scale, Chemical Weather + Forecasting Models in {{Europe}}.} +\newblock \emph{Atmospheric Chemistry and Physics}, \textbf{12}(1), 1--87. +\newblock ISSN 1680-7316. +\newblock \doi{10.5194/acp-12-1-2012}. + +\bibitem[{Liboschik \emph{et~al.}(2017)Liboschik, Fokianos, and + Fried}]{tscount} +Liboschik T, Fokianos K, Fried R (2017). +\newblock \enquote{{tscount}: An {R} Package for Analysis of Count Time Series + Following Generalized Linear Models.} +\newblock \emph{Journal of Statistical Software}, \textbf{82}(5), 1--51. +\newblock \doi{10.18637/jss.v082.i05}. + +\bibitem[{Machete(2012)}]{macheteContrastingProbabilisticScoring2012} +Machete RL (2012). 
+\newblock \enquote{Contrasting {{Probabilistic Scoring Rules}}.} +\newblock \emph{arXiv:1112.4530 [math, stat]}. +\newblock \eprint{1112.4530}. + +\bibitem[{Mann and Whitney(1947)}]{mannTestWhetherOne1947} +Mann HB, Whitney DR (1947). +\newblock \enquote{On a {{Test}} of {{Whether}} One of {{Two Random Variables}} + Is {{Stochastically Larger}} than the {{Other}}.} +\newblock \emph{The Annals of Mathematical Statistics}, \textbf{18}(1), 50--60. +\newblock ISSN 0003-4851, 2168-8990. +\newblock \doi{10.1214/aoms/1177730491}. + +\bibitem[{Matheson and Winkler(1976)}]{mathesonScoringRulesContinuous1976} +Matheson JE, Winkler RL (1976). +\newblock \enquote{Scoring {{Rules}} for {{Continuous Probability + Distributions}}.} +\newblock \emph{Management Science}, \textbf{22}(10), 1087--1096. +\newblock ISSN 0025-1909. +\newblock \doi{10.1287/mnsc.22.10.1087}. + +\bibitem[{Murphy(1971)}]{murphyNoteRankedProbability1971a} +Murphy AH (1971). +\newblock \enquote{A {{Note}} on the {{Ranked Probability Score}}.} +\newblock \emph{Journal of Applied Meteorology and Climatology}, + \textbf{10}(1), 155--156. +\newblock ISSN 1520-0450. +\newblock \doi{10.1175/1520-0450(1971)010<0155:ANOTRP>2.0.CO;2}. + +\bibitem[{{R Core Team}(2021)}]{R} +{R Core Team} (2021). +\newblock \emph{R: A Language and Environment for Statistical Computing}. +\newblock R Foundation for Statistical Computing, Vienna, Austria. +\newblock \urlprefix\url{https://www.R-project.org/}. + +\bibitem[{Reich \emph{et~al.}(2019)Reich, Brooks, Fox, Kandula, McGowan, Moore, + Osthus, Ray, Tushar, Yamana, Biggerstaff, Johansson, Rosenfeld, and + Shaman}]{reichCollaborativeMultiyearMultimodel2019} +Reich NG, Brooks LC, Fox SJ, Kandula S, McGowan CJ, Moore E, Osthus D, Ray EL, + Tushar A, Yamana TK, Biggerstaff M, Johansson MA, Rosenfeld R, Shaman J + (2019). +\newblock \enquote{A Collaborative Multiyear, Multimodel Assessment of Seasonal + Influenza Forecasting in the {{United States}}.} +\newblock \emph{Proceedings of the National Academy of Sciences}, + \textbf{116}(8), 3146--3154. +\newblock ISSN 0027-8424, 1091-6490. +\newblock \doi{10.1073/pnas.1812594116}. + +\bibitem[{Timmermann(2018)}]{timmermannForecastingMethodsFinance2018} +Timmermann A (2018). +\newblock \enquote{Forecasting {{Methods}} in {{Finance}}.} +\newblock \emph{Annual Review of Financial Economics}, \textbf{10}(1), + 449--479. +\newblock \doi{10.1146/annurev-financial-110217-022713}. + +\bibitem[{Winkler \emph{et~al.}(1996)Winkler, Mu{\~n}oz, Cervera, Bernardo, + Blattenberger, Kadane, Lindley, Murphy, Oliver, and + {R{\'i}os-Insua}}]{winklerScoringRulesEvaluation1996} +Winkler RL, Mu{\~n}oz J, Cervera JL, Bernardo JM, Blattenberger G, Kadane JB, + Lindley DV, Murphy AH, Oliver RM, {R{\'i}os-Insua} D (1996). +\newblock \enquote{Scoring Rules and the Evaluation of Probabilities.} +\newblock \emph{Test}, \textbf{5}(1), 1--60. +\newblock ISSN 1863-8260. +\newblock \doi{10.1007/BF02562681}. + +\bibitem[{Yan(2016)}]{MLmetrics} +Yan Y (2016). +\newblock \emph{MLmetrics: Machine Learning Evaluation Metrics}. +\newblock R package version 1.1.1, + \urlprefix\url{https://CRAN.R-project.org/package=MLmetrics}. + +\bibitem[{Zeileis and Lang(2022)}]{topmodels} +Zeileis A, Lang MN (2022). +\newblock \emph{topmodels: Infrastructure for Inference and Forecasting in + Probabilistic Models}. +\newblock R package version 0.1-0/r1498, + \urlprefix\url{https://R-Forge.R-project.org/projects/topmodels/}. 
+ +\end{thebibliography} diff --git a/inst/manuscript/manuscript.blg b/inst/manuscript/manuscript.blg new file mode 100644 index 000000000..415ef448e --- /dev/null +++ b/inst/manuscript/manuscript.blg @@ -0,0 +1,48 @@ +This is BibTeX, Version 0.99d (TeX Live 2021/Arch Linux) +Capacity: max_strings=200000, hash_size=200000, hash_prime=170003 +The top-level auxiliary file: manuscript.aux +The style file: jss.bst +Database file #1: references.bib +Database file #2: scoringutils-paper.bib +Reallocated wiz_functions (elt_size=8) to 6000 items from 3000. +You've used 37 entries, + 3215 wiz_defined-function locations, + 955 strings with 22567 characters, +and the built_in function-call counts, 111867 in all, are: += -- 11765 +> -- 4208 +< -- 25 ++ -- 1591 +- -- 1540 +* -- 8786 +:= -- 12253 +add.period$ -- 195 +call.type$ -- 37 +change.case$ -- 664 +chr.to.int$ -- 38 +cite$ -- 37 +duplicate$ -- 11835 +empty$ -- 7374 +format.name$ -- 1642 +if$ -- 22611 +int.to.chr$ -- 4 +int.to.str$ -- 1 +missing$ -- 812 +newline$ -- 247 +num.names$ -- 148 +pop$ -- 6114 +preamble$ -- 1 +purify$ -- 661 +quote$ -- 0 +skip$ -- 1225 +stack$ -- 0 +substring$ -- 14481 +swap$ -- 1972 +text.length$ -- 3 +text.prefix$ -- 0 +top$ -- 0 +type$ -- 333 +warning$ -- 0 +while$ -- 675 +width$ -- 0 +write$ -- 589 diff --git a/inst/manuscript/manuscript.out b/inst/manuscript/manuscript.out new file mode 100644 index 000000000..7ef6d15ba --- /dev/null +++ b/inst/manuscript/manuscript.out @@ -0,0 +1,26 @@ +\BOOKMARK [1][-]{Section.0.Introduction.1}{\376\377\000I\000n\000t\000r\000o\000d\000u\000c\000t\000i\000o\000n}{}% 1 +\BOOKMARK [2][-]{Subsection.1.0.Forecast types and forecast formats.2}{\376\377\000F\000o\000r\000e\000c\000a\000s\000t\000\040\000t\000y\000p\000e\000s\000\040\000a\000n\000d\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000\040\000f\000o\000r\000m\000a\000t\000s}{Section.0.Introduction.1}% 2 +\BOOKMARK [2][-]{Subsection.1.1.The Forecasting paradigm.2}{\376\377\000T\000h\000e\000\040\000F\000o\000r\000e\000c\000a\000s\000t\000i\000n\000g\000\040\000p\000a\000r\000a\000d\000i\000g\000m}{Section.0.Introduction.1}% 3 +\BOOKMARK [1][-]{Section.1.Scoring metrics implemented in scoringutils.1}{\376\377\000S\000c\000o\000r\000i\000n\000g\000\040\000m\000e\000t\000r\000i\000c\000s\000\040\000i\000m\000p\000l\000e\000m\000e\000n\000t\000e\000d\000\040\000i\000n\000\040\000s\000c\000o\000r\000i\000n\000g\000u\000t\000i\000l\000s}{}% 4 +\BOOKMARK [2][-]{Subsection.2.0.Assessing calibration.2}{\376\377\000A\000s\000s\000e\000s\000s\000i\000n\000g\000\040\000c\000a\000l\000i\000b\000r\000a\000t\000i\000o\000n}{Section.1.Scoring metrics implemented in scoringutils.1}% 5 +\BOOKMARK [3][-]{Subsubsection.2.1.0.Probabilistic calibration.3}{\376\377\000P\000r\000o\000b\000a\000b\000i\000l\000i\000s\000t\000i\000c\000\040\000c\000a\000l\000i\000b\000r\000a\000t\000i\000o\000n}{Subsection.2.0.Assessing calibration.2}% 6 +\BOOKMARK [3][-]{Subsubsection.2.1.1.Bias.3}{\376\377\000B\000i\000a\000s}{Subsection.2.0.Assessing calibration.2}% 7 +\BOOKMARK [2][-]{Subsection.2.1.Assessing sharpness.2}{\376\377\000A\000s\000s\000e\000s\000s\000i\000n\000g\000\040\000s\000h\000a\000r\000p\000n\000e\000s\000s}{Section.1.Scoring metrics implemented in scoringutils.1}% 8 +\BOOKMARK [2][-]{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and 
DSS).2}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000s\000\040\000f\000o\000r\000\040\000s\000a\000m\000p\000l\000e\000-\000b\000a\000s\000e\000d\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000\050\000C\000R\000P\000S\000,\000\040\000l\000o\000g\000\040\000s\000c\000o\000r\000e\000\040\000a\000n\000d\000\040\000D\000S\000S\000\051}{Section.1.Scoring metrics implemented in scoringutils.1}% 9 +\BOOKMARK [3][-]{Subsubsection.2.3.0.Estimation details and the number of samples required for accurate scoring.3}{\376\377\000E\000s\000t\000i\000m\000a\000t\000i\000o\000n\000\040\000d\000e\000t\000a\000i\000l\000s\000\040\000a\000n\000d\000\040\000t\000h\000e\000\040\000n\000u\000m\000b\000e\000r\000\040\000o\000f\000\040\000s\000a\000m\000p\000l\000e\000s\000\040\000r\000e\000q\000u\000i\000r\000e\000d\000\040\000f\000o\000r\000\040\000a\000c\000c\000u\000r\000a\000t\000e\000\040\000s\000c\000o\000r\000i\000n\000g}{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}% 10 +\BOOKMARK [3][-]{Subsubsection.2.3.1.Overconfidence, underconfidence and outliers.3}{\376\377\000O\000v\000e\000r\000c\000o\000n\000f\000i\000d\000e\000n\000c\000e\000,\000\040\000u\000n\000d\000e\000r\000c\000o\000n\000f\000i\000d\000e\000n\000c\000e\000\040\000a\000n\000d\000\040\000o\000u\000t\000l\000i\000e\000r\000s}{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}% 11 +\BOOKMARK [3][-]{Subsubsection.2.3.2.Sensitivity to distance - local vs. global scores.3}{\376\377\000S\000e\000n\000s\000i\000t\000i\000v\000i\000t\000y\000\040\000t\000o\000\040\000d\000i\000s\000t\000a\000n\000c\000e\000\040\000-\000\040\000l\000o\000c\000a\000l\000\040\000v\000s\000.\000\040\000g\000l\000o\000b\000a\000l\000\040\000s\000c\000o\000r\000e\000s}{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}% 12 +\BOOKMARK [3][-]{Subsubsection.2.3.3.Sensitivity to the order of magnitude of the forecast quantity.3}{\376\377\000S\000e\000n\000s\000i\000t\000i\000v\000i\000t\000y\000\040\000t\000o\000\040\000t\000h\000e\000\040\000o\000r\000d\000e\000r\000\040\000o\000f\000\040\000m\000a\000g\000n\000i\000t\000u\000d\000e\000\040\000o\000f\000\040\000t\000h\000e\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000\040\000q\000u\000a\000n\000t\000i\000t\000y}{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}% 13 +\BOOKMARK [2][-]{Subsection.2.3.Proper scoring rule for quantile-based forecasts (WIS).2}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000\040\000f\000o\000r\000\040\000q\000u\000a\000n\000t\000i\000l\000e\000-\000b\000a\000s\000e\000d\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000\050\000W\000I\000S\000\051}{Section.1.Scoring metrics implemented in scoringutils.1}% 14 +\BOOKMARK [2][-]{Subsection.2.4.Proper scoring rules for binary outcomes (BS and log score).2}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000s\000\040\000f\000o\000r\000\040\000b\000i\000n\000a\000r\000y\000\040\000o\000u\000t\000c\000o\000m\000e\000s\000\040\000\050\000B\000S\000\040\000a\000n\000d\000\040\000l\000o\000g\000\040\000s\000c\000o\000r\000e\000\051}{Section.1.Scoring metrics implemented in scoringutils.1}% 15 +\BOOKMARK [2][-]{Subsection.2.5.Pairwise 
comparisons.2}{\376\377\000P\000a\000i\000r\000w\000i\000s\000e\000\040\000c\000o\000m\000p\000a\000r\000i\000s\000o\000n\000s}{Section.1.Scoring metrics implemented in scoringutils.1}% 16 +\BOOKMARK [1][-]{Section.2.Evaluating forecasts using scoringutils.1}{\376\377\000E\000v\000a\000l\000u\000a\000t\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000u\000s\000i\000n\000g\000\040\000s\000c\000o\000r\000i\000n\000g\000u\000t\000i\000l\000s}{}% 17 +\BOOKMARK [2][-]{Subsection.3.0.Example data.2}{\376\377\000E\000x\000a\000m\000p\000l\000e\000\040\000d\000a\000t\000a}{Section.2.Evaluating forecasts using scoringutils.1}% 18 +\BOOKMARK [2][-]{Subsection.3.1.Expected input formats and data checking.2}{\376\377\000E\000x\000p\000e\000c\000t\000e\000d\000\040\000i\000n\000p\000u\000t\000\040\000f\000o\000r\000m\000a\000t\000s\000\040\000a\000n\000d\000\040\000d\000a\000t\000a\000\040\000c\000h\000e\000c\000k\000i\000n\000g}{Section.2.Evaluating forecasts using scoringutils.1}% 19 +\BOOKMARK [2][-]{Subsection.3.2.Visualising forecast data.2}{\376\377\000V\000i\000s\000u\000a\000l\000i\000s\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000\040\000d\000a\000t\000a}{Section.2.Evaluating forecasts using scoringutils.1}% 20 +\BOOKMARK [2][-]{Subsection.3.3.Scoring forecasts with score().2}{\376\377\000S\000c\000o\000r\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000w\000i\000t\000h\000\040\000s\000c\000o\000r\000e\000\050\000\051}{Section.2.Evaluating forecasts using scoringutils.1}% 21 +\BOOKMARK [2][-]{Subsection.3.4.Pairwise comparisons.2}{\376\377\000P\000a\000i\000r\000w\000i\000s\000e\000\040\000c\000o\000m\000p\000a\000r\000i\000s\000o\000n\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 22 +\BOOKMARK [2][-]{Subsection.3.5.Model diagnostics.2}{\376\377\000M\000o\000d\000e\000l\000\040\000d\000i\000a\000g\000n\000o\000s\000t\000i\000c\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 23 +\BOOKMARK [2][-]{Subsection.3.6.Summary and discussion.2}{\376\377\000S\000u\000m\000m\000a\000r\000y\000\040\000a\000n\000d\000\040\000d\000i\000s\000c\000u\000s\000s\000i\000o\000n}{Section.2.Evaluating forecasts using scoringutils.1}% 24 +\BOOKMARK [2][-]{Subsection.3.7.Acknowledgments.2}{\376\377\000A\000c\000k\000n\000o\000w\000l\000e\000d\000g\000m\000e\000n\000t\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 25 +\BOOKMARK [1][-]{subsection.3.8}{\376\377\000\050\000A\000P\000P\000E\000N\000D\000I\000X\000\051\000\040\000S\000u\000p\000p\000l\000e\000m\000e\000n\000t\000a\000r\000y\000\040\000i\000n\000f\000o\000r\000m\000a\000t\000i\000o\000n}{}% 26 diff --git a/inst/manuscript/plots/calibration-diagnostic-examples.Rda b/inst/manuscript/plots/calibration-diagnostic-examples.Rda new file mode 100644 index 000000000..193566bb3 Binary files /dev/null and b/inst/manuscript/plots/calibration-diagnostic-examples.Rda differ diff --git a/inst/manuscript/plots/calibration-diagnostic-examples.png b/inst/manuscript/plots/calibration-diagnostic-examples.png new file mode 100644 index 000000000..b3bcefd44 Binary files /dev/null and b/inst/manuscript/plots/calibration-diagnostic-examples.png differ diff --git a/inst/manuscript/plots/calibration-illustration.png b/inst/manuscript/plots/calibration-illustration.png new file mode 100644 index 000000000..a3047230e Binary files /dev/null and b/inst/manuscript/plots/calibration-illustration.png differ diff --git 
a/inst/manuscript/plots/calibration-sharpness-illustration.png b/inst/manuscript/plots/calibration-sharpness-illustration.png new file mode 100644 index 000000000..e7909e918 Binary files /dev/null and b/inst/manuscript/plots/calibration-sharpness-illustration.png differ diff --git a/inst/manuscript/plots/forecast-paradigm.png b/inst/manuscript/plots/forecast-paradigm.png new file mode 100644 index 000000000..d898710bf Binary files /dev/null and b/inst/manuscript/plots/forecast-paradigm.png differ diff --git a/inst/manuscript/plots/illustration-effect-scale-sim.png b/inst/manuscript/plots/illustration-effect-scale-sim.png new file mode 100644 index 000000000..18461b336 Binary files /dev/null and b/inst/manuscript/plots/illustration-effect-scale-sim.png differ diff --git a/inst/manuscript/plots/illustration-effect-scale.png b/inst/manuscript/plots/illustration-effect-scale.png new file mode 100644 index 000000000..fc6ce9692 Binary files /dev/null and b/inst/manuscript/plots/illustration-effect-scale.png differ diff --git a/inst/manuscript/plots/relation-to-scale-example.Rda b/inst/manuscript/plots/relation-to-scale-example.Rda new file mode 100644 index 000000000..ee3132fa5 Binary files /dev/null and b/inst/manuscript/plots/relation-to-scale-example.Rda differ diff --git a/inst/manuscript/plots/sample-convergence.png b/inst/manuscript/plots/sample-convergence.png new file mode 100644 index 000000000..64db7d163 Binary files /dev/null and b/inst/manuscript/plots/sample-convergence.png differ diff --git a/inst/manuscript/plots/score-deviation-sd-mu.png b/inst/manuscript/plots/score-deviation-sd-mu.png new file mode 100644 index 000000000..5b146bb5a Binary files /dev/null and b/inst/manuscript/plots/score-deviation-sd-mu.png differ diff --git a/inst/manuscript/plots/score-locality.png b/inst/manuscript/plots/score-locality.png new file mode 100644 index 000000000..e612188ba Binary files /dev/null and b/inst/manuscript/plots/score-locality.png differ diff --git a/inst/manuscript/plots/sharpness-illustration.png b/inst/manuscript/plots/sharpness-illustration.png new file mode 100644 index 000000000..3f6525bfe Binary files /dev/null and b/inst/manuscript/plots/sharpness-illustration.png differ diff --git a/inst/manuscript/references.bib b/inst/manuscript/references.bib new file mode 100644 index 000000000..d24300c50 --- /dev/null +++ b/inst/manuscript/references.bib @@ -0,0 +1,55 @@ + @Article{scoringRules, + title = {Evaluating Probabilistic Forecasts with {scoringRules}}, + author = {Alexander Jordan and Fabian Kr\"uger and Sebastian Lerch}, + journal = {Journal of Statistical Software}, + year = {2019}, + volume = {90}, + number = {12}, + pages = {1--37}, + doi = {10.18637/jss.v090.i12}, + } + + @Manual{Metrics, + title = {Metrics: Evaluation Metrics for Machine Learning}, + author = {Ben Hamner and Michael Frasco}, + year = {2018}, + note = {R package version 0.1.4}, + url = {https://CRAN.R-project.org/package=Metrics}, + } + + @Manual{MLmetrics, + title = {MLmetrics: Machine Learning Evaluation Metrics}, + author = {Yachen Yan}, + year = {2016}, + note = {R package version 1.1.1}, + url = {https://CRAN.R-project.org/package=MLmetrics}, + } + + @Manual{R, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2021}, + url = {https://www.R-project.org/}, + } + + @Manual{topmodels, + title = {topmodels: Infrastructure for Inference and Forecasting in 
Probabilistic +Models}, + author = {Achim Zeileis and Moritz N. Lang}, + year = {2022}, + note = {R package version 0.1-0/r1498}, + url = {https://R-Forge.R-project.org/projects/topmodels/}, + } + + @Article{tscount, + title = {{tscount}: An {R} Package for Analysis of Count Time Series Following Generalized Linear Models}, + author = {Tobias Liboschik and Konstantinos Fokianos and Roland Fried}, + journal = {Journal of Statistical Software}, + year = {2017}, + volume = {82}, + number = {5}, + pages = {1--51}, + doi = {10.18637/jss.v082.i05}, + } diff --git a/inst/manuscript/scoringutils-paper.bib b/inst/manuscript/scoringutils-paper.bib new file mode 100644 index 000000000..e823df066 --- /dev/null +++ b/inst/manuscript/scoringutils-paper.bib @@ -0,0 +1,593 @@ + +@article{andersonAsymptoticTheoryCertain1952, + title = {Asymptotic {{Theory}} of {{Certain}} "{{Goodness}} of {{Fit}}" {{Criteria Based}} on {{Stochastic Processes}}}, + author = {Anderson, T. W. and Darling, D. A.}, + year = {1952}, + journal = {The Annals of Mathematical Statistics}, + volume = {23}, + number = {2}, + pages = {193--212}, + publisher = {{Institute of Mathematical Statistics}}, + issn = {0003-4851}, + abstract = {The statistical problem treated is that of testing the hypothesis that n independent, identically distributed random variables have a specified continuous distribution function F(x). If Fn(x) is the empirical cumulative distribution function and {$\psi$}(t) is some nonnegative weight function (0 {$\leq$} t {$\leq$} 1), we consider \$n\^\{\textbackslash frac\{1\}\{2\}\} \textbackslash sup\_\{-\textbackslash infty\vphantom\}} +} + +@article{angusProbabilityIntegralTransform1994, + title = {The {{Probability Integral Transform}} and {{Related Results}}}, + author = {Angus, John E.}, + year = {1994}, + month = dec, + journal = {SIAM Review}, + volume = {36}, + number = {4}, + pages = {652--654}, + publisher = {{Society for Industrial and Applied Mathematics}}, + issn = {0036-1445}, + doi = {10.1137/1036146}, + abstract = {A simple proof of the probability integral transform theorem in probability and statistics is given that depends only on probabilistic concepts and elementary properties of continuous functions. This proof yields the theorem in its fullest generality. A similar theorem that forms the basis for the inverse method of random number generation is also discussed and contrasted to the probability integral transform theorem. Typical applications are discussed. Despite their generality and far reaching consequences, these theorems are remarkable in their simplicity and ease of proof.}, + file = {/mnt/data/github-synced/zotero-nikos/storage/8K3YQL5Q/1036146.html} +} + +@article{bracherEvaluatingEpidemicForecasts2021, + title = {Evaluating Epidemic Forecasts in an Interval Format}, + author = {Bracher, Johannes and Ray, Evan L. and Gneiting, Tilmann and Reich, Nicholas G.}, + year = {2021}, + month = feb, + journal = {PLoS computational biology}, + volume = {17}, + number = {2}, + pages = {e1008618}, + issn = {1553-7358}, + doi = {10.1371/journal.pcbi.1008618}, + abstract = {For practical reasons, many forecasts of case, hospitalization, and death counts in the context of the current Coronavirus Disease 2019 (COVID-19) pandemic are issued in the form of central predictive intervals at various levels. This is also the case for the forecasts collected in the COVID-19 Forecast Hub (https://covid19forecasthub.org/). 
Forecast evaluation metrics like the logarithmic score, which has been applied in several infectious disease forecasting challenges, are then not available as they require full predictive distributions. This article provides an overview of how established methods for the evaluation of quantile and interval forecasts can be applied to epidemic forecasts in this format. Specifically, we discuss the computation and interpretation of the weighted interval score, which is a proper score that approximates the continuous ranked probability score. It can be interpreted as a generalization of the absolute error to probabilistic forecasts and allows for a decomposition into a measure of sharpness and penalties for over- and underprediction.}, + langid = {english}, + pmcid = {PMC7880475}, + pmid = {33577550}, + keywords = {Communicable Diseases,COVID-19,Forecasting,Humans,Pandemics,Probability,SARS-CoV-2}, + file = {/mnt/data/github-synced/zotero-nikos/storage/EX37R6J8/Bracher et al. - 2021 - Evaluating epidemic forecasts in an interval forma.pdf} +} + +@misc{bracherNationalSubnationalShortterm2021, + title = {National and Subnational Short-Term Forecasting of {{COVID-19}} in {{Germany}} and {{Poland}}, Early 2021}, + author = {Bracher, Johannes and Wolffram, Daniel and Deuschel, J. and G{\"o}rgen, K. and Ketterer, J. L. and Ullrich, A. and Abbott, S. and Barbarossa, M. V. and Bertsimas, D. and Bhatia, S. and Bodych, M. and Bosse, N. I. and Burgard, J. P. and Fiedler, J. and Fuhrmann, J. and Funk, S. and Gambin, A. and Gogolewski, K. and Heyder, S. and Hotz, T. and Kheifetz, Y. and Kirsten, H. and Krueger, T. and Krymova, E. and Leith{\"a}user, N. and Li, M. L. and Meinke, J. H. and Miasojedow, B. and Mohring, J. and Nouvellet, P. and Nowosielski, J. M. and Ozanski, T. and Radwan, M. and Rakowski, F. and Scholz, M. and Soni, S. and Srivastava, A. and Gneiting, T. and Schienle, M.}, + year = {2021}, + month = nov, + pages = {2021.11.05.21265810}, + institution = {{Cold Spring Harbor Laboratory Press}}, + doi = {10.1101/2021.11.05.21265810}, + abstract = {We report on the second and final part of a pre-registered forecasting study on COVID-19 cases and deaths in Germany and Poland. Fifteen independent research teams provided forecasts at lead times of one through four weeks from January through mid-April 2021. Compared to the first part (October\textendash December 2020), the number of participating teams increased, and a number of teams started providing subnational-level forecasts. The addressed time period is characterized by rather stable non-pharmaceutical interventions in both countries, making short-term predictions more straightforward than in the first part of our study. In both countries, case counts declined initially, before rebounding due to the rise of the B.1.1.7 variant. Deaths declined through most of the study period in Germany while in Poland they increased after a prolonged plateau. Many, though not all, models outperformed a simple baseline model up to four weeks ahead, with ensemble methods showing very good relative performance. Major trend changes in reported cases, however, remained challenging to predict.}, + copyright = {\textcopyright{} 2021, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution-NonCommercial 4.0 International), CC BY-NC 4.0, as described at http://creativecommons.org/licenses/by-nc/4.0/}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/DXDPZ8TC/Bracher et al. 
- 2021 - National and subnational short-term forecasting of.pdf;/mnt/data/github-synced/zotero-nikos/storage/Y7YB8TYG/2021.11.05.html} +} + +@article{bracherShorttermForecastingCOVID192021, + title = {Short-Term Forecasting of {{COVID-19}} in {{Germany}} and {{Poland}} during the Second Wave \textendash{} a Preregistered Study}, + author = {Bracher, Johannes and Wolffram, Daniel and Deuschel, J. and G{\"o}rgen, K. and Ketterer, J. L. and Ullrich, A. and Abbott, S. and Barbarossa, M. V. and Bertsimas, D. and Bhatia, S. and Bodych, M. and Bosse, Nikos I. and Burgard, J. P. and Castro, L. and Fairchild, G. and Fuhrmann, J. and Funk, S. and Gogolewski, K. and Gu, Q. and Heyder, S. and Hotz, T. and Kheifetz, Y. and Kirsten, H. and Krueger, T. and Krymova, E. and Li, M. L. and Meinke, J. H. and Michaud, I. J. and Niedzielewski, K. and O{\.z}a{\'n}ski, T. and Rakowski, F. and Scholz, M. and Soni, S. and Srivastava, A. and Zieli{\'n}ski, J. and Zou, D. and Gneiting, T. and Schienle, M.}, + year = {2021}, + month = jan, + journal = {medRxiv}, + pages = {2020.12.24.20248826}, + publisher = {{Cold Spring Harbor Laboratory Press}}, + doi = {10.1101/2020.12.24.20248826}, + abstract = {{$<$}h3{$>$}Abstract{$<$}/h3{$>$} {$<$}p{$>$}We report insights from ten weeks of collaborative COVID-19 forecasting for Germany and Poland (12 October \textendash{} 19 December 2020). The study period covers the onset of the second wave in both countries, with tightening non-pharmaceutical interventions (NPIs) and subsequently a decay (Poland) or plateau and renewed increase (Germany) in reported cases. Thirteen independent teams provided probabilistic real-time forecasts of COVID-19 cases and deaths. These were reported for lead times of one to four weeks, with evaluation focused on one- and two-week horizons, which are less affected by changing NPIs. Heterogeneity between forecasts was considerable both in terms of point predictions and forecast spread. Ensemble forecasts showed good relative performance, in particular in terms of coverage, but did not clearly dominate single-model predictions. The study was preregistered and will be followed up in future phases of the pandemic.{$<$}/p{$>$}}, + copyright = {\textcopyright{} 2021, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution-NonCommercial 4.0 International), CC BY-NC 4.0, as described at http://creativecommons.org/licenses/by-nc/4.0/}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/BHPBLCD9/Bracher et al. 
- 2021 - Short-term forecasting of COVID-19 in Germany and .pdf;/mnt/data/github-synced/zotero-nikos/storage/I3ULULUZ/2020.12.24.20248826v2.html} +} + +@article{brierVERIFICATIONFORECASTSEXPRESSED1950, + title = {{{VERIFICATION OF FORECASTS EXPRESSED IN TERMS OF PROBABILITY}}}, + author = {Brier, Glenn W.}, + year = {1950}, + month = jan, + journal = {Monthly Weather Review}, + volume = {78}, + number = {1}, + pages = {1--3}, + publisher = {{American Meteorological Society}}, + issn = {1520-0493, 0027-0644}, + doi = {10.1175/1520-0493(1950)078<0001:VOFEIT>2.0.CO;2}, + abstract = {Abstract No Abstract Available.}, + chapter = {Monthly Weather Review}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/ZCBG3Z38/Brier - 1950 - VERIFICATION OF FORECASTS EXPRESSED IN TERMS OF PR.pdf;/mnt/data/github-synced/zotero-nikos/storage/I83583N3/1520-0493_1950_078_0001_vofeit_2_0_co_2.html} +} + +@article{brockerDecompositionsProperScores, + title = {Decompositions of {{Proper Scores}}}, + author = {Brocker, Jochen}, + pages = {9}, + abstract = {Scoring rules are an important tool for evaluating the performance of probabilistic forecasts. A popular example is the Brier score, which allows for a decomposition into terms related to the sharpness (or information content) and to the reliability of the forecast. This feature renders the Brier score a very intuitive measure of forecast quality. In this paper, it is demonstrated that all strictly proper scoring rules allow for a similar decomposition into reliability and sharpness related terms. This finding underpins the importance of proper scores and yields further credence to the practice of measuring forecast quality by proper scores. Furthermore, the effect of averaging multiple probabilistic forecasts on the score is discussed. It is well known that the Brier score of a mixture of several forecasts is never worse that the average score of the individual forecasts. This property hinges on the convexity of the Brier score, a property not universal among proper scores. Arguably, this phenomenon portends epistemological questions which require clarification.}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/P9IB98P5/Brocker - Decompositions of Proper Scores.pdf} +} + +@misc{cramerCOVID19ForecastHub2020, + title = {{{COVID-19 Forecast Hub}}: 4 {{December}} 2020 Snapshot}, + shorttitle = {{{COVID-19 Forecast Hub}}}, + author = {Cramer, Estee and Nicholas G Reich and Serena Yijin Wang and Jarad Niemi and Abdul Hannan and Katie House and Youyang Gu and Shanghong Xie and Steve Horstman and {aniruddhadiga} and Robert Walraven and {starkari} and Michael Lingzhi Li and Graham Gibson and Lauren Castro and Dean Karlen and Nutcha Wattanachit and {jinghuichen} and {zyt9lsb} and {aagarwal1996} and Spencer Woody and Evan Ray and Frost Tianjian Xu and Hannah Biegel and GuidoEspana and Xinyue X and Johannes Bracher and Elizabeth Lee and {har96} and {leyouz}}, + year = {2020}, + month = dec, + publisher = {{Zenodo}}, + doi = {10.5281/zenodo.3963371}, + abstract = {This update to the COVID-19 Forecast Hub repository is a snapshot as of 4 December 2020 of the data hosted by and visualized at~https://covid19forecasthub.org/.}, + file = {/mnt/data/github-synced/zotero-nikos/storage/AVWA2UPE/4305938.html} +} + +@article{cramerEvaluationIndividualEnsemble2021, + title = {Evaluation of Individual and Ensemble Probabilistic Forecasts of {{COVID-19}} Mortality in the {{US}}}, + author = {Cramer, Estee and Ray, Evan L. 
and Lopez, Velma K. and Bracher, Johannes and Brennen, Andrea and Rivadeneira, Alvaro J. Castro and Gerding, Aaron and Gneiting, Tilmann and House, Katie H. and Huang, Yuxin and Jayawardena, Dasuni and Kanji, Abdul H. and Khandelwal, Ayush and Le, Khoa and M{\"u}hlemann, Anja and Niemi, Jarad and Shah, Apurv and Stark, Ariane and Wang, Yijin and Wattanachit, Nutcha and Zorn, Martha W. and Gu, Youyang and Jain, Sansiddh and Bannur, Nayana and Deva, Ayush and Kulkarni, Mihir and Merugu, Srujana and Raval, Alpan and Shingi, Siddhant and Tiwari, Avtansh and White, Jerome and Woody, Spencer and Dahan, Maytal and Fox, Spencer and Gaither, Kelly and Lachmann, Michael and Meyers, Lauren Ancel and Scott, James G. and Tec, Mauricio and Srivastava, Ajitesh and George, Glover E. and Cegan, Jeffrey C. and Dettwiller, Ian D. and England, William P. and Farthing, Matthew W. and Hunter, Robert H. and Lafferty, Brandon and Linkov, Igor and Mayo, Michael L. and Parno, Matthew D. and Rowland, Michael A. and Trump, Benjamin D. and Corsetti, Sabrina M. and Baer, Thomas M. and Eisenberg, Marisa C. and Falb, Karl and Huang, Yitao and Martin, Emily T. and McCauley, Ella and Myers, Robert L. and Schwarz, Tom and Sheldon, Daniel and Gibson, Graham Casey and Yu, Rose and Gao, Liyao and Ma, Yian and Wu, Dongxia and Yan, Xifeng and Jin, Xiaoyong and Wang, Yu-Xiang and Chen, YangQuan and Guo, Lihong and Zhao, Yanting and Gu, Quanquan and Chen, Jinghui and Wang, Lingxiao and Xu, Pan and Zhang, Weitong and Zou, Difan and Biegel, Hannah and Lega, Joceline and Snyder, Timothy L. and Wilson, Davison D. and McConnell, Steve and Walraven, Robert and Shi, Yunfeng and Ban, Xuegang and Hong, Qi-Jun and Kong, Stanley and Turtle, James A. and {Ben-Nun}, Michal and Riley, Pete and Riley, Steven and Koyluoglu, Ugur and DesRoches, David and Hamory, Bruce and Kyriakides, Christina and Leis, Helen and Milliken, John and Moloney, Michael and Morgan, James and Ozcan, Gokce and Schrader, Chris and Shakhnovich, Elizabeth and Siegel, Daniel and Spatz, Ryan and Stiefeling, Chris and Wilkinson, Barrie and Wong, Alexander and Gao, Zhifeng and Bian, Jiang and Cao, Wei and Ferres, Juan Lavista and Li, Chaozhuo and Liu, Tie-Yan and Xie, Xing and Zhang, Shun and Zheng, Shun and Vespignani, Alessandro and Chinazzi, Matteo and Davis, Jessica T. and Mu, Kunpeng and y Piontti, Ana Pastore and Xiong, Xinyue and Zheng, Andrew and Baek, Jackie and Farias, Vivek and Georgescu, Andreea and Levi, Retsef and Sinha, Deeksha and Wilde, Joshua and Penna, Nicolas D. and Celi, Leo A. and Sundar, Saketh and Cavany, Sean and Espa{\~n}a, Guido and Moore, Sean and Oidtman, Rachel and Perkins, Alex and Osthus, Dave and Castro, Lauren and Fairchild, Geoffrey and Michaud, Isaac and Karlen, Dean and Lee, Elizabeth C. and Dent, Juan and Grantz, Kyra H. and Kaminsky, Joshua and Kaminsky, Kathryn and Keegan, Lindsay T. and Lauer, Stephen A. and Lemaitre, Joseph C. and Lessler, Justin and Meredith, Hannah R. and {Perez-Saez}, Javier and Shah, Sam and Smith, Claire P. and Truelove, Shaun A. and Wills, Josh and Kinsey, Matt and Obrecht, R. F. and Tallaksen, Katharine and Burant, John C. and Wang, Lily and Gao, Lei and Gu, Zhiling and Kim, Myungjin and Li, Xinyi and Wang, Guannan and Wang, Yueying and Yu, Shan and Reiner, Robert C. and Barber, Ryan and Gaikedu, Emmanuela and Hay, Simon and Lim, Steve and Murray, Chris and Pigott, David and Prakash, B. 
Aditya and Adhikari, Bijaya and Cui, Jiaming and Rodr{\'i}guez, Alexander and Tabassum, Anika and Xie, Jiajia and Keskinocak, Pinar and Asplund, John and Baxter, Arden and Oruc, Buse Eylul and Serban, Nicoleta and Arik, Sercan O. and Dusenberry, Mike and Epshteyn, Arkady and Kanal, Elli and Le, Long T. and Li, Chun-Liang and Pfister, Tomas and Sava, Dario and Sinha, Rajarishi and Tsai, Thomas and Yoder, Nate and Yoon, Jinsung and Zhang, Leyou and Abbott, Sam and Bosse, Nikos I. and Funk, Sebastian and Hellewel, Joel and Meakin, Sophie R. and Munday, James D. and Sherratt, Katherine and Zhou, Mingyuan and Kalantari, Rahi and Yamana, Teresa K. and Pei, Sen and Shaman, Jeffrey and Ayer, Turgay and Adee, Madeline and Chhatwal, Jagpreet and Dalgic, Ozden O. and Ladd, Mary A. and Linas, Benjamin P. and Mueller, Peter and Xiao, Jade and Li, Michael L. and Bertsimas, Dimitris and Lami, Omar Skali and Soni, Saksham and Bouardi, Hamza Tazi and Wang, Yuanjia and Wang, Qinxia and Xie, Shanghong and Zeng, Donglin and Green, Alden and Bien, Jacob and Hu, Addison J. and Jahja, Maria and Narasimhan, Balasubramanian and Rajanala, Samyak and Rumack, Aaron and Simon, Noah and Tibshirani, Ryan and Tibshirani, Rob and Ventura, Valerie and Wasserman, Larry and O'Dea, Eamon B. and Drake, John M. and Pagano, Robert and Walker, Jo W. and Slayton, Rachel B. and Johansson, Michael and Biggerstaff, Matthew and Reich, Nicholas G.}, + year = {2021}, + month = feb, + journal = {medRxiv}, + pages = {2021.02.03.21250974}, + publisher = {{Cold Spring Harbor Laboratory Press}}, + doi = {10.1101/2021.02.03.21250974}, + abstract = {{$<$}h3{$>$}Abstract{$<$}/h3{$>$} {$<$}p{$>$}Short-term probabilistic forecasts of the trajectory of the COVID-19 pandemic in the United States have served as a visible and important communication channel between the scientific modeling community and both the general public and decision-makers. Forecasting models provide specific, quantitative, and evaluable predictions that inform short-term decisions such as healthcare staffing needs, school closures, and allocation of medical supplies. In 2020, the COVID-19 Forecast Hub (https://covid19forecasthub.org/) collected, disseminated, and synthesized hundreds of thousands of specific predictions from more than 50 different academic, industry, and independent research groups. This manuscript systematically evaluates 23 models that regularly submitted forecasts of reported weekly incident COVID-19 mortality counts in the US at the state and national level. One of these models was a multi-model ensemble that combined all available forecasts each week. The performance of individual models showed high variability across time, geospatial units, and forecast horizons. Half of the models evaluated showed better accuracy than a na\"ive baseline model. In combining the forecasts from all teams, the ensemble showed the best overall probabilistic accuracy of any model. Forecast accuracy degraded as models made predictions farther into the future, with probabilistic accuracy at a 20-week horizon more than 5 times worse than when predicting at a 1-week horizon. This project underscores the role that collaboration and active coordination between governmental public health agencies, academic modeling teams, and industry partners can play in developing modern modeling capabilities to support local, state, and federal response to outbreaks.{$<$}/p{$>$}}, + copyright = {\textcopyright{} 2021, Posted by Cold Spring Harbor Laboratory. This article is a US Government work. 
It is not subject to copyright under 17 USC 105 and is also made available for use under a CC0 license}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/W82X9ZN5/Cramer et al. - 2021 - Evaluation of individual and ensemble probabilisti.pdf;/mnt/data/github-synced/zotero-nikos/storage/7MC6LGTC/2021.02.03.21250974v1.html} +} + +@article{dawidCoherentDispersionCriteria1999, + title = {Coherent Dispersion Criteria for Optimal Experimental Design}, + author = {Dawid, A. Philip and Sebastiani, Paola}, + year = {1999}, + month = mar, + journal = {The Annals of Statistics}, + volume = {27}, + number = {1}, + pages = {65--81}, + publisher = {{Institute of Mathematical Statistics}}, + issn = {0090-5364, 2168-8966}, + doi = {10.1214/aos/1018031101}, + abstract = {We characterize those coherent design criteria which depend only on the dispersion matrix (assumed proper and nonsingular) of the ``state of nature,'' which may be a parameter-vector or a set of future observables, and describe the associated decision problems. Connections are established with the classical approach to optimal design theory for the normal linear model, based on concave functions of the information matrix. Implications of the theory for more general models are also considered.}, + keywords = {62C10,62K05,Bayesian decision theory,Coherence,concavity,dispersion standard,optimal design,optimality criterion,proper scoring rule,uncertainty function}, + file = {/mnt/data/github-synced/zotero-nikos/storage/BLEK95JQ/Dawid and Sebastiani - 1999 - Coherent dispersion criteria for optimal experimen.pdf;/mnt/data/github-synced/zotero-nikos/storage/KIXNL5J2/1018031101.html} +} + +@article{dawidPresentPositionPotential1984, + title = {Present {{Position}} and {{Potential Developments}}: {{Some Personal Views Statistical Theory}} the {{Prequential Approach}}}, + shorttitle = {Present {{Position}} and {{Potential Developments}}}, + author = {Dawid, A. P.}, + year = {1984}, + journal = {Journal of the Royal Statistical Society: Series A (General)}, + volume = {147}, + number = {2}, + pages = {278--290}, + issn = {2397-2327}, + doi = {10.2307/2981683}, + abstract = {The prequential approach is founded on the premiss that the purpose of statistical inference is to make sequential probability forecasts for future observations, rather than to express information about parameters. Many traditional parametric concepts, such as consistency and efficiency, prove to have natural counterparts in this formulation, which sheds new light on these and suggests fruitful extensions.}, + copyright = {\textcopyright{} 1984 Royal Statistical Society}, + langid = {english}, + keywords = {consistency,efficiency,likelihood,prequential principle,probability forecasting}, + annotation = {\_eprint: https://rss.onlinelibrary.wiley.com/doi/pdf/10.2307/2981683}, + file = {/mnt/data/github-synced/zotero-nikos/storage/PX9RNJBW/Dawid - 1984 - Present Position and Potential Developments Some .pdf;/mnt/data/github-synced/zotero-nikos/storage/UXRWFPAE/2981683.html} +} + +@article{dawidProperLocalScoring2012, + title = {Proper Local Scoring Rules on Discrete Sample Spaces}, + author = {Dawid, A. 
Philip and Lauritzen, Steffen and Parry, Matthew}, + year = {2012}, + month = feb, + journal = {The Annals of Statistics}, + volume = {40}, + number = {1}, + eprint = {1104.2224}, + eprinttype = {arxiv}, + issn = {0090-5364}, + doi = {10.1214/12-AOS972}, + abstract = {A scoring rule is a loss function measuring the quality of a quoted probability distribution \$Q\$ for a random variable \$X\$, in the light of the realized outcome \$x\$ of \$X\$; it is proper if the expected score, under any distribution \$P\$ for \$X\$, is minimized by quoting \$Q=P\$. Using the fact that any differentiable proper scoring rule on a finite sample space \$\{\textbackslash mathcal\{X\}\}\$ is the gradient of a concave homogeneous function, we consider when such a rule can be local in the sense of depending only on the probabilities quoted for points in a nominated neighborhood of \$x\$. Under mild conditions, we characterize such a proper local scoring rule in terms of a collection of homogeneous functions on the cliques of an undirected graph on the space \$\{\textbackslash mathcal\{X\}\}\$. A useful property of such rules is that the quoted distribution \$Q\$ need only be known up to a scale factor. Examples of the use of such scoring rules include Besag's pseudo-likelihood and Hyv\textbackslash "\{a\}rinen's method of ratio matching.}, + archiveprefix = {arXiv}, + langid = {english}, + keywords = {Mathematics - Statistics Theory}, + file = {/mnt/data/github-synced/zotero-nikos/storage/M7CJGNKN/Dawid et al. - 2012 - Proper local scoring rules on discrete sample spac.pdf} +} + +@article{elliottForecastingEconomicsFinance2016, + title = {Forecasting in {{Economics}} and {{Finance}}}, + author = {Elliott, Graham and Timmermann, Allan}, + year = {2016}, + journal = {Annual Review of Economics}, + volume = {8}, + number = {1}, + pages = {81--110}, + doi = {10.1146/annurev-economics-080315-015346}, + abstract = {Practices used to address economic forecasting problems have undergone substantial changes over recent years. We review how such changes have influenced the ways in which a range of forecasting questions are being addressed. We also discuss the promises and challenges arising from access to big data. 
Finally, we review empirical evidence and experience accumulated from the use of forecasting methods to a range of economic and financial variables.}, + keywords = {big data,forecast evaluation,forecast models,model instability,model misspecification,parameter estimation,risk}, + annotation = {\_eprint: https://doi.org/10.1146/annurev-economics-080315-015346}, + file = {/mnt/data/github-synced/zotero-nikos/storage/3AXCLA59/Elliott and Timmermann - 2016 - Forecasting in Economics and Finance.pdf} +} + +@article{epsteinScoringSystemProbability1969, + title = {A {{Scoring System}} for {{Probability Forecasts}} of {{Ranked Categories}}}, + author = {Epstein, Edward S.}, + year = {1969}, + month = dec, + journal = {Journal of Applied Meteorology}, + volume = {8}, + number = {6}, + pages = {985--987}, + publisher = {{American Meteorological Society}}, + issn = {0021-8952}, + doi = {10.1175/1520-0450(1969)008<0985:ASSFPF>2.0.CO;2}, + langid = {english}, + keywords = {ranked probability score,RPS}, + file = {/mnt/data/github-synced/zotero-nikos/storage/XAVX39GC/Epstein - 1969 - A Scoring System for Probability Forecasts of Rank.pdf;/mnt/data/github-synced/zotero-nikos/storage/CVK2YPKP/A-Scoring-System-for-Probability-Forecasts-of.html} +} + +@misc{europeancovid-19forecasthubEuropeanCovid19Forecast2021, + title = {European {{Covid-19 Forecast Hub}}}, + author = {{European Covid-19 Forecast Hub}}, + year = {2021}, + howpublished = {https://covid19forecasthub.eu/}, + file = {/mnt/data/github-synced/zotero-nikos/storage/JRFUHRDI/covid19forecasthub.eu.html} +} + +@article{funkAssessingPerformanceRealtime2019, + title = {Assessing the Performance of Real-Time Epidemic Forecasts: {{A}} Case Study of {{Ebola}} in the {{Western Area}} Region of {{Sierra Leone}}, 2014-15}, + shorttitle = {Assessing the Performance of Real-Time Epidemic Forecasts}, + author = {Funk, Sebastian and Camacho, Anton and Kucharski, Adam J. and Lowe, Rachel and Eggo, Rosalind M. and Edmunds, W. John}, + year = {2019}, + month = feb, + journal = {PLOS Computational Biology}, + volume = {15}, + number = {2}, + pages = {e1006785}, + issn = {1553-7358}, + doi = {10.1371/journal.pcbi.1006785}, + abstract = {Real-time forecasts based on mathematical models can inform critical decision-making during infectious disease outbreaks. Yet, epidemic forecasts are rarely evaluated during or after the event, and there is little guidance on the best metrics for assessment. Here, we propose an evaluation approach that disentangles different components of forecasting ability using metrics that separately assess the calibration, sharpness and bias of forecasts. This makes it possible to assess not just how close a forecast was to reality but also how well uncertainty has been quantified. We used this approach to analyse the performance of weekly forecasts we generated in real time for Western Area, Sierra Leone, during the 2013\textendash 16 Ebola epidemic in West Africa. We investigated a range of forecast model variants based on the model fits generated at the time with a semi-mechanistic model, and found that good probabilistic calibration was achievable at short time horizons of one or two weeks ahead but model predictions were increasingly unreliable at longer forecasting horizons. This suggests that forecasts may have been of good enough quality to inform decision making based on predictions a few weeks ahead of time but not longer, reflecting the high level of uncertainty in the processes driving the trajectory of the epidemic. 
Comparing forecasts based on the semi-mechanistic model to simpler null models showed that the best semi-mechanistic model variant performed better than the null models with respect to probabilistic calibration, and that this would have been identified from the earliest stages of the outbreak. As forecasts become a routine part of the toolkit in public health, standards for evaluation of performance will be important for assessing quality and improving credibility of mathematical models, and for elucidating difficulties and trade-offs when aiming to make the most useful and reliable forecasts.}, + langid = {english}, + keywords = {Epidemiology,Forecasting,Infectious disease epidemiology,Infectious diseases,Mathematical models,Probability distribution,Public and occupational health,Sierra Leone}, + file = {/mnt/data/github-synced/zotero-nikos/storage/X6Z9PIFT/Funk et al. - 2019 - Assessing the performance of real-time epidemic fo.pdf;/mnt/data/github-synced/zotero-nikos/storage/JN28VVKF/article.html} +} + +@article{funkShorttermForecastsInform2020, + title = {Short-Term Forecasts to Inform the Response to the {{Covid-19}} Epidemic in the {{UK}}}, + author = {Funk, Sebastian and Abbott, Sam and Atkins, B. D. and Baguelin, M. and Baillie, J. K. and Birrell, P. and Blake, J. and Bosse, Nikos I. and Burton, J. and Carruthers, J. and Davies, N. G. and Angelis, D. De and Dyson, L. and Edmunds, W. J. and Eggo, R. M. and Ferguson, N. M. and Gaythorpe, K. and Gorsich, E. and {Guyver-Fletcher}, G. and Hellewell, J. and Hill, E. M. and Holmes, A. and House, T. A. and Jewell, C. and Jit, M. and Jombart, T. and Joshi, I. and Keeling, M. J. and Kendall, E. and Knock, E. S. and Kucharski, A. J. and Lythgoe, K. A. and Meakin, S. R. and Munday, J. D. and Openshaw, P. J. M. and Overton, C. E. and Pagani, F. and Pearson, J. and {Perez-Guzman}, P. N. and Pellis, L. and Scarabel, F. and Semple, M. G. and Sherratt, K. and Tang, M. and Tildesley, M. J. and Leeuwen, E. Van and Whittles, L. K. and Group, CMMID COVID-19 Working and Team, Imperial College COVID-19 Response and Investigators, Isaric4c}, + year = {2020}, + month = nov, + journal = {medRxiv}, + pages = {2020.11.11.20220962}, + publisher = {{Cold Spring Harbor Laboratory Press}}, + doi = {10.1101/2020.11.11.20220962}, + abstract = {{$<$}p{$>$}Background: Short-term forecasts of infectious disease can create situational awareness and inform planning for outbreak response. Here, we report on multi-model forecasts of Covid-19 in the UK that were generated at regular intervals starting at the end of March 2020, in order to monitor expected healthcare utilisation and population impacts in real time. Methods: We evaluated the performance of individual model forecasts generated between 24 March and 14 July 2020, using a variety of metrics including the weighted interval score as well as metrics that assess the calibration, sharpness, bias and absolute error of forecasts separately. We further combined the predictions from individual models to ensemble forecasts using a simple mean as well as a quantile regression average that aimed to maximise performance. We further compared model performance to a null model of no change. Results: In most cases, individual models performed better than the null model, and ensembles models were well calibrated and performed comparatively to the best individual models. The quantile regression average did not noticeably outperform the mean ensemble. 
Conclusions: Ensembles of multi-model forecasts can inform the policy response to the Covid-19 pandemic by assessing future resource needs and expected population impact of morbidity and mortality.{$<$}/p{$>$}}, + copyright = {\textcopyright{} 2020, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution 4.0 International), CC BY 4.0, as described at http://creativecommons.org/licenses/by/4.0/}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/9RK57885/Funk et al. - 2020 - Short-term forecasts to inform the response to the.pdf;/mnt/data/github-synced/zotero-nikos/storage/AKDY6PAQ/2020.11.11.20220962v1.full.html} +} + +@article{gelmanUnderstandingPredictiveInformation2014, + title = {Understanding Predictive Information Criteria for {{Bayesian}} Models}, + author = {Gelman, Andrew and Hwang, Jessica and Vehtari, Aki}, + year = {2014}, + month = nov, + journal = {Statistics and Computing}, + volume = {24}, + number = {6}, + pages = {997--1016}, + issn = {1573-1375}, + doi = {10.1007/s11222-013-9416-2}, + abstract = {We review the Akaike, deviance, and Watanabe-Akaike information criteria from a Bayesian perspective, where the goal is to estimate expected out-of-sample-prediction error using a bias-corrected adjustment of within-sample error. We focus on the choices involved in setting up these measures, and we compare them in three simple examples, one theoretical and two applied. The contribution of this paper is to put all these information criteria into a Bayesian predictive context and to better understand, through small examples, how these methods can apply in practice.}, + langid = {english}, + keywords = {AIC,Bayes,Cross-validation,DIC,Prediction,WAIC}, + file = {/mnt/data/github-synced/zotero-nikos/storage/B2VVZDAP/Gelman et al. - 2014 - Understanding predictive information criteria for .pdf} +} + +@article{gneitingProbabilisticForecastsCalibration2007, + title = {Probabilistic Forecasts, Calibration and Sharpness}, + author = {Gneiting, Tilmann and Balabdaoui, Fadoua and Raftery, Adrian E.}, + year = {2007}, + journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, + volume = {69}, + number = {2}, + pages = {243--268}, + issn = {1467-9868}, + doi = {10.1111/j.1467-9868.2007.00587.x}, + abstract = {Summary. Probabilistic forecasts of continuous variables take the form of predictive densities or predictive cumulative distribution functions. We propose a diagnostic approach to the evaluation of predictive performance that is based on the paradigm of maximizing the sharpness of the predictive distributions subject to calibration. Calibration refers to the statistical consistency between the distributional forecasts and the observations and is a joint property of the predictions and the events that materialize. Sharpness refers to the concentration of the predictive distributions and is a property of the forecasts only. A simple theoretical framework allows us to distinguish between probabilistic calibration, exceedance calibration and marginal calibration. We propose and study tools for checking calibration and sharpness, among them the probability integral transform histogram, marginal calibration plots, the sharpness diagram and proper scoring rules. The diagnostic approach is illustrated by an assessment and ranking of probabilistic forecasts of wind speed at the Stateline wind energy centre in the US Pacific Northwest. 
In combination with cross-validation or in the time series context, our proposal provides very general, nonparametric alternatives to the use of information criteria for model diagnostics and model selection.}, + langid = {english}, + keywords = {Cross-validation,Density forecast,Ensemble prediction system,Ex post evaluation,Forecast verification,Model diagnostics,Posterior predictive assessment,Predictive distribution,Prequential principle,Probability integral transform,Proper scoring rule}, + file = {/mnt/data/github-synced/zotero-nikos/storage/BUWD6CGT/Gneiting et al. - 2007 - Probabilistic forecasts, calibration and sharpness.pdf;/mnt/data/github-synced/zotero-nikos/storage/EUCMSBKN/j.1467-9868.2007.00587.html} +} + +@article{gneitingStrictlyProperScoring2007, + title = {Strictly {{Proper Scoring Rules}}, {{Prediction}}, and {{Estimation}}}, + author = {Gneiting, Tilmann and Raftery, Adrian E}, + year = {2007}, + month = mar, + journal = {Journal of the American Statistical Association}, + volume = {102}, + number = {477}, + pages = {359--378}, + issn = {0162-1459, 1537-274X}, + doi = {10.1198/016214506000001437}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/P599P5ZY/Gneiting and Raftery - 2007 - Strictly Proper Scoring Rules, Prediction, and Est.pdf} +} + +@article{gneitingWeatherForecastingEnsemble2005, + title = {Weather {{Forecasting}} with {{Ensemble Methods}}}, + author = {Gneiting, Tilmann and Raftery, Adrian E.}, + year = {2005}, + month = oct, + journal = {Science}, + volume = {310}, + number = {5746}, + pages = {248--249}, + publisher = {{American Association for the Advancement of Science}}, + issn = {0036-8075, 1095-9203}, + doi = {10.1126/science.1115255}, + abstract = {{$<$}p{$>$} Traditional weather forecasting has been built on a foundation of deterministic modeling--start with initial conditions, put them into a supercomputer model, and end up with a prediction about future weather. But as Gneiting and Raftery discuss in their Perspective, a new approach--ensemble forecasting--was introduced in the early 1990s. In this method, up to 100 different computer runs, each with slightly different starting conditions or model assumptions, are combined into a weather forecast. In concert with statistical techniques, ensembles can provide accurate statements about the uncertainty in daily and seasonal forecasting. The challenge now is to improve the modeling, statistical analysis, and visualization technologies for disseminating the ensemble results. {$<$}/p{$>$}}, + chapter = {Perspective}, + copyright = {\textcopyright{} 2005 American Association for the Advancement of Science}, + langid = {english}, + pmid = {16224011}, + file = {/mnt/data/github-synced/zotero-nikos/storage/VRJMN77J/Gneiting and Raftery - 2005 - Weather Forecasting with Ensemble Methods.pdf;/mnt/data/github-synced/zotero-nikos/storage/8Q5UA2FU/248.html} +} + +@article{goodRationalDecisions1952, + title = {Rational {{Decisions}}}, + author = {Good, I. J.}, + year = {1952}, + journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, + volume = {14}, + number = {1}, + pages = {107--114}, + publisher = {{[Royal Statistical Society, Wiley]}}, + issn = {0035-9246}, + abstract = {This paper deals first with the relationship between the theory of probability and the theory of rational behaviour. A method is then suggested for encouraging people to make accurate probability estimates, a connection with the theory of information being mentioned. 
Finally Wald's theory of statistical decision functions is summarised and generalised and its relation to the theory of rational behaviour is discussed.}, + keywords = {Log Score,LogS}, + file = {/mnt/data/github-synced/zotero-nikos/storage/23458422/2020 - Rational Decisions.pdf} +} + +@article{hamillInterpretationRankHistograms2001a, + title = {Interpretation of {{Rank Histograms}} for {{Verifying Ensemble Forecasts}}}, + author = {Hamill, Thomas M.}, + year = {2001}, + month = mar, + journal = {Monthly Weather Review}, + volume = {129}, + number = {3}, + pages = {550--560}, + publisher = {{American Meteorological Society}}, + issn = {1520-0493, 0027-0644}, + doi = {10.1175/1520-0493(2001)129<0550:IORHFV>2.0.CO;2}, + abstract = {Abstract Rank histograms are a tool for evaluating ensemble forecasts. They are useful for determining the reliability of ensemble forecasts and for diagnosing errors in its mean and spread. Rank histograms are generated by repeatedly tallying the rank of the verification (usually an observation) relative to values from an ensemble sorted from lowest to highest. However, an uncritical use of the rank histogram can lead to misinterpretations of the qualities of that ensemble. For example, a flat rank histogram, usually taken as a sign of reliability, can still be generated from unreliable ensembles. Similarly, a U-shaped rank histogram, commonly understood as indicating a lack of variability in the ensemble, can also be a sign of conditional bias. It is also shown that flat rank histograms can be generated for some model variables if the variance of the ensemble is correctly specified, yet if covariances between model grid points are improperly specified, rank histograms for combinations of model variables may not be flat. Further, if imperfect observations are used for verification, the observational errors should be accounted for, otherwise the shape of the rank histogram may mislead the user about the characteristics of the ensemble. If a statistical hypothesis test is to be performed to determine whether the differences from uniformity of rank are statistically significant, then samples used to populate the rank histogram must be located far enough away from each other in time and space to be considered independent.}, + chapter = {Monthly Weather Review}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/FJYU9QZH/Hamill - 2001 - Interpretation of Rank Histograms for Verifying En.pdf;/mnt/data/github-synced/zotero-nikos/storage/SH65U38N/1520-0493_2001_129_0550_iorhfv_2.0.co_2.html} +} + +@article{hersbachDecompositionContinuousRanked2000a, + title = {Decomposition of the {{Continuous Ranked Probability Score}} for {{Ensemble Prediction Systems}}}, + author = {Hersbach, Hans}, + year = {2000}, + month = oct, + journal = {Weather and Forecasting}, + volume = {15}, + number = {5}, + pages = {559--570}, + publisher = {{American Meteorological Society}}, + issn = {1520-0434, 0882-8156}, + doi = {10.1175/1520-0434(2000)015<0559:DOTCRP>2.0.CO;2}, + abstract = {Abstract Some time ago, the continuous ranked probability score (CRPS) was proposed as a new verification tool for (probabilistic) forecast systems. Its focus is on the entire permissible range of a certain (weather) parameter. The CRPS can be seen as a ranked probability score with an infinite number of classes, each of zero width. Alternatively, it can be interpreted as the integral of the Brier score over all possible threshold values for the parameter under consideration. 
For a deterministic forecast system the CRPS reduces to the mean absolute error. In this paper it is shown that for an ensemble prediction system the CRPS can be decomposed into a reliability part and a resolution/uncertainty part, in a way that is similar to the decomposition of the Brier score. The reliability part of the CRPS is closely connected to the rank histogram of the ensemble, while the resolution/uncertainty part can be related to the average spread within the ensemble and the behavior of its outliers. The usefulness of such a decomposition is illustrated for the ensemble prediction system running at the European Centre for Medium-Range Weather Forecasts. The evaluation of the CRPS and its decomposition proposed in this paper can be extended to systems issuing continuous probability forecasts, by realizing that these can be interpreted as the limit of ensemble forecasts with an infinite number of members.}, + chapter = {Weather and Forecasting}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/8C93QRYF/Hersbach - 2000 - Decomposition of the Continuous Ranked Probability.pdf;/mnt/data/github-synced/zotero-nikos/storage/7PARNA8T/1520-0434_2000_015_0559_dotcrp_2_0_co_2.html} +} + +@article{jordanEvaluatingProbabilisticForecasts2019, + title = {Evaluating {{Probabilistic Forecasts}} with {{{\textbf{scoringRules}}}}}, + author = {Jordan, Alexander and Kr{\"u}ger, Fabian and Lerch, Sebastian}, + year = {2019}, + journal = {Journal of Statistical Software}, + volume = {90}, + number = {12}, + issn = {1548-7660}, + doi = {10.18637/jss.v090.i12}, + abstract = {Probabilistic forecasts in the form of probability distributions over future events have become popular in several fields including meteorology, hydrology, economics, and demography. In typical applications, many alternative statistical models and data sources can be used to produce probabilistic forecasts. Hence, evaluating and selecting among competing methods is an important task. The scoringRules package for R provides functionality for comparative evaluation of probabilistic models based on proper scoring rules, covering a wide range of situations in applied work. This paper discusses implementation and usage details, presents case studies from meteorology and economics, and points to the relevant background literature.}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/DSYW6QUF/Jordan et al. - 2019 - Evaluating Probabilistic Forecasts with bscoring.pdf} +} + +@article{joseCharacterizationSphericalScoring2009, + title = {A {{Characterization}} for the {{Spherical Scoring Rule}}}, + author = {Jose, Victor Richmond}, + year = {2009}, + month = mar, + journal = {Theory and Decision}, + volume = {66}, + number = {3}, + pages = {263--281}, + issn = {1573-7187}, + doi = {10.1007/s11238-007-9067-x}, + abstract = {Strictly proper scoring rules have been studied widely in statistical decision theory and recently in experimental economics because of their ability to encourage assessors to honestly provide their true subjective probabilities. In this article, we study the spherical scoring rule by analytically examining some of its properties and providing some new geometric interpretations for this rule. Moreover, we state a theorem which provides an axiomatic characterization for the spherical scoring rule. 
The objective of this analysis is to provide a better understanding of one of the most commonly available scoring rules, which could aid decision makers in the selection of an appropriate tool for evaluating and assessing probabilistic forecasts.}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/TSSMRAB9/Jose - 2009 - A Characterization for the Spherical Scoring Rule.pdf} +} + +@article{kukkonenReviewOperationalRegionalscale2012, + title = {A Review of Operational, Regional-Scale, Chemical Weather Forecasting Models in {{Europe}}}, + author = {Kukkonen, J. and Olsson, T. and Schultz, D. M. and Baklanov, A. and Klein, T. and Miranda, A. I. and Monteiro, A. and Hirtl, M. and Tarvainen, V. and Boy, M. and Peuch, V.-H. and Poupkou, A. and Kioutsioukis, I. and Finardi, S. and Sofiev, M. and Sokhi, R. and Lehtinen, K. E. J. and Karatzas, K. and San Jos{\'e}, R. and Astitha, M. and Kallos, G. and Schaap, M. and Reimer, E. and Jakobs, H. and Eben, K.}, + year = {2012}, + month = jan, + journal = {Atmospheric Chemistry and Physics}, + volume = {12}, + number = {1}, + pages = {1--87}, + publisher = {{Copernicus GmbH}}, + issn = {1680-7316}, + doi = {10.5194/acp-12-1-2012}, + abstract = {{$<$}p{$><$}strong class="journal-contentHeaderColor"{$>$}Abstract.{$<$}/strong{$>$} Numerical models that combine weather forecasting and atmospheric chemistry are here referred to as chemical weather forecasting models. Eighteen operational chemical weather forecasting models on regional and continental scales in Europe are described and compared in this article. Topics discussed in this article include how weather forecasting and atmospheric chemistry models are integrated into chemical weather forecasting systems, how physical processes are incorporated into the models through parameterization schemes, how the model architecture affects the predicted variables, and how air chemistry and aerosol processes are formulated. In addition, we discuss sensitivity analysis and evaluation of the models, user operational requirements, such as model availability and documentation, and output availability and dissemination. In this manner, this article allows for the evaluation of the relative strengths and weaknesses of the various modelling systems and modelling approaches. Finally, this article highlights the most prominent gaps of knowledge for chemical weather forecasting models and suggests potential priorities for future research directions, for the following selected focus areas: emission inventories, the integration of numerical weather prediction and atmospheric chemical transport models, boundary conditions and nesting of models, data assimilation of the various chemical species, improved understanding and parameterization of physical processes, better evaluation of models against data and the construction of model ensembles.{$<$}/p{$>$}}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/X3N7D4HE/Kukkonen et al. 
- 2012 - A review of operational, regional-scale, chemical .pdf;/mnt/data/github-synced/zotero-nikos/storage/XWR2S6F8/2012.html} +} + +@article{liboschikTscountPackageAnalysis2017, + title = {Tscount: {{An R Package}} for {{Analysis}} of {{Count Time Series Following Generalized Linear Models}}}, + shorttitle = {Tscount}, + author = {Liboschik, Tobias and Fokianos, Konstantinos and Fried, Roland}, + year = {2017}, + month = nov, + journal = {Journal of Statistical Software}, + volume = {82}, + pages = {1--51}, + issn = {1548-7660}, + doi = {10.18637/jss.v082.i05}, + abstract = {The R package tscount provides likelihood-based estimation methods for analysis and modeling of count time series following generalized linear models. This is a flexible class of models which can describe serial correlation in a parsimonious way. The conditional mean of the process is linked to its past values, to past observations and to potential covariate effects. The package allows for models with the identity and with the logarithmic link function. The conditional distribution can be Poisson or negative binomial. An important special case of this class is the so-called INGARCH model and its log-linear extension. The package includes methods for model fitting and assessment, prediction and intervention analysis. This paper summarizes the theoretical background of these methods. It gives details on the implementation of the package and provides simulation results for models which have not been studied theoretically before. The usage of the package is illustrated by two data examples. Additionally, we provide a review of R packages which can be used for count time series analysis. This includes a detailed comparison of tscount to those packages.}, + copyright = {Copyright (c) 2017 Tobias Liboschik, Konstantinos Fokianos, Roland Fried}, + langid = {english}, + keywords = {serial correlation}, + file = {/mnt/data/github-synced/zotero-nikos/storage/NAKUKRYZ/Liboschik et al. - 2017 - tscount An R Package for Analysis of Count Time S.pdf} +} + +@article{macheteContrastingProbabilisticScoring2012, + title = {Contrasting {{Probabilistic Scoring Rules}}}, + author = {Machete, Reason Lesego}, + year = {2012}, + month = jul, + journal = {arXiv:1112.4530 [math, stat]}, + eprint = {1112.4530}, + eprinttype = {arxiv}, + primaryclass = {math, stat}, + abstract = {There are several scoring rules that one can choose from in order to score probabilistic forecasting models or estimate model parameters. Whilst it is generally agreed that proper scoring rules are preferable, there is no clear criterion for preferring one proper scoring rule above another. This manuscript compares and contrasts some commonly used proper scoring rules and provides guidance on scoring rule selection. In particular, it is shown that the logarithmic scoring rule prefers erring with more uncertainty, the spherical scoring rule prefers erring with lower uncertainty, whereas the other scoring rules are indifferent to either option.}, + archiveprefix = {arXiv}, + langid = {english}, + keywords = {62B10; 62C05; 62G05; 62G07; 62F99; 62P05; 62P12; 62P20,Mathematics - Statistics Theory}, + file = {/mnt/data/github-synced/zotero-nikos/storage/8FYPC3Y4/Machete - 2012 - Contrasting Probabilistic Scoring Rules.pdf} +} + +@article{mannTestWhetherOne1947, + title = {On a {{Test}} of {{Whether}} One of {{Two Random Variables}} Is {{Stochastically Larger}} than the {{Other}}}, + author = {Mann, H. B. and Whitney, D. 
R.}, + year = {1947}, + month = mar, + journal = {The Annals of Mathematical Statistics}, + volume = {18}, + number = {1}, + pages = {50--60}, + publisher = {{Institute of Mathematical Statistics}}, + issn = {0003-4851, 2168-8990}, + doi = {10.1214/aoms/1177730491}, + abstract = {Let \$x\$ and \$y\$ be two random variables with continuous cumulative distribution functions \$f\$ and \$g\$. A statistic \$U\$ depending on the relative ranks of the \$x\$'s and \$y\$'s is proposed for testing the hypothesis \$f = g\$. Wilcoxon proposed an equivalent test in the Biometrics Bulletin, December, 1945, but gave only a few points of the distribution of his statistic. Under the hypothesis \$f = g\$ the probability of obtaining a given \$U\$ in a sample of \$n x's\$ and \$m y's\$ is the solution of a certain recurrence relation involving \$n\$ and \$m\$. Using this recurrence relation tables have been computed giving the probability of \$U\$ for samples up to \$n = m = 8\$. At this point the distribution is almost normal. From the recurrence relation explicit expressions for the mean, variance, and fourth moment are obtained. The 2rth moment is shown to have a certain form which enabled us to prove that the limit distribution is normal if \$m, n\$ go to infinity in any arbitrary manner. The test is shown to be consistent with respect to the class of alternatives \$f(x) {$>$} g(x)\$ for every \$x\$.}, + file = {/mnt/data/github-synced/zotero-nikos/storage/YTWX67GQ/Mann and Whitney - 1947 - On a Test of Whether one of Two Random Variables i.pdf;/mnt/data/github-synced/zotero-nikos/storage/8DA4G3H8/1177730491.html} +} + +@article{mathesonScoringRulesContinuous1976, + title = {Scoring {{Rules}} for {{Continuous Probability Distributions}}}, + author = {Matheson, James E. and Winkler, Robert L.}, + year = {1976}, + month = jun, + journal = {Management Science}, + volume = {22}, + number = {10}, + pages = {1087--1096}, + publisher = {{INFORMS}}, + issn = {0025-1909}, + doi = {10.1287/mnsc.22.10.1087}, + abstract = {Personal, or subjective, probabilities are used as inputs to many inferential and decision-making models, and various procedures have been developed for the elicitation of such probabilities. Included among these elicitation procedures are scoring rules, which involve the computation of a score based on the assessor's stated probabilities and on the event that actually occurs. The development of scoring rules has, in general, been restricted to the elicitation of discrete probability distributions. 
In this paper, families of scoring rules for the elicitation of continuous probability distributions are developed and discussed.}, + file = {/mnt/data/github-synced/zotero-nikos/storage/SVJ7YPP7/Matheson and Winkler - 1976 - Scoring Rules for Continuous Probability Distribut.pdf;/mnt/data/github-synced/zotero-nikos/storage/H5CNZS4U/mnsc.22.10.html} +} + +@article{murphyNoteRankedProbability1971a, + title = {A {{Note}} on the {{Ranked Probability Score}}}, + author = {Murphy, Allan H.}, + year = {1971}, + month = feb, + journal = {Journal of Applied Meteorology and Climatology}, + volume = {10}, + number = {1}, + pages = {155--156}, + publisher = {{American Meteorological Society}}, + issn = {1520-0450}, + doi = {10.1175/1520-0450(1971)010<0155:ANOTRP>2.0.CO;2}, + abstract = {Abstract}, + chapter = {Journal of Applied Meteorology and Climatology}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/G4ZUDF6B/Murphy - 1971 - A Note on the Ranked Probability Score.pdf;/mnt/data/github-synced/zotero-nikos/storage/DA47KK7K/1520-0450_1971_010_0155_anotrp_2_0_co_2.html} +} + +@article{reichCollaborativeMultiyearMultimodel2019, + title = {A Collaborative Multiyear, Multimodel Assessment of Seasonal Influenza Forecasting in the {{United States}}}, + author = {Reich, Nicholas G. and Brooks, Logan C. and Fox, Spencer J. and Kandula, Sasikiran and McGowan, Craig J. and Moore, Evan and Osthus, Dave and Ray, Evan L. and Tushar, Abhinav and Yamana, Teresa K. and Biggerstaff, Matthew and Johansson, Michael A. and Rosenfeld, Roni and Shaman, Jeffrey}, + year = {2019}, + month = feb, + journal = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {8}, + pages = {3146--3154}, + publisher = {{National Academy of Sciences}}, + issn = {0027-8424, 1091-6490}, + doi = {10.1073/pnas.1812594116}, + abstract = {Influenza infects an estimated 9\textendash 35 million individuals each year in the United States and is a contributing cause for between 12,000 and 56,000 deaths annually. Seasonal outbreaks of influenza are common in temperate regions of the world, with highest incidence typically occurring in colder and drier months of the year. Real-time forecasts of influenza transmission can inform public health response to outbreaks. We present the results of a multiinstitution collaborative effort to standardize the collection and evaluation of forecasting models for influenza in the United States for the 2010/2011 through 2016/2017 influenza seasons. For these seven seasons, we assembled weekly real-time forecasts of seven targets of public health interest from 22 different models. We compared forecast accuracy of each model relative to a historical baseline seasonal average. Across all regions of the United States, over half of the models showed consistently better performance than the historical baseline when forecasting incidence of influenza-like illness 1 wk, 2 wk, and 3 wk ahead of available data and when forecasting the timing and magnitude of the seasonal peak. In some regions, delays in data reporting were strongly and negatively associated with forecast accuracy. More timely reporting and an improved overall accessibility to novel and traditional data sources are needed to improve forecasting accuracy and its integration with real-time public health decision making.}, + chapter = {PNAS Plus}, + copyright = {Copyright \textcopyright{} 2019 the Author(s). Published by PNAS.. 
https://creativecommons.org/licenses/by-nc-nd/4.0/This open access article is distributed under Creative Commons Attribution-NonCommercial-NoDerivatives License 4.0 (CC BY-NC-ND).}, + langid = {english}, + pmid = {30647115}, + keywords = {forecasting,infectious disease,influenza,public health,statistics}, + file = {/mnt/data/github-synced/zotero-nikos/storage/XEKLR37W/Reich et al. - 2019 - A collaborative multiyear, multimodel assessment o.pdf;/mnt/data/github-synced/zotero-nikos/storage/QID3PLW4/3146.html} +} + +@article{thoreyOnlineLearningContinuous2017a, + title = {Online Learning with the {{Continuous Ranked Probability Score}} for Ensemble Forecasting}, + author = {Thorey, J. and Mallet, V. and Baudin, P.}, + year = {2017}, + month = jan, + journal = {Quarterly Journal of the Royal Meteorological Society}, + volume = {143}, + number = {702}, + pages = {521--529}, + issn = {0035-9009, 1477-870X}, + doi = {10.1002/qj.2940}, + abstract = {Ensemble forecasting resorts to multiple individual forecasts to produce a discrete probability distribution which accurately represents the uncertainties. Before every forecast, a weighted empirical distribution function is derived from the ensemble, so as to minimize the Continuous Ranked Probability Score (CRPS). We apply online learning techniques, which have previously been used for deterministic forecasting, and we adapt them for the minimization of the CRPS. The proposed method theoretically guarantees that the aggregated forecast competes, in terms of CRPS, against the best weighted empirical distribution function with weights constant in time. This is illustrated on synthetic data. Besides, our study improves the knowledge of the CRPS expectation for model mixtures. We generalize results on the bias of the CRPS computed with ensemble forecasts, and propose a new scheme to achieve fair CRPS minimization, without any assumption on the distributions.}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/R96EA9GW/Thorey et al. - 2017 - Online learning with the Continuous Ranked Probabi.pdf} +} + +@article{timmermannForecastingMethodsFinance2018, + title = {Forecasting {{Methods}} in {{Finance}}}, + author = {Timmermann, Allan}, + year = {2018}, + journal = {Annual Review of Financial Economics}, + volume = {10}, + number = {1}, + pages = {449--479}, + doi = {10.1146/annurev-financial-110217-022713}, + abstract = {Our review highlights some of the key challenges in financial forecasting problems and opportunities arising from the unique features of financial data. We analyze the difficulty of establishing predictability in an environment with a low signal-to-noise ratio, persistent predictors, and instability in predictive relations arising from competitive pressures and investors' learning. We discuss approaches for forecasting the mean, variance, and probability distribution of asset returns. 
Finally, we discuss how to evaluate financial forecasts while accounting for the possibility that numerous forecasting models may have been considered, leading to concerns of data mining.}, + annotation = {\_eprint: https://doi.org/10.1146/annurev-financial-110217-022713} +} + +@article{timmermannForecastingMethodsFinance2018a, + title = {Forecasting {{Methods}} in {{Finance}}}, + author = {Timmermann, Allan}, + year = {2018}, + month = nov, + journal = {Annual Review of Financial Economics}, + volume = {10}, + number = {1}, + pages = {449--479}, + issn = {1941-1367, 1941-1375}, + doi = {10.1146/annurev-financial-110217-022713}, + abstract = {Our review highlights some of the key challenges in financial forecasting problems and opportunities arising from the unique features of financial data. We analyze the difficulty of establishing predictability in an environment with a low signal-to-noise ratio, persistent predictors, and instability in predictive relations arising from competitive pressures and investors' learning. We discuss approaches for forecasting the mean, variance, and probability distribution of asset returns. Finally, we discuss how to evaluate financial forecasts while accounting for the possibility that numerous forecasting models may have been considered, leading to concerns of data mining.}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/9FHI5V4F/Timmermann - 2018 - Forecasting Methods in Finance.pdf} +} + +@article{winklerScoringRulesEvaluation1996, + title = {Scoring Rules and the Evaluation of Probabilities}, + author = {Winkler, R. L. and Mu{\~n}oz, Javier and Cervera, Jos{\'e} L. and Bernardo, Jos{\'e} M. and Blattenberger, Gail and Kadane, Joseph B. and Lindley, Dennis V. and Murphy, Allan H. and Oliver, Robert M. and {R{\'i}os-Insua}, David}, + year = {1996}, + month = jun, + journal = {Test}, + volume = {5}, + number = {1}, + pages = {1--60}, + issn = {1863-8260}, + doi = {10.1007/BF02562681}, + abstract = {In Bayesian inference and decision analysis, inferences and predictions are inherently probabilistic in nature. Scoring rules, which involve the computation of a score based on probability forecasts and what actually occurs, can be used to evaluate probabilities and to provide appropriate incentives for ``good'' probabilities. This paper review scoring rules and some related measures for evaluating probabilities, including decompositions of scoring rules and attributes of ``goodness'' of probabilites, comparability of scores, and the design of scoring rules for specific inferential and decision-making problems}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/VHTQR6BK/Winkler et al. 
- 1996 - Scoring rules and the evaluation of probabilities.pdf} } + + diff --git a/inst/metrics-overview/forecast-types.Rda b/inst/metrics-overview/forecast-types.Rda new file mode 100644 index 000000000..3ff41d540 Binary files /dev/null and b/inst/metrics-overview/forecast-types.Rda differ diff --git a/inst/metrics-overview/metrics-detailed.Rda b/inst/metrics-overview/metrics-detailed.Rda new file mode 100644 index 000000000..c118804d6 Binary files /dev/null and b/inst/metrics-overview/metrics-detailed.Rda differ diff --git a/inst/metrics-overview/metrics-summary.Rda b/inst/metrics-overview/metrics-summary.Rda new file mode 100644 index 000000000..ef3da650f Binary files /dev/null and b/inst/metrics-overview/metrics-summary.Rda differ diff --git a/inst/scoringutils-todos.Rmd b/inst/scoringutils-todos.Rmd new file mode 100644 index 000000000..e8a243b3c --- /dev/null +++ b/inst/scoringutils-todos.Rmd @@ -0,0 +1,132 @@ +--- +title: "scoringutils - to dos and open questions" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +# Documentation for `score()` function + +- there is currently no documentation for the metrics returned (maybe a list of possible metrics could also be included in `check_forecasts()`). Generally, that information could sit in the documentation for the main function, the subfunctions, or in a data.frame that is provided as package data (maybe not ideal?). +- there is documentation for `score_binary()`, `score_quantile()` and `score_sample()`, but it inherits the data description for all of them, which is maybe slightly confusing. + +--- + +# Unit tests needed + +- for example checking how unnecessary arguments are handled with regards to the `...` arguments +- handling of point forecasts in data.frames, i.e. forecasts where quantile is `NA` (also that option is poorly documented) +- input checks + +--- + +# Input checks + +Check functions need to be applied consistently everywhere. Also, maybe the check functions could be improved. + +--- + +# Argument names and order + +Check that all function arguments are named and ordered consistently. + +--- + +# scoringutils.R help file + +I find the scoringutils.R file useful, as you can get an overview by calling `?scoringutils`. However, populating that file manually is quite annoying. Is there a way to auto-populate it? Or maybe just link to the website with the pkgdown yaml overview? + +--- + +# Check overall documentation is ok + +--- + +# Names of scoringRules wrapper functions + +Currently functions are called `dss`, `crps`, `logs`. Maybe it would be more consistent to call them `dss_sample`, `crps_sample`, `logs_sample`, as all other lower-level scoring functions also have that naming convention to show where they are applicable. Also there is a `logs_binary` function. + +--- + +# Check function naming is consistent + +--- + +# Relative skill with and without baseline + +The relative skill is different when there is a baseline (not only the scaled relative skill), because the baseline is not taken into account when taking the geometric mean. Is this what we want? Maybe discuss again with Johannes? + +--- + +# Behaviour of `score()` and `summarise_scores()` + +Are we happy with what is summarised and what is not summarised? E.g. partial summarising over samples for sample-based forecasts, but nothing for quantiles? Are we also happy with the default `by = NULL` in `summarise_scores()`, where `NULL` means to summarise over quantiles for quantile-based predictions?
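For reference, a minimal sketch of the behaviour under discussion, using the bundled example data (this assumes the `score()`/`summarise_scores()` interface as used elsewhere in this pull request and is only meant as an illustration):

```{r}
library(scoringutils)
library(magrittr) # pipe operator

# quantile-based forecasts: the default `by = NULL` summarises over quantiles,
# leaving one row of scores per forecast unit
score(example_quantile) %>%
  summarise_scores()

# sample-based forecasts: scores are already partially summarised over samples;
# further aggregation needs an explicit `by`
score(example_continuous) %>%
  summarise_scores(by = c("model", "target_type"))
```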
+ +Do we want to have an additional `aggregate_by` or `aggregate_over` argument in `summarise_scores()` to save typing? + +--- + +# Name of 'range' argument + +Some functions accept a 'range' argument. Should we rename "range" to "interval_range" to make it clearer? It is currently called `interval_range` in the function `interval_score()`, but 'range' somewhere else. + +--- + +# Available metrics function and data set + +The function available_metrics() could maybe be a data set instead. In any case it is not yet done, and the data set would also need some updating. + +--- + +# Support for logging forecasts and data + +The `score()` function could have an argument for whether everything shall be scored on a log scale. Would have to think about the exact transformation (e.g. log(y + 1)?) and also what to do with negative forecasts. And binary questions. Maybe complicated. + + +--- + +# Renaming columns and metrics + +Could rename 'true_value' to 'truth' in `score()`. Could also rename the column 'interval_score' to 'wis'. Technically it may not always be weighted, as you can turn weighting off. But realistically nobody will ever do that, so we could just call it 'wis'. + +Would be a good idea to go over all metric names and see whether we are happy with them. The metric names currently are in inst/tables-metric-overview.R (or in inst/metrics-overview/metrics-summary.Rda). + + +--- + +# Rework `plot_predictions` function + +Maybe any filtering should be removed from the function? + +--- + +# Make colour for score tables account for facets + +Currently, the functions plot_score_table() and plot_heatmap() just compute the colour per data.frame. This makes it hard to use facetting. + +```{r} +library(scoringutils) +library(ggplot2) +library(magrittr) +score(example_continuous) %>% + summarise_scores(by = c("model", "location", "target_type")) %>% + plot_heatmap(x = "location", metric = "bias") + + facet_wrap(~ target_type) +``` + + +--- + +# Should the function bias_range() be exported? + +I'm using it within score_quantile() and it used to be the only bias function available. But now that there is a quantile version available we maybe shouldn't export it? + + +--- + +# Could add an argument `convert_to_quantiles` (or similar) to `score()` + +We have a function to calculate quantiles from samples. The reason we don't do that automatically is that it might be biased. Forcing users to convert manually to quantiles, e.g. if they want to compute coverage values, however, is clunky. If someone e.g. wants to have crps, but also coverage values, they would have to run `score()` twice with two different data sets and join the outputs (see the sketch at the end of this file). The reason we want manual action from the user is that estimating quantiles based on samples may be biased if the number of samples is small. But maybe having an extra argument suffices? We could also print out a message / warning when the user does that (or use the lifecycle package and print out a warning every 8 hours...) + +# Make plotting theme for plot functions consistent + +Do we want to have theme_minimal()? No theme?
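As referenced above, here is a rough sketch of the current two-pass workflow that a `convert_to_quantiles` argument could replace. It assumes the existing helper for converting samples to quantiles is `sample_to_quantile()` and uses an arbitrary set of quantile levels; treat it as an illustration rather than a recommended pattern:

```{r}
library(scoringutils)
library(magrittr) # pipe operator

# first pass: score the sample-based forecasts directly (gives e.g. crps)
sample_scores <- score(example_continuous) %>%
  summarise_scores(by = "model")

# second pass: convert samples to quantiles and score again (gives e.g. coverage)
# `sample_to_quantile()` and the chosen quantile levels are assumptions here
quantile_scores <- sample_to_quantile(
  example_continuous,
  quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95)
) %>%
  score() %>%
  summarise_scores(by = "model")

# joining the two outputs manually is the clunky step discussed above
merge(sample_scores, quantile_scores,
      by = "model", suffixes = c("_sample", "_quantile"))
```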
diff --git a/inst/tables-metric-overview.R b/inst/tables-metric-overview.R new file mode 100644 index 000000000..656150df3 --- /dev/null +++ b/inst/tables-metric-overview.R @@ -0,0 +1,432 @@ +#------------------------------------------------------------------------------# +#------------------ Overview of the different forecast types ------------------# +#------------------------------------------------------------------------------# +library(data.table) +point_forecast <- list( + `Forecast type` = c("Point forecast"), + `Target type` = c("continuous \n discrete \n binary"), + `Representation of the predictive distribution` = c("one single number for the predicted outcome") +) + + +probabilistic_forecast <- list( + `Forecast type` = c("Probabilistic forecast", "Probabilistic forecast"), + `Target type` = c("continuous \n discrete", + "binary"), + `Representation of the predictive distribution` = c( + "predictive samples \n quantiles \n closed analytical form", + "binary probabilities" + ) +) + +data <- rbind(as.data.table(point_forecast), + as.data.table(probabilistic_forecast)) + +saveRDS(data, "inst/metrics-overview/forecast-types.Rda") + +#------------------------------------------------------------------------------# +#----------------- Overview with applicability of the metrics -----------------# +#------------------------------------------------------------------------------# + +ae <- list( + `Metric` = "Absolute error", + `Name` = list("ae_point", "ae_median"), + `Functions` = r"(score(), ae_point(), ae_median_sample())", + `D` = r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "Suitable for scoring the median of a predictive distribution", + `References` = "" +) + +se <- list( + `Metric` = "Squared error", + `Name` = list("se_point", "se_mean"), + `Functions` = r"(score(), se_point(), se_mean_sample())", + `D` = r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "Suitable for scoring the mean of a predictive distribution.", + `References` = "" +) + + +crps <- list( + `Metric` = "(Continuous) ranked probability score (CRPS)", + `Name` = r"(crps)", + `Functions` = r"(score(), crps_sample())", + `D` = r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($-$)", + `Properties` = "Proper scoring rule (smaller is better), takes entire predictive distribution into account (global), penalises over- and under-confidence similarly, stable handling of outliers", + `References` = "" +) + +log_score <- list( + `Metric` = "Log score", + `Name` = r"(log_score)", + `Functions` = r"(score(), logs_sample(), logs_binary())", + `D` = r"($-$)", + `C` = r"($\checkmark$)", + `B` = r"($\checkmark$)", + `Q` = r"($-$)", + `Properties` = "Proper scoring rule, smaller is better, only evaluates predictive density at observed value (local), penalises over-confidence severely, susceptible to outliers", + `References` = "" +) + +wis <- list( + Metric = "(Weighted) interval score (WIS)", + `Name` = r"(interval_score)", + `Functions` = r"(score(), interval_score())", + `D` = r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "Proper scoring rule, smaller is better, similar properties to CRPS and converges to CRPS for an increasing number of equally spaced intervals", + `References` = "" +) + +dss <- list( + `Metric` = "Dawid-Sebastiani score (DSS)", + `Name` = r"(dss)", + `Functions` = r"(score(), dss_sample())", + `D` = 
r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($-$)", + `Properties` = "Proper scoring rule, smaller is better, evaluates forecast based on mean and sd of predictive distribution (global), susceptible to outliers, penalises over-confidence severely", + `References` = "" +) + +brier_score <- list( + `Metric` = "Brier score (BS)", + `Name` = r"(brier_score)", + `Functions` = r"(score(), brier_score())", + `D` = r"($-$)", + `C` = r"($-$)", + `B` = r"($\checkmark$)", + `Q` = r"($-$)", + `Properties` = "Proper scoring rule, smaller is better, equals CRPS for binary outcomes, penalises over- and under-confidence similarly", + `References` = "" +) + +interval_coverage <- list( + `Metric` = "Interval coverage", + `Name` = r"(coverage)", + `Functions` = r"(score())", + `D` = r"($-$)", + `C` = r"($-$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "Proportion of observations falling inside a given central prediction interval (= 'empirical interval coverage'). Used to assess probabilistic calibration." , + `References` = "" +) + +coverage_deviation <- list( + `Metric` = "Coverage deviation", + `Name` = r"(coverage_deviation)", + `Functions` = r"(score())", + `D` = r"($-$)", + `C` = r"($-$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "Average difference between empirical and nominal interval coverage (coverage that should have been realised)", + `References` = "" +) + +quantile_coverage <- list( + `Metric` = "Quantile coverage", + `Name` = r"(quantile_coverage)", + `Functions` = r"(score())", + `D` = r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($-$)", + `Properties` = "Proportion of observations below a given quantile of the predictive CDF. Used to assess probabilistic calibration.", + `References` = "" +) + + +dispersion <- list( + `Metric` = "Dispersion", + `Name` = r"(dispersion)", + `Functions` = r"(score(), interval_score())", + `D` = r"($-$)", + `C` = r"($-$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "Dispersion component of WIS, measures width of predictive intervals.", + `References` = "" +) + +mad <- list( + `Metric` = "Median Absolute Deviation (Dispersion)", + `Name` = r"(mad)", + `Functions` = r"(score(), mad_sample())", + `D` = r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($-$)", + `Properties` = "Measure for dispersion of a forecast: median of the absolute deviations from the median", + `References` = "" +) + +bias <- list( + `Metric` = "Bias", + `Name` = r"(bias)", + `Functions` = r"(score(), bias_sample(), bias_quantile())", + `D` = r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "Measure of relative tendency to over- or under-predict (aspect of calibration), bounded between -1 and 1 (ideally 0)", + `References` = "" +) + +under_overprediction <- list( + `Metric` = "Under-, Over-prediction", + `Name` = list("underprediction", "overprediction"), + `Functions` = r"(score(), interval_score())", + `D` = r"($-$)", + `C` = r"($-$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "Absolute amount of over-or under-prediction (components of WIS)", + `References` = "" +) + +pit <- list( + `Metric` = "Probability integral transform (PIT)", + `Name` = r"(crps)", + `Functions` = r"(score(), pit())", + `D` = r"($\checkmark$)", + `C` = r"($\checkmark$)", + `B` = r"($-$)", + `Q` = r"($\checkmark$)", + `Properties` = "PIT transform is the CDF of the predictive distribution evaluated at 
the observed values. PIT values should be uniform. ", + `References` = "" +) + +mean_score_ratio <- list( + `Metric` = "Mean score ratio", + `Name` = r"(mean_scores_ratio)", + `Functions` = r"(pairwise_comparison())", + `D` = r"($\sim$)", + `C` = r"($\sim$)", + `B` = r"($\sim$)", + `Q` = r"($\sim$)", + `Properties` = "Compares performance of two models. Properties depend on the metric chosen for the comparison.", + `References` = "" +) + +relative_skill <- list( + `Metric` = "(Scaled) Relative skill", + `Name` = list("relative_skill", "scaled_rel_skill"), + `Functions` = r"(score(), pairwise_comparison())", + `D` = r"($\sim$)", + `C` = r"($\sim$)", + `B` = r"($\sim$)", + `Q` = r"($\sim$)", + `Properties` = "Ranks models based on pairwise comparisons, useful in the context of missing forecasts. Properties depend on the metric chosen for the comparison.", + `References` = "" +) + +data <- rbind(as.data.table(ae), + as.data.table(se), + as.data.table(crps), + as.data.table(log_score), + as.data.table(wis), + as.data.table(dss), + as.data.table(brier_score), + as.data.table(interval_coverage), + as.data.table(coverage_deviation), + as.data.table(quantile_coverage), + as.data.table(dispersion), + as.data.table(mad), + as.data.table(under_overprediction), + as.data.table(pit), + as.data.table(bias), + as.data.table(mean_score_ratio), + as.data.table(relative_skill)) + +data[, References := NULL] +setnames(data, old = c("Properties"), + new = c("Info")) + +metrics_summary <- data[, lapply(.SD, FUN = function(x) { + x <- gsub("$\\checkmark$", '+', x, fixed = TRUE) + x <- gsub("$-$", '-', x, fixed = TRUE) + x <- gsub("$\\sim$", '~', x, fixed = TRUE) + return(x) +})] +setnames(metrics_summary, old = c("D", "C", "B", "Q"), + new = c("Discrete", "Continuous", "Binary", "Quantile")) + +usethis::use_data(metrics_summary, overwrite = TRUE) + + +# save for manuscript +data[, c("Name", "Functions") := NULL] +saveRDS(unique(data), file = "inst/metrics-overview/metrics-summary.Rda") + + +#------------------------------------------------------------------------------# +#------------------ Detailed explanation of all the metrics -------------------# +#------------------------------------------------------------------------------# + +crps <- list( + `Metric` = "CRPS (Continuous) ranked probability score", + `Explanation` = r"(The crps is a proper scoring rule that generalises the absolute error to probabilistic forecasts. It measures the 'distance' of the predictive distribution to the observed data-generating distribution. The CRPS is given as + $$\text{CRPS}(F, y) = \int_{-\infty}^\infty \left( F(x) - 1(x \geq y) \right)^2 dx,$$ + where $y$ is the true observed value and $F$ the CDF of the predictive distribution. An alternative representation is often used: + $$ \text{CRPS}(F, y) = \mathbb{E}_{F} |X - y| - \frac{1}{2} \mathbb{E}_{F} |X - X'|,$$ where $X$ and $X'$ are independent realisations from the predictive distribution $F$ with finite first moment and $y$ is the true value. In this representation we can simply replace $X$ and $X'$ by samples and sum over all possible combinations to obtain the CRPS. + For integer-valued forecasts, the RPS is given as + $$ \text{RPS}(F, y) = \sum_{x = 0}^\infty (F(x) - 1(x \geq y))^2. $$ + + **Usage and caveats** Smaller values are better. The crps is a good choice for most practical purposes that involve decision making, as it takes the entire predictive distribution into account. 
If two forecasters assign the same probability to the true event $y$, then the forecaster who assigned high probability to events far away from $y$ will still get a worse score. The crps (in contrast to the log score) can at times be quite lenient towards extreme mispredictions. Also, due to its similarity to the absolute error, the level of scores depends a lot on the absolute value of what is predicted, which makes it hard to compare scores of forecasts for quantities that are orders of magnitude apart.)" +) + + +log_score <- list( + `Metric` = "Log score", + `Explanation` = r"(The Log score is a proper scoring rule that is simply computed as the log of the predictive density evaluated at the true observed value. It is given as + $$ \text{log score} = \log f(y), $$ + where $f$ is the predictive density function and $y$ is the true value. For integer-valued forecasts, the log score can be computed as + $$ \text{log score} = \log p_y, $$ + where $p_y$ is the probability assigned to the outcome $y$ by the forecast $F$. + + **Usage and caveats**: Larger values are better, but sometimes the sign is reversed. The log score is sensitive to outliers, as individual negative log score contributions quickly can become very large if the event falls in the tails of the predictive distribution, where $f(y)$ (or $p_y$) is close to zero. Whether or not that is desirable depends on the application. In scoringutils, the log score cannot be used for integer-valued forecasts, as the implementation requires a predictive density. In contrast to the crps, the log score is a local scoring rule: its value depends only on the probability that was assigned to the actual outcome. This property may be desirable for inferential purposes, for example in a Bayesian context (Winkler et al., 1996). In settings where forecasts inform decision making, it may be more appropriate to score forecasts based on the entire predictive distribution.)" +) + +wis <- list( + Metric = "WIS (Weighted) interval score", + `Explanation` = r"(The (weighted) interval score is a proper scoring rule for quantile forecasts that converges to the crps for an increasing number of intervals. The score can be decomposed into a sharpness (uncertainty) component and penalties for over- and underprediction. For a single interval, the score is computed as + $$IS_\alpha(F,y) = (u-l) + \frac{2}{\alpha} \cdot (l-y) \cdot 1(y \leq l) + \frac{2}{\alpha} \cdot (y-u) \cdot 1(y \geq u), $$ + where $1()$ is the indicator function, $y$ is the true value, and $l$ and $u$ are the $\frac{\alpha}{2}$ and $1 - \frac{\alpha}{2}$ quantiles of the predictive distribution $F$, i.e. the lower and upper bound of a single prediction interval. For a set of $K$ prediction intervals and the median $m$, the score is computed as a weighted sum, + $$WIS = \frac{1}{K + 0.5} \cdot (w_0 \cdot |y - m| + \sum_{k = 1}^{K} w_k \cdot IS_{\alpha_k}(F, y)),$$ + where $w_k$ is a weight for every interval. Usually, $w_k = \frac{\alpha_k}{2}$ and $w_0 = 0.5$. + + **Usage and caveats**: + Smaller scores are better. Applicable to all quantile forecasts, takes the entire predictive distribution into account. Just like the crps, the wis is based on measures of absolute error. When averaging across multiple targets, it will therefore be dominated by targets with higher absolute values. The decomposition into sharpness, over- and underprediction makes it easy to interpret scores and use them for model improvement. 
)" +) + +quantile_score <- "yet to come" + + +dss <- list( + `Metric` = "DSS Dawid-Sebastiani score", + `Explanation` = r"(The Dawid-Sebastiani-Score is a proper scoring rule proposed that only relies on the first moments of the predictive distribution and is therefore easy to compute. It is given as + + $$\text{dss}(F, y) = \left( \frac{y - \mu}{\sigma} \right)^2 + 2 \cdot \log \sigma,$$ + where $F$ is the predictive distribution with mean $\mu$ and standard deviation $\sigma$ and $y$ is the true observed value. + + **Usage and caveats** The dss is applicable to continuous and integer forecasts and easy to compute. Apart from the ease of computation we see little advantage in using it over other scores.)" +) + +brier_score <- list( + `Metric` = "Brier score", + `Explanation` = r"(Proper scoring rule for binary forecasts. The Brier score is computed as + $$\text{Brier Score} = \frac{1}{N} \sum_{n = 1}^{N} (f_n - y_n),$$ + where $f_n$, with $n = 1, \dots, N$ are the predicted probablities that the corresponding events, $y_n \in (0, 1)$ will be equal to one.) + + **Usage**: Applicable to all binary forecasts.)" +) + +interval_coverage <- list( + `Metric` = "Interval coverage", + `Explanation` = r"(Interval coverage measures the proportion of observed values that fall in a given prediction interval range. Interval coverage for a single prediction interval range can be calculated as $$IC_{\alpha} = \text{nominal coverage} - \text{empirical coverage},$$ + where nominal coverage is $1 - \alpha$ and empirical coverage is the proportion of true values actually covered by all $1 - \alpha$ prediction intervals. + + To summarise interval coverage over different over multiple interval ranges, we can compute coverage deviation defined as the mean interval coverage over all $K$ interval ranges $\alpha_k$ with $k = 1, \dots, K$: + $$\text{Coverage deviation} = \frac{1}{K} \sum_{k = 1}^{K} \text{IC}_{\alpha_k}$$ + + **Usage**: Interval coverage for a set of chosen intervals, (e.g. 50\% and 90\%) gives a good indication of marginal calibration and is easy to interpret. Reporting coverage deviation has the advantage of summarising calibration in a single number, but loses some of the nuance.)" +) + +quantile_coverage <- list( + `Metric` = "Quantile coverage", + `Explanation` = r"(Quantile coverage for a given quantile level is the proportion of true values smaller than the predictions corresponding to that quantile level. + + **Usage**: Quantile coverage is similar to interval coverage, but conveys more information. For example, it allows us to look at the 5\% and 95\% quantile separately, instead of jointly at the 90\% prediction interval). This helps to diagnose whether it is the upper or lower end of a prediction interval that is causing problems. Plots of quantile coverage are conceptually very similar to PIT histograms.)" +) + +sharpness <- list( + `Metric` = "Sharpness", + `Explanation` = r"(Sharpness is the ability to produce narrow forecasts and is a feature of the forecasts only and does not depend on the observations. Sharpness is therefore only of interest conditional on calibration: a very precise forecast is not useful if it is clearly wrong. + + As suggested by Funk et al. (2019), we measure sharpness for continuous and integer forecasts represented by predictive samples as the normalised median absolute deviation about the median (MADN) ), i.e. 
+ $$ S(F) = \frac{1}{0.675} \cdot \text{median}(|x - \text{median}(x)|), $$ + where $x$ is the vector of all predictive samples and $\frac{1}{0.675}$ is a normalising constant. If the predictive distribution $F$ is the CDF of a normal distribution, then sharpness will equal the standard deviation of $F$. + + For quantile forecasts we can directly use the sharpness component of the weighted interval score. Sharpness is then simply the weighted mean of the widths of the central prediction intervals.)" +) + +bias <- list( + `Metric` = "Bias", + `Explanation` = r"(Bias is a measure of the tendency of a forecaster to over- or underpredict. For continuous forecasts, bias is given as + $$B(F, y) = 1 - 2 \cdot (F (y)), $$ + where $F$ is the CDF of the predictive distribution and $y$ is the observed value. + + For integer-valued forecasts, bias can be calculated as + $$B(P, y) = 1 - (P(y) + P(y + 1)), $$ + where $P(y)$ is the cumulative probability assigned to all outcomes smaller than or equal to $y$. + + For quantile forecasts, bias can be calculated as the maximum percentile rank for which the prediction is smaller than $y$, if the true value is smaller than the median of the predictive distribution. If the true value is above the median of the predictive distribution, then bias is the minimum percentile rank for which the corresponding quantile is still larger than the true value. If the true value is exactly the median, bias is zero. For a large enough number of quantiles, the percentile rank will equal the proportion of predictive samples below the observed true value, and this metric coincides with the one for continuous forecasts. + + **Usage**: + In contrast to the over- and underprediction penalties of the interval score, bias is bounded between -1 and 1 and represents the tendency of forecasts to be biased rather than the absolute amount of over- and underprediction. It is therefore a more robust measurement, but harder to interpret. It largely depends on the application whether one is more interested in the tendency to be biased or in the absolute value of over- and underpredictions.)" +) + +pit <- list( + `Metric` = "Probability integral transform (PIT)", + `Explanation` = r"(The probability integral transform (PIT, Dawid 1984) represents a succinct way to visualise deviations between the predictive distribution $F$ and the true data-generating distribution $G$. The idea is to transform the observed values such that agreement between forecasts and data can then be examined by observing whether or not the transformed values follow a uniform distribution. The PIT is given by + $$u = F (y),$$ + where $u$ is the transformed variable and $F(y)$ is the predictive distribution $F$ evaluated at the true observed value $y$. If $F = G$, then $u$ follows a uniform distribution. + + For integer outcomes, the PIT is no longer uniform even when forecasts are ideal. Instead, a randomised PIT can be used: + $$u = P(y - 1) + v \cdot (P(y) - P(y - 1)),$$ + where $y$ is again the observed value, $P()$ is the cumulative probability assigned to all values smaller than or equal to $y$ (with $P(-1) = 0$ by definition), and $v$ is a standard uniform variable independent of $y$. If $P$ is equal to the true data-generating distribution function, then $u$ is standard uniform. A non-randomised version of the PIT for count data has also been proposed and could be used alternatively. + + **Usage**: + One can plot a histogram of $u$ values to look for deviations from uniformity. 
U-shaped histograms often result from predictions that are too narrow, while hump-shaped histograms indicate that predictions may be too wide. Biased predictions will usually result in a triangle-shaped histogram. One can also test for deviations from uniformity, using for example an Anderson-Darling test. This, however, proves to be overly strict in practice and even slight deviations from perfect calibration are punished in a way that makes it very hard to compare models at all. In addition, errors from forecasts may be correlated (e.g. forecasts made on a given date), potentially violating the assumptions of the Anderson-Darling test. We therefore do not recommend it for most use cases.)" +) + +mean_score_ratio <- list( + `Metric` = "Mean score ratio", + `Explanation` = r"(The mean score ratio is used to compare two models on the overlapping set of forecast targets for which both models have made a prediction. The mean score ratio is calculated as the mean score achieved by the first model over the mean score achieved by the second model. More precisely, for two models $i, j$, we determine the set of overlapping forecasts, denoted by $\mathcal{A}_{ij}$, and compute the mean score ratio $\theta_{ij}$ as + $$\theta_{ij} = \frac{\text{mean score model } i \text{ on } \mathcal{A}_{ij}}{\text{mean score model } j \text{ on } \mathcal{A}_{ij}}.$$ + The mean score ratio can in principle be computed for any arbitrary score. + + **Usage**: + Mean score ratios are usually calculated in the context of pairwise comparisons, where a set of models is compared by looking at mean score ratios of all possible pairings. Whether smaller or larger values are better depends on the orientation of the original score used.)" +) + +relative_skill <- list( + `Metric` = "Relative skill", + `Explanation` = r"(Relative skill scores can be used to obtain a ranking of models based on pairwise comparisons between all models. To compute the relative skill $\theta_i$ of model $i$, we take the geometric mean of all mean score ratios that involve model $i$, i.e. + $$ \theta_{i} = \left(\prod_{m = 1}^M \theta_{im}\right)^{1/M}, $$ + where $M$ is the number of models. + + **Usage and caveats**: + Relative skill is a helpful way to obtain a model ranking. Whether smaller or larger values are better depends on the orientation of the original score used. + It is in principle relatively robust against biases that arise when models only forecast some of the available targets and is a reasonable way to handle missing forecasts. One possible precautionary measure to reduce issues with missing forecasts is to only compare models that have forecasted at least half of all possible targets (this ensures that there is always an overlap between models). If there is no overlap between models, the relative skill implicitly estimates how a model would have forecasted on those missing targets. 
)" +) + +data <- rbind(as.data.frame(crps), + as.data.frame(log_score), + as.data.frame(wis), + as.data.frame(dss), + as.data.frame(brier_score), + as.data.frame(interval_coverage), + as.data.frame(quantile_coverage), + as.data.frame(pit), + as.data.frame(sharpness), + as.data.frame(bias), + as.data.frame(mean_score_ratio), + as.data.frame(relative_skill)) + +saveRDS(data, "inst/metrics-overview/metrics-detailed.Rda") diff --git a/man/abs_error.Rd b/man/abs_error.Rd index 0d5beaae1..eac8d3579 100644 --- a/man/abs_error.Rd +++ b/man/abs_error.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/absolute_error.R +% Please edit documentation in R/metrics_point_forecasts.R \name{abs_error} \alias{abs_error} \title{Absolute Error} @@ -16,9 +16,11 @@ quantiles in a second vector, \code{quantiles}.} vector with the absolute error } \description{ -Caclulate absolute error as +Calculate absolute error as \deqn{ + \text{abs}(\text{true_value} - \text{median_prediction}) +}{ abs(true_value - prediction) } } @@ -27,3 +29,7 @@ true_values <- rnorm(30, mean = 1:30) predicted_values <- rnorm(30, mean = 1:30) abs_error(true_values, predicted_values) } +\seealso{ +\code{\link[=ae_median_sample]{ae_median_sample()}}, \code{\link[=ae_median_quantile]{ae_median_quantile()}} +} +\keyword{metric} diff --git a/man/add_coverage.Rd b/man/add_coverage.Rd new file mode 100644 index 000000000..ad658432e --- /dev/null +++ b/man/add_coverage.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarise_scores.R +\name{add_coverage} +\alias{add_coverage} +\title{Add coverage of central prediction intervals} +\usage{ +add_coverage(scores, by, ranges = c(50, 90)) +} +\arguments{ +\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} + +\item{by}{character vector with column names to add the coverage for.} + +\item{ranges}{numeric vector of the ranges of the central prediction intervals +for which coverage values shall be added.} +} +\value{ +a data.table with unsummarised scores with columns added for the +coverage of the central prediction intervals. While the overall data.table +is still unsummarised, note that for the coverage columns some level of +summary is present according to the value specified in \code{by}. +} +\description{ +Adds a column with the coverage of central prediction intervals +to unsummarised scores as produced by \code{\link[=score]{score()}} +} +\details{ +The coverage values that are added are computed according to the values +specified in \code{by}. If, for example, \code{by = "model"}, then there will be one +coverage value for every model and \code{\link[=add_coverage]{add_coverage()}} will compute the coverage +for every model across the values present in all other columns which define +the unit of a single forecast. 
+} +\examples{ +library(magrittr) # pipe operator +score(example_quantile) \%>\% + add_coverage(by = c("model", "target_type")) \%>\% + summarise_scores(by = c("model", "target_type")) \%>\% + summarise_scores(fun = signif, digits = 2) +} +\keyword{scoring} diff --git a/man/add_quantiles.Rd b/man/add_quantiles.Rd deleted file mode 100644 index 5fe384de3..000000000 --- a/man/add_quantiles.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval_forecasts_helper.R -\name{add_quantiles} -\alias{add_quantiles} -\title{Add Quantiles to Predictions When Summarising} -\usage{ -add_quantiles(dt, varnames, quantiles, by) -} -\arguments{ -\item{dt}{the data.table operated on} - -\item{varnames}{names of the variables for which to calculate quantiles} - -\item{quantiles}{the desired quantiles} - -\item{by}{grouping variable in \code{\link[=eval_forecasts]{eval_forecasts()}}} -} -\value{ -\code{data.table} with quantiles added -} -\description{ -Helper function used within eval_forecasts -} -\keyword{internal} diff --git a/man/add_rel_skill_to_eval_forecasts.Rd b/man/add_rel_skill_to_eval_forecasts.Rd deleted file mode 100644 index 713d38ac8..000000000 --- a/man/add_rel_skill_to_eval_forecasts.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pairwise-comparisons.R -\name{add_rel_skill_to_eval_forecasts} -\alias{add_rel_skill_to_eval_forecasts} -\title{Add relative skill to eval_forecasts()} -\usage{ -add_rel_skill_to_eval_forecasts( - unsummarised_scores, - rel_skill_metric, - baseline, - by, - summarise_by, - verbose -) -} -\arguments{ -\item{unsummarised_scores}{unsummarised scores to be passed from -\code{\link[=eval_forecasts]{eval_forecasts()}}} - -\item{rel_skill_metric}{character string with the name of the metric for which -a relative skill shall be computed. If equal to 'auto' (the default), then -one of interval score, crps or brier score will be used where appropriate} - -\item{baseline}{character string with the name of a model. If a baseline is -given, then a scaled relative skill with respect to the baseline will be -returned. By default (\code{NULL}), relative skill will not be scaled with -respect to a baseline model.} - -\item{by}{character vector of columns to group scoring by. This should be the -lowest level of grouping possible, i.e. the unit of the individual -observation. This is important as many functions work on individual -observations. If you want a different level of aggregation, you should use -\code{summarise_by} to aggregate the individual scores. -Also not that the pit will be computed using \code{summarise_by} -instead of \code{by}} - -\item{summarise_by}{character vector of columns to group the summary by. By -default, this is equal to \code{by} and no summary takes place. -But sometimes you may want to to summarise -over categories different from the scoring. -\code{summarise_by} is also the grouping level used to compute -(and possibly plot) the probability integral transform(pit).} - -\item{verbose}{print out additional helpful messages (default is \code{TRUE})} -} -\description{ -This function will only be called within \code{\link[=eval_forecasts]{eval_forecasts()}} and serves to -make pairwise comparisons from within that function. It uses the -\code{summarise_by} argument as well as the data from \code{\link[=eval_forecasts]{eval_forecasts()}}. 
-Essentially, it wraps \code{\link[=pairwise_comparison]{pairwise_comparison()}} and deals with the specifics -necessary to work with \code{\link[=eval_forecasts]{eval_forecasts()}}. -} -\keyword{internal} diff --git a/man/add_sd.Rd b/man/add_sd.Rd deleted file mode 100644 index 192ff83bc..000000000 --- a/man/add_sd.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval_forecasts_helper.R -\name{add_sd} -\alias{add_sd} -\title{Add Standard Deviation to Predictions When Summarising} -\usage{ -add_sd(dt, varnames, by) -} -\arguments{ -\item{dt}{the data.table operated on} - -\item{varnames}{names of the variables for which to calculate the sd} - -\item{by}{grouping variable in \code{\link[=eval_forecasts]{eval_forecasts()}}} -} -\value{ -\code{data.table} with sd added -} -\description{ -Helper function used within eval_forecasts -} -\keyword{internal} diff --git a/man/ae_median_quantile.Rd b/man/ae_median_quantile.Rd index ed2b60fc0..64ace88e0 100644 --- a/man/ae_median_quantile.Rd +++ b/man/ae_median_quantile.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/absolute_error.R +% Please edit documentation in R/metrics_point_forecasts.R \name{ae_median_quantile} \alias{ae_median_quantile} \title{Absolute Error of the Median (Quantile-based Version)} \usage{ -ae_median_quantile(true_values, predictions, quantiles = NULL, verbose = TRUE) +ae_median_quantile(true_values, predictions, quantiles = NULL) } \arguments{ \item{true_values}{A vector with the true observed values of size n} @@ -16,8 +16,6 @@ quantiles in a second vector, \code{quantiles}.} in \code{predictions}. Only those predictions where \code{quantiles == 0.5} will be kept. If \code{quantiles} is \code{NULL}, then all \code{predictions} and \code{true_values} will be used (this is then the same as \code{\link[=abs_error]{abs_error()}})} - -\item{verbose}{logical, return a warning is something unexpected happens} } \value{ vector with the scoring values @@ -26,11 +24,20 @@ vector with the scoring values Absolute error of the median calculated as \deqn{ + \text{abs}(\text{true_value} - \text{prediction}) +}{ abs(true_value - median_prediction) } + +The function was created for internal use within \code{\link[=score]{score()}}, but can also +used as a standalone function. } \examples{ true_values <- rnorm(30, mean = 1:30) predicted_values <- rnorm(30, mean = 1:30) ae_median_quantile(true_values, predicted_values, quantiles = 0.5) } +\seealso{ +\code{\link[=ae_median_sample]{ae_median_sample()}}, \code{\link[=abs_error]{abs_error()}} +} +\keyword{metric} diff --git a/man/ae_median_sample.Rd b/man/ae_median_sample.Rd index 4e16a8f3f..41bb1e34d 100644 --- a/man/ae_median_sample.Rd +++ b/man/ae_median_sample.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/absolute_error.R +% Please edit documentation in R/metrics_point_forecasts.R \name{ae_median_sample} \alias{ae_median_sample} \title{Absolute Error of the Median (Sample-based Version)} @@ -10,9 +10,8 @@ ae_median_sample(true_values, predictions) \item{true_values}{A vector with the true observed values of size n} \item{predictions}{nxN matrix of predictive samples, n (number of rows) being -the number of data points and N (number of columns) the -number of Monte Carlo samples. Alternatively, predictions can just be a vector -of size n} +the number of data points and N (number of columns) the number of Monte +Carlo samples. 
Alternatively, predictions can just be a vector of size n.} } \value{ vector with the scoring values @@ -21,6 +20,8 @@ vector with the scoring values Absolute error of the median calculated as \deqn{ + \text{abs}(\text{true_value} - \text{median_prediction}) +}{ abs(true_value - median_prediction) } } @@ -29,3 +30,7 @@ true_values <- rnorm(30, mean = 1:30) predicted_values <- rnorm(30, mean = 1:30) ae_median_sample(true_values, predicted_values) } +\seealso{ +\code{\link[=ae_median_quantile]{ae_median_quantile()}}, \code{\link[=abs_error]{abs_error()}} +} +\keyword{metric} diff --git a/man/avail_forecasts.Rd b/man/avail_forecasts.Rd new file mode 100644 index 000000000..541e588f9 --- /dev/null +++ b/man/avail_forecasts.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/avail_forecasts.R +\name{avail_forecasts} +\alias{avail_forecasts} +\title{Display Number of Forecasts Available} +\usage{ +avail_forecasts(data, by = NULL, collapse = c("quantile", "sample")) +} +\arguments{ +\item{data}{data.frame with predictions in the same format required for +\code{\link[=score]{score()}}.} + +\item{by}{character vector or \code{NULL} (the default) that denotes the +categories over which the number of forecasts should be counted. +By default (\code{by = NULL}) this will be the unit of a single forecast (i.e. +all available columns (apart from a few "protected" columns such as +'prediction' and 'true value') plus "quantile" or "sample" where present).} + +\item{collapse}{character vector (default is \verb{c("quantile", "sample"}) with +names of categories for which the number of rows should be collapsed to one +when counting. For example, a single forecast is usually represented by a +set of several quantiles or samples and collapsing these to one makes sure +that a single forecast only gets counted once.} +} +\value{ +A data.table with columns as specified in \code{by} and an additional +column with the number of forecasts. +} +\description{ +Given a data set with forecasts, count the number of available forecasts +for arbitrary grouping (e.g. the number of forecasts per model, or the +number of forecasts per model and location). +This is useful to determine whether there are any missing forecasts. +} +\examples{ +avail_forecasts(example_quantile, + collapse = c("quantile"), + by = c("model", "target_type") +) +} +\keyword{check-forecasts} diff --git a/man/available_metrics.Rd b/man/available_metrics.Rd index c52e02958..9dd3d737d 100644 --- a/man/available_metrics.Rd +++ b/man/available_metrics.Rd @@ -12,3 +12,4 @@ A vector with the name of all available metrics \description{ Available metrics in scoringutils } +\keyword{info} diff --git a/man/bias_quantile.Rd b/man/bias_quantile.Rd new file mode 100644 index 000000000..426535234 --- /dev/null +++ b/man/bias_quantile.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bias.R +\name{bias_quantile} +\alias{bias_quantile} +\title{Determines Bias of Quantile Forecasts} +\usage{ +bias_quantile(predictions, quantiles, true_value) +} +\arguments{ +\item{predictions}{vector of length corresponding to the number of quantiles +that holds predictions} + +\item{quantiles}{vector of corresponding size with the quantiles for which +predictions were made} + +\item{true_value}{a single true value} +} +\value{ +scalar with the quantile bias for a single quantile prediction +} +\description{ +Determines bias from quantile forecasts. 
For an increasing number of +quantiles this measure converges against the sample based bias version +for integer and continuous forecasts. +} +\details{ +For quantile forecasts, bias is measured as + +\deqn{ +B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) + 1( x_t \leq q_{t, 0.5}) \\ ++ (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) + 1( x_t \geq q_{t, 0.5}),} + +where \eqn{Q_t} is the set of quantiles that form the predictive +distribution at time \eqn{t}. They represent our +belief about what the true value \eqn{x_t} will be. For consistency, we define +\eqn{Q_t} such that it always includes the element +\eqn{q_{t, 0} = - \infty} and \eqn{q_{t,1} = \infty}. +\eqn{1()} is the indicator function that is \eqn{1} if the +condition is satisfied and $0$ otherwise. In clearer terms, \eqn{B_t} is +defined as the maximum percentile rank for which the corresponding quantile +is still below the true value, if the true value is smaller than the +median of the predictive distribution. If the true value is above the +median of the predictive distribution, then $B_t$ is the minimum percentile +rank for which the corresponding quantile is still larger than the true +value. If the true value is exactly the median, both terms cancel out and +\eqn{B_t} is zero. For a large enough number of quantiles, the +percentile rank will equal the proportion of predictive samples below the +observed true value, and this metric coincides with the one for +continuous forecasts. + +Bias can assume values between +-1 and 1 and is 0 ideally. +} +\examples{ + +predictions <- c( + 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996, + 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000, + 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004, + 7973.000, 8340.500, 8675.750, 11555.000, 11976.500 +) + +quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) + +true_value <- 8062 + +bias_quantile(predictions, quantiles, true_value = true_value) +} +\author{ +Nikos Bosse \email{nikosbosse@gmail.com} +} +\keyword{metric} diff --git a/man/quantile_bias.Rd b/man/bias_range.Rd similarity index 70% rename from man/quantile_bias.Rd rename to man/bias_range.Rd index 2c224db31..5f29f0f4d 100644 --- a/man/quantile_bias.Rd +++ b/man/bias_range.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/bias.R -\name{quantile_bias} -\alias{quantile_bias} +\name{bias_range} +\alias{bias_range} \title{Determines Bias of Quantile Forecasts} \usage{ -quantile_bias(range, lower, upper, true_value) +bias_range(range, lower, upper, true_value) } \arguments{ \item{range}{vector of corresponding size with information about the width @@ -32,15 +32,23 @@ for integer and continuous forecasts. For quantile forecasts, bias is measured as \deqn{ -B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) 1( x_t \leq q_{t, 0.5}) \\ -+ (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) 1( x_t \geq q_{t, 0.5}),} +B_t = (1 - 2 \cdot \max \{i | q_{t,i} \in Q_t \land q_{t,i} \leq x_t\}) +\mathbf{1}( x_t \leq q_{t, 0.5}) \\ ++ (1 - 2 \cdot \min \{i | q_{t,i} \in Q_t \land q_{t,i} \geq x_t\}) + \mathbf{1}( x_t \geq q_{t, 0.5}), +}{ +B_t = (1 - 2 * max(i | q_{t,i} in Q_t and q_{t,i} <= x_t\)) +1( x_t <= q_{t, 0.5}) + (1 - 2 * min(i | q_{t,i} in Q_t and q_{t,i} >= x_t)) + 1( x_t >= q_{t, 0.5}), +} where \eqn{Q_t} is the set of quantiles that form the predictive distribution at time \eqn{t}.
They represent our -belief about what the true value $x_t$ will be. For consistency, we define +belief about what the true value \eqn{x_t} will be. For consistency, we +define \eqn{Q_t} such that it always includes the element -\eqn{q_{t, 0} = - \infty$ and $q_{t,1} = \infty}. -\eqn{1()} is the indicator function that is \eqn{1} if the +\eqn{q_{t, 0} = - \infty} and \eqn{q_{t,1} = \infty}. +\eqn{\mathbf{1}()}{1()} is the indicator function that is \eqn{1} if the condition is satisfied and $0$ otherwise. In clearer terms, \eqn{B_t} is defined as the maximum percentile rank for which the corresponding quantile is still below the true value, if the true value is smaller than the @@ -58,22 +66,28 @@ Bias can assume values between } \examples{ -lower <- c(6341.000, 6329.500, 6087.014, 5703.500, - 5451.000, 5340.500, 4821.996, 4709.000, - 4341.500, 4006.250, 1127.000, 705.500) +lower <- c( + 6341.000, 6329.500, 6087.014, 5703.500, + 5451.000, 5340.500, 4821.996, 4709.000, + 4341.500, 4006.250, 1127.000, 705.500 +) -upper <- c(6341.000, 6352.500, 6594.986, 6978.500, - 7231.000, 7341.500, 7860.004, 7973.000, - 8340.500, 8675.750, 11555.000, 11976.500) +upper <- c( + 6341.000, 6352.500, 6594.986, 6978.500, + 7231.000, 7341.500, 7860.004, 7973.000, + 8340.500, 8675.750, 11555.000, 11976.500 +) range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) true_value <- 8062 -quantile_bias(lower = lower, upper = upper, - range = range, true_value = true_value) - +bias_range( + lower = lower, upper = upper, + range = range, true_value = true_value +) } \author{ Nikos Bosse \email{nikosbosse@gmail.com} } +\keyword{metric} diff --git a/man/bias.Rd b/man/bias_sample.Rd similarity index 86% rename from man/bias.Rd rename to man/bias_sample.Rd index 7a22eb5b6..953fbf00a 100644 --- a/man/bias.Rd +++ b/man/bias_sample.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/bias.R -\name{bias} -\alias{bias} +\name{bias_sample} +\alias{bias_sample} \title{Determines bias of forecasts} \usage{ -bias(true_values, predictions) +bias_sample(true_values, predictions) } \arguments{ \item{true_values}{A vector with the true observed values of size n} \item{predictions}{nxN matrix of predictive samples, n (number of rows) being -the number of data points and N (number of columns) the -number of Monte Carlo samples} +the number of data points and N (number of columns) the number of Monte +Carlo samples. Alternatively, predictions can just be a vector of size n.} } \value{ vector of length n with the biases of the predictive samples with @@ -50,14 +50,12 @@ In both cases, Bias can assume values between ## integer valued forecasts true_values <- rpois(30, lambda = 1:30) predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -bias(true_values, predictions) +bias_sample(true_values, predictions) ## continuous forecasts true_values <- rnorm(30, mean = 1:30) predictions <- replicate(200, rnorm(30, mean = 1:30)) -bias(true_values, predictions) - - +bias_sample(true_values, predictions) } \references{ The integer valued Bias function is discussed in @@ -71,3 +69,4 @@ region of Sierra Leone, 2014-15. PLOS Computational Biology 15(2): e1006785. 
\author{ Nikos Bosse \email{nikosbosse@gmail.com} } +\keyword{metric} diff --git a/man/brier_score.Rd b/man/brier_score.Rd index c676492d0..add2c1c11 100644 --- a/man/brier_score.Rd +++ b/man/brier_score.Rd @@ -7,7 +7,8 @@ brier_score(true_values, predictions) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{true_values}{A vector with the true observed values of size n with +all values equal to either 0 or 1} \item{predictions}{A vector with a predicted probability that true_value = 1.} @@ -28,13 +29,16 @@ The Brier Score is then computed as the mean squared error between the probabilistic prediction and the true outcome. \deqn{ -Brier_Score = \frac{1}{N} \sum_{t = 1}^{n} (prediction_t - outcome_t)^2 + \text{Brier_Score} = \frac{1}{N} \sum_{t = 1}^{n} (\text{prediction_t} - + \text{outcome_t})^2 +}{ + Brier_Score = 1/N \sum_{t = 1}^{n} (prediction_t - outcome_t)² } } \examples{ -true_values <- sample(c(0,1), size = 30, replace = TRUE) +true_values <- sample(c(0, 1), size = 30, replace = TRUE) predictions <- runif(n = 30, min = 0, max = 1) brier_score(true_values, predictions) - } +\keyword{metric} diff --git a/man/check_clean_data.Rd b/man/check_clean_data.Rd deleted file mode 100644 index 565592c12..000000000 --- a/man/check_clean_data.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_forecasts.R -\name{check_clean_data} -\alias{check_clean_data} -\title{Clean forecast data} -\usage{ -check_clean_data(data) -} -\arguments{ -\item{data}{A data.frame or similar as it gets passed to \code{\link[=eval_forecasts]{eval_forecasts()}}.} -} -\value{ -A data.table with NA values in \code{true_value} or \code{prediction} removed. -} -\description{ -Helper function to check that the input is in fact a data.frame -or similar and remove rows with no value for \code{prediction} or \code{true_value} -} -\keyword{internal} diff --git a/man/check_equal_length.Rd b/man/check_equal_length.Rd index ab2357899..38edbcd9c 100644 --- a/man/check_equal_length.Rd +++ b/man/check_equal_length.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/input-check-helpers.R \name{check_equal_length} \alias{check_equal_length} \title{Check Length} @@ -9,7 +9,8 @@ check_equal_length(..., one_allowed = TRUE) \arguments{ \item{...}{The variables to check} -\item{one_allowed}{logical, allow arguments of length one that can be recycled} +\item{one_allowed}{logical, allow arguments of length one that can be +recycled} } \value{ The function returns \code{NULL}, but throws an error if variable lengths diff --git a/man/check_forecasts.Rd b/man/check_forecasts.Rd index 34edae005..9d1f19791 100644 --- a/man/check_forecasts.Rd +++ b/man/check_forecasts.Rd @@ -7,15 +7,17 @@ check_forecasts(data) } \arguments{ -\item{data}{A data.frame or similar as would be used for \code{\link[=eval_forecasts]{eval_forecasts()}}} +\item{data}{data.frame with predictions in the same format required for +\code{\link[=score]{score()}}.} } \value{ A list with elements that give information about what \code{scoringutils} thinks you are trying to do and potential issues. 
\itemize{ \item \code{target_type} the type of the prediction target as inferred from the -input: 'binary', if all values in \code{true_value} are either 0 or 1 and values in -\code{prediction} are between 0 and 1, 'discrete' if all true values are integers +input: 'binary', if all values in \code{true_value} are either 0 or 1 and values +in \code{prediction} are between 0 and 1, 'discrete' if all true values are +integers. and 'continuous' if not. \item \code{prediction_type} inferred type of the prediction. 'quantile', if there is a column called 'quantile', else 'discrete' if all values in \code{prediction} @@ -35,32 +37,32 @@ values, but rather the maximum number of unique values across the whole data. \item \code{warnings} A vector with warnings. These can be ignored if you know what you are doing. \item \code{errors} A vector with issues that will cause an error when running -\code{\link[=eval_forecasts]{eval_forecasts()}}. +\code{\link[=score]{score()}}. \item \code{messages} A verbal explanation of the information provided above. } } \description{ Function to check the input data before running -\code{\link[=eval_forecasts]{eval_forecasts()}}. +\code{\link[=score]{score()}}. The data should come in one of three different formats: \itemize{ -\item A format for binary predictions (see \link{binary_example_data}) +\item A format for binary predictions (see \link{example_binary}) \item A sample-based format for discrete or continuous predictions -(see \link{continuous_example_data} and \link{integer_example_data}) -\item A quantile-based format (see \link{quantile_example_data}) +(see \link{example_continuous} and \link{example_integer}) +\item A quantile-based format (see \link{example_quantile}) } } \examples{ -library(scoringutils) -check <- check_forecasts(quantile_example_data) +check <- check_forecasts(example_quantile) print(check) -check_forecasts(binary_example_data) +check_forecasts(example_binary) } \seealso{ -Functions to move between different formats: -\code{\link[=range_long_to_quantile]{range_long_to_quantile()}}, \code{\link[=range_wide_to_long]{range_wide_to_long()}} +Function to move from sample-based to quantile format: +\code{\link[=sample_to_quantile]{sample_to_quantile()}} } \author{ Nikos Bosse \email{nikosbosse@gmail.com} } +\keyword{check-forecasts} diff --git a/man/check_metrics.Rd b/man/check_metrics.Rd new file mode 100644 index 000000000..11fb19035 --- /dev/null +++ b/man/check_metrics.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/input-check-helpers.R +\name{check_metrics} +\alias{check_metrics} +\title{Check whether the desired metrics are available in scoringutils} +\usage{ +check_metrics(metrics) +} +\arguments{ +\item{metrics}{character vector with desired metrics} +} +\value{ +A character vector with metrics that can be used for downstream +computation +} +\description{ +Helper function to check whether desired metrics are +available. If the input is \code{NULL}, all metrics will be returned. 
+} +\keyword{internal} diff --git a/man/check_not_null.Rd b/man/check_not_null.Rd index 242e43284..7e468e46e 100644 --- a/man/check_not_null.Rd +++ b/man/check_not_null.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/input-check-helpers.R \name{check_not_null} \alias{check_not_null} \title{Check Variable is not NULL} diff --git a/man/check_predictions.Rd b/man/check_predictions.Rd new file mode 100644 index 000000000..682ea3a95 --- /dev/null +++ b/man/check_predictions.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/input-check-helpers.R +\name{check_predictions} +\alias{check_predictions} +\title{Check Prediction Input For Lower-level Scoring Functions} +\usage{ +check_predictions( + predictions, + true_values = NULL, + type = c("continuous", "integer", "binary"), + class = c("vector", "matrix") +) +} +\arguments{ +\item{predictions}{an object with predictions. Depending on whether +\code{class = vector} or \code{class = "matrix"} this can be either a vector of length +n (corresponding to the length of the true_values) or a nxN matrix of +predictive samples, n (number of rows) being the number of data points and +N (number of columns) the number of Monte Carlo samples} + +\item{true_values}{A vector with the true observed values of size n} + +\item{type}{character, one of "continuous" (default), "integer" or "binary" that +defines the type of the forecast} + +\item{class}{character, either "vector" (default) or "matrix" that determines the +class the input has to correspond to} +} +\description{ +Helper function to check inputs for lower-level score functions. +} +\keyword{internal} diff --git a/man/check_summary_params.Rd b/man/check_summary_params.Rd new file mode 100644 index 000000000..1da886bf6 --- /dev/null +++ b/man/check_summary_params.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarise_scores.R +\name{check_summary_params} +\alias{check_summary_params} +\title{Check input parameters for \code{\link[=summarise_scores]{summarise_scores()}}} +\usage{ +check_summary_params(scores, by, relative_skill, baseline, metric) +} +\arguments{ +\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} + +\item{by}{character vector with column names to summarise scores by. Default +is \code{NULL}, meaning that the only summary that takes is place is summarising +over quantiles (in case of quantile-based forecasts), such that there is one +score per forecast as defined by the unit of a single forecast (rather than +one score for every quantile).} + +\item{relative_skill}{logical, whether or not to compute relative +performance between models based on pairwise comparisons. +If \code{TRUE} (default is \code{FALSE}), then a column called +'model' must be present in the input data. For more information on +the computation of relative skill, see \code{\link[=pairwise_comparison]{pairwise_comparison()}}. +Relative skill will be calculated for the aggregation level specified in +\code{by}.} + +\item{baseline}{character string with the name of a model. If a baseline is +given, then a scaled relative skill with respect to the baseline will be +returned. By default (\code{NULL}), relative skill will not be scaled with +respect to a baseline model.} + +\item{metric}{character with the name of the metric for which +a relative skill shall be computed. 
If equal to 'auto' (the default), then +this will be either interval score, crps or brier score (depending on which +of these is available in the input data)} +} +\description{ +A helper function to check the input parameters for +\code{\link[=score]{score()}}. +} +\keyword{internal} diff --git a/man/check_true_values.Rd b/man/check_true_values.Rd new file mode 100644 index 000000000..da5489dcc --- /dev/null +++ b/man/check_true_values.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/input-check-helpers.R +\name{check_true_values} +\alias{check_true_values} +\title{Check Observed Value Input For Lower-level Scoring Functions} +\usage{ +check_true_values(true_values, type = c("continuous", "integer", "binary")) +} +\arguments{ +\item{true_values}{A vector with the true observed values of size n} + +\item{type}{character, one of "continuous" (default), "integer" or "binary" that +defines the type of the forecast} +} +\description{ +Helper function to check inputs for lower-level score functions. +} +\keyword{internal} diff --git a/man/collapse_messages.Rd b/man/collapse_messages.Rd new file mode 100644 index 000000000..47f3abaed --- /dev/null +++ b/man/collapse_messages.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_forecasts.R +\name{collapse_messages} +\alias{collapse_messages} +\title{Collapse several messages to one} +\usage{ +collapse_messages(type = "messages", messages) +} +\arguments{ +\item{type}{character, should be either "messages", "warnings" or "errors"} + +\item{messages}{the messages or warnings to collapse} +} +\value{ +string with the message or warning +} +\description{ +Internal helper function to facilitate generating messages +and warnings in \code{\link[=check_forecasts]{check_forecasts()}} +} +\keyword{internal} diff --git a/man/compare_two_models.Rd b/man/compare_two_models.Rd index f6c5d382c..840658c23 100644 --- a/man/compare_two_models.Rd +++ b/man/compare_two_models.Rd @@ -4,31 +4,38 @@ \alias{compare_two_models} \title{Compare Two Models Based on Subset of Common Forecasts} \usage{ -compare_two_models(scores, name_model1, name_model2, metric, test_options, by) +compare_two_models( + scores, + name_model1, + name_model2, + metric, + one_sided = FALSE, + test_type = c("non_parametric", "permutation"), + n_permutations = 999 +) } \arguments{ -\item{scores}{A data.frame of unsummarised scores as produced by -\code{\link[=eval_forecasts]{eval_forecasts()}}} +\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} \item{name_model1}{character, name of the first model} \item{name_model2}{character, name of the model to compare against} -\item{metric}{A character vector of length one with the metric to do -the comparison on.} +\item{metric}{A character vector of length one with the metric to do the +comparison on. The default is "auto", meaning that either "interval_score", +"crps", or "brier_score" will be selected where available. +See \code{\link[=available_metrics]{available_metrics()}} for available metrics.} -\item{test_options}{list with options to pass down to \code{\link[=compare_two_models]{compare_two_models()}}. -To change only one of the default options, just pass a list as input with -the name of the argument you want to change. 
All elements not included in the -list will be set to the default (so passing an empty list would result in the -default options).} +\item{one_sided}{Boolean, default is \code{FALSE}, whether two conduct a one-sided +instead of a two-sided test to determine significance in a pairwise +comparison.} -\item{by}{character vector of columns to group scoring by. This should be the -lowest level of grouping possible, i.e. the unit of the individual -observation. This is important as many functions work on individual -observations. If you want a different level of aggregation, you should use -\code{summarise_by} to aggregate the individual scores. -Also not that the pit will be computed using \code{summarise_by} instead of \code{by}} +\item{test_type}{character, either "non_parametric" (the default) or +"permutation". This determines which kind of test shall be conducted to +determine p-values.} + +\item{n_permutations}{numeric, the number of permutations for a +permutation test. Default is 999.} } \description{ This function compares two models based on the subset of forecasts for which @@ -46,3 +53,4 @@ Johannes Bracher, \email{johannes.bracher@kit.edu} Nikos Bosse \email{nikosbosse@gmail.com} } +\keyword{internal} diff --git a/man/continuous_example_data.Rd b/man/continuous_example_data.Rd deleted file mode 100644 index 94282bf3a..000000000 --- a/man/continuous_example_data.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{continuous_example_data} -\alias{continuous_example_data} -\title{Continuous Forecast Example Data} -\format{ -A data frame with 13,429 rows and 10 columns: -\describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{value_desc}{long form description of the prediction target} -\item{model}{name of the model that generated the forecasts} -\item{creation_date}{date on which the forecast was made} -\item{horizon}{forecast horizon in days} -\item{prediction}{prediction value for the corresponding sample} -\item{sample}{id for the corresponding sample} -\item{true_value}{true observed values} -} -} -\usage{ -continuous_example_data -} -\description{ -A data set with continuous predictions in a sample-based format relevant in the -2020 UK Covid-19 epidemic. -} -\keyword{datasets} diff --git a/man/correlation.Rd b/man/correlation.Rd new file mode 100644 index 000000000..77de3a469 --- /dev/null +++ b/man/correlation.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correlations.R +\name{correlation} +\alias{correlation} +\title{Correlation Between Metrics} +\usage{ +correlation(scores, metrics = NULL) +} +\arguments{ +\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} + +\item{metrics}{A character vector with the metrics to show. If set to +\code{NULL} (default), all metrics present in \code{scores} will +be shown} +} +\value{ +A data.table with correlations for the different metrics +} +\description{ +Calculate the correlation between different metrics for a data.frame of +scores as produced by \code{\link[=score]{score()}}. 
+} +\examples{ +scores <- score(example_quantile) +correlation(scores) +} +\keyword{scoring} diff --git a/man/correlation_plot.Rd b/man/correlation_plot.Rd deleted file mode 100644 index d3c3f829c..000000000 --- a/man/correlation_plot.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{correlation_plot} -\alias{correlation_plot} -\title{Plot Correlation Between Metrics} -\usage{ -correlation_plot(scores, select_metrics = NULL) -} -\arguments{ -\item{scores}{A data.frame of scores as produced by -\code{\link[=eval_forecasts]{eval_forecasts()}}} - -\item{select_metrics}{A character vector with the metrics to show. If set to -\code{NULL} (default), all metrics present in \code{summarised_scores} will -be shown} -} -\value{ -A ggplot2 object showing a coloured matrix of correlations -between metrics -} -\description{ -Plots a coloured table of scores obtained using -\code{\link[=eval_forecasts]{eval_forecasts()}} -} -\examples{ -scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data) -scoringutils::correlation_plot(scores) -} diff --git a/man/crps.Rd b/man/crps_sample.Rd similarity index 74% rename from man/crps.Rd rename to man/crps_sample.Rd index b2432bf50..0e8f48e4b 100644 --- a/man/crps.Rd +++ b/man/crps_sample.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/scoringRules_wrappers.R -\name{crps} -\alias{crps} +\name{crps_sample} +\alias{crps_sample} \title{Ranked Probability Score} \usage{ -crps(true_values, predictions) +crps_sample(true_values, predictions) } \arguments{ \item{true_values}{A vector with the true observed values of size n} \item{predictions}{nxN matrix of predictive samples, n (number of rows) being -the number of data points and N (number of columns) the -number of Monte Carlo samples} +the number of data points and N (number of columns) the number of Monte +Carlo samples. 
Alternatively, predictions can just be a vector of size n.} } \value{ vector with the scoring values @@ -25,9 +25,10 @@ valued forecasts \examples{ true_values <- rpois(30, lambda = 1:30) predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -crps(true_values, predictions) +crps_sample(true_values, predictions) } \references{ Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic Forecasts with scoringRules, \url{https://arxiv.org/pdf/1709.04743.pdf} } +\keyword{metric} diff --git a/man/delete_columns.Rd b/man/delete_columns.Rd index 035a5d8bb..124b86625 100644 --- a/man/delete_columns.Rd +++ b/man/delete_columns.Rd @@ -4,12 +4,14 @@ \alias{delete_columns} \title{Delete Columns From a Data.table} \usage{ -delete_columns(df, cols_to_delete) +delete_columns(df, cols_to_delete, make_unique = FALSE) } \arguments{ \item{df}{A data.table or data.frame from which columns shall be deleted} \item{cols_to_delete}{character vector with names of columns to be deleted} + +\item{make_unique}{whether to make the data set unique after removing columns} } \value{ A data.table diff --git a/man/dss.Rd b/man/dss_sample.Rd similarity index 73% rename from man/dss.Rd rename to man/dss_sample.Rd index c7dd3abe1..b65a04282 100644 --- a/man/dss.Rd +++ b/man/dss_sample.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/scoringRules_wrappers.R -\name{dss} -\alias{dss} +\name{dss_sample} +\alias{dss_sample} \title{Dawid-Sebastiani Score} \usage{ -dss(true_values, predictions) +dss_sample(true_values, predictions) } \arguments{ \item{true_values}{A vector with the true observed values of size n} \item{predictions}{nxN matrix of predictive samples, n (number of rows) being -the number of data points and N (number of columns) the -number of Monte Carlo samples} +the number of data points and N (number of columns) the number of Monte +Carlo samples. Alternatively, predictions can just be a vector of size n.} } \value{ vector with scoring values @@ -24,9 +24,10 @@ function from the \examples{ true_values <- rpois(30, lambda = 1:30) predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -dss(true_values, predictions) +dss_sample(true_values, predictions) } \references{ Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic Forecasts with scoringRules, \url{https://arxiv.org/pdf/1709.04743.pdf} } +\keyword{metric} diff --git a/man/eval_forecasts.Rd b/man/eval_forecasts.Rd deleted file mode 100644 index 30e2180af..000000000 --- a/man/eval_forecasts.Rd +++ /dev/null @@ -1,238 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval_forecasts.R -\name{eval_forecasts} -\alias{eval_forecasts} -\title{Evaluate forecasts} -\usage{ -eval_forecasts( - data = NULL, - by = NULL, - summarise_by = by, - metrics = NULL, - quantiles = c(), - sd = FALSE, - interval_score_arguments = list(weigh = TRUE, count_median_twice = FALSE, - separate_results = TRUE), - pit_plots = FALSE, - summarised = TRUE, - verbose = TRUE, - forecasts = NULL, - truth_data = NULL, - merge_by = NULL, - compute_relative_skill = FALSE, - rel_skill_metric = "auto", - baseline = NULL -) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -Note: it is easiest to have a look at the example files provided in the -package and in the examples below. 
-The following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For quantile forecasts the data can be provided in variety of formats. You -can either use a range-based format or a quantile-based format. (You can -convert between formats using \code{\link[=quantile_to_range_long]{quantile_to_range_long()}}, -\code{\link[=range_long_to_quantile]{range_long_to_quantile()}}, -\code{\link[=sample_to_range_long]{sample_to_range_long()}}, -\code{\link[=sample_to_quantile]{sample_to_quantile()}}) -For a quantile-format forecast you should provide: -\itemize{ -\item \code{prediction}: prediction to the corresponding quantile -\item \code{quantile}: quantile to which the prediction corresponds -For a range format (long) forecast you need -\item \code{prediction}: the quantile forecasts -\item \code{boundary}: values should be either "lower" or "upper", depending -on whether the prediction is for the lower or upper bound of a given range -\item \code{range} the range for which a forecast was made. For a 50\% interval -the range should be 50. The forecast for the 25\% quantile should have -the value in the \code{prediction} column, the value of \code{range} -should be 50 and the value of \code{boundary} should be "lower". -If you want to score the median (i.e. \code{range = 0}), you still -need to include a lower and an upper estimate, so the median has to -appear twice. -Alternatively you can also provide the format in a wide range format. -This format needs: -\item pairs of columns called something like 'upper_90' and 'lower_90', -or 'upper_50' and 'lower_50', where the number denotes the interval range. -For the median, you need to provide columns called 'upper_0' and 'lower_0' -}} - -\item{by}{character vector of columns to group scoring by. This should be the -lowest level of grouping possible, i.e. the unit of the individual -observation. This is important as many functions work on individual -observations. If you want a different level of aggregation, you should use -\code{summarise_by} to aggregate the individual scores. -Also not that the pit will be computed using \code{summarise_by} -instead of \code{by}} - -\item{summarise_by}{character vector of columns to group the summary by. By -default, this is equal to \code{by} and no summary takes place. -But sometimes you may want to to summarise -over categories different from the scoring. -\code{summarise_by} is also the grouping level used to compute -(and possibly plot) the probability integral transform(pit).} - -\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the -default), all available metrics will be computed. For a list of available -metrics see \code{\link[=available_metrics]{available_metrics()}}} - -\item{quantiles}{numeric vector of quantiles to be returned when summarising. -Instead of just returning a mean, quantiles will be returned for the -groups specified through \code{summarise_by}. 
By default, no quantiles are -returned.} - -\item{sd}{if \code{TRUE} (the default is \code{FALSE}) the standard deviation of all -metrics will be returned when summarising.} - -\item{interval_score_arguments}{list with arguments for the calculation of -the interval score. These arguments get passed down to -\code{interval_score}, except for the argument \code{count_median_twice} that -controls how the interval scores for different intervals are summed up. This -should be a logical (default is \code{FALSE}) that indicates whether or not -to count the median twice when summarising. This would conceptually treat the -median as a 0\% prediction interval, where the median is the lower as well as -the upper bound. The alternative is to treat the median as a single quantile -forecast instead of an interval. The interval score would then -be better understood as an average of quantile scores.)} - -\item{pit_plots}{if \code{TRUE} (not the default), pit plots will be returned. For -details see \code{\link[=pit]{pit()}}.} - -\item{summarised}{Summarise arguments (i.e. take the mean per group -specified in group_by. Default is \code{TRUE.}} - -\item{verbose}{print out additional helpful messages (default is \code{TRUE})} - -\item{forecasts}{data.frame with forecasts, that should follow the same -general guidelines as the \code{data} input. Argument can be used to supply -forecasts and truth data independently. Default is \code{NULL}.} - -\item{truth_data}{data.frame with a column called \code{true_value} to be merged -with \code{forecasts}} - -\item{merge_by}{character vector with column names that \code{forecasts} and -\code{truth_data} should be merged on. Default is \code{NULL} and merge will be -attempted automatically.} - -\item{compute_relative_skill}{logical, whether or not to compute relative -performance between models. If \code{TRUE} (default is \code{FALSE}), then a column called -'model' must be present in the input data. For more information on -the computation of relative skill, see \code{\link[=pairwise_comparison]{pairwise_comparison()}}. -Relative skill will be calculated for the aggregation level specified in -\code{summarise_by}.} - -\item{rel_skill_metric}{character string with the name of the metric for which -a relative skill shall be computed. If equal to 'auto' (the default), then -one of interval score, crps or brier score will be used where appropriate} - -\item{baseline}{character string with the name of a model. If a baseline is -given, then a scaled relative skill with respect to the baseline will be -returned. By default (\code{NULL}), relative skill will not be scaled with -respect to a baseline model.} -} -\value{ -A data.table with appropriate scores. For binary predictions, -the Brier Score will be returned, for quantile predictions the interval -score, as well as adapted metrics for calibration, sharpness and bias. -For integer forecasts, Sharpness, Bias, DSS, CRPS, LogS, and -pit_p_val (as an indicator of calibration) are returned. For integer -forecasts, pit_sd is returned (to account for the randomised PIT), -but no Log Score is returned (the internal estimation relies on a -kernel density estimate which is difficult for integer-valued forecasts). -If \code{summarise_by} is specified differently from \code{by}, -the average score per summary unit is returned. -If specified, quantiles and standard deviation of scores can also be returned -when summarising. 
-} -\description{ -The function \code{eval_forecasts} is an easy to use wrapper -of the lower level functions in the \pkg{scoringutils} package. -It can be used to score probabilistic or quantile forecasts of -continuous, integer-valued or binary variables. -} -\details{ -the following metrics are used where appropriate: -\itemize{ -\item {Interval Score} for quantile forecasts. Smaller is better. See -\code{\link[=interval_score]{interval_score()}} for more information. By default, the -weighted interval score is used. -\item {Brier Score} for a probability forecast of a binary outcome. -Smaller is better. See \code{\link[=brier_score]{brier_score()}} for more information. -\item {aem} Absolute error of the median prediction -\item {Bias} 0 is good, 1 and -1 are bad. -See \code{\link[=bias]{bias()}} for more information. -\item {Sharpness} Smaller is better. See \code{\link[=sharpness]{sharpness()}} for more -information. -\item {Calibration} represented through the p-value of the -Anderson-Darling test for the uniformity of the Probability Integral -Transformation (PIT). For integer valued forecasts, this p-value also -has a standard deviation. Larger is better. -See \code{\link[=pit]{pit()}} for more information. -\item {DSS} Dawid-Sebastiani-Score. Smaller is better. -See \code{\link[=dss]{dss()}} for more information. -\item {CRPS} Continuous Ranked Probability Score. Smaller is better. -See \code{\link[=crps]{crps()}} for more information. -\item {Log Score} Smaller is better. Only for continuous forecasts. -See \code{\link[=logs]{logs()}} for more information. -} -} -\examples{ -## Probability Forecast for Binary Target -binary_example <- data.table::setDT(scoringutils::binary_example_data) -eval <- scoringutils::eval_forecasts(binary_example, - summarise_by = c("model"), - quantiles = c(0.5), sd = TRUE, - verbose = FALSE) - -## Quantile Forecasts -# wide format example (this examples shows usage of both wide formats) -range_example_wide <- data.table::setDT(scoringutils::range_example_data_wide) -range_example <- scoringutils::range_wide_to_long(range_example_wide) -wide2 <- data.table::setDT(scoringutils::range_example_data_semi_wide) -range_example <- scoringutils::range_wide_to_long(wide2) -example <- scoringutils::range_long_to_quantile(range_example) -eval <- scoringutils::eval_forecasts(example, - summarise_by = "model", - quantiles = c(0.05, 0.95), - sd = TRUE) -eval <- scoringutils::eval_forecasts(example) - - -## Integer Forecasts -integer_example <- data.table::setDT(scoringutils::integer_example_data) -eval <- scoringutils::eval_forecasts(integer_example, - summarise_by = c("model"), - quantiles = c(0.1, 0.9), - sd = TRUE, - pit_plots = TRUE) -eval <- scoringutils::eval_forecasts(integer_example) - -## Continuous Forecasts -continuous_example <- data.table::setDT(scoringutils::continuous_example_data) -eval <- scoringutils::eval_forecasts(continuous_example) -eval <- scoringutils::eval_forecasts(continuous_example, - quantiles = c(0.5, 0.9), - sd = TRUE, - summarise_by = c("model")) - -} -\references{ -Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ -(2019) Assessing the performance of real-time epidemic forecasts: A -case study of Ebola in the Western Area region of Sierra Leone, 2014-15. -PLoS Comput Biol 15(2): e1006785. 
\url{doi:10.1371/journal.pcbi.1006785} -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} diff --git a/man/eval_forecasts_binary.Rd b/man/eval_forecasts_binary.Rd deleted file mode 100644 index 4b65e1c25..000000000 --- a/man/eval_forecasts_binary.Rd +++ /dev/null @@ -1,112 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval_forecasts_binary.R -\name{eval_forecasts_binary} -\alias{eval_forecasts_binary} -\title{Evaluate forecasts in a Binary Format} -\usage{ -eval_forecasts_binary( - data, - by, - summarise_by, - metrics, - quantiles, - sd, - summarised, - verbose -) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -Note: it is easiest to have a look at the example files provided in the -package and in the examples below. -The following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For quantile forecasts the data can be provided in variety of formats. You -can either use a range-based format or a quantile-based format. (You can -convert between formats using \code{\link[=quantile_to_range_long]{quantile_to_range_long()}}, -\code{\link[=range_long_to_quantile]{range_long_to_quantile()}}, -\code{\link[=sample_to_range_long]{sample_to_range_long()}}, -\code{\link[=sample_to_quantile]{sample_to_quantile()}}) -For a quantile-format forecast you should provide: -\itemize{ -\item \code{prediction}: prediction to the corresponding quantile -\item \code{quantile}: quantile to which the prediction corresponds -For a range format (long) forecast you need -\item \code{prediction}: the quantile forecasts -\item \code{boundary}: values should be either "lower" or "upper", depending -on whether the prediction is for the lower or upper bound of a given range -\item \code{range} the range for which a forecast was made. For a 50\% interval -the range should be 50. The forecast for the 25\% quantile should have -the value in the \code{prediction} column, the value of \code{range} -should be 50 and the value of \code{boundary} should be "lower". -If you want to score the median (i.e. \code{range = 0}), you still -need to include a lower and an upper estimate, so the median has to -appear twice. -Alternatively you can also provide the format in a wide range format. -This format needs: -\item pairs of columns called something like 'upper_90' and 'lower_90', -or 'upper_50' and 'lower_50', where the number denotes the interval range. -For the median, you need to provide columns called 'upper_0' and 'lower_0' -}} - -\item{by}{character vector of columns to group scoring by. This should be the -lowest level of grouping possible, i.e. the unit of the individual -observation. This is important as many functions work on individual -observations. If you want a different level of aggregation, you should use -\code{summarise_by} to aggregate the individual scores. 
-Also not that the pit will be computed using \code{summarise_by} -instead of \code{by}} - -\item{summarise_by}{character vector of columns to group the summary by. By -default, this is equal to \code{by} and no summary takes place. -But sometimes you may want to to summarise -over categories different from the scoring. -\code{summarise_by} is also the grouping level used to compute -(and possibly plot) the probability integral transform(pit).} - -\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the -default), all available metrics will be computed. For a list of available -metrics see \code{\link[=available_metrics]{available_metrics()}}} - -\item{quantiles}{numeric vector of quantiles to be returned when summarising. -Instead of just returning a mean, quantiles will be returned for the -groups specified through \code{summarise_by}. By default, no quantiles are -returned.} - -\item{sd}{if \code{TRUE} (the default is \code{FALSE}) the standard deviation of all -metrics will be returned when summarising.} - -\item{summarised}{Summarise arguments (i.e. take the mean per group -specified in group_by. Default is \code{TRUE.}} - -\item{verbose}{print out additional helpful messages (default is \code{TRUE})} -} -\value{ -A data.table with appropriate scores. For more information see -\code{\link[=eval_forecasts]{eval_forecasts()}} -} -\description{ -Evaluate forecasts in a Binary Format -} -\examples{ -# Probability Forecast for Binary Target -binary_example <- data.table::setDT(scoringutils::binary_example_data) -eval <- scoringutils::eval_forecasts(data = binary_example, - summarise_by = c("model"), - quantiles = c(0.5), sd = TRUE, - verbose = FALSE) - -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} diff --git a/man/eval_forecasts_sample.Rd b/man/eval_forecasts_sample.Rd deleted file mode 100644 index b186889d6..000000000 --- a/man/eval_forecasts_sample.Rd +++ /dev/null @@ -1,137 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eval_forecasts_continuous_integer.R -\name{eval_forecasts_sample} -\alias{eval_forecasts_sample} -\title{Evaluate forecasts in a Sample-Based Format (Integer or Continuous)} -\usage{ -eval_forecasts_sample( - data, - by, - summarise_by, - metrics, - prediction_type, - quantiles, - sd, - pit_plots, - summarised, - verbose -) -} -\arguments{ -\item{data}{A data.frame or data.table with the predictions and observations. -Note: it is easiest to have a look at the example files provided in the -package and in the examples below. -The following columns need to be present: -\itemize{ -\item \code{true_value} - the true observed values -\item \code{prediction} - predictions or predictive samples for one -true value. (You only don't need to provide a prediction column if -you want to score quantile forecasts in a wide range format.)} -For integer and continuous forecasts a \code{sample} column is needed: -\itemize{ -\item \code{sample} - an index to identify the predictive samples in the -prediction column generated by one model for one true value. Only -necessary for continuous and integer forecasts, not for -binary predictions.} -For quantile forecasts the data can be provided in variety of formats. You -can either use a range-based format or a quantile-based format. 
(You can -convert between formats using \code{\link[=quantile_to_range_long]{quantile_to_range_long()}}, -\code{\link[=range_long_to_quantile]{range_long_to_quantile()}}, -\code{\link[=sample_to_range_long]{sample_to_range_long()}}, -\code{\link[=sample_to_quantile]{sample_to_quantile()}}) -For a quantile-format forecast you should provide: -\itemize{ -\item \code{prediction}: prediction to the corresponding quantile -\item \code{quantile}: quantile to which the prediction corresponds -For a range format (long) forecast you need -\item \code{prediction}: the quantile forecasts -\item \code{boundary}: values should be either "lower" or "upper", depending -on whether the prediction is for the lower or upper bound of a given range -\item \code{range} the range for which a forecast was made. For a 50\% interval -the range should be 50. The forecast for the 25\% quantile should have -the value in the \code{prediction} column, the value of \code{range} -should be 50 and the value of \code{boundary} should be "lower". -If you want to score the median (i.e. \code{range = 0}), you still -need to include a lower and an upper estimate, so the median has to -appear twice. -Alternatively you can also provide the format in a wide range format. -This format needs: -\item pairs of columns called something like 'upper_90' and 'lower_90', -or 'upper_50' and 'lower_50', where the number denotes the interval range. -For the median, you need to provide columns called 'upper_0' and 'lower_0' -}} - -\item{by}{character vector of columns to group scoring by. This should be the -lowest level of grouping possible, i.e. the unit of the individual -observation. This is important as many functions work on individual -observations. If you want a different level of aggregation, you should use -\code{summarise_by} to aggregate the individual scores. -Also not that the pit will be computed using \code{summarise_by} -instead of \code{by}} - -\item{summarise_by}{character vector of columns to group the summary by. By -default, this is equal to \code{by} and no summary takes place. -But sometimes you may want to to summarise -over categories different from the scoring. -\code{summarise_by} is also the grouping level used to compute -(and possibly plot) the probability integral transform(pit).} - -\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the -default), all available metrics will be computed. For a list of available -metrics see \code{\link[=available_metrics]{available_metrics()}}} - -\item{prediction_type}{character, should be either "continuous" or "integer"} - -\item{quantiles}{numeric vector of quantiles to be returned when summarising. -Instead of just returning a mean, quantiles will be returned for the -groups specified through \code{summarise_by}. By default, no quantiles are -returned.} - -\item{sd}{if \code{TRUE} (the default is \code{FALSE}) the standard deviation of all -metrics will be returned when summarising.} - -\item{pit_plots}{if \code{TRUE} (not the default), pit plots will be returned. For -details see \code{\link[=pit]{pit()}}.} - -\item{summarised}{Summarise arguments (i.e. take the mean per group -specified in group_by. Default is \code{TRUE.}} - -\item{verbose}{print out additional helpful messages (default is \code{TRUE})} -} -\value{ -A data.table with appropriate scores. 
For more information see -\code{\link[=eval_forecasts]{eval_forecasts()}} -} -\description{ -Evaluate forecasts in a Sample-Based Format (Integer or Continuous) -} -\examples{ - -## Integer Forecasts -integer_example <- data.table::setDT(scoringutils::integer_example_data) -eval <- scoringutils::eval_forecasts(integer_example, - summarise_by = c("model"), - quantiles = c(0.1, 0.9), - sd = TRUE, - pit_plots = TRUE) -eval <- scoringutils::eval_forecasts(integer_example) - -## Continuous Forecasts -continuous_example <- data.table::setDT(scoringutils::continuous_example_data) -eval <- scoringutils::eval_forecasts(continuous_example)#' - -eval <- scoringutils::eval_forecasts(continuous_example, - quantiles = c(0.5, 0.9), - sd = TRUE, - summarise_by = c("model")) - -} -\references{ -Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ -(2019) Assessing the performance of real-time epidemic forecasts: A -case study of Ebola in the Western Area region of Sierra Leone, 2014-15. -PLoS Comput Biol 15(2): e1006785. \url{doi:10.1371/journal.pcbi.1006785} -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} diff --git a/man/binary_example_data.Rd b/man/example_binary.Rd similarity index 50% rename from man/binary_example_data.Rd rename to man/example_binary.Rd index c0da1cc25..b76c3fa97 100644 --- a/man/binary_example_data.Rd +++ b/man/example_binary.Rd @@ -1,29 +1,32 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} -\name{binary_example_data} -\alias{binary_example_data} +\name{example_binary} +\alias{example_binary} \title{Binary Forecast Example Data} \format{ A data frame with 346 rows and 10 columns: \describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{value_desc}{long form description of the prediction target} -\item{model}{name of the model that generated the forecasts} -\item{creation_date}{date on which the forecast was made} -\item{horizon}{forecast horizon in days} -\item{prediction}{probability prediction that true value would be 1} +\item{location}{the country for which a prediction was made} +\item{location_name}{name of the country for which a prediction was made} +\item{target_end_date}{the date for which a prediction was made} +\item{target_type}{the target to be predicted (cases or deaths)} \item{true_value}{true observed values} +\item{forecast_date}{the date on which a prediction was made} +\item{model}{name of the model that generated the forecasts} +\item{horizon}{forecast horizon in weeks} +\item{prediction}{predicted value} +} } +\source{ +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ -binary_example_data +example_binary } \description{ -A data set with (constructed) binary predictions relevant in the -2020 UK Covid-19 epidemic. +A data set with binary predictions for COVID-19 cases and deaths constructed +from data submitted to the European Forecast Hub. 
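As a brief orientation for the binary example data described above, a minimal scoring sketch using `score()` and `summarise_scores()` as they appear elsewhere in this changeset (output columns and exact metrics depend on the package version, so treat this as an illustration rather than a reference):

```r
library(scoringutils)

# score the binary example forecasts (probability predictions for a
# binary outcome) and average the results per model and target type
scores <- score(example_binary)
summarise_scores(scores, by = c("model", "target_type"))
```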
} \details{ Predictions in the data set were constructed based on the continuous example diff --git a/man/example_continuous.Rd b/man/example_continuous.Rd new file mode 100644 index 000000000..3f071a724 --- /dev/null +++ b/man/example_continuous.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{example_continuous} +\alias{example_continuous} +\title{Continuous Forecast Example Data} +\format{ +A data frame with 13,429 rows and 10 columns: +\describe{ +\item{location}{the country for which a prediction was made} +\item{target_end_date}{the date for which a prediction was made} +\item{target_type}{the target to be predicted (cases or deaths)} +\item{true_value}{true observed values} +\item{location_name}{name of the country for which a prediction was made} +\item{forecast_date}{the date on which a prediction was made} +\item{model}{name of the model that generated the forecasts} +\item{horizon}{forecast horizon in weeks} +\item{prediction}{predicted value} +\item{sample}{id for the corresponding sample} +} +} +\source{ +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +} +\usage{ +example_continuous +} +\description{ +A data set with continuous predictions for COVID-19 cases and deaths +constructed from data submitted to the European Forecast Hub. +} +\keyword{datasets} diff --git a/man/example_integer.Rd b/man/example_integer.Rd new file mode 100644 index 000000000..98dceaf65 --- /dev/null +++ b/man/example_integer.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{example_integer} +\alias{example_integer} +\title{Integer Forecast Example Data} +\format{ +A data frame with 13,429 rows and 10 columns: +\describe{ +\item{location}{the country for which a prediction was made} +\item{target_end_date}{the date for which a prediction was made} +\item{target_type}{the target to be predicted (cases or deaths)} +\item{true_value}{true observed values} +\item{location_name}{name of the country for which a prediction was made} +\item{forecast_date}{the date on which a prediction was made} +\item{model}{name of the model that generated the forecasts} +\item{horizon}{forecast horizon in weeks} +\item{prediction}{predicted value} +\item{sample}{id for the corresponding sample} +} +} +\usage{ +example_integer +} +\description{ +A data set with integer predictions for COVID-19 cases and deaths +constructed from data submitted to the European Forecast Hub. 
+} +\keyword{datasets} diff --git a/man/example_quantile.Rd b/man/example_quantile.Rd new file mode 100644 index 000000000..f6b1e1927 --- /dev/null +++ b/man/example_quantile.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{example_quantile} +\alias{example_quantile} +\title{Quantile Example Data} +\format{ +A data frame with +\describe{ +\item{location}{the country for which a prediction was made} +\item{target_end_date}{the date for which a prediction was made} +\item{target_type}{the target to be predicted (cases or deaths)} +\item{true_value}{true observed values} +\item{location_name}{name of the country for which a prediction was made} +\item{forecast_date}{the date on which a prediction was made} +\item{quantile}{quantile of the corresponding prediction} +\item{prediction}{predicted value} +\item{model}{name of the model that generated the forecasts} +\item{horizon}{forecast horizon in weeks} +} +} +\source{ +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +} +\usage{ +example_quantile +} +\description{ +A data set with predictions for COVID-19 cases and deaths submitted to the +European Forecast Hub. +} +\keyword{datasets} diff --git a/man/example_quantile_forecasts_only.Rd b/man/example_quantile_forecasts_only.Rd index 68c4c93b1..fa6aac56a 100644 --- a/man/example_quantile_forecasts_only.Rd +++ b/man/example_quantile_forecasts_only.Rd @@ -7,23 +7,24 @@ \format{ A data frame with 7,581 rows and 9 columns: \describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{model}{name of the model that generated the forecasts} -\item{creation_date}{date on which the forecast was made} +\item{location}{the country for which a prediction was made} +\item{target_end_date}{the date for which a prediction was made} +\item{target_type}{the target to be predicted (cases or deaths)} +\item{forecast_date}{the date on which a prediction was made} \item{quantile}{quantile of the corresponding prediction} -\item{prediction}{quantile predictions} -\item{value_desc}{long form description of the prediction target} -\item{horizon}{forecast horizon in days} - +\item{prediction}{predicted value} +\item{model}{name of the model that generated the forecasts} +\item{horizon}{forecast horizon in weeks} +} } +\source{ +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile_forecasts_only } \description{ -A data set with predictions for different quantities relevant in the -2020 UK Covid-19 epidemic, but no true_values +A data set with quantile predictions for COVID-19 cases and deaths +submitted to the European Forecast Hub. 
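To make the long quantile format used by the quantile example data sets concrete, here is a hypothetical single-target forecast laid out with the documented columns. All values are invented and the table is intentionally tiny; it is only meant to show which columns `score()` expects for quantile forecasts.

```r
library(scoringutils)

# toy forecast in the long quantile format: one model, one location,
# one target date, three quantile levels (values are made up)
toy <- data.frame(
  location = "DE",
  location_name = "Germany",
  target_end_date = as.Date("2021-01-09"),
  target_type = "Cases",
  forecast_date = as.Date("2021-01-02"),
  model = "toy-model",
  horizon = 1,
  true_value = 1200,
  quantile = c(0.25, 0.5, 0.75),
  prediction = c(1000, 1150, 1400)
)

# in practice you would score many such forecasts at once
score(toy)
```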
} \keyword{datasets} diff --git a/man/example_truth_data_only.Rd b/man/example_truth_data_only.Rd deleted file mode 100644 index 2add5b491..000000000 --- a/man/example_truth_data_only.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{example_truth_data_only} -\alias{example_truth_data_only} -\title{Truth data only} -\format{ -A data frame with 140 rows and 5 columns: -\describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{value_desc}{long form description of the prediction target} -\item{true_value}{true observed values} - -} -} -\usage{ -example_truth_data_only -} -\description{ -A data set with truth data for different quantities relevant in the -2020 UK Covid-19 epidemic, but no predictions -} -\keyword{datasets} diff --git a/man/example_truth_only.Rd b/man/example_truth_only.Rd new file mode 100644 index 000000000..3a7bc3f9c --- /dev/null +++ b/man/example_truth_only.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{example_truth_only} +\alias{example_truth_only} +\title{Truth data only} +\format{ +A data frame with 140 rows and 5 columns: +\describe{ +\item{location}{the country for which a prediction was made} +\item{target_end_date}{the date for which a prediction was made} +\item{target_type}{the target to be predicted (cases or deaths)} +\item{true_value}{true observed values} +\item{location_name}{name of the country for which a prediction was made} +} +} +\source{ +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} +} +\usage{ +example_truth_only +} +\description{ +A data set with truth values for COVID-19 cases and deaths +submitted to the European Forecast Hub. +} +\keyword{datasets} diff --git a/man/extract_from_list.Rd b/man/extract_from_list.Rd deleted file mode 100644 index d8e37b128..000000000 --- a/man/extract_from_list.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{extract_from_list} -\alias{extract_from_list} -\title{Extract Elements From a List of Lists} -\usage{ -extract_from_list(list, what) -} -\arguments{ -\item{list}{the list of lists} - -\item{what}{character with the name of the element to extract from every -individual list element of \code{list}} -} -\value{ -A list with the extracted element from every sublist -missing. -} -\description{ -Extract corresponding elements from a list of lists. 
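Returning to the example data sets above: `example_quantile_forecasts_only` and `example_truth_only` are complementary (forecasts without observations, and observations without forecasts). A short sketch of combining them with `merge_pred_and_obs()`, which is documented later in this changeset, before scoring:

```r
library(scoringutils)

# join forecasts and truth data on their shared columns, then score
combined <- merge_pred_and_obs(
  forecasts = example_quantile_forecasts_only,
  observations = example_truth_only
)
score(combined)
```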
-} -\keyword{internal} diff --git a/man/figures/unnamed-chunk-12-1.png b/man/figures/unnamed-chunk-12-1.png new file mode 100644 index 000000000..eda9e5192 Binary files /dev/null and b/man/figures/unnamed-chunk-12-1.png differ diff --git a/man/figures/unnamed-chunk-13-1.png b/man/figures/unnamed-chunk-13-1.png new file mode 100644 index 000000000..b6c7d8be2 Binary files /dev/null and b/man/figures/unnamed-chunk-13-1.png differ diff --git a/man/figures/unnamed-chunk-14-1.png b/man/figures/unnamed-chunk-14-1.png new file mode 100644 index 000000000..38a35f05b Binary files /dev/null and b/man/figures/unnamed-chunk-14-1.png differ diff --git a/man/figures/unnamed-chunk-16-1.png b/man/figures/unnamed-chunk-16-1.png new file mode 100644 index 000000000..8e946c59c Binary files /dev/null and b/man/figures/unnamed-chunk-16-1.png differ diff --git a/man/figures/unnamed-chunk-17-1.png b/man/figures/unnamed-chunk-17-1.png new file mode 100644 index 000000000..16448e7e7 Binary files /dev/null and b/man/figures/unnamed-chunk-17-1.png differ diff --git a/man/figures/unnamed-chunk-18-1.png b/man/figures/unnamed-chunk-18-1.png new file mode 100644 index 000000000..1e3f8f587 Binary files /dev/null and b/man/figures/unnamed-chunk-18-1.png differ diff --git a/man/figures/unnamed-chunk-19-1.png b/man/figures/unnamed-chunk-19-1.png new file mode 100644 index 000000000..e5c5eb7ed Binary files /dev/null and b/man/figures/unnamed-chunk-19-1.png differ diff --git a/man/figures/unnamed-chunk-22-1.png b/man/figures/unnamed-chunk-22-1.png new file mode 100644 index 000000000..b1146cbc4 Binary files /dev/null and b/man/figures/unnamed-chunk-22-1.png differ diff --git a/man/figures/unnamed-chunk-24-1.png b/man/figures/unnamed-chunk-24-1.png new file mode 100644 index 000000000..c62bc1862 Binary files /dev/null and b/man/figures/unnamed-chunk-24-1.png differ diff --git a/man/figures/unnamed-chunk-25-1.png b/man/figures/unnamed-chunk-25-1.png new file mode 100644 index 000000000..ac12daf6d Binary files /dev/null and b/man/figures/unnamed-chunk-25-1.png differ diff --git a/man/figures/unnamed-chunk-4-1.png b/man/figures/unnamed-chunk-4-1.png index 18d5f02c3..0edbabd20 100644 Binary files a/man/figures/unnamed-chunk-4-1.png and b/man/figures/unnamed-chunk-4-1.png differ diff --git a/man/figures/unnamed-chunk-5-1.png b/man/figures/unnamed-chunk-5-1.png index cecf942f5..29e28d225 100644 Binary files a/man/figures/unnamed-chunk-5-1.png and b/man/figures/unnamed-chunk-5-1.png differ diff --git a/man/figures/unnamed-chunk-6-1.png b/man/figures/unnamed-chunk-6-1.png index 73a0b58f7..f7c62f1b7 100644 Binary files a/man/figures/unnamed-chunk-6-1.png and b/man/figures/unnamed-chunk-6-1.png differ diff --git a/man/find_duplicates.Rd b/man/find_duplicates.Rd new file mode 100644 index 000000000..f0bf08341 --- /dev/null +++ b/man/find_duplicates.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_forecasts.R +\name{find_duplicates} +\alias{find_duplicates} +\title{Find duplicate forecasts} +\usage{ +find_duplicates(data) +} +\arguments{ +\item{data}{A data.frame as used for \code{\link[=score]{score()}}} +} +\value{ +A data.frame with all rows for which a duplicate forecast was found +} +\description{ +Helper function to identify duplicate forecasts, i.e. +instances where there is more than one forecast for the same prediction +target. 
+} +\examples{ +example <- rbind(example_quantile, example_quantile[1000:1010]) +find_duplicates(example) +} +\keyword{check-forecasts} diff --git a/man/get_forecast_unit.Rd b/man/get_forecast_unit.Rd new file mode 100644 index 000000000..ae808ccf0 --- /dev/null +++ b/man/get_forecast_unit.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{get_forecast_unit} +\alias{get_forecast_unit} +\title{Get unit of a single forecast} +\usage{ +get_forecast_unit(data) +} +\arguments{ +\item{data}{data.frame with predictions in the same format required for +\code{\link[=score]{score()}}.} +} +\value{ +A character vector with the column names that define the unit of +a single forecast +} +\description{ +Helper function to get the unit of a single forecast, i.e. +the column names that define where a single forecast was made for +} +\keyword{internal} diff --git a/man/get_prediction_type.Rd b/man/get_prediction_type.Rd index 8b861830a..951db2f18 100644 --- a/man/get_prediction_type.Rd +++ b/man/get_prediction_type.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_forecasts.R +% Please edit documentation in R/utils.R \name{get_prediction_type} \alias{get_prediction_type} \title{Get prediction type of a forecast} @@ -7,7 +7,8 @@ get_prediction_type(data) } \arguments{ -\item{data}{A data.frame or similar as would be used for \code{\link[=eval_forecasts]{eval_forecasts()}}} +\item{data}{data.frame with predictions in the same format required for +\code{\link[=score]{score()}}.} } \value{ Character vector of length one with either "quantile", "integer", or diff --git a/man/get_target_type.Rd b/man/get_target_type.Rd index ebd2f7452..cdf5b8b24 100644 --- a/man/get_target_type.Rd +++ b/man/get_target_type.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_forecasts.R +% Please edit documentation in R/utils.R \name{get_target_type} \alias{get_target_type} \title{Get type of the target true values of a forecast} @@ -7,11 +7,12 @@ get_target_type(data) } \arguments{ -\item{data}{A data.frame or similar as would be used for \code{\link[=eval_forecasts]{eval_forecasts()}}} +\item{data}{data.frame with predictions in the same format required for +\code{\link[=score]{score()}}.} } \value{ Character vector of length one with either "binary", "integer", or -"continous" +"continuous" } \description{ Internal helper function to get the type of the target diff --git a/man/hist_PIT.Rd b/man/hist_PIT.Rd deleted file mode 100644 index d1ffaf938..000000000 --- a/man/hist_PIT.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pit.R -\name{hist_PIT} -\alias{hist_PIT} -\title{PIT Histogram} -\usage{ -hist_PIT(PIT_samples, num_bins = NULL, caption = NULL) -} -\arguments{ -\item{PIT_samples}{A vector with the PIT values of size n} - -\item{num_bins}{the number of bins in the PIT histogram.} - -\item{caption}{provide a caption that gets passed to the plot -If not given, the square root of n will be used} -} -\value{ -vector with the scoring values -} -\description{ -Make a simple histogram of the probability integral transformed values to -visually check whether a uniform distribution seems likely. 
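The helper functions documented above (`get_forecast_unit()`, `get_prediction_type()`, `get_target_type()`) are tagged as internal, so the sketch below accesses them via `:::`. The commented return values are indicative only and follow the possible values listed in their documentation.

```r
library(scoringutils)

# which columns define a single forecast in the example data
scoringutils:::get_forecast_unit(example_quantile)

# how the predictions and the observed target values are classified
scoringutils:::get_prediction_type(example_quantile) # "quantile", "integer" or "continuous"
scoringutils:::get_target_type(example_quantile)     # "binary", "integer" or "continuous"
```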
-} diff --git a/man/hist_PIT_quantile.Rd b/man/hist_PIT_quantile.Rd deleted file mode 100644 index ccb5ec2d0..000000000 --- a/man/hist_PIT_quantile.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pit.R -\name{hist_PIT_quantile} -\alias{hist_PIT_quantile} -\title{PIT Histogram Quantile} -\usage{ -hist_PIT_quantile(PIT_samples, num_bins = NULL, caption = NULL) -} -\arguments{ -\item{PIT_samples}{A vector with the PIT values of size n} - -\item{num_bins}{the number of bins in the PIT histogram.} - -\item{caption}{provide a caption that gets passed to the plot -If not given, the square root of n will be used} -} -\value{ -vector with the scoring values -} -\description{ -Make a simple histogram of the probability integral transformed values to -visually check whether a uniform distribution seems likely. -} diff --git a/man/infer_rel_skill_metric.Rd b/man/infer_rel_skill_metric.Rd new file mode 100644 index 000000000..82413818b --- /dev/null +++ b/man/infer_rel_skill_metric.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pairwise-comparisons.R +\name{infer_rel_skill_metric} +\alias{infer_rel_skill_metric} +\title{Infer metric for pairwise comparisons} +\usage{ +infer_rel_skill_metric(scores) +} +\arguments{ +\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} +} +\description{ +Helper function to infer the metric for which pairwise comparisons shall +be made. The function simply checks the names of the available columns and +chooses the most widely used metric. +} +\keyword{internal} diff --git a/man/integer_example_data.Rd b/man/integer_example_data.Rd deleted file mode 100644 index 19e7fe9f3..000000000 --- a/man/integer_example_data.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{integer_example_data} -\alias{integer_example_data} -\title{Integer Forecast Example Data} -\format{ -A data frame with 13,429 rows and 10 columns: -\describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{value_desc}{long form description of the prediction target} -\item{model}{name of the model that generated the forecasts} -\item{creation_date}{date on which the forecast was made} -\item{horizon}{forecast horizon in days} -\item{prediction}{prediction value for the corresponding sample} -\item{sample}{id for the corresponding sample} -\item{true_value}{true observed values} -} -} -\usage{ -integer_example_data -} -\description{ -A data set with integer predictions in a sample-based format relevant in the -2020 UK Covid-19 epidemic. -} -\keyword{datasets} diff --git a/man/interval_coverage.Rd b/man/interval_coverage.Rd deleted file mode 100644 index 694fa9582..000000000 --- a/man/interval_coverage.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{interval_coverage} -\alias{interval_coverage} -\title{Plot Interval Coverage} -\usage{ -interval_coverage( - summarised_scores, - colour = "model", - facet_formula = NULL, - facet_wrap_or_grid = "facet_wrap", - scales = "free_y" -) -} -\arguments{ -\item{summarised_scores}{Summarised scores as produced by -\code{\link[=eval_forecasts]{eval_forecasts()}}. 
Make sure that "range" is included in -\code{summarise_by} when producing the summarised scores} - -\item{colour}{According to which variable shall the graphs be coloured? -Default is "model".} - -\item{facet_formula}{formula for facetting in ggplot. If this is \code{NULL} -(the default), no facetting will take place} - -\item{facet_wrap_or_grid}{Use ggplot2's \code{facet_wrap} or -\code{facet_grid}? Anything other than "facet_wrap" will be interpreted as -\code{facet_grid}. This only takes effect if \code{facet_formula} is not -\code{NULL}} - -\item{scales}{scales argument that gets passed down to ggplot. Only necessary -if you make use of facetting. Default is "free_y"} -} -\value{ -ggplot object with a plot of interval coverage -} -\description{ -Plot interval coverage -} -\examples{ -example1 <- scoringutils::range_example_data_long -example1 <- scoringutils::range_long_to_quantile(example1) -scores <- scoringutils::eval_forecasts(example1, - summarise_by = c("model", "range")) -interval_coverage(scores) -} diff --git a/man/interval_score.Rd b/man/interval_score.Rd index d37f79ad3..325760fc1 100644 --- a/man/interval_score.Rd +++ b/man/interval_score.Rd @@ -16,9 +16,11 @@ interval_score( \arguments{ \item{true_values}{A vector with the true observed values of size n} -\item{lower}{vector of size n with the lower quantile of the given range} +\item{lower}{vector of size n with the prediction for the lower quantile +of the given range} -\item{upper}{vector of size n with the upper quantile of the given range} +\item{upper}{vector of size n with the prediction for the upper quantile +of the given range} \item{interval_range}{the range of the prediction intervals. i.e. if you're forecasting the 0.05 and 0.95 quantile, the interval_range would be 90. @@ -27,12 +29,14 @@ for different forecasts to be scored. This corresponds to (100-alpha)/100 in Gneiting and Raftery (2007). Internally, the range will be transformed to alpha.} -\item{weigh}{if TRUE, weigh the score by alpha / 4, so it can be averaged -into an interval score that, in the limit, corresponds to CRPS. Default: -\code{FALSE}.} +\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged +into an interval score that, in the limit, corresponds to CRPS. Alpha is the +decimal value that represents how much is outside a central prediction +interval (e.g. for a 90 percent central prediction interval, alpha is 0.1) +Default: \code{TRUE}.} \item{separate_results}{if \code{TRUE} (default is \code{FALSE}), then the separate -parts of the interval score (sharpness, penalties for over- and +parts of the interval score (dispersion penalty, penalties for over- and under-prediction get returned as separate elements of a list). If you want a \code{data.frame} instead, simply call \code{\link[=as.data.frame]{as.data.frame()}} on the output.} } @@ -47,12 +51,21 @@ and Raftery (2007). Smaller values are better. The score is computed as \deqn{ +\text{score} = (\text{upper} - \text{lower}) + \frac{2}{\alpha}(\text{lower} + - \text{true_value}) * +\mathbf{1}(\text{true_value} < \text{lower}) + +\frac{2}{\alpha}(\text{true_value} - \text{upper}) * +\mathbf{1}(\text{true_value} > \text{upper}) +}{ score = (upper - lower) + 2/alpha * (lower - true_value) * 1(true_values < lower) + 2/alpha * (true_value - upper) * 1(true_value > upper) } -where $1()$ is the indicator function and alpha is the decimal value that +where \eqn{\mathbf{1}()}{1()} is the indicator function and indicates how much is outside the prediction interval. 
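A quick numerical sanity check of the interval score formula shown above, with made-up numbers and `weigh = FALSE` so the unweighted definition applies:

```r
library(scoringutils)

# 90% central interval (10, 20); the observation lies above the upper end
true_value <- 25
lower <- 10
upper <- 20
alpha <- (100 - 90) / 100 # 0.1

# manual calculation following the (unweighted) formula
(upper - lower) +
  2 / alpha * (lower - true_value) * (true_value < lower) +
  2 / alpha * (true_value - upper) * (true_value > upper)
# expected: 10 + 100 = 110

# should agree with the package function when weigh = FALSE
interval_score(
  true_values = true_value,
  lower = lower,
  upper = upper,
  interval_range = 90,
  weigh = FALSE
)
```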
+\eqn{\alpha}{alpha} is the decimal value that indicates how much is outside +the prediction interval. + To improve usability, the user is asked to provide an interval range in percentage terms, i.e. interval_range = 90 (percent) for a 90 percent prediction interval. Correspondingly, the user would have to provide the @@ -60,27 +73,30 @@ prediction interval. Correspondingly, the user would have to provide the No specific distribution is assumed, but the range has to be symmetric (i.e you can't use the 0.1 quantile as the lower bound and the 0.7 quantile as the upper). - -The interval score is a proper scoring rule that scores a quantile forecast +Non-symmetric quantiles can be scored using the function \code{\link[=quantile_score]{quantile_score()}}. } \examples{ true_values <- rnorm(30, mean = 1:30) -interval_range = rep(90, 30) -alpha = (100 - interval_range) / 100 -lower = qnorm(alpha/2, rnorm(30, mean = 1:30)) -upper = qnorm((1- alpha/2), rnorm(30, mean = 1:30)) +interval_range <- rep(90, 30) +alpha <- (100 - interval_range) / 100 +lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) +upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30)) -interval_score(true_values = true_values, - lower = lower, - upper = upper, - interval_range = interval_range) +interval_score( + true_values = true_values, + lower = lower, + upper = upper, + interval_range = interval_range +) # example with missing values and separate results -interval_score(true_values = c(true_values, NA), - lower = c(lower, NA), - upper = c(NA, upper), - separate_results = TRUE, - interval_range = 90) +interval_score( + true_values = c(true_values, NA), + lower = c(lower, NA), + upper = c(NA, upper), + separate_results = TRUE, + interval_range = 90 +) } \references{ Strictly Proper Scoring Rules, Prediction,and Estimation, @@ -89,5 +105,6 @@ Statistical Association, Volume 102, 2007 - Issue 477 Evaluating epidemic forecasts in an interval format, Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich, -\url{https://arxiv.org/abs/2005.12881} +\url{https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008618} # nolint } +\keyword{metric} diff --git a/man/logs_binary.Rd b/man/logs_binary.Rd new file mode 100644 index 000000000..0cc1a03ff --- /dev/null +++ b/man/logs_binary.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/log_score.R +\name{logs_binary} +\alias{logs_binary} +\title{Log Score for Binary outcomes} +\usage{ +logs_binary(true_values, predictions) +} +\arguments{ +\item{true_values}{A vector with the true observed values of size n with +all values equal to either 0 or 1} + +\item{predictions}{A vector with a predicted probability +that true_value = 1.} +} +\value{ +A numeric value with the Log Score, i.e. the mean squared +error of the given probability forecasts +} +\description{ +Computes the Log Score for probabilistic forecasts of binary outcomes. +} +\details{ +The Log Score is a proper score rule suited to assessing the accuracy of +probabilistic binary predictions. The outcomes can be either 0 or 1, +the predictions must be a probability that the true outcome will be 1. + +The Log Score is then computed as the negative logarithm of the probability +assigned to the true outcome. Reporting the negative logarithm means that +smaller values are better. 
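The definition of the log score for binary outcomes given above (the negative logarithm of the probability assigned to the observed outcome) can be checked directly against `logs_binary()`; the values below are invented.

```r
library(scoringutils)

true_values <- c(1, 0, 1)
predictions <- c(0.9, 0.2, 0.4) # predicted probability that the outcome is 1

# negative log of the probability assigned to the observed outcome
-log(ifelse(true_values == 1, predictions, 1 - predictions))

# should match the package implementation
logs_binary(true_values, predictions)
```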
+} +\examples{ +true_values <- sample(c(0, 1), size = 30, replace = TRUE) +predictions <- runif(n = 30, min = 0, max = 1) +logs_binary(true_values, predictions) +} +\keyword{metric} diff --git a/man/logs.Rd b/man/logs_sample.Rd similarity index 78% rename from man/logs.Rd rename to man/logs_sample.Rd index 1877d4ac5..c1a73c69d 100644 --- a/man/logs.Rd +++ b/man/logs_sample.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/scoringRules_wrappers.R -\name{logs} -\alias{logs} -\title{LogS} +\name{logs_sample} +\alias{logs_sample} +\title{Logarithmic score} \usage{ -logs(true_values, predictions) +logs_sample(true_values, predictions) } \arguments{ \item{true_values}{A vector with the true observed values of size n} \item{predictions}{nxN matrix of predictive samples, n (number of rows) being -the number of data points and N (number of columns) the -number of Monte Carlo samples} +the number of data points and N (number of columns) the number of Monte +Carlo samples. Alternatively, predictions can just be a vector of size n.} } \value{ vector with the scoring values @@ -30,9 +30,10 @@ more details. \examples{ true_values <- rpois(30, lambda = 1:30) predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -logs(true_values, predictions) +logs_sample(true_values, predictions) } \references{ Alexander Jordan, Fabian Krüger, Sebastian Lerch, Evaluating Probabilistic Forecasts with scoringRules, \url{https://arxiv.org/pdf/1709.04743.pdf} } +\keyword{metric} diff --git a/man/sharpness.Rd b/man/mad_sample.Rd similarity index 57% rename from man/sharpness.Rd rename to man/mad_sample.Rd index 5f831a4ce..dc1239f14 100644 --- a/man/sharpness.Rd +++ b/man/mad_sample.Rd @@ -1,34 +1,36 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/sharpness.R -\name{sharpness} -\alias{sharpness} -\title{Determines sharpness of a probabilistic forecast} +\name{mad_sample} +\alias{mad_sample} +\title{Determine dispersion of a probabilistic forecast} \usage{ -sharpness(predictions) +mad_sample(predictions) } \arguments{ \item{predictions}{nxN matrix of predictive samples, n (number of rows) being -the number of data points and N (number of columns) the -number of Monte Carlo samples} +the number of data points and N (number of columns) the number of Monte +Carlo samples. Alternatively, predictions can just be a vector of size n.} } \value{ -vector with sharpness values +vector with dispersion values } \description{ -Determines sharpness of a probabilistic forecast +Determine dispersion of a probabilistic forecast } \details{ Sharpness is the ability of the model to generate predictions within a -narrow range. It is a data-independent measure, and is purely a feature +narrow range and dispersion is the lack thereof. +It is a data-independent measure, and is purely a feature of the forecasts themselves. -Sharpness of predictive samples corresponding to one single true value is +Dispersion of predictive samples corresponding to one single true value is measured as the normalised median of the absolute deviation from the median of the predictive samples. For details, see \link[stats:mad]{mad()} +and the explanations given in Funk et al. 
(2019) } \examples{ predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -sharpness(predictions) +mad_sample(predictions) } \references{ Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ (2019) @@ -36,3 +38,4 @@ Assessing the performance of real-time epidemic forecasts: A case study of Ebola in the Western Area region of Sierra Leone, 2014-15. PLoS Comput Biol 15(2): e1006785. \url{doi:10.1371/journal.pcbi.1006785} } +\keyword{metric} diff --git a/man/merge_pred_and_obs.Rd b/man/merge_pred_and_obs.Rd index eaa5c41b1..601ff69a9 100644 --- a/man/merge_pred_and_obs.Rd +++ b/man/merge_pred_and_obs.Rd @@ -13,7 +13,7 @@ merge_pred_and_obs( } \arguments{ \item{forecasts}{data.frame with the forecast data (as can be passed to -\code{\link[=eval_forecasts]{eval_forecasts()}}).} +\code{\link[=score]{score()}}).} \item{observations}{data.frame with the observations} @@ -34,3 +34,9 @@ aims to handle the merging well if additional columns are present in one or both data sets. If in doubt, you should probably merge the data sets manually. } +\examples{ +forecasts <- example_quantile_forecasts_only +observations <- example_truth_only +merge_pred_and_obs(forecasts, observations) +} +\keyword{data-handling} diff --git a/man/metrics_summary.Rd b/man/metrics_summary.Rd new file mode 100644 index 000000000..ce71a7d33 --- /dev/null +++ b/man/metrics_summary.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{metrics_summary} +\alias{metrics_summary} +\title{Summary information for selected metrics} +\format{ +An object of class \code{data.table} (inherits from \code{data.frame}) with 22 rows and 8 columns. +} +\usage{ +metrics_summary +} +\description{ +A data set with summary information on selected metrics implemented in +\pkg{scoringutils} +} +\keyword{info} diff --git a/man/pairwise_comparison.Rd b/man/pairwise_comparison.Rd index 9beab3189..7de48b502 100644 --- a/man/pairwise_comparison.Rd +++ b/man/pairwise_comparison.Rd @@ -6,42 +6,40 @@ \usage{ pairwise_comparison( scores, - metric = "interval_score", - test_options = list(oneSided = FALSE, test_type = c("non_parametric", "permuation"), - n_permutations = 999), + by = c("model"), + metric = "auto", baseline = NULL, - by = NULL, - summarise_by = c("model") + ... ) } \arguments{ -\item{scores}{A data.frame of unsummarised scores as produced by -\code{\link[=eval_forecasts]{eval_forecasts()}}} +\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} -\item{metric}{A character vector of length one with the metric to do -the comparison on.} +\item{by}{character vector with names of columns present in the input +data.frame. \code{by} determines how pairwise comparisons will be computed. +You will get a relative skill score for every grouping level determined in +\code{by}. If, for example, \code{by = c("model", "location")}. Then you will get a +separate relative skill score for every model in every location. Internally, +the data.frame will be split according \code{by} (but removing "model" before +splitting) and the pairwise comparisons will be computed separately for the +split data.frames.} -\item{test_options}{list with options to pass down to \code{\link[=compare_two_models]{compare_two_models()}}. -To change only one of the default options, just pass a list as input with -the name of the argument you want to change. 
All elements not included in the -list will be set to the default (so passing an empty list would result in the -default options).} +\item{metric}{A character vector of length one with the metric to do the +comparison on. The default is "auto", meaning that either "interval_score", +"crps", or "brier_score" will be selected where available. +See \code{\link[=available_metrics]{available_metrics()}} for available metrics.} \item{baseline}{character vector of length one that denotes the baseline model against which to compare other models.} -\item{by}{character vector of columns to group scoring by. This should be the -lowest level of grouping possible, i.e. the unit of the individual -observation. This is important as many functions work on individual -observations. If you want a different level of aggregation, you should use -\code{summarise_by} to aggregate the individual scores. -Also not that the pit will be computed using \code{summarise_by} instead of \code{by}} - -\item{summarise_by}{character vector of columns to group the summary by. By -default, this is equal to \code{by} and no summary takes place. But sometimes you -may want to to summarise over categories different from the scoring. -\code{summarise_by} is also the grouping level used to compute (and possibly plot) -the probability integral transform(pit).} +\item{...}{additional arguments, such as test options that can get passed +down to lower level functions. The following options are available: +\code{one_sided} (Boolean, default is \code{FALSE}, whether two conduct a one-sided +instead of a two-sided test), \code{test_type} (character, either "non_parametric" +or "permutation" determining which kind of test shall be conducted to +determine p-values. Default is "non-parametric), \code{n_permutations} (number of +permutations for a permutation test. Default is 999). See +\code{\link[=compare_two_models]{compare_two_models()}} for more information.} } \value{ A ggplot2 object with a coloured table of summarised scores @@ -55,25 +53,25 @@ The implementation of the permutation test follows the function Andrea Riebler and Michaela Paul. 
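To illustrate the `by` argument described above (one relative skill score per grouping level), a short sketch using the quantile example data; it assumes `target_type` is present in the scored data, as in the example data sets.

```r
library(scoringutils)

scores <- score(example_quantile)

# one relative skill score per model *and* target type: the scores are
# split by target_type (with "model" removed from the split) and pairwise
# comparisons are computed separately within each split
pairwise_comparison(scores, by = c("model", "target_type"))
```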
} \examples{ -df <- data.frame(model = rep(c("model1", "model2", "model3"), each = 10), - date = as.Date("2020-01-01") + rep(1:5, each = 2), - location = c(1, 2), - interval_score = (abs(rnorm(30))), - aem = (abs(rnorm(30)))) +df <- data.frame( + model = rep(c("model1", "model2", "model3"), each = 10), + date = as.Date("2020-01-01") + rep(1:5, each = 2), + location = c(1, 2), + interval_score = (abs(rnorm(30))), + ae_median = (abs(rnorm(30))) +) -res <- scoringutils::pairwise_comparison(df, - baseline = "model1") -scoringutils::plot_pairwise_comparison(res) +res <- pairwise_comparison(df, + baseline = "model1" +) +plot_pairwise_comparison(res) -eval <- scoringutils::eval_forecasts(scoringutils::quantile_example_data) -pairwise <- pairwise_comparison(eval, summarise_by = c("model")) +eval <- score(example_quantile) +pairwise_comparison(eval, by = c("model")) } \author{ -Johannes Bracher, https://jbracher.github.io/ - -Nikos Bosse - Nikos Bosse \email{nikosbosse@gmail.com} Johannes Bracher, \email{johannes.bracher@kit.edu} } +\keyword{scoring} diff --git a/man/pairwise_comparison_one_group.Rd b/man/pairwise_comparison_one_group.Rd index 54f7b0bf3..df25b04be 100644 --- a/man/pairwise_comparison_one_group.Rd +++ b/man/pairwise_comparison_one_group.Rd @@ -4,43 +4,36 @@ \alias{pairwise_comparison_one_group} \title{Do Pairwise Comparison for one Set of Forecasts} \usage{ -pairwise_comparison_one_group( - scores, - metric, - test_options, - baseline, - by, - summarise_by -) +pairwise_comparison_one_group(scores, metric, baseline, by, ...) } \arguments{ -\item{scores}{A data.frame of unsummarised scores as produced by -\code{\link[=eval_forecasts]{eval_forecasts()}}} +\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} -\item{metric}{A character vector of length one with the metric to do -the comparison on.} - -\item{test_options}{list with options to pass down to \code{\link[=compare_two_models]{compare_two_models()}}. -To change only one of the default options, just pass a list as input with -the name of the argument you want to change. All elements not included in the -list will be set to the default (so passing an empty list would result in the -default options).} +\item{metric}{A character vector of length one with the metric to do the +comparison on. The default is "auto", meaning that either "interval_score", +"crps", or "brier_score" will be selected where available. +See \code{\link[=available_metrics]{available_metrics()}} for available metrics.} \item{baseline}{character vector of length one that denotes the baseline model against which to compare other models.} -\item{by}{character vector of columns to group scoring by. This should be the -lowest level of grouping possible, i.e. the unit of the individual -observation. This is important as many functions work on individual -observations. If you want a different level of aggregation, you should use -\code{summarise_by} to aggregate the individual scores. -Also not that the pit will be computed using \code{summarise_by} instead of \code{by}} +\item{by}{character vector with names of columns present in the input +data.frame. \code{by} determines how pairwise comparisons will be computed. +You will get a relative skill score for every grouping level determined in +\code{by}. If, for example, \code{by = c("model", "location")}. Then you will get a +separate relative skill score for every model in every location. 
Internally, +the data.frame will be split according \code{by} (but removing "model" before +splitting) and the pairwise comparisons will be computed separately for the +split data.frames.} -\item{summarise_by}{character vector of columns to group the summary by. By -default, this is equal to \code{by} and no summary takes place. But sometimes you -may want to to summarise over categories different from the scoring. -\code{summarise_by} is also the grouping level used to compute (and possibly plot) -the probability integral transform(pit).} +\item{...}{additional arguments, such as test options that can get passed +down to lower level functions. The following options are available: +\code{one_sided} (Boolean, default is \code{FALSE}, whether two conduct a one-sided +instead of a two-sided test), \code{test_type} (character, either "non_parametric" +or "permutation" determining which kind of test shall be conducted to +determine p-values. Default is "non-parametric), \code{n_permutations} (number of +permutations for a permutation test. Default is 999). See +\code{\link[=compare_two_models]{compare_two_models()}} for more information.} } \description{ This function does the pairwise comparison for one set of forecasts, but @@ -52,3 +45,4 @@ subgroup is managed from \code{\link[=pairwise_comparison_one_group]{pairwise_co actually do the comparison between two models over a subset of common forecasts it calls \code{\link[=compare_two_models]{compare_two_models()}}. } +\keyword{internal} diff --git a/man/permutation_test.Rd b/man/permutation_test.Rd new file mode 100644 index 000000000..e889ee17e --- /dev/null +++ b/man/permutation_test.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{permutation_test} +\alias{permutation_test} +\title{Simple permutation test} +\usage{ +permutation_test( + scores1, + scores2, + n_permutation = 999, + one_sided = FALSE, + comparison_mode = c("difference", "ratio") +) +} +\value{ +p-value of the permutation test +} +\description{ +#' The implementation of the permutation test follows the +function +\code{permutationTest} from the \code{surveillance} package by Michael Höhle, +Andrea Riebler and Michaela Paul. +} +\keyword{internal} diff --git a/man/pit.Rd b/man/pit.Rd index 6c26bce42..1ba491cf1 100644 --- a/man/pit.Rd +++ b/man/pit.Rd @@ -2,121 +2,39 @@ % Please edit documentation in R/pit.R \name{pit} \alias{pit} -\title{Probability Integral Transformation} +\title{Probability Integral Transformation (data.frame Format)} \usage{ -pit( - true_values, - predictions, - plot = TRUE, - full_output = FALSE, - n_replicates = 50, - num_bins = NULL, - verbose = FALSE -) +pit(data, by, n_replicates = 100) } \arguments{ -\item{true_values}{A vector with the true observed values of size n} +\item{data}{a data.frame with the following columns: \code{true_value}, +\code{prediction}, \code{sample}.} -\item{predictions}{nxN matrix of predictive samples, n (number of rows) being -the number of data points and N (number of columns) the -number of Monte Carlo samples} +\item{by}{Character vector with the columns according to which the +PIT values shall be grouped. If you e.g. have the columns 'model' and +'location' in the data and want to have a PIT histogram for +every model and location, specify \code{by = c("model", "location")}.} -\item{plot}{logical. 
If \code{TRUE}, a histogram of the PIT values will be -returned as well} - -\item{full_output}{return all individual p_values and computed u_t values -for the randomised PIT. Usually not needed.} - -\item{n_replicates}{the number of tests to perform, -each time re-randomising the PIT} - -\item{num_bins}{the number of bins in the PIT histogram (if \code{plot = TRUE}) -If not given, the square root of n will be used} - -\item{verbose}{if \code{TRUE} (default is \code{FALSE}) more error messages are printed. -Usually, this should not be needed, but may help with debugging.} +\item{n_replicates}{the number of draws for the randomised PIT for +integer predictions.} } \value{ -a list with the following components: -\itemize{ -\item \code{p_value}: p-value of the Anderson-Darling test on the -PIT values. In case of integer forecasts, this will be the mean p_value -from the \code{n_replicates} replicates -\item \code{sd}: standard deviation of the p_value returned. In case of -continuous forecasts, this will be NA as there is only one p_value returned. -\item \code{hist_PIT} a plot object with the PIT histogram. Only returned -if \code{plot = TRUE}. Call \code{plot(PIT(...)$hist_PIT)} to display the histogram. -\item \code{p_values}: all p_values generated from the Anderson-Darling tests -on the (randomised) PIT. Only returned if \code{full_output = TRUE} -\item \code{u}: the u_t values internally computed. Only returned if -\code{full_output = TRUE} -} +a data.table with PIT values according to the grouping specified in +\code{by} } \description{ -Uses a Probability Integral Transformation (PIT) (or a -randomised PIT for integer forecasts) to -assess the calibration of predictive Monte Carlo samples. Returns a -p-values resulting from an Anderson-Darling test for uniformity -of the (randomised) PIT as well as a PIT histogram if specified. +Wrapper around \code{pit()} for use in data.frames } \details{ -Calibration or reliability of forecasts is the ability of a model to -correctly identify its own uncertainty in making predictions. In a model -with perfect calibration, the observed data at each time point look as if -they came from the predictive probability distribution at that time. - -Equivalently, one can inspect the probability integral transform of the -predictive distribution at time t, - -\deqn{ -u_t = F_t (x_t) -} - -where \eqn{x_t} is the observed data point at time \eqn{t in t_1, …, t_n}, -n being the number of forecasts, and $F_t$ is the (continuous) predictive -cumulative probability distribution at time t. If the true probability -distribution of outcomes at time t is \eqn{G_t} then the forecasts eqn{F_t} are -said to be ideal if eqn{F_t = G_t} at all times t. In that case, the -probabilities ut are distributed uniformly. - -In the case of discrete outcomes such as incidence counts, -the PIT is no longer uniform even when forecasts are ideal. -In that case a randomised PIT can be used instead: -\deqn{ -u_t = P_t(k_t) + v * (P_t(k_t) - P_t(k_t - 1) ) -} - -where \eqn{k_t} is the observed count, \eqn{P_t(x)} is the predictive -cumulative probability of observing incidence k at time t, -eqn{P_t (-1) = 0} by definition and v is standard uniform and independent -of k. If \eqn{P_t} is the true cumulative -probability distribution, then \eqn{u_t} is standard uniform. - -The function checks whether integer or continuous forecasts were provided. -It then applies the (randomised) probability integral and tests -the values \eqn{u_t} for uniformity using the -Anderson-Darling test. 
- -As a rule of thumb, there is no evidence to suggest a forecasting model is -miscalibrated if the p-value found was greater than a threshold of p >= 0.1, -some evidence that it was miscalibrated if 0.01 < p < 0.1, and good -evidence that it was miscalibrated if p <= 0.01. However, the AD-p-values -may be overly strict and there actual usefulness may be questionable. -In this context it should be noted, though, that uniformity of the -PIT is a necessary but not sufficient condition of calibration. +see \code{\link[=pit]{pit()}} } \examples{ +result <- pit(example_continuous, by = "model") +plot_pit(result) -## continuous predictions -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) -pit(true_values, predictions) - -## integer predictions -true_values <- rpois(100, lambda = 1:100) -predictions <- replicate(5000, rpois(n = 100, lambda = 1:100)) -pit(true_values, predictions, n_replicates = 5) - +# example with quantile data +result <- pit(example_quantile, by = "model") +plot_pit(result) } \references{ Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, @@ -124,3 +42,4 @@ Rosalind M. Eggo, W. John Edmunds (2019) Assessing the performance of real-time epidemic forecasts: A case study of Ebola in the Western Area region of Sierra Leone, 2014-15, \url{doi:10.1371/journal.pcbi.1006785} } +\keyword{scoring} diff --git a/man/pit_df.Rd b/man/pit_df.Rd deleted file mode 100644 index a6ec8df82..000000000 --- a/man/pit_df.Rd +++ /dev/null @@ -1,65 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pit.R -\name{pit_df} -\alias{pit_df} -\title{Probability Integral Transformation (data.frame Format)} -\usage{ -pit_df( - data, - plot = TRUE, - full_output = FALSE, - n_replicates = 100, - num_bins = NULL, - verbose = FALSE -) -} -\arguments{ -\item{data}{a data.frame with the following columns: \code{true_value}, -\code{prediction}, \code{sample}} - -\item{plot}{logical. If \code{TRUE}, a histogram of the PIT values will be -returned as well} - -\item{full_output}{return all individual p_values and computed u_t values -for the randomised PIT. Usually not needed.} - -\item{n_replicates}{the number of tests to perform, -each time re-randomising the PIT} - -\item{num_bins}{the number of bins in the PIT histogram (if \code{plot = TRUE}) -If not given, the square root of n will be used} - -\item{verbose}{if \code{TRUE} (default is \code{FALSE}) more error messages are printed. -Usually, this should not be needed, but may help with debugging.} -} -\value{ -a list with the following components: -\itemize{ -\item \code{data}: the input data.frame (not including rows where prediction is \code{NA}), -with added columns \code{pit_p_val} and \code{pit_sd} -\item \code{hist_PIT} a plot object with the PIT histogram. Only returned -if \code{plot = TRUE}. Call -\code{plot(PIT(...)$hist_PIT)} to display the histogram. -\item \code{p_values}: all p_values generated from the Anderson-Darling tests on the -(randomised) PIT. Only returned if \code{full_output = TRUE} -\item \code{u}: the u_t values internally computed. Only returned if -\code{full_output = TRUE} -} -} -\description{ -Wrapper around \code{pit()} for use in data.frames -} -\details{ -see \code{\link[=pit]{pit()}} -} -\examples{ -example <- scoringutils::continuous_example_data -result <- pit_df(example, full_output = TRUE) - -} -\references{ -Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, -Rosalind M. Eggo, W. 
John Edmunds (2019) Assessing the performance of -real-time epidemic forecasts: A case study of Ebola in the Western Area -region of Sierra Leone, 2014-15, \url{doi:10.1371/journal.pcbi.1006785} -} diff --git a/man/pit_df_fast.Rd b/man/pit_df_fast.Rd deleted file mode 100644 index 9a167c4ac..000000000 --- a/man/pit_df_fast.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pit.R -\name{pit_df_fast} -\alias{pit_df_fast} -\title{Probability Integral Transformation (data.frame Format, fast version)} -\usage{ -pit_df_fast(data, n_replicates = 100, by = by) -} -\arguments{ -\item{data}{a data.frame with the following columns: \code{true_value}, -\code{prediction}, \code{sample}} - -\item{n_replicates}{the number of tests to perform, -each time re-randomising the PIT} - -\item{by}{character vector with categories to iterate over} -} -\value{ -the input data.frame (not including rows where prediction is \code{NA}), -with added columns \code{pit_p_val} and \code{pit_sd} -} -\description{ -Wrapper around \code{pit()} for fast use in data.frames. This version -of the pit does not do allow any plotting, but can iterate over categories -in a data.frame as specified in the \code{by} argument. -} -\details{ -see \code{\link[=pit]{pit()}} -} -\examples{ -example <- scoringutils::continuous_example_data -result <- pit_df(example, full_output = TRUE) - -} -\references{ -Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, -Rosalind M. Eggo, W. John Edmunds (2019) Assessing the performance of -real-time epidemic forecasts: A case study of Ebola in the Western Area -region of Sierra Leone, 2014-15, \url{doi:10.1371/journal.pcbi.1006785} -} diff --git a/man/pit_sample.Rd b/man/pit_sample.Rd new file mode 100644 index 000000000..dfee95e1c --- /dev/null +++ b/man/pit_sample.Rd @@ -0,0 +1,100 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pit.R +\name{pit_sample} +\alias{pit_sample} +\title{Probability Integral Transformation (sample-based version)} +\usage{ +pit_sample(true_values, predictions, n_replicates = 100) +} +\arguments{ +\item{true_values}{A vector with the true observed values of size n} + +\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +the number of data points and N (number of columns) the number of Monte +Carlo samples. Alternatively, predictions can just be a vector of size n.} + +\item{n_replicates}{the number of draws for the randomised PIT for +integer predictions.} +} +\value{ +A vector with PIT-values. For continuous forecasts, the vector will +correspond to the length of \code{true_values}. For integer forecasts, a +randomised PIT will be returned of length +\code{length(true_values) * n_replicates} +} +\description{ +Uses a Probability Integral Transformation (PIT) (or a +randomised PIT for integer forecasts) to +assess the calibration of predictive Monte Carlo samples. Returns a +p-values resulting from an Anderson-Darling test for uniformity +of the (randomised) PIT as well as a PIT histogram if specified. +} +\details{ +Calibration or reliability of forecasts is the ability of a model to +correctly identify its own uncertainty in making predictions. In a model +with perfect calibration, the observed data at each time point look as if +they came from the predictive probability distribution at that time. 
+ +Equivalently, one can inspect the probability integral transform of the +predictive distribution at time t, + +\deqn{ +u_t = F_t (x_t) +} + +where \eqn{x_t} is the observed data point at time \eqn{t in t_1, …, t_n}{t +\text{ in } t_1, …, t_n}, n being the number of forecasts, and \eqn{F_t} is +the (continuous) predictive cumulative probability distribution at time t. If +the true probability distribution of outcomes at time t is \eqn{G_t} then the +forecasts \eqn{F_t} are said to be ideal if \eqn{F_t = G_t} at all times t. +In that case, the probabilities \eqn{u_t} are distributed uniformly. + +In the case of discrete outcomes such as incidence counts, +the PIT is no longer uniform even when forecasts are ideal. +In that case a randomised PIT can be used instead: +\deqn{ +u_t = P_t(k_t) + v * (P_t(k_t) - P_t(k_t - 1) ) +} + +where \eqn{k_t} is the observed count, \eqn{P_t(x)} is the predictive +cumulative probability of observing incidence k at time t, +\eqn{P_t (-1) = 0} by definition and v is standard uniform and independent +of k. If \eqn{P_t} is the true cumulative +probability distribution, then \eqn{u_t} is standard uniform. + +The function checks whether integer or continuous forecasts were provided. +It then applies the (randomised) probability integral and tests +the values \eqn{u_t} for uniformity using the +Anderson-Darling test. + +As a rule of thumb, there is no evidence to suggest a forecasting model is +miscalibrated if the p-value found was greater than a threshold of p >= 0.1, +some evidence that it was miscalibrated if 0.01 < p < 0.1, and good +evidence that it was miscalibrated if p <= 0.01. However, the AD-p-values +may be overly strict and there actual usefulness may be questionable. +In this context it should be noted, though, that uniformity of the +PIT is a necessary but not sufficient condition of calibration. +} +\examples{ +## continuous predictions +true_values <- rnorm(30, mean = 1:30) +predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) +pit <- pit_sample(true_values, predictions) +plot_pit(pit) + +## integer predictions +true_values <- rpois(100, lambda = 1:100) +predictions <- replicate(5000, rpois(n = 100, lambda = 1:100)) +pit <- pit_sample(true_values, predictions, n_replicates = 50) +plot_pit(pit) +} +\references{ +Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, +Rosalind M. Eggo, W. John Edmunds (2019) Assessing the performance of +real-time epidemic forecasts: A case study of Ebola in the Western Area +region of Sierra Leone, 2014-15, \url{doi:10.1371/journal.pcbi.1006785} +} +\seealso{ +\code{\link[=pit]{pit()}} +} +\keyword{metric} diff --git a/man/plot_avail_forecasts.Rd b/man/plot_avail_forecasts.Rd new file mode 100644 index 000000000..4e86bd160 --- /dev/null +++ b/man/plot_avail_forecasts.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot_avail_forecasts} +\alias{plot_avail_forecasts} +\title{Visualise Where Forecasts Are Available} +\usage{ +plot_avail_forecasts( + avail_forecasts, + y = "model", + x = "forecast_date", + make_x_factor = TRUE, + show_numbers = TRUE +) +} +\arguments{ +\item{avail_forecasts}{data.frame with a column called \verb{Number forecasts} as +produced by \code{\link[=avail_forecasts]{avail_forecasts()}}} + +\item{y}{character vector of length one that denotes the name of the column +to appear on the y-axis of the plot. 
Default is "model".}
+
+\item{x}{character vector of length one that denotes the name of the column
+to appear on the x-axis of the plot. Default is "forecast_date".}
+
+\item{make_x_factor}{logical (default is TRUE). Whether or not to convert
+the variable on the x-axis to a factor. This has an effect e.g. if dates
+are shown on the x-axis.}
+
+\item{show_numbers}{logical (default is \code{TRUE}) that indicates whether
+or not to show the actual count numbers on the plot}
+}
+\value{
+ggplot object with a plot of the number of available forecasts
+}
+\description{
+Visualise Where Forecasts Are Available
+}
+\examples{
+library(scoringutils)
+library(ggplot2)
+avail_forecasts <- avail_forecasts(example_quantile,
+  by = c(
+    "model", "target_type",
+    "target_end_date"
+  )
+)
+plot_avail_forecasts(avail_forecasts,
+  x = "target_end_date",
+  show_numbers = FALSE
+) +
+  facet_wrap("target_type")
+}
diff --git a/man/plot_correlation.Rd b/man/plot_correlation.Rd
new file mode 100644
index 000000000..d71679345
--- /dev/null
+++ b/man/plot_correlation.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/correlations.R
+\name{plot_correlation}
+\alias{plot_correlation}
+\title{Plot Correlation Between Metrics}
+\usage{
+plot_correlation(correlations)
+}
+\arguments{
+\item{correlations}{A data.table of correlations between scores as produced
+by \code{\link[=correlation]{correlation()}}.}
+}
+\value{
+A ggplot2 object showing a coloured matrix of correlations
+between metrics
+}
+\description{
+Plots a heatmap of correlations between different metrics
+}
+\examples{
+scores <- score(example_quantile)
+correlations <- correlation(scores)
+plot_correlation(correlations)
+}
diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd
new file mode 100644
index 000000000..f57377807
--- /dev/null
+++ b/man/plot_heatmap.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot.R
+\name{plot_heatmap}
+\alias{plot_heatmap}
+\title{Create a Heatmap of a Scoring Metric}
+\usage{
+plot_heatmap(scores, y = "model", x, metric)
+}
+\arguments{
+\item{scores}{A data.frame of scores based on quantile forecasts as
+produced by \code{\link[=score]{score()}}.}
+
+\item{y}{The variable from the scores you want to show on the y-Axis. The
+default for this is "model"}
+
+\item{x}{The variable from the scores you want to show on the x-Axis. This
+could be something like "horizon", or "location"}
+
+\item{metric}{the metric that determines the value and colour shown in the
+tiles of the heatmap}
+}
+\value{
+A ggplot2 object showing a heatmap of the desired metric
+}
+\description{
+This function can be used to create a heatmap of one metric across different
+groups, e.g. the interval score obtained by several forecasting models in
+different locations.
+} +\examples{ +library("scoringutils") +scores <- score(example_quantile) +scores <- summarise_scores(scores, by = c("model", "target_type", "range")) + +plot_heatmap(scores, x = "target_type", metric = "bias") +} diff --git a/man/plot_interval_coverage.Rd b/man/plot_interval_coverage.Rd new file mode 100644 index 000000000..8b64ff15e --- /dev/null +++ b/man/plot_interval_coverage.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot_interval_coverage} +\alias{plot_interval_coverage} +\title{Plot Interval Coverage} +\usage{ +plot_interval_coverage(scores, colour = "model") +} +\arguments{ +\item{scores}{A data.frame of scores based on quantile forecasts as +produced by \code{\link[=score]{score()}} or \code{\link[=summarise_scores]{summarise_scores()}}. Note that "range" must be included +in the \code{by} argument when running \code{\link[=summarise_scores]{summarise_scores()}}} + +\item{colour}{According to which variable shall the graphs be coloured? +Default is "model".} +} +\value{ +ggplot object with a plot of interval coverage +} +\description{ +Plot interval coverage +} +\examples{ +library("scoringutils") +scores <- score(example_quantile) +scores <- summarise_scores(scores, by = c("model", "range")) +plot_interval_coverage(scores) +} diff --git a/man/plot_pairwise_comparison.Rd b/man/plot_pairwise_comparison.Rd index 09da847d4..d1de65054 100644 --- a/man/plot_pairwise_comparison.Rd +++ b/man/plot_pairwise_comparison.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pairwise-comparisons.R +% Please edit documentation in R/plot.R \name{plot_pairwise_comparison} \alias{plot_pairwise_comparison} \title{Plot Heatmap of Pairwise Comparisons} @@ -7,11 +7,7 @@ plot_pairwise_comparison( comparison_result, type = c("mean_scores_ratio", "pval", "together"), - smaller_is_good = TRUE, - facet_formula = NULL, - scales = "free_y", - ncol = NULL, - facet_wrap_or_grid = "facet_wrap" + smaller_is_good = TRUE ) } \arguments{ @@ -25,36 +21,23 @@ pairwise comparison. Default is "mean_scores_ratio"} \item{smaller_is_good}{logical (default is \code{TRUE}) that indicates whether smaller or larger values are to be interpreted as 'good' (as you could just invert the mean scores ratio)} - -\item{facet_formula}{facetting formula passed down to ggplot. Default is -\code{NULL}} - -\item{scales}{scales argument that gets passed down to ggplot. Only necessary -if you make use of facetting. Default is "free_y"} - -\item{ncol}{Number of columns for facet wrap. Only relevant if -\code{facet_formula} is given and \code{facet_wrap_or_grid == "facet_wrap"}} - -\item{facet_wrap_or_grid}{Use ggplot2's \code{facet_wrap} or -\code{facet_grid}? Anything other than "facet_wrap" will be interpreted as -\code{facet_grid}. 
This only takes effect if \code{facet_formula} is not -\code{NULL}} } \description{ Creates a heatmap of the ratios or pvalues from a pairwise comparison between models } \examples{ -df <- data.frame(model = rep(c("model1", "model2", "model3"), each = 10), - id = rep(1:10), - interval_score = abs(rnorm(30, mean = rep(c(1, 1.3, 2), each = 10))), - aem = (abs(rnorm(30)))) +library(ggplot2) +library(scoringutils) +df <- data.frame( + model = rep(c("model1", "model2", "model3"), each = 10), + id = rep(1:10), + interval_score = abs(rnorm(30, mean = rep(c(1, 1.3, 2), each = 10))), + ae_median = (abs(rnorm(30))) +) -data <- scoringutils::quantile_example_data -scores <- scoringutils::eval_forecasts(data) -pairwise <- pairwise_comparison(scores, - summarise_by = "value_desc") -scoringutils::plot_pairwise_comparison(pairwise, - facet_formula = ~ value_desc, - scales = "fixed") +scores <- score(example_quantile) +pairwise <- pairwise_comparison(scores, by = "target_type") +plot_pairwise_comparison(pairwise) + + facet_wrap(~target_type) } diff --git a/man/plot_pit.Rd b/man/plot_pit.Rd new file mode 100644 index 000000000..2d8e073c0 --- /dev/null +++ b/man/plot_pit.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot_pit} +\alias{plot_pit} +\title{PIT Histogram} +\usage{ +plot_pit(pit, num_bins = "auto", breaks = NULL) +} +\arguments{ +\item{pit}{either a vector with the PIT values of size n, or a data.frame as +produced by \code{\link[=pit]{pit()}}} + +\item{num_bins}{the number of bins in the PIT histogram, default is "auto". +When \code{num_bins == "auto"}, \code{\link[=plot_pit]{plot_pit()}} will either display 10 bins, or it +will display a bin for each available quantile in case you passed in data in +a quantile-based format. +You can control the number of bins by supplying a number. This is fine for +sample-based pit histograms, but may fail for quantile-based formats. In this +case it is preferred to supply explicit breaks points using the \code{breaks} +argument.} + +\item{breaks}{numeric vector with the break points for the bins in the +PIT histogram. This is preferred when creating a PIT histogram based on +quantile-based data. Default is \code{NULL} and breaks will be determined by +\code{num_bins}.} +} +\value{ +vector with the scoring values +} +\description{ +Make a simple histogram of the probability integral transformed values to +visually check whether a uniform distribution seems likely. +} +\examples{ +library(scoringutils) + +# PIT histogram in vector based format +true_values <- rnorm(30, mean = 1:30) +predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) +pit <- pit_sample(true_values, predictions) +plot_pit(pit) + +# quantile-based pit +pit <- pit(example_quantile, by = c("model")) +plot_pit(pit, breaks = seq(0.1, 1, 0.1)) + +# sample-based pit +pit <- pit(example_integer, by = c("model")) +plot_pit(pit) +} diff --git a/man/plot_predictions.Rd b/man/plot_predictions.Rd index e12231140..505489faa 100644 --- a/man/plot_predictions.Rd +++ b/man/plot_predictions.Rd @@ -20,15 +20,12 @@ plot_predictions( scales = "free_y", allow_truth_without_pred = FALSE, remove_from_truth = c("model", "forecaster", "quantile", "prediction", "sample", - "interval"), - xlab = x, - ylab = "True and predicted values", - verbose = TRUE + "interval") ) } \arguments{ \item{data}{a data.frame that follows the same specifications outlined in -\code{\link[=eval_forecasts]{eval_forecasts()}}. 
The data.frame needs to have columns called +\code{\link[=score]{score()}}. The data.frame needs to have columns called "true_value", "prediction" and then either a column called sample, or one called "quantile" or two columns called "range" and "boundary". Internally, these will be separated into a truth and forecast data set in order to be @@ -88,12 +85,6 @@ models or forecasters don't cover the same periods. Removing these columns from the truth data makes sure that nevertheless all available truth data is plotted (instead of having different true values depending on the period covered by a certain model).} - -\item{xlab}{Label for the x-axis. Default is the variable name on the x-axis} - -\item{ylab}{Label for the y-axis. Default is "True and predicted values"} - -\item{verbose}{print out additional helpful messages (default is TRUE)} } \value{ ggplot object with a plot of true vs predicted values @@ -102,21 +93,20 @@ ggplot object with a plot of true vs predicted values Make a plot of observed and predicted values } \examples{ -example1 <- scoringutils::continuous_example_data -example2 <- scoringutils::range_example_data_long +example1 <- scoringutils::example_continuous -scoringutils::plot_predictions(example1, x = "value_date", - filter_truth = list('value_date <= "2020-06-22"', - 'value_date > "2020-05-01"'), - filter_forecasts = list("model == 'SIRCOVID'", - 'creation_date == "2020-06-22"'), - facet_formula = geography ~ value_desc) - -scoringutils::plot_predictions(example2, x = "value_date", - filter_truth = list('value_date <= "2020-06-22"', - 'value_date > "2020-05-01"'), - filter_forecasts = list("model == 'SIRCOVID'", - 'creation_date == "2020-06-22"'), - allow_truth_without_pred = TRUE, - facet_formula = geography ~ value_desc) +plot_predictions( + example1, + x = "target_end_date", + filter_truth = list( + 'target_end_date <= "2021-07-22"', + 'target_end_date > "2021-05-01"' + ), + filter_forecasts = list( + "model == 'EuroCOVIDhub-ensemble'", + 'forecast_date == "2021-06-07"' + ), + facet_formula = location ~ target_type, + range = c(0, 50, 90, 95) +) } diff --git a/man/plot_quantile_coverage.Rd b/man/plot_quantile_coverage.Rd new file mode 100644 index 000000000..234d80d0f --- /dev/null +++ b/man/plot_quantile_coverage.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot_quantile_coverage} +\alias{plot_quantile_coverage} +\title{Plot Quantile Coverage} +\usage{ +plot_quantile_coverage(scores, colour = "model") +} +\arguments{ +\item{scores}{A data.frame of scores based on quantile forecasts as +produced by \code{\link[=score]{score()}} or \code{\link[=summarise_scores]{summarise_scores()}}. Note that "range" must be included +in the \code{by} argument when running \code{\link[=summarise_scores]{summarise_scores()}}} + +\item{colour}{According to which variable shall the graphs be coloured? 
Default is "model".}
+}
+\value{
+ggplot object with a plot of quantile coverage
+}
+\description{
+Plot quantile coverage
+}
+\examples{
+library("scoringutils")
+scores <- score(example_quantile)
+scores <- summarise_scores(scores, by = c("model", "quantile"))
+plot_quantile_coverage(scores)
+}
diff --git a/man/plot_ranges.Rd b/man/plot_ranges.Rd
new file mode 100644
index 000000000..9cad8854c
--- /dev/null
+++ b/man/plot_ranges.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot.R
+\name{plot_ranges}
+\alias{plot_ranges}
+\title{Plot Metrics by Range of the Prediction Interval}
+\usage{
+plot_ranges(scores, y = "interval_score", x = "model", colour = "range")
+}
+\arguments{
+\item{scores}{A data.frame of scores based on quantile forecasts as
+produced by \code{\link[=score]{score()}} or \code{\link[=summarise_scores]{summarise_scores()}}. Note that "range" must be included
+in the \code{by} argument when running \code{\link[=summarise_scores]{summarise_scores()}}}
+
+\item{y}{The variable from the scores you want to show on the y-Axis.
+This could be something like "interval_score" (the default) or "dispersion"}
+
+\item{x}{The variable from the scores you want to show on the x-Axis.
+Usually this will be "model"}
+
+\item{colour}{Character vector of length one used to determine a variable
+for colouring dots. The default is "range".}
+}
+\value{
+A ggplot2 object showing the chosen metric (by default the interval score)
+plotted by interval range
+}
+\description{
+Visualise the metrics by range, e.g. if you are interested in how different
+interval ranges contribute to the overall interval score, or how
+sharpness / dispersion changes by range.
+}
+\examples{
+library("scoringutils")
+library(ggplot2)
+scores <- score(example_quantile)
+scores <- summarise_scores(scores, by = c("model", "target_type", "range"))
+
+plot_ranges(scores, x = "model") +
+  facet_wrap(~target_type, scales = "free")
+
+# visualise dispersion instead of interval score
+plot_ranges(scores, y = "dispersion", x = "model") +
+  facet_wrap(~target_type)
+}
diff --git a/man/plot_score_table.Rd b/man/plot_score_table.Rd
new file mode 100644
index 000000000..d508c59fc
--- /dev/null
+++ b/man/plot_score_table.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot.R
+\name{plot_score_table}
+\alias{plot_score_table}
+\title{Plot Coloured Score Table}
+\usage{
+plot_score_table(scores, y = "model", by = NULL, metrics = NULL)
+}
+\arguments{
+\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.}
+
+\item{y}{the variable to be shown on the y-axis. Instead of a single character string,
+you can also specify a vector with column names, e.g.
+\code{y = c("model", "location")}. These column names will be concatenated
+to create a unique row identifier (e.g. "model1_location1").}
+
+\item{by}{A character vector that determines how the colour shading for the
+plot gets computed. By default (\code{NULL}), shading will be determined per
+metric, but you can provide additional column names (see examples).}
+
+\item{metrics}{A character vector with the metrics to show. If set to
+\code{NULL} (default), all metrics present in \code{scores} will be shown.}
+}
+\value{
+A ggplot2 object with a coloured table of summarised scores
+}
+\description{
+Plots a coloured table of summarised scores obtained using
+\code{\link[=score]{score()}}.
+} +\examples{ +library(ggplot2) +library(magrittr) # pipe operator + +scores <- score(example_quantile) \%>\% + summarise_scores(by = c("model", "target_type")) \%>\% + summarise_scores(fun = signif, digits = 2) + +plot_score_table(scores, y = "model", by = "target_type") + + facet_wrap(~target_type, ncol = 1) + +# can also put target description on the y-axis +plot_score_table(scores, + y = c("model", "target_type"), + by = "target_type") +} diff --git a/man/plot_wis.Rd b/man/plot_wis.Rd new file mode 100644 index 000000000..931ffc474 --- /dev/null +++ b/man/plot_wis.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.R +\name{plot_wis} +\alias{plot_wis} +\title{Plot Contributions to the Weighted Interval Score} +\usage{ +plot_wis(scores, x = "model", relative_contributions = FALSE, flip = FALSE) +} +\arguments{ +\item{scores}{A data.frame of scores based on quantile forecasts as +produced by \code{\link[=score]{score()}} and summarised using \code{\link[=summarise_scores]{summarise_scores()}}} + +\item{x}{The variable from the scores you want to show on the x-Axis. +Usually this will be "model".} + +\item{relative_contributions}{show relative contributions instead of absolute +contributions. Default is FALSE and this functionality is not available yet.} + +\item{flip}{boolean (default is \code{FALSE}), whether or not to flip the axes.} +} +\value{ +A ggplot2 object showing a contributions from the three components of +the weighted interval score +} +\description{ +Visualise the components of the weighted interval score: penalties for +over-prediction, under-prediction and for high dispersion (lack of sharpness) +} +\examples{ +library("scoringutils") +library(ggplot2) +scores <- score(example_quantile) +scores <- summarise_scores(scores, by = c("model", "target_type")) + +plot_wis(scores, + x = "model", + relative_contributions = TRUE +) + + facet_wrap(~target_type) +plot_wis(scores, + x = "model", + relative_contributions = FALSE +) + + facet_wrap(~target_type, scales = "free_x") +} +\references{ +Bracher J, Ray E, Gneiting T, Reich, N (2020) Evaluating epidemic forecasts +in an interval format. \url{https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008618} +} diff --git a/man/print.scoringutils_check.Rd b/man/print.scoringutils_check.Rd index 6bd694eec..19a0e03b3 100644 --- a/man/print.scoringutils_check.Rd +++ b/man/print.scoringutils_check.Rd @@ -16,3 +16,8 @@ Helper function that prints the output generated by \code{\link[=check_forecasts]{check_forecasts()}} } +\examples{ +check <- check_forecasts(example_quantile) +print(check) +} +\keyword{check-forecasts} diff --git a/man/quantile_coverage.Rd b/man/quantile_coverage.Rd deleted file mode 100644 index 5589709cb..000000000 --- a/man/quantile_coverage.Rd +++ /dev/null @@ -1,45 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{quantile_coverage} -\alias{quantile_coverage} -\title{Plot Quantile Coverage} -\usage{ -quantile_coverage( - summarised_scores, - colour = "model", - facet_formula = NULL, - facet_wrap_or_grid = "facet_wrap", - scales = "free_y" -) -} -\arguments{ -\item{summarised_scores}{Summarised scores as produced by -\code{\link[=eval_forecasts]{eval_forecasts()}}. Make sure that "quantile" is included in -\code{summarise_by} when producing the summarised scores} - -\item{colour}{According to which variable shall the graphs be coloured? 
-Default is "model".} - -\item{facet_formula}{formula for facetting in ggplot. If this is \code{NULL} -(the default), no facetting will take place} - -\item{facet_wrap_or_grid}{Use ggplot2's \code{facet_wrap} or -\code{facet_grid}? Anything other than "facet_wrap" will be interpreted as -\code{facet_grid}. This only takes effect if \code{facet_formula} is not -\code{NULL}} - -\item{scales}{scales argument that gets passed down to ggplot. Only necessary -if you make use of facetting. Default is "free_y"} -} -\value{ -ggplot object with a plot of interval coverage -} -\description{ -Plot quantile coverage -} -\examples{ -example1 <- scoringutils::quantile_example_data -scores <- scoringutils::eval_forecasts(example1, - summarise_by = c("model", "quantile")) -quantile_coverage(scores) -} diff --git a/man/quantile_example_data.Rd b/man/quantile_example_data.Rd deleted file mode 100644 index 98f5b5bea..000000000 --- a/man/quantile_example_data.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{quantile_example_data} -\alias{quantile_example_data} -\title{Quantile Example Data} -\format{ -A data frame with -\describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{value_desc}{long form description of the prediction target} -\item{true_value}{true observed values} -\item{model}{name of the model that generated the forecasts} -\item{creation_date}{date on which the forecast was made} -\item{quantile}{quantile of the corresponding prediction} -\item{prediction}{quantile predictions} -\item{horizon}{forecast horizon in days} - -} -} -\usage{ -quantile_example_data -} -\description{ -A data set with predictions for different quantities relevant in the -2020 UK Covid-19 epidemic. -} -\keyword{datasets} diff --git a/man/quantile_score.Rd b/man/quantile_score.Rd new file mode 100644 index 000000000..0fbc09244 --- /dev/null +++ b/man/quantile_score.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interval_score.R +\name{quantile_score} +\alias{quantile_score} +\title{Quantile Score} +\usage{ +quantile_score(true_values, predictions, quantiles, weigh = TRUE) +} +\arguments{ +\item{true_values}{A vector with the true observed values of size n} + +\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +the number of data points and N (number of columns) the number of Monte +Carlo samples. Alternatively, predictions can just be a vector of size n.} + +\item{quantiles}{vector of size n with the quantile values of the +corresponding predictions.} + +\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged +into an interval score that, in the limit, corresponds to CRPS. Alpha is the +value that corresponds to the (alpha/2) or (1 - alpha/2) quantiles provided +and will be computed from the quantile. Alpha is the decimal value that +represents how much is outside a central prediction interval (E.g. for a +90 percent central prediction interval, alpha is 0.1). Default: \code{TRUE}.} +} +\value{ +vector with the scoring values +} +\description{ +Proper Scoring Rule to score quantile predictions. Smaller values are better. 
+The quantile score is +closely related to the Interval score (see \code{\link[=interval_score]{interval_score()}}) and is +the quantile equivalent that works with single quantiles instead of +central prediction intervals. +} +\examples{ +true_values <- rnorm(10, mean = 1:10) +alpha <- 0.5 + +lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) +upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) + +qs_lower <- quantile_score(true_values, + predictions = lower, + quantiles = alpha / 2 +) +qs_upper <- quantile_score(true_values, + predictions = upper, + quantiles = 1 - alpha / 2 +) +interval_score <- (qs_lower + qs_upper) / 2 +} +\references{ +Strictly Proper Scoring Rules, Prediction,and Estimation, +Tilmann Gneiting and Adrian E. Raftery, 2007, Journal of the American +Statistical Association, Volume 102, 2007 - Issue 477 + +Evaluating epidemic forecasts in an interval format, +Johannes Bracher, Evan L. Ray, Tilmann Gneiting and Nicholas G. Reich, +\url{https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008618} +} +\keyword{metric} diff --git a/man/quantile_to_long.Rd b/man/quantile_to_long.Rd deleted file mode 100644 index 33fcd4905..000000000 --- a/man/quantile_to_long.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_data_handling.R -\name{quantile_to_long} -\alias{quantile_to_long} -\title{Pivot Range Format Forecasts From Wide to Long Format} -\usage{ -quantile_to_long(data) -} -\arguments{ -\item{data}{a data.frame following the specifications from -\code{\link[=eval_forecasts]{eval_forecasts()}}) for quantile forecasts. For an example, see -\code{\link[=range_example_data_long]{range_example_data_long()}})} -} -\value{ -a data.frame in long format -} -\description{ -Legacy function that will not be supported in future updates. -} diff --git a/man/quantile_to_range.Rd b/man/quantile_to_range.Rd deleted file mode 100644 index a63f5e828..000000000 --- a/man/quantile_to_range.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_data_handling.R -\name{quantile_to_range} -\alias{quantile_to_range} -\title{Change Data from a Plain Quantile Format to a Long Range Format} -\usage{ -quantile_to_range(data, keep_quantile_col = FALSE) -} -\arguments{ -\item{data}{a data.frame following the specifications shown in the example -\code{\link[=range_example_data_long]{range_example_data_long()}})} - -\item{keep_quantile_col}{keep the quantile column in the final -output after transformation (default is FALSE)} -} -\value{ -a data.frame in long format -} -\description{ -Legacy function that will not be supported in future updates. -} diff --git a/man/quantile_to_range_long.Rd b/man/quantile_to_range_long.Rd index 1e91d6803..ad214f3e5 100644 --- a/man/quantile_to_range_long.Rd +++ b/man/quantile_to_range_long.Rd @@ -7,8 +7,7 @@ quantile_to_range_long(data, keep_quantile_col = TRUE) } \arguments{ -\item{data}{a data.frame following the specifications shown in the example -\code{\link[=range_example_data_long]{range_example_data_long()}})} +\item{data}{a data.frame in quantile format} \item{keep_quantile_col}{keep the quantile column in the final output after transformation (default is FALSE)} @@ -19,15 +18,5 @@ a data.frame in a long interval range format \description{ Transform data from a format that uses quantiles only to one that uses interval ranges to denote quantiles. 
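For readers unfamiliar with the interval-range format, the mapping between quantile levels and central prediction intervals follows from the definition of alpha used elsewhere in these docs (a q and 1 - q quantile pair bounds the central (1 - 2q) * 100 percent interval). The sketch below is purely illustrative; it is neither the internal code nor the output format of quantile_to_range_long().

# A quantile level q acts as the lower (q < 0.5) or upper (q > 0.5) boundary
# of the central prediction interval covering (1 - 2 * min(q, 1 - q)) * 100%,
# e.g. q = 0.05 and q = 0.95 bound the 90% interval.
quantiles <- c(0.05, 0.25, 0.75, 0.95)
data.frame(
  quantile = quantiles,
  boundary = ifelse(quantiles < 0.5, "lower", "upper"),
  range = (1 - 2 * pmin(quantiles, 1 - quantiles)) * 100
)
# How the median (q = 0.5) is assigned is a package convention not shown here.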
- -Given a data.frame that follows the structure shown in -\code{\link[=quantile_example_data]{quantile_example_data()}}, the function outputs the same -data in a long format as (as shown in -\code{\link[=range_example_data_long]{range_example_data_long()}}). -} -\examples{ -quantile <- scoringutils::quantile_example_data - -long <- scoringutils::quantile_to_range_long(quantile) - } +\keyword{internal} diff --git a/man/quantile_to_wide.Rd b/man/quantile_to_wide.Rd deleted file mode 100644 index 13bf435c2..000000000 --- a/man/quantile_to_wide.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_data_handling.R -\name{quantile_to_wide} -\alias{quantile_to_wide} -\title{Pivot Range Format Forecasts From Long to Wide Format} -\usage{ -quantile_to_wide(data) -} -\arguments{ -\item{data}{a data.frame following the specifications from -\code{\link[=eval_forecasts]{eval_forecasts()}}) for quantile forecasts. For an example, see -\code{\link[=range_example_data_long]{range_example_data_long()}})} -} -\value{ -a data.frame in wide format -} -\description{ -Legacy function that will not be supported in future updates. -} diff --git a/man/range_example_data_long.Rd b/man/range_example_data_long.Rd deleted file mode 100644 index bd92db768..000000000 --- a/man/range_example_data_long.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{range_example_data_long} -\alias{range_example_data_long} -\title{Range Forecast Example Data (Long Format)} -\format{ -A data frame with: -\describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{value_desc}{long form description of the prediction target} -\item{true_value}{true observed values} -\item{model}{name of the model that generated the forecasts} -\item{creation_date}{date on which the forecast was made} -\item{prediction}{value for the lower or upper bound of the given prediction interval} -\item{horizon}{forecast horizon in days} -\item{boundary}{indicate lower or upper bound of prediction interval} -\item{range}{range of the corresponding prediction interval} -} -} -\usage{ -range_example_data_long -} -\description{ -A data set with predictions with different interval ranges relevant in the -2020 UK Covid-19 epidemic. 
-} -\keyword{datasets} diff --git a/man/range_example_data_semi_wide.Rd b/man/range_example_data_semi_wide.Rd deleted file mode 100644 index dea418c8b..000000000 --- a/man/range_example_data_semi_wide.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{range_example_data_semi_wide} -\alias{range_example_data_semi_wide} -\title{Range Forecast Example Data (Semi-Wide Format)} -\format{ -A data frame with 5,419 rows and 12 columns: -\describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{value_desc}{long form description of the prediction target} -\item{true_value}{true observed values} -\item{model}{name of the model that generated the forecasts} -\item{creation_date}{date on which the forecast was made} -\item{horizon}{forecast horizon in days} -\item{range}{range of the corresponding prediction interval} -\item{lower}{prediction for the lower bound of the corresponding interval} -\item{upper}{prediction for the upper bound of the corresponding interval} -} -} -\usage{ -range_example_data_semi_wide -} -\description{ -A data set with predictions with different interval ranges relevant in the -2020 UK Covid-19 epidemic. -} -\keyword{datasets} diff --git a/man/range_example_data_wide.Rd b/man/range_example_data_wide.Rd deleted file mode 100644 index fce617c92..000000000 --- a/man/range_example_data_wide.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{range_example_data_wide} -\alias{range_example_data_wide} -\title{Range Forecast Example Data (Wide Format)} -\format{ -A data frame with: -\describe{ -\item{value_date}{the date for which a prediction was made} -\item{value_type}{the target to be predicted (short form)} -\item{geography}{the region for which a prediction was made} -\item{value_desc}{long form description of the prediction target} -\item{true_value}{true observed values} -\item{model}{name of the model that generated the forecasts} -\item{creation_date}{date on which the forecast was made} -\item{horizon}{forecast horizon in days} -\item{lower_0}{prediction for the lower bound of the 0\% interval range (median)} -\item{lower_10}{prediction for the lower bound of the 10\% interval range} -\item{lower_20}{prediction for the lower bound of the 20\% interval range} -\item{lower_30}{prediction for the lower bound of the 30\% interval range} -\item{lower_40}{prediction for the lower bound of the 40\% interval range} -\item{lower_50}{prediction for the lower bound of the 50\% interval range} -\item{lower_60}{prediction for the lower bound of the 60\% interval range} -\item{lower_70}{prediction for the lower bound of the 70\% interval range} -\item{lower_80}{prediction for the lower bound of the 80\% interval range} -\item{lower_90}{prediction for the lower bound of the 90\% interval range} -\item{upper_0}{prediction for the upper bound of the 0\% interval range} -\item{upper_10}{prediction for the upper bound of the 1\% interval range} -\item{upper_20}{prediction for the upper bound of the 20\% interval range} -\item{upper_30}{prediction for the upper bound of the 30\% interval range} -\item{upper_40}{prediction for the upper bound of the 40\% interval range} -\item{upper_50}{prediction for the upper bound of the 50\% interval range} -\item{upper_60}{prediction 
for the upper bound of the 60\% interval range} -\item{upper_70}{prediction for the upper bound of the 70\% interval range} -\item{upper_80}{prediction for the upper bound of the 80\% interval range} -\item{upper_90}{prediction for the upper bound of the 90\% interval range} -} -} -\usage{ -range_example_data_wide -} -\description{ -A data set with predictions with different interval ranges relevant in the -2020 UK Covid-19 epidemic. -} -\keyword{datasets} diff --git a/man/range_long_to_quantile.Rd b/man/range_long_to_quantile.Rd index 5342d40f8..52c31d237 100644 --- a/man/range_long_to_quantile.Rd +++ b/man/range_long_to_quantile.Rd @@ -8,8 +8,7 @@ range_long_to_quantile(data, keep_range_col = FALSE) } \arguments{ \item{data}{a data.frame following the specifications from -\code{\link[=eval_forecasts]{eval_forecasts()}}) for quantile forecasts. For an example, see -\code{\link[=range_example_data_long]{range_example_data_long()}})} +\code{\link[=score]{score()}}) for quantile forecasts.} \item{keep_range_col}{keep the range and boundary columns after transformation (default is FALSE)} @@ -20,21 +19,5 @@ a data.frame in a plain quantile format \description{ Transform data from a format that uses interval ranges to denote quantiles to a format that uses quantiles only. - -Given a data.frame that follows the structure shown in -\code{\link[=range_example_data_long]{range_example_data_long()}}, the function outputs the same -data in a long format as (as shown in -\code{\link[=range_example_data_long]{range_example_data_long()}}). This can be useful e.g. for -plotting. If you're data.frame is in a different format, consider running -\code{\link[=range_long_to_wide]{range_long_to_wide()}} first. -} -\examples{ -wide <- range_example_data_wide -semiwide <- range_example_data_semi_wide - -long <- range_wide_to_long(wide) -long2 <- range_wide_to_long(semiwide) - -plain_quantile <- range_long_to_quantile(long2) - } +\keyword{internal} diff --git a/man/range_long_to_wide.Rd b/man/range_long_to_wide.Rd deleted file mode 100644 index 0107af8f4..000000000 --- a/man/range_long_to_wide.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_data_handling.R -\name{range_long_to_wide} -\alias{range_long_to_wide} -\title{Pivot Range Format Forecasts From Long to Wide Format} -\usage{ -range_long_to_wide(data) -} -\arguments{ -\item{data}{a data.frame following the specifications from -\code{\link[=eval_forecasts]{eval_forecasts()}}) for quantile forecasts. For an example, see -\code{\link[=range_example_data_long]{range_example_data_long()}})} -} -\value{ -a data.frame in wide format -} -\description{ -Given a data.frame that follows the structure shown in -\code{\link[=range_example_data_long]{range_example_data_long()}}, the function outputs the same -data in a long format as (as shown in -\code{\link[=range_example_data_wide]{range_example_data_wide()}}). This can be useful e.g. for -plotting. 
-} -\examples{ -long <- scoringutils::range_example_data_long -wide <- scoringutils::range_long_to_wide(long) - -} diff --git a/man/range_plot.Rd b/man/range_plot.Rd deleted file mode 100644 index ccd4e1b81..000000000 --- a/man/range_plot.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{range_plot} -\alias{range_plot} -\title{Plot Metrics by Range of the Prediction Interval} -\usage{ -range_plot( - scores, - y = "interval_score", - x = "model", - colour = "range", - facet_formula = NULL, - scales = "free_y", - ncol = NULL, - facet_wrap_or_grid = "facet_wrap", - xlab = x, - ylab = y -) -} -\arguments{ -\item{scores}{A data.frame of scores based on quantile forecasts as -produced by \code{\link[=eval_forecasts]{eval_forecasts()}}. Note that "range" must be included -in the \code{summarise_by} argument when running \code{eval_forecasts}} - -\item{y}{The variable from the scores you want to show on the y-Axis. -This could be something like "interval_score" (the default) or "sharpness"} - -\item{x}{The variable from the scores you want to show on the x-Axis. -Usually this will be "model"} - -\item{colour}{Character vector of length one used to determine a variable -for colouring dots. The Default is "range".} - -\item{facet_formula}{facetting formula passed down to ggplot. Default is -\code{NULL}} - -\item{scales}{scales argument that gets passed down to ggplot. Only necessary -if you make use of facetting. Default is "free_y"} - -\item{ncol}{Number of columns for facet wrap. Only relevant if -\code{facet_formula} is given and \code{facet_wrap_or_grid == "facet_wrap"}} - -\item{facet_wrap_or_grid}{Use ggplot2's \code{facet_wrap} or -\code{facet_grid}? Anything other than "facet_wrap" will be interpreted as -\code{facet_grid}. This only takes effect if \code{facet_formula} is not -\code{NULL}} - -\item{xlab}{Label for the x-axis. Default is the variable name on the x-axis} - -\item{ylab}{Label for the y-axis. Default is "WIS contributions"} -} -\value{ -A ggplot2 object showing a contributions from the three components of -the weighted interval score -} -\description{ -Visualise the metrics by range, e.g. if you are interested how different -interval ranges contribute to the overall interval score, or how sharpness -changes by range. -} -\examples{ -scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, - summarise_by = c("model", "value_desc", "range")) - -scoringutils::range_plot(scores, x = "model", facet_formula = ~ value_desc) - -# visualise sharpness instead of interval score -scoringutils::range_plot(scores, y = "sharpness", x = "model", - facet_formula = ~value_desc) - -# we saw above that sharpness values crossed. 
Let's look at the unweighted WIS -scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, - interval_score_arguments = list(weigh = FALSE), - summarise_by = c("model", "value_desc", "range")) -scoringutils::range_plot(scores, y = "sharpness", x = "model", - facet_formula = ~value_desc) -} diff --git a/man/range_to_quantile.Rd b/man/range_to_quantile.Rd deleted file mode 100644 index d605c94b4..000000000 --- a/man/range_to_quantile.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_data_handling.R -\name{range_to_quantile} -\alias{range_to_quantile} -\title{Pivot Change Data from a Range Format to a Quantile Format} -\usage{ -range_to_quantile(data, keep_range_col = FALSE) -} -\arguments{ -\item{data}{a data.frame following the specifications from -\code{\link[=eval_forecasts]{eval_forecasts()}}) for quantile forecasts. For an example, see -\code{\link[=range_example_data_long]{range_example_data_long()}})} - -\item{keep_range_col}{keep the range and boundary columns after -transformation (default is FALSE)} -} -\value{ -a data.frame in long format -} -\description{ -Legacy function that will not be supported in future updates. -} diff --git a/man/range_wide_to_long.Rd b/man/range_wide_to_long.Rd deleted file mode 100644 index 4165c560e..000000000 --- a/man/range_wide_to_long.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_data_handling.R -\name{range_wide_to_long} -\alias{range_wide_to_long} -\title{Pivot Range Format Forecasts From Wide to Long Format} -\usage{ -range_wide_to_long(data) -} -\arguments{ -\item{data}{a data.frame following the specifications from -\code{\link[=eval_forecasts]{eval_forecasts()}}) for quantile forecasts. For an example, see -\code{\link[=range_example_data_wide]{range_example_data_wide()}})} -} -\value{ -a data.frame in long format -} -\description{ -Given a data.frame that follows the structure shown in -\code{\link[=range_example_data_wide]{range_example_data_wide()}}, the function outputs the same -data in a long format as (as shown in -\code{\link[=range_example_data_long]{range_example_data_long()}}). This can be useful e.g. for -plotting. -} -\examples{ -wide <- scoringutils::range_example_data_wide -long <- scoringutils::range_wide_to_long(wide) - -} diff --git a/man/sample_to_quantile.Rd b/man/sample_to_quantile.Rd index e4caedb95..34fd4a5e0 100644 --- a/man/sample_to_quantile.Rd +++ b/man/sample_to_quantile.Rd @@ -22,8 +22,6 @@ Transform data from a format that is based on predictive samples to a format based on plain quantiles. } \examples{ -example_data <- scoringutils::integer_example_data - -quantile_data <- scoringutils::sample_to_quantile(example_data) - +sample_to_quantile(example_integer) } +\keyword{data-handling} diff --git a/man/sample_to_range.Rd b/man/sample_to_range.Rd deleted file mode 100644 index ff39b29bc..000000000 --- a/man/sample_to_range.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_data_handling.R -\name{sample_to_range} -\alias{sample_to_range} -\title{Change Data from a Sample Based Format to a Long Interval Range Format} -\usage{ -sample_to_range(data, range = c(0, 50, 90), type = 7, keep_quantile_col = TRUE) -} -\arguments{ -\item{data}{a data.frame with samples} - -\item{range}{a numeric vector of interval ranges to extract -(e.g. 
\code{c(0, 50, 90)})} - -\item{type}{type argument passed down to the quantile function. For more -information, see \code{\link[=quantile]{quantile()}}} - -\item{keep_quantile_col}{keep quantile column, default is TRUE} -} -\value{ -a data.frame in long format -} -\description{ -Legacy function that will not be supported in future updates. -} diff --git a/man/sample_to_range_long.Rd b/man/sample_to_range_long.Rd index 19524a66d..531afedc7 100644 --- a/man/sample_to_range_long.Rd +++ b/man/sample_to_range_long.Rd @@ -29,9 +29,4 @@ a data.frame in a long interval range format Transform data from a format that is based on predictive samples to a format based on interval ranges } -\examples{ -example_data <- scoringutils::integer_example_data - -quantile_data <- scoringutils::sample_to_range_long(example_data) - -} +\keyword{internal} diff --git a/man/score.Rd b/man/score.Rd new file mode 100644 index 000000000..a183e03c0 --- /dev/null +++ b/man/score.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/score.R +\name{score} +\alias{score} +\title{Evaluate forecasts} +\usage{ +score(data, metrics = NULL, ...) +} +\arguments{ +\item{data}{A data.frame or data.table with the predictions and observations. +The following columns need to be present: +\itemize{ +\item \code{true_value} - the true observed values +\item \code{prediction} - predictions or predictive samples for one +true value. (You only don't need to provide a prediction column if +you want to score quantile forecasts in a wide range format.)} +For integer and continuous forecasts a \code{sample} column is needed: +\itemize{ +\item \code{sample} - an index to identify the predictive samples in the +prediction column generated by one model for one true value. Only +necessary for continuous and integer forecasts, not for +binary predictions.} +For a quantile-format forecast you should provide a column called \code{quantile}: +\itemize{ +\item \code{quantile}: quantile to which the prediction corresponds +}} + +\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the +default), all available metrics will be computed. For a list of available +metrics see \code{\link[=available_metrics]{available_metrics()}}} + +\item{...}{additional parameters passed down to lower-level functions. +For example, the following arguments can change how weighted interval +scores are computed: +\itemize{ +\item \code{count_median_twice} that controls how the interval scores for different +intervals are summed up. This should be a logical (default is \code{FALSE}) that +indicates whether or not to count the median twice when summarising. +This would conceptually treat the +median as a 0\% prediction interval, where the median is the lower as well as +the upper bound. The alternative is to treat the median as a single quantile +forecast instead of an interval. The interval score would then +be better understood as an average of quantile scores.) +}} +} +\value{ +A data.table with unsummarised scores. There will be one score per +quantile or sample, which is usually not desired, so you should always run +\code{\link[=summarise_scores]{summarise_scores()}} on the unsummarised scores. +} +\description{ +The function \code{score} allows automatic scoring of forecasts and +wraps the lower level functions in the \pkg{scoringutils} package. + +It can be used to score forecasts in a quantile-based, sample-based, or +binary format. 
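As an informal illustration of the column specification above, the snippet below builds a toy quantile-format input and scores it. The toy values and the extra columns (model, target_end_date) are made up and only serve to define the forecast unit; exact messages from check_forecasts() may differ, and the bundled example_* data sets give a more realistic picture.

library(scoringutils)

toy <- data.frame(
  model = "toy-model",
  target_end_date = as.Date("2021-01-02"),
  quantile = c(0.05, 0.25, 0.5, 0.75, 0.95),
  prediction = c(2, 4, 6, 8, 11),
  true_value = 7
)

check_forecasts(toy) # recommended before scoring
score(toy)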
To obtain an overview of what input is expected, have a look +at the \link{example_quantile}, \link{example_continuous}, \link{example_integer}, and +\link{example_binary} data sets. + +You can (and should) check your input using the function \code{\link[=check_forecasts]{check_forecasts()}} +before scoring. + +To obtain a quick overview of the evaluation metrics used, have a look at the +\link{metrics_summary} data included in the package. +} +\examples{ +library(magrittr) # pipe operator + +check_forecasts(example_quantile) +score(example_quantile) \%>\% + add_coverage(by = c("model", "target_type")) \%>\% + summarise_scores(by = c("model", "target_type")) + +# forecast formats with different metrics +score(example_binary) +score(example_quantile) +score(example_integer) +score(example_continuous) +} +\references{ +Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ +(2019) Assessing the performance of real-time epidemic forecasts: A +case study of Ebola in the Western Area region of Sierra Leone, 2014-15. +PLoS Comput Biol 15(2): e1006785. \url{doi:10.1371/journal.pcbi.1006785} +} +\author{ +Nikos Bosse \email{nikosbosse@gmail.com} +} diff --git a/man/score_binary.Rd b/man/score_binary.Rd new file mode 100644 index 000000000..6237468e7 --- /dev/null +++ b/man/score_binary.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/score_binary.R +\name{score_binary} +\alias{score_binary} +\title{Evaluate forecasts in a Binary Format} +\usage{ +score_binary(data, forecast_unit, metrics) +} +\arguments{ +\item{data}{A data.frame or data.table with the predictions and observations. +The following columns need to be present: +\itemize{ +\item \code{true_value} - the true observed values +\item \code{prediction} - predictions or predictive samples for one +true value. (You only don't need to provide a prediction column if +you want to score quantile forecasts in a wide range format.)} +For integer and continuous forecasts a \code{sample} column is needed: +\itemize{ +\item \code{sample} - an index to identify the predictive samples in the +prediction column generated by one model for one true value. Only +necessary for continuous and integer forecasts, not for +binary predictions.} +For a quantile-format forecast you should provide a column called \code{quantile}: +\itemize{ +\item \code{quantile}: quantile to which the prediction corresponds +}} + +\item{forecast_unit}{A character vector with the column names that define +the unit of a single forecast, i.e. a forecast was made for a combination +of the values in \code{forecast_unit}.} + +\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the +default), all available metrics will be computed. For a list of available +metrics see \code{\link[=available_metrics]{available_metrics()}}} +} +\value{ +A data.table with appropriate scores. For more information see +\code{\link[=score]{score()}}. 
+} +\description{ +Evaluate forecasts in a Binary Format +} +\author{ +Nikos Bosse \email{nikosbosse@gmail.com} +} +\keyword{internal} diff --git a/man/score_heatmap.Rd b/man/score_heatmap.Rd deleted file mode 100644 index b1bf9af1a..000000000 --- a/man/score_heatmap.Rd +++ /dev/null @@ -1,65 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{score_heatmap} -\alias{score_heatmap} -\title{Create a Heatmap of a Scoring Metric} -\usage{ -score_heatmap( - scores, - y = "model", - x, - metric, - facet_formula = NULL, - scales = "free_y", - ncol = NULL, - facet_wrap_or_grid = "facet_wrap", - ylab = y, - xlab = x -) -} -\arguments{ -\item{scores}{A data.frame of scores based on quantile forecasts as -produced by \code{\link[=eval_forecasts]{eval_forecasts()}}.} - -\item{y}{The variable from the scores you want to show on the y-Axis. The -default for this is "model"} - -\item{x}{The variable from the scores you want to show on the x-Axis. This -could be something like "horizon", or "location"} - -\item{metric}{the metric that determines the value and colour shown in the -tiles of the heatmap} - -\item{facet_formula}{facetting formula passed down to ggplot. Default is -\code{NULL}} - -\item{scales}{scales argument that gets passed down to ggplot. Only necessary -if you make use of facetting. Default is "free_y"} - -\item{ncol}{Number of columns for facet wrap. Only relevant if -\code{facet_formula} is given and \code{facet_wrap_or_grid == "facet_wrap"}} - -\item{facet_wrap_or_grid}{Use ggplot2's \code{facet_wrap} or -\code{facet_grid}? Anything other than "facet_wrap" will be interpreted as -\code{facet_grid}. This only takes effect if \code{facet_formula} is not -\code{NULL}} - -\item{ylab}{Label for the y-axis. Default is the variable name on the y-axis} - -\item{xlab}{Label for the x-axis. Default is the variable name on the x-axis} -} -\value{ -A ggplot2 object showing a heatmap of the desired metric -} -\description{ -This function can be used to create a heatmap of one metric across different -groups, e.g. the interval score obtained by several forecasting models in -different locations. -} -\examples{ -scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, - summarise_by = c("model", "value_desc", "range")) - -scoringutils::score_heatmap(scores, x = "value_desc", metric = "bias") - -} diff --git a/man/score_quantile.Rd b/man/score_quantile.Rd new file mode 100644 index 000000000..c8a402476 --- /dev/null +++ b/man/score_quantile.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/score_quantile.R +\name{score_quantile} +\alias{score_quantile} +\title{Evaluate forecasts in a Quantile-Based Format} +\usage{ +score_quantile( + data, + forecast_unit, + metrics, + weigh = TRUE, + count_median_twice = FALSE, + separate_results = TRUE +) +} +\arguments{ +\item{data}{A data.frame or data.table with the predictions and observations. +The following columns need to be present: +\itemize{ +\item \code{true_value} - the true observed values +\item \code{prediction} - predictions or predictive samples for one +true value. (You only don't need to provide a prediction column if +you want to score quantile forecasts in a wide range format.)} +For integer and continuous forecasts a \code{sample} column is needed: +\itemize{ +\item \code{sample} - an index to identify the predictive samples in the +prediction column generated by one model for one true value. 
Only +necessary for continuous and integer forecasts, not for +binary predictions.} +For a quantile-format forecast you should provide a column called \code{quantile}: +\itemize{ +\item \code{quantile}: quantile to which the prediction corresponds +}} + +\item{forecast_unit}{A character vector with the column names that define +the unit of a single forecast, i.e. a forecast was made for a combination +of the values in \code{forecast_unit}} + +\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the +default), all available metrics will be computed. For a list of available +metrics see \code{\link[=available_metrics]{available_metrics()}}} + +\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged +into an interval score that, in the limit, corresponds to CRPS. Alpha is the +decimal value that represents how much is outside a central prediction +interval (e.g. for a 90 percent central prediction interval, alpha is 0.1) +Default: \code{TRUE}.} + +\item{count_median_twice}{logical that controls whether or not to count the +median twice when summarising (default is \code{FALSE}). Counting the +median twice would conceptually treat it as a 0\\% prediction interval, where +the median is the lower as well as the upper bound. The alternative is to +treat the median as a single quantile forecast instead of an interval. The +interval score would then be better understood as an average of quantile +scores.} + +\item{separate_results}{if \code{TRUE} (default is \code{FALSE}), then the separate +parts of the interval score (dispersion penalty, penalties for over- and +under-prediction get returned as separate elements of a list). If you want a +\code{data.frame} instead, simply call \code{\link[=as.data.frame]{as.data.frame()}} on the output.} +} +\value{ +A data.table with appropriate scores. For more information see +\code{\link[=score]{score()}} +} +\description{ +Evaluate forecasts in a Quantile-Based Format +} +\references{ +Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ +(2019) Assessing the performance of real-time epidemic forecasts: A +case study of Ebola in the Western Area region of Sierra Leone, 2014-15. +PLoS Comput Biol 15(2): e1006785. \url{doi:10.1371/journal.pcbi.1006785} +} +\author{ +Nikos Bosse \email{nikosbosse@gmail.com} +} +\keyword{internal} diff --git a/man/score_sample.Rd b/man/score_sample.Rd new file mode 100644 index 000000000..5672f8d93 --- /dev/null +++ b/man/score_sample.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/score_continuous_integer.R +\name{score_sample} +\alias{score_sample} +\title{Evaluate forecasts in a Sample-Based Format (Integer or Continuous)} +\usage{ +score_sample(data, forecast_unit, metrics, prediction_type) +} +\arguments{ +\item{data}{A data.frame or data.table with the predictions and observations. +The following columns need to be present: +\itemize{ +\item \code{true_value} - the true observed values +\item \code{prediction} - predictions or predictive samples for one +true value. (You only don't need to provide a prediction column if +you want to score quantile forecasts in a wide range format.)} +For integer and continuous forecasts a \code{sample} column is needed: +\itemize{ +\item \code{sample} - an index to identify the predictive samples in the +prediction column generated by one model for one true value. 
Only +necessary for continuous and integer forecasts, not for +binary predictions.} +For a quantile-format forecast you should provide a column called \code{quantile}: +\itemize{ +\item \code{quantile}: quantile to which the prediction corresponds +}} + +\item{forecast_unit}{A character vector with the column names that define +the unit of a single forecast, i.e. a forecast was made for a combination +of the values in \code{forecast_unit}} + +\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the +default), all available metrics will be computed. For a list of available +metrics see \code{\link[=available_metrics]{available_metrics()}}} + +\item{prediction_type}{character, should be either "continuous" or "integer"} +} +\value{ +A data.table with appropriate scores. For more information see +\code{\link[=score]{score()}} +} +\description{ +Evaluate forecasts in a Sample-Based Format (Integer or Continuous) +} +\references{ +Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ +(2019) Assessing the performance of real-time epidemic forecasts: A +case study of Ebola in the Western Area region of Sierra Leone, 2014-15. +PLoS Comput Biol 15(2): e1006785. \url{doi:10.1371/journal.pcbi.1006785} +} +\author{ +Nikos Bosse \email{nikosbosse@gmail.com} +} +\keyword{internal} diff --git a/man/score_table.Rd b/man/score_table.Rd deleted file mode 100644 index 8f8ab2c80..000000000 --- a/man/score_table.Rd +++ /dev/null @@ -1,69 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{score_table} -\alias{score_table} -\title{Plot Coloured Score Table} -\usage{ -score_table( - summarised_scores, - y = NULL, - select_metrics = NULL, - facet_formula = NULL, - ncol = NULL, - facet_wrap_or_grid = "facet_wrap" -) -} -\arguments{ -\item{summarised_scores}{A data.frame of summarised scores as produced by -\code{\link[=eval_forecasts]{eval_forecasts()}}} - -\item{y}{the variable to be shown on the y-axis. If \code{NULL} (default), -all columns that are not scoring metrics will be used. Alternatively, -you can specify a vector with column names, e.g. -\code{y = c("model", "location")}. These column names will be concatenated -to create a unique row identifier (e.g. "model1_location1")} - -\item{select_metrics}{A character vector with the metrics to show. If set to -\code{NULL} (default), all metrics present in \code{summarised_scores} will -be shown} - -\item{facet_formula}{formula for facetting in ggplot. If this is \code{NULL} -(the default), no facetting will take place} - -\item{ncol}{Number of columns for facet wrap. Only relevant if -\code{facet_formula} is given and \code{facet_wrap_or_grid == "facet_wrap"}} - -\item{facet_wrap_or_grid}{Use ggplot2's \code{facet_wrap} or -\code{facet_grid}? Anything other than "facet_wrap" will be interpreted as -\code{facet_grid}. 
This only takes effect if \code{facet_formula} is not -\code{NULL}} -} -\value{ -A ggplot2 object with a coloured table of summarised scores -} -\description{ -Plots a coloured table of summarised scores obtained using -\code{\link[=eval_forecasts]{eval_forecasts()}} -} -\examples{ -scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, - summarise_by = c("model", "value_desc")) -scoringutils::score_table(scores, y = "model", facet_formula = ~ value_desc, - ncol = 1) - -# can also put target description on the y-axis -scoringutils::score_table(scores, y = c("model", "value_desc")) - -# yields the same result in this case -scoringutils::score_table(scores) - - -scores <- scoringutils::eval_forecasts(scoringutils::integer_example_data, - summarise_by = c("model", "value_desc")) -scoringutils::score_table(scores, y = "model", facet_formula = ~ value_desc, - ncol = 1) - -# only show selected metrics -scoringutils::score_table(scores, y = "model", facet_formula = ~ value_desc, - ncol = 1, select_metrics = c("crps", "bias")) -} diff --git a/man/scoringutils.Rd b/man/scoringutils.Rd index 50be93b38..b2a950592 100644 --- a/man/scoringutils.Rd +++ b/man/scoringutils.Rd @@ -16,33 +16,34 @@ forecasts or point forecasts. The true values can be either continuous, integer, or binary. A collection of different metrics and scoring rules can be accessed through -the function \code{\link[=eval_forecasts]{eval_forecasts()}}. Given a data.frame of the +the function \code{\link[=score]{score()}}. Given a data.frame of the correct form the function will automatically figure out the type of prediction and true values and return appropriate scoring metrics. The package also has a lot of default visualisation based on the output -created by \code{\link[=eval_forecasts]{eval_forecasts()}}. +created by \code{\link[=score]{score()}}. 
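As a minimal sketch of that workflow (an illustration only, using the \code{example_quantile} data set that the examples elsewhere in this diff rely on; exact plotting arguments may differ), raw forecasts are first scored, then summarised, and the summarised scores are what the plotting functions listed below operate on:

library(scoringutils)

# score every individual forecast in the bundled example data
scores <- score(example_quantile)

# aggregate to one score per model and target type
summarised <- summarise_scores(scores, by = c("model", "target_type"))

# `summarised` can then be passed to the plot_*() functions listed below,
# e.g. plot_score_table() or plot_heatmap()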
\itemize{ -\item \code{\link[=score_table]{score_table()}} -\item \code{\link[=correlation_plot]{correlation_plot()}} -\item \code{\link[=wis_components]{wis_components()}} -\item \code{\link[=range_plot]{range_plot()}} -\item \code{\link[=score_heatmap]{score_heatmap()}} +\item \code{\link[=plot_score_table]{plot_score_table()}} +\item \code{\link[=plot_correlation]{plot_correlation()}} +\item \code{\link[=plot_wis]{plot_wis()}} +\item \code{\link[=plot_ranges]{plot_ranges()}} +\item \code{\link[=plot_heatmap]{plot_heatmap()}} \item \code{\link[=plot_predictions]{plot_predictions()}} -\item \code{\link[=interval_coverage]{interval_coverage()}} -\item \code{\link[=quantile_coverage]{quantile_coverage()}} +\item \code{\link[=plot_interval_coverage]{plot_interval_coverage()}} +\item \code{\link[=plot_quantile_coverage]{plot_quantile_coverage()}} } Alternatively, the following functions can be accessed directly: \itemize{ \item \code{\link[=brier_score]{brier_score()}} \item \code{\link[=pit]{pit()}} -\item \code{\link[=bias]{bias()}} -\item \code{\link[=quantile_bias]{quantile_bias()}} -\item \code{\link[=sharpness]{sharpness()}} -\item \code{\link[=crps]{crps()}} -\item \code{\link[=logs]{logs()}} -\item \code{\link[=dss]{dss()}} +\item \code{\link[=bias_sample]{bias_sample()}} +\item \code{\link[=bias_quantile]{bias_quantile()}} +\item \code{\link[=bias_range]{bias_range()}} +\item \code{\link[=mad_sample]{mad_sample()}} +\item \code{\link[=crps_sample]{crps_sample()}} +\item \code{\link[=logs_sample]{logs_sample()}} +\item \code{\link[=dss_sample]{dss_sample()}} \item \code{\link[=ae_median_sample]{ae_median_sample()}} } @@ -50,9 +51,6 @@ Predictions can be evaluated in a lot of different formats. If you want to convert from one format to the other, the following helper functions can do that for you: \itemize{ -\item \code{\link[=sample_to_range_long]{sample_to_range_long()}} \item \code{\link[=sample_to_quantile]{sample_to_quantile()}} -\item \code{\link[=quantile_to_range_long]{quantile_to_range_long()}} -\item \code{\link[=range_long_to_quantile]{range_long_to_quantile()}} } } diff --git a/man/se_mean_sample.Rd b/man/se_mean_sample.Rd new file mode 100644 index 000000000..6b17a27c2 --- /dev/null +++ b/man/se_mean_sample.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metrics_point_forecasts.R +\name{se_mean_sample} +\alias{se_mean_sample} +\title{Squared Error of the Mean (Sample-based Version)} +\usage{ +se_mean_sample(true_values, predictions) +} +\arguments{ +\item{true_values}{A vector with the true observed values of size n} + +\item{predictions}{nxN matrix of predictive samples, n (number of rows) being +the number of data points and N (number of columns) the number of Monte +Carlo samples. 
Alternatively, predictions can just be a vector of size n.}
+}
+\value{
+vector with the scoring values
+}
+\description{
+Squared error of the mean calculated as
+
+\deqn{
+  (\textrm{true\_value} - \textrm{mean\_prediction})^2
+}{
+  (true_value - mean_prediction)^2
+}
+
+where \code{mean_prediction} is the mean of the predictive samples.
+}
+\examples{
+true_values <- rnorm(30, mean = 1:30)
+predicted_values <- rnorm(30, mean = 1:30)
+se_mean_sample(true_values, predicted_values)
+}
+\seealso{
+\code{\link[=squared_error]{squared_error()}}
+}
+\keyword{metric}
diff --git a/man/show_avail_forecasts.Rd b/man/show_avail_forecasts.Rd
deleted file mode 100644
index bdcd76546..000000000
--- a/man/show_avail_forecasts.Rd
+++ /dev/null
@@ -1,80 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/plot.R
-\name{show_avail_forecasts}
-\alias{show_avail_forecasts}
-\title{Visualise Where Forecasts Are Available}
-\usage{
-show_avail_forecasts(
-  data,
-  y = "model",
-  x = "forecast_date",
-  make_x_factor = TRUE,
-  summarise_by = NULL,
-  collapse_to_one = TRUE,
-  by = NULL,
-  show_numbers = TRUE,
-  facet_formula = NULL,
-  facet_wrap_or_grid = "facet_wrap",
-  scales = "fixed",
-  legend_position = "none"
-)
-}
-\arguments{
-\item{data}{data.frame with predictions in the same format required for
-\code{\link[=eval_forecasts]{eval_forecasts()}}}
-
-\item{y}{character vector of length one that denotes the name of the column
-to appear on the y-axis of the plot}
-
-\item{x}{character vector of length one that denotes the name of the column
-to appear on the x-axis of the plot}
-
-\item{make_x_factor}{logical (default is TRUE). Whether or not to convert
-the variable on the x-axis to a factor. This has an effect e.g. if dates
-are shown on the x-axis.}
-
-\item{summarise_by}{character vector or \code{NULL} (the default) that
-denotes the categories over which the number of forecasts should be summed
-up. By default (i.e. \code{summarise_by = NULL}) this will be all the
-columns that appear in either x, y, or the facetting formula.}
-
-\item{collapse_to_one}{logical. If \code{TRUE}) (the default), everything
-not included in \code{by} will be counted only once. This is useful, for
-example, if you don't want to count every single sample or quantile, but
-instead treat one set of samples or quantiles as one forecast.}
-
-\item{by}{character vector or \code{NULL} (the default) that denotes the
-unit of an individual forecast. This argument behaves similarly to the
-\code{by} argument in \code{link{eval_forecasts}}. By default, all columns
-are used that are not part of any internally protected columns like "sample"
-or "prediction" or similar. The \code{by} argument is only necessary if
-\code{collapse_to_one = TRUE} to indicate which rows not to collapse to one.}
-
-\item{show_numbers}{logical (default is \code{TRUE}) that indicates whether
-or not to show the actual count numbers on the plot}
-
-\item{facet_formula}{formula for facetting in ggplot. If this is \code{NULL}
-(the default), no facetting will take place}
-
-\item{facet_wrap_or_grid}{character. Use ggplot2's \code{facet_wrap} or
-\code{facet_grid}? Anything other than "facet_wrap" will be interpreted as
-\code{facet_grid}. This only takes effect if \code{facet_formula} is not
-\code{NULL}}
-
-\item{scales}{character. The scales argument gets passed down to ggplot.
-Only necessary
-if you make use of facetting. Default is "fixed"}
-
-\item{legend_position}{character that indicates where to put the legend.
-The argument gets passed to ggplot2.
By default ("none"), no legend is shown.}
-}
-\value{
-ggplot object with a plot of interval coverage
-}
-\description{
-Visualise Where Forecasts Are Available
-}
-\examples{
-example1 <- scoringutils::range_example_data_long
-show_avail_forecasts(example1, x = "value_date", facet_formula = ~ value_desc)
-}
diff --git a/man/mse.Rd b/man/squared_error.Rd
similarity index 64%
rename from man/mse.Rd
rename to man/squared_error.Rd
index 1013815e7..0bc1f03b9 100644
--- a/man/mse.Rd
+++ b/man/squared_error.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/metrics_point_forecasts.R
-\name{mse}
-\alias{mse}
-\title{Mean Squared Error}
+\name{squared_error}
+\alias{squared_error}
+\title{Squared Error}
 \usage{
-mse(true_values, predictions)
+squared_error(true_values, predictions)
 }
 \arguments{
 \item{true_values}{A vector with the true observed values of size n}
@@ -15,15 +15,15 @@ mse(true_values, predictions)
 vector with the scoring values
 }
 \description{
-Mean Squared Error MSE of point forecasts.
-Calculated as
+Squared error (SE) calculated as
 \deqn{
-  mean((true_values - predicted_values)^2)
+  (true_values - predicted_values)^2
 }
 }
 \examples{
 true_values <- rnorm(30, mean = 1:30)
 predicted_values <- rnorm(30, mean = 1:30)
-mse(true_values, predicted_values)
+squared_error(true_values, predicted_values)
 }
+\keyword{metric}
diff --git a/man/summarise_scores.Rd b/man/summarise_scores.Rd
new file mode 100644
index 000000000..2fb6c40e7
--- /dev/null
+++ b/man/summarise_scores.Rd
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/summarise_scores.R
+\name{summarise_scores}
+\alias{summarise_scores}
+\title{Summarise scores as produced by \code{\link[=score]{score()}}}
+\usage{
+summarise_scores(
+  scores,
+  by = NULL,
+  fun = mean,
+  relative_skill = FALSE,
+  metric = "auto",
+  baseline = NULL,
+  ...
+)
+}
+\arguments{
+\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.}
+
+\item{by}{character vector with column names to summarise scores by. Default
+is \code{NULL}, meaning that the only summary that takes place is summarising
+over quantiles (in case of quantile-based forecasts), such that there is one
+score per forecast as defined by the unit of a single forecast (rather than
+one score for every quantile).}
+
+\item{fun}{a function used for summarising scores. Default is \code{mean}.}
+
+\item{relative_skill}{logical, whether or not to compute relative
+performance between models based on pairwise comparisons.
+If \code{TRUE} (default is \code{FALSE}), then a column called
+'model' must be present in the input data. For more information on
+the computation of relative skill, see \code{\link[=pairwise_comparison]{pairwise_comparison()}}.
+Relative skill will be calculated for the aggregation level specified in
+\code{by}.}
+
+\item{metric}{character with the name of the metric for which
+a relative skill shall be computed. If equal to 'auto' (the default), then
+this will be either interval score, crps or brier score (depending on which
+of these is available in the input data).}
+
+\item{baseline}{character string with the name of a model. If a baseline is
+given, then a scaled relative skill with respect to the baseline will be
+returned. By default (\code{NULL}), relative skill will not be scaled with
+respect to a baseline model.}
+
+\item{...}{additional arguments, such as test options that can get passed
+down to lower level functions.
The following options are available:
+\code{one_sided} (Boolean, default is \code{FALSE}, whether to conduct a one-sided
+instead of a two-sided test), \code{test_type} (character, either "non_parametric"
+or "permutation", determining which kind of test shall be conducted to
+determine p-values. Default is "non_parametric"), \code{n_permutations} (number of
+permutations for a permutation test. Default is 999). See
+\code{\link[=compare_two_models]{compare_two_models()}} for more information.}
+}
+\description{
+Summarise scores as produced by \code{\link[=score]{score()}}.
+}
+\examples{
+library(magrittr) # pipe operator
+
+# summarise over samples or quantiles to get one score per forecast
+scores <- score(example_quantile)
+summarise_scores(scores)
+
+# get scores by model
+summarise_scores(scores, by = c("model"))
+
+# get scores by model and target type
+summarise_scores(scores, by = c("model", "target_type"))
+
+# get standard deviation
+summarise_scores(scores, by = "model", fun = sd)
+
+# round digits
+summarise_scores(scores, by = c("model")) \%>\%
+  summarise_scores(fun = signif, digits = 2)
+
+# get quantiles of scores
+# make sure to aggregate over ranges first
+summarise_scores(scores,
+  by = "model", fun = quantile,
+  probs = c(0.25, 0.5, 0.75)
+)
+
+# get ranges
+# summarise_scores(scores, by = "range")
+}
+\keyword{scoring}
diff --git a/man/theme_scoringutils.Rd b/man/theme_scoringutils.Rd
new file mode 100644
index 000000000..fab65c0e3
--- /dev/null
+++ b/man/theme_scoringutils.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot.R
+\name{theme_scoringutils}
+\alias{theme_scoringutils}
+\title{Scoringutils ggplot2 theme}
+\usage{
+theme_scoringutils()
+}
+\value{
+A ggplot2 theme
+}
+\description{
+A theme for ggplot2 plots used in scoringutils
+}
diff --git a/man/update_list.Rd b/man/update_list.Rd
deleted file mode 100644
index 07a7b6c5e..000000000
--- a/man/update_list.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
-\name{update_list}
-\alias{update_list}
-\title{Update a List}
-\usage{
-update_list(defaults = list(), optional = list())
-}
-\arguments{
-\item{defaults}{A list of default settings}
-
-\item{optional}{A list of optional settings to override defaults}
-}
-\value{
-A list
-}
-\description{
-\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#stable}{\figure{lifecycle-stable.svg}{options: alt='[Stable]'}}}{\strong{[Stable]}}
-Used to handle updating settings in a list. For example when making
-changes to \code{interval_score_arguments} in \code{eval_forecasts()}
-}
-\keyword{internal}
diff --git a/man/wis_components.Rd b/man/wis_components.Rd
deleted file mode 100644
index 0ed11f7c6..000000000
--- a/man/wis_components.Rd
+++ /dev/null
@@ -1,73 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/plot.R
-\name{wis_components}
-\alias{wis_components}
-\title{Plot Contributions to the Weighted Interval Score}
-\usage{
-wis_components(
-  scores,
-  x = "model",
-  group = NULL,
-  relative_contributions = FALSE,
-  facet_formula = NULL,
-  scales = "free_y",
-  ncol = NULL,
-  facet_wrap_or_grid = "facet_wrap",
-  x_text_angle = 90,
-  xlab = x,
-  ylab = "WIS contributions"
-)
-}
-\arguments{
-\item{scores}{A data.frame of scores based on quantile forecasts as
-produced by \code{\link[=eval_forecasts]{eval_forecasts()}}}
-
-\item{x}{The variable from the scores you want to show on the x-Axis.
-Usually this will be "model"} - -\item{group}{Choose a grouping variable for the plot that gets directly -passed down to ggplot. Default is \code{NULL}} - -\item{relative_contributions}{show relative contributions instead of absolute -contributions. Default is FALSE and this functionality is not available yet.} - -\item{facet_formula}{facetting formula passed down to ggplot. Default is -\code{NULL}} - -\item{scales}{scales argument that gets passed down to ggplot. Only necessary -if you make use of facetting. Default is "free_y"} - -\item{ncol}{Number of columns for facet wrap. Only relevant if -\code{facet_formula} is given and \code{facet_wrap_or_grid == "facet_wrap"}} - -\item{facet_wrap_or_grid}{Use ggplot2's \code{facet_wrap} or -\code{facet_grid}? Anything other than "facet_wrap" will be interpreted as -\code{facet_grid}. This only takes effect if \code{facet_formula} is not -\code{NULL}} - -\item{x_text_angle}{Angle for the text on the x-axis. Default is 90} - -\item{xlab}{Label for the x-axis. Default is the variable name on the x-axis} - -\item{ylab}{Label for the y-axis. Default is "WIS contributions"} -} -\value{ -A ggplot2 object showing a contributions from the three components of -the weighted interval score -} -\description{ -Visualise the components of the weighted interval score: penalties for -over-prediction, under-prediction and for a lack of sharpness -} -\examples{ -scores <- scoringutils::eval_forecasts(scoringutils::quantile_example_data, - summarise_by = c("model", "value_desc")) -scoringutils::wis_components(scores, x = "model", facet_formula = ~ value_desc, - relative_contributions = TRUE) -scoringutils::wis_components(scores, x = "model", facet_formula = ~ value_desc, - relative_contributions = FALSE) -} -\references{ -Bracher J, Ray E, Gneiting T, Reich, N (2020) Evaluating epidemic forecasts -in an interval format. 
\url{https://arxiv.org/abs/2005.12881}
-}
diff --git a/scoringutils.Rproj b/scoringutils.Rproj
index 0ee12cdee..052e507fb 100644
--- a/scoringutils.Rproj
+++ b/scoringutils.Rproj
@@ -19,4 +19,4 @@ BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
 PackageCheckArgs: --as-cran
-PackageRoxygenize: rd,collate,namespace
+PackageRoxygenize: rd,collate,namespace,vignette
diff --git a/tests/testthat/_snaps/plot_predictions/many-quantiles-from-sample.svg b/tests/testthat/_snaps/plot_predictions/many-quantiles-from-sample.svg
index 22fd773ff..bd0ac722e 100644
--- a/tests/testthat/_snaps/plot_predictions/many-quantiles-from-sample.svg
+++ b/tests/testthat/_snaps/plot_predictions/many-quantiles-from-sample.svg
@@ -18,945 +18,732 @@
[SVG snapshot body omitted: faceted plot of "True and predicted values"; panels change from UK nations and target types (x-axis value_date) to IT, FR, GB, DE with Cases/Deaths (x-axis target_end_date); legend: actual, median, range 95/90/50; title many_quantiles_from_sample]
diff --git a/tests/testthat/_snaps/plot_predictions/many-quantiles.svg b/tests/testthat/_snaps/plot_predictions/many-quantiles.svg
index c26d3c4d1..74b951322 100644
--- a/tests/testthat/_snaps/plot_predictions/many-quantiles.svg
+++ b/tests/testthat/_snaps/plot_predictions/many-quantiles.svg
@@ -18,1540 +18,750 @@
[SVG snapshot body omitted: same panel change as above; legend: actual, median, range 60/50/40/30/20/10; title many_quantiles]
diff --git a/tests/testthat/_snaps/plot_predictions/no-median.svg b/tests/testthat/_snaps/plot_predictions/no-median.svg
new file mode 100644
index 000000000..aacf96617
--- /dev/null
+++ b/tests/testthat/_snaps/plot_predictions/no-median.svg
@@ -0,0 +1,540 @@
[SVG snapshot body omitted: new snapshot with panels for United Kingdom, Germany, Italy, France by Cases/Deaths; x-axis target_end_date; y-axis "True and predicted values"; range 90/50; legend: actual; title no_median]
diff --git a/tests/testthat/_snaps/plot_predictions/point-forecasts.svg b/tests/testthat/_snaps/plot_predictions/point-forecasts.svg
index 3ad96c646..f396f950d 100644
--- a/tests/testthat/_snaps/plot_predictions/point-forecasts.svg
+++ b/tests/testthat/_snaps/plot_predictions/point-forecasts.svg
@@ -18,923 +18,689 @@
[SVG snapshot body omitted: same panel change as above; legend: actual, median; title point_forecasts]
+May 15 +Jun 01 +Jun 15 +Jul 01 +Jul 15 + + + + + + +May 15 +Jun 01 +Jun 15 +Jul 01 +Jul 15 + + + + + + +May 15 +Jun 01 +Jun 15 +Jul 01 +Jul 15 + +-2e+05 +-1e+05 +0e+00 +1e+05 + + + + + +100 +200 + + + +400 +800 +1200 +1600 + + + + + +0e+00 +1e+05 +2e+05 +3e+05 + + + + + +500 +1000 +1500 + + + + +0 +30000 +60000 +90000 + + + + + +400 +800 +1200 + + + + +20000 +40000 +60000 + + + +target_end_date +True and predicted values + + + + + + +actual +median point_forecasts diff --git a/tests/testthat/test-absolute_error.R b/tests/testthat/test-absolute_error.R index 646392494..c9c76dfec 100644 --- a/tests/testthat/test-absolute_error.R +++ b/tests/testthat/test-absolute_error.R @@ -11,7 +11,7 @@ test_that("absolute error (sample based) works", { # covidHubUtils-tests -test_that("abs error is correct within eval_forecasts, point forecast only", { +test_that("abs error is correct within score, point forecast only", { # test is adapted from the package covidHubUtils, https://github.com/reichlab/covidHubUtils/ y <- c(1, -15, 22) @@ -27,7 +27,7 @@ test_that("abs error is correct within eval_forecasts, point forecast only", { forecast_target_variables <- rep(target_variables, times = 1) - point_forecast <- c(5,6,7) + point_forecast <- c(5, 6, 7) test_truth <- data.frame( model = rep("truth_source", length(y)), @@ -61,8 +61,12 @@ test_that("abs error is correct within eval_forecasts, point forecast only", { data.table::setnames(fc_scoringutils, old = "value", new = "prediction") truth_scoringutils[, model := NULL] - eval <- scoringutils::eval_forecasts(forecasts = fc_scoringutils, - truth_data = truth_scoringutils) + data_scoringutils <- merge_pred_and_obs( + forecasts = fc_scoringutils, + observations = truth_scoringutils + ) + + eval <- scoringutils::score(data_scoringutils) # actual <- score_forecasts(forecasts = test_forecasts, truth = test_truth) @@ -70,7 +74,6 @@ test_that("abs error is correct within eval_forecasts, point forecast only", { # expect_equal(actual$abs_error, expected) expect_equal(eval$ae_point, expected) - }) test_that("abs error is correct, point and median forecasts different", { @@ -78,7 +81,8 @@ test_that("abs error is correct, point and median forecasts different", { forecast_quantiles_matrix <- rbind( c(-1, 0, 1, 2, 3), c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4)) + c(-2, 0, 3, 3, 4) + ) forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] forecast_quantile_probs <- forecast_quantile_probs[3] @@ -98,7 +102,7 @@ test_that("abs error is correct, point and median forecasts different", { forecast_quantiles <- forecast_quantiles_matrix dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) - point_forecast <- c(5,6,7) + point_forecast <- c(5, 6, 7) test_truth <- data.frame( model = rep("truth_source", length(y)), @@ -118,9 +122,9 @@ test_that("abs error is correct, point and median forecasts different", { temporal_resolution = rep("wk", n_forecasts), target_variable = forecast_target_variables, target_end_date = forecast_target_end_dates, - type = c(rep("point",length(point_forecast)),rep("quantile", length(forecast_quantiles))), - quantile = c(rep(NA,length(point_forecast)),forecast_quantile_probs), - value = c(point_forecast,forecast_quantiles), + type = c(rep("point", length(point_forecast)), rep("quantile", length(forecast_quantiles))), + quantile = c(rep(NA, length(point_forecast)), forecast_quantile_probs), + value = c(point_forecast, forecast_quantiles), stringsAsFactors = FALSE ) @@ -131,10 +135,12 @@ 
test_that("abs error is correct, point and median forecasts different", { data.table::setnames(fc_scoringutils, old = "value", new = "prediction") truth_scoringutils[, model := NULL] - eval <- scoringutils::eval_forecasts(forecasts = fc_scoringutils, - truth_data = truth_scoringutils) + data_scoringutils <- merge_pred_and_obs( + forecasts = fc_scoringutils, + observations = truth_scoringutils + ) - # actual <- score_forecasts(forecasts = test_forecasts, truth = test_truth) + eval <- scoringutils::score(data_scoringutils) expected <- abs(y - point_forecast) # expect_equal(actual$abs_error, expected) @@ -146,7 +152,8 @@ test_that("abs error is correct, point and median forecasts same", { forecast_quantiles_matrix <- rbind( c(-1, 0, 1, 2, 3), c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4)) + c(-2, 0, 3, 3, 4) + ) forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] forecast_quantile_probs <- forecast_quantile_probs[3] @@ -166,7 +173,7 @@ test_that("abs error is correct, point and median forecasts same", { forecast_quantiles <- forecast_quantiles_matrix dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) - point_forecast <- c(1,2,3) + point_forecast <- c(1, 2, 3) test_truth <- data.frame( model = rep("truth_source", length(y)), @@ -186,9 +193,9 @@ test_that("abs error is correct, point and median forecasts same", { temporal_resolution = rep("wk", n_forecasts), target_variable = forecast_target_variables, target_end_date = forecast_target_end_dates, - type = c(rep("point",length(point_forecast)),rep("quantile", length(forecast_quantiles))), - quantile = c(rep(NA,length(point_forecast)),forecast_quantile_probs), - value = c(point_forecast,forecast_quantiles), + type = c(rep("point", length(point_forecast)), rep("quantile", length(forecast_quantiles))), + quantile = c(rep(NA, length(point_forecast)), forecast_quantile_probs), + value = c(point_forecast, forecast_quantiles), stringsAsFactors = FALSE ) @@ -200,9 +207,20 @@ test_that("abs error is correct, point and median forecasts same", { data.table::setnames(fc_scoringutils, old = "value", new = "prediction") truth_scoringutils[, model := NULL] - eval <- scoringutils::eval_forecasts(forecasts = fc_scoringutils, - truth_data = truth_scoringutils, - summarise_by = c("location", "target_end_date", "target_variable", "horizon")) + data_scoringutils <- merge_pred_and_obs( + forecasts = fc_scoringutils, + observations = truth_scoringutils + ) + + eval <- score(data = data_scoringutils) + eval <- summarise_scores(eval, + by = c( + "location", "target_end_date", + "target_variable", "horizon" + ), + na.rm = TRUE + ) + # actual <- score_forecasts(forecasts = test_forecasts, truth = test_truth) diff --git a/tests/testthat/test-bias.R b/tests/testthat/test-bias.R index 44d76f622..ab058b7f6 100644 --- a/tests/testthat/test-bias.R +++ b/tests/testthat/test-bias.R @@ -1,82 +1,121 @@ -test_that("function throws an error when missing true_values", - { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error(bias(predictions = predictions), - "true_values or predictions argument missing") - }) - -test_that("function throws an error when missing 'predictions'", - { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error(bias(true_values = true_values), - "true_values or predictions argument missing") - }) - -test_that("function works for integer true_values and 
predictions", - { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(10, rpois(10, lambda = 1:10)) - output <- bias(true_values = true_values, - predictions = predictions) - expect_equal(length(output), - length(true_values)) - expect_equal(class(output), - "numeric") - }) - -test_that("function works for continuous true_values and predictions", - { - true_values <- rnorm(10) - predictions <- replicate(10, rnorm(10)) - output <- bias(true_values = true_values, - predictions = predictions) - expect_equal(length(output), - length(true_values)) - expect_equal(class(output), - "numeric") - }) +test_that("function throws an error when missing true_values", { + true_values <- rpois(10, lambda = 1:10) + predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + bias_sample(predictions = predictions), + "true_values argument is missing" + ) +}) + +test_that("function throws an error when missing 'predictions'", { + true_values <- rpois(10, lambda = 1:10) + predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + bias_sample(true_values = true_values), + "argument 'predictions' missing" + ) +}) + +test_that("function works for integer true_values and predictions", { + true_values <- rpois(10, lambda = 1:10) + predictions <- replicate(10, rpois(10, lambda = 1:10)) + output <- bias_sample( + true_values = true_values, + predictions = predictions + ) + expect_equal( + length(output), + length(true_values) + ) + expect_equal( + class(output), + "numeric" + ) +}) + +test_that("function works for continuous true_values and predictions", { + true_values <- rnorm(10) + predictions <- replicate(10, rnorm(10)) + output <- bias_sample( + true_values = true_values, + predictions = predictions + ) + expect_equal( + length(output), + length(true_values) + ) + expect_equal( + class(output), + "numeric" + ) +}) test_that("bias works", { true_values <- rpois(30, lambda = 1:30) predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) - all(scoringutils::bias(true_values, predictions) == scoringutils::bias(true_values, predictions)) + expect_true(all(bias_sample(true_values, predictions) == bias_sample(true_values, predictions))) ## continuous forecasts true_values <- rnorm(30, mean = 1:30) predictions <- replicate(200, rnorm(30, mean = 1:30)) - scoringutils2 <- scoringutils::bias(true_values, predictions) - scoringutils <- scoringutils::bias(true_values, predictions) + scoringutils2 <- bias_sample(true_values, predictions) + scoringutils <- bias_sample(true_values, predictions) expect_equal(scoringutils, scoringutils2) }) -test_that("quantile bias works", { - lower <- c(6341.000, 6329.500, 6087.014, 5703.500, - 5451.000, 5340.500, 4821.996, 4709.000, - 4341.500, 4006.250, 1127.000, 705.500) +test_that("range bias works", { + lower <- c( + 6341.000, 6329.500, 6087.014, 5703.500, + 5451.000, 5340.500, 4821.996, 4709.000, + 4341.500, 4006.250, 1127.000, 705.500 + ) - upper <- c(6341.000, 6352.500, 6594.986, 6978.500, - 7231.000, 7341.500, 7860.004, 7973.000, - 8340.500, 8675.750, 11555.000, 11976.500) + upper <- c( + 6341.000, 6352.500, 6594.986, 6978.500, + 7231.000, 7341.500, 7860.004, 7973.000, + 8340.500, 8675.750, 11555.000, 11976.500 + ) range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) true_value <- 8062 - scoringutils2 <- scoringutils::quantile_bias(lower = lower, upper = upper, - range = range, true_value = true_value) - scoringutils <- scoringutils:: quantile_bias(lower = lower, upper = upper, - range = range, true_value = true_value) + 
scoringutils2 <- bias_range( + lower = lower, upper = upper, + range = range, true_value = true_value + ) + scoringutils <- bias_range( + lower = lower, upper = upper, + range = range, true_value = true_value + ) expect_equal(scoringutils, scoringutils2) }) +test_that("quantile bias and range bias have the same result", { + predictions <- order(rnorm(23)) + lower <- rev(predictions[1:12]) + upper <- predictions[12:23] + + range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) + quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) + true_value <- 8062 + scoringutils2 <- bias_range( + lower = lower, upper = upper, + range = range, true_value = true_value + ) + scoringutils <- bias_quantile( + predictions = predictions, quantiles = quantiles, + true_value = true_value + ) + + expect_equal(scoringutils, scoringutils2) +}) diff --git a/tests/testthat/test-brier_score.R b/tests/testthat/test-brier_score.R deleted file mode 100644 index ea4325ff2..000000000 --- a/tests/testthat/test-brier_score.R +++ /dev/null @@ -1,58 +0,0 @@ -test_that("function throws an error when missing true_values or predictions", - { - true_values <- sample(c(0,1), size = 10, replace = TRUE) - predictions <- replicate(20, - sample(c(0,1), size = 10, replace = TRUE)) - - expect_error(brier_score(predictions = predictions), - "true_values or predictions argument missing") - - expect_error(brier_score(true_values = true_values), - "true_values or predictions argument missing") - }) - - - -test_that("function throws an error for wrong format of true_value", - { - true_values <- rpois(10, lambda = 1:10) - predictions <- runif(10, min = 0, max = 1) - - expect_error(brier_score(true_values = true_values, - predictions = predictions), - "elements of true_values should be either zero or one") - - true_values <- rnorm(10) - expect_error(brier_score(true_values = true_values, - predictions = predictions), - "elements of true_values should be either zero or one") - }) - -test_that("function throws an error for wrong format of predictions", - { - true_values <- sample(c(0,1), size = 10, replace = TRUE) - predictions <- runif(10, min = 0, max = 3) - expect_error(brier_score(true_values = true_values, - predictions = predictions), - "elements of 'predictions' should be probabilites between zero and one") - - predictions <- runif(10, min = 0, max = 1) - expect_error(brier_score(true_values = true_values, - predictions = list(predictions)), - "Mismatch: 'true_values' has length `10`, but 'predictions' has length `1`") - - predictions <- runif(15, min = 0, max = 1) - expect_error(brier_score(true_values = true_values, - predictions = predictions), - "Mismatch: 'true_values' has length `10`, but 'predictions' has length `15`") - }) - - -test_that("brier_score works", { - true_values <- sample(c(0,1), size = 30, replace = TRUE) - predictions <- runif(n = 30, min = 0, max = 1) - - scoringutils2 <- scoringutils::brier_score(true_values, predictions) - scoringutils <- scoringutils::brier_score(true_values, predictions) - expect_equal(scoringutils2, scoringutils) -}) diff --git a/tests/testthat/test-check_forecasts.R b/tests/testthat/test-check_forecasts.R index 77c1ddb87..a6aac1ed2 100644 --- a/tests/testthat/test-check_forecasts.R +++ b/tests/testthat/test-check_forecasts.R @@ -1,24 +1,31 @@ test_that("check_forecasts() function works", { - check <- check_forecasts(quantile_example_data) + check <- check_forecasts(example_quantile) expect_s3_class(check, "scoringutils_check") }) - test_that("check_forecasts() function has an 
error for empty data.frame", { expect_error(check_forecasts(data.frame())) }) -test_that("check_forecasts() function returns a warning with NA in the data", { - check <- check_forecasts(quantile_example_data) - expect_equal(unlist(check$warnings), - "Some values for `prediction` are NA in the data provided") +test_that("check_forecasts() function returns a message with NA in the data", { + check <- check_forecasts(example_quantile) + expect_equal( + unlist(check$messages), + "Some values for `prediction` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected." + ) }) -test_that("check_forecasts() function returns warnings with NA in the data", { - example <- data.table::copy(quantile_example_data) - example[horizon == 7, true_value := NA] +test_that("check_forecasts() function returns messages with NA in the data", { + example <- data.table::copy(example_quantile) + example[horizon == 2, true_value := NA] check <- check_forecasts(example) - expect_equal(length(check$warnings), 2) + expect_equal(length(check$messages), 2) }) +test_that("check_forecasts() function throws an error with duplicate forecasts", { + example <- rbind(example_quantile, + example_quantile[1000:1010]) + + expect_error(suppressWarnings(check_forecasts(example))) +}) diff --git a/tests/testthat/test-eval_forecasts.R b/tests/testthat/test-eval_forecasts.R deleted file mode 100644 index cbca78028..000000000 --- a/tests/testthat/test-eval_forecasts.R +++ /dev/null @@ -1,256 +0,0 @@ -# common error handling -------------------------------------------------------- -test_that("function throws an error if data is missing", { - expect_error(eval_forecasts(data = NULL)) -}) - - - -# test binary case ------------------------------------------------------------- -test_that("function produces output for a binary case", { - binary_example <- data.table::setDT(scoringutils::binary_example_data) - eval <- eval_forecasts(binary_example[!is.na(prediction)], - summarise_by = c("model", "value_desc"), - quantiles = c(0.5), sd = TRUE, - verbose = FALSE) - expect_equal(nrow(eval) > 1, - TRUE) -}) - - -# test quantile case ----------------------------------------------------------- -test_that("function produces output for a quantile format case", { - quantile_example <- data.table::setDT(scoringutils::quantile_example_data) - eval <- eval_forecasts(quantile_example[!is.na(prediction)], - summarise_by = c("model"), - quantiles = c(0.5), sd = TRUE) - - expect_equal(nrow(eval) > 1, - TRUE) -}) - -test_that("calculation of aem is correct for a quantile format case", { - quantile_example <- data.table::setDT(scoringutils::quantile_example_data) - eval <- eval_forecasts(quantile_example[!is.na(prediction)], - summarise_by = c("model"), - quantiles = c(0.5), sd = TRUE) - - ae <- quantile_example[quantile == 0.5, ae := abs(true_value - prediction) - ][!is.na(model), .(mean = mean(ae, na.rm = TRUE)), - by = "model" - ]$mean - - expect_equal(sort(eval$aem), sort(ae)) -}) - - -test_that("all quantile and range formats yield the same result", { - quantile_example1 <- data.table::setDT(scoringutils::quantile_example_data) - - quantile_example2 <- data.table::setDT(scoringutils::range_example_data_long) - quantile_example2 <- range_long_to_quantile(quantile_example2) - - quantile_example3 <- data.table::setDT(scoringutils::range_example_data_semi_wide) - quantile_example3 <- range_wide_to_long(quantile_example3) - quantile_example3 <- range_long_to_quantile(quantile_example3) - - wide <- 
data.table::setDT(scoringutils::range_example_data_wide) - quantile_example4 <- scoringutils::range_wide_to_long(wide) - - - eval1 <- eval_forecasts(quantile_example1[!is.na(prediction)], - summarise_by = c("model"), - quantiles = c(0.5), sd = TRUE) - - eval2 <- eval_forecasts(quantile_example2[!is.na(prediction)], - summarise_by = c("model"), - quantiles = c(0.5), sd = TRUE) - - ae <- quantile_example1[quantile == 0.5, ae := abs(true_value - prediction) - ][!is.na(model), .(mean = mean(ae, na.rm = TRUE)), - by = "model" - ]$mean - - expect_equal(sort(eval1$aem), sort(ae)) -}) - -test_that("function produces output even if only some metrics are chosen", { - range_example_wide <- data.table::setDT(scoringutils::range_example_data_wide) - range_example <- scoringutils::range_wide_to_long(range_example_wide) - example <- range_long_to_quantile(range_example) - - eval <- scoringutils::eval_forecasts(example, - summarise_by = c("model", "range"), - metrics = "coverage", - sd = TRUE) - - expect_equal(nrow(eval) > 1, - TRUE) -}) - -test_that("WIS is the same with other metrics omitted or included", { - range_example_wide <- data.table::setDT(scoringutils::range_example_data_wide) - range_example <- scoringutils::range_wide_to_long(range_example_wide) - example <- scoringutils::range_long_to_quantile(range_example) - - eval <- scoringutils::eval_forecasts(example, - summarise_by = c("model", "range"), - metrics = "interval_score") - - eval2 <- scoringutils::eval_forecasts(example, - summarise_by = c("model", "range")) - - expect_equal(sum(eval$interval_score), - sum(eval2$interval_score)) -}) - - - - - -# test integer and continuous case --------------------------------------------- -test_that("function produces output for a continuous format case", { - example <- data.table::setDT(scoringutils::continuous_example_data) - eval <- eval_forecasts(example[!is.na(prediction)], - summarised = TRUE, - summarise_by = c("model"), - quantiles = c(0.5), sd = TRUE) - - # eval2 <- scoringutils::eval_forecasts(example, - # summarised = TRUE, - # summarise_by = c("model"), - # quantiles = c(0.5), sd = TRUE) - # - # setcolorder(eval2, colnames(eval)) - # eval <- eval[order(model)] - # eval2 <- eval2[order(model)] - # all(eval == eval2, na.rm = TRUE) - - expect_equal(nrow(eval) > 1, - TRUE) -}) - - - - - - - - - - - - -# -# # tests that function returns the same results for scoringutils2 and scoringutils1 -# test_that("scoringutils and scoringutils2 are the same for a binary case", { -# binary_example <- data.table::setDT(scoringutils::binary_example_data) -# eval2 <- scoringutils::eval_forecasts(binary_example, -# summarise_by = c("model", "value_desc"), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# eval <- scoringutils::eval_forecasts(binary_example[!is.na(prediction)], -# summarise_by = c("model", "value_desc"), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# all(eval == eval2) -# -# expect_equal(eval, eval2) -# }) -# -# -# test_that("scoringutils and scoringutils2 are the same for a continuous case", { -# example <- data.table::setDT(scoringutils::continuous_example_data) -# eval2 <- scoringutils::eval_forecasts(example, -# summarise_by = c("model", "value_desc"), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# eval <- scoringutils::eval_forecasts(example[!is.na(prediction)], -# summarise_by = c("model", "value_desc"), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# eval2 <- eval2[, .SD, .SDcols = names(eval2)[names(eval2) %in% names(eval)]] -# 
data.table::setcolorder(eval2, names(eval)) -# -# expect_equal(eval, eval2) -# }) -# -# -# test_that("scoringutils and scoringutils2 are the same for an integer case", { -# set.seed(1) -# example <- data.table::setDT(scoringutils::integer_example_data) -# eval2 <- scoringutils::eval_forecasts(example, -# summarise_by = c("model", "value_desc"), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# set.seed(1) -# eval <- scoringutils::eval_forecasts(example[!is.na(prediction)], -# summarise_by = c("model", "value_desc"), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# eval <- eval[order(model)] -# eval[, c("pit_p_val", "pit_sd", "pit_p_val_0.5") := NULL] -# eval2 <- eval2[order(model)] -# eval2 <- eval2[, .SD, .SDcols = names(eval2)[names(eval2) %in% names(eval)]] -# data.table::setcolorder(eval2, names(eval)) -# -# expect_equal(eval, eval2) -# }) -# -# -# -# -# test_that("scoringutils and scoringutils2 are the same for a quantile case", { -# example <- data.table::setDT(scoringutils::quantile_example_data) -# eval2 <- scoringutils::eval_forecasts(example, -# summarise_by = c("model", "value_desc"), -# interval_score_arguments = list(count_median_twice = FALSE), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# eval <- scoringutils::eval_forecasts(example[!is.na(prediction)], -# summarise_by = c("model", "value_desc"), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# eval <- eval[order(model)] -# eval2 <- eval2[order(model)] -# eval2 <- eval2[, .SD, .SDcols = names(eval2)[names(eval2) %in% names(eval)]] -# eval <- eval[, .SD, .SDcols = names(eval)[names(eval) %in% names(eval2)]] -# data.table::setcolorder(eval2, names(eval)) -# -# expect_equal(eval, eval2) -# }) -# -# -# test_that("scoringutils and scoringutils2 are the same for a range format case", { -# example <- data.table::setDT(scoringutils::range_example_data_long) -# eval2 <- scoringutils::eval_forecasts(example, -# summarise_by = c("model", "value_desc"), -# interval_score_arguments = list(count_median_twice = FALSE), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# eval <- scoringutils::eval_forecasts(example[!is.na(prediction)], -# summarise_by = c("model", "value_desc"), -# quantiles = c(0.5), sd = TRUE, -# verbose = FALSE) -# -# eval <- eval[order(model)] -# eval2 <- eval2[order(model)] -# eval2 <- eval2[, .SD, .SDcols = names(eval2)[names(eval2) %in% names(eval)]] -# eval <- eval[, .SD, .SDcols = names(eval)[names(eval) %in% names(eval2)]] -# data.table::setcolorder(eval2, names(eval)) -# -# expect_equal(eval, eval2) -# }) - - - -## test for separate truth and forecast data - - diff --git a/tests/testthat/test-interval_score.R b/tests/testthat/test-interval_score.R index 53f989750..2df2be365 100644 --- a/tests/testthat/test-interval_score.R +++ b/tests/testthat/test-interval_score.R @@ -3,57 +3,69 @@ test_that("wis works, median only", { lower <- upper <- c(1, 2, 3) quantile_probs <- 0.5 - actual <- scoringutils::interval_score(y, lower = lower, upper = upper, - weigh = TRUE, - interval_range = 0) + actual <- interval_score(y, + lower = lower, upper = upper, + weigh = TRUE, + interval_range = 0 + ) expected <- abs(y - lower) expect_identical(actual, expected) }) -test_that("WIS works within eval_forecasts for median forecast", { - test_data <- data.frame(true_value = c(1, -15, 22), - prediction = 1:3, - quantile = rep(c(0.5), each = 3), - model = "model1", - date = 1:3) - eval <- scoringutils::eval_forecasts(test_data, - interval_score_arguments = list(count_median_twice = TRUE)) 
- expect_equal(eval$aem, eval$interval_score) +test_that("WIS works within score for median forecast", { + test_data <- data.frame( + true_value = c(1, -15, 22), + prediction = 1:3, + quantile = rep(c(0.5), each = 3), + model = "model1", + date = 1:3 + ) + eval <- scoringutils::score(test_data, + count_median_twice = TRUE + ) + expect_equal(eval$ae_median, eval$interval_score) }) test_that("wis works, 1 interval only", { y <- c(1, -15, 22) - lower = c(0, 1, 0) - upper = c(2, 2, 3) + lower <- c(0, 1, 0) + upper <- c(2, 2, 3) quantile_probs <- c(0.25, 0.75) alpha <- 0.5 - actual <- scoringutils::interval_score(y, lower = lower, upper = upper, - weigh = TRUE, - interval_range = 50) - expected <- (upper - lower)*(alpha/2) + c(0, 1-(-15), 22-3) + actual <- scoringutils::interval_score(y, + lower = lower, upper = upper, + weigh = TRUE, + interval_range = 50 + ) + expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) expect_identical(actual, expected) }) -test_that("WIS works within eval_forecasts for one interval", { - test_data <- data.frame(true_value = rep(c(1, -15, 22), times = 2), - quantile = rep(c(0.25, 0.75), each = 3), - prediction = c(c(0, 1, 0), c(2, 2, 3)), - model = c("model1"), - date = rep(1:3, times = 2)) +test_that("WIS works within score for one interval", { + test_data <- data.frame( + true_value = rep(c(1, -15, 22), times = 2), + quantile = rep(c(0.25, 0.75), each = 3), + prediction = c(c(0, 1, 0), c(2, 2, 3)), + model = c("model1"), + date = rep(1:3, times = 2) + ) + + eval <- scoringutils::score(test_data, + count_median_twice = TRUE + ) - eval <- scoringutils::eval_forecasts(test_data, - interval_score_arguments = list(count_median_twice = TRUE)) + eval <- summarise_scores(eval, by = c("model", "date")) - lower = c(0, 1, 0) - upper = c(2, 2, 3) + lower <- c(0, 1, 0) + upper <- c(2, 2, 3) alpha <- 0.5 - expected <- (upper - lower)*(alpha/2) + c(0, 1-(-15), 22-3) + expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) expect_equal(expected, eval$interval_score) }) @@ -64,15 +76,19 @@ test_that("WIS works within eval_forecasts for one interval", { test_that("wis works, 1 interval and median", { - test_data <- data.frame(true_value = rep(c(1, -15, 22), times = 3), - quantile = rep(c(0.25, 0.5, 0.75), each = 3), - prediction = c(c(0, 1, 0), c(1, 2, 3), c(2, 2, 3)), - model = c("model1"), - date = rep(1:3, times = 3)) + test_data <- data.frame( + true_value = rep(c(1, -15, 22), times = 3), + quantile = rep(c(0.25, 0.5, 0.75), each = 3), + prediction = c(c(0, 1, 0), c(1, 2, 3), c(2, 2, 3)), + model = c("model1"), + date = rep(1:3, times = 3) + ) - eval <- scoringutils::eval_forecasts(test_data, - interval_score_arguments = list(count_median_twice = TRUE)) + eval <- scoringutils::score(test_data, + count_median_twice = TRUE + ) + eval <- summarise_scores(eval, by = c("model", "date")) y <- c(1, -15, 22) quantiles <- rbind(c(0, 1, 2), c(1, 2, 2), c(0, 3, 3)) @@ -82,7 +98,7 @@ test_that("wis works, 1 interval and median", { expected <- 0.5 * ( abs(y - quantiles[, 2]) + - (quantiles[, 3] - quantiles[, 1])*(alpha/2) + c(0, 1-(-15), 22-3) + (quantiles[, 3] - quantiles[, 1]) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) ) expect_identical(eval$interval_score, expected) @@ -90,16 +106,22 @@ test_that("wis works, 1 interval and median", { test_that("wis works, 2 intervals and median", { + test_data <- data.frame( + true_value = rep(c(1, -15, 22), times = 5), + quantile = rep(c(0.1, 0.25, 0.5, 0.75, 0.9), each = 3), + prediction = c( + c(-1, -2, -2), c(0, 1, 0), c(1, 
2, 3), + c(2, 2, 3), c(3, 4, 4) + ), + model = c("model1"), + date = rep(1:3, times = 5) + ) - test_data <- data.frame(true_value = rep(c(1, -15, 22), times = 5), - quantile = rep(c(0.1, 0.25, 0.5, 0.75, 0.9), each = 3), - prediction = c(c(-1, -2, -2), c(0, 1, 0), c(1, 2, 3), - c(2, 2, 3), c(3, 4, 4)), - model = c("model1"), - date = rep(1:3, times = 5)) + eval <- scoringutils::score(test_data, + count_median_twice = TRUE + ) - eval <- scoringutils::eval_forecasts(test_data, - interval_score_arguments = list(count_median_twice = TRUE)) + eval <- summarise_scores(eval, by = c("model", "date")) y <- c(1, -15, 22) quantiles <- rbind(c(-1, 0, 1, 2, 3), c(-2, 1, 2, 2, 4), c(-2, 0, 3, 3, 4)) @@ -108,14 +130,16 @@ test_that("wis works, 2 intervals and median", { alpha1 <- 0.2 alpha2 <- 0.5 - expected <- (1/3) * ( + expected <- (1 / 3) * ( abs(y - quantiles[, 3]) + - (quantiles[, 5] - quantiles[, 1])*(alpha1/2) + c(0, (-2)-(-15), 22-4) + - (quantiles[, 4] - quantiles[, 2])*(alpha2/2) + c(0, 1-(-15), 22-3) + (quantiles[, 5] - quantiles[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) + + (quantiles[, 4] - quantiles[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3) ) - expect_equal(as.numeric(eval$interval_score), - as.numeric(expected)) + expect_equal( + as.numeric(eval$interval_score), + as.numeric(expected) + ) }) @@ -125,240 +149,302 @@ test_that("wis works, 2 intervals and median", { -# # additional tests from the covidhubutils repo -# -# test_that("wis is correct, median only - covidHubUtils check", { -# library(covidHubUtils) -# y <- c(1, -15, 22) -# forecast_quantiles_matrix <- rbind( -# c(-1, 0, 1, 2, 3), -# c(-2, 1, 2, 2, 4), -# c(-2, 0, 3, 3, 4)) -# forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) -# forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] -# forecast_quantile_probs <- forecast_quantile_probs[3] -# -# target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) -# horizons <- c("1", "2", "1") -# locations <- c("01", "01", "02") -# target_variables <- rep("inc death", length(y)) -# -# forecast_target_end_dates <- -# rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) -# forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) -# forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) -# forecast_target_variables <- -# rep(target_variables, times = ncol(forecast_quantiles_matrix)) -# forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) -# forecast_quantiles <- forecast_quantiles_matrix -# dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) -# -# test_truth <- data.frame( -# model = rep("truth_source", length(y)), -# target_variable = target_variables, -# target_end_date = target_end_dates, -# location = locations, -# value = y, -# stringsAsFactors = FALSE -# ) -# -# n_forecasts <- length(forecast_quantiles) -# test_forecasts <- data.frame( -# model = rep("m1", n_forecasts), -# forecast_date = rep(as.Date("2020-01-01"), n_forecasts), -# location = forecast_locations, -# horizon = forecast_horizons, -# temporal_resolution = rep("wk", n_forecasts), -# target_variable = forecast_target_variables, -# target_end_date = forecast_target_end_dates, -# type = rep("quantile", n_forecasts), -# quantile = forecast_quantile_probs, -# value = forecast_quantiles, -# stringsAsFactors = FALSE -# ) -# -# # make a version that conforms to scoringutils format -# truth_formatted <- data.table::as.data.table(test_truth) -# truth_formatted[, `:=`(model = NULL)] -# data.table::setnames(truth_formatted, 
old = "value", new = "true_value") -# -# forecasts_formated <- data.table::as.data.table(test_forecasts) -# data.table::setnames(forecasts_formated, old = "value", new = "prediction") -# -# data_formatted <- merge(forecasts_formated, truth_formatted) -# -# eval <- scoringutils::eval_forecasts(data_formatted, -# interval_score_arguments = list(count_median_twice = FALSE)) -# -# actual <- covidHubUtils::score_forecasts(forecasts = test_forecasts, truth = test_truth, -# use_median_as_point = TRUE) -# -# expected <- abs(y - forecast_quantiles_matrix[, 1]) -# -# if(!all(eval$interval_score == actual$wis)) { -# warning("eval_forecasts() and covidHubUtils don't match") -# } -# -# expect_equal(eval$interval_score, expected) -# }) -# -# -# -# -# test_that("wis is correct, 1 interval only - covidHubUtils check", { -# library(covidHubUtils) -# y <- c(1, -15, 22) -# forecast_quantiles_matrix <- rbind( -# c(-1, 0, 1, 2, 3), -# c(-2, 1, 2, 2, 4), -# c(-2, 0, 3, 3, 4)) -# forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) -# forecast_quantiles_matrix <- forecast_quantiles_matrix[, c(1, 5), drop = FALSE] -# forecast_quantile_probs <- forecast_quantile_probs[c(1, 5)] -# -# target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) -# horizons <- c("1", "2", "1") -# locations <- c("01", "01", "02") -# target_variables <- rep("inc death", length(y)) -# -# forecast_target_end_dates <- -# rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) -# forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) -# forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) -# forecast_target_variables <- -# rep(target_variables, times = ncol(forecast_quantiles_matrix)) -# forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) -# forecast_quantiles <- forecast_quantiles_matrix -# dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) -# -# test_truth <- data.frame( -# model = rep("truth_source", length(y)), -# target_variable = target_variables, -# target_end_date = target_end_dates, -# location = locations, -# value = y, -# stringsAsFactors = FALSE -# ) -# -# n_forecasts <- length(forecast_quantiles) -# test_forecasts <- data.frame( -# model = rep("m1", n_forecasts), -# forecast_date = rep(as.Date("2020-01-01"), n_forecasts), -# location = forecast_locations, -# horizon = forecast_horizons, -# temporal_resolution = rep("wk", n_forecasts), -# target_variable = forecast_target_variables, -# target_end_date = forecast_target_end_dates, -# type = rep("quantile", n_forecasts), -# quantile = forecast_quantile_probs, -# value = forecast_quantiles, -# stringsAsFactors = FALSE -# ) -# -# # make a version that conforms to scoringutils format -# truth_formatted <- data.table::as.data.table(test_truth) -# truth_formatted[, `:=`(model = NULL)] -# data.table::setnames(truth_formatted, old = "value", new = "true_value") -# -# forecasts_formated <- data.table::as.data.table(test_forecasts) -# data.table::setnames(forecasts_formated, old = "value", new = "prediction") -# -# data_formatted <- merge(forecasts_formated, truth_formatted) -# -# eval <- scoringutils::eval_forecasts(data_formatted, -# interval_score_arguments = list(count_median_twice = FALSE)) -# -# actual <- score_forecasts(forecasts = test_forecasts, truth = test_truth, -# use_median_as_point = TRUE) -# -# alpha1 <- 0.2 -# expected <- (forecast_quantiles_matrix[, 2] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + -# c(0, (-2) - (-15), 22 - 4) -# -# if(!all(eval$interval_score == actual$wis)) { -# 
warning("eval_forecasts() and covidHubUtils don't match") -# } -# -# expect_equal(eval$interval_score, expected) -# }) -# -# -# test_that("wis is correct, 2 intervals and median - covidHubUtils check", { -# library(covidHubUtils) -# y <- c(1, -15, 22) -# forecast_quantiles_matrix <- rbind( -# c(-1, 0, 1, 2, 3), -# c(-2, 1, 2, 2, 4), -# c(-2, 0, 3, 3, 4)) -# forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) -# -# target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) -# horizons <- c("1", "2", "1") -# locations <- c("01", "01", "02") -# target_variables <- rep("inc death", length(y)) -# -# forecast_target_end_dates <- -# rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) -# forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) -# forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) -# forecast_target_variables <- -# rep(target_variables, times = ncol(forecast_quantiles_matrix)) -# forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) -# forecast_quantiles <- forecast_quantiles_matrix -# dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) -# -# test_truth <- data.frame( -# model = rep("truth_source", length(y)), -# target_variable = target_variables, -# target_end_date = target_end_dates, -# location = locations, -# value = y, -# stringsAsFactors = FALSE -# ) -# -# n_forecasts <- length(forecast_quantiles) -# test_forecasts <- data.frame( -# model = rep("m1", n_forecasts), -# forecast_date = rep(as.Date("2020-01-01"), n_forecasts), -# location = forecast_locations, -# horizon = forecast_horizons, -# temporal_resolution = rep("wk", n_forecasts), -# target_variable = forecast_target_variables, -# target_end_date = forecast_target_end_dates, -# type = rep("quantile", n_forecasts), -# quantile = forecast_quantile_probs, -# value = forecast_quantiles, -# stringsAsFactors = FALSE -# ) -# -# # make a version that conforms to scoringutils format -# truth_formatted <- data.table::as.data.table(test_truth) -# truth_formatted[, `:=`(model = NULL)] -# data.table::setnames(truth_formatted, old = "value", new = "true_value") -# -# forecasts_formated <- data.table::as.data.table(test_forecasts) -# data.table::setnames(forecasts_formated, old = "value", new = "prediction") -# -# data_formatted <- merge(forecasts_formated, truth_formatted) -# -# eval <- scoringutils::eval_forecasts(data_formatted, -# interval_score_arguments = list(count_median_twice = FALSE)) -# -# actual <- score_forecasts(forecasts = test_forecasts, truth = test_truth, -# use_median_as_point = TRUE) -# -# alpha1 <- 0.2 -# alpha2 <- 0.5 -# expected <- (1 / 2.5) * ( -# 0.5 * abs(y - forecast_quantiles_matrix[, 3]) + -# (forecast_quantiles_matrix[, 5] - forecast_quantiles_matrix[, 1])*(alpha1/2) + c(0, (-2)-(-15), 22-4) + -# (forecast_quantiles_matrix[, 4] - forecast_quantiles_matrix[, 2])*(alpha2/2) + c(0, 1-(-15), 22-3) -# ) -# -# if(!all(eval$interval_score == actual$wis)) { -# warning("eval_forecasts() and covidHubUtils don't match") -# } -# -# expect_equal(eval$interval_score, expected) -# }) +# additional tests from the covidhubutils repo + +test_that("wis is correct, median only - test corresponds to covidHubUtils", { + y <- c(1, -15, 22) + forecast_quantiles_matrix <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) + ) + forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) + forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] + forecast_quantile_probs <- forecast_quantile_probs[3] + + 
target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) + horizons <- c("1", "2", "1") + locations <- c("01", "01", "02") + target_variables <- rep("inc death", length(y)) + + forecast_target_end_dates <- + rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) + forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) + forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) + forecast_target_variables <- + rep(target_variables, times = ncol(forecast_quantiles_matrix)) + forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) + forecast_quantiles <- forecast_quantiles_matrix + dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) + + test_truth <- data.frame( + model = rep("truth_source", length(y)), + target_variable = target_variables, + target_end_date = target_end_dates, + location = locations, + value = y, + stringsAsFactors = FALSE + ) + + n_forecasts <- length(forecast_quantiles) + test_forecasts <- data.frame( + model = rep("m1", n_forecasts), + forecast_date = rep(as.Date("2020-01-01"), n_forecasts), + location = forecast_locations, + horizon = forecast_horizons, + temporal_resolution = rep("wk", n_forecasts), + target_variable = forecast_target_variables, + target_end_date = forecast_target_end_dates, + type = rep("quantile", n_forecasts), + quantile = forecast_quantile_probs, + value = forecast_quantiles, + stringsAsFactors = FALSE + ) + + # make a version that conforms to scoringutils format + truth_formatted <- data.table::as.data.table(test_truth) + truth_formatted[, `:=`(model = NULL)] + data.table::setnames(truth_formatted, old = "value", new = "true_value") + + forecasts_formated <- data.table::as.data.table(test_forecasts) + data.table::setnames(forecasts_formated, old = "value", new = "prediction") + data_formatted <- merge(forecasts_formated, truth_formatted) + + eval <- scoringutils::score(data_formatted, + count_median_twice = FALSE + ) + + expected <- abs(y - forecast_quantiles_matrix[, 1]) + + expect_equal(eval$interval_score, expected) +}) + + + + +test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", { + y <- c(1, -15, 22) + forecast_quantiles_matrix <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) + ) + forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) + forecast_quantiles_matrix <- forecast_quantiles_matrix[, c(1, 5), drop = FALSE] + forecast_quantile_probs <- forecast_quantile_probs[c(1, 5)] + + target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) + horizons <- c("1", "2", "1") + locations <- c("01", "01", "02") + target_variables <- rep("inc death", length(y)) + + forecast_target_end_dates <- + rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) + forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) + forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) + forecast_target_variables <- + rep(target_variables, times = ncol(forecast_quantiles_matrix)) + forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) + forecast_quantiles <- forecast_quantiles_matrix + dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) + + test_truth <- data.frame( + model = rep("truth_source", length(y)), + target_variable = target_variables, + target_end_date = target_end_dates, + location = locations, + value = y, + stringsAsFactors = FALSE + ) + + n_forecasts <- length(forecast_quantiles) + test_forecasts <- data.frame( + model = rep("m1", n_forecasts), + forecast_date = 
rep(as.Date("2020-01-01"), n_forecasts), + location = forecast_locations, + horizon = forecast_horizons, + temporal_resolution = rep("wk", n_forecasts), + target_variable = forecast_target_variables, + target_end_date = forecast_target_end_dates, + type = rep("quantile", n_forecasts), + quantile = forecast_quantile_probs, + value = forecast_quantiles, + stringsAsFactors = FALSE + ) + + # make a version that conforms to scoringutils format + truth_formatted <- data.table::as.data.table(test_truth) + truth_formatted[, `:=`(model = NULL)] + data.table::setnames(truth_formatted, old = "value", new = "true_value") + + forecasts_formated <- data.table::as.data.table(test_forecasts) + data.table::setnames(forecasts_formated, old = "value", new = "prediction") + + data_formatted <- merge(forecasts_formated, truth_formatted) + + eval <- scoringutils::score(data_formatted, + count_median_twice = FALSE + ) + + eval <- summarise_scores(eval, + by = c( + "model", "location", "target_variable", + "target_end_date", "forecast_date", "horizon" + ) + ) + + alpha1 <- 0.2 + expected <- (forecast_quantiles_matrix[, 2] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + + c(0, (-2) - (-15), 22 - 4) + + expect_equal(eval$interval_score, expected) +}) + + +test_that("wis is correct, 2 intervals and median - test corresponds to covidHubUtils", { + y <- c(1, -15, 22) + forecast_quantiles_matrix <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) + ) + forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) + + target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) + horizons <- c("1", "2", "1") + locations <- c("01", "01", "02") + target_variables <- rep("inc death", length(y)) + + forecast_target_end_dates <- + rep(target_end_dates, times = ncol(forecast_quantiles_matrix)) + forecast_horizons <- rep(horizons, times = ncol(forecast_quantiles_matrix)) + forecast_locations <- rep(locations, times = ncol(forecast_quantiles_matrix)) + forecast_target_variables <- + rep(target_variables, times = ncol(forecast_quantiles_matrix)) + forecast_quantile_probs <- rep(forecast_quantile_probs, each = length(y)) + forecast_quantiles <- forecast_quantiles_matrix + dim(forecast_quantiles) <- prod(dim(forecast_quantiles)) + + test_truth <- data.frame( + model = rep("truth_source", length(y)), + target_variable = target_variables, + target_end_date = target_end_dates, + location = locations, + value = y, + stringsAsFactors = FALSE + ) + + n_forecasts <- length(forecast_quantiles) + test_forecasts <- data.frame( + model = rep("m1", n_forecasts), + forecast_date = rep(as.Date("2020-01-01"), n_forecasts), + location = forecast_locations, + horizon = forecast_horizons, + temporal_resolution = rep("wk", n_forecasts), + target_variable = forecast_target_variables, + target_end_date = forecast_target_end_dates, + type = rep("quantile", n_forecasts), + quantile = forecast_quantile_probs, + value = forecast_quantiles, + stringsAsFactors = FALSE + ) + + # make a version that conforms to scoringutils format + truth_formatted <- data.table::as.data.table(test_truth) + truth_formatted[, `:=`(model = NULL)] + data.table::setnames(truth_formatted, old = "value", new = "true_value") + + forecasts_formated <- data.table::as.data.table(test_forecasts) + data.table::setnames(forecasts_formated, old = "value", new = "prediction") + + data_formatted <- merge(forecasts_formated, truth_formatted) + + eval <- scoringutils::score(data_formatted, + count_median_twice = FALSE + ) + + eval <- summarise_scores(eval, + by = c( + "model", 
"location", "target_variable", + "target_end_date", "forecast_date", "horizon" + ) + ) + + alpha1 <- 0.2 + alpha2 <- 0.5 + expected <- (1 / 2.5) * ( + 0.5 * abs(y - forecast_quantiles_matrix[, 3]) + + (forecast_quantiles_matrix[, 5] - forecast_quantiles_matrix[, 1]) * (alpha1 / 2) + c(0, (-2) - (-15), 22 - 4) + + (forecast_quantiles_matrix[, 4] - forecast_quantiles_matrix[, 2]) * (alpha2 / 2) + c(0, 1 - (-15), 22 - 3) + ) + + expect_equal(eval$interval_score, expected) +}) + + + + + +test_that("Quantlie score and interval score yield the same result, weigh = FALSE", { + true_values <- rnorm(10, mean = 1:10) + alphas <- c(0.1, 0.5, 0.9) + + for (alpha in alphas) { + lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) + upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) + + w <- FALSE + is <- interval_score( + true_values = true_values, + lower = lower, + upper = upper, + interval_range = (1 - alpha) * 100, + weigh = w + ) + + qs_lower <- quantile_score(true_values, + predictions = lower, + quantiles = alpha / 2, + weigh = w + ) + qs_upper <- quantile_score(true_values, + predictions = upper, + quantiles = 1 - alpha / 2, + weigh = w + ) + expect_equal((qs_lower + qs_upper) / 2, is) + } +}) + + +test_that("Quantlie score and interval score yield the same result, weigh = TRUE", { + true_values <- rnorm(10, mean = 1:10) + alphas <- c(0.1, 0.5, 0.9) + + for (alpha in alphas) { + lower <- qnorm(alpha / 2, rnorm(10, mean = 1:10)) + upper <- qnorm((1 - alpha / 2), rnorm(10, mean = 1:10)) + + w <- TRUE + is <- interval_score( + true_values = true_values, + lower = lower, + upper = upper, + interval_range = (1 - alpha) * 100, + weigh = w + ) + + qs_lower <- quantile_score(true_values, + predictions = lower, + quantiles = alpha / 2, + weigh = w + ) + qs_upper <- quantile_score(true_values, + predictions = upper, + quantiles = 1 - alpha / 2, + weigh = w + ) + expect_equal((qs_lower + qs_upper) / 2, is) + } +}) diff --git a/tests/testthat/test-lower-level-check-functions.R b/tests/testthat/test-lower-level-check-functions.R new file mode 100644 index 000000000..0f2966cf7 --- /dev/null +++ b/tests/testthat/test-lower-level-check-functions.R @@ -0,0 +1,117 @@ +test_that("Lower-level input check functions work", { + true_values <- rpois(30, lambda = 1:30) + predictions <- replicate(20, rpois(n = 30, lambda = 1:30)) + expect_equal(length(crps_sample(true_values, predictions)), 30) + + # should error when wrong prediction type is given + predictions2 <- rpois(30, lambda = 1) + expect_error(crps_sample(true_values, predictions2), + "'predictions' should be a matrix. 
Instead `integer` was found", + fixed = TRUE + ) + + # predictions have wrong number of rows + predictions3 <- replicate(20, rpois(n = 31, lambda = 1)) + expect_error(crps_sample(true_values, predictions3), + "Mismatch: 'true_values' has length `30`, but 'predictions' has `31` rows.", + fixed = TRUE + ) + + # error with missing argument + expect_error(crps_sample(predictions = predictions), + "true_values argument is missing", + fixed = TRUE + ) + + # checks work for binary forecasts + true_values <- sample(c(0, 1), size = 10, replace = TRUE) + predictions <- runif(n = 10) + expect_equal(length(brier_score(true_values, predictions)), 10) + + # true values are not either 0 or 1 + true_values2 <- true_values + 2 + expect_error(brier_score(true_values2, predictions), + "For a binary forecast, all true_values should be either 0 or 1.", + fixed = TRUE + ) + + # predictions are not between 0 and 1 + predictions2 <- predictions + 2 + expect_error(brier_score(true_values, predictions2), + "For a binary forecast, all predictions should be probabilities between 0 or 1.", + fixed = TRUE + ) +}) + + +test_that("function throws an error when missing true_values or predictions", { + true_values <- sample(c(0, 1), size = 10, replace = TRUE) + predictions <- replicate( + 20, + sample(c(0, 1), size = 10, replace = TRUE) + ) + + expect_error( + brier_score(predictions = predictions), + "true_values argument is missing" + ) + + expect_error( + brier_score(true_values = true_values), + "argument 'predictions' missing" + ) +}) + + + +test_that("function throws an error for wrong format of true_value", { + true_values <- rpois(10, lambda = 1:10) + predictions <- runif(10, min = 0, max = 1) + + expect_error( + brier_score( + true_values = true_values, + predictions = predictions + ), + "For a binary forecast, all true_values should be either 0 or 1." + ) + + true_values <- rnorm(10) + expect_error( + brier_score( + true_values = true_values, + predictions = predictions + ), + "For a binary forecast, all true_values should be either 0 or 1." + ) +}) + +test_that("function throws an error for wrong format of predictions", { + true_values <- sample(c(0, 1), size = 10, replace = TRUE) + predictions <- runif(10, min = 0, max = 3) + expect_error( + brier_score( + true_values = true_values, + predictions = predictions + ), + "For a binary forecast, all predictions should be probabilities between 0 or 1." 
+ ) + + predictions <- runif(10, min = 0, max = 1) + expect_error( + brier_score( + true_values = true_values, + predictions = list(predictions) + ), + "Mismatch: 'true_values' has length `10`, but 'predictions' has length `1`" + ) + + predictions <- runif(15, min = 0, max = 1) + expect_error( + brier_score( + true_values = true_values, + predictions = predictions + ), + "Mismatch: 'true_values' has length `10`, but 'predictions' has length `15`" + ) +}) diff --git a/tests/testthat/test-merge_pred_and_obs.R b/tests/testthat/test-merge_pred_and_obs.R index 1be322de2..0d2df2bdf 100644 --- a/tests/testthat/test-merge_pred_and_obs.R +++ b/tests/testthat/test-merge_pred_and_obs.R @@ -1,26 +1,26 @@ -test_that("merge pred and obs works within eval_forecasts", { +test_that("merge pred and obs works", { + data <- example_quantile + forecasts <- example_quantile_forecasts_only + truth_data <- example_truth_only - data <- scoringutils::quantile_example_data - forecasts <- scoringutils::example_quantile_forecasts_only - truth_data <- scoringutils::example_truth_data_only + eval1 <- score(data = data) - eval1 <- scoringutils::eval_forecasts(data = data) + data2 <- merge_pred_and_obs( + forecasts = forecasts, + observations = truth_data + ) - eval2 <- scoringutils::eval_forecasts(forecasts = forecasts, - truth_data = truth_data) + eval2 <- score(data = data2) data.table::setcolorder(eval1, colnames(eval2)) - eval1 <- eval1[order(geography, value_type, model, value_date)] + eval1 <- eval1[order(location, target_type, model, forecast_date, horizon)] # for some reason merge sometimes turns characters into factors. # Reverse this here. # not sure this needs a general solution eval2[, model := as.character(model)] - eval2 <- eval2[order(geography, value_type, model, value_date)] + eval2 <- eval2[order(location, target_type, model, forecast_date, horizon)] expect_equal(as.data.frame(eval1), as.data.frame(eval2), ignore_attr = TRUE) }) - - - diff --git a/tests/testthat/test-pairwise_comparison.R b/tests/testthat/test-pairwise_comparison.R index c560ebd7e..fe9e7bea6 100644 --- a/tests/testthat/test-pairwise_comparison.R +++ b/tests/testthat/test-pairwise_comparison.R @@ -1,15 +1,21 @@ test_that("pairwise comparisons works", { # define some toy data using a format suitable for github.com/reichlab/covidHubUtils - test_truth <- data.frame(model = rep("truth_source", 30), - location = paste0("location_", rep(1:3, each = 10)), - target_end_date = as.Date("2020-01-01") + rep(1:10, times = 3), - target_variable = "inc death", - value = c(4, 1, 5, 5, 6, 12, 4, 5, 12, 53, 8, 6, 3, 1, 46, 6, 3, 5, - 8, 5, 3, 1, 5, 7, 7, 3, 2, 6, 8, 5)) - test_forecasts <- expand.grid(model = c("m1", "m2", "m3", "m4"), - location = c("location_1", "location_2", "location_3"), - target_end_date = as.Date("2020-01-01") + 1:10, - quantile = c(0.05, 0.25, 0.5, 0.75, 0.95)) + test_truth <- data.frame( + model = rep("truth_source", 30), + location = paste0("location_", rep(1:3, each = 10)), + target_end_date = as.Date("2020-01-01") + rep(1:10, times = 3), + target_variable = "inc death", + value = c( + 4, 1, 5, 5, 6, 12, 4, 5, 12, 53, 8, 6, 3, 1, 46, 6, 3, 5, + 8, 5, 3, 1, 5, 7, 7, 3, 2, 6, 8, 5 + ) + ) + test_forecasts <- expand.grid( + model = c("m1", "m2", "m3", "m4"), + location = c("location_1", "location_2", "location_3"), + target_end_date = as.Date("2020-01-01") + 1:10, + quantile = c(0.05, 0.25, 0.5, 0.75, 0.95) + ) set.seed(123) test_forecasts$value <- rnorm(n = nrow(test_forecasts)) @@ -20,37 +26,56 @@ test_that("pairwise comparisons 
works", { forecasts_formatted <- data.table::as.data.table(test_forecasts) data.table::setnames(forecasts_formatted, old = "value", new = "prediction") - data_formatted <- scoringutils::merge_pred_and_obs(forecasts_formatted, - truth_formatted) + data_formatted <- scoringutils::merge_pred_and_obs( + forecasts_formatted, + truth_formatted + ) # evaluate the toy forecasts, once with and once without a baseline model specified - eval_without_baseline <- scoringutils::eval_forecasts(data_formatted, - by = c("location", "target_end_date", "model"), - compute_relative_skill = TRUE, - interval_score_arguments = list(count_median_twice = FALSE)) - eval_with_baseline <- scoringutils::eval_forecasts(data_formatted, - by = c("location", "target_end_date", "model"), - baseline = "m1", - compute_relative_skill = TRUE, - interval_score_arguments = list(count_median_twice = FALSE)) + eval_without_baseline <- score(data_formatted) + + eval_without_baseline <- summarise_scores(eval_without_baseline, + relative_skill = TRUE, + by = c( + "model", "location", "target_end_date", + "target_variable" + ) + ) + eval_with_baseline <- scoringutils::score(data_formatted, + count_median_twice = FALSE + ) + eval_with_baseline <- summarise_scores(eval_with_baseline, + baseline = "m1", + relative_skill = TRUE, + by = c( + "model", "location", "target_end_date", + "target_variable" + ) + ) + # extract the relative_skill values - relative_skills_without <- eval_without_baseline[, .(model = unique(model), - relative_skill = unique(relative_skill))] - relative_skills_with <- eval_with_baseline[, .(model = unique(model), - relative_skill = unique(scaled_rel_skill))] + relative_skills_without <- eval_without_baseline[, .( + model = unique(model), + relative_skill = unique(relative_skill) + )] + relative_skills_with <- eval_with_baseline[, .( + model = unique(model), + relative_skill = unique(scaled_rel_skill) + )] # prepare scores for the code Johannes Bracher wrote scores_johannes <- data.table::copy(eval_without_baseline) # doesn't matter which one data.table::setnames(scores_johannes, - old = c("location", "target_end_date", "interval_score"), - new = c("unit", "timezero", "wis")) + old = c("location", "target_end_date", "interval_score"), + new = c("unit", "timezero", "wis") + ) # -----------------------------------------------------------------------------# ## rerun code from Johannes Bracher to see whether results agree - pairwise_comparison <- function(scores, mx, my, subset = rep(TRUE, nrow(scores)), - permutation_test = FALSE){ + pairwise_comparison_jb <- function(scores, mx, my, subset = rep(TRUE, nrow(scores)), + permutation_test = FALSE) { # apply subset: scores <- scores[subset, ] @@ -59,23 +84,27 @@ test_that("pairwise comparisons works", { suby <- subset(scores, model == my) # merge together and restrict to overlap: - sub <- merge(subx, suby, by = c("timezero", "unit"), - all.x = FALSE, all.y = FALSE) + sub <- merge(subx, suby, + by = c("timezero", "unit"), + all.x = FALSE, all.y = FALSE + ) # compute ratio: ratio <- sum(sub$wis.x) / sum(sub$wis.y) # perform permutation tests: - if(permutation_test){ + if (permutation_test) { pval <- scoringutils:::permutation_test(sub$wis.x, sub$wis.y, - nPermutation = 999, - comparison_mode = "difference") + n_permutation = 999, + comparison_mode = "difference" + ) # aggregate by forecast date: sub_fcd <- aggregate(cbind(wis.x, wis.y) ~ timezero, data = sub, FUN = mean) pval_fcd <- scoringutils:::permutation_test(sub_fcd$wis.x, sub_fcd$wis.y, - nPermutation = 999) - } else{ + 
n_permutation = 999 + ) + } else { pval <- NULL pval_fcd <- NULL } @@ -85,17 +114,21 @@ test_that("pairwise comparisons works", { models <- paste0("m", 1:4) # matrices to store: - results_ratio <- results_pval <- results_pval_fcd <- matrix(ncol = length(models), - nrow = length(models), - dimnames = list(models, models)) + results_ratio <- results_pval <- results_pval_fcd <- matrix( + ncol = length(models), + nrow = length(models), + dimnames = list(models, models) + ) set.seed(123) # set seed for permutation tests - for(mx in seq_along(models)){ - for(my in 1:mx){ - pwc <- pairwise_comparison(scores = scores_johannes, mx = models[mx], my = models[my], - permutation_test = TRUE) + for (mx in seq_along(models)) { + for (my in 1:mx) { + pwc <- pairwise_comparison_jb( + scores = scores_johannes, mx = models[mx], my = models[my], + permutation_test = TRUE + ) results_ratio[mx, my] <- pwc$ratio - results_ratio[my, mx] <- 1/pwc$ratio + results_ratio[my, mx] <- 1 / pwc$ratio results_pval[mx, my] <- results_pval[my, mx] <- pwc$pval results_pval_fcd[mx, my] <- @@ -113,7 +146,7 @@ test_that("pairwise comparisons works", { ind_baseline <- which(rownames(results_ratio) == "m1") geom_mean_ratios <- exp(rowMeans(log(results_ratio[, -ind_baseline]), na.rm = TRUE)) ratios_baseline <- results_ratio[, "m1"] - ratios_scaled <- geom_mean_ratios/geom_mean_ratios["m1"] + ratios_scaled <- geom_mean_ratios / geom_mean_ratios["m1"] names(ratios_scaled) <- NULL expect_equal(relative_skills_with$relative_skill, ratios_scaled) @@ -122,17 +155,21 @@ test_that("pairwise comparisons works", { # scoringutils can also do pairwise comparisons for different subcategories # comparison for a subset of the data vs. spliting by category within scoringutils scores_johannes_subset <- scores_johannes[unit == "location_3"] - results_ratio <- results_pval <- results_pval_fcd <- matrix(ncol = length(models), - nrow = length(models), - dimnames = list(models, models)) + results_ratio <- results_pval <- results_pval_fcd <- matrix( + ncol = length(models), + nrow = length(models), + dimnames = list(models, models) + ) set.seed(123) # set seed for permutation tests - for(mx in seq_along(models)){ - for(my in 1:mx){ - pwc <- pairwise_comparison(scores = scores_johannes_subset, mx = models[mx], my = models[my], - permutation_test = TRUE) + for (mx in seq_along(models)) { + for (my in 1:mx) { + pwc <- pairwise_comparison_jb( + scores = scores_johannes_subset, mx = models[mx], my = models[my], + permutation_test = TRUE + ) results_ratio[mx, my] <- pwc$ratio - results_ratio[my, mx] <- 1/pwc$ratio + results_ratio[my, mx] <- 1 / pwc$ratio results_pval[mx, my] <- results_pval[my, mx] <- pwc$pval results_pval_fcd[mx, my] <- @@ -142,17 +179,82 @@ test_that("pairwise comparisons works", { ind_baseline <- which(rownames(results_ratio) == "m1") geom_mean_ratios <- exp(rowMeans(log(results_ratio[, -ind_baseline]), na.rm = TRUE)) ratios_baseline <- results_ratio[, "m1"] - ratios_scaled <- geom_mean_ratios/geom_mean_ratios["m1"] + ratios_scaled <- geom_mean_ratios / geom_mean_ratios["m1"] names(ratios_scaled) <- NULL - eval_with_baseline <- scoringutils::eval_forecasts(data_formatted, - summarise_by = c("model", "location"), - baseline = "m1", - compute_relative_skill = TRUE, - interval_score_arguments = list(count_median_twice = FALSE)) - relative_skills_with <- eval_with_baseline[location == "location_3", - .(model = unique(model), - relative_skill = unique(scaled_rel_skill))] + eval_with_baseline <- scoringutils::score(data_formatted, + 
count_median_twice = FALSE + ) + eval_with_baseline <- summarise_scores(eval_with_baseline, + baseline = "m1", + relative_skill = TRUE, + by = c("model", "location") + ) + + relative_skills_with <- eval_with_baseline[ + location == "location_3", + .( + model = unique(model), + relative_skill = unique(scaled_rel_skill) + ) + ] expect_equal(relative_skills_with$relative_skill, ratios_scaled) }) + +test_that("Pairwise comparisons work in score() with integer data", { + eval <- score(data = example_integer) + eval <- summarise_scores(eval, by = "model", relative_skill = TRUE) + + expect_true("relative_skill" %in% colnames(eval)) +}) + + +test_that("Pairwise comparisons work in score() with binary data", { + eval <- score(data = example_binary) + eval <- summarise_scores(eval, by = "model", relative_skill = TRUE) + + expect_true("relative_skill" %in% colnames(eval)) +}) + + +# tests for pairwise comparison function --------------------------------------- + +test_that("pairwise_comparison() works", { + df <- data.frame( + model = rep(c("model1", "model2", "model3"), each = 10), + date = as.Date("2020-01-01") + rep(1:5, each = 2), + location = c(1, 2), + interval_score = (abs(rnorm(30))), + ae_median = (abs(rnorm(30))) + ) + + res <- pairwise_comparison(df, + baseline = "model1" + ) + + colnames <- c( + "model", "compare_against", "mean_scores_ratio", + "pval", "adj_pval", "relative_skill", "scaled_rel_skill" + ) + + expect_true(all(colnames %in% colnames(res))) +}) + + +test_that("pairwise_comparison() works inside and outside of score()", { + eval <- score(data = example_continuous) + + pairwise <- pairwise_comparison(eval, + by = "model", + metric = "crps" + ) + + eval2 <- score(data = example_continuous) + eval2 <- summarise_scores(eval2, by = "model", relative_skill = TRUE) + + expect_equal( + sort(unique(pairwise$relative_skill)), + sort(eval2$relative_skill) + ) +}) diff --git a/tests/testthat/test-pit.R b/tests/testthat/test-pit.R index f79d9b902..9d4564446 100644 --- a/tests/testthat/test-pit.R +++ b/tests/testthat/test-pit.R @@ -1,69 +1,62 @@ -test_that("function throws an error when missing true_values", - { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error(pit(predictions = predictions), - "true_values` or `predictions` missing in function 'pit()") - }) - -test_that("function throws an error when missing 'predictions'", - { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error(pit(predictions = predictions), - "true_values` or `predictions` missing in function 'pit()") - }) - - -test_that("function works for integer true_values and predictions", - { - true_values <- rpois(10, lambda = 1:10) - predictions <- replicate(10, rpois(10, lambda = 1:10)) - output <- pit(true_values = true_values, - predictions = predictions) - expect_equal(length(output), - 3) - expect_equal(class(output), - "list") - expect_equal(class(output[[1]]), - "numeric") - }) - -test_that("function works for continuous true_values and predictions", - { - true_values <- rnorm(10) - predictions <- replicate(10, rnorm(10)) - output <- pit(true_values = true_values, - predictions = predictions) - expect_equal(length(output), - 3) - expect_equal(class(output), - "list") - expect_equal(class(output[[1]]), - "numeric") - }) - - - - -# compare results scoringutils2 with scoringutils - -test_that("function works for continuous true_values and predictions", - { - ## continuous predictions - 
true_values <- rnorm(30, mean = 1:30) - predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) - scoringutils2 <- scoringutils::pit(true_values, predictions) - scoringutils <- scoringutils::pit(true_values, predictions) - - expect_equal(scoringutils2$p_value, scoringutils$p_value) - }) - - - - - - - +test_that("pit_sample() function throws an error when missing true_values", { + true_values <- rpois(10, lambda = 1:10) + predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + pit_sample(predictions = predictions), + "true_values` or `predictions` missing in function 'pit_sample()" + ) +}) + +test_that("pit_sample() function throws an error when missing 'predictions'", { + true_values <- rpois(10, lambda = 1:10) + predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + pit_sample(predictions = predictions), + "true_values` or `predictions` missing in function 'pit_sample()" + ) +}) + + +test_that("pit_sample() function works for integer true_values and predictions", { + true_values <- rpois(10, lambda = 1:10) + predictions <- replicate(10, rpois(10, lambda = 1:10)) + output <- pit_sample( + true_values = true_values, + predictions = predictions, + n_replicates = 56 + ) + expect_equal( + length(output), + 560 + ) +}) + +test_that("pit_sample() function works for continuous true_values and predictions", { + true_values <- rnorm(10) + predictions <- replicate(10, rnorm(10)) + output <- pit_sample( + true_values = true_values, + predictions = predictions, + n_replicates = 56 + ) + expect_equal( + length(output), + 10 + ) +}) + +test_that("pit function works for continuous integer and quantile data", { + pit1 <- pit(example_quantile, by = "model") + pit2 <- pit(example_continuous, + by = c("model", "target_type") + ) + pit3 <- pit(example_integer, + by = c("model", "location") + ) + + expect_equal(names(pit1), c("model", "quantile", "pit_value")) + expect_equal(names(pit2), c("model", "target_type", "pit_value")) + expect_equal(names(pit3), c("model", "location", "pit_value")) +}) diff --git a/tests/testthat/test-plot_predictions.R b/tests/testthat/test-plot_predictions.R index 2f1cdacaf..97c9bd9fa 100644 --- a/tests/testthat/test-plot_predictions.R +++ b/tests/testthat/test-plot_predictions.R @@ -1,59 +1,90 @@ test_that("plot_predictions() works with point forecasts", { - - d <- range_example_data_long - d <- d[d$range == 0 | is.na(d$range), ] + d <- scoringutils::example_quantile + d <- d[d$quantile == 0.5 | is.na(d$quantile), ] p <- scoringutils::plot_predictions( d, - x = "value_date", - filter_truth = list('value_date <= "2020-06-22"', - 'value_date > "2020-05-01"'), - filter_forecasts = list("model == 'SIRCOVID'", - 'creation_date == "2020-06-22"'), + x = "target_end_date", + filter_truth = list( + 'target_end_date <= "2021-07-22"', + 'target_end_date > "2021-05-01"' + ), + filter_forecasts = list( + "model == 'EuroCOVIDhub-ensemble'", + 'forecast_date == "2021-06-07"' + ), allow_truth_without_pred = TRUE, - facet_formula = geography ~ value_desc + facet_formula = location ~ target_type ) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger('point_forecasts', p) - + vdiffr::expect_doppelganger("point_forecasts", p) }) test_that("plot_predictions() can handle an arbitrary number of quantiles", { - - example2 <- scoringutils::range_example_data_long + example2 <- scoringutils::example_quantile p <- scoringutils::plot_predictions( - example2, x = "value_date", - filter_truth = list('value_date <= "2020-06-22"', - 'value_date 
> "2020-05-01"'), - filter_forecasts = list("model == 'SIRCOVID'", - 'creation_date == "2020-06-22"'), + example2, + x = "target_end_date", + filter_truth = list( + 'target_end_date <= "2021-07-22"', + 'target_end_date > "2021-05-01"' + ), + filter_forecasts = list( + "model == 'EuroCOVIDhub-ensemble'", + 'forecast_date == "2021-06-07"' + ), allow_truth_without_pred = TRUE, - facet_formula = geography ~ value_desc, + facet_formula = location ~ target_type, range = c(0, 10, 20, 30, 40, 50, 60) ) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger('many_quantiles', p) + vdiffr::expect_doppelganger("many_quantiles", p) - example1 <- scoringutils::continuous_example_data + example1 <- scoringutils::example_continuous p2 <- scoringutils::plot_predictions( - example1, x = "value_date", - filter_truth = list('value_date <= "2020-06-22"', - 'value_date > "2020-05-01"'), - filter_forecasts = list("model == 'SIRCOVID'", - 'creation_date == "2020-06-22"'), - facet_formula = geography ~ value_desc, + example1, + x = "target_end_date", + filter_truth = list( + 'target_end_date <= "2021-07-22"', + 'target_end_date > "2021-05-01"' + ), + filter_forecasts = list( + "model == 'EuroCOVIDhub-ensemble'", + 'forecast_date == "2021-06-07"' + ), + facet_formula = location ~ target_type, range = c(0, 50, 90, 95) ) expect_s3_class(p2, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger('many_quantiles_from_sample', p2) + vdiffr::expect_doppelganger("many_quantiles_from_sample", p2) }) +test_that("plot_predictions() works without median", { + + example3 <- subset( + scoringutils::example_quantile, + is.na(quantile) | quantile != 0.5 + ) + + p <- scoringutils::plot_predictions( + example3, x = "target_end_date", + filter_truth = list("target_end_date > '2021-06-25'", + "target_end_date <= '2021-07-12'"), + filter_forecasts = list("model == 'EuroCOVIDhub-ensemble'", + "forecast_date == '2021-07-12'"), + facet_formula = location_name ~ target_type + ) + expect_s3_class(p, "ggplot") + skip_on_cran() + vdiffr::expect_doppelganger('no_median', p) + +}) diff --git a/tests/testthat/test-score.R b/tests/testthat/test-score.R new file mode 100644 index 000000000..452bb7d02 --- /dev/null +++ b/tests/testthat/test-score.R @@ -0,0 +1,153 @@ +# common error handling -------------------------------------------------------- +test_that("function throws an error if data is missing", { + expect_error(score(data = NULL)) +}) + +test_that("score() warns if column name equals a metric name", { + data <- data.frame( + true_value = rep(1:10, each = 2), + prediction = rep(c(-0.3, 0.3), 10) + rep(1:10, each = 2), + model = "Model 1", + date = as.Date("2020-01-01") + rep(1:10, each = 2), + quantile = rep(c(0.1, 0.9), times = 10), + bias = 3 + ) + + expect_warning(score(data = data)) +}) + + + +# test binary case ------------------------------------------------------------- +test_that("function produces output for a binary case", { + binary_example <- data.table::setDT(scoringutils::example_binary) + eval <- score(binary_example[!is.na(prediction)]) + eval <- summarise_scores(eval, by = c("model", "target_type")) + + expect_equal( + nrow(eval) > 1, + TRUE + ) + expect_equal( + colnames(eval), + c( + "model", "target_type", + "brier_score", + "log_score" + ) + ) +}) + + +test_that("function produces score for a binary case", { + binary_example <- data.table::setDT(scoringutils::example_binary) + eval <- score(binary_example[!is.na(prediction)]) + eval <- summarise_scores(eval, by = c("model", "target_type")) + 
expect_true("brier_score" %in% names(eval)) +}) + + + + +# test quantile case ----------------------------------------------------------- +test_that("function produces output for a quantile format case", { + quantile_example <- data.table::setDT(scoringutils::example_quantile) + eval <- score(quantile_example[!is.na(prediction)]) + + expect_equal( + nrow(eval) > 1, + TRUE + ) +}) + +test_that("score() quantile produces desired metrics", { + data <- data.frame( + true_value = rep(1:10, each = 2), + prediction = rep(c(-0.3, 0.3), 10) + rep(1:10, each = 2), + model = "Model 1", + date = as.Date("2020-01-01") + rep(1:10, each = 2), + quantile = rep(c(0.1, 0.9), times = 10) + ) + + out <- score(data = data) + metric_names <- c( + "dispersion", "underprediction", "overprediction", + "bias", "ae_median", "coverage_deviation" + ) + + expect_true(all(metric_names %in% colnames(out))) +}) + + +test_that("calculation of ae_median is correct for a quantile format case", { + eval <- score(scoringutils::example_quantile[!is.na(prediction)]) + + eval <- summarise_scores(eval, by = c("model")) + + example <- scoringutils::example_quantile + ae <- example[quantile == 0.5, ae := abs(true_value - prediction)][!is.na(model), .(mean = mean(ae, na.rm = TRUE)), + by = "model" + ]$mean + + expect_equal(sort(eval$ae_median), sort(ae)) +}) + + +test_that("all quantile and range formats yield the same result", { + quantile_example1 <- data.table::setDT(scoringutils::example_quantile) + + eval1 <- score(quantile_example1[!is.na(prediction)]) + eval1 <- summarise_scores(eval1, by = "model") + + ae <- quantile_example1[quantile == 0.5, ae := abs(true_value - prediction)][!is.na(model), .(mean = mean(ae, na.rm = TRUE)), + by = "model" + ]$mean + + expect_equal(sort(eval1$ae_median), sort(ae)) +}) + +test_that("function produces output even if only some metrics are chosen", { + example <- scoringutils::example_quantile + + eval <- scoringutils::score(example, metrics = "coverage") + + expect_equal( + nrow(eval) > 1, + TRUE + ) +}) + +test_that("WIS is the same with other metrics omitted or included", { + eval <- scoringutils::score(example_quantile, + metrics = "interval_score" + ) + + eval2 <- scoringutils::score(example_quantile) + + expect_equal( + sum(eval$interval_score), + sum(eval2$interval_score) + ) +}) + + + + + +# test integer and continuous case --------------------------------------------- +test_that("function produces output for a continuous format case", { + example <- data.table::setDT(scoringutils::example_continuous) + eval <- score(example[!is.na(prediction)]) + + eval2 <- scoringutils::score(example) + + data.table::setcolorder(eval2, colnames(eval)) + eval <- eval[order(model)] + eval2 <- eval2[order(model)] + all(eval == eval2, na.rm = TRUE) + + expect_equal( + nrow(eval) > 1, + TRUE + ) +}) diff --git a/tests/testthat/test-sharpness.R b/tests/testthat/test-sharpness.R index 6987eda1e..4d50190e5 100644 --- a/tests/testthat/test-sharpness.R +++ b/tests/testthat/test-sharpness.R @@ -1,8 +1,8 @@ -test_that("function throws an error when missing 'predictions'", - { - predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error(sharpness(), - "predictions argument missing") - }) +test_that("function throws an error when missing 'predictions'", { + predictions <- replicate(50, rpois(n = 10, lambda = 1:10)) + expect_error( + mad_sample(), + "argument 'predictions' missing" + ) +}) diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R new file mode 
100644 index 000000000..34a7ad33b --- /dev/null +++ b/tests/testthat/test-summarise_scores.R @@ -0,0 +1,31 @@ +test_that("summarise_scores() works without any arguments", { + scores <- score(example_quantile) + expect_true("quantile" %in% names(scores)) + + scores <- summarise_scores(scores) + expect_false("quantile" %in% names(scores)) + + s2 <- summarise_scores(scores, + by = c( + "location", "target_end_date", "target_type", + "location_name", "forecast_date", "model", + "horizon" + ) + ) + + expect_equal(nrow(scores), nrow(s2)) +}) + +test_that("summarise_scores() handles wrong by argument well", { + scores <- score(example_quantile) + + expect_error(summarise_scores(scores, by = "not_present"), + "The following items in `by` are notvalid column names of the data: 'not_present'. Check and run `summarise_scores()` again", + fixed = TRUE + ) + + expect_error(summarise_scores(scores, by = "sample"), + "The following items in `by` are notvalid column names of the data: 'sample'. Check and run `summarise_scores()` again", + fixed = TRUE + ) +}) diff --git a/tests/testthat/test-utils_data_handling.R b/tests/testthat/test-utils_data_handling.R index ce2417eec..e911c0feb 100644 --- a/tests/testthat/test-utils_data_handling.R +++ b/tests/testthat/test-utils_data_handling.R @@ -1,84 +1,48 @@ -test_that("range_long_to_wide works", { - long <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), - range = 50, - boundary = rep(c("lower", "upper"), each = 10)) - - wide <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - lower_50 = c(2:11), - upper_50 = 4:13) - wide2 <- as.data.frame(scoringutils::range_long_to_wide(long)) - expect_equal(wide, wide2) -}) - - - -test_that("range_wide_to_long works", { - wide <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - lower_50 = c(2:11), - upper_50 = 4:13) - - long <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), - range = 50, - boundary = rep(c("lower", "upper"), each = 10)) - - long2 <- as.data.frame(scoringutils::range_wide_to_long(wide)) - - # for some reason this is needed to pass the unit tests on gh actions - long2$boundary <- as.character(long2$boundary) - long$boundary <- as.character(long$boundary) - - data.table::setcolorder(long2, names(long)) - - expect_equal(long, as.data.frame(long2), ignnore_attr = TRUE) -}) - - test_that("range_long_to_quantile works", { - long <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), - range = 50, - boundary = rep(c("lower", "upper"), each = 10)) - - quantile <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), - quantile = rep(c(0.25, 0.75), each = 10)) - - quantile2 <- as.data.frame(scoringutils::range_long_to_quantile(long)) + long <- data.frame( + date = as.Date("2020-01-01") + 1:10, + model = "model1", + true_value = 1:10, + prediction = c(2:11, 4:13), + range = 50, + boundary = rep(c("lower", "upper"), each = 10) + ) + + quantile <- data.frame( + date = as.Date("2020-01-01") + 1:10, + model = "model1", + true_value = 1:10, + prediction = c(2:11, 4:13), + quantile = rep(c(0.25, 0.75), each = 10) + ) + + quantile2 <- as.data.frame(scoringutils:::range_long_to_quantile(long)) expect_equal(quantile, quantile2) }) test_that("quantile_to_range_long works", { - 
quantile <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), - quantile = rep(c(0.25, 0.75), each = 10)) - - long <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), - range = 50, - boundary = rep(c("lower", "upper"), each = 10)) - - long2 <- as.data.frame(scoringutils::quantile_to_range_long(quantile, - keep_quantile_col = FALSE)) + quantile <- data.frame( + date = as.Date("2020-01-01") + 1:10, + model = "model1", + true_value = 1:10, + prediction = c(2:11, 4:13), + quantile = rep(c(0.25, 0.75), each = 10) + ) + + long <- data.frame( + date = as.Date("2020-01-01") + 1:10, + model = "model1", + true_value = 1:10, + prediction = c(2:11, 4:13), + range = 50, + boundary = rep(c("lower", "upper"), each = 10) + ) + + long2 <- as.data.frame(scoringutils:::quantile_to_range_long(quantile, + keep_quantile_col = FALSE + )) data.table::setcolorder(long2, names(long)) @@ -86,23 +50,26 @@ test_that("quantile_to_range_long works", { long2$boundary <- as.character(long2$boundary) long$boundary <- as.character(long$boundary) - expect_equal(long, as.data.frame(long2), ignnore_attr = TRUE) + expect_equal(long, as.data.frame(long2)) }) test_that("sample_to_quantiles works", { - - samples <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(rep(0, 10), 2:11, 3:12, 4:13, rep(100, 10)), - sample = rep(1:5, each = 10)) - - quantile <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), - quantile = rep(c(0.25, 0.75), each = 10)) + samples <- data.frame( + date = as.Date("2020-01-01") + 1:10, + model = "model1", + true_value = 1:10, + prediction = c(rep(0, 10), 2:11, 3:12, 4:13, rep(100, 10)), + sample = rep(1:5, each = 10) + ) + + quantile <- data.frame( + date = as.Date("2020-01-01") + 1:10, + model = "model1", + true_value = 1:10, + prediction = c(2:11, 4:13), + quantile = rep(c(0.25, 0.75), each = 10) + ) quantile2 <- scoringutils::sample_to_quantile(samples, quantiles = c(0.25, 0.75)) @@ -115,22 +82,27 @@ test_that("sample_to_quantiles works", { test_that("sample_to_range_long works", { - samples <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(rep(0, 10), 2:11, 3:12, 4:13, rep(100, 10)), - sample = rep(1:5, each = 10)) - - long <- data.frame(date = as.Date("2020-01-01") + 1:10, - model = "model1", - true_value = 1:10, - prediction = c(2:11, 4:13), - range = 50, - boundary = rep(c("lower", "upper"), each = 10)) - - long2 <- scoringutils::sample_to_range_long(samples, - range = 50, - keep_quantile_col = FALSE) + samples <- data.frame( + date = as.Date("2020-01-01") + 1:10, + model = "model1", + true_value = 1:10, + prediction = c(rep(0, 10), 2:11, 3:12, 4:13, rep(100, 10)), + sample = rep(1:5, each = 10) + ) + + long <- data.frame( + date = as.Date("2020-01-01") + 1:10, + model = "model1", + true_value = 1:10, + prediction = c(2:11, 4:13), + range = 50, + boundary = rep(c("lower", "upper"), each = 10) + ) + + long2 <- scoringutils:::sample_to_range_long(samples, + range = 50, + keep_quantile_col = FALSE + ) long2 <- long2[order(model, boundary, date)] data.table::setcolorder(long2, names(long)) @@ -138,7 +110,5 @@ test_that("sample_to_range_long works", { long2$boundary <- as.character(long2$boundary) long$boundary <- as.character(long$boundary) - expect_equal(long, as.data.frame(long2), 
ignnore_attr = TRUE) + expect_equal(long, as.data.frame(long2)) }) - - diff --git a/vignettes/getting-started.Rmd b/vignettes/getting-started.Rmd new file mode 100644 index 000000000..06912a9bb --- /dev/null +++ b/vignettes/getting-started.Rmd @@ -0,0 +1,322 @@ +--- +title: "Getting started" +author: "Nikos Bosse" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Getting started} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, + fig.width = 7, + collapse = TRUE, + comment = "#>") +library(scoringutils) +library(magrittr) +library(data.table) +library(ggplot2) +library(knitr) +``` + +The `scoringutils` package provides a collection of metrics and proper scoring rules that make it simple to score probabilistic forecasts against the true observed values. The `scoringutils` package offers convenient automated forecast evaluation in a `data.table` format (using the function `score()`), but also provides experienced users with a set of reliable lower-level scoring metrics operating on vectors/matrices that they can build upon in other applications. In addition, it implements a wide range of flexible plots that are able to cover many use cases. + +The goal of this package is to provide a tested and reliable collection of metrics that can be used for scoring probabilistic forecasts (forecasts with a full predictive distribution, rather than point forecasts). It has a much stronger focus on convenience than e.g. the `scoringRules` package, which provides a comprehensive collection of proper scoring rules (also used in `scoringutils`). In contrast to other packages, `scoringutils` offers functionality to automatically evaluate forecasts, to visualise scores and to obtain relative scores between models. + +Predictions can be handled in various formats: `scoringutils` can handle probabilistic forecasts in either a sample-based or a quantile-based format. For more detail on the expected input formats please see below. True values can be integer, continuous or binary. + +## Input formats + +Most of the time, the `score()` function will be able to do the entire evaluation for you. All you need to do is to pass in a `data.frame` with the appropriate columns. Which columns are required depends on the format the forecasts come in. The forecast format can either be based on quantiles (see `example_quantile` for the expected format), based on predictive samples (see `example_continuous` and `example_integer` for the expected format in each case) or in a binary format. The following table gives an overview (pairwise comparisons will be explained in more detail below): + +```{r, echo=FALSE} +requirements <- data.table( + "Format" = c( + "quantile-based", "sample-based", "binary", "pairwise-comparisons" + ), + `Required columns` = c( + "'true_value', 'prediction', 'quantile'", "'true_value', 'prediction', + 'sample'", "'true_value', 'prediction'", "additionally a column 'model'" + ) +) +kable(requirements) +``` + +Additional columns may be present to indicate a grouping of forecasts. For example, we could have forecasts made by different models in various locations at different time points, each for several weeks into the future. It is important that only those columns are present which are relevant for grouping the forecasts.
+
+
+
+## Checking the input data
+
+The function `check_forecasts()` can be used to check the input data. It gives a summary of what `scoringutils` thinks you are trying to achieve. It infers the type of the prediction target, the prediction format, and the unit of a single forecast, gives an overview of the number of unique values per column (helpful for spotting missing data) and returns warnings or errors.
+
+```{r}
+head(example_quantile)
+```
+
+```{r}
+check_forecasts(example_quantile)
+```
+
+If you are unsure what your input data should look like, have a look at the `example_quantile`, `example_integer`, `example_continuous` and `example_binary` data sets provided in the package.
+
+## Showing available forecasts
+
+The function `avail_forecasts()` may also be helpful to determine where forecasts are available. Using the `by` argument you can specify the level of summary. For example, to see how many forecasts there are per model and `target_type`, we can run
+
+```{r}
+avail_forecasts(example_quantile, by = c("model", "target_type"))
+```
+
+We see that 'epiforecasts-EpiNow2' has some missing forecasts for the deaths forecast target and that UMass-MechBayes has no case forecasts.
+
+This information can also be visualised using the `plot_avail_forecasts()` function:
+
+```{r, fig.width=11, fig.height=6}
+example_quantile %>%
+  avail_forecasts(by = c("model", "forecast_date", "target_type")) %>%
+  plot_avail_forecasts() +
+  facet_wrap(~ target_type)
+```
+
+You can also visualise forecasts directly using the `plot_predictions()` function:
+
+```{r, fig.width = 9, fig.height = 6}
+example_quantile %>%
+  plot_predictions(
+    x = "target_end_date",
+    filter_truth = list(
+      'target_end_date <= "2021-07-15"', 'target_end_date > "2021-05-22"'
+    ),
+    filter_forecasts = list(
+      "model == 'EuroCOVIDhub-ensemble'", 'forecast_date == "2021-06-28"'
+    )
+  ) +
+  facet_wrap(target_type ~ location, ncol = 4, scales = "free") +
+  theme(legend.position = "bottom")
+```
+
+## Scoring and summarising forecasts
+
+Forecasts can easily be scored using the `score()` function. This function returns unsummarised scores, which in most cases is not what the user wants. A second function, `summarise_scores()`, takes care of summarising these scores to the level specified by the user. If you like, you can also use `summarise_scores()` to round your outputs by specifying e.g. `signif()` as a summary function.
+
+```{r}
+score(example_quantile) %>%
+  head()
+```
+
+```{r}
+example_quantile %>%
+  score() %>%
+  summarise_scores(by = c("model", "target_type")) %>%
+  summarise_scores(fun = signif, digits = 2) %>%
+  kable()
+```
+
+The `by` argument can be used to define the level of summary. By default, `by = NULL` corresponds to the unit of a single forecast. For quantile-based forecasts, unsummarised scores are returned for every quantile individually. It can therefore make sense to run `summarise_scores()` even without any arguments provided.
+
+```{r}
+score(example_quantile) %>%
+  summarise_scores()
+```
+
+### Adding empirical coverage
+
+For quantile-based forecasts we are often interested in specific coverage levels, for example, what percentage of true values fell within the 50% or the 90% prediction intervals. We can add this information using the function `add_coverage()`.
This function also requires a `by` argument which defines the level of grouping for which the percentage of true values covered by certain prediction intervals is computed.
+
+```{r}
+score(example_quantile) %>%
+  add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>%
+  summarise_scores(by = c("model", "target_type")) %>%
+  summarise_scores(fun = signif, digits = 2)
+```
+
+### Adding relative scores
+
+In order to better compare models against each other, we can use relative scores, which are computed based on pairwise comparisons (see details below). Relative scores can be added to the evaluation using the function `summarise_scores()`. This requires a column called 'model' to be present. Pairwise comparisons are computed according to the grouping specified in `by`: essentially, the data.frame with all scores gets split into different data.frames according to the values specified in `by` and relative scores are computed for every individual group separately. The `baseline` argument allows us to specify a baseline that can be used to scale relative scores (all scores are divided by the baseline relative score). For example, to obtain relative scores separately for different forecast targets, we can run
+
+```{r}
+score(example_quantile) %>%
+  summarise_scores(by = c("model", "target_type"),
+                   relative_skill = TRUE,
+                   baseline = "EuroCOVIDhub-ensemble")
+```
+
+
+## Visualising scores
+
+### Coloured table
+
+A simple coloured table can be produced based on the scores:
+
+```{r}
+score(example_integer) %>%
+  summarise_scores(by = c("model")) %>%
+  plot_score_table()
+```
+
+### Score heatmap
+
+We can also summarise one particular metric across different categories using a simple heatmap:
+
+```{r, fig.width=11, fig.height=6}
+score(example_continuous) %>%
+  summarise_scores(by = c("model", "location", "target_type")) %>%
+  plot_heatmap(x = "location", metric = "bias") +
+  facet_wrap(~ target_type)
+```
+
+### Weighted interval score components
+
+The weighted interval score can be split up into three components: over-prediction, under-prediction and dispersion. These can be visualised separately in the following way:
+
+```{r}
+score(example_quantile) %>%
+  summarise_scores(by = c("target_type", "model")) %>%
+  plot_wis() +
+  facet_wrap(~ target_type, scales = "free")
+```
+
+## Calibration
+
+Calibration is a measure of the statistical consistency between the forecasts and the observed values. The most common way of assessing calibration (more precisely: probabilistic calibration) is to look at PIT histograms. The probability integral transform (PIT) is equal to the cumulative distribution function of a forecast evaluated at the true observed value. Ideally, PIT values should be uniformly distributed after the transformation.
+
+We can compute PIT values as follows:
+
+```{r}
+example_continuous %>%
+  pit(by = "model")
+```
+
+And visualise the results like so:
+
+```{r}
+example_continuous %>%
+  pit(by = c("model", "target_type")) %>%
+  plot_pit() +
+  facet_grid(model ~ target_type)
+```
+
+Similarly for quantile-based forecasts:
+
+```{r}
+example_quantile[quantile %in% seq(0.1, 0.9, 0.1), ] %>%
+  pit(by = c("model", "target_type")) %>%
+  plot_pit() +
+  facet_grid(model ~ target_type)
+```
+
+Other ways to look at calibration are interval coverage and quantile coverage. Interval coverage is the percentage of true values that fall inside a given central prediction interval. Quantile coverage is the percentage of observed values that fall below a given quantile level.
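+
+To make these definitions concrete, here is a small, purely illustrative sketch that computes empirical quantile coverage directly from the example data, i.e. the proportion of observed values that fall at or below each predictive quantile (the plotting functions shown below work from the summarised scores instead):
+
+```{r}
+# Empirical quantile coverage computed by hand: the share of true values that
+# are less than or equal to the corresponding predictive quantile.
+# Rows without a prediction (if any) are dropped first.
+as.data.table(example_quantile)[!is.na(prediction),
+  .(quantile_coverage = mean(true_value <= prediction)),
+  by = c("model", "quantile")
+]
+```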
+
+In order to plot interval coverage, you need to include "range" in the `by` argument to `summarise_scores()`. The green area on the plot marks conservative behaviour, i.e. your empirical coverage is greater than it nominally needs to be (e.g. 55% of true values covered by the 50% central prediction intervals).
+
+```{r}
+example_quantile %>%
+  score() %>%
+  summarise_scores(by = c("model", "range")) %>%
+  plot_interval_coverage()
+```
+
+To visualise quantile coverage, you need to include "quantile" in `by`. Again, the green area corresponds to conservative forecasts, where central prediction intervals would cover more than needed.
+
+```{r}
+example_quantile %>%
+  score() %>%
+  summarise_scores(by = c("model", "quantile")) %>%
+  plot_quantile_coverage()
+```
+
+## Pairwise comparisons
+
+Relative scores for different models can be computed using pairwise comparisons, a sort of pairwise tournament where all combinations of two models are compared against each other based on the overlapping set of available forecasts common to both models. Internally, a ratio of the mean scores of both models is computed. The relative score of a model is then the geometric mean of all mean score ratios which involve that model. When a baseline is provided, that baseline is excluded from the relative scores for individual models (which therefore differ slightly from relative scores without a baseline) and all relative scores are scaled by (i.e. divided by) the relative score of the baseline model. A small sketch of the underlying mean score ratio is shown at the end of this section.
+
+In `scoringutils`, pairwise comparisons can be made in two ways: through the standalone function `pairwise_comparison()` or from within `summarise_scores()`, which simply adds relative scores to an existing set of scores.
+
+```{r}
+example_quantile %>%
+  score() %>%
+  pairwise_comparison(by = "model", baseline = "EuroCOVIDhub-baseline")
+```
+
+```{r}
+example_quantile %>%
+  score() %>%
+  summarise_scores(
+    by = "model", relative_skill = TRUE, baseline = "EuroCOVIDhub-baseline"
+  )
+```
+
+If using the `pairwise_comparison()` function, we can also visualise pairwise comparisons by showing the mean score ratios between models. By default, smaller values are better; the model we care about is shown on the y-axis on the left, while the model it is compared against is shown on the x-axis on the bottom. In the example above, the EuroCOVIDhub-ensemble performs best (it only has values smaller than 1), while the EuroCOVIDhub-baseline performs worst (and only has values larger than 1). For cases, the UMass-MechBayes model is of course excluded as there are no case forecasts available and therefore the set of overlapping forecasts is empty.
+
+```{r, fig.width=9, fig.height=7}
+example_quantile %>%
+  score() %>%
+  pairwise_comparison(by = c("model", "target_type")) %>%
+  plot_pairwise_comparison() +
+  facet_wrap(~ target_type)
+```
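+
+To illustrate the idea of a mean score ratio, here is a rough, hand-rolled sketch (the real `pairwise_comparison()` additionally restricts each pair of models to their overlapping forecasts, as described above). It assumes that the weighted interval score column returned by `score()` is called `interval_score`, as elsewhere in this version of the package:
+
+```{r}
+# Rough illustration only: the ratio of two models' mean weighted interval
+# scores. pairwise_comparison() computes such ratios on the overlapping
+# forecast set for every pair of models and then aggregates them via a
+# geometric mean to obtain relative skill scores.
+mean_scores <- example_quantile %>%
+  score() %>%
+  summarise_scores(by = "model")
+
+# A value below 1 means the ensemble has a lower (better) mean score than the baseline.
+mean_scores[model == "EuroCOVIDhub-ensemble"]$interval_score /
+  mean_scores[model == "EuroCOVIDhub-baseline"]$interval_score
+```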
+
+
+## Additional analyses and visualisations
+
+### Correlation between scores
+
+It may sometimes be interesting to see how different scores correlate with each other. We can examine this using the function `correlation()`. When dealing with quantile-based forecasts, it is important to call `summarise_scores()` before `correlation()` to summarise over quantiles before computing correlations.
+
+```{r}
+example_quantile %>%
+  score() %>%
+  summarise_scores() %>%
+  correlation()
+```
+
+Visualising correlations:
+
+```{r}
+example_quantile %>%
+  score() %>%
+  summarise_scores() %>%
+  correlation() %>%
+  plot_correlation()
+```
+
+### Scores by interval ranges
+
+If you would like to see how different forecast interval ranges contribute to average scores, you can visualise scores by interval range:
+
+```{r}
+example_quantile %>%
+  score() %>%
+  summarise_scores(by = c("model", "range", "target_type")) %>%
+  plot_ranges() +
+  facet_wrap(~ target_type, scales = "free")
+```
+
+## Tips and tricks - converting to quantile-based forecasts
+
+Different metrics are available for different forecasting formats. In some cases, you may for example have forecasts in a sample-based format, but wish to make use of some of the functionality only available to quantile-based forecasts. For example, you may want to use the decomposition of the weighted interval score, or may want to compute interval coverage values.
+
+You can convert your sample-based forecasts into a quantile-based format using the function `sample_to_quantile()`. There is, however, one caveat: quantiles will be calculated based on the predictive samples, which may introduce a bias if the number of available samples is small.
+
+```{r}
+example_integer %>%
+  sample_to_quantile(
+    quantiles = c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
+  ) %>%
+  score() %>%
+  add_coverage(by = c("model", "target_type"))
+```
+
+## Available metrics
+
+An overview of available metrics can be found in the `metrics_summary` data set that is included in the package.
+
+```{r}
+metrics_summary
+```
+
+## Lower level functions
+
+Most of these metrics are also available as lower-level functions and are extensively documented. Have a look at the help files to understand these better.
diff --git a/vignettes/metric-details.Rmd b/vignettes/metric-details.Rmd
new file mode 100644
index 000000000..789299d89
--- /dev/null
+++ b/vignettes/metric-details.Rmd
@@ -0,0 +1,66 @@
+---
+title: "Details on the metrics implemented in `scoringutils`"
+author: "Nikos Bosse"
+date: "`r Sys.Date()`"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Details on the metrics implemented in `scoringutils`}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+library(scoringutils)
+library(kableExtra)
+library(magrittr)
+library(knitr)
+library(data.table)
+```
+
+## Applicability of different metrics for different types of forecasts and formats
+
+This table gives an overview of when each metric can be applied and gives a very brief description. Note that this table only shows the metrics as implemented in `scoringutils`. For example, only scoring of sample-based discrete and continuous distributions is implemented in `scoringutils`, but closed-form solutions often exist (e.g. in the `scoringRules` package).
+ +```{r, echo = FALSE, results = "asis"} + +data <- readRDS( + system.file("metrics-overview/metrics-summary.Rda", package = "scoringutils") +) + +data[, 1:6] %>% + kbl(format = "html", + escape = FALSE, + align = c("lccccl"), + linesep = c('\\addlinespace')) %>% + column_spec(1, width = "3.2cm") %>% + column_spec(2, width = "1.5cm") %>% + column_spec(3, width = "1.5cm") %>% + column_spec(4, width = "1.3cm") %>% + column_spec(5, width = "1.5cm") %>% + column_spec(6, width = "6.0cm") %>% + add_header_above(c(" " = 1, "Sample-based" = 2, " " = 3)) %>% + row_spec(seq(1, nrow(data), 2), background = "Gainsboro") %>% + kable_styling() +``` + + +## Detailed explanation of the metrics implemented in `scoringutils` + +```{r, echo = FALSE, results = "asis"} + +data <- readRDS( + system.file("metrics-overview/metrics-detailed.Rda", package = "scoringutils") +) + +data[, 1:2] %>% + kbl(format = "html", + escape = TRUE) %>% + column_spec(1, width = "3.5cm") %>% + row_spec(seq(1, nrow(data), 2), background = "Gainsboro") %>% + column_spec(2, width = "15.5cm") %>% + kable_styling() +``` diff --git a/vignettes/scoring-forecasts-directly.Rmd b/vignettes/scoring-forecasts-directly.Rmd new file mode 100644 index 000000000..45ed9ce61 --- /dev/null +++ b/vignettes/scoring-forecasts-directly.Rmd @@ -0,0 +1,220 @@ +--- +title: "Scoring forecasts directly" +author: "Nikos Bosse" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{scoringutils} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + fig.width = 7, + collapse = TRUE, + comment = "#>" +) +library(scoringutils) +library(magrittr) +library(data.table) +library(ggplot2) +``` + +A variety of metrics and scoring rules can also be accessed directly through +the `scoringutils` package. + +The following gives an overview of (most of) the implemented metrics. + +# Bias + +The function `bias` determines bias from predictive Monte-Carlo samples, +automatically recognising whether forecasts are continuous or +integer valued. + +For continuous forecasts, Bias is measured as +$$B_t (P_t, x_t) = 1 - 2 \cdot (P_t (x_t))$$ + +where $P_t$ is the empirical cumulative distribution function of the +prediction for the true value $x_t$. Computationally, $P_t (x_t)$ is +just calculated as the fraction of predictive samples for $x_t$ +that are smaller than $x_t$. + +For integer valued forecasts, Bias is measured as + +$$B_t (P_t, x_t) = 1 - (P_t (x_t) + P_t (x_t + 1))$$ + +to adjust for the integer nature of the forecasts. In both cases, Bias can +assume values between -1 and 1 and is 0 ideally. + +```{r} +## integer valued forecasts +true_values <- rpois(30, lambda = 1:30) +predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) +bias_sample(true_values, predictions) + +## continuous forecasts +true_values <- rnorm(30, mean = 1:30) +predictions <- replicate(200, rnorm(30, mean = 1:30)) +bias_sample(true_values, predictions) +``` + + +# Sharpness +Sharpness is the ability of the model to generate predictions within a +narrow range. It is a data-independent measure, and is purely a feature +of the forecasts themselves. + +Sharpness / dispersion of predictive samples corresponding to one single true value is +measured as the normalised median of the absolute deviation from +the median of the predictive samples. 
For details, see `?stats::mad`.
+
+```{r}
+predictions <- replicate(200, rpois(n = 30, lambda = 1:30))
+mad_sample(predictions)
+```
+
+# Calibration
+
+Calibration or reliability of forecasts is the ability of a model to
+correctly identify its own uncertainty in making predictions. In a model
+with perfect calibration, the observed data at each time point look as if
+they came from the predictive probability distribution at that time.
+
+Equivalently, one can inspect the probability integral transform of the
+predictive distribution at time t,
+
+$$u_t = F_t (x_t)$$
+
+where $x_t$ is the observed data point at time $t \text{ in } t_1, …, t_n$,
+n being the number of forecasts, and $F_t$ is the (continuous) predictive
+cumulative probability distribution at time t. If the true probability
+distribution of outcomes at time t is $G_t$ then the forecasts $F_t$ are
+said to be ideal if $F_t = G_t$ at all times $t$. In that case, the
+probabilities $u_t$ are distributed uniformly.
+
+In the case of discrete outcomes such as incidence counts,
+the PIT is no longer uniform even when forecasts are ideal.
+In that case a randomised PIT can be used instead:
+
+$$u_t = P_t(k_t) + v \cdot (P_t(k_t) - P_t(k_t - 1) )$$
+
+where $k_t$ is the observed count, $P_t(x)$ is the predictive
+cumulative probability of observing incidence $k$ at time $t$,
+$P_t (-1) = 0$ by definition and $v$ is standard uniform and independent
+of $k$. If $P_t$ is the true cumulative
+probability distribution, then $u_t$ is standard uniform.
+
+The function checks whether integer or continuous forecasts were provided.
+It then applies the (randomised) probability integral transform and tests
+the values $u_t$ for uniformity using the
+Anderson-Darling test.
+
+As a rule of thumb, there is no evidence to suggest a forecasting model is
+miscalibrated if the p-value found was greater than a threshold of $p \geq 0.1$,
+some evidence that it was miscalibrated if $0.01 < p < 0.1$, and good
+evidence that it was miscalibrated if $p \leq 0.01$.
+In this context it should be noted, though, that uniformity of the
+PIT is a necessary but not sufficient condition of calibration. It should
+also be noted that the test only works given sufficient samples, otherwise the
+null hypothesis will often be rejected outright.
+
+
+# Continuous Ranked Probability Score (CRPS)
+Wrapper around the `crps_sample()` function from the
+`scoringRules` package. For more information look at the manuals from the
+`scoringRules` package. The function can be used for continuous as well as
+integer valued forecasts. Smaller values are better.
+
+```{r}
+true_values <- rpois(30, lambda = 1:30)
+predictions <- replicate(200, rpois(n = 30, lambda = 1:30))
+crps_sample(true_values, predictions)
+```
+
+
+
+# Dawid-Sebastiani Score (DSS)
+Wrapper around the `dss_sample()` function from the
+`scoringRules` package. For more information look at the manuals from the
+`scoringRules` package. The function can be used for continuous as well as
+integer valued forecasts. Smaller values are better.
+
+```{r}
+true_values <- rpois(30, lambda = 1:30)
+predictions <- replicate(200, rpois(n = 30, lambda = 1:30))
+dss_sample(true_values, predictions)
+```
+
+# Log Score
+Wrapper around the `logs_sample()` function from the
+`scoringRules` package. For more information look at the manuals from the
+`scoringRules` package. The function should not be used for integer valued
+forecasts.
While Log Scores are in principle possible for integer valued +forecasts they require a kernel density estimate which is not well defined +for discrete values. Smaller values are better. + +```{r} +true_values <- rnorm(30, mean = 1:30) +predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) +logs_sample(true_values, predictions) +``` + +# Brier Score +The Brier score is a proper score rule that assesses the accuracy of +probabilistic binary predictions. The outcomes can be either 0 or 1, +the predictions must be a probability that the true outcome will be 1. + +The Brier Score is then computed as the mean squared error between the +probabilistic prediction and the true outcome. + +$$\text{Brier_Score} = \frac{1}{N} \sum_{t = 1}^{n} (\text{prediction}_t - \text{outcome}_t)^2$$ + + +```{r} +true_values <- sample(c(0, 1), size = 30, replace = TRUE) +predictions <- runif(n = 30, min = 0, max = 1) + +brier_score(true_values, predictions) +``` + +## Interval Score +The Interval Score is a Proper Scoring Rule to score quantile predictions, +following Gneiting and Raftery (2007). Smaller values are better. + +The score is computed as + +$$ \text{score} = (\text{upper} - \text{lower}) + \\ +\frac{2}{\alpha} \cdot (\text{lower} - \text{true_value}) \cdot 1(\text{true_values} < \text{lower}) + \\ +\frac{2}{\alpha} \cdot (\text{true_value} - \text{upper}) \cdot +1(\text{true_value} > \text{upper})$$ + + +where $1()$ is the indicator function and $\alpha$ is the decimal value that +indicates how much is outside the prediction interval. +To improve usability, the user is asked to provide an interval range in +percentage terms, i.e. interval_range = 90 (percent) for a 90 percent +prediction interval. Correspondingly, the user would have to provide the +5\% and 95\% quantiles (the corresponding alpha would then be 0.1). +No specific distribution is assumed, +but the range has to be symmetric (i.e you can't use the 0.1 quantile +as the lower bound and the 0.7 quantile as the upper). +Setting `weigh = TRUE` will weigh the score by $\frac{\alpha}{2}$ such that +the Interval Score converges to the CRPS for increasing number of quantiles. + + +```{r} +true_values <- rnorm(30, mean = 1:30) +interval_range <- 90 +alpha <- (100 - interval_range) / 100 +lower <- qnorm(alpha / 2, rnorm(30, mean = 1:30)) +upper <- qnorm((1 - alpha / 2), rnorm(30, mean = 1:30)) + +interval_score( + true_values = true_values, + lower = lower, + upper = upper, + interval_range = interval_range +) +``` + diff --git a/vignettes/scoringutils.Rmd b/vignettes/scoringutils.Rmd deleted file mode 100644 index 80cafec5b..000000000 --- a/vignettes/scoringutils.Rmd +++ /dev/null @@ -1,356 +0,0 @@ ---- -title: "scoringutils" -author: "Nikos Bosse" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{scoringutils} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - fig.width = 7, - collapse = TRUE, - comment = "#>" -) -``` - -# Introduction and Overview of Functionality - -The `scoringutils` package provides a collection of metrics and proper scoring rules -that make it simple to score forecasts against the true observed values. -Predictions can either be automatically scored from a `data.frame` using the function `eval_forecasts()`. Alternatively, evaluation metrics can be accessed directly using lower level functions within a vector/matrix framework. 
- -Predictions can be handled in various formats: `scoringutils` can handle probabilistic forecasts in either a sample based or a quantile based format. For more detail on the expected input formats please see below. True values can be integer, continuous or binary. - -In addition to automatic scoring, `scoringutils` offers a variety of plots and visualisations. - -# Scoring Forecasts Automatically - -Most of the time, the `eval_forecasts()` function will be able to do the entire evaluation for you. The idea is simple, yet flexible. - -All you need to do is to pass in a `data.frame` that has a column called `prediction` and one called `true_value`. Depending on the exact input format, additional columns like `sample`, `quantile` or `range` and `boundary` are needed. Additional columns may be present to indicate a grouping of forecasts. For example, we could have forecasts made by different models in various locations at different time points, each for several weeks into the future. In this case, we would have additional columns called for example `model`, `date`, `forecast_date`, `forecast_horizon` and `location`. - -Using the `by` argument you need to specify the *unit of a single forecast*. In this example here we would set `by = c("model", "date", "forecast_date", "forecast_horizon", "location")` (note: if we want to be pedantic, there is a small duplication as the information of "date" is already included in the combination of "forecast_date" and "forecast_horizon". But as long as there isn't some weird shift, this doesn't matter for the purpose of grouping our observations). If you don't specify `by` (i.e. `by = NULL`), `scoringutils` will automatically use all appropriate present columns. Note that you don't need to include columns such as `quantile` or `sample` in the `by` argument, as several quantiles / samples make up one forecast. - -Using the `summarise_by` argument you can now choose categories to aggregate over. If you were only interested in scores for the different models, you would specify `summarise_by = c("model")`. If you wanted to have scores for every model in every location, you would need to specify `summarise_by = c("model", "location")`. If you wanted to have one score per quantile or one per prediction interval range, you could specify something like `summarise_by = c("model", "quantile")` or `summarise_by = c("model", "quantile", "range")` (note again that some information is duplicated in quantile and range, but this doesn't really matter for grouping purposes). When aggregating, `eval_forecasts` takes the mean according to the group defined in `summarise_by` (i.e. in this example, if `summarise_by = c("model", "location")`, scores will be averaged over all forecast dates, forecast horizons and quantiles to yield one score per model and location). In addition to the mean, you can also obtain the standard deviation of the scores over which you average or any desired quantile (e.g. the median in addition to the mean) by specifying `sd = TRUE` and `quantiles = c(0.5)`. - - -## Example Evaluation - -Here is an example of an evaluation using the example data included in the package. The data comes from a set of [Covid-19 short-term forecasts in the UK](https://github.com/epiforecasts/covid19.forecasts.uk). 
- -```{r} -library(scoringutils) -``` - -```{r} -data <- scoringutils::quantile_example_data -print(data, 3, 3) - -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "quantile", "range")) -print(scores, 3, 3) -``` - -Using an appropriate level of summary, we can easily use the output for visualisation. The `scoringutils` package offers some built-in functions to help get a sense of the data - -```{r} -# -# filtered_data <- data[geography == "England" & -# creation_date <= "2020-06-29" & -# value_desc == "Deaths"] - -scoringutils::plot_predictions(data = data, - filter_both = list("geography == 'England'"), - filter_forecasts = list("creation_date == '2020-07-06'"), - filter_truth = list("as.Date(value_date) <= '2020-07-06'"), - x = "value_date", - range = c(0, 50, 90), - scale = "free", - facet_formula = value_desc ~ model) -``` - - -```{r} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model")) -scoringutils::score_table(scores) -``` - -Given this level of aggregation, not all metrics may make sense. In this case, for example, averaging over different quantiles to compute quantile coverage does not make much sense. If you like, you can select specific metrics for the visualisation. - -Let us look at calibration: - -```{r out.width="50%", fig.show="hold"} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "range", "quantile")) -scoringutils::interval_coverage(scores) + - ggplot2::ggtitle("Interval Coverage") - -scoringutils::quantile_coverage(scores) + - ggplot2::ggtitle("Quantile Coverage") -``` - -Let us look at the individual components of the weighted interval score: - -```{r} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "value_desc")) -scoringutils::wis_components(scores, facet_formula = ~ value_desc) -``` - -We can also look at contributions to different metrics by range: - -```{r} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "range", "value_desc")) -scoringutils::range_plot(scores, y = "interval_score", - facet_formula = ~ value_desc) -``` - -We can also visualise metrics using a heatmap: - -```{r} -scores <- scoringutils::eval_forecasts(data, - summarise_by = c("model", "horizon")) -scores$horizon <- as.factor(scores$horizon) -scoringutils::score_heatmap(scores, - x = "horizon", metric = "bias") -``` - - -### Expected Input Formats - -The `eval_forecasts()` function is designed to work with various different input formats. The following formats are currently supported: - -quantile forecasts in either a plain quantile format or in a format that specifies interval ranges and the boundary of a given interval range. - -``` {r} -print(scoringutils::quantile_example_data, 3, 3) -print(scoringutils::range_example_data_long, 3, 3) -print(scoringutils::range_example_data_wide, 3, 3) -``` - -sample based format with either continuous or integer values - -``` {r} -print(scoringutils::integer_example_data, 3, 3) -print(scoringutils::continuous_example_data, 3, 3) -``` - -forecasts in a binary format: - -``` {r} -print(scoringutils::binary_example_data, 3, 3) -``` - -It also offers functionality to convert between these formats. 
For more information have a look at the documentation of the following functions: -``` {r eval=FALSE} -scoringutils::sample_to_quantile() # convert from sample based to quantile format -scoringutils::range_long_to_quantile() # convert from range format to plain quantile -scoringutils::quantile_to_range_long() # convert the other way round -scoringutils::range_wide_to_long() # convert range based format from wide to long -scoringutils::range_long_to_wide() # convert the other way round -``` - - -# Scoring Forecasts Directly - -A variety of metrics and scoring rules can also be accessed directly through -the `scoringutils` package. - -The following gives an overview of (most of) the implemented metrics. - -## Bias - -The function `bias` determines bias from predictive Monte-Carlo samples, -automatically recognising whether forecasts are continuous or -integer valued. - -For continuous forecasts, Bias is measured as -$$B_t (P_t, x_t) = 1 - 2 \cdot (P_t (x_t))$$ - -where $P_t$ is the empirical cumulative distribution function of the -prediction for the true value $x_t$. Computationally, $P_t (x_t)$ is -just calculated as the fraction of predictive samples for $x_t$ -that are smaller than $x_t$. - -For integer valued forecasts, Bias is measured as - -$$B_t (P_t, x_t) = 1 - (P_t (x_t) + P_t (x_t + 1))$$ - -to adjust for the integer nature of the forecasts. In both cases, Bias can -assume values between -1 and 1 and is 0 ideally. - -```{r} -## integer valued forecasts -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -bias(true_values, predictions) - -## continuous forecasts -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(30, mean = 1:30)) -bias(true_values, predictions) -``` - - -## Sharpness -Sharpness is the ability of the model to generate predictions within a -narrow range. It is a data-independent measure, and is purely a feature -of the forecasts themselves. - -Shaprness of predictive samples corresponding to one single true value is -measured as the normalised median of the absolute deviation from -the median of the predictive samples. For details, see `?stats::mad` - -```{r} -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -sharpness(predictions) -``` - -## Calibration - -Calibration or reliability of forecasts is the ability of a model to -correctly identify its own uncertainty in making predictions. In a model -with perfect calibration, the observed data at each time point look as if -they came from the predictive probability distribution at that time. - -Equivalently, one can inspect the probability integral transform of the -predictive distribution at time t, - -$$u_t = F_t (x_t)$$ - -where $x_t$ is the observed data point at time $t \text{ in } t_1, …, t_n$, -n being the number of forecasts, and $F_t$ is the (continuous) predictive -cumulative probability distribution at time t. If the true probability -distribution of outcomes at time t is $G_t$ then the forecasts $F_t$ are -said to be ideal if $F_t = G_t$ at all times $t$. In that case, the -probabilities ut are distributed uniformly. - -In the case of discrete outcomes such as incidence counts, -the PIT is no longer uniform even when forecasts are ideal. 
-In that case a randomised PIT can be used instead: - -$$u_t = P_t(k_t) + v \cdot (P_t(k_t) - P_t(k_t - 1) )$$ - -where $k_t$ is the observed count, $P_t(x)$ is the predictive -cumulative probability of observing incidence $k$ at time $t$, -$P_t (-1) = 0$ by definition and $v$ is standard uniform and independent -of $k$. If $P_t$ is the true cumulative -probability distribution, then $u_t$ is standard uniform. - -The function checks whether integer or continuous forecasts were provided. -It then applies the (randomised) probability integral and tests -the values $u_t$ for uniformity using the -Anderson-Darling test. - -As a rule of thumb, there is no evidence to suggest a forecasting model is -miscalibrated if the p-value found was greater than a threshold of $p >= 0.1$, -some evidence that it was miscalibrated if $0.01 < p < 0.1$, and good -evidence that it was miscalibrated if $p <= 0.01$. -In this context it should be noted, though, that uniformity of the -PIT is a necessary but not sufficient condition of calibration. It should -als be noted that the test only works given sufficient samples, otherwise the -Null hypothesis will often be rejected outright. - - -## Continuous Ranked Probability Score (CRPS) -Wrapper around the `crps_sample()` function from the -`scoringRules` package. For more information look at the manuals from the -`scoringRules` package. The function can be used for continuous as well as -integer valued forecasts. Smaller values are better. - -```{r} -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -crps(true_values, predictions) -``` - - - -## Dawid-Sebastiani Score (DSS) -Wrapper around the `dss_sample()` function from the -`scoringRules` package. For more information look at the manuals from the -`scoringRules` package. The function can be used for continuous as well as -integer valued forecasts. Smaller values are better. - -```{r} -true_values <- rpois(30, lambda = 1:30) -predictions <- replicate(200, rpois(n = 30, lambda = 1:30)) -dss(true_values, predictions) -``` - -## Log Score -Wrapper around the `log_sample()` function from the -`scoringRules` package. For more information look at the manuals from the -`scoringRules` package. The function should not be used for integer valued -forecasts. While Log Scores are in principle possible for integer valued -forecasts they require a kernel density estimate which is not well defined -for discrete values. Smaller values are better. - -```{r} -true_values <- rnorm(30, mean = 1:30) -predictions <- replicate(200, rnorm(n = 30, mean = 1:30)) -logs(true_values, predictions) -``` - -## Brier Score -The Brier score is a proper score rule that assesses the accuracy of -probabilistic binary predictions. The outcomes can be either 0 or 1, -the predictions must be a probability that the true outcome will be 1. - -The Brier Score is then computed as the mean squared error between the -probabilistic prediction and the true outcome. - -$$\text{Brier_Score} = \frac{1}{N} \sum_{t = 1}^{n} (\text{prediction}_t - \text{outcome}_t)^2$$ - - -```{r} -true_values <- sample(c(0,1), size = 30, replace = TRUE) -predictions <- runif(n = 30, min = 0, max = 1) - -brier_score(true_values, predictions) -``` - -## Interval Score -The Interval Score is a Proper Scoring Rule to score quantile predictions, -following Gneiting and Raftery (2007). Smaller values are better. 
- -The score is computed as - -$$ \text{score} = (\text{upper} - \text{lower}) + \\ -\frac{2}{\alpha} \cdot (\text{lower} - \text{true_value}) \cdot 1(\text{true_values} < \text{lower}) + \\ -\frac{2}{\alpha} \cdot (\text{true_value} - \text{upper}) \cdot -1(\text{true_value} > \text{upper})$$ - - -where $1()$ is the indicator function and $\alpha$ is the decimal value that -indicates how much is outside the prediction interval. -To improve usability, the user is asked to provide an interval range in -percentage terms, i.e. interval_range = 90 (percent) for a 90 percent -prediction interval. Correspondingly, the user would have to provide the -5\% and 95\% quantiles (the corresponding alpha would then be 0.1). -No specific distribution is assumed, -but the range has to be symmetric (i.e you can't use the 0.1 quantile -as the lower bound and the 0.7 quantile as the upper). -Setting `weigh = TRUE` will weigh the score by $\frac{\alpha}{2}$ such that -the Interval Score converges to the CRPS for increasing number of quantiles. - - -```{r} -true_values <- rnorm(30, mean = 1:30) -interval_range <- 90 -alpha <- (100 - interval_range) / 100 -lower <- qnorm(alpha/2, rnorm(30, mean = 1:30)) -upper <- qnorm((1- alpha/2), rnorm(30, mean = 1:30)) - -interval_score(true_values = true_values, - lower = lower, - upper = upper, - interval_range = interval_range) -``` -