diff --git a/.gitignore b/.gitignore
index a515f56da..9447a5036 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ inst/manuscript/manuscript.log
inst/manuscript/manuscript.pdf
inst/manuscript/manuscript.tex
inst/manuscript/manuscript_files/
-docs
\ No newline at end of file
+docs
+..bfg-report/
diff --git a/DESCRIPTION b/DESCRIPTION
index 73dc84c0d..d13ed6797 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -65,7 +65,7 @@ Config/Needs/website:
amirmasoudabdol/preferably
Config/testthat/edition: 3
RoxygenNote: 7.1.2
-URL: https://github.com/epiforecasts/scoringutils, https://epiforecasts.io/scoringutils/
+URL: https://epiforecasts.io/scoringutils/, https://github.com/epiforecasts/scoringutils
BugReports: https://github.com/epiforecasts/scoringutils/issues
VignetteBuilder: knitr
Depends:
diff --git a/NAMESPACE b/NAMESPACE
index 9cab7f73e..13a740e4f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -76,7 +76,6 @@ importFrom(ggplot2,geom_polygon)
importFrom(ggplot2,geom_text)
importFrom(ggplot2,geom_tile)
importFrom(ggplot2,ggplot)
-importFrom(ggplot2,ggtitle)
importFrom(ggplot2,guide_legend)
importFrom(ggplot2,guides)
importFrom(ggplot2,labs)
diff --git a/R/data.R b/R/data.R
index aed5b6c27..76df369bf 100644
--- a/R/data.R
+++ b/R/data.R
@@ -3,6 +3,9 @@
#' A data set with predictions for COVID-19 cases and deaths submitted to the
#' European Forecast Hub.
#'
+#' The data was created using the script create-example-data.R in the inst/
+#' folder (or the top level folder in a compiled package).
+#'
#' @format A data frame with
#' \describe{
#' \item{location}{the country for which a prediction was made}
@@ -20,11 +23,40 @@
"example_quantile"
+#' Point Forecast Example Data
+#'
+#' A data set with predictions for COVID-19 cases and deaths submitted to the
+#' European Forecast Hub. This data set is like the quantile example data,
+#' except that the median has been replaced by a point forecast.
+#'
+#' The data was created using the script create-example-data.R in the inst/
+#' folder (or the top level folder in a compiled package).
+#'
+#' @format A data frame with
+#' \describe{
+#' \item{location}{the country for which a prediction was made}
+#' \item{target_end_date}{the date for which a prediction was made}
+#' \item{target_type}{the target to be predicted (cases or deaths)}
+#' \item{true_value}{true observed values}
+#' \item{location_name}{name of the country for which a prediction was made}
+#' \item{forecast_date}{the date on which a prediction was made}
+#' \item{quantile}{quantile of the corresponding prediction}
+#' \item{prediction}{predicted value}
+#' \item{model}{name of the model that generated the forecasts}
+#' \item{horizon}{forecast horizon in weeks}
+#' }
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
+"example_point"
+
+
#' Continuous Forecast Example Data
#'
#' A data set with continuous predictions for COVID-19 cases and deaths
#' constructed from data submitted to the European Forecast Hub.
#'
+#' The data was created using the script create-example-data.R in the inst/
+#' folder (or the top level folder in a compiled package).
+#'
#' @format A data frame with 13,429 rows and 10 columns:
#' \describe{
#' \item{location}{the country for which a prediction was made}
@@ -47,6 +79,9 @@
#' A data set with integer predictions for COVID-19 cases and deaths
#' constructed from data submitted to the European Forecast Hub.
#'
+#' The data was created using the script create-example-data.R in the inst/
+#' folder (or the top level folder in a compiled package).
+#'
#' @format A data frame with 13,429 rows and 10 columns:
#' \describe{
#' \item{location}{the country for which a prediction was made}
@@ -75,6 +110,9 @@
#' This should not be understood as sound statistical practice, but rather
#' as a practical way to create an example data set.
#'
+#' The data was created using the script create-example-data.R in the inst/
+#' folder (or the top level folder in a compiled package).
+#'
#' @format A data frame with 346 rows and 10 columns:
#' \describe{
#' \item{location}{the country for which a prediction was made}
@@ -96,6 +134,9 @@
#' A data set with quantile predictions for COVID-19 cases and deaths
#' submitted to the European Forecast Hub.
#'
+#' The data was created using the script create-example-data.R in the inst/
+#' folder (or the top level folder in a compiled package).
+#'
#' @format A data frame with 7,581 rows and 9 columns:
#' \describe{
#' \item{location}{the country for which a prediction was made}
@@ -116,6 +157,9 @@
#' A data set with truth values for COVID-19 cases and deaths
#' submitted to the European Forecast Hub.
#'
+#' The data was created using the script create-example-data.R in the inst/
+#' folder (or the top level folder in a compiled package).
+#'
#' @format A data frame with 140 rows and 5 columns:
#' \describe{
#' \item{location}{the country for which a prediction was made}
@@ -131,5 +175,9 @@
#'
#' A data set with summary information on selected metrics implemented in
#' \pkg{scoringutils}
+#'
+#' The data was created using the script create-metric-tables.R in the inst/
+#' folder (or the top level folder in a compiled package).
+#'
#' @keywords info
-"metrics_summary"
+"metrics"
diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R
index aa5d46a04..b52ea0c52 100644
--- a/R/pairwise-comparisons.R
+++ b/R/pairwise-comparisons.R
@@ -69,6 +69,22 @@ pairwise_comparison <- function(scores,
metric <- infer_rel_skill_metric(scores)
}
+ # check that values of the chosen metric are not NA
+ if (any(is.na(scores[[metric]]))) {
+ msg <- paste0("Some values for the metric '", metric,
+ "' are NA. These have been removed. ",
+ "Maybe choose a different metric?")
+ warning(msg)
+
+ scores <- scores[!is.na(scores[[metric]])]
+ if (nrow(scores) == 0) {
+ msg <- paste0("After removing NA values for '", metric,
+ "', no values were left.")
+ warning(msg)
+ return(NULL)
+ }
+ }
+
# check that all values of the chosen metric are positive
if (any(sign(scores[[metric]]) < 0)) {
if (any(sign(scores) > 0)) {
@@ -140,6 +156,10 @@ pairwise_comparison_one_group <- function(scores,
stop("pairwise compairons require a column called 'model'")
}
+ if (nrow(scores) == 0) {
+ return(NULL)
+ }
+
# get list of models
models <- unique(scores$model)
@@ -202,32 +222,13 @@ pairwise_comparison_one_group <- function(scores,
)]
# calculate relative skill as geometric mean
- # small theta is again better. If a baseline is given, exclude it
- # from the computation of the geometric mean
- # maybe there is a more elegant way to do this
- if (!is.null(baseline)) {
- result_without_baseline <- data.table::copy(result)
- # filter out all ratios where compare_against is the baseline
- result_without_baseline <- result_without_baseline[
- compare_against != baseline
- ]
- result_without_baseline[, `:=`(theta = geom_mean_helper(ratio)),
- by = "model"
- ]
- # merge back to retain the ratios even for comparisons with the baseline
- result <- merge(result, result_without_baseline, all.x = TRUE)
- # avoid mixture of NA and NaN which can cause problems downstream
- result[is.na(theta), theta := NA_real_]
- # remove NAs form merge in the thetas
- result[, theta := unique(na.omit(theta)), by = "model"]
- } else {
- result[, `:=`(
- theta = geom_mean_helper(ratio),
- rel_to_baseline = NA_real_
- ),
- by = "model"
- ]
- }
+ # small theta is again better (assuming that the score is negatively oriented)
+ result[, `:=`(
+ theta = geom_mean_helper(ratio),
+ rel_to_baseline = NA_real_
+ ),
+ by = "model"
+ ]
if (!is.null(baseline)) {
baseline_theta <- unique(result[model == baseline, ]$theta)
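
To illustrate the new NA handling, a minimal sketch; the `by` and `metric` argument names are assumed from the current signature of `pairwise_comparison()`:

library(scoringutils)

# scores with NA values for the chosen metric are now dropped with a
# warning before mean score ratios are computed; if nothing is left,
# pairwise_comparison() returns NULL instead of erroring
scores <- score(example_quantile)
pairwise <- pairwise_comparison(scores, by = "model", metric = "interval_score")
head(pairwise)
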
diff --git a/R/plot.R b/R/plot.R
index 07c6dc604..5221616d9 100644
--- a/R/plot.R
+++ b/R/plot.R
@@ -741,7 +741,7 @@ plot_quantile_coverage <- function(scores,
#' @importFrom data.table as.data.table setnames rbindlist
#' @importFrom stats reorder
#' @importFrom ggplot2 labs coord_cartesian facet_wrap facet_grid theme
-#' element_text element_blank ggtitle
+#' element_text element_blank
#' @export
#'
#' @examples
@@ -992,8 +992,7 @@ plot_pairwise_comparison <- function(comparison_result,
hjust = 1, color = "brown4"
),
axis.text.y = element_text(color = "steelblue4")
- ) +
- ggtitle("Pairwise comparisons - ratio of mean scores (for overlapping forecast sets)")
+ )
}
return(plot)
diff --git a/R/score.R b/R/score.R
index d1a8b7865..194a5bc53 100644
--- a/R/score.R
+++ b/R/score.R
@@ -12,7 +12,8 @@
#' before scoring.
#'
#' To obtain a quick overview of the evaluation metrics used, have a look at the
-#' [metrics_summary] data included in the package.
+#' [metrics] data included in the package. The column `metrics$Name` gives an
+#' overview of all available metric names that can be computed.
#'
#' @param data A data.frame or data.table with the predictions and observations.
#' The following columns need to be present:
@@ -31,7 +32,7 @@
#' - `quantile`: quantile to which the prediction corresponds
#' @param metrics the metrics you want to have in the output. If `NULL` (the
#' default), all available metrics will be computed. For a list of available
-#' metrics see [available_metrics()]
+#' metrics see [available_metrics()], or check the [metrics] data set.
#' @param ... additional parameters passed down to lower-level functions.
#' For example, the following arguments can change how weighted interval
#' scores are computed:
@@ -63,6 +64,11 @@
#' score(example_quantile)
#' score(example_integer)
#' score(example_continuous)
+#'
+#' # score point forecasts (marked by 'NA' in the quantile column)
+#' score(example_point) %>%
+#' summarise_scores(by = "model", na.rm = TRUE)
+#'
#' @author Nikos Bosse \email{nikosbosse@@gmail.com}
#' @references Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ
#' (2019) Assessing the performance of real-time epidemic forecasts: A
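
The new documentation example, written out as a small standalone sketch using the base pipe instead of `%>%`:

library(scoringutils)

# point forecasts are marked by NA in the quantile column; na.rm = TRUE is
# passed on (as in the documentation example) so that NA score values do
# not propagate into the summary
score(example_point) |>
  summarise_scores(by = "model", na.rm = TRUE)
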
diff --git a/R/scoringutils.R b/R/scoringutils.R
index 99e338ead..0cc9debe1 100644
--- a/R/scoringutils.R
+++ b/R/scoringutils.R
@@ -1,53 +1,2 @@
-#' @title scoringutils
-#'
-#' @description
-#' This package is designed to help with assessing the quality of predictions.
-#' It provides a collection of proper scoring rules and metrics as well that
-#' can be accessed independently or collectively through a higher-level wrapper
-#' function.
-#'
-#' Predictions can be either probabilistic forecasts (generally predictive
-#' samples generated by Markov Chain Monte Carlo procedures), quantile
-#' forecasts or point forecasts. The true values can be either continuous,
-#' integer, or binary.
-#'
-#' A collection of different metrics and scoring rules can be accessed through
-#' the function [score()]. Given a data.frame of the
-#' correct form the function will automatically figure out the type of
-#' prediction and true values and return appropriate scoring metrics.
-#'
-#' The package also has a lot of default visualisation based on the output
-#' created by [score()].
-#'
-#' - [plot_score_table()]
-#' - [plot_correlation()]
-#' - [plot_wis()]
-#' - [plot_ranges()]
-#' - [plot_heatmap()]
-#' - [plot_predictions()]
-#' - [plot_interval_coverage()]
-#' - [plot_quantile_coverage()]
-#'
-#' Alternatively, the following functions can be accessed directly:
-#'
-#' - [brier_score()]
-#' - [pit()]
-#' - [bias_sample()]
-#' - [bias_quantile()]
-#' - [bias_range()]
-#' - [mad_sample()]
-#' - [crps_sample()]
-#' - [logs_sample()]
-#' - [dss_sample()]
-#' - [ae_median_sample()]
-#'
-#' Predictions can be evaluated in a lot of different formats. If you want to
-#' convert from one format to the other, the following helper functions can
-#' do that for you:
-#'
-#' - [sample_to_quantile()]
-#'
-#' @docType package
-#' @name scoringutils
-
-NULL
+#' @keywords internal
+"_PACKAGE"
diff --git a/R/summarise_scores.R b/R/summarise_scores.R
index facc0cbfe..b47dd3c7b 100644
--- a/R/summarise_scores.R
+++ b/R/summarise_scores.R
@@ -103,18 +103,20 @@ summarise_scores <- function(scores,
by = by
)
- # delete unnecessary columns
- pairwise[, c(
- "compare_against", "mean_scores_ratio",
- "pval", "adj_pval"
- ) := NULL]
- pairwise <- unique(pairwise)
-
- # merge back
- scores <- merge(scores, pairwise,
- all.x = TRUE,
- by = get_forecast_unit(pairwise)
- )
+ if (!is.null(pairwise)) {
+ # delete unnecessary columns
+ pairwise[, c(
+ "compare_against", "mean_scores_ratio",
+ "pval", "adj_pval"
+ ) := NULL]
+ pairwise <- unique(pairwise)
+
+ # merge back
+ scores <- merge(scores, pairwise,
+ all.x = TRUE,
+ by = get_forecast_unit(pairwise)
+ )
+ }
}
# summarise scores -----------------------------------------------------------
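
A hedged sketch of the situation the new `is.null()` guard handles; the column assignment below only forces the failure mode for illustration:

library(scoringutils)
library(data.table)

# if all values of the metric used for the pairwise comparison are NA,
# pairwise_comparison() now returns NULL (see the change above), and
# summarise_scores() simply skips merging the relative skill columns
scores <- score(example_quantile)
scores[, interval_score := NA_real_]  # force the failure mode for illustration
pairwise_comparison(scores, by = "model", metric = "interval_score")
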
diff --git a/R/utils.R b/R/utils.R
index 125a81493..ba0d23e76 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -49,7 +49,7 @@ globalVariables(c(
"mean_scores_ratio",
"metric",
"metrics_select",
- "metrics_summary",
+ "metrics",
"model",
"n_obs",
"n_obs wis_component_name",
@@ -88,7 +88,7 @@ globalVariables(c(
#' @keywords info
available_metrics <- function() {
- return(unique(metrics_summary$Name))
+ return(unique(metrics$Name))
}
#' @title Simple permutation test
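
A small sketch of the renamed data set in use:

library(scoringutils)

# `metrics` (formerly `metrics_summary`) documents all implemented metrics;
# available_metrics() returns the unique values of its Name column
head(metrics$Name)
available_metrics()
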
diff --git a/README.md b/README.md
index 234909f6d..b09a61462 100644
--- a/README.md
+++ b/README.md
@@ -110,19 +110,17 @@ example_quantile %>%
kable()
#> The following messages were produced when checking inputs:
#> 1. Some values for `prediction` are NA in the data provided and the corresponding rows were removed. This may indicate a problem if unexpected.
-#> Warning in any(sign(scores[[metric]] < 0)): coercing argument of type 'double'
-#> to logical
```
| model | target_type | interval_score | dispersion | underprediction | overprediction | coverage_deviation | bias | ae_median | coverage_50 | coverage_90 | relative_skill | scaled_rel_skill |
|:----------------------|:------------|---------------:|-----------:|----------------:|---------------:|-------------------:|--------:|----------:|------------:|------------:|---------------:|-----------------:|
-| EuroCOVIDhub-baseline | Cases | 28000 | 4100 | 10000.0 | 14000.0 | -0.110 | 0.0980 | 38000 | 0.33 | 0.82 | 1.20 | 1.6 |
-| EuroCOVIDhub-baseline | Deaths | 160 | 91 | 2.1 | 66.0 | 0.120 | 0.3400 | 230 | 0.66 | 1.00 | 1.90 | 3.8 |
-| EuroCOVIDhub-ensemble | Cases | 18000 | 3700 | 4200.0 | 10000.0 | -0.098 | -0.0560 | 24000 | 0.39 | 0.80 | 0.74 | 1.0 |
-| EuroCOVIDhub-ensemble | Deaths | 41 | 30 | 4.1 | 7.1 | 0.200 | 0.0730 | 53 | 0.88 | 1.00 | 0.50 | 1.0 |
-| UMass-MechBayes | Deaths | 53 | 27 | 17.0 | 9.0 | -0.023 | -0.0220 | 78 | 0.46 | 0.88 | 0.63 | 1.2 |
-| epiforecasts-EpiNow2 | Cases | 21000 | 5700 | 3300.0 | 12000.0 | -0.067 | -0.0790 | 28000 | 0.47 | 0.79 | 0.86 | 1.2 |
-| epiforecasts-EpiNow2 | Deaths | 67 | 32 | 16.0 | 19.0 | -0.043 | -0.0051 | 100 | 0.42 | 0.91 | 0.83 | 1.6 |
+| EuroCOVIDhub-baseline | Cases | 28000 | 4100 | 10000.0 | 14000.0 | -0.110 | 0.0980 | 38000 | 0.33 | 0.82 | 1.30 | 1.6 |
+| EuroCOVIDhub-baseline | Deaths | 160 | 91 | 2.1 | 66.0 | 0.120 | 0.3400 | 230 | 0.66 | 1.00 | 2.30 | 3.8 |
+| EuroCOVIDhub-ensemble | Cases | 18000 | 3700 | 4200.0 | 10000.0 | -0.098 | -0.0560 | 24000 | 0.39 | 0.80 | 0.82 | 1.0 |
+| EuroCOVIDhub-ensemble | Deaths | 41 | 30 | 4.1 | 7.1 | 0.200 | 0.0730 | 53 | 0.88 | 1.00 | 0.60 | 1.0 |
+| UMass-MechBayes | Deaths | 53 | 27 | 17.0 | 9.0 | -0.023 | -0.0220 | 78 | 0.46 | 0.88 | 0.75 | 1.3 |
+| epiforecasts-EpiNow2 | Cases | 21000 | 5700 | 3300.0 | 12000.0 | -0.067 | -0.0790 | 28000 | 0.47 | 0.79 | 0.95 | 1.2 |
+| epiforecasts-EpiNow2 | Deaths | 67 | 32 | 16.0 | 19.0 | -0.043 | -0.0051 | 100 | 0.42 | 0.91 | 0.98 | 1.6 |
`scoringutils` contains additional functionality to summarise these
scores at different levels, to visualise them, and to explore the
diff --git a/data/example_point.rda b/data/example_point.rda
new file mode 100644
index 000000000..766ce79db
Binary files /dev/null and b/data/example_point.rda differ
diff --git a/data/metrics.rda b/data/metrics.rda
new file mode 100644
index 000000000..94d5fb67d
Binary files /dev/null and b/data/metrics.rda differ
diff --git a/data/metrics_summary.rda b/data/metrics_summary.rda
deleted file mode 100644
index cc96c38b9..000000000
Binary files a/data/metrics_summary.rda and /dev/null differ
diff --git a/inst/create_example_data.R b/inst/create-example-data.R
similarity index 60%
rename from inst/create_example_data.R
rename to inst/create-example-data.R
index 77734c527..562fbe804 100644
--- a/inst/create_example_data.R
+++ b/inst/create-example-data.R
@@ -2,7 +2,7 @@ library(data.table)
library(dplyr)
library(devtools)
library(here)
-library(covidHubUtils) # devtools::install_github("reichlab/covidHubUtils")
+library(covidHubUtils) # devtools::install_github("reichlab/covidHubUtils") #nolint
library(purrr)
library(data.table)
library(stringr)
@@ -12,28 +12,33 @@ library(scoringutils)
# download data from the European Forecast Hub Github Repository using
# subversion. You can also download the folders manually instead.
-system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/EuroCOVIDhub-ensemble")
-system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/EuroCOVIDhub-baseline")
-system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/UMass-MechBayes")
-system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/epiforecasts-EpiNow2")
+system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/EuroCOVIDhub-ensemble") # nolint
+system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/EuroCOVIDhub-baseline") # nolint
+system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/UMass-MechBayes") # nolint
+system("svn checkout https://github.com/epiforecasts/covid19-forecast-hub-europe/trunk/data-processed/epiforecasts-EpiNow2") # nolint
# load truth data using the covidHubutils package ------------------------------
truth <- covidHubUtils::load_truth(hub = "ECDC") |>
filter(target_variable %in% c("inc case", "inc death")) |>
mutate(target_variable = ifelse(target_variable == "inc case",
- "Cases", "Deaths")) |>
- rename(target_type = target_variable,
- true_value = value) |>
+ "Cases", "Deaths"
+ )) |>
+ rename(
+ target_type = target_variable,
+ true_value = value
+ ) |>
select(-model)
# get the correct file paths to all forecasts ----------------------------------
folders <- here(c("EuroCOVIDhub-ensemble", "EuroCOVIDhub-baseline", "UMass-MechBayes", "epiforecasts-EpiNow2"))
file_paths <- purrr::map(folders,
- .f = function(folder) {
- files <- list.files(folder)
- out <- here::here(folder, files)
- return(out)}) %>%
+ .f = function(folder) {
+ files <- list.files(folder)
+ out <- here::here(folder, files)
+ return(out)
+ }
+) %>%
unlist()
file_paths <- file_paths[grepl(".csv", file_paths)]
@@ -47,42 +52,55 @@ get_model_name <- function(file_path) {
# load forecasts
prediction_data <- map_dfr(file_paths,
- .f = function(file_path) {
- data <- fread(file_path)
- data[, `:=`(
- target_end_date = as.Date(target_end_date),
- quantile = as.numeric(quantile),
- forecast_date = as.Date(forecast_date),
- model = get_model_name(file_path)
- )]
- return(data)
- }) %>%
+ .f = function(file_path) {
+ data <- fread(file_path)
+ data[, `:=`(
+ target_end_date = as.Date(target_end_date),
+ quantile = as.numeric(quantile),
+ forecast_date = as.Date(forecast_date),
+ model = get_model_name(file_path)
+ )]
+ return(data)
+ }
+) %>%
filter(grepl("case", target) | grepl("death", target)) %>%
- mutate(target_type = ifelse(grepl("death", target),
- "Deaths", "Cases"),
- horizon = as.numeric(substr(target, 1, 1))) %>%
+ mutate(
+ target_type = ifelse(grepl("death", target),
+ "Deaths", "Cases"
+ ),
+ horizon = as.numeric(substr(target, 1, 1))
+ ) %>%
rename(prediction = value) %>%
- filter(type == "quantile",
- grepl("inc", target)) %>%
- select(location, forecast_date, quantile, prediction,
- model, target_end_date, target, target_type, horizon)
+ filter(
+ type == "quantile",
+ grepl("inc", target)
+ ) %>%
+ select(
+ location, forecast_date, quantile, prediction,
+ model, target_end_date, target, target_type, horizon
+ )
# harmonise forecast dates to be the date a submission was made
hub_data <- mutate(prediction_data,
- forecast_date = calc_submission_due_date(forecast_date))
+ forecast_date = calc_submission_due_date(forecast_date)
+)
hub_data <- hub_data |>
- filter(horizon <= 3,
- forecast_date > "2021-05-01",
- forecast_date < "2021-07-15",
- # quantile %in% c(seq(0.05, 0.45, 0.1), 0.5, seq(0.55, 0.95, 0.1)),
- location %in% c("DE", "GB", "FR", "IT")) |>
+ filter(
+ horizon <= 3,
+ forecast_date > "2021-05-01",
+ forecast_date < "2021-07-15",
+ # quantile %in% c(seq(0.05, 0.45, 0.1), 0.5, seq(0.55, 0.95, 0.1)),
+ location %in% c("DE", "GB", "FR", "IT")
+ ) |>
select(-target)
truth <- truth |>
- filter(target_end_date > "2021-01-01",
- target_end_date < max(hub_data$target_end_date),
- location %in% c("DE", "GB", "FR", "IT")) |>
+ filter(
+ target_end_date > "2021-01-01",
+ target_end_date < max(hub_data$target_end_date),
+ location %in% c("DE", "GB", "FR", "IT")
+ ) |>
select(-population)
# save example data with forecasts only
@@ -98,6 +116,13 @@ data.table::setDT(example_quantile)
# make model a character instead of a factor
usethis::use_data(example_quantile, overwrite = TRUE)
+
+# create data with point forecasts ---------------------------------------------
+example_point <- data.table::copy(example_quantile)
+example_point[quantile == 0.5, quantile := NA_real_]
+usethis::use_data(example_point, overwrite = TRUE)
+
+
# get continuous sample data ---------------------------------------------------
# define gamma function
fn_gamma <- function(par, x) {
@@ -109,7 +134,6 @@ fn_gamma <- function(par, x) {
# define function to fit gamma
fit_gamma <- function(values, quantiles, init) {
-
x <- values
names(x) <- quantiles
@@ -117,9 +141,11 @@ fit_gamma <- function(values, quantiles, init) {
init <- c(shape = 1, rate = 1)
}
- res <- nloptr::sbplx(x0 = init, fn = fn_gamma, x = x,
- lower = c(shape = 0, rate = 0),
- control = list(xtol_rel = 1.0e-6, ftol_rel = 1.0e-6))
+ res <- nloptr::sbplx(
+ x0 = init, fn = fn_gamma, x = x,
+ lower = c(shape = 0, rate = 0),
+ control = list(xtol_rel = 1.0e-6, ftol_rel = 1.0e-6)
+ )
sol <- res$par
names(sol) <- names(init)
@@ -138,14 +164,21 @@ get_samples <- function(values, quantiles, n_samples = 1000) {
# calculate samples
setDT(example_quantile)
n_samples <- 40
-example_continuous <- example_quantile[, .(prediction = get_samples(prediction,
- quantile,
- n_samples = n_samples),
- sample = 1:n_samples,
- true_value = unique(true_value)),
- by = c("location", "location_name",
- "target_end_date", "target_type",
- "forecast_date", "model", "horizon")]
+example_continuous <- example_quantile[, .(
+ prediction = get_samples(
+ prediction,
+ quantile,
+ n_samples = n_samples
+ ),
+ sample = 1:n_samples,
+ true_value = unique(true_value)
+),
+by = c(
+ "location", "location_name",
+ "target_end_date", "target_type",
+ "forecast_date", "model", "horizon"
+)
+]
# remove unnecessary rows where no predictions are available
example_continuous[is.na(prediction), sample := NA]
example_continuous <- unique(example_continuous)
@@ -170,23 +203,28 @@ usethis::use_data(example_integer, overwrite = TRUE)
example_binary <- data.table::copy(example_continuous)
# store grouping variable
-by = c("location", "target_end_date", "target_type",
- "forecast_date", "model", "horizon")
+by <- c(
+ "location", "target_end_date", "target_type",
+ "forecast_date", "model", "horizon"
+)
# calculate mean value
example_binary[, mean_val := mean(prediction),
- by = by]
+ by = by
+]
# calculate binary prediction as percentage above mean
example_binary[, prediction := mean(prediction > mean_val),
- by = by]
+ by = by
+]
# calculate true value as whether or not observed was above mean
example_binary[, true_value := true_value > mean_val]
# delete unnecessary columns and take unique values
-example_binary[, `:=`(sample = NULL, mean_val = NULL,
- true_value = as.numeric(true_value))]
+example_binary[, `:=`(
+ sample = NULL, mean_val = NULL,
+ true_value = as.numeric(true_value)
+)]
example_binary <- unique(example_binary)
usethis::use_data(example_binary, overwrite = TRUE)
-
diff --git a/inst/tables-metric-overview.R b/inst/create-metric-tables.R
similarity index 94%
rename from inst/tables-metric-overview.R
rename to inst/create-metric-tables.R
index 656150df3..53262d413 100644
--- a/inst/tables-metric-overview.R
+++ b/inst/create-metric-tables.R
@@ -1,31 +1,7 @@
-#------------------------------------------------------------------------------#
-#------------------ Overview of the different forecast types ------------------#
-#------------------------------------------------------------------------------#
library(data.table)
-point_forecast <- list(
- `Forecast type` = c("Point forecast"),
- `Target type` = c("continuous \n discrete \n binary"),
- `Representation of the predictive distribution` = c("one single number for the predicted outcome")
-)
-
-
-probabilistic_forecast <- list(
- `Forecast type` = c("Probabilistic forecast", "Probabilistic forecast"),
- `Target type` = c("continuous \n discrete",
- "binary"),
- `Representation of the predictive distribution` = c(
- "predictive samples \n quantiles \n closed analytical form",
- "binary probabilities"
- )
-)
-
-data <- rbind(as.data.table(point_forecast),
- as.data.table(probabilistic_forecast))
-
-saveRDS(data, "inst/metrics-overview/forecast-types.Rda")
#------------------------------------------------------------------------------#
-#----------------- Overview with applicability of the metrics -----------------#
+#---------------------- Metrics Summary and Overview --------------------------#
#------------------------------------------------------------------------------#
ae <- list(
@@ -257,21 +233,16 @@ data[, References := NULL]
setnames(data, old = c("Properties"),
new = c("Info"))
-metrics_summary <- data[, lapply(.SD, FUN = function(x) {
+metrics <- data[, lapply(.SD, FUN = function(x) {
x <- gsub("$\\checkmark$", '+', x, fixed = TRUE)
x <- gsub("$-$", '-', x, fixed = TRUE)
x <- gsub("$\\sim$", '~', x, fixed = TRUE)
return(x)
})]
-setnames(metrics_summary, old = c("D", "C", "B", "Q"),
+setnames(metrics, old = c("D", "C", "B", "Q"),
new = c("Discrete", "Continuous", "Binary", "Quantile"))
-usethis::use_data(metrics_summary, overwrite = TRUE)
-
-
-# save for manuscript
-data[, c("Name", "Functions") := NULL]
-saveRDS(unique(data), file = "inst/metrics-overview/metrics-summary.Rda")
+usethis::use_data(metrics, overwrite = TRUE)
#------------------------------------------------------------------------------#
diff --git a/inst/manuscript/R/0-run-all.R b/inst/manuscript/R/0-run-all.R
new file mode 100644
index 000000000..00fb6ba1e
--- /dev/null
+++ b/inst/manuscript/R/0-run-all.R
@@ -0,0 +1,31 @@
+rootpath <- "inst/manuscript/R/"
+
+# Table 1
+# in manuscript.Rmd
+
+# Figure 1, illustration of sharpness and calibration
+source(paste0(rootpath, "illustration-sharpness-calibration.R"))
+
+# Table 2
+# in manuscript.Rmd
+
+# Figure 2
+source(paste0(rootpath, "toy-example-calibration.R"))
+
+# Figure 3
+source(paste0(rootpath, "toy-example-convergence-outliers.R"))
+
+# Figure 4
+source(paste0(rootpath, "toy-example-locality.R"))
+
+# Figure 5
+source(paste0(rootpath, "illustration-relation-to-scale.R"))
+
+# Table 3
+# in manuscript.Rmd
+
+# Table 4
+# in manuscript.Rmd
+
+# All other Figures:
+# in manuscript.Rmd
diff --git a/inst/manuscript/R/illustration-relation-to-scale.R b/inst/manuscript/R/illustration-relation-to-scale.R
index 3bc0b70bd..d0e629533 100644
--- a/inst/manuscript/R/illustration-relation-to-scale.R
+++ b/inst/manuscript/R/illustration-relation-to-scale.R
@@ -1,75 +1,11 @@
-library(scoringutils)
-library(dplyr)
-library(ggplot2)
-library(data.table)
-
library(data.table)
library(dplyr)
library(scoringutils)
library(ggplot2)
library(tidyr)
library(patchwork)
-#
-# sizes_nbinom <- c(0.1, 1, 1e4)
-# n = 1000
-# mus <- c(1, 1e1, 1e2, 1e3, 1e4, 1e5)
-#
-# df <- expand.grid("mu" = mus,
-# "size" = sizes_nbinom)
-# setDT(df)
-#
-# df[, `Log score` := mean(scoringRules::logs(y = rnbinom(n, size = size, mu = mu),
-# family = "negative-binomial",
-# size = size, mu = mu)),
-# by = c("mu", "size")]
-# df[, DSS := mean(scoringRules::dss_nbinom(y = rnbinom(n, size = size, mu = mu),
-# size = size, mu = mu)),
-# by = c("mu", "size")]
-# df[, CRPS := mean(scoringRules::crps(y = rnbinom(n, size = size, mu = mu),
-# family = "negative-binomial",
-# size = size, mu = mu)),
-# by = c("mu", "size")]
-#
-#
-# df |>
-# melt(measure.vars = c("Log score", "DSS"),
-# variable.name = "Scoring rule",
-# value.name = "Score") |>
-# ggplot(aes(y = `Score`, x = mu, color = `Scoring rule`, group = `Scoring rule`)) +
-# geom_line() +
-# facet_wrap(~ size)
-#
-#
-#
-# make_plot <- function(scores, summary_fct = mean) {
-# p1 <- scores |>
-# group_by(state_size, scale, Theta) |>
-# summarise(interval_score = summary_fct(interval_score)) |>
-# group_by(Theta, scale) |>
-# mutate(interval_score = interval_score / mean(interval_score),
-# Theta = ifelse(Theta == "1e+09", "1b", Theta)) |>
-# ggplot(aes(y = interval_score, x = state_size, colour = Theta)) +
-# geom_point(size = 0.4) +
-# labs(y = "WIS", x = "Size of state") +
-# theme_minimal() +
-# facet_wrap(~ scale, scales = "free_y")
-#
-# p2 <- p1 +
-# scale_x_continuous(trans = "log10") +
-# scale_y_continuous(trans = "log10")
-#
-# p1 / p2
-# }
-#
-
-
-
-
-
-
## Real Data
-
ex <- example_continuous |>
filter(model == "EuroCOVIDhub-ensemble")
@@ -97,26 +33,14 @@ p_true <- df |>
theme_scoringutils() +
theme(legend.position = "bottom")
-ggsave("inst/manuscript/plots/illustration-effect-scale.png",
- width = 8, height = 3)
-
-
-
-
-
# ------------------------------------------------------------------------------
-# different illustration:
+# illustration:
# in this we see that the mean as well as the variance of the scores scale
# for crps, while the variance stays constant for dss and log score
-
-library(scoringutils)
-library(dplyr)
-library(data.table)
library(tidyr)
-library(ggplot2)
simulate <- function(n_samples = 5e3,
n_replicates = 1e3,
@@ -160,13 +84,16 @@ grid <- expand.grid(
) |>
setDT()
-res <- readRDS("inst/manuscript/plots/relation-to-scale-example.Rda")
-# res <- grid |>
-# rowwise() |>
-# mutate(simulation := list(simulate(scale_mean = scale_mean, scale_sd = scale_sd)))
-#
-# saveRDS(res, file = "inst/manuscript/plots/relation-to-scale-example.Rda")
+if (!file.exists("inst/manuscript/output/relation-to-scale-example.Rda")) {
+ res <- grid |>
+ rowwise() |>
+ mutate(simulation := list(simulate(scale_mean = scale_mean, scale_sd = scale_sd)))
+
+ saveRDS(res, file = "inst/manuscript/output/relation-to-scale-example.Rda")
+} else {
+ res <- readRDS("inst/manuscript/output/relation-to-scale-example.Rda")
+}
df <- res |>
tidyr::unnest(cols = "simulation")
@@ -179,7 +106,6 @@ df <- df |>
"DSS",
ifelse(`Scoring rule` == "crps", "CRPS", "Log score")))
-
p1 <- df |>
filter(scale_mean == 1,
scale_sd < 20) |>
@@ -215,5 +141,5 @@ p2 + p1 + p_true +
theme(legend.position = "bottom") &
plot_annotation(tag_levels = 'A')
-ggsave("inst/manuscript/plots/illustration-effect-scale.png",
- height = 5, width = 8)
+ggsave("inst/manuscript/output/illustration-effect-scale.png",
+ height = 4.3, width = 8)
diff --git a/inst/manuscript/R/illustration-sharpness-calibration.R b/inst/manuscript/R/illustration-sharpness-calibration.R
index 3b449b5ff..dc9281205 100644
--- a/inst/manuscript/R/illustration-sharpness-calibration.R
+++ b/inst/manuscript/R/illustration-sharpness-calibration.R
@@ -27,11 +27,6 @@ p2 <-
ggtitle("Less sharp") +
theme_scoringutils()
-p1 + p2
-
-ggsave("inst/manuscript/plots/sharpness-illustration.png",
- width = 10, height = 4)
-
p21 <- ggplot(data.frame(x = seq(-8, 8, 0.01),
x_example = rnorm(n = 1601, mean = 0, sd = 1.05)),
aes(x = x)) +
@@ -62,13 +57,9 @@ p23 <- ggplot(data.frame(x = seq(-8, 8, 0.01),
labs(y = "Density") +
theme_scoringutils()
-
-p21 + p22 + p23
-
-ggsave("inst/manuscript/plots/calibration-illustration.png",
- width = 10, height = 4)
-
(p1 + p2) /
- (p21 + p22 + p23)
-ggsave("inst/manuscript/plots/calibration-sharpness-illustration.png",
- width = 8, height = 4.5)
+ (p21 + p22 + p23) &
+ plot_annotation(tag_levels = "A")
+
+ggsave("inst/manuscript/output/calibration-sharpness-illustration.png",
+ width = 8, height = 3.8)
diff --git a/inst/manuscript/R/toy-example-calibration.R b/inst/manuscript/R/toy-example-calibration.R
index 001efe22b..4bcc0cb07 100644
--- a/inst/manuscript/R/toy-example-calibration.R
+++ b/inst/manuscript/R/toy-example-calibration.R
@@ -5,32 +5,46 @@ library(data.table)
library(dplyr)
# generate predictions data.table
-n_truth = 1000
-n_samples = 1000
-predictions <- rnorm(n_truth * n_samples, 0, 1)
-true_values1 <- rnorm(n_samples)
-true_values2 <- rnorm(n_samples, mean = 0.5)
-true_values3 <- rnorm(n_samples, sd = 1.4)
-true_values4 <- rnorm(n_samples, sd = 0.7)
-
-df <- data.table(prediction = rep(predictions, each = 4),
+n_truth = 2000
+n_samples = 2000
+truth <- rnorm(n_truth, mean = 0, sd = 1)
+predictions1 <- rnorm(n_truth * n_samples, mean = 0, sd = 1)
+predictions2 <- rnorm(n_truth * n_samples, mean = 0.5, sd = 1)
+predictions3 <- rnorm(n_truth * n_samples, mean = 0, sd = 2)
+predictions4 <- rnorm(n_truth * n_samples, mean = 0, sd = 0.5)
+
+df <- data.table(true_value = rep(truth, each = n_samples),
id = rep(1:n_truth, each = n_samples),
- true_value = rep(c(true_values1, true_values2,
- true_values3, true_values4), each = n_samples),
+ prediction = c(predictions1, predictions2,
+ predictions3, predictions4),
sample = 1:n_samples,
- `true_distr` = rep(c("Truth: N(0, 1)", "Truth: N(0.5, 1)",
- "Truth: N(0, 1.4)", "Truth: N(0, 0.7)"),
- each = n_truth * n_samples))
+ `model` = rep(c("Pred: N(0, 1)", "Pred: N(0.5, 1)",
+ "Pred: N(0, 2)", "Pred: N(0, 0.5)"),
+ each = n_truth * n_samples))
-df[, true_distr := factor(`true_distr`,
- levels = c("Truth: N(0, 1)", "Truth: N(0.5, 1)",
- "Truth: N(0, 1.4)", "Truth: N(0, 0.7)"))]
+df[, model := factor(`model`,
+ levels = c("Pred: N(0, 1)", "Pred: N(0.5, 1)",
+ "Pred: N(0, 2)", "Pred: N(0, 0.5)"))]
-# obtain scores and create a table based on scores -----------------------------
-res <- score(df)
-res_summarised <- summarise_scores(res, by = c("true_distr"))
+if (!file.exists("inst/manuscript/output/calibration-diagnostic-examples.Rda")) {
+ res <- score(df)
+ pit <- pit(df, by = "model")
-scores_table_plot <- plot_score_table(res_summarised, y = "true_distr") +
+ stored <- list(res = res,
+ pit = pit)
+
+ saveRDS(stored, "inst/manuscript/output/calibration-diagnostic-examples.Rda")
+
+} else {
+
+ stored <- readRDS("inst/manuscript/output/calibration-diagnostic-examples.Rda")
+}
+
+res_summarised <- summarise_scores(stored$res, by = c("model"))
+
+scores_table_plot <- summarise_scores(res_summarised, fun = signif, digits = 2) |>
+ select(-mad) |>
+ plot_score_table(y = "model") +
coord_flip() +
theme_scoringutils() +
theme(axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5)) +
@@ -40,19 +54,19 @@ scores_table_plot <- plot_score_table(res_summarised, y = "true_distr") +
# create histogram true vs. predicted ------------------------------------------
pred_hist <- df |>
ggplot(aes(x = true_value)) +
- facet_wrap(~ true_distr, nrow = 1) +
+ facet_wrap(~ model, nrow = 1) +
geom_histogram(aes(y=..density..),
fill = "grey",
colour = "dark grey") +
- geom_function(fun = dnorm, colour = "black") +
+ geom_density(aes(y=..density.., x = prediction),
+ colour = "black") +
theme_scoringutils() +
labs(y = "Density", x = "Value")
# create pit plots -------------------------------------------------------------
-pit <- pit(df, by = "true_distr")
-pit_plots <- plot_pit(pit) +
- facet_wrap(~ true_distr, nrow = 1) +
+pit_plots <- plot_pit(stored$pit) +
+ facet_wrap(~ model, nrow = 1) +
theme_scoringutils()
# create interval and quantile coverage plots ----------------------------------
@@ -63,16 +77,16 @@ df_quantile <- sample_to_quantile(df,
res_quantile <- score(df_quantile)
res_quantile <- summarise_scores(res_quantile,
- by = c("true_distr", "range", "quantile"))
+ by = c("model", "range", "quantile"))
-res_quantile[, true_distr := factor(true_distr,
- levels = c("Truth: N(0, 1)", "Truth: N(0.5, 1)",
- "Truth: N(0, 1.4)", "Truth: N(0, 0.7)"))]
+res_quantile[, model := factor(model,
+ levels = c("Pred: N(0, 1)", "Pred: N(0.5, 1)",
+ "Pred: N(0, 2)", "Pred: N(0, 0.5)"))]
-res_quantile[, model := true_distr]
+res_quantile[, model := model]
interval_coverage <- plot_interval_coverage(res_quantile) +
- facet_wrap(~ true_distr, nrow = 1) +
+ facet_wrap(~ model, nrow = 1) +
theme_scoringutils()
quantile_coverage <- plot_quantile_coverage(res_quantile) +
@@ -88,58 +102,8 @@ p <- pred_hist /
scores_table_plot +
plot_layout(guides = 'collect') &
theme(legend.position = "none") &
- theme(panel.spacing = unit(2, "lines"))
-
-ggsave("inst/manuscript/plots/calibration-diagnostic-examples.png", width = 11.5, height = 11)
-
-
-
-
-
-
-
-
-
-
-
-
+ theme(panel.spacing = unit(2, "lines")) &
+ plot_annotation(tag_levels = "A")
+p
-#
-# # plot with observations
-# true_value_plot <- ggplot(data = data.frame(x = true_values),
-# aes(x = x)) +
-# geom_histogram(aes(y = ..density..),
-# fill = "grey",
-# colour = "dark grey") +
-# theme_minimal() +
-# labs(x = "True values",
-# y = "Density") +
-# theme(legend.position = "bottom")
-#
-# # plot with standard normal distribution
-# standard_normal <- true_value_plot +
-# geom_function(fun = dnorm, colour = "black") +
-# ggtitle("Normal(0, 1)")
-#
-# # plot with shifted mean
-# shifted_mean <- true_value_plot +
-# geom_function(fun = dnorm, colour = "black", args = list(mean = 0.5)) +
-# ggtitle("Normal(0.5, 1)")
-#
-# # plot with overdispersion
-# overdispersion <- true_value_plot +
-# geom_function(fun = dnorm, colour = "black", args = list(sd = 1.4)) +
-# ggtitle("Normal(0, 1.4)")
-#
-# # plot with underdispersion
-# underdispersion <- true_value_plot +
-# geom_function(fun = dnorm, colour = "black", args = list(sd = 0.7)) +
-# ggtitle("Normal(0, 0.7)")
-#
-# (standard_normal | shifted_mean | overdispersion | underdispersion) /
-# pit_plots /
-# interval_coverage /
-# quantile_coverage
-# # /
-# # gridExtra::tableGrob(scores_table)
-#
+ggsave("inst/manuscript/output/calibration-diagnostic-examples.png", width = 11.5, height = 11)
diff --git a/inst/manuscript/R/toy-example-convergence-outliers.R b/inst/manuscript/R/toy-example-convergence-outliers.R
new file mode 100644
index 000000000..1601691f8
--- /dev/null
+++ b/inst/manuscript/R/toy-example-convergence-outliers.R
@@ -0,0 +1,165 @@
+library(data.table)
+library(scoringutils)
+library(ggplot2)
+
+# ==============================================================================
+# =================== Convergence of scores ====================================
+# ==============================================================================
+
+sample_sizes <- seq(50, 5000, 50)
+sample_sizes <- round(1 * 10^(seq(1, 5, 0.1)))
+n_rep <- 500
+
+true_value = 0
+sd <- 3
+mu <- 2
+
+# analytical scores
+true_crps <- scoringRules::crps(y = 0, family = "normal", mean = mu, sd = sd)
+true_logs <- scoringRules::logs(y = 0, family = "normal", mean = mu, sd = sd)
+true_dss <- scoringRules::dss_norm(y = 0, mean = mu, sd = sd)
+
+if (!file.exists("inst/manuscript/output/sample-convergence.Rda")) {
+ results <- list()
+ for (i in sample_sizes) {
+ samples <- as.data.table(
+ replicate(n_rep,
+ rnorm(n = i, mean = mu, sd = sd))
+ )
+ setnames(samples, as.character(1:n_rep))
+ samples[, sample := 1:i]
+ samples <- melt(samples, id.vars = "sample",
+ variable.name = "repetition",
+ value.name = "prediction")
+ samples[, true_value := true_value]
+ results[[paste(i)]] <- score(
+ samples, metrics = c("crps", "log_score", "dss")
+ )[, n_samples := i]
+ }
+ saveRDS(results, "inst/manuscript/output/sample-convergence.Rda")
+} else {
+ results <- readRDS("inst/manuscript/output/sample-convergence.Rda")
+}
+
+results <- rbindlist(results)
+results <- melt(results, id.vars = c("n_samples", "repetition", "model"),
+ variable.name = "score")
+
+label_fn <- function(x) {
+ ifelse (x >= 1000,
+ paste0(x / 1000, "k"),
+ x)
+}
+
+df <- results[, .(mean = mean(value),
+ quantile_0.05 = quantile(value, 0.05),
+ quantile_0.25 = quantile(value, 0.25),
+ quantile_0.75 = quantile(value, 0.75),
+ quantile_0.95 = quantile(value, 0.95)),
+ by = c("n_samples", "score")]
+df[score == "crps", true_score := true_crps]
+df[score == "log_score", true_score := true_logs]
+df[score == "dss", true_score := true_dss]
+
+df[, score := ifelse(score == "dss", "DSS",
+ ifelse(score == "crps", "CRPS",
+ "Log score"))]
+
+p_convergence <- ggplot(df, aes(x = n_samples)) +
+ geom_line(aes(y = mean)) +
+ geom_ribbon(aes(ymax = quantile_0.95, ymin = quantile_0.05),
+ alpha = 0.1) +
+ geom_ribbon(aes(ymax = quantile_0.75, ymin = quantile_0.25),
+ alpha = 0.3) +
+ geom_hline(aes(yintercept = true_score), linetype = "dashed") +
+ facet_wrap(~ score, scales = "free") +
+ scale_x_continuous(trans = "log10", labels = label_fn) +
+ theme_scoringutils() +
+ labs(x = "Number of samples",
+ y = "Score based on samples")
+
+
+# ==============================================================================
+# =================== scores and outliers, sd ==================================
+# ==============================================================================
+
+library(scoringRules)
+library(dplyr)
+library(patchwork)
+
+# define simulation parameters
+n_steps = 500
+n_rep <- 5000
+true_mean = 0
+true_sd = 5
+true_values <- rnorm(n = n_rep, mean = true_mean, sd = true_sd)
+sd <- 10^(seq(-1, 1.6, length.out = n_steps))
+mu <- seq(0, 100, length.out = n_steps)
+
+
+# look at effect of change in sd on score
+res_sd <- data.table(sd = sd,
+ mu = true_mean)
+
+res_sd[, `:=` (CRPS = mean(scoringRules::crps(y = true_values, family = "normal", mean = mu, sd = sd)),
+ `Log score` = mean(scoringRules::logs(y = true_values, family = "normal", mean = mu, sd = sd)),
+ DSS = mean(scoringRules::dss_norm(y = true_values, mean = mu, sd = sd))),
+ by = "sd"]
+
+deviation_sd <- res_sd |>
+ melt(id.vars = c("sd", "mu"), value.name = "value", variable.name = "Score") |>
+ ggplot(aes(x = sd, y = value, color = Score)) +
+ geom_line() +
+ theme_scoringutils() +
+ geom_vline(aes(xintercept = 5), linetype = "dashed") +
+ coord_cartesian(ylim=c(0, 20)) +
+ annotate(geom="text", x=6, y=12, label="Sd of true \ndata-generating \ndistribution: 5",
+ color="black", hjust = "left", size = 3) +
+ labs(y = "Score", x = "Standard deviation of predictive distribution")
+
+
+
+# define simulation parameters
+true_values <- seq(0, 4, length.out = 1000)
+true_sd = 1
+true_mu = 0
+
+# look at effect of change in sd on score
+res_mu2 <- data.table(true_value = true_values)
+
+res_mu2[, `:=` (CRPS = scoringRules::crps(y = true_value, family = "normal", mean = true_mu, sd = true_sd) / 10,
+ `Log score` = scoringRules::logs(y = true_value, family = "normal", mean = true_mu, sd = true_sd) / 10,
+ DSS = scoringRules::dss_norm(y = true_value, mean = true_mu, sd = true_sd) / 10)]
+
+label_fn <- function(x) {
+ paste(10*x)
+}
+
+outlier <- res_mu2 |>
+ melt(id.vars = c("true_value"), value.name = "value", variable.name = "Score") |>
+ ggplot(aes(x = true_value, y = value, color = Score)) +
+ geom_line() +
+ theme_scoringutils() +
+ annotate(geom="text", x=0, y=.8, label="Predictive distribution: \nN(0,1)",
+ color="black", hjust = "left", size = 3) +
+ labs(y = "Score", x = "Observed value") +
+ # geom_vline(aes(xintercept = 0), linetype = "dashed") +
+ geom_area(stat = "function", fun = dnorm, color = "grey", fill = "grey", alpha = 0.5, xlim = c(0, 4)) +
+ scale_y_continuous(label = label_fn)
+
+
+layout <- "
+AA
+BC
+"
+
+plot <- p_convergence /
+deviation_sd + outlier +
+ plot_layout(guides = "collect",
+ design = layout) &
+ plot_annotation(tag_levels = "A") &
+ theme(legend.position = "bottom")
+
+ggsave("inst/manuscript/output/score-convergence-outliers.png", plot = plot,
+ height = 6, width = 8)
+
diff --git a/inst/manuscript/R/toy-example-locality.R b/inst/manuscript/R/toy-example-locality.R
index 4d57b6a7c..584fcd1c0 100644
--- a/inst/manuscript/R/toy-example-locality.R
+++ b/inst/manuscript/R/toy-example-locality.R
@@ -39,23 +39,15 @@ annotation <- df[, .(forecaster, crps, log_score, dss)] |> unique()
ggplot(df, aes(x = factor(outcome), y = prob)) +
geom_col() +
- geom_text(data = annotation, x = 4, y = 0.3, hjust = "left",
- aes(label = paste("CRPS: ", round(crps, 3)))) +
- geom_text(data = annotation,x = 4, y = 0.27, hjust = "left",
- aes(label = paste("Log score: ", round(log_score, 3)))) +
- geom_text(data = annotation, x = 4, y = 0.24, hjust = "left",
- aes(label = paste("DSS: ", round(dss, 3)))) +
+ geom_text(data = annotation, x = 4, y = 0.3, hjust = "left", size = 3,
+ aes(label = paste("CRPS: ", round(crps, 2)))) +
+ geom_text(data = annotation,x = 4, y = 0.27, hjust = "left", size = 3,
+ aes(label = paste("Log score: ", round(log_score, 2)))) +
+ geom_text(data = annotation, x = 4, y = 0.24, hjust = "left", size = 3,
+ aes(label = paste("DSS: ", round(dss, 2)))) +
facet_wrap(~ forecaster) +
geom_vline(aes(xintercept = 2), linetype = "dashed") +
theme_scoringutils() +
labs(y = "Probability assigned", x = "Possible outcomes")
-ggsave("inst/manuscript/plots/score-locality.png", height = 3, width = 8)
-
-
-# test with WIS. Problem: that doesn't work at the moment as intervals are
-# not symmetric
-# dt <- copy(df)
-# dt[, quantile := cumsum(prob_b)]
-# dt[, prediction := outcome]
-# score(dt[, .(true_value, prediction, quantile)])
+ggsave("inst/manuscript/output/score-locality.png", height = 3, width = 8)
diff --git a/inst/manuscript/R/toy-example-mean-sd-deviation.R b/inst/manuscript/R/toy-example-mean-sd-deviation.R
deleted file mode 100644
index d11aaed03..000000000
--- a/inst/manuscript/R/toy-example-mean-sd-deviation.R
+++ /dev/null
@@ -1,96 +0,0 @@
-library(data.table)
-library(scoringutils)
-library(ggplot2)
-library(scoringRules)
-library(dplyr)
-library(patchwork)
-
-# define simulation parameters
-n_steps = 500
-n_rep <- 5000
-true_mean = 0
-true_sd = 5
-true_values <- rnorm(n = n_rep, mean = true_mean, sd = true_sd)
-sd <- 10^(seq(-1, 1.6, length.out = n_steps))
-mu <- seq(0, 100, length.out = n_steps)
-
-
-# look at effect of change in sd on score
-res_sd <- data.table(sd = sd,
- mu = true_mean)
-
-res_sd[, `:=` (CRPS = mean(scoringRules::crps(y = true_values, family = "normal", mean = mu, sd = sd)),
- `Log score` = mean(scoringRules::logs(y = true_values, family = "normal", mean = mu, sd = sd)),
- DSS = mean(scoringRules::dss_norm(y = true_values, mean = mu, sd = sd))),
- by = "sd"]
-
-deviation_sd <- res_sd |>
- melt(id.vars = c("sd", "mu"), value.name = "value", variable.name = "Score") |>
- ggplot(aes(x = sd, y = value, color = Score)) +
- geom_line() +
- theme_scoringutils() +
- geom_vline(aes(xintercept = 5), linetype = "dashed") +
- coord_cartesian(ylim=c(0, 20)) +
- annotate(geom="text", x=6, y=17, label="Sd of true \ndata-generating \ndistribution: 5",
- color="black", hjust = "left") +
- labs(y = "Score", x = "Standard deviation of predictive distribution")
-
-#
-# # look at effect of change in mean on score
-# res_mu <- data.table(sd = true_sd,
-# mu = mu,
-# crps = NA_real_,
-# dss = NA_real_,
-# logs = NA_real_)
-#
-# res_mu[, `:=` (crps = mean(crps_sample(y = true_values, family = "normal", mean = mu, sd = sd)),
-# logs = mean(logs_sample(y = true_values, family = "normal", mean = mu, sd = sd)),
-# dss = mean(dss_norm(y = true_values, mean = mu, sd = sd))),
-# by = "mu"]
-#
-# deviation_mu <- res_mu |>
-# melt(id.vars = c("sd", "mu"), value.name = "value", variable.name = "Score") |>
-# ggplot(aes(x = mu, y = value, color = Score)) +
-# geom_line() +
-# theme_minimal() +
-# labs(y = "Score", x = "Mean of predictive distribution") +
-# geom_vline(aes(xintercept = 0), linetype = "dashed") +
-# coord_cartesian(ylim=c(0, 150))
-
-
-
-# define simulation parameters
-true_values <- seq(0, 4, length.out = 1000)
-true_sd = 1
-true_mu = 0
-
-# look at effect of change in sd on score
-res_mu2 <- data.table(true_value = true_values)
-
-res_mu2[, `:=` (CRPS = scoringRules::crps(y = true_value, family = "normal", mean = true_mu, sd = true_sd) / 10,
- `Log score` = scoringRules::logs(y = true_value, family = "normal", mean = true_mu, sd = true_sd) / 10,
- DSS = scoringRules::dss_norm(y = true_value, mean = true_mu, sd = true_sd) / 10)]
-
-label_fn <- function(x) {
- paste(10*x)
-}
-
-outlier <- res_mu2 |>
- melt(id.vars = c("true_value"), value.name = "value", variable.name = "Score") |>
- ggplot(aes(x = true_value, y = value, color = Score)) +
- geom_line() +
- theme_scoringutils() +
- annotate(geom="text", x=0, y=.8, label="Predictive distribution: \nN(0,1)",
- color="black", hjust = "left") +
- labs(y = "Score", x = "Observed value") +
- # geom_vline(aes(xintercept = 0), linetype = "dashed") +
- geom_area(stat = "function", fun = dnorm, color = "grey", fill = "grey", alpha = 0.5, xlim = c(0, 4)) +
- scale_y_continuous(label = label_fn)
-
-
-deviation_sd + outlier +
- plot_layout(guides = "collect") &
- theme(legend.position = "bottom")
-
-ggsave("inst/manuscript/plots/score-deviation-sd-mu.png",
- height = 3, width = 8)
diff --git a/inst/manuscript/R/toy-example-sample-convergence.R b/inst/manuscript/R/toy-example-sample-convergence.R
deleted file mode 100644
index 6c8040d60..000000000
--- a/inst/manuscript/R/toy-example-sample-convergence.R
+++ /dev/null
@@ -1,75 +0,0 @@
-library(data.table)
-library(scoringutils)
-library(ggplot2)
-
-sample_sizes <- seq(50, 5000, 50)
-sample_sizes <- round(1 * 10^(seq(1, 5, 0.1)))
-n_rep <- 500
-
-true_value = 0
-sd <- 3
-mu <- 2
-
-# analytical scores
-true_crps <- scoringRules::crps(y = 0, family = "normal", mean = mu, sd = sd)
-true_logs <- scoringRules::logs(y = 0, family = "normal", mean = mu, sd = sd)
-true_dss <- scoringRules::dss_norm(y = 0, mean = mu, sd = sd)
-
-#
-# results <- list()
-# for (i in sample_sizes) {
-# samples <- as.data.table(
-# replicate(n_rep,
-# rnorm(n = i, mean = mu, sd = sd))
-# )
-# setnames(samples, as.character(1:n_rep))
-# samples[, sample := 1:i]
-# samples <- melt(samples, id.vars = "sample",
-# variable.name = "repetition",
-# value.name = "prediction")
-# samples[, true_value := true_value]
-# results[[paste(i)]] <- score(
-# samples, metrics = c("crps", "log_score", "dss")
-# )[, n_samples := i]
-# }
-# writeRDS(results2, "inst/manuscript/plots/sample-convergence.Rda")
-
-resuts2 <- readRDS("inst/manuscript/plots/sample-convergence.Rda")
-results2 <- rbindlist(results)
-results2 <- melt(results2, id.vars = c("n_samples", "repetition", "model"),
- variable.name = "score")
-
-label_fn <- function(x) {
- ifelse (x >= 1000,
- paste0(x / 1000, "k"),
- x)
-}
-
-df <- results2[, .(mean = mean(value),
- quantile_0.05 = quantile(value, 0.05),
- quantile_0.25 = quantile(value, 0.25),
- quantile_0.75 = quantile(value, 0.75),
- quantile_0.95 = quantile(value, 0.95)),
- by = c("n_samples", "score")]
-df[score == "crps", true_score := true_crps]
-df[score == "log_score", true_score := true_logs]
-df[score == "dss", true_score := true_dss]
-
-df[, score := ifelse(score == "dss", "DSS",
- ifelse(score == "crps", "CRPS",
- "Log score"))]
-
-ggplot(df, aes(x = n_samples)) +
- geom_line(aes(y = mean)) +
- geom_ribbon(aes(ymax = quantile_0.95, ymin = quantile_0.05),
- alpha = 0.1) +
- geom_ribbon(aes(ymax = quantile_0.75, ymin = quantile_0.25),
- alpha = 0.3) +
- geom_hline(aes(yintercept = true_score), linetype = "dashed") +
- facet_wrap(~ score, scales = "free") +
- scale_x_continuous(trans = "log10", labels = label_fn) +
- theme_scoringutils() +
- labs(x = "Number of samples",
- y = "Score based on samples")
-
-ggsave("inst/manuscript/plots/sample-convergence.png", height = 3, width = 8)
diff --git a/inst/manuscript/apa.csl b/inst/manuscript/apa.csl
deleted file mode 100644
index bbb7fdb73..000000000
--- a/inst/manuscript/apa.csl
+++ /dev/null
@@ -1,1914 +0,0 @@
-
-
diff --git a/inst/manuscript/manuscript.Rmd b/inst/manuscript/manuscript.Rmd
index 90bc7bc41..1a497b9ef 100644
--- a/inst/manuscript/manuscript.Rmd
+++ b/inst/manuscript/manuscript.Rmd
@@ -67,14 +67,14 @@ title:
# For running headers, if needed
short: "Evaluating Forecasts with \\pkg{scoringutils} in \\proglang{R}"
abstract: >
- Forecasts play an important role in a variety of fields. Their role in informing public policy has attracted increased attention from the general public with the emergence of the Covid-19 pandemic. Much theoretical work has been done on the development of proper scoring rules and other scoring metrics that can help evaluate these forecasts. However, there is a vast choice of scoring rules available for different types of data, and there has been less of a focus on facilitating their use by those without expertise in forecast evaluation. In this paper we introduce \pkg{scoringutils}, an \proglang{R} package that, given a set of forecasts and truth data, automatically chooses, applies and visualises a set of appropriate scores. It gives the user access to a wide range of scoring metrics for various types of forecasts as well as a variety of ways to visualise the evaluation. We give an overview of the evaluation process and the metrics implemented in \pkg{scoringutils} and show an example evaluation of forecasts for COVID-19 cases and deaths submitted to the European Forecast Hub between May and September 2021.
+  Evaluating forecasts is essential to understand and improve forecasting and to make forecasts useful to decision-makers. Much theoretical work has been done on the development of proper scoring rules and other scoring metrics that can help evaluate forecasts. In practice, however, conducting a forecast evaluation and comparison of different forecasters remains challenging. In this paper we introduce \pkg{scoringutils}, an \proglang{R} package that aims to greatly facilitate this process. It is especially geared towards comparing multiple forecasters, regardless of how forecasts were created, and visualising results. The package is able to handle missing forecasts and is the first \proglang{R} package to offer extensive support for forecasts represented through predictive quantiles, a format used by several COVID-19 Forecast Hubs. The paper gives a short introduction to forecast evaluation, discusses the metrics implemented in \pkg{scoringutils} and gives guidance on when they are appropriate to use, and illustrates the application of the package using example data of forecasts for COVID-19 cases and deaths submitted to the European Forecast Hub between May and September 2021.
keywords:
# at least one keyword must be supplied
- formatted: [keywords, not capitalized, "\\proglang{R}"]
- plain: [keywords, not capitalized, R]
+ formatted: [forecasting, forecast evaluation, proper scoring rules, scoring, "\\proglang{R}"]
+ plain: [forecasting, forecast evaluation, proper scoring rules, scoring, R]
preamble: >
\usepackage{amsmath}
- \shortcites{reichCollaborativeMultiyearMultimodel2019, kukkonenReviewOperationalRegionalscale2012, funkShorttermForecastsInform2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021, bracherNationalSubnationalShortterm2021}
+ \shortcites{reichCollaborativeMultiyearMultimodel2019, kukkonenReviewOperationalRegionalscale2012, funkShorttermForecastsInform2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021, bracherNationalSubnationalShortterm2021, cramerCOVID19ForecastHub2020}
\usepackage{amssymb}
\usepackage{caption}
\captionsetup[table]{skip=10pt}
@@ -97,12 +97,15 @@ output:
```{r, setup, include=FALSE}
options(prompt = 'R> ', continue = '+ ', width = 70)
+library(scoringutils)
library(knitr)
library(dplyr)
library(magrittr)
library(kableExtra)
library(formatR)
-options(width = 70)
+library(data.table)
+library(patchwork)
+
opts_chunk$set(
cache = TRUE,
warning = FALSE,
@@ -118,7 +121,7 @@ trackdown::download_file("inst/manuscript/manuscript.Rmd", gfile = "scoringutils
# Introduction
-Good forecasts are of great interest to decision makers in various fields like finance \citep{timmermannForecastingMethodsFinance2018, elliottForecastingEconomicsFinance2016}, weather predictions \citep{gneitingWeatherForecastingEnsemble2005, kukkonenReviewOperationalRegionalscale2012} or infectious disease modeling \citep{reichCollaborativeMultiyearMultimodel2019, funkShorttermForecastsInform2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021}. Throughout the COVID-19 pandemic, forecasts from different research institutions on COVID-19 targets like reported cases and deaths have been systematically collated by Forecast Hubs in the US, Germany and Poland, and Europe. An integral part of assessing and improving their usefulness is forecast evaluation. For decades, researchers have developed and refined an arsenal of techniques not only to forecast, but also to evaluate these forecasts (see e.g. \cite{bracherEvaluatingEpidemicForecasts2021}, \cite{funkAssessingPerformanceRealtime2019}, \cite{gneitingProbabilisticForecastsCalibration2007}, and \cite{gneitingStrictlyProperScoring2007}). Yet even with this rich body of research available, implementing a complete forecast evaluation is not trivial.
+Good forecasts are of great interest to decision-makers in various fields such as finance \citep{timmermannForecastingMethodsFinance2018, elliottForecastingEconomicsFinance2016}, weather prediction \citep{gneitingWeatherForecastingEnsemble2005, kukkonenReviewOperationalRegionalscale2012} or infectious disease modelling \citep{reichCollaborativeMultiyearMultimodel2019, funkShorttermForecastsInform2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021}. Throughout the COVID-19 pandemic, forecasts from different research institutions on COVID-19 targets like reported cases and deaths have been systematically collated by Forecast Hubs in the US, Germany and Poland, and Europe. An integral part of assessing and improving their usefulness is forecast evaluation. For decades, researchers have developed and refined an arsenal of techniques not only to forecast, but also to evaluate these forecasts (see e.g. \cite{goodRationalDecisions1952}, \cite{epsteinScoringSystemProbability1969, murphyNoteRankedProbability1971a, mathesonScoringRulesContinuous1976}, \cite{gneitingProbabilisticForecastsCalibration2007}, \cite{funkAssessingPerformanceRealtime2019}, \cite{gneitingStrictlyProperScoring2007}, \cite{bracherEvaluatingEpidemicForecasts2021}). Yet even with this rich body of research available, implementing a complete forecast evaluation is not trivial.
There already exist a few \proglang{R} \citep{R} packages which implement a wide variety of scoring metrics. The \pkg{scoringRules} package \citep{scoringRules}, for example, offers a very extensive collection of functions with efficient implementations of different proper scoring rules (some of which are directly reused in \pkg{scoringutils}). However, it focuses on proper scoring rules only and does not implement other evaluation metrics or provide functionality to compare forecast performance visually. It also does not provide functionality to score predictive distributions that are represented by a set of quantiles and does not handle situations with missing data. The \pkg{topmodels} package \citep{topmodels} provides users with various graphical tools to visually evaluate and compare different forecasts. However, the package is as of today not available on CRAN and the visualisations are only available for forecasts based on the model classes `lm`, `glm`, `crch` and `disttree`. The \pkg{tscount} package \citep{tscount} offers functionality to fit flexible time series models and compare the quality of the generated forecasts using different proper scoring rules. The application of these rules, however, is confined to forecasts of class `tsglm`. Other packages like \pkg{Metrics} \citep{Metrics} and \pkg{MLmetrics} \citep{MLmetrics} provide a collection of metrics geared towards machine learning problems, but also lack plotting functionality as well as support for a variety of metrics and tools commonly used to evaluate and compare probabilistic forecasts. In contrast to the above, \pkg{scoringutils} not only provides metrics to score individual forecasts, but attempts to simplify the process of comparing different forecasts against each other. It accepts arbitrary forecasts regardless of how they were created and automatically returns a variety of suitable metrics, depending on the type and format of the input forecast. It also provides functionality to facilitate comparing forecasters even when individual forecasts are missing and offers a range of plotting functions to visualise different aspects of forecast performance. The \pkg{scoringutils} package is also unique in its extensive support for forecasts in a quantile format like the one used in various COVID-19 Forecast Hubs \citep{cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021, bracherNationalSubnationalShortterm2021}.
@@ -135,7 +138,7 @@ The \pkg{scoringutils} package focuses on probabilistic forecasts, and specifica
Predictive samples offer a lot of flexibility. However, a large number of samples may be necessary to represent the predictive distribution satisfactorily, and the loss of precision that results from storing only a limited number of samples is usually especially pronounced in the tails of the predictive distribution. For that reason, quantiles or central prediction intervals are often reported instead. Recent examples of this are the COVID-19 Forecast Hubs \citep{cramerCOVID19ForecastHub2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, bracherNationalSubnationalShortterm2021, europeancovid-19forecasthubEuropeanCovid19Forecast2021}. For binary or multinomial prediction targets, common in many classification problems, a probabilistic forecast is represented by the probability that an outcome will come true. Table \ref{tab:forecast-types} summarises the different forecast types and formats.
\begin{table}[]
\centering
-\caption{Forecast and forecast target types. Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.}
+\caption{Forecast and forecast target types. Forecasts can be probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.}
\label{tab:forecast-types}
\begin{tabular}{@{}lll@{}}
\toprule
@@ -154,24 +157,33 @@ $$ F = G, $$
where $F$ and $G$ are both cumulative distribution functions. As we don't know the true data-generating distribution $G$, we cannot assess the similarity between the two distributions directly. \cite{gneitingProbabilisticForecastsCalibration2007} instead suggest to focus on two central aspects of the predictive distribution: calibration and sharpness (illustrated in Figure \ref{fig:forecast-paradigm}). Calibration refers to the statistical consistency (i.e. absence of systematic deviations) between the predictive distribution and the observations. One can distinguish several forms of calibration which are discussed in detail by \cite{gneitingProbabilisticForecastsCalibration2007}. Sharpness is a feature of the forecast only and describes how concentrated the predictive distribution is, i.e. how informative the forecasts are. The general forecasting paradigm states that a forecaster should maximise sharpness of the predictive distribution subject to calibration.
-```{r forecast-paradigm, echo = FALSE, fig.cap= "Schematic illustration of sharpness (top row) and calibration (bottom row). Sharpness is a property of the forecast (black distributions) only, while calibration is the consistency between the forecasts and the observations drawn from the true data-generating distribution (grey histograms). For illustrative purposes, the probability density function (PDF) rather than the cumulative density function (CDF) is shown.", fig.show="hold"}
+```{r forecast-paradigm, echo = FALSE, fig.cap= "Schematic illustration of sharpness (A, B) and calibration (C, D, E). Sharpness is a property of the forecast (black distributions) only, while calibration is the consistency between the forecasts and the observations drawn from the true data-generating distribution (grey histograms). For illustrative purposes, the probability density function (PDF) rather than the cumulative density function (CDF) is shown.", fig.show="hold"}
-include_graphics("plots/calibration-sharpness-illustration.png")
+include_graphics("output/calibration-sharpness-illustration.png")
```
# Scoring metrics implemented in \pkg{scoringutils} {short-title="Scoring metrics implemented in scoringutils" #metrics}
-An overview of the metrics implemented in \pkg{scoringutils} can be found in Table \ref{tab:metrics-summary}, while Table \ref{tab:score-table-detailed} in the Appendix provides mathematical definitions, as well as a more thorough explanations. Some of the metrics in \pkg{scoringutils} focus on sharpness or calibration alone, others are so-called proper scoring rules \citep{gneitingStrictlyProperScoring2007}, which combine both aspects into a single number. A scoring rule is proper if the ideal forecaster (i.e. one using the data-generating distribution) receives the lowest score in expectation. The scoring rule is called strictly proper, if its optimum is unique. This makes sure that a forecaster evaluated by a strictly proper scoring rule is always incentivised to state their best estimate. Looking at calibration and sharpness independently can be helpful to learn about specific aspects of the forecasts and improve them. Proper scoring rules are especially useful to assess and rank predictive performance of forecasters.
+An overview of the metrics implemented in \pkg{scoringutils} can be found in Table \ref{tab:metrics-summary}, while Table \ref{tab:score-table-detailed} in the Appendix provides mathematical definitions and further details. Some of the metrics in \pkg{scoringutils} focus on sharpness or calibration alone, others are so-called proper scoring rules \citep{gneitingStrictlyProperScoring2007}, which combine both aspects into a single number. A scoring rule is proper if the ideal forecaster (i.e. one using the data-generating distribution) receives the lowest score in expectation. The scoring rule is called strictly proper, if its optimum is unique. This ensures that a forecaster evaluated by a strictly proper scoring rule is always incentivised to state their best estimate. Looking at calibration and sharpness independently can be helpful to learn about specific aspects of the forecasts and improve them. Proper scoring rules are especially useful to assess and rank predictive performance of forecasters.
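
Stated formally for completeness (using the convention adopted throughout this paper that lower scores are better), a scoring rule $S(F, x)$, which assigns a score to a forecast $F$ and an observation $x$, is proper with respect to the data-generating distribution $G$ if

$$ \mathbb{E}_{x \sim G} \, S(G, x) \leq \mathbb{E}_{x \sim G} \, S(F, x) \quad \text{for all } F, $$

and strictly proper if equality holds only for $F = G$.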
-
-
-\newpage
+\newpage
```{r metrics-summary, echo = FALSE, cache=FALSE}
-# load data file from inst directory of the package
-data <- readRDS(system.file("metrics-overview/metrics-summary.Rda", package = "scoringutils"))
-
-cap <- "Summary table of scores available in \\pkg{scoringutils}. A version of this table which includes corresponding function names can be accessed in \\proglang{R} by calling \\code{scoringutils::metrics\\_summary}. Not all metrics are implemented for all types of forecasts and forecasting formats, as indicated by tickmarks, 'x', or '$\\sim$' (depends). D (discrete forecasts based on predictive samples), C (continuous, sample-based forecasts), B (binary forecasts), and Q (any forecasts in a quantile-based format) refer to different forecast formats. While the distinction is not clear-cut (e.g. binary is a special case of discrete), it is useful in the context of the package as available functions and functionality may differ. For a more detailed description of the terms used in this table see the corresponding paper sections (e.g. for 'global' and 'local' see section \\ref{localglobal}). For mathematical defintions of the metrics see Table \\ref{tab:score-table-detailed}."
+# use package data and delete unnecessary columns
+data <- metrics |>
+ select(-Name, -Functions) |>
+ unique()
+
+data <- data[, lapply(.SD, FUN = function(x) {
+ x <- gsub("+", '$\\checkmark$', x, fixed = TRUE)
+ x <- gsub("-", '$-$', x, fixed = TRUE)
+ x <- gsub("~", '$\\sim$', x, fixed = TRUE)
+ return(x)
+})]
+setnames(data, old = c("Discrete", "Continuous", "Binary", "Quantile"),
+ new = c("D", "C", "B", "Q"))
+
+cap <- "Summary table of scores available in \\pkg{scoringutils}. This table (including corresponding function names) can be accessed by calling \\code{scoringutils::metrics\\_summary} in \\proglang{R}. Not all metrics are implemented for all types of forecasts and forecasting formats, as indicated by tickmarks, 'x', or '$\\sim$' (depends). D (discrete forecasts based on predictive samples), C (continuous, sample-based forecasts), B (binary), and Q (any forecasts in a quantile-based format) refer to different forecast formats. While the distinction is not clear-cut (e.g. binary is a special case of discrete), it is useful in the context of the package as available functions and functionality may differ. For a more detailed description of the terms used in this table see the corresponding paper sections (e.g. for 'global' and 'local' see section \\ref{localglobal}). For mathematical defintions of the metrics see Table \\ref{tab:score-table-detailed}."
data[, 1:6] |>
kableExtra::kbl(format = "latex", booktabs = TRUE,
@@ -200,88 +212,80 @@ The form of calibration most commonly focused on is called probabilistic calibra
One can visualise probabilistic calibration in different ways and \pkg{scoringutils} offers three options. *Interval coverage plots* (see panel C in Figure \ref{fig:calibration-plots}) show nominal coverage of the central prediction intervals against the percentage of observed values that fall inside the corresponding prediction intervals. Ideally, the plotted values should lie on the diagonal line. A shift to the left means a forecaster is too conservative and issues a predictive distribution that is too wide and covers more of the observed values than needed. A shift to the right means a forecaster is overconfident and the forecast distribution is too narrow. Similarly, *quantile coverage plots* (panel D in Figure \ref{fig:calibration-plots}) show the quantiles of the predictive distribution against the percentage of observed values below the corresponding predictive quantiles. For quantiles below the median, a line to the right of the diagonal (predictive quantiles lower than the quantiles of the data-generating distribution) means a forecaster is too conservative, while for quantiles above the median, a line to the left of the diagonal line (predictive quantiles higher than the quantiles of the data-generating distribution) implies conservative predictions.
-
A similar way to visualise the same information is the probability integral transform (PIT) histogram \citep{dawidPresentPositionPotential1984}. The PIT is equal to $F(x_t)$, the cumulative predictive distribution evaluated at the observed value $x_t$ (see more details in Table \ref{tab:score-table-detailed}). If forecasts are probabilistically calibrated, then the transformed values will be uniformly distributed (for a proof see e.g. @angusProbabilityIntegralTransform1994). When plotting a histogram of PIT values (see panel B in Figure \ref{fig:calibration-plots}), bias usually leads to a triangular shape, a U-shaped histogram corresponds to forecasts that are under-dispersed (too sharp) and a hump-shape appears when forecasts are over-dispersed (too wide).
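
As a minimal illustration (independent of the package functions, with arbitrary distributions), the following sketch simulates PIT values for a calibrated and an overconfident forecaster:

```{r, eval = FALSE}
set.seed(1)
y <- rnorm(1000)   # observations drawn from a standard normal distribution

pit_calibrated    <- pnorm(y, mean = 0, sd = 1)    # predictive distribution matches
pit_overconfident <- pnorm(y, mean = 0, sd = 0.5)  # predictive distribution too narrow

par(mfrow = c(1, 2))
hist(pit_calibrated, main = "calibrated: roughly uniform")
hist(pit_overconfident, main = "overconfident: U-shaped")
```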
It is in principle possible to formally test probabilistic calibration, for example by employing a test on the uniformity of PIT values (e.g. the Anderson-Darling test \citep{andersonAsymptoticTheoryCertain1952}). In practice this can be difficult as forecasts and therefore also PIT values are often correlated. We therefore advise against using formal tests in most applied settings. It is also important to note that uniformity of the PIT histogram (or a diagonal on quantile and interval coverage plots) indicates probabilistic calibration, but does not guarantee that forecasts are indeed calibrated in every relevant sense. \cite{gneitingProbabilisticForecastsCalibration2007, hamillInterpretationRankHistograms2001a} provide examples with different forecasters who are clearly mis-calibrated, but have uniform PIT histograms.
-```{r calibration-plots, echo = FALSE, fig.pos = "!h", out.extra = "", fig.cap= "Top row: Standard normal forecasting distribution (black, constant across all examples) against observations sampled from different predictive distributions (grey histograms based on 1000 samples). Second row: PIT histograms based the standard normal predictive distributions and the sampled observations shown in the first row. Third row: Empirical vs. nominal coverage of the central prediction intervals for simulated observations and predictions. Areas shaded in green indicate that the forecasts are too wide (i.e. underconfident), covering more true values than they actually should, while areas in white indicate that the model generates too narrow predictions and fails to cover the desired proportion of true values with its prediction intervals. Fourth row: Quantile coverage values, with green areas indicating too wide (i.e. conservative) forecasts. Last row: Scores for the standard normal predictive distribution and the observations drawn from different data-generating distributions.", cache = FALSE}
-include_graphics("plots/calibration-diagnostic-examples.png")
-
-# readRDS("plots/calibration-diagnostic-examples.Rda") |>
-# print()
-#
-# kableExtra::kbl(format = "latex", booktabs = TRUE,
-# escape = FALSE,
-# linesep = c('\\addlinespace'))
-
+```{r calibration-plots, echo = FALSE, fig.pos = "!h", out.extra = "", fig.cap= "A: Different forecasting distributions (black) against observations sampled from a standard normal distribution (grey histograms). B: PIT histograms based on the predictive distributions and the sampled observations shown in A. C: Empirical vs. nominal coverage of the central prediction intervals for simulated observations and predictions. Areas shaded in green indicate that the forecasts are too wide (i.e. underconfident), covering more true values than they actually should, while areas in white indicate that the model generates too narrow predictions and fails to cover the desired proportion of true values with its prediction intervals. D: Quantile coverage values, with green areas indicating too wide (i.e. conservative) forecasts. E: Scores for the standard normal predictive distribution and the observations drawn from different data-generating distributions.", cache = FALSE}
+include_graphics("output/calibration-diagnostic-examples.png")
```
### Bias
Biased forecasts systematically over- or under-predict the observed values. The bias metric implemented in \pkg{scoringutils} follows \cite{funkAssessingPerformanceRealtime2019}, with slight adaptations for different forecast formats. It captures how much probability mass of the forecast was above or below the true value (mapped to values between -1 and 1, with 0 being ideal) and therefore represents a general tendency to over- or under-predict in relative terms. A value of -1 implies that the entire probability mass of the predictive distribution was below the observed value (and analogously above it for a value of 1).
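
The idea can be sketched for a continuous, sample-based forecast (a simplified illustration rather than the exact package implementation, which also covers integer and quantile-based forecasts): the bias is one minus twice the empirical predictive CDF evaluated at the observed value.

```{r, eval = FALSE}
# sketch of a sample-based bias metric for a continuous forecast
bias_sketch <- function(samples, true_value) {
  1 - 2 * mean(samples <= true_value)
}

set.seed(1)
bias_sketch(rnorm(1000, mean = 10), true_value = 12)  # close to -1: forecast too low
bias_sketch(rnorm(1000, mean = 10), true_value = 10)  # close to 0: no systematic bias
```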
-For forecasts in a quantile format, bias is also reflected in the over- and under-prediction components of the weighted interval score (a proper scoring rule explained in more detail in section \ref{wis}). These measure over- and under-prediction on an absolute scale (analogous to the absolute error of a point forecast), rather than a relative scale. However, it is not clear what the decomposition 'should' look like and a forecast can be well calibrated and still have different amounts of over- and under-prediction. High over-prediction or under-prediction values can therefore not immediately be interpreted as systematic bias.
+For forecasts in a quantile format, bias is also reflected in the over- and underprediction components of the weighted interval score (a proper scoring rule explained in more detail in section \ref{wis}). These measure over- and underprediction on an absolute scale (analogous to the absolute error of a point forecast), rather than a relative scale. However, it is not clear what the decomposition 'should' look like and a forecast can be well calibrated and still have different amounts of over- and underprediction. High overprediction or underprediction values can therefore not immediately be interpreted as systematic bias.
## Assessing sharpness
Sharpness is the ability to produce narrow forecasts. In contrast to calibration it does not depend on the actual observations and is a quality of the forecast only \citep{gneitingProbabilisticForecastsCalibration2007}. Sharpness is therefore only useful subject to calibration, as exemplified in Figure \ref{fig:forecast-paradigm}. For forecasts provided as samples from the predictive distribution, \pkg{scoringutils} calculates dispersion (the inverse of sharpness) as the normalised median absolute deviation about the median (MAD), following \cite{funkAssessingPerformanceRealtime2019} (for details see Table \ref{tab:metrics-summary}). For quantile forecasts, we instead report the dispersion component of the weighted interval score (see details in section \ref{wis} and Table \ref{tab:score-table-detailed}), which corresponds to a weighted average of the individual interval widths.
-## Proper scoring rules for sample-based forecasts (CRPS, log score and DSS)
+## Proper scoring rules
+
+### Proper scoring rules for sample-based forecasts (CRPS, log score and DSS)
For forecasts in a sample format, the \pkg{scoringutils} package implements the following proper scoring rules by providing wrappers to the corresponding functions in the \pkg{scoringRules} package: the (continuous) ranked probability score (CRPS) \citep{epsteinScoringSystemProbability1969, murphyNoteRankedProbability1971a, mathesonScoringRulesContinuous1976, gneitingStrictlyProperScoring2007}, the logarithmic score (log score) \citep{goodRationalDecisions1952}, and the Dawid-Sebastiani-score (DSS) \citep{dawidCoherentDispersionCriteria1999} (formal definitions are given in Table \ref{tab:score-table-detailed}). Compared to the implementations in the \pkg{scoringRules} package, these are exposed to the user through a slightly adapted interface. Other closed-form variants of the CRPS, log score and DSS are available in the \pkg{scoringRules} package.
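
Because these metrics wrap \pkg{scoringRules}, individual sample-based forecasts can also be scored with that package directly. The following sketch (with made-up forecasts and observations) uses its sample-based interface:

```{r, eval = FALSE}
library(scoringRules)

set.seed(123)
true_values <- c(180, 210)                            # two observed values
samples <- matrix(rnorm(2 * 1000, mean = 200, sd = 15),
                  nrow = 2)                           # 1000 predictive samples per observation

crps_sample(y = true_values, dat = samples)
dss_sample(y = true_values, dat = samples)
logs_sample(y = true_values, dat = samples)           # based on a kernel density estimate
```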
When scoring forecasts in a sample-based format, the choice is usually between the log score and the CRPS. The DSS is much less commonly used. It is easier to compute, but apart from that does not have immediate advantages over the former two. DSS, CRPS and log score differ in several important aspects: ease of estimation and speed of convergence, treatment of over- and underconfidence, sensitivity to distance \cite{winklerScoringRulesEvaluation1996}, sensitivity to outlier predictions, and sensitivity to the order of magnitude of the forecast quantity.
-### Estimation details and the number of samples required for accurate scoring
+#### Estimation details and the number of samples required for accurate scoring
The CRPS, DSS and log score are in principle all applicable to continuous as well as discrete forecasts. However, they differ in how easily and accurately scores can be computed based on predictive samples. This is an issue for the log score in particular, which equals the negative log density of the predictive distribution evaluated at the observed value and therefore requires a density estimation. The kernel density estimation used in \pkg{scoringutils} (through the function \fct{logs\_sample} from the \pkg{scoringRules} package) may be particularly inappropriate for discrete values (see also Table \ref{tab:score-table-detailed}). The log score is therefore not computed for discrete predictions in \pkg{scoringutils}. For a small number of samples, estimated scores may deviate considerably from the exact scores computed based on closed-form predictive functions. This is especially pronounced for the log score, as illustrated in Figure \ref{fig:score-convergence} (adapted from \cite{jordanEvaluatingProbabilisticForecasts2019}).
```{r score-convergence, echo = FALSE, fig.cap="Top: Estimation of scores from predictive samples (adapted from \\citep{jordanEvaluatingProbabilisticForecasts2019}). Scores were computed based on samples of differing size (from 10 to 100,000). This was repeated 500 times for each sample size. The black line is the mean score across the 500 repetitions, shaded areas represent 50\\% and 90\\% intervals, and the dashed line represents the true calculated score. Bottom left: Change in score when the uncertainty of the predictive distribution is changed. The true distribution is N(0,5) with the true standard deviation marked with a dashed line, while the standard deviation of the predictive distribution is varied along the x-axis. Log score and DSS clearly punish overconfidence much more severely than underconfidence. Bottom right: Score achieved for a standard normal predictive distribution (illustrated in grey) and different true observed values. Log score and DSS punish instances more harshly where the observed value is far away from the predictive distribution.", fig.show="hold"}
-include_graphics("plots/sample-convergence.png")
-include_graphics("plots/score-deviation-sd-mu.png")
+include_graphics("output/score-convergence-outliers.png")
```
-### Overconfidence, underconfidence and outliers
+#### Overconfidence, underconfidence and outliers
Proper scoring rules differ in how they penalise over- or underconfident forecasts. The log score and the DSS penalise overconfidence much more severely than underconfidence, while the CRPS does not distinguish between over- and underconfidence and penalises both rather leniently \citep{macheteContrastingProbabilisticScoring2012} (see Figure \ref{fig:score-convergence}B, left panel). Similarly, the CRPS is relatively lenient with regard to outlier predictions compared to the log score and the DSS (see Figure \ref{fig:score-convergence}B, right panel). The CRPS, which can be thought of as a generalisation of the absolute error to a predictive distribution, scales linearly with the distance between forecast distribution and true value. The log score, on the other hand, as the negative logarithm of the predictive density evaluated at the observed value, can quickly tend to infinity if the probability assigned to the observed outcome is close to zero. Whether harsh penalisation of overconfidence and bad predictions is desirable depends, of course, on the setting. If, for example, one wanted to forecast hospital bed capacity, it may be prudent to score forecasts using a log score as one might prefer to be too cautious rather than too confident.
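
A small simulation along the lines of Figure \ref{fig:score-convergence} (a sketch with arbitrary parameter choices, scoring sample-based forecasts directly via \pkg{scoringRules}) shows the harsher penalty that the log score and the DSS attach to overconfidence:

```{r, eval = FALSE}
library(scoringRules)

set.seed(42)
y <- rnorm(1000, mean = 0, sd = 5)   # observations, true standard deviation 5
n_samples <- 5000

mean_scores_for_sd <- function(sd) {
  dat <- matrix(rnorm(length(y) * n_samples, mean = 0, sd = sd), nrow = length(y))
  c(crps = mean(crps_sample(y = y, dat = dat)),
    log_score = mean(logs_sample(y = y, dat = dat)),
    dss = mean(dss_sample(y = y, dat = dat)))
}

mean_scores_for_sd(sd = 2)    # overconfident (too narrow)
mean_scores_for_sd(sd = 5)    # well calibrated
mean_scores_for_sd(sd = 10)   # underconfident (too wide)
```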
-### Sensitivity to distance - local vs\. global scores {#localglobal}
+#### Sensitivity to distance - local vs\. global scores {#localglobal}
The CRPS and the DSS are so-called global scoring rules, which means that the score is sensitive to the distance of the entire predictive distribution from the observed value. The log score, on the other hand, is local and the resulting score depends only on the probability density assigned to the actual outcome, ignoring the rest of the predictive distribution (see Figure \ref{fig:score-locality}).
Sensitivity to distance (taking the entire predictive distribution into account) may be a desirable property in most settings that involve decision making. A prediction which assigns high probability to results far away from the observed value is arguably less useful than a forecast which assigns a lot of probability mass to values closer to the observed outcome (the probability assigned to the actual outcome being equal for both forecasts). The log score is only implicitly sensitive to distance in expectation if we assume that values close to the observed value are actually more likely to occur. The fact that the log score only depends on the outcome that actually realised, however, may make it more appropriate for inferential purposes (see \citep{winklerScoringRulesEvaluation1996}) and it is commonly used in Bayesian statistics \citep{gelmanUnderstandingPredictiveInformation2014}.
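
A small worked example in the spirit of Figure \ref{fig:score-locality} (made-up probabilities; the ranked probability score stands in here for a global score) makes the distinction concrete:

```{r, eval = FALSE}
goals <- 0:6
observed <- 2

# two hypothetical forecasts that both assign probability 0.35 to the outcome 2
forecast_a <- c(0.10, 0.25, 0.35, 0.20, 0.07, 0.02, 0.01)  # mass close to the outcome
forecast_b <- c(0.30, 0.05, 0.35, 0.03, 0.02, 0.05, 0.20)  # mass far from the outcome

# local score: the log score only uses the probability of the observed outcome
log_score_a <- -log(forecast_a[goals == observed])
log_score_b <- -log(forecast_b[goals == observed])
c(log_score_a, log_score_b)                               # identical

# global score: the ranked probability score compares the full CDFs
rps <- function(p, obs) sum((cumsum(p) - as.numeric(goals >= obs))^2)
c(rps(forecast_a, observed), rps(forecast_b, observed))   # B is penalised more
```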
```{r score-locality, echo = FALSE, fig.cap="Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match. The true number later observed, 2, is marked with a dashed line. Both forecasters assign a probability of 0.35 to the observed outcome, 2. Forecaster A's prediction is centred around the observed value, while Forecaster B assigns significant probability to outcomes far away from the observed value. Judged by a local score like the Log Score, both forecasters receive the same score. A global score like the CRPS and the DSS penalises forecaster B more severely."}
-include_graphics("plots/score-locality.png")
+include_graphics("output/score-locality.png")
```
-### Sensitivity to the order of magnitude of the forecast quantity
+#### Sensitivity to the order of magnitude of the forecast quantity
-Average scores usually scale with the order of magnitude of the quantity we try to forecast (as the variance of the data-generating distribution usually increases with the mean). The effect is illustrated in Figure \ref{fig:score-scale}. This makes it harder to compare forecasts for very different targets, or assess average performance if the quantity of interest varies substantially over time. Average scores tend to be dominated by forecasts for targets with high absolute numbers. This is especially the case for the CRPS (as a generalisation of the absolute error), for which average scores tend to increase strongly with the order of magnitude of the quantity to forecast. The log score and the DSS tend to be more robust against this effect, depending on the exact relationship between mean and variance of the data-generating distribution.
+Average scores usually scale with the order of magnitude of the quantity we try to forecast (as the variance of the data-generating distribution usually increases with the mean). Figure \ref{fig:score-scale} illustrates the effect of an increase in scale of the forecast target on average scores. This relation makes it harder to compare forecasts for very different targets, or assess average performance if the quantity of interest varies substantially over time. Average scores tend to be dominated by forecasts for targets with high absolute numbers. This is especially the case for the CRPS (as a generalisation of the absolute error), for which average scores tend to increase strongly with the order of magnitude of the quantity to forecast. The log score and the DSS tend to be more robust against this effect, depending on the exact relationship between mean and variance of the data-generating distribution.
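
This effect can be reproduced in a few lines (a sketch with arbitrary choices, assuming Poisson-distributed targets so that the variance grows with the mean, and using \pkg{scoringRules} directly):

```{r, eval = FALSE}
library(scoringRules)

set.seed(7)
mean_crps <- sapply(c(10, 100, 1000, 10000), function(lambda) {
  y <- rpois(100, lambda)                                # observations
  dat <- matrix(rpois(100 * 2000, lambda), nrow = 100)   # 'ideal' predictive samples
  mean(crps_sample(y = y, dat = dat))
})
mean_crps   # average CRPS increases with the magnitude of the target
```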
```{r score-scale, echo = FALSE, fig.cap="Scores depend on the variability of the data and therefore implicitly on the order of magnitude of the observed value. A: Mean and standard deviation of scores from a simulation of perfect forecasts with predictive distribution $F$ equal to the true data-generating distribution $G$. The standard deviation of the two distributions was held constant at $\\sigma$, and for different mean values $\\mu$ 100 pairs of forecasts and observations were simulated. Every simulated forecast consisted of 1000 draws from the data-generating distribution $G$ and 5000 draws from the (same) predictive distribution $F$. For all three scoring rules, mean and sd of the calculated scores stay constant regardless of the mean $\\mu$ of $F$ and $G$. B: Same setup, but now the mean of $F$ and $G$ was held constant at $\\mu = 1$ and the standard deviation $\\sigma$ was varied. Average scores increase for all three scoring rules, but most strongly for the CRPS. Standard deviations of the estimated scores stay roughly constant for the DSS and log score, but also increase for the CRPS. C: Scores for forecasts of COVID-19 cases and deaths from the European Forecast Hub ensemble based on the example data provided in the package."}
-include_graphics("plots/illustration-effect-scale.png")
+include_graphics("output/illustration-effect-scale.png")
```
-## Proper scoring rule for quantile-based forecasts (WIS) {#wis}
-For forecasts in an interval or quantile format, \pkg{scoringutils} offers the weighted interval score (WIS) \citep{bracherEvaluatingEpidemicForecasts2021}. The WIS has very similar properties to the CRPS and can be thought of as a quantile-based approximation. For an increasing number of equally-spaced prediction intervals the WIS converges to the CRPS. One additional benefit of the WIS is that it can easily be decomposed into three additive components: an uncertainty penalty (called dispersion or sharpness penalty) for the width of a prediction interval and penalties for over- and under-prediction (if a value falls outside of a prediction interval). This can be very helpful in diagnosing issues with forecasts. It may even be useful to estimate quantiles from predictive samples and use the WIS in addition to the CRPS to make use of this decomposition for this purpose (with the caveat that estimating quantiles from samples may be biased if the number of samples is small).
-
+### Proper scoring rule for quantile-based forecasts (WIS) {#wis}
+For forecasts in an interval or quantile format, \pkg{scoringutils} offers the weighted interval score (WIS) \citep{bracherEvaluatingEpidemicForecasts2021}. The WIS has very similar properties to the CRPS and can be thought of as a quantile-based approximation. For an increasing number of equally-spaced prediction intervals the WIS converges to the CRPS. One additional benefit of the WIS is that it can easily be decomposed into three additive components: an uncertainty penalty (called dispersion or sharpness penalty) for the width of a prediction interval and penalties for over- and underprediction (if a value falls outside of a prediction interval).
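
The building block of the WIS is the interval score of a single central prediction interval. The sketch below (an illustration, not the package implementation) computes it together with its decomposition:

```{r, eval = FALSE}
# interval score of a single central (1 - alpha) * 100% prediction interval,
# split into dispersion, overprediction and underprediction components
interval_score_sketch <- function(lower, upper, true_value, alpha) {
  c(dispersion      = upper - lower,
    overprediction  = 2 / alpha * (lower - true_value) * (true_value < lower),
    underprediction = 2 / alpha * (true_value - upper) * (true_value > upper))
}

# 50% prediction interval [100, 140] and an observed value of 160:
# the forecast was too low, so the penalty shows up as underprediction
interval_score_sketch(lower = 100, upper = 140, true_value = 160, alpha = 0.5)
```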
-## Proper scoring rules for binary outcomes (BS and log score)
+### Proper scoring rules for binary outcomes (BS and log score)
-Binary forecasts can be scored using the Brier score (BS) or the log score. The Brier score \citep{brierVERIFICATIONFORECASTSEXPRESSED1950} corresponds to the squared difference between the given probability and the outcome (either 0 or 1) and equals the ranked probability score for the case of only two possible outcomes \citep{epsteinScoringSystemProbability1969, murphyNoteRankedProbability1971a}. The log score corresponds to the log of the probability assigned to the observed outcome. Just as with continuous forecasts, the log score penalises overconfidence much more harshly than underconfidence. The Brier score, on the other hand, does not distinguish between over- and underconfidence \citep{macheteContrastingProbabilisticScoring2012} and is therefore more forgiving of outlier predictions.
+Binary forecasts can be scored using the Brier score (BS) or the log score. The Brier score \citep{brierVERIFICATIONFORECASTSEXPRESSED1950} corresponds to the squared difference between the given probability and the outcome (either 0 or 1) and equals the ranked probability score for the case of only two possible outcomes \citep{epsteinScoringSystemProbability1969, murphyNoteRankedProbability1971a}. The log score corresponds to the log of the probability assigned to the observed outcome. Just as with continuous forecasts, the log score penalises overconfidence much more harshly than underconfidence. The Brier score, on the other hand, penalises over- and underconfidence similarly \citep{macheteContrastingProbabilisticScoring2012} and is more forgiving of outlier predictions.
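
Both scores are easy to compute by hand, which may help build intuition (an illustration only; in practice \fct{score} selects suitable metrics for binary input automatically, and sign conventions for the log score may differ):

```{r, eval = FALSE}
outcome <- 1    # observed binary outcome
p <- 0.8        # forecast probability that the outcome equals 1

brier_score <- (p - outcome)^2                        # squared difference
log_score   <- -log(ifelse(outcome == 1, p, 1 - p))   # negative log probability of the outcome

brier_score
log_score
```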
-## Pairwise comparisons
+## Pairwise comparisons {#pairwisetheory}
-In order to compare performance of different models fairly even if forecasts are missing, we can compute relative skill scores based on pairwise comparisons \citep{cramerEvaluationIndividualEnsemble2021}. Models enter a 'pairwise tournament', where all possible pairs of models are compared based on the overlapping set of available forecasts common to both models (omitting comparisons where there is no overlapping set of forecasts). For every pair, the ratio of the mean scores of both models is computed. The relative skill score of a model is then the geometric mean of all mean score ratios which involve that model. This gives us an indicator of performance relative to all other models, with the orientation depending on the score used. The method is able to account for missing forecasts, but it is nevertheless advisable to only compare forecasts that are at least 50\% complete. Furthermore, pairwise comparisons are only possible if all scores have the same sign. Then, a relative skill score smaller than 1 indicates that a model is performing better than the average model for negatively oriented scores. One can also compute a scaled relative skill score by providing baseline model. In that case those mean score rations which include the baseline are excluded when taking the geometric mean to obtain relative skill scores for individual models (which therefore differ slightly from relative scores without a baseline). All individual relative skill scores are then scaled by (i.e. divided by) the relative score of the baseline model.
+Raw scores for different forecasting models are not directly comparable in the case of missing forecasts, as forecasting targets usually differ in their characteristics (e.g. the scale of the forecast target or how difficult targets are to forecast). One way to mitigate this is to use relative skill scores based on pairwise comparisons \citep{cramerEvaluationIndividualEnsemble2021}. Models enter a 'pairwise tournament', where all possible pairs of models are compared based on the overlapping set of available forecasts common to both models (omitting comparisons where there is no overlapping set of forecasts). For every pair, the ratio of the mean scores of both models is computed. The relative skill score of a model is then the geometric mean of all mean score ratios which involve that model. This gives us an indicator of performance relative to all other models, with the orientation depending on the score used (e.g. for the proper scoring rules presented above, a relative skill score below 1 indicates better than average performance). As two models can only be fairly compared if they have overlapping forecasts, it is advisable to only compare forecasts that are at least 50\% complete. Furthermore, pairwise comparisons are only possible if all scores have the same sign. One can also compute a scaled relative skill score by providing a baseline model. All individual relative skill scores are then scaled by (i.e. divided by) the relative score of the baseline model.
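
The mechanics can be illustrated with a toy example (made-up mean scores for three models and, for simplicity, complete overlap between all pairs; \fct{pairwise\_comparison} additionally handles missing forecasts, p-values and baseline-specific details):

```{r, eval = FALSE}
# made-up mean scores (negatively oriented, e.g. mean WIS) for three models
mean_scores <- c(A = 10, B = 15, C = 30)
models <- names(mean_scores)

# mean score ratios for all pairs of models
ratios <- outer(mean_scores, mean_scores, "/")

# relative skill: geometric mean of a model's score ratios against all others
relative_skill <- sapply(models, function(m) {
  exp(mean(log(ratios[m, setdiff(models, m)])))
})
relative_skill                          # < 1: better than the average model

relative_skill / relative_skill["C"]    # scaled relative skill with C as baseline
```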
It is in principle possible to compute p-values to determine whether two models perform significantly differently. \pkg{scoringutils} allows users to compute these using either the Wilcoxon rank sum test (also known as Mann-Whitney-U test) \citep{mannTestWhetherOne1947} or a permutation test. In practice, this is complicated by the fact that both tests assume independent observations. In reality, however, forecasts by a model may be correlated across time or another dimension (e.g. if a forecaster has a bad day, they might perform badly across different targets for a given forecast date). P-values may therefore be too liberal in suggesting significant differences where there aren't any. One way to mitigate this is to aggregate observations over a category where one suspects correlation (for example averaging across all forecasts made on a given date) before making pairwise comparisons. A test that is performed on aggregate scores will likely be more conservative.
+
# Evaluating forecasts using scoringutils {#evaluation-example}
-The \pkg{scoringutils} package offers comprehensive functionality to conduct a forecast evaluation and allows users to check inputs, score forecasts and visualise results. Most functions operate on a `data.frame`-based format, but the package also provides a set of reliable lower-level scoring metrics operating on vectors/matrices, which experienced users can use in their own evaluation pipelines. These will not be discussed in this paper and refer to the vignettes and package documentation for further information. Some helper functions for data-handling, as well as example data sets and tables with additional information about available scoring metrics are also included in the package. Internally, operations are handled using \pkg{data.table} to allow for fast and efficient computation.
+The \pkg{scoringutils} package offers comprehensive functionality to conduct a forecast evaluation and allows users to check inputs, score forecasts and visualise results. Most functions operate on a `data.frame`-based format, but the package also provides a set of functions to score individual forecasts directly, operating on vectors/matrices. These will not be discussed in this paper; we refer to the vignettes and package documentation for further information. Some helper functions for data-handling, as well as example data sets and tables with additional information about available scoring metrics are also included in the package.
## Example data
@@ -300,17 +304,19 @@ example_quantile |>
## Expected input formats and data checking
-Depending on the format of the forecast, a `data.frame` (or similar) is required for most functions with column names as shown in Table \ref{tab:column-requirements}.
+Depending on the format of the forecast, a `data.frame` (or similar) is required for most functions with column names as shown in Table \ref{tab:column-requirements}. Point forecasts are defined as forecasts that follow a quantile-based format (and can be mixed with quantile-based forecasts), but which have an `NA` value in the column `"quantile"`.
+
```{r, column-requirements, echo=FALSE}
library(data.table)
requirements <-
data.table(
- "Format" = c("quantile-based", "sample-based", "binary", "pairwise-comparisons"),
+ "Format" = c("quantile-based", "sample-based", "binary", "point-forecasts", "pairwise-comparisons"),
`Required columns` = c("'true_value', 'prediction', 'quantile'",
"'true_value', 'prediction', 'sample'",
"'true_value', 'prediction'",
+ "like quantile-based, but with \n NA in the 'quantile' column",
"additionally a column 'model'"),
- "Example data" = c("example_quantile", "example_integer, \n example_continuous", "example_binary", "~")
+ "Example data" = c("example_quantile", "example_integer, \n example_continuous", "example_binary", "example_point", "~")
)
requirements |>
@@ -318,6 +324,7 @@ requirements |>
booktabs = TRUE,
linesep = c('\\addlinespace'),
caption = "Overview of the columns required for different input formats.") |>
+ kableExtra::column_spec(2, width = "6cm") |>
kableExtra::column_spec(3, width = "3.7cm") |>
kableExtra::kable_styling(latex_options = c("striped",
"repeat_header, scale_down"))
@@ -331,7 +338,7 @@ The function \fct{check\_forecasts} allows to check whether input data conforms
check_forecasts(example_quantile)
```
-The values stored in the list elements \code{target_type} and \code{prediction_type} refer to type of the forecast and the target variable. \code{forecast_unit} contains a vector of the columns which \pkg{scoringutils} thinks denote the unit of a single forecast. This means that in this instance a single forecast (with a set of 23 quantiles) can uniquely be identified by the values in the columns "location", "target\_end\_date", "target\_type", "location\_name", "forecast\_date", "model", "horizon". In this example, having "location" as well as "location\_name" included does not make a difference, as they contain duplicated information. In general, however, it is strongly advised to remove all unnecessary columns that do not help identify a single forecast. \code{unique_values} gives an overview of the number of unique values per column across the entire data set, providing a first hint as to whether the forecast set is complete. \code{warnings} shows potential warnings about the data. In this example, \pkg{scoringutils} warns that there are observed values present for which there is no corresponding forecast. These warnings can often be ignored, but may provide important information. If there are errors that cannot be ignored, a list entry \code{errors} will appear.
+The values stored in the list elements \code{target_type} and \code{prediction_type} refer to the type of the forecast and the target variable. \code{forecast_unit} contains a vector of the columns which \pkg{scoringutils} thinks denote the unit of a single forecast. This means that in this instance a single forecast (with a set of 23 quantiles) can uniquely be identified by the values in the columns "location", "target\_end\_date", "target\_type", "location\_name", "forecast\_date", "model", "horizon". In this example, having "location" as well as "location\_name" included does not make a difference, as they contain duplicated information. In general, however, it is strongly advised to remove all unnecessary columns that do not help identify a single forecast. \code{unique_values} gives an overview of the number of unique values per column across the entire data set, providing a first hint as to whether the forecast set is complete. \code{messages} or \code{warnings} show messages and warnings created when checking the data.
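
For instance, one could drop the redundant column before checking or scoring the data (a hypothetical clean-up step; here \code{location\_name} duplicates the information in \code{location}):

```{r, eval = FALSE}
example_quantile |>
  dplyr::select(-location_name) |>
  check_forecasts()
```
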
## Visualising forecast data
@@ -352,8 +359,7 @@ avail_forecasts(data = example_integer,
plot_avail_forecasts(x = "forecast_date",
show_numbers = FALSE) +
facet_wrap(~ target_type) +
- labs(y = "Model", x = "Forecast date") +
- theme(legend.position = "bottom")
+ labs(y = "Model", x = "Forecast date")
```
@@ -368,13 +374,14 @@ plot_predictions(data = example_quantile,
filter_forecasts = list("model == 'EuroCOVIDhub-ensemble'",
'forecast_date == "2021-06-28"')) +
facet_wrap(target_type ~ location, ncol = 4, scales = "free_y") +
- theme(legend.position = "bottom") +
labs(x = "Target end date")
```
## Scoring forecasts with \fct{score} {short-title="Scoring forecasts with score()" #scoring}
-The function \fct{score} evaluates predictions against observed values and automatically applies the appropriate scoring metrics depending on the input data. We can simply call:
+The function \fct{score} evaluates predictions against observed values and automatically applies the appropriate scoring metrics depending on the input data.
+
+We can simply call:
```{r}
score(example_quantile) |>
@@ -401,7 +408,7 @@ score(example_quantile) |>
While \fct{summarise\_scores} accepts arbitrary summary functions, care has to be taken when using something other than \fct{mean}, because scores may lose propriety with other summary functions. For example, the median of several individual scores (individually based on a proper scoring rule) is usually not proper. A forecaster judged by the median of several scores may be incentivised to misrepresent their true belief in a way that is not true for the mean score.
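
To illustrate the point (a sketch only; the column name \code{interval\_score} is assumed here to be the name under which \fct{score} returns the WIS), one can compare the rankings implied by mean and median scores:

```{r, eval = FALSE}
score(example_quantile) |>
  as.data.frame() |>
  dplyr::group_by(model, target_type) |>
  dplyr::summarise(mean_wis = mean(interval_score),
                   median_wis = median(interval_score))
```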
-The user must exercise additional caution and should usually avoid aggregating scores across categories which differ much in the magnitude of the quantity to forecast, as forecast errors usually increase with the order of magnitude of the forecast target. In the given example, looking at one score per model (i.e. specifying \code{summarise_by = c("model")}) is problematic, as overall aggregate scores would be dominated by case forecasts, while performance on deaths would have little influence. Similarly, aggregating over different forecast horizons is often ill-advised as the mean will be dominated by further ahead forecast horizons.
+The user must exercise additional caution and should usually avoid aggregating scores across categories which differ greatly in the magnitude of the quantity to forecast, as forecast errors usually increase with the order of magnitude of the forecast target. In the given example, looking at one score per model (i.e. calling \fct{summarise\_scores} with \code{by = c("model")}) is problematic, as overall aggregate scores would be dominated by case forecasts, while performance on deaths would have little influence. Similarly, aggregating over different forecast horizons is often ill-advised as the mean will be dominated by forecasts at longer horizons. In some instances it may be helpful to look at relative skill scores instead (see sections \ref{pairwisetheory} and \ref{pairwisecode}).
As a proxy for calibration, we are often interested in empirical coverage levels of certain central prediction intervals, for example the percentage of true values which fell inside the 50% or 90% prediction intervals. For any quantile-based forecast, we can simply add this information using the function \fct{add\_coverage}. The function has a \code{by} argument which accepts a vector of column names defining the level of grouping for which empirical coverage is computed. Note that these column names should be equal to those passed to \code{by} in subsequent calls of \fct{summarise\_scores}.
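
A sketch of this workflow (the \code{ranges} argument and its default values are assumptions about the function interface):

```{r, eval = FALSE}
score(example_quantile) |>
  add_coverage(by = c("model", "target_type"), ranges = c(50, 90)) |>
  summarise_scores(by = c("model", "target_type"))
```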
@@ -418,14 +425,11 @@ example_integer |>
glimpse()
```
-The process is designed to require conscious action by the user, because the estimation of quantiles from predictive samples may be biased if the number of available samples is not sufficiently large. The possibility to switch to a quantile-based format may also be useful for model analytics because of the decomposition of the weighted interval score which is not easily available for the CRPS.
-
-## Pairwise comparisons
+The process is designed to require conscious action by the user, because the estimation of quantiles from predictive samples may be biased if the number of available samples is not sufficiently large.
-In order to obtain a model ranking, we recommend looking at the relative skill in terms of an appropriate proper scoring rule instead of the raw score. Relative skill scores can be aggregated more easily across different forecast targets as they are less influenced by the order of magnitude of the quantity to forecast than e.g. the WIS or the CRPS.
-
-
+## Pairwise comparisons {#pairwisecode}
+In order to obtain a model ranking, we recommend looking at the relative skill in terms of an appropriate proper scoring rule instead of the raw score whenever forecasts are missing.
Relative skill scores can either be obtained by specifying \code{relative_skill = TRUE} in the function \fct{summarise\_scores}, or by calling the function \fct{pairwise\_comparison}. In both cases, pairwise comparisons are computed according to the grouping specified in the argument \code{by}: internally, the \code{data.frame} with all scores gets split into different \code{data.frame}s according to the values specified in \code{by} (excluding the column 'model'). Relative scores are then computed for every individual group separately. In the example below we specify \code{by = c("model", "target_type")}, which means that there is one relative skill score per model, calculated separately for the different forecasting targets. Using the argument \code{baseline}, we can compute relative skill with respect to a baseline model.
```{r}
@@ -435,10 +439,7 @@ score(example_quantile) |>
glimpse()
```
-When a baseline is provided, pairwise comparisons involving that baseline model are excluded from the computation of relative scores for all non-baseline models. Relative skill scores with a baseline included therefore differ slightly from relative scores without a baseline. Scaled relative skill scores are then computed by scaling (i.e. dividing) all relative scores by the relative score of the baseline model.
-
-
-Pairwise comparisons should usually be made based on unsummarised scores (\fct{pairwise\_comparison} internally summarises over samples and quantiles automatically, but nothing else), as summarising can change the set of overlapping forecasts between two models and distort relative skill scores. When computing relative skill scores using \fct{summarise_scores}, this happens by default. When using \fct{pairwise\_comparison}, the function \fct{summarise\_scores} should usually not be called beforehand. One potential exception to this is when one is interested in the p-values obtained from pairwise comparisons. As forecasts are usually highly correlated (which the calculation of p-values do not account for), it may be sensible to summaries over a few categories to reduce correlation and obtain more conservative p-values.
+Pairwise comparisons should usually be made based on unsummarised scores (the function \fct{pairwise\_comparison} internally summarises over samples and quantiles automatically, but nothing else), as summarising can change the set of overlapping forecasts between two models and distort relative skill scores. When using \fct{pairwise\_comparison}, the function \fct{summarise\_scores} should therefore usually not be called beforehand. One potential exception to this is when one is interested in the p-values obtained from pairwise comparisons. As forecasts are usually highly correlated (which the calculation of p-values does not account for), it may be sensible to summarise over a few categories (provided there are no missing values within the categories summarised over) to reduce correlation and obtain more conservative p-values.
Using the function \fct{plot\_pairwise\_comparison} we can visualise the mean score ratios between all models. The output of the following code is shown in Figure \ref{fig:pairwise-plot}.
@@ -453,34 +454,55 @@ score(example_quantile) |>
## Model diagnostics
The \pkg{scoringutils} package offers a variety of functions to aid the user in diagnosing issues with models.
-
For example, to detect systematic patterns it may be useful to visualise a single metric across several dimensions. The following produces a heatmap of bias values across different locations and forecast targets (output shown in Figure \ref{fig:score-heatmap}).
-```{r score-heatmap, fig.pos = "!h", fig.width = 8, fig.cap = "Heatmap of bias values for different models across different locations and forecast targets. Bias values are bound between -1 (under-prediction) and 1 (over-prediction) and should be 0 ideally. Red tiles indicate an upwards bias (over-prediction), while blue tiles indicate a downwards bias (under-predicction)"}
+```{r score-heatmap, fig.pos = "!h", fig.width = 8, fig.cap = "Heatmap of bias values for different models across different locations and forecast targets. Bias values are bound between -1 (underprediction) and 1 (overprediction) and should ideally be 0. Red tiles indicate an upwards bias (overprediction), while blue tiles indicate a downwards bias (underprediction)."}
score(example_continuous) |>
summarise_scores(by = c("model", "location", "target_type")) |>
plot_heatmap(x = "location", metric = "bias") +
- facet_wrap(~ target_type) +
- theme(legend.position = "bottom")
+ facet_wrap(~ target_type)
```
-For quantile-based forecasts, it is helpful to visualise the decomposition of the weighted interval score into its components: dispersion, over-prediction and under-prediction. This can be achieved using the function \fct{plot\_wis\_components}, as shown in Figure \ref{fig:wis-components}
-
-As described in section \ref{scoring} it is possible to convert from a sample-based to a quantile-based forecast format to make use of this decomposition even for sample-based forecasts.
+For quantile-based forecasts, it is helpful to visualise the decomposition of the weighted interval score into its components: dispersion, overprediction and underprediction. This can be achieved using the function \fct{plot\_wis}, as shown in Figure \ref{fig:wis-components}.
-```{r wis-components, fig.pos = "!h", fig.width = 8, fig.cap = "Decomposition of the weighted interval score (WIS) into dispersion, over-prediction and under-prediction. The WIS components measure over- and under-prediction in absolute, rather than relative terms."}
+```{r wis-components-code, eval = FALSE, fig.pos = "!h", fig.width = 8, fig.cap = "Decomposition of the weighted interval score (WIS) into dispersion, overprediction and underprediction. The WIS components measure over- and underprediction in absolute, rather than relative terms."}
score(example_quantile) |>
summarise_scores(by = c("model", "target_type")) |>
plot_wis(relative_contributions = FALSE) +
facet_wrap(~ target_type,
- scales = "free_x") +
- theme(legend.position = "bottom")
+ scales = "free_x")
```
+```{r wis-components, echo = FALSE, fig.pos = "!h", fig.width = 9.5, fig.show = "hold", fig.cap = "Decomposition of the weighted interval score (WIS) into dispersion, overprediction and underprediction. A: absolute contributions, B: contributions normalised to 1."}
+p1 <- score(example_quantile) |>
+ summarise_scores(by = c("model", "target_type")) |>
+ plot_wis(relative_contributions = FALSE) +
+ facet_wrap(~ target_type,
+ scales = "free_x") +
+ theme(panel.spacing = unit(1.5, "lines"))
+
+p2 <- score(example_quantile) |>
+ summarise_scores(by = c("model", "target_type")) |>
+ plot_wis(relative_contributions = TRUE) +
+ facet_wrap(~ target_type,
+ scales = "free_x") +
+ theme(axis.title.y = element_blank(),
+ axis.text.y = element_blank()) +
+ theme(panel.spacing = unit(1.5, "lines")) +
+ labs(x = "Normalised WIS contributions")
+
+p1 + p2 +
+ plot_annotation(tag_levels = "A") +
+ plot_layout(guides = "collect") &
+ theme(legend.position = "bottom")
+
+```
+
+
Special attention should be given to calibration. The most common way of assessing calibration (more precisely: probabilistic calibration) is the PIT histogram, as explained in section \ref{probabilistic-calibration}. Ideally, PIT values should be uniformly distributed after the transformation.
-We can compute PIT values in the following way
+We can compute PIT values in the following way:
```{r}
example_continuous |>
@@ -488,7 +510,6 @@ example_continuous |>
```
and create PIT histograms using the function \fct{plot\_pit}. The output of the following is shown in Figure \ref{fig:pit-plots}:
-
```{r pit-plots, fig.pos = "!h", fig.cap="PIT histograms of all models stratified by forecast target. Histograms should ideally be uniform. A u-shape usually indicates overconfidence (forecasts are too narrow), a hump-shaped form indicates underconfidence (forecasts are too uncertain) and a triangle-shape indicates bias.", fig.width = 8, fig.height=4}
example_continuous |>
@@ -497,9 +518,9 @@ example_continuous |>
facet_grid(target_type ~ model)
```
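For orientation, the two steps above can also be written as a single pipeline. The following chunk is a minimal, non-evaluated sketch that assumes the \fct{pit} and \fct{plot\_pit} interfaces used elsewhere in the package:
```{r pit-pipeline-sketch, eval = FALSE}
# Illustrative sketch: compute PIT values per model and target type,
# then draw the corresponding PIT histograms
example_continuous |>
  pit(by = c("model", "target_type")) |>
  plot_pit() +
  facet_grid(target_type ~ model)
```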
-We can also look at interval and quantile coverage plots (explained in more detail in section \ref{probabilistic-calibration}) using the functions \fct{plot\_interval\_coverage} and \fct{plot\_quantile\_coverage}. These plots require that the columns "range" and "quantile", respectively, be present in the scores to plot, and therefore need to be included in the `by` argument when summarising scores. The output of the following is shown in Figure \ref{fig:coverage}.
+We can also look at interval and quantile coverage plots (explained in more detail in section \ref{probabilistic-calibration}) using the functions \fct{plot\_interval\_coverage} and \fct{plot\_quantile\_coverage}. These plots require that the columns "range" and "quantile", respectively, be present in the scores to plot, and therefore need to be included in the `by` argument when summarising scores. The output of the following is shown in Figure \ref{fig:coverage}.
-```{r coverage, fig.width = 10, fig.pos = "!h", fig.show='hold', fig.cap = "Interval coverage and quantile coverage plots. Areas shaded in green indicate that the forecasts are too wide (i.e. underconfident), while areas in white indicate that the model is overconfident and generates too narrow predictions intervals."}
+```{r coverage-code, eval = FALSE, fig.width = 10, fig.pos = "!h", fig.show='hold', fig.cap = "Interval coverage and quantile coverage plots. Areas shaded in green indicate that the forecasts are too wide (i.e. underconfident), while areas in white indicate that the model is overconfident and generates too narrow prediction intervals."}
cov_scores <- score(example_quantile) |>
summarise_scores(by = c("model", "target_type", "range", "quantile"))
@@ -510,6 +531,27 @@ plot_quantile_coverage(cov_scores) +
facet_wrap(~ target_type)
```
+```{r coverage, echo = FALSE, fig.height = 6, fig.width = 10, fig.pos = "!h", fig.show='hold', fig.cap = "Interval coverage (A) and quantile coverage (B) plots. Areas shaded in green indicate that the forecasts are too wide (i.e. underconfident), while areas in white indicate that the model is overconfident and generates too narrow prediction intervals."}
+cov_scores <- score(example_quantile) |>
+ summarise_scores(by = c("model", "target_type", "range", "quantile"))
+
+p1 <- plot_interval_coverage(cov_scores) +
+ facet_wrap(~ target_type) +
+ theme(panel.spacing = unit(2, "lines"))
+
+p2 <- plot_quantile_coverage(cov_scores) +
+ facet_wrap(~ target_type) +
+ theme(panel.spacing = unit(2, "lines"))
+
+p1 / p2 +
+ plot_annotation(tag_levels = "A") +
+ plot_layout(guides = "collect") &
+ theme(legend.position = "bottom")
+
+```
+
+
+
It may sometimes be interesting to see how different scores correlate with each other. We can examine this using the function \fct{correlation}. When dealing with quantile-based forecasts, it is important to call \fct{summarise\_scores} before \fct{correlation} in order to summarise over quantiles before computing correlations. The plot resulting from the following code is shown in Figure \ref{fig:correlation-plot}.
```{r correlation-plot, fig.pos = "!h", fig.width=8, fig.height=4, fig.cap = "Correlation between different scores"}
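# A minimal sketch, assuming the correlation() and plot_correlation()
# interfaces referenced in the text: summarise over quantiles first, then
# compute and plot the correlations between scores.
score(example_quantile) |>
  summarise_scores(by = c("model", "target_type")) |>
  correlation() |>
  plot_correlation()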
@@ -531,7 +573,8 @@ Forecast evaluation is invaluable to understanding and improving current forecas
The package is still under active development, and we warmly welcome contributions to \pkg{scoringutils}. In the future, we hope to extend the number of supported scoring metrics. This includes spherical scoring rules \citep{gneitingStrictlyProperScoring2007, joseCharacterizationSphericalScoring2009, macheteContrastingProbabilisticScoring2012}, the evaluation of multinomial prediction tasks, and a broader range of scoring metrics for point forecasts. We also plan to expand the plotting functionality and hope to make templates available for automated scoring reports.
-NOT SURE WHAT ELSE TO DISCUSS?
+
+## Acknowledgments
@@ -540,8 +583,6 @@ NOT SURE WHAT ELSE TO DISCUSS?
-## Acknowledgments
-
Funding statements
@@ -565,7 +606,7 @@ Funding statements
\appendix
-# (APPENDIX) Supplementary information {-}
+# (APPENDIX) Detailed Information on Metrics {-}
```{r score-table-detailed, echo=FALSE, cache = FALSE}
@@ -583,42 +624,6 @@ data[, 1:2] |>
```
-
-
-```{r wis-components-relative, eval = FALSE, echo = FALSE, include = FALSE, fig.width = 10, fig.cap = "Components of the weighted interval score normalised to one to show relative contribution of different penalties."}
-score(example_quantile) |>
- summarise_scores(by = c("model", "target_type")) |>
- plot_wis(relative_contributions = TRUE) +
- facet_wrap(~ target_type,
- scales = "free_x") +
- coord_flip() +
- theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) +
- theme(legend.position = "bottom")
-```
-
-
-
-```{r pit-plots-quantile, eval = FALSE, echo = FALSE, include = FALSE, fig.cap="PIT histograms based on a subset of ten equally-spaced quantiles of all models stratified by forecast target. Histograms should ideally be uniform. A u-shape usually indicates overconfidence (forecasts are too narrow), a hump-shaped form indicates underconfidence (forecasts are too uncertain) and a triangle-shape indicates bias."}
-subset(example_quantile, example_quantile$quantile %in% seq(0.1, 0.9, 0.1)) |>
- pit(by = c("model", "target_type")) |>
- plot_pit() +
- facet_grid(model ~ target_type)
-```
-
-
-
-
-
-
-```{r, eval = FALSE, echo = FALSE, include = FALSE, fig.pos = "!h", fig.width=8, fig.height=4, fig.pos="!h"}
-example_quantile |>
- score() |>
- summarise_scores(by = c("model", "range", "target_type")) |>
- plot_ranges() +
- facet_wrap(~ target_type, scales = "free")
-```
-
-
\newpage
diff --git a/inst/manuscript/manuscript.aux b/inst/manuscript/manuscript.aux
index 827598ec9..b0fecdf45 100644
--- a/inst/manuscript/manuscript.aux
+++ b/inst/manuscript/manuscript.aux
@@ -20,10 +20,12 @@
\citation{timmermannForecastingMethodsFinance2018,elliottForecastingEconomicsFinance2016}
\citation{gneitingWeatherForecastingEnsemble2005,kukkonenReviewOperationalRegionalscale2012}
\citation{reichCollaborativeMultiyearMultimodel2019,funkShorttermForecastsInform2020,cramerEvaluationIndividualEnsemble2021,bracherShorttermForecastingCOVID192021,europeancovid-19forecasthubEuropeanCovid19Forecast2021}
-\citation{bracherEvaluatingEpidemicForecasts2021}
-\citation{funkAssessingPerformanceRealtime2019}
+\citation{goodRationalDecisions1952}
+\citation{epsteinScoringSystemProbability1969,murphyNoteRankedProbability1971a,mathesonScoringRulesContinuous1976}
\citation{gneitingProbabilisticForecastsCalibration2007}
+\citation{funkAssessingPerformanceRealtime2019}
\citation{gneitingStrictlyProperScoring2007}
+\citation{bracherEvaluatingEpidemicForecasts2021}
\newlabel{introduction}{{1}{1}{}{section.1}{}}
\citation{R}
\citation{scoringRules}
@@ -34,18 +36,18 @@
\citation{cramerEvaluationIndividualEnsemble2021,bracherShorttermForecastingCOVID192021,europeancovid-19forecasthubEuropeanCovid19Forecast2021,bracherNationalSubnationalShortterm2021}
\citation{europeancovid-19forecasthubEuropeanCovid19Forecast2021}
\citation{gneitingStrictlyProperScoring2007}
-\newlabel{forecast-types-and-forecast-formats}{{1.1}{2}{}{subsection.1.1}{}}
\citation{cramerCOVID19ForecastHub2020,cramerEvaluationIndividualEnsemble2021,bracherShorttermForecastingCOVID192021,bracherNationalSubnationalShortterm2021,europeancovid-19forecasthubEuropeanCovid19Forecast2021}
\citation{gneitingProbabilisticForecastsCalibration2007}
-\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Forecast and forecast target types. Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.\relax }}{3}{table.caption.1}\protected@file@percent }
-\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
-\newlabel{tab:forecast-types}{{1}{3}{Forecast and forecast target types. Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.\relax }{table.caption.1}{}}
-\newlabel{the-forecasting-paradigm}{{1.2}{3}{}{subsection.1.2}{}}
\citation{gneitingProbabilisticForecastsCalibration2007}
\citation{gneitingProbabilisticForecastsCalibration2007}
+\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Forecast and forecast target types. Forecasts can be probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.\relax }}{3}{table.caption.1}\protected@file@percent }
+\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
+\newlabel{tab:forecast-types}{{1}{3}{Forecast and forecast target types. Forecasts can be probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.\relax }{table.caption.1}{}}
+\newlabel{forecast-types-and-forecast-formats}{{1.1}{3}{}{subsection.1.1}{}}
+\newlabel{the-forecasting-paradigm}{{1.2}{3}{}{subsection.1.2}{}}
\citation{gneitingStrictlyProperScoring2007}
-\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Schematic illustration of sharpness (top row) and calibration (bottom row)}}{4}{figure.caption.2}\protected@file@percent }
-\newlabel{fig:forecast-paradigm}{{1}{4}{Schematic illustration of sharpness (top row) and calibration (bottom row)}{figure.caption.2}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Schematic illustration of sharpness (A, B) and calibration (C, D, E)}}{4}{figure.caption.2}\protected@file@percent }
+\newlabel{fig:forecast-paradigm}{{1}{4}{Schematic illustration of sharpness (A, B) and calibration (C, D, E)}{figure.caption.2}{}}
\newlabel{metrics}{{2}{4}{}{section.2}{}}
\gdef \LT@i {\LT@entry
{1}{94.51282pt}\LT@entry
@@ -54,17 +56,17 @@
{1}{19.15776pt}\LT@entry
{1}{19.15776pt}\LT@entry
{1}{276.6107pt}}
-\newlabel{tab:metrics-summary}{{2}{6}{}{table.2}{}}
-\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Summary table of scores available in {\fontseries {m}\fontseries {b}\selectfont scoringutils}. A version of this table which includes corresponding function names can be accessed in \textsf {R} by calling \bgroup \catcode `\_12\relax \catcode `\~12\relax \catcode `\$12\relax {\normalfont \ttfamily \hyphenchar \font =-1 scoringutils::metrics\_summary}\egroup . Not all metrics are implemented for all types of forecasts and forecasting formats, as indicated by tickmarks, 'x', or '$\sim $' (depends). D (discrete forecasts based on predictive samples), C (continuous, sample-based forecasts), B (binary forecasts), and Q (any forecasts in a quantile-based format) refer to different forecast formats. While the distinction is not clear-cut (e.g. binary is a special case of discrete), it is useful in the context of the package as available functions and functionality may differ. For a more detailed description of the terms used in this table see the corresponding paper sections (e.g. for 'global' and 'local' see section \ref {localglobal}). For mathematical defintions of the metrics see Table \ref {tab:score-table-detailed}.\relax }}{6}{table.2}\protected@file@percent }
+\newlabel{tab:metrics-summary}{{2}{5}{}{table.2}{}}
+\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Summary table of scores available in {\fontseries {m}\fontseries {b}\selectfont scoringutils}. This table (including corresponding function names) can be accessed by calling \bgroup \catcode `\_12\relax \catcode `\~12\relax \catcode `\$12\relax {\normalfont \ttfamily \hyphenchar \font =-1 scoringutils::metrics\_summary}\egroup in \textsf {R}. Not all metrics are implemented for all types of forecasts and forecasting formats, as indicated by tickmarks, 'x', or '$\sim $' (depends). D (discrete forecasts based on predictive samples), C (continuous, sample-based forecasts), B (binary), and Q (any forecasts in a quantile-based format) refer to different forecast formats. While the distinction is not clear-cut (e.g. binary is a special case of discrete), it is useful in the context of the package as available functions and functionality may differ. For a more detailed description of the terms used in this table see the corresponding paper sections (e.g. for 'global' and 'local' see section \ref {localglobal}). For mathematical defintions of the metrics see Table \ref {tab:score-table-detailed}.\relax }}{5}{table.2}\protected@file@percent }
\citation{gneitingProbabilisticForecastsCalibration2007}
\citation{dawidPresentPositionPotential1984}
\citation{angusProbabilityIntegralTransform1994}
\citation{andersonAsymptoticTheoryCertain1952}
\citation{gneitingProbabilisticForecastsCalibration2007,hamillInterpretationRankHistograms2001a}
-\newlabel{assessing-calibration}{{2.1}{7}{}{subsection.2.1}{}}
-\newlabel{probabilistic-calibration}{{2.1.1}{7}{}{subsubsection.2.1.1}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Top row}}{8}{figure.caption.3}\protected@file@percent }
-\newlabel{fig:calibration-plots}{{2}{8}{Top row}{figure.caption.3}{}}
+\newlabel{assessing-calibration}{{2.1}{6}{}{subsection.2.1}{}}
+\newlabel{probabilistic-calibration}{{2.1.1}{6}{}{subsubsection.2.1.1}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces A}}{7}{figure.caption.3}\protected@file@percent }
+\newlabel{fig:calibration-plots}{{2}{7}{A}{figure.caption.3}{}}
\citation{funkAssessingPerformanceRealtime2019}
\citation{gneitingProbabilisticForecastsCalibration2007}
\citation{funkAssessingPerformanceRealtime2019}
@@ -72,72 +74,77 @@
\citation{goodRationalDecisions1952}
\citation{dawidCoherentDispersionCriteria1999}
\citation{winklerScoringRulesEvaluation1996}
-\newlabel{bias}{{2.1.2}{9}{}{subsubsection.2.1.2}{}}
-\newlabel{assessing-sharpness}{{2.2}{9}{}{subsection.2.2}{}}
-\newlabel{proper-scoring-rules-for-sample-based-forecasts-crps-log-score-and-dss}{{2.3}{9}{}{subsection.2.3}{}}
\citation{jordanEvaluatingProbabilisticForecasts2019}
+\newlabel{bias}{{2.1.2}{8}{}{subsubsection.2.1.2}{}}
+\newlabel{assessing-sharpness}{{2.2}{8}{}{subsection.2.2}{}}
+\newlabel{proper-scoring-rules}{{2.3}{8}{}{subsection.2.3}{}}
+\newlabel{proper-scoring-rules-for-sample-based-forecasts-crps-log-score-and-dss}{{2.3.1}{8}{}{subsubsection.2.3.1}{}}
\citation{jordanEvaluatingProbabilisticForecasts2019}
\citation{jordanEvaluatingProbabilisticForecasts2019}
\citation{macheteContrastingProbabilisticScoring2012}
\citation{winklerScoringRulesEvaluation1996}
\citation{gelmanUnderstandingPredictiveInformation2014}
-\newlabel{estimation-details-and-the-number-of-samples-required-for-accurate-scoring}{{2.3.1}{10}{}{subsubsection.2.3.1}{}}
-\newlabel{overconfidence-underconfidence-and-outliers}{{2.3.2}{10}{}{subsubsection.2.3.2}{}}
-\newlabel{localglobal}{{2.3.3}{10}{}{subsubsection.2.3.3}{}}
-\newlabel{sensitivity-to-the-order-of-magnitude-of-the-forecast-quantity}{{2.3.4}{10}{}{subsubsection.2.3.4}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Top}}{11}{figure.caption.4}\protected@file@percent }
-\newlabel{fig:score-convergence}{{3}{11}{Top}{figure.caption.4}{}}
+\newlabel{estimation-details-and-the-number-of-samples-required-for-accurate-scoring}{{2.3.1}{9}{Estimation details and the number of samples required for accurate scoring}{section*.4}{}}
+\@writefile{toc}{\contentsline {paragraph}{Estimation details and the number of samples required for accurate scoring}{9}{section*.4}\protected@file@percent }
+\newlabel{overconfidence-underconfidence-and-outliers}{{2.3.1}{9}{Overconfidence, underconfidence and outliers}{section*.6}{}}
+\@writefile{toc}{\contentsline {paragraph}{Overconfidence, underconfidence and outliers}{9}{section*.6}\protected@file@percent }
+\newlabel{localglobal}{{2.3.1}{9}{Sensitivity to distance - local vs. global scores}{section*.7}{}}
+\@writefile{toc}{\contentsline {paragraph}{Sensitivity to distance - local vs. global scores}{9}{section*.7}\protected@file@percent }
+\newlabel{sensitivity-to-the-order-of-magnitude-of-the-forecast-quantity}{{2.3.1}{9}{Sensitivity to the order of magnitude of the forecast quantity}{section*.9}{}}
+\@writefile{toc}{\contentsline {paragraph}{Sensitivity to the order of magnitude of the forecast quantity}{9}{section*.9}\protected@file@percent }
+\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Top}}{10}{figure.caption.5}\protected@file@percent }
+\newlabel{fig:score-convergence}{{3}{10}{Top}{figure.caption.5}{}}
\citation{bracherEvaluatingEpidemicForecasts2021}
\citation{brierVERIFICATIONFORECASTSEXPRESSED1950}
\citation{epsteinScoringSystemProbability1969,murphyNoteRankedProbability1971a}
\citation{macheteContrastingProbabilisticScoring2012}
-\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match}}{12}{figure.caption.5}\protected@file@percent }
-\newlabel{fig:score-locality}{{4}{12}{Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match}{figure.caption.5}{}}
-\newlabel{wis}{{2.4}{12}{}{subsection.2.4}{}}
-\newlabel{proper-scoring-rules-for-binary-outcomes-bs-and-log-score}{{2.5}{12}{}{subsection.2.5}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Scores depend on the variability of the data and therefore implicitly on the order of magnitude of the observed value}}{13}{figure.caption.6}\protected@file@percent }
-\newlabel{fig:score-scale}{{5}{13}{Scores depend on the variability of the data and therefore implicitly on the order of magnitude of the observed value}{figure.caption.6}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match}}{11}{figure.caption.8}\protected@file@percent }
+\newlabel{fig:score-locality}{{4}{11}{Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match}{figure.caption.8}{}}
+\newlabel{wis}{{2.3.2}{11}{Sensitivity to the order of magnitude of the forecast quantity}{subsubsection.2.3.2}{}}
+\newlabel{proper-scoring-rules-for-binary-outcomes-bs-and-log-score}{{2.3.3}{11}{Sensitivity to the order of magnitude of the forecast quantity}{subsubsection.2.3.3}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Scores depend on the variability of the data and therefore implicitly on the order of magnitude of the observed value}}{12}{figure.caption.10}\protected@file@percent }
+\newlabel{fig:score-scale}{{5}{12}{Scores depend on the variability of the data and therefore implicitly on the order of magnitude of the observed value}{figure.caption.10}{}}
\citation{cramerEvaluationIndividualEnsemble2021}
\citation{mannTestWhetherOne1947}
-\newlabel{pairwise-comparisons}{{2.6}{14}{}{subsection.2.6}{}}
-\newlabel{evaluation-example}{{3}{14}{}{section.3}{}}
\citation{europeancovid-19forecasthubEuropeanCovid19Forecast2021}
-\newlabel{example-data}{{3.1}{15}{}{subsection.3.1}{}}
-\newlabel{expected-input-formats-and-data-checking}{{3.2}{15}{}{subsection.3.2}{}}
-\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Overview of the columns required for different input formats.\relax }}{16}{table.caption.7}\protected@file@percent }
-\newlabel{tab:column-requirements}{{3}{16}{Overview of the columns required for different input formats.\relax }{table.caption.7}{}}
-\newlabel{visualising-forecast-data}{{3.3}{18}{}{subsection.3.3}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Overview of the number of available forecasts}}{19}{figure.caption.8}\protected@file@percent }
-\newlabel{fig:avail-forecasts-plot}{{6}{19}{Overview of the number of available forecasts}{figure.caption.8}{}}
-\newlabel{scoring}{{3.4}{19}{}{subsection.3.4}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021}}{20}{figure.caption.9}\protected@file@percent }
-\newlabel{fig:forecast-visualisation}{{7}{20}{Short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021}{figure.caption.9}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Coloured table to visualise the computed scores}}{21}{figure.caption.10}\protected@file@percent }
-\newlabel{fig:score-table}{{8}{21}{Coloured table to visualise the computed scores}{figure.caption.10}{}}
-\newlabel{pairwise-comparisons-1}{{3.5}{22}{}{subsection.3.5}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Ratios of mean scores based on overlapping forecast sets}}{24}{figure.caption.11}\protected@file@percent }
-\newlabel{fig:pairwise-plot}{{9}{24}{Ratios of mean scores based on overlapping forecast sets}{figure.caption.11}{}}
-\newlabel{model-diagnostics}{{3.6}{24}{}{subsection.3.6}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Heatmap of bias values for different models across different locations and forecast targets}}{25}{figure.caption.12}\protected@file@percent }
-\newlabel{fig:score-heatmap}{{10}{25}{Heatmap of bias values for different models across different locations and forecast targets}{figure.caption.12}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces Decomposition of the weighted interval score (WIS) into dispersion, over-prediction and under-prediction}}{25}{figure.caption.13}\protected@file@percent }
-\newlabel{fig:wis-components}{{11}{25}{Decomposition of the weighted interval score (WIS) into dispersion, over-prediction and under-prediction}{figure.caption.13}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces PIT histograms of all models stratified by forecast target}}{27}{figure.caption.14}\protected@file@percent }
-\newlabel{fig:pit-plots}{{12}{27}{PIT histograms of all models stratified by forecast target}{figure.caption.14}{}}
-\newlabel{summary-and-discussion}{{3.7}{27}{}{subsection.3.7}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces Interval coverage and quantile coverage plots}}{28}{figure.caption.15}\protected@file@percent }
-\newlabel{fig:coverage}{{13}{28}{Interval coverage and quantile coverage plots}{figure.caption.15}{}}
-\@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Correlation between different scores}}{28}{figure.caption.16}\protected@file@percent }
-\newlabel{fig:correlation-plot}{{14}{28}{Correlation between different scores}{figure.caption.16}{}}
+\newlabel{pairwisetheory}{{2.4}{13}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.2.4}{}}
+\newlabel{evaluation-example}{{3}{13}{Sensitivity to the order of magnitude of the forecast quantity}{section.3}{}}
+\newlabel{example-data}{{3.1}{13}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.1}{}}
+\newlabel{expected-input-formats-and-data-checking}{{3.2}{14}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.2}{}}
+\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Overview of the columns required for different input formats.\relax }}{15}{table.caption.11}\protected@file@percent }
+\newlabel{tab:column-requirements}{{3}{15}{Overview of the columns required for different input formats.\relax }{table.caption.11}{}}
+\newlabel{visualising-forecast-data}{{3.3}{17}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.3}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Overview of the number of available forecasts}}{18}{figure.caption.12}\protected@file@percent }
+\newlabel{fig:avail-forecasts-plot}{{6}{18}{Overview of the number of available forecasts}{figure.caption.12}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces Short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021}}{18}{figure.caption.13}\protected@file@percent }
+\newlabel{fig:forecast-visualisation}{{7}{18}{Short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021}{figure.caption.13}{}}
+\newlabel{scoring}{{3.4}{18}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.4}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces Coloured table to visualise the computed scores}}{20}{figure.caption.14}\protected@file@percent }
+\newlabel{fig:score-table}{{8}{20}{Coloured table to visualise the computed scores}{figure.caption.14}{}}
+\newlabel{pairwisecode}{{3.5}{21}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.5}{}}
+\newlabel{model-diagnostics}{{3.6}{22}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.6}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces Ratios of mean scores based on overlapping forecast sets}}{23}{figure.caption.15}\protected@file@percent }
+\newlabel{fig:pairwise-plot}{{9}{23}{Ratios of mean scores based on overlapping forecast sets}{figure.caption.15}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {10}{\ignorespaces Heatmap of bias values for different models across different locations and forecast targets}}{24}{figure.caption.16}\protected@file@percent }
+\newlabel{fig:score-heatmap}{{10}{24}{Heatmap of bias values for different models across different locations and forecast targets}{figure.caption.16}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces Decomposition of the weighted interval score (WIS) into dispersion, overprediction and underprediction}}{24}{figure.caption.17}\protected@file@percent }
+\newlabel{fig:wis-components}{{11}{24}{Decomposition of the weighted interval score (WIS) into dispersion, overprediction and underprediction}{figure.caption.17}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12}{\ignorespaces PIT histograms of all models stratified by forecast target}}{25}{figure.caption.18}\protected@file@percent }
+\newlabel{fig:pit-plots}{{12}{25}{PIT histograms of all models stratified by forecast target}{figure.caption.18}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {13}{\ignorespaces Interval coverage (A) and quantile coverage (B) plots}}{26}{figure.caption.19}\protected@file@percent }
+\newlabel{fig:coverage}{{13}{26}{Interval coverage (A) and quantile coverage (B) plots}{figure.caption.19}{}}
\citation{gneitingStrictlyProperScoring2007,joseCharacterizationSphericalScoring2009,macheteContrastingProbabilisticScoring2012}
-\newlabel{acknowledgments}{{3.8}{29}{}{subsection.3.8}{}}
+\@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Correlation between different scores}}{27}{figure.caption.20}\protected@file@percent }
+\newlabel{fig:correlation-plot}{{14}{27}{Correlation between different scores}{figure.caption.20}{}}
+\newlabel{summary-and-discussion}{{3.7}{27}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.7}{}}
+\newlabel{acknowledgments}{{3.8}{27}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.8}{}}
\gdef \LT@ii {\LT@entry
{1}{91.49744pt}\LT@entry
{1}{346.24875pt}}
-\newlabel{appendix-supplementary-information}{{3.8}{30}{}{subsection.3.8}{}}
-\@writefile{toc}{\contentsline {section}{(APPENDIX) Supplementary information}{30}{subsection.3.8}\protected@file@percent }
-\newlabel{tab:score-table-detailed}{{4}{30}{}{table.4}{}}
-\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Detailed explanation of all the metrics,\relax }}{30}{table.4}\protected@file@percent }
+\newlabel{appendix-detailed-information-on-metrics}{{3.8}{28}{Sensitivity to the order of magnitude of the forecast quantity}{subsection.3.8}{}}
+\@writefile{toc}{\contentsline {section}{(APPENDIX) Detailed Information on Metrics}{28}{subsection.3.8}\protected@file@percent }
+\newlabel{tab:score-table-detailed}{{4}{28}{Sensitivity to the order of magnitude of the forecast quantity}{table.4}{}}
+\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Detailed explanation of all the metrics,\relax }}{28}{table.4}\protected@file@percent }
\bibdata{references.bib,scoringutils-paper.bib}
\bibcite{andersonAsymptoticTheoryCertain1952}{{1}{1952}{{Anderson and Darling}}{{}}}
\bibcite{angusProbabilityIntegralTransform1994}{{2}{1994}{{Angus}}{{}}}
@@ -176,4 +183,4 @@
\bibcite{winklerScoringRulesEvaluation1996}{{35}{1996}{{Winkler \emph {et~al.}}}{{Winkler, Mu{\~n}oz, Cervera, Bernardo, Blattenberger, Kadane, Lindley, Murphy, Oliver, and {R{\'i}os-Insua}}}}
\bibcite{MLmetrics}{{36}{2016}{{Yan}}{{}}}
\bibcite{topmodels}{{37}{2022}{{Zeileis and Lang}}{{}}}
-\gdef \@abspage@last{43}
+\gdef \@abspage@last{41}
diff --git a/inst/manuscript/manuscript.out b/inst/manuscript/manuscript.out
index 7ef6d15ba..0f95cd3bd 100644
--- a/inst/manuscript/manuscript.out
+++ b/inst/manuscript/manuscript.out
@@ -6,21 +6,18 @@
\BOOKMARK [3][-]{Subsubsection.2.1.0.Probabilistic calibration.3}{\376\377\000P\000r\000o\000b\000a\000b\000i\000l\000i\000s\000t\000i\000c\000\040\000c\000a\000l\000i\000b\000r\000a\000t\000i\000o\000n}{Subsection.2.0.Assessing calibration.2}% 6
\BOOKMARK [3][-]{Subsubsection.2.1.1.Bias.3}{\376\377\000B\000i\000a\000s}{Subsection.2.0.Assessing calibration.2}% 7
\BOOKMARK [2][-]{Subsection.2.1.Assessing sharpness.2}{\376\377\000A\000s\000s\000e\000s\000s\000i\000n\000g\000\040\000s\000h\000a\000r\000p\000n\000e\000s\000s}{Section.1.Scoring metrics implemented in scoringutils.1}% 8
-\BOOKMARK [2][-]{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000s\000\040\000f\000o\000r\000\040\000s\000a\000m\000p\000l\000e\000-\000b\000a\000s\000e\000d\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000\050\000C\000R\000P\000S\000,\000\040\000l\000o\000g\000\040\000s\000c\000o\000r\000e\000\040\000a\000n\000d\000\040\000D\000S\000S\000\051}{Section.1.Scoring metrics implemented in scoringutils.1}% 9
-\BOOKMARK [3][-]{Subsubsection.2.3.0.Estimation details and the number of samples required for accurate scoring.3}{\376\377\000E\000s\000t\000i\000m\000a\000t\000i\000o\000n\000\040\000d\000e\000t\000a\000i\000l\000s\000\040\000a\000n\000d\000\040\000t\000h\000e\000\040\000n\000u\000m\000b\000e\000r\000\040\000o\000f\000\040\000s\000a\000m\000p\000l\000e\000s\000\040\000r\000e\000q\000u\000i\000r\000e\000d\000\040\000f\000o\000r\000\040\000a\000c\000c\000u\000r\000a\000t\000e\000\040\000s\000c\000o\000r\000i\000n\000g}{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}% 10
-\BOOKMARK [3][-]{Subsubsection.2.3.1.Overconfidence, underconfidence and outliers.3}{\376\377\000O\000v\000e\000r\000c\000o\000n\000f\000i\000d\000e\000n\000c\000e\000,\000\040\000u\000n\000d\000e\000r\000c\000o\000n\000f\000i\000d\000e\000n\000c\000e\000\040\000a\000n\000d\000\040\000o\000u\000t\000l\000i\000e\000r\000s}{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}% 11
-\BOOKMARK [3][-]{Subsubsection.2.3.2.Sensitivity to distance - local vs. global scores.3}{\376\377\000S\000e\000n\000s\000i\000t\000i\000v\000i\000t\000y\000\040\000t\000o\000\040\000d\000i\000s\000t\000a\000n\000c\000e\000\040\000-\000\040\000l\000o\000c\000a\000l\000\040\000v\000s\000.\000\040\000g\000l\000o\000b\000a\000l\000\040\000s\000c\000o\000r\000e\000s}{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}% 12
-\BOOKMARK [3][-]{Subsubsection.2.3.3.Sensitivity to the order of magnitude of the forecast quantity.3}{\376\377\000S\000e\000n\000s\000i\000t\000i\000v\000i\000t\000y\000\040\000t\000o\000\040\000t\000h\000e\000\040\000o\000r\000d\000e\000r\000\040\000o\000f\000\040\000m\000a\000g\000n\000i\000t\000u\000d\000e\000\040\000o\000f\000\040\000t\000h\000e\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000\040\000q\000u\000a\000n\000t\000i\000t\000y}{Subsection.2.2.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).2}% 13
-\BOOKMARK [2][-]{Subsection.2.3.Proper scoring rule for quantile-based forecasts (WIS).2}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000\040\000f\000o\000r\000\040\000q\000u\000a\000n\000t\000i\000l\000e\000-\000b\000a\000s\000e\000d\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000\050\000W\000I\000S\000\051}{Section.1.Scoring metrics implemented in scoringutils.1}% 14
-\BOOKMARK [2][-]{Subsection.2.4.Proper scoring rules for binary outcomes (BS and log score).2}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000s\000\040\000f\000o\000r\000\040\000b\000i\000n\000a\000r\000y\000\040\000o\000u\000t\000c\000o\000m\000e\000s\000\040\000\050\000B\000S\000\040\000a\000n\000d\000\040\000l\000o\000g\000\040\000s\000c\000o\000r\000e\000\051}{Section.1.Scoring metrics implemented in scoringutils.1}% 15
-\BOOKMARK [2][-]{Subsection.2.5.Pairwise comparisons.2}{\376\377\000P\000a\000i\000r\000w\000i\000s\000e\000\040\000c\000o\000m\000p\000a\000r\000i\000s\000o\000n\000s}{Section.1.Scoring metrics implemented in scoringutils.1}% 16
-\BOOKMARK [1][-]{Section.2.Evaluating forecasts using scoringutils.1}{\376\377\000E\000v\000a\000l\000u\000a\000t\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000u\000s\000i\000n\000g\000\040\000s\000c\000o\000r\000i\000n\000g\000u\000t\000i\000l\000s}{}% 17
-\BOOKMARK [2][-]{Subsection.3.0.Example data.2}{\376\377\000E\000x\000a\000m\000p\000l\000e\000\040\000d\000a\000t\000a}{Section.2.Evaluating forecasts using scoringutils.1}% 18
-\BOOKMARK [2][-]{Subsection.3.1.Expected input formats and data checking.2}{\376\377\000E\000x\000p\000e\000c\000t\000e\000d\000\040\000i\000n\000p\000u\000t\000\040\000f\000o\000r\000m\000a\000t\000s\000\040\000a\000n\000d\000\040\000d\000a\000t\000a\000\040\000c\000h\000e\000c\000k\000i\000n\000g}{Section.2.Evaluating forecasts using scoringutils.1}% 19
-\BOOKMARK [2][-]{Subsection.3.2.Visualising forecast data.2}{\376\377\000V\000i\000s\000u\000a\000l\000i\000s\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000\040\000d\000a\000t\000a}{Section.2.Evaluating forecasts using scoringutils.1}% 20
-\BOOKMARK [2][-]{Subsection.3.3.Scoring forecasts with score().2}{\376\377\000S\000c\000o\000r\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000w\000i\000t\000h\000\040\000s\000c\000o\000r\000e\000\050\000\051}{Section.2.Evaluating forecasts using scoringutils.1}% 21
-\BOOKMARK [2][-]{Subsection.3.4.Pairwise comparisons.2}{\376\377\000P\000a\000i\000r\000w\000i\000s\000e\000\040\000c\000o\000m\000p\000a\000r\000i\000s\000o\000n\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 22
-\BOOKMARK [2][-]{Subsection.3.5.Model diagnostics.2}{\376\377\000M\000o\000d\000e\000l\000\040\000d\000i\000a\000g\000n\000o\000s\000t\000i\000c\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 23
-\BOOKMARK [2][-]{Subsection.3.6.Summary and discussion.2}{\376\377\000S\000u\000m\000m\000a\000r\000y\000\040\000a\000n\000d\000\040\000d\000i\000s\000c\000u\000s\000s\000i\000o\000n}{Section.2.Evaluating forecasts using scoringutils.1}% 24
-\BOOKMARK [2][-]{Subsection.3.7.Acknowledgments.2}{\376\377\000A\000c\000k\000n\000o\000w\000l\000e\000d\000g\000m\000e\000n\000t\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 25
-\BOOKMARK [1][-]{subsection.3.8}{\376\377\000\050\000A\000P\000P\000E\000N\000D\000I\000X\000\051\000\040\000S\000u\000p\000p\000l\000e\000m\000e\000n\000t\000a\000r\000y\000\040\000i\000n\000f\000o\000r\000m\000a\000t\000i\000o\000n}{}% 26
+\BOOKMARK [2][-]{Subsection.2.2.Proper scoring rules.2}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000s}{Section.1.Scoring metrics implemented in scoringutils.1}% 9
+\BOOKMARK [3][-]{Subsubsection.2.3.0.Proper scoring rules for sample-based forecasts (CRPS, log score and DSS).3}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000s\000\040\000f\000o\000r\000\040\000s\000a\000m\000p\000l\000e\000-\000b\000a\000s\000e\000d\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000\050\000C\000R\000P\000S\000,\000\040\000l\000o\000g\000\040\000s\000c\000o\000r\000e\000\040\000a\000n\000d\000\040\000D\000S\000S\000\051}{Subsection.2.2.Proper scoring rules.2}% 10
+\BOOKMARK [3][-]{Subsubsection.2.3.1.Proper scoring rule for quantile-based forecasts (WIS).3}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000\040\000f\000o\000r\000\040\000q\000u\000a\000n\000t\000i\000l\000e\000-\000b\000a\000s\000e\000d\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000\050\000W\000I\000S\000\051}{Subsection.2.2.Proper scoring rules.2}% 11
+\BOOKMARK [3][-]{Subsubsection.2.3.2.Proper scoring rules for binary outcomes (BS and log score).3}{\376\377\000P\000r\000o\000p\000e\000r\000\040\000s\000c\000o\000r\000i\000n\000g\000\040\000r\000u\000l\000e\000s\000\040\000f\000o\000r\000\040\000b\000i\000n\000a\000r\000y\000\040\000o\000u\000t\000c\000o\000m\000e\000s\000\040\000\050\000B\000S\000\040\000a\000n\000d\000\040\000l\000o\000g\000\040\000s\000c\000o\000r\000e\000\051}{Subsection.2.2.Proper scoring rules.2}% 12
+\BOOKMARK [2][-]{Subsection.2.3.Pairwise comparisons.2}{\376\377\000P\000a\000i\000r\000w\000i\000s\000e\000\040\000c\000o\000m\000p\000a\000r\000i\000s\000o\000n\000s}{Section.1.Scoring metrics implemented in scoringutils.1}% 13
+\BOOKMARK [1][-]{Section.2.Evaluating forecasts using scoringutils.1}{\376\377\000E\000v\000a\000l\000u\000a\000t\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000u\000s\000i\000n\000g\000\040\000s\000c\000o\000r\000i\000n\000g\000u\000t\000i\000l\000s}{}% 14
+\BOOKMARK [2][-]{Subsection.3.0.Example data.2}{\376\377\000E\000x\000a\000m\000p\000l\000e\000\040\000d\000a\000t\000a}{Section.2.Evaluating forecasts using scoringutils.1}% 15
+\BOOKMARK [2][-]{Subsection.3.1.Expected input formats and data checking.2}{\376\377\000E\000x\000p\000e\000c\000t\000e\000d\000\040\000i\000n\000p\000u\000t\000\040\000f\000o\000r\000m\000a\000t\000s\000\040\000a\000n\000d\000\040\000d\000a\000t\000a\000\040\000c\000h\000e\000c\000k\000i\000n\000g}{Section.2.Evaluating forecasts using scoringutils.1}% 16
+\BOOKMARK [2][-]{Subsection.3.2.Visualising forecast data.2}{\376\377\000V\000i\000s\000u\000a\000l\000i\000s\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000\040\000d\000a\000t\000a}{Section.2.Evaluating forecasts using scoringutils.1}% 17
+\BOOKMARK [2][-]{Subsection.3.3.Scoring forecasts with score().2}{\376\377\000S\000c\000o\000r\000i\000n\000g\000\040\000f\000o\000r\000e\000c\000a\000s\000t\000s\000\040\000w\000i\000t\000h\000\040\000s\000c\000o\000r\000e\000\050\000\051}{Section.2.Evaluating forecasts using scoringutils.1}% 18
+\BOOKMARK [2][-]{Subsection.3.4.Pairwise comparisons.2}{\376\377\000P\000a\000i\000r\000w\000i\000s\000e\000\040\000c\000o\000m\000p\000a\000r\000i\000s\000o\000n\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 19
+\BOOKMARK [2][-]{Subsection.3.5.Model diagnostics.2}{\376\377\000M\000o\000d\000e\000l\000\040\000d\000i\000a\000g\000n\000o\000s\000t\000i\000c\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 20
+\BOOKMARK [2][-]{Subsection.3.6.Summary and discussion.2}{\376\377\000S\000u\000m\000m\000a\000r\000y\000\040\000a\000n\000d\000\040\000d\000i\000s\000c\000u\000s\000s\000i\000o\000n}{Section.2.Evaluating forecasts using scoringutils.1}% 21
+\BOOKMARK [2][-]{Subsection.3.7.Acknowledgments.2}{\376\377\000A\000c\000k\000n\000o\000w\000l\000e\000d\000g\000m\000e\000n\000t\000s}{Section.2.Evaluating forecasts using scoringutils.1}% 22
+\BOOKMARK [1][-]{subsection.3.8}{\376\377\000\050\000A\000P\000P\000E\000N\000D\000I\000X\000\051\000\040\000D\000e\000t\000a\000i\000l\000e\000d\000\040\000I\000n\000f\000o\000r\000m\000a\000t\000i\000o\000n\000\040\000o\000n\000\040\000M\000e\000t\000r\000i\000c\000s}{}% 23
diff --git a/inst/manuscript/output/calibration-diagnostic-examples.Rda b/inst/manuscript/output/calibration-diagnostic-examples.Rda
new file mode 100644
index 000000000..40474fc7e
Binary files /dev/null and b/inst/manuscript/output/calibration-diagnostic-examples.Rda differ
diff --git a/inst/manuscript/output/calibration-diagnostic-examples.png b/inst/manuscript/output/calibration-diagnostic-examples.png
new file mode 100644
index 000000000..d6da9655a
Binary files /dev/null and b/inst/manuscript/output/calibration-diagnostic-examples.png differ
diff --git a/inst/manuscript/output/calibration-sharpness-illustration.png b/inst/manuscript/output/calibration-sharpness-illustration.png
new file mode 100644
index 000000000..926f89d02
Binary files /dev/null and b/inst/manuscript/output/calibration-sharpness-illustration.png differ
diff --git a/inst/manuscript/output/illustration-effect-scale.png b/inst/manuscript/output/illustration-effect-scale.png
new file mode 100644
index 000000000..08e5330ca
Binary files /dev/null and b/inst/manuscript/output/illustration-effect-scale.png differ
diff --git a/inst/manuscript/plots/relation-to-scale-example.Rda b/inst/manuscript/output/relation-to-scale-example.Rda
similarity index 100%
rename from inst/manuscript/plots/relation-to-scale-example.Rda
rename to inst/manuscript/output/relation-to-scale-example.Rda
diff --git a/inst/manuscript/output/sample-convergence.Rda b/inst/manuscript/output/sample-convergence.Rda
new file mode 100644
index 000000000..21da13ced
Binary files /dev/null and b/inst/manuscript/output/sample-convergence.Rda differ
diff --git a/inst/manuscript/output/score-convergence-outliers.png b/inst/manuscript/output/score-convergence-outliers.png
new file mode 100644
index 000000000..1df4a1a5a
Binary files /dev/null and b/inst/manuscript/output/score-convergence-outliers.png differ
diff --git a/inst/manuscript/output/score-locality.png b/inst/manuscript/output/score-locality.png
new file mode 100644
index 000000000..b9efd44c4
Binary files /dev/null and b/inst/manuscript/output/score-locality.png differ
diff --git a/inst/manuscript/plots/calibration-diagnostic-examples.Rda b/inst/manuscript/plots/calibration-diagnostic-examples.Rda
deleted file mode 100644
index 193566bb3..000000000
Binary files a/inst/manuscript/plots/calibration-diagnostic-examples.Rda and /dev/null differ
diff --git a/inst/manuscript/plots/calibration-diagnostic-examples.png b/inst/manuscript/plots/calibration-diagnostic-examples.png
deleted file mode 100644
index b3bcefd44..000000000
Binary files a/inst/manuscript/plots/calibration-diagnostic-examples.png and /dev/null differ
diff --git a/inst/manuscript/plots/calibration-illustration.png b/inst/manuscript/plots/calibration-illustration.png
deleted file mode 100644
index a3047230e..000000000
Binary files a/inst/manuscript/plots/calibration-illustration.png and /dev/null differ
diff --git a/inst/manuscript/plots/calibration-sharpness-illustration.png b/inst/manuscript/plots/calibration-sharpness-illustration.png
deleted file mode 100644
index e7909e918..000000000
Binary files a/inst/manuscript/plots/calibration-sharpness-illustration.png and /dev/null differ
diff --git a/inst/manuscript/plots/forecast-paradigm.png b/inst/manuscript/plots/forecast-paradigm.png
deleted file mode 100644
index d898710bf..000000000
Binary files a/inst/manuscript/plots/forecast-paradigm.png and /dev/null differ
diff --git a/inst/manuscript/plots/illustration-effect-scale-sim.png b/inst/manuscript/plots/illustration-effect-scale-sim.png
deleted file mode 100644
index 18461b336..000000000
Binary files a/inst/manuscript/plots/illustration-effect-scale-sim.png and /dev/null differ
diff --git a/inst/manuscript/plots/illustration-effect-scale.png b/inst/manuscript/plots/illustration-effect-scale.png
deleted file mode 100644
index fc6ce9692..000000000
Binary files a/inst/manuscript/plots/illustration-effect-scale.png and /dev/null differ
diff --git a/inst/manuscript/plots/sample-convergence.png b/inst/manuscript/plots/sample-convergence.png
deleted file mode 100644
index 64db7d163..000000000
Binary files a/inst/manuscript/plots/sample-convergence.png and /dev/null differ
diff --git a/inst/manuscript/plots/score-deviation-sd-mu.png b/inst/manuscript/plots/score-deviation-sd-mu.png
deleted file mode 100644
index 5b146bb5a..000000000
Binary files a/inst/manuscript/plots/score-deviation-sd-mu.png and /dev/null differ
diff --git a/inst/manuscript/plots/score-locality.png b/inst/manuscript/plots/score-locality.png
deleted file mode 100644
index e612188ba..000000000
Binary files a/inst/manuscript/plots/score-locality.png and /dev/null differ
diff --git a/inst/manuscript/plots/sharpness-illustration.png b/inst/manuscript/plots/sharpness-illustration.png
deleted file mode 100644
index 3f6525bfe..000000000
Binary files a/inst/manuscript/plots/sharpness-illustration.png and /dev/null differ
diff --git a/inst/metrics-overview/forecast-types.Rda b/inst/metrics-overview/forecast-types.Rda
deleted file mode 100644
index 3ff41d540..000000000
Binary files a/inst/metrics-overview/forecast-types.Rda and /dev/null differ
diff --git a/inst/metrics-overview/metrics-summary.Rda b/inst/metrics-overview/metrics-summary.Rda
deleted file mode 100644
index ef3da650f..000000000
Binary files a/inst/metrics-overview/metrics-summary.Rda and /dev/null differ
diff --git a/man/example_binary.Rd b/man/example_binary.Rd
index b76c3fa97..57d4ef9fe 100644
--- a/man/example_binary.Rd
+++ b/man/example_binary.Rd
@@ -35,5 +35,8 @@ The outcome was constructed as whether or not the actually
observed value was below or above that mean prediction.
This should not be understood as sound statistical practice, but rather
as a practical way to create an example data set.
+
+The data was created using the script create-example-data.R in the inst/
+folder (or the top level folder in a compiled package).
}
\keyword{datasets}
diff --git a/man/example_continuous.Rd b/man/example_continuous.Rd
index 3f071a724..2111773f7 100644
--- a/man/example_continuous.Rd
+++ b/man/example_continuous.Rd
@@ -29,4 +29,8 @@ example_continuous
A data set with continuous predictions for COVID-19 cases and deaths
constructed from data submitted to the European Forecast Hub.
}
+\details{
+The data was created using the script create-example-data.R in the inst/
+folder (or the top level folder in a compiled package).
+}
\keyword{datasets}
diff --git a/man/example_integer.Rd b/man/example_integer.Rd
index 98dceaf65..5bbd059fe 100644
--- a/man/example_integer.Rd
+++ b/man/example_integer.Rd
@@ -26,4 +26,8 @@ example_integer
A data set with integer predictions for COVID-19 cases and deaths
constructed from data submitted to the European Forecast Hub.
}
+\details{
+The data was created using the script create-example-data.R in the inst/
+folder (or the top level folder in a compiled package).
+}
\keyword{datasets}
diff --git a/man/example_point.Rd b/man/example_point.Rd
new file mode 100644
index 000000000..70b694d47
--- /dev/null
+++ b/man/example_point.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{example_point}
+\alias{example_point}
+\title{Point Forecast Example Data}
+\format{
+A data frame with
+\describe{
+\item{location}{the country for which a prediction was made}
+\item{target_end_date}{the date for which a prediction was made}
+\item{target_type}{the target to be predicted (cases or deaths)}
+\item{true_value}{true observed values}
+\item{location_name}{name of the country for which a prediction was made}
+\item{forecast_date}{the date on which a prediction was made}
+\item{quantile}{quantile of the corresponding prediction}
+\item{prediction}{predicted value}
+\item{model}{name of the model that generated the forecasts}
+\item{horizon}{forecast horizon in weeks}
+}
+}
+\source{
+\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
+}
+\usage{
+example_point
+}
+\description{
+A data set with predictions for COVID-19 cases and deaths submitted to the
+European Forecast Hub. This data set is like the quantile example data, only
+that the median has been replaced by a point forecast.
+}
+\details{
+The data was created using the script create-example-data.R in the inst/
+folder (or the top level folder in a compiled package).
+}
+\keyword{datasets}
diff --git a/man/example_quantile.Rd b/man/example_quantile.Rd
index f6b1e1927..22a42ffec 100644
--- a/man/example_quantile.Rd
+++ b/man/example_quantile.Rd
@@ -29,4 +29,8 @@ example_quantile
A data set with predictions for COVID-19 cases and deaths submitted to the
European Forecast Hub.
}
+\details{
+The data was created using the script create-example-data.R in the inst/
+folder (or the top level folder in a compiled package).
+}
\keyword{datasets}
diff --git a/man/example_quantile_forecasts_only.Rd b/man/example_quantile_forecasts_only.Rd
index fa6aac56a..35a244548 100644
--- a/man/example_quantile_forecasts_only.Rd
+++ b/man/example_quantile_forecasts_only.Rd
@@ -27,4 +27,8 @@ example_quantile_forecasts_only
A data set with quantile predictions for COVID-19 cases and deaths
submitted to the European Forecast Hub.
}
+\details{
+The data was created using the script create-example-data.R in the inst/
+folder (or the top level folder in a compiled package).
+}
\keyword{datasets}
diff --git a/man/example_truth_only.Rd b/man/example_truth_only.Rd
index 3a7bc3f9c..e2f911aa6 100644
--- a/man/example_truth_only.Rd
+++ b/man/example_truth_only.Rd
@@ -24,4 +24,8 @@ example_truth_only
A data set with truth values for COVID-19 cases and deaths
submitted to the European Forecast Hub.
}
+\details{
+The data was created using the script create-example-data.R in the inst/
+folder (or the top level folder in a compiled package).
+}
\keyword{datasets}
diff --git a/man/metrics_summary.Rd b/man/metrics.Rd
similarity index 68%
rename from man/metrics_summary.Rd
rename to man/metrics.Rd
index ce71a7d33..85b3633a0 100644
--- a/man/metrics_summary.Rd
+++ b/man/metrics.Rd
@@ -1,17 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
-\name{metrics_summary}
-\alias{metrics_summary}
+\name{metrics}
+\alias{metrics}
\title{Summary information for selected metrics}
\format{
An object of class \code{data.table} (inherits from \code{data.frame}) with 22 rows and 8 columns.
}
\usage{
-metrics_summary
+metrics
}
\description{
A data set with summary information on selected metrics implemented in
\pkg{scoringutils}
}
+\details{
+The data was created using the script create-metric-tables.R in the inst/
+folder (or the top level folder in a compiled package).
+}
\keyword{info}
diff --git a/man/score.Rd b/man/score.Rd
index a183e03c0..a169b5967 100644
--- a/man/score.Rd
+++ b/man/score.Rd
@@ -27,7 +27,7 @@ For a quantile-format forecast you should provide a column called \code{quantile
\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the
default), all available metrics will be computed. For a list of available
-metrics see \code{\link[=available_metrics]{available_metrics()}}}
+metrics see \code{\link[=available_metrics]{available_metrics()}}, or check the \link{metrics} data set.}
\item{...}{additional parameters passed down to lower-level functions.
For example, the following arguments can change how weighted interval
@@ -61,7 +61,8 @@ You can (and should) check your input using the function \code{\link[=check_fore
before scoring.
To obtain a quick overview of the evaluation metrics used, have a look at the
-\link{metrics_summary} data included in the package.
+\link{metrics} data included in the package. The column \code{metrics$Name} gives an
+overview of all available metric names that can be computed.
}
\examples{
library(magrittr) # pipe operator
@@ -76,6 +77,11 @@ score(example_binary)
score(example_quantile)
score(example_integer)
score(example_continuous)
+
+# score point forecasts (marked by 'NA' in the quantile column)
+score(example_point) \%>\%
+ summarise_scores(by = "model", na.rm = TRUE)
+
}
\references{
Funk S, Camacho A, Kucharski AJ, Lowe R, Eggo RM, Edmunds WJ
diff --git a/man/score_binary.Rd b/man/score_binary.Rd
index 6237468e7..33c8f714c 100644
--- a/man/score_binary.Rd
+++ b/man/score_binary.Rd
@@ -31,7 +31,7 @@ of the values in \code{forecast_unit}.}
\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the
default), all available metrics will be computed. For a list of available
-metrics see \code{\link[=available_metrics]{available_metrics()}}}
+metrics see \code{\link[=available_metrics]{available_metrics()}}, or check the \link{metrics} data set.}
}
\value{
A data.table with appropriate scores. For more information see
diff --git a/man/score_quantile.Rd b/man/score_quantile.Rd
index c8a402476..b07d52033 100644
--- a/man/score_quantile.Rd
+++ b/man/score_quantile.Rd
@@ -38,7 +38,7 @@ of the values in \code{forecast_unit}}
\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the
default), all available metrics will be computed. For a list of available
-metrics see \code{\link[=available_metrics]{available_metrics()}}}
+metrics see \code{\link[=available_metrics]{available_metrics()}}, or check the \link{metrics} data set.}
\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged
into an interval score that, in the limit, corresponds to CRPS. Alpha is the
diff --git a/man/score_sample.Rd b/man/score_sample.Rd
index 5672f8d93..e59ddc5be 100644
--- a/man/score_sample.Rd
+++ b/man/score_sample.Rd
@@ -31,7 +31,7 @@ of the values in \code{forecast_unit}}
\item{metrics}{the metrics you want to have in the output. If \code{NULL} (the
default), all available metrics will be computed. For a list of available
-metrics see \code{\link[=available_metrics]{available_metrics()}}}
+metrics see \code{\link[=available_metrics]{available_metrics()}}, or check the \link{metrics} data set.}
\item{prediction_type}{character, should be either "continuous" or "integer"}
}
diff --git a/man/scoringutils-package.Rd b/man/scoringutils-package.Rd
new file mode 100644
index 000000000..36ac569f3
--- /dev/null
+++ b/man/scoringutils-package.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/scoringutils.R
+\docType{package}
+\name{scoringutils-package}
+\alias{scoringutils}
+\alias{scoringutils-package}
+\title{scoringutils: Utilities for Scoring and Assessing Predictions}
+\description{
+Combines a collection of metrics and proper scoring rules (Tilmann Gneiting & Adrian E Raftery (2007) ) with an easy to use wrapper that can be used to automatically evaluate predictions. Apart from proper scoring rules functions are provided to assess bias, sharpness and calibration (Sebastian Funk, Anton Camacho, Adam J. Kucharski, Rachel Lowe, Rosalind M. Eggo, W. John Edmunds (2019) ) of forecasts. Several types of predictions can be evaluated: probabilistic forecasts (generally predictive samples generated by Markov Chain Monte Carlo procedures), quantile forecasts or point forecasts. Observed values and predictions can be either continuous, integer, or binary. Users can either choose to apply these rules separately in a vector / matrix format that can be flexibly used within other packages, or they can choose to do an automatic evaluation of their forecasts. This is implemented with 'data.table' and provides a consistent and very efficient framework for evaluating various types of predictions.
+}
+\seealso{
+Useful links:
+\itemize{
+ \item \url{https://epiforecasts.io/scoringutils/}
+ \item \url{https://github.com/epiforecasts/scoringutils}
+ \item Report bugs at \url{https://github.com/epiforecasts/scoringutils/issues}
+}
+
+}
+\author{
+\strong{Maintainer}: Nikos Bosse \email{nikosbosse@gmail.com} (\href{https://orcid.org/0000-0002-7750-5280}{ORCID})
+
+Authors:
+\itemize{
+ \item Sam Abbott \email{contact@samabbott.co.uk} (\href{https://orcid.org/0000-0001-8057-8037}{ORCID})
+ \item Hugo Gruson \email{hugo.gruson@lshtm.ac.uk} (\href{https://orcid.org/0000-0002-4094-1476}{ORCID})
+}
+
+Other contributors:
+\itemize{
+ \item Johannes Bracher \email{johannes.bracher@kit.edu} (\href{https://orcid.org/0000-0002-3777-1410}{ORCID}) [contributor]
+ \item Sebastian Funk \email{sebastian.funk@lshtm.ac.uk} [contributor]
+}
+
+}
+\keyword{internal}
diff --git a/man/scoringutils.Rd b/man/scoringutils.Rd
deleted file mode 100644
index b2a950592..000000000
--- a/man/scoringutils.Rd
+++ /dev/null
@@ -1,56 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/scoringutils.R
-\docType{package}
-\name{scoringutils}
-\alias{scoringutils}
-\title{scoringutils}
-\description{
-This package is designed to help with assessing the quality of predictions.
-It provides a collection of proper scoring rules and metrics as well that
-can be accessed independently or collectively through a higher-level wrapper
-function.
-
-Predictions can be either probabilistic forecasts (generally predictive
-samples generated by Markov Chain Monte Carlo procedures), quantile
-forecasts or point forecasts. The true values can be either continuous,
-integer, or binary.
-
-A collection of different metrics and scoring rules can be accessed through
-the function \code{\link[=score]{score()}}. Given a data.frame of the
-correct form the function will automatically figure out the type of
-prediction and true values and return appropriate scoring metrics.
-
-The package also has a lot of default visualisation based on the output
-created by \code{\link[=score]{score()}}.
-\itemize{
-\item \code{\link[=plot_score_table]{plot_score_table()}}
-\item \code{\link[=plot_correlation]{plot_correlation()}}
-\item \code{\link[=plot_wis]{plot_wis()}}
-\item \code{\link[=plot_ranges]{plot_ranges()}}
-\item \code{\link[=plot_heatmap]{plot_heatmap()}}
-\item \code{\link[=plot_predictions]{plot_predictions()}}
-\item \code{\link[=plot_interval_coverage]{plot_interval_coverage()}}
-\item \code{\link[=plot_quantile_coverage]{plot_quantile_coverage()}}
-}
-
-Alternatively, the following functions can be accessed directly:
-\itemize{
-\item \code{\link[=brier_score]{brier_score()}}
-\item \code{\link[=pit]{pit()}}
-\item \code{\link[=bias_sample]{bias_sample()}}
-\item \code{\link[=bias_quantile]{bias_quantile()}}
-\item \code{\link[=bias_range]{bias_range()}}
-\item \code{\link[=mad_sample]{mad_sample()}}
-\item \code{\link[=crps_sample]{crps_sample()}}
-\item \code{\link[=logs_sample]{logs_sample()}}
-\item \code{\link[=dss_sample]{dss_sample()}}
-\item \code{\link[=ae_median_sample]{ae_median_sample()}}
-}
-
-Predictions can be evaluated in a lot of different formats. If you want to
-convert from one format to the other, the following helper functions can
-do that for you:
-\itemize{
-\item \code{\link[=sample_to_quantile]{sample_to_quantile()}}
-}
-}
diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R
index 34a7ad33b..8b4dcd74c 100644
--- a/tests/testthat/test-summarise_scores.R
+++ b/tests/testthat/test-summarise_scores.R
@@ -29,3 +29,29 @@ test_that("summarise_scores() handles wrong by argument well", {
fixed = TRUE
)
})
+
+test_that("summarise_scores() works with point forecasts in a quantile format", {
+ ex <- data.table::copy(example_quantile)
+
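+  # turn the median into a point forecast by setting its quantile to NA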
+ ex[quantile == 0.5, quantile := NA_real_]
+
+ scores <- score(ex)
+
+ summarise_scores(scores, by = "model",
+ na.rm = TRUE)
+
+ summarise_scores(scores, by = "model",
+ na.rm = TRUE,
+ relative_skill = TRUE)
+
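+  # scoring only the point forecasts; relative skill on these alone is expected to warn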
+ scores <- score(example_point[is.na(quantile)])
+
+ expect_warning(
+ expect_warning(
+ summarise_scores(scores, by = "model", relative_skill = TRUE, na.rm = TRUE)
+ )
+ )
+})
+
diff --git a/vignettes/getting-started.Rmd b/vignettes/getting-started.Rmd
index 06912a9bb..786539190 100644
--- a/vignettes/getting-started.Rmd
+++ b/vignettes/getting-started.Rmd
@@ -122,6 +122,17 @@ score(example_quantile) %>%
summarise_scores()
```
+### Scoring point forecasts
+
+Point forecasts can be scored using the quantile-based format by setting the value in the `quantile` column to `NA` or `"point"`. Point forecasts can be scored alongside other quantile-based forecasts. As point forecasts will have values of `NA` for metrics designed for probabilistic forecasts, it is important to use `na.rm = TRUE` when summarising.
+
+```{r}
+score(example_point) %>%
+ summarise_scores(by = "model", na.rm = TRUE)
+```
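+If your data are already in the quantile format, you can mark the median as a point forecast by setting its quantile to `NA`. The following is a minimal sketch (not taken from the package examples) using the bundled `example_quantile` data; it assumes `data.table` is loaded for the `:=` syntax:
+
+```{r}
+library(data.table)
+
+ex <- copy(example_quantile)
+# treat the median as a point forecast by setting its quantile to NA
+ex[quantile == 0.5, quantile := NA_real_]
+
+score(ex) %>%
+  summarise_scores(by = "model", na.rm = TRUE)
+```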
+
### Adding empirical coverage
For quantile-based forecasts we are often interested in specific coverage levels, for example the percentage of true values that fell within the 50% or the 90% prediction intervals. We can add this information using the function `add_coverage()`. This function also requires a `by` argument which defines the level of grouping for which the percentage of true values covered by certain prediction intervals is computed.
@@ -311,10 +322,10 @@ example_integer %>%
## Available metrics
-An overview of available metrics can be found in the `metrics_summary` data set that is included in the package.
+An overview of available metrics can be found in the `metrics` data set that is included in the package.
```{r}
-metrics_summary
+metrics
```
## Lower level functions
diff --git a/vignettes/metric-details.Rmd b/vignettes/metric-details.Rmd
index 789299d89..bf3ff3047 100644
--- a/vignettes/metric-details.Rmd
+++ b/vignettes/metric-details.Rmd
@@ -26,10 +26,21 @@ library(data.table)
This table gives an overview of which metric can be applied to which forecast type and gives a very brief description. Note that this table only shows the metrics as implemented in `scoringutils`. For example, only scoring of sample-based discrete and continuous distributions is implemented in `scoringutils`, but closed-form solutions often exist (e.g. in the `scoringRules` package).
```{r, echo = FALSE, results = "asis"}
+data <- copy(metrics)
+setnames(data, old = c("Discrete", "Continuous", "Binary", "Quantile"),
+ new = c("D", "C", "B", "Q"))
+data[, c("Name", "Functions") := NULL]
-data <- readRDS(
- system.file("metrics-overview/metrics-summary.Rda", package = "scoringutils")
-)
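+# recode "+" / "-" in the forecast type columns to "y" / "n" for rendering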
+replace <- function(x) {
+ x <- gsub("+", "y", x, fixed = TRUE)
+ x <- gsub("-", "n", x, fixed = TRUE)
+ return(x)
+}
+
+data$D <- replace(data$D)
+data$C <- replace(data$C)
+data$B <- replace(data$B)
+data$Q <- replace(data$Q)
data[, 1:6] %>%
kbl(format = "html",