diff --git a/inst/manuscript/illustration-sharpness-calibration.R b/inst/manuscript/R/illustration-sharpness-calibration.R similarity index 100% rename from inst/manuscript/illustration-sharpness-calibration.R rename to inst/manuscript/R/illustration-sharpness-calibration.R diff --git a/inst/manuscript/R/toy-example-calibration.R b/inst/manuscript/R/toy-example-calibration.R index ee3238c27..565c6d64f 100644 --- a/inst/manuscript/R/toy-example-calibration.R +++ b/inst/manuscript/R/toy-example-calibration.R @@ -39,13 +39,13 @@ res <- summarise_scores(res, by = c("model")) # c("score", "N(0, 1)", "N(0.5, 1)", "N(0, 1.4)", "N(0, 0.7)") # ) -res[, model := factor(model, levels = c("N(0, 1)", "N(0.5, 1)", - "N(0, 1.4)", "N(0, 0.7)"))] - -setcolorder( - res, - c("score", "N(0, 1)", "N(0.5, 1)", "N(0, 1.4)", "N(0, 0.7)") -) +# res[, model := factor(model, levels = c("N(0, 1)", "N(0.5, 1)", +# "N(0, 1.4)", "N(0, 0.7)"))] +# +# setcolorder( +# res, +# c("Score", "N(0, 1)", "N(0.5, 1)", "N(0, 1.4)", "N(0, 0.7)") +# ) scores_table_plot <- plot_score_table(res, y = "model") + coord_flip() + diff --git a/inst/manuscript/R/toy-example-mean-sd-deviation.R b/inst/manuscript/R/toy-example-mean-sd-deviation.R new file mode 100644 index 000000000..9bed5125a --- /dev/null +++ b/inst/manuscript/R/toy-example-mean-sd-deviation.R @@ -0,0 +1,98 @@ +library(data.table) +library(scoringutils) +library(ggplot2) +library(scoringRules) +library(dplyr) +library(patchwork) + +# define simulation parameters +n_steps = 500 +n_rep <- 5000 +true_mean = 0 +true_sd = 5 +true_values <- rnorm(n = n_rep, mean = true_mean, sd = true_sd) +sd <- 10^(seq(-1, 1.6, length.out = n_steps)) +mu <- seq(0, 100, length.out = n_steps) + + +# look at effect of change in sd on score +res_sd <- data.table(sd = sd, + mu = true_mean, + crps = NA_real_, + dss = NA_real_, + logs = NA_real_) + +res_sd[, `:=` (crps = mean(crps(y = true_values, family = "normal", mean = mu, sd = sd)), + logs = mean(logs(y = true_values, family = "normal", mean = mu, sd = sd)), + dss = mean(dss_norm(y = true_values, mean = mu, sd = sd))), + by = "sd"] + +deviation_sd <- res_sd |> + melt(id.vars = c("sd", "mu"), value.name = "value", variable.name = "Score") |> + ggplot(aes(x = sd, y = value, color = Score)) + + geom_line() + + theme_minimal() + + geom_vline(aes(xintercept = 5), linetype = "dashed") + + coord_cartesian(ylim=c(0, 20)) + + labs(y = "Score", x = "Standard deviation of predictive distribution") + +# +# # look at effect of change in mean on score +# res_mu <- data.table(sd = true_sd, +# mu = mu, +# crps = NA_real_, +# dss = NA_real_, +# logs = NA_real_) +# +# res_mu[, `:=` (crps = mean(crps(y = true_values, family = "normal", mean = mu, sd = sd)), +# logs = mean(logs(y = true_values, family = "normal", mean = mu, sd = sd)), +# dss = mean(dss_norm(y = true_values, mean = mu, sd = sd))), +# by = "mu"] +# +# deviation_mu <- res_mu |> +# melt(id.vars = c("sd", "mu"), value.name = "value", variable.name = "Score") |> +# ggplot(aes(x = mu, y = value, color = Score)) + +# geom_line() + +# theme_minimal() + +# labs(y = "Score", x = "Mean of predictive distribution") + +# geom_vline(aes(xintercept = 0), linetype = "dashed") + +# coord_cartesian(ylim=c(0, 150)) + + + +# define simulation parameters +true_values <- seq(0, 4, length.out = 1000) +true_sd = 1 +true_mu = 0 + +# look at effect of change in sd on score +res_mu2 <- data.table(true_value = true_values, + crps = NA_real_, + dss = NA_real_, + logs = NA_real_) + +res_mu2[, `:=` (crps = crps(y = true_value, family 
= "normal", mean = true_mu, sd = true_sd) / 10, + logs = logs(y = true_value, family = "normal", mean = true_mu, sd = true_sd) / 10, + dss = dss_norm(y = true_value, mean = true_mu, sd = true_sd) / 10)] + +label_fn <- function(x) { + paste(10*x) +} + +outlier <- res_mu2 |> + melt(id.vars = c("true_value"), value.name = "value", variable.name = "Score") |> + ggplot(aes(x = true_value, y = value, color = Score)) + + geom_line() + + theme_minimal() + + labs(y = "Score", x = "Observed value") + + geom_vline(aes(xintercept = 0), linetype = "dashed") + + geom_area(stat = "function", fun = dnorm, color = "grey", fill = "grey", alpha = 0.5, xlim = c(0, 4)) + + scale_y_continuous(label = label_fn) + + +deviation_sd + outlier + + plot_layout(guides = "collect") & + theme(legend.position = "bottom") + +ggsave("inst/manuscript/plots/score-deviation-sd-mu.png", + height = 4, width = 8) diff --git a/inst/manuscript/manuscript.Rmd b/inst/manuscript/manuscript.Rmd index aa377b675..a9fdcf3c1 100644 --- a/inst/manuscript/manuscript.Rmd +++ b/inst/manuscript/manuscript.Rmd @@ -70,11 +70,17 @@ opts_chunk$set( ) ``` +```{r eval = FALSE, echo = FALSE} +trackdown::update_file("inst/manuscript/manuscript.Rmd", gfile = "scoringutils-paper", hide_code = FALSE) + +``` + + # Introduction Good forecasts are of great interest to decision makers in various fields like finance \citep{timmermannForecastingMethodsFinance2018, elliottForecastingEconomicsFinance2016}, weather predictions \citep{gneitingWeatherForecastingEnsemble2005, kukkonenReviewOperationalRegionalscale2012} or infectious disease modeling \citep{reichCollaborativeMultiyearMultimodel2019, funkShorttermForecastsInform2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, europeancovid-19forecasthubEuropeanCovid19Forecast2021}. An integral part of assessing and improving their usefulness is forecast evaluation. For decades, researchers have developed and refined an arsenal of techniques not only to forecast, but also to evaluate these forecasts (see e.g. \cite{bracherEvaluatingEpidemicForecasts2021}, \cite{funkAssessingPerformanceRealtime2019}, \cite{gneitingProbabilisticForecastsCalibration2007}, and \cite{gneitingStrictlyProperScoring2007}). Yet even with this rich body of research available, implementing a complete forecast evaluation in \proglang{R} is not trivial. -Some packages exist that bundle different scoring metrics together, but none offer the user a standalone solution to forecast evaluation. The \pkg{scoringRules} package \citep{scoringRules} offers a very extensive collection of proper scoring rules. Its exclusive focus on proper scoring rules and the fact that all functions are implemented using vectors and matrices instead of data.frames, however, make it more suitable for experienced users or as a building block in larger application. It also lacks features like pairwise comparisons between forecast models and plotting functionality that are invaluable in the evaluation process. Other packages like \pkg{Metrics} \citep{Metrics} and \pkg{MLmetrics} \citep{MLmetrics} are geared towards machine learning problems and don't implement the set of metrics and scoring rules desired for forecast evaluation. The \pkg{scoringutils} package aims to bring forth a standardised and tested toolkit. It offers convenient automated forecast evaluation in a \code{data.frame} format, but also provides experienced users with a set of reliable lower-level scoring metrics they can build upon in other applications. 
In addition, it implements a wide range of flexible plots that are able to cover most day-to-day use cases.

The remainder of this section will provide an overview of the fundamental ideas behind forecast evaluation. Section \ref{metrics} will give a detailed theoretical explanation of the evaluation metrics in \pkg{scoringutils} and when to use them. Section \ref{evaluation-example} will demonstrate how to conduct an evaluation in \pkg{scoringutils} using forecasts of COVID-19 submitted to the European Forecast Hub \citep{europeancovid-19forecasthubEuropeanCovid19Forecast2021} as a case study.
@@ -82,31 +88,12 @@ The remainder of this section will provide an overview of the fundamental ideas

## Forecast types and forecast formats

-In its most general sense, a forecast is the forecaster’s stated belief about the future \citep{gneitingStrictlyProperScoring2007} that can come in many different forms. Quantitative forecasts are either point forecasts or probabilistic in nature and can make statements about continuous, discrete or binary outcome variables. Point forecasts only give one single number for the most likely outcome. Probabilistic forecasts, in contrast, by definition provide a full predictive distribution. This makes them much more useful in any applied setting, as we learn about the forecaster's uncertainty and their belief about all aspects of the underlying data-generating distribution. Probabilistic forecasts are the focus of this paper as well as the \pkg{scoringutils} package.
+In its most general sense, a forecast is the forecaster’s stated belief about the future \citep{gneitingStrictlyProperScoring2007} that can come in many different forms. Quantitative forecasts are either point forecasts or probabilistic in nature and can make statements about continuous, discrete or binary outcome variables. Point forecasts only give one single number for the most likely outcome. Probabilistic forecasts, in contrast, by definition provide a full predictive distribution. This makes them much more useful in any applied setting, as we learn about the forecaster's uncertainty and their belief about all aspects of the underlying data-generating distribution.
Probabilistic forecasts are the focus of this paper as well as the \pkg{scoringutils} package. The predictive distribution of a probabilistic forecast can be represented in different ways with implications for the appropriate evaluation approach. For most forecasting problems, predictive distributions are not readily available in a closed form (and the \pkg{scoringutils} package therefore does not support scoring them directly). Instead, predictive distributions are often represented by a set of quantiles or predictive samples. Predictive samples require a lot of storage space and also come with a loss of precision in the tails of the predictive distribution, where many samples are needed to accurately characterise the distribution. For that reason, often quantiles or central prediction intervals are reported instead. One recent example of this are the COVID-19 Forecast Hubs \citep{cramerCOVID19ForecastHub2020, cramerEvaluationIndividualEnsemble2021, bracherShorttermForecastingCOVID192021, bracherNationalSubnationalShortterm2021, europeancovid-19forecasthubEuropeanCovid19Forecast2021}. For binary or multinomial prediction targets, common in many classification problems, a probabilistic forecasts is represented by the probability that an outcome will come true. Table \ref{tab:forecast-types} summarises the different forecast types and formats. - - - - - - - - - - - - - - - - - - - \begin{table}[] \centering -\caption{Forecast and forecast target types. Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representation of the predictive distribution are possible.} +\caption{Forecast and forecast target types. Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representations of the predictive distribution are possible.} \label{tab:forecast-types} \begin{tabular}{@{}lll@{}} \toprule @@ -117,29 +104,6 @@ Forecast type & Target type & Representation of \end{tabular} \end{table} -```{r forecast-types2, echo = FALSE} - -# data <- readRDS(system.file("metrics-overview/forecast-types.Rda", -# package = "scoringutils")) -# -# data |> -# dplyr::mutate_all(linebreak) |> -# kableExtra::kbl(format = "latex", -# valign = "middle", -# align = c("lll"), -# caption = "Forecasts can be distinguished by whether they are probabilistic in nature, or a point forecast only. Depending on the type of the target (discrete, continuous or binary) different representation of the predictive distribution are possible.", -# booktabs = TRUE, -# escape = FALSE, -# linesep = c('\\addlinespace')) |> -# # kableExtra::column_spec(1, width = "3.5cm") |> -# # kableExtra::column_spec(2, width = "2.5cm") |> -# # kableExtra::column_spec(3, width = "4.5cm") |> -# kableExtra::row_spec(0, bold = TRUE) |> -# # kableExtra::row_spec(1, hline = TRUE) |> -# kableExtra::kable_styling() |> -# kableExtra::collapse_rows() -``` - ## The Forecasting paradigm Any forecaster should aim to minimise the difference between the (cumulative) predictive distribution $F$ and the unknown true data-generating distribution $G$ \citep{gneitingProbabilisticForecastsCalibration2007}. 
For an ideal forecast, we therefore have @@ -195,12 +159,12 @@ Probabilistic calibration means that the forecast distributions are consistent w We can visualise probabilistic calibration in different ways and \pkg{scoringutils} offers three possibilities. *Interval coverage plots* (see Figure \ref{fig:calibration-plots}) show nominal coverage of the central prediction intervals against the percentage of observed values that fell inside the corresponding prediction intervals. Ideally forecasters should lie on the 45° line. A shift to the left means a forecaster is too conservative and issues a predictive distribution that is too wide and covers more of the observed values than needed. A shift to the right means a forecaster is too confident and the forecast distribution is too narrow. Similarly, *quantile coverage plots* (Figure \ref{fig:calibration-plots}) show the quantiles of the predictive distribution against the percentage of observed values below the corresponding predictive quantiles. For quantiles below the median, a line to the right of the 45° degree line (predictive quantiles lower than the quantiles of the data-generating distribution) means a forecaster is too conservative, while for quantiles above the median, a line to the left of the 45° degree line (predictive quantiles higher than the quantiles of the data-generating distribution) implies conservative predictions. -A similar way to visualise the same information is the probability integral transform (PIT) histogram \citep{dawidPresentPositionPotential1984}. The PIT is equal to $F(x_t)$, the predictive distribution evaluated at the observed value (see more details in Table \ref{tab:score-table-detailed}). If forecasts are probabilistically calibrated, then the transformed values will follow a uniform distribution. When plotting a histogram of PIT values, bias ususally leads to a triangular shape, a U-shaped histogram corresponds to forecasts that are under-dispersed (too sharp) and a hump-shape appears when forecasts are over-dispersed (too wide). +A similar way to visualise the same information is the probability integral transform (PIT) histogram \citep{dawidPresentPositionPotential1984}. The PIT is equal to $F(x_t)$, the predictive distribution evaluated at the observed value (see more details in Table \ref{tab:score-table-detailed}). If forecasts are probabilistically calibrated, then the transformed values will follow a uniform distribution. When plotting a histogram of PIT values, bias usually leads to a triangular shape, a U-shaped histogram corresponds to forecasts that are under-dispersed (too sharp) and a hump-shape appears when forecasts are over-dispersed (too wide). -It is in principle possible formally test probabilistic calibration, for example by employing a test on the uniformity of PIT values (e.g. the Anderson-Darling test \citep{andersonAsymptoticTheoryCertain1952}). In practice this is often difficult as forecasts and therefore also PIT values are often correlated. We therefore advise against using formal tests in most applied settings. It is also important to note that uniformity of the PIT histogram (or a 45° degree line in quantile and interval coverage plots) indicates probabilistic calibration, but does not guarantee that forecasts are indeed calibrated in every relevant sense. \cite{gneitingProbabilisticForecastsCalibration2007, hamillInterpretationRankHistograms2001a} provide examples with different forecasters who are clearly mis-calibrated, but have uniform PIT histograms. 
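To make the PIT idea concrete, the following is a minimal base-R sketch added here for illustration only (it is not part of the manuscript's own code and is not evaluated). It uses closed-form normal predictive distributions rather than predictive samples; within \pkg{scoringutils} itself, PIT histograms are produced with \fct{pit} and \fct{plot\_pit}, as shown later in the manuscript.

```{r pit-sketch, eval = FALSE}
# Minimal sketch of the PIT: observations come from a standard normal
# data-generating distribution and are evaluated under three predictive
# distributions. The PIT value is the predictive CDF at the observation;
# for a calibrated forecast the PIT values are approximately uniform.
set.seed(1)
observed <- rnorm(1000, mean = 0, sd = 1)

pit_calibrated <- pnorm(observed, mean = 0, sd = 1)    # roughly uniform
pit_too_narrow <- pnorm(observed, mean = 0, sd = 0.5)  # U-shaped (under-dispersed)
pit_too_wide   <- pnorm(observed, mean = 0, sd = 2)    # hump-shaped (over-dispersed)

par(mfrow = c(1, 3))
hist(pit_calibrated, main = "calibrated", xlab = "PIT")
hist(pit_too_narrow, main = "under-dispersed", xlab = "PIT")
hist(pit_too_wide, main = "over-dispersed", xlab = "PIT")
```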
+It is in principle possible to formally test probabilistic calibration, for example by employing a test on the uniformity of PIT values (e.g. the Anderson-Darling test \citep{andersonAsymptoticTheoryCertain1952}). In practice this is often difficult as forecasts and therefore also PIT values are often correlated. We therefore advise against using formal tests in most applied settings. It is also important to note that uniformity of the PIT histogram (or a 45° degree line in quantile and interval coverage plots) indicates probabilistic calibration, but does not guarantee that forecasts are indeed calibrated in every relevant sense. \cite{gneitingProbabilisticForecastsCalibration2007, hamillInterpretationRankHistograms2001a} provide examples with different forecasters who are clearly mis-calibrated, but have uniform PIT histograms.

-```{r calibration-plots, echo = FALSE, fig.pos = "!h", out.extra = "", fig.cap= "Calibration plots for different forecast distributions.", cache = FALSE}
+```{r calibration-plots, echo = FALSE, fig.pos = "!h", out.extra = "", fig.cap= "Top row: Standard normal true data-generating distribution (black) against predictions sampled from different predictive distributions. Second row: PIT histograms based on 1000 observations drawn from the standard normal true data-generating distribution and 1000 samples from different predictive distributions for each simulated observation. Third row: Empirical vs. nominal coverage of the central prediction intervals for simulated observations and predictions. Areas shaded in green indicate that the model is underconfident, covering more true values than it actually should, while areas in white indicate that the model fails to cover the desired proportion of true values with its prediction intervals. Fourth row: Quantile coverage values. Again, green areas indicate conservative forecasts. Last row: Scores for the different predictive distributions. Note that the DSS and Log score punish overconfidence more harshly than underconfidence, while the CRPS does not.", cache = FALSE}
include_graphics("plots/calibration-diagnostic-examples.png")

# readRDS("plots/calibration-diagnostic-examples.Rda") |>
@@ -221,41 +185,41 @@ For forecasts in a quantile-based format, there exists a second possibility to a
## Metrics to assess sharpness

-Sharpness is the ability to produce narrow forecasts. In contrast to calibation it does not depend on the actual observations and is a quality of the forecast only \ref{gneitingProbabilisticForecastsCalibration2007}. Sharpness is therefore only useful subject to calibration, as exemplified in Figure \ref{fig:calibration-example}. We may be willing to trade off a little calibration for a lot more sharpness, but usually not much.
-For sample-based forecasts, \pkg{scoringutils} calculates dispersion (the inverse of sharpness) as the normalised median absolute deviation about the median (MAD), following \cite{funkAssessingPerformanceRealtime2019} (for details see Table \ref{tab:score-overview}). For quantile forecasts, we instead report the dispersion component of the weighted interval score (see details in section \ref{wis} and \ref{score-table-detailed}) which corresponds to a weighted average of the individual interval widths.
+Sharpness is the ability to produce narrow forecasts. In contrast to calibration it does not depend on the actual observations and is a quality of the forecast only \citep{gneitingProbabilisticForecastsCalibration2007}.
Sharpness is therefore only useful subject to calibration, as exemplified in Figure \ref{fig:calibration-plots}. We may be willing to trade off a little calibration for a lot more sharpness, but usually not much. +For sample-based forecasts, \pkg{scoringutils} calculates dispersion (the inverse of sharpness) as the normalised median absolute deviation about the median (MAD), following \cite{funkAssessingPerformanceRealtime2019} (for details see Table \ref{tab:metrics-summary}). For quantile forecasts, we instead report the dispersion component of the weighted interval score (see details in section \ref{wis} and \ref{tab:score-table-detailed}) which corresponds to a weighted average of the individual interval widths. ## Proper scoring rules for sample-based forecasts (CRPS, logS and DSS) For sample-based forecasts, the \pkg{scoringutils} package provides the following proper scoring rules: the (continuous) ranked probability score (CRPS) \citep{epsteinScoringSystemProbability1969, mathesonScoringRulesContinuous1976, gneitingStrictlyProperScoring2007}, the log score (logS) \citep{goodRationalDecisions1952}, and the Dawid-Sebastiani-score (DSS) \citep{dawidCoherentDispersionCriteria1999} (formal definitions are given in Table \ref{tab:score-table-detailed}). The corresponding functions are imported from the \pkg{scoringRules} package and exposed to the user through a slightly adapted interface. Other, non-sample-based variants of the CRPS, logS and DSS are available in the \pkg{scoringRules} package. -When scoring forecasts in a sample-based format, the choice is usually between the logS and the CRPS. The DSS is much less commonly used. It is easier to compute, but apart from that does not have immediate advantages over the former two. DSS, CRPS and logS differ in four important aspects: Ease of estimation and speed of convergence, sensitivity to distance \cite{winklerScoringRulesEvaluation1996}, sensitivity to outlier predictions, and sensitivity to the order of magnitude of the forecast quantity. - - +When scoring forecasts in a sample-based format, the choice is usually between the logS and the CRPS. The DSS is much less commonly used. It is easier to compute, but apart from that does not have immediate advantages over the former two. DSS, CRPS and logS differ in several important aspects: Ease of estimation and speed of convergence, treatment of over- and underconfidence, sensitivity to distance \cite{winklerScoringRulesEvaluation1996}, sensitivity to outlier predictions, and sensitivity to the order of magnitude of the forecast quantity. ### Estimation details and speed of convergence -All three scores are in principle applicable to continuous as well as discrete forecasts. However, the fact that scores in practice need to be estimated based on predictive samples (rather than a closed-form predictive distribution) introduces additional complications. For the CRPS and the DSS, approximate scores can directly be computed based on the empirical distribution function, i.e. based on predictive samples directly. For the log score (which equals the negative log density of the predictive distribution evaluated at the observed value), a density estimation is required first. The \pkg{scoringRules} implementation of the log score used within \pkg{scoringutils} performs a kernel density estimation that may in particular be inappropriate for discrete values (see also Table \ref{tab:table-summary-scores}). The logS is therefore not computed for discrete predictions in \pkg{scoringutils}. 
The sample-based version of the log score also tends to converge more slowly to the exact score than the CRPS or the DSS, as illustrated in Figure \ref{fig:score-convergence}. +All three scores are in principle applicable to continuous as well as discrete forecasts. However, the fact that scores in practice need to be estimated based on predictive samples (rather than a closed-form predictive distribution) introduces additional complications. For the CRPS and the DSS, approximate scores can directly be computed based on the empirical distribution function, i.e. based on predictive samples directly. For the log score (which equals the negative log density of the predictive distribution evaluated at the observed value), a density estimation is required first. The \pkg{scoringRules} implementation of the log score used within \pkg{scoringutils} performs a kernel density estimation that may in particular be inappropriate for discrete values (see also Table \ref{tab:score-table-detailed}). The logS is therefore not computed for discrete predictions in \pkg{scoringutils}. The sample-based version of the log score also tends to converge more slowly to the exact score than the CRPS or the DSS, as illustrated in Figure \ref{fig:score-convergence} (adapted from \citep{jordanEvaluatingProbabilisticForecasts2019}). -```{r score-convergence, echo = FALSE, fig.cap="Convergence of different scores. MORE DETAIL"} +```{r score-convergence, echo = FALSE, fig.cap="Top: Convergence of different scores. Scores were computed based on samples of differing size (from 10 to 100,000). This was repeated 500 times for each sample size. The black line is the mean score across the 500 repetitions, shaded areas represent 50\\% and 90\\% intervals, and the dashed line represents the true calculated score. Bottom left: Change in score when the uncertainty of the predictive distribution is changed. The true distribution is N(0,5) with the true standard deviation marked with a dashed line, while the standard deviation of the predictive distribution is varied along the x-axis. Log score and DSS clearly punish overconfidence much more severely than underconfidence. Bottom right: Score achieved for a standard normal predictive distribution (illustrated in grey) and different true observed values. Log score and DSS punish instances more harshly where the observed value is far away from the predictive distribution.", fig.show="hold"} include_graphics("plots/sample-convergence.png") +include_graphics("plots/score-deviation-sd-mu.png") ``` +### Overconfidence, underconfidence and outliers. + +All proper scoring rules award the best score to the forecaster who reports a predictive distribution equal to the data-generating distribution. However, they differ in how they punish over- or underconfident forecasts. The log score and the DSS punish overconfidence much more severely than underconfidence, while the CRPS does not distinguish between over- and underconfidence and punishes both rather mildly. This is illustrated in Figure \ref{fig:score-convergence}. Similarly, the CRPS is rather lenient with regards to outlier predictions compared to the log score and the DSS (see Figure \ref{fig:score-convergence}). The CRPS, which can be thought of as a generalisation of the absolute error to a predictive distribution, scales linearly with the distance between forecast distribution and true value. 
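As a small numerical companion to Figure \ref{fig:score-convergence} (this sketch is added for illustration, is not evaluated here, and its exact values depend on the seed), the \pkg{scoringRules} functions \fct{crps\_sample}, \fct{logs\_sample} and \fct{dss\_sample} can be used to score sample-based forecasts that are ideal, overconfident or underconfident:

```{r overconfidence-sketch, eval = FALSE}
library(scoringRules)

# 1000 observations from N(0, 1) and three sample-based forecasters:
# ideal (sd = 1), overconfident (sd = 0.5) and underconfident (sd = 2)
set.seed(1)
n_obs <- 1000
n_samples <- 2000
observed <- rnorm(n_obs)

sample_matrix <- function(sd) {
  matrix(rnorm(n_obs * n_samples, mean = 0, sd = sd), nrow = n_obs)
}
forecasts <- list(ideal = sample_matrix(1),
                  overconfident = sample_matrix(0.5),
                  underconfident = sample_matrix(2))

# mean scores: the log score and the DSS penalise the overconfident forecast
# much more heavily, while the CRPS penalises both deviations only mildly
sapply(forecasts, function(dat) {
  c(crps = mean(crps_sample(y = observed, dat = dat)),
    logs = mean(logs_sample(y = observed, dat = dat)),
    dss = mean(dss_sample(y = observed, dat = dat)))
})
```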
The log score, however, as the negative log of the predictive density evaluated at the observed value, can quickly go to infinity if the probability assigned to the observed outcome is close to zero. Whether or not harsh punishment of overconfidence and bad predictions is desirable depends of course on the setting. If, for example, we wanted to forecast hospital bed capacity, it may be prudent to score forecasts using a log score, because we would likely rather be too cautious than too confident.
+
+
### Sensitivity to distance - local vs\. global scores

-The CRPS (and to a lesser degree the DSS, which relies on the first two moments of a distribution) is a so-called global scoring rule, which means that the score is sensitive to the distance of the entire predictive distribution from the observed value. The log score, on the other hand is local and not sensitive to the distance between the observed value and the predictive distribution. The resulting score does not depend on the overall distance between the observed value and the distribution, but only on the probability density assigned to the actual outcome. Figure \ref{fig:score-locality} illustrates the case of two hypothetical forecasters predicting the number of goals in a football match. Both assign the same probability to the outcome that was later observed, but forecaster B gives significant probability to outcomes far away from the observed outcome. The crps as a global scoring rule takes the entire predictive distribution into account and will assign a worse score to forecaster B. The log score is only determined by the probability assigned to the observed outcome and therefore both forecasters receive the same score. Sensitivity to distance (taking the entire predictive distribution into account) may be an advantage in most settings that involve decision making. Forecaster A's prediction that assigns high probability to results far away from the observed value is arguably less useful than B's forecast that assigns higher probability to values closer to it (the probability assigned to the actual outcome being equal for both forecasts). The log score is only implicitly sensitive to distance if we assume that values close to the observed value are actually more likely to occur. The logS may, however, be more appropriate for inferential purposes (see \cite{winklerScoringRulesEvaluation1996}) and is commonly used in Bayesian statistics \citep{gelmanUnderstandingPredictiveInformation2014}.
+The CRPS (and to a lesser degree the DSS, which relies on the first two moments of a distribution) is a so-called global scoring rule, which means that the score is sensitive to the distance of the entire predictive distribution from the observed value. The log score, on the other hand, is local and not sensitive to the distance between the observed value and the predictive distribution. The resulting score does not depend on the overall distance between the observed value and the distribution, but only on the probability density assigned to the actual outcome. Figure \ref{fig:score-locality} illustrates the case of two hypothetical forecasters predicting the number of goals in a football match. Both assign the same probability to the outcome that was later observed, but forecaster B gives significant probability to outcomes far away from the observed outcome. The CRPS as a global scoring rule takes the entire predictive distribution into account and will assign a worse score to forecaster B.
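A small numerical sketch of this point, using made-up probabilities rather than the exact values behind Figure \ref{fig:score-locality} (not evaluated here): both forecasters assign 0.35 to the observed outcome, so their log scores are identical, while the CRPS of the integer-valued forecast can be computed from the predictive CDF and is worse for forecaster B.

```{r locality-sketch, eval = FALSE}
# Hypothetical probabilities for the number of goals (0 to 4); the observed
# outcome is 2 and both forecasters assign it a probability of 0.35
goals <- 0:4
observed <- 2
p_A <- c(0.10, 0.25, 0.35, 0.25, 0.05)  # centred around the observed value
p_B <- c(0.30, 0.05, 0.35, 0.05, 0.25)  # substantial mass far away from it

# log score (negative log probability of the observed outcome): identical
-log(p_A[goals == observed])
-log(p_B[goals == observed])

# CRPS for an integer-valued forecast: sum of squared differences between
# the predictive CDF and the step function of the observation
crps_discrete <- function(p) sum((cumsum(p) - as.numeric(goals >= observed))^2)
crps_discrete(p_A)  # approx. 0.23
crps_discrete(p_B)  # approx. 0.37, punished for the mass far away
```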
The log score is only determined by the probability assigned to the observed outcome and therefore both forecasters receive the same score. Sensitivity to distance (taking the entire predictive distribution into account) may be an advantage in most settings that involve decision making. Forecaster B's prediction that assigns high probability to results far away from the observed value is arguably less useful than A's forecast that assigns higher probability to values closer to it (the probability assigned to the actual outcome being equal for both forecasts). The log score is only implicitly sensitive to distance if we assume that values close to the observed value are actually more likely to occur. The logS may, however, be more appropriate for inferential purposes (see \citep{winklerScoringRulesEvaluation1996}) and is commonly used in Bayesian statistics \citep{gelmanUnderstandingPredictiveInformation2014}.

-```{r score-locality, echo = FALSE, fig.cap="Probabilities assigned by to hypothetical forecasters, A and B, to the possible number of goals in a football match. The true number later observed, 2, is marked with a dashed line. Both forecasters assign a probability of 0.35 to the observed outcome, 2. Forecaster A's prediction is centred around the observed value, while Forecaster B assigns significant probability to outcomes far away from the observed value. Judged by a local score like the Log Score, both forecasters receive the same score. A global score like the CRPS punishes forecaster B more severly."}
+```{r score-locality, echo = FALSE, fig.cap="Probabilities assigned by two hypothetical forecasters, A and B, to the possible number of goals in a football match. The true number later observed, 2, is marked with a dashed line. Both forecasters assign a probability of 0.35 to the observed outcome, 2. Forecaster A's prediction is centred around the observed value, while Forecaster B assigns significant probability to outcomes far away from the observed value. Judged by a local score like the Log Score, both forecasters receive the same score. A global score like the CRPS punishes forecaster B more severely."}
include_graphics("plots/score-locality.png")
```

-### Robustness to outlier predictions
-A second important difference is how forecasts are treated that deviate strongly from the observed outcome. The CRPS can be thought of as a generalisation of the absolute error to a predictive distribution. It therefore scales linearly with the distance between forecast distribution and true value. The log score, however, as the log of the predictive density evaluated at the observed value, can quickly go to negative infinity if the probability assigned to the observed outcome is close to zero. The CRPS is therefore considered more stable than the log score. The behaviour of the DSS is in between the two. Whether or not harsh punishment of bad predictions is desirable or not depends of course on the setting. \cite{bracherEvaluatingEpidemicForecasts2021} exemplify that in practice there may indeed be substantial differences between how the CRPS and log score judge the same forecast.
-
-
### Sensitivity to the order of magnitude of the forecast quantity

-As the CRPS is a generalisation of the absolute value, overall scores usually depend on the order of magnitude of the quantity we try to forecast (as the variance of the data-generating distribution usually increases with the mean).
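The following is a small illustrative simulation of this scale dependence (added for illustration, not evaluated here, with made-up Poisson targets), assuming the closed-form Poisson scores \fct{crps\_pois} and \fct{logs\_pois} from \pkg{scoringRules}; an ideal forecaster is scored on a small and on a large count target.

```{r magnitude-sketch, eval = FALSE}
library(scoringRules)

# ideal forecasts for two targets that differ only in their order of magnitude
set.seed(1)
small_target <- rpois(1000, lambda = 10)
large_target <- rpois(1000, lambda = 10000)

# the mean CRPS grows roughly with the standard deviation of the target,
# so averages across both targets are dominated by the large target ...
mean(crps_pois(small_target, lambda = 10))
mean(crps_pois(large_target, lambda = 10000))

# ... while the log score grows much more slowly with the scale of the target
mean(logs_pois(small_target, lambda = 10))
mean(logs_pois(large_target, lambda = 10000))
```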
This makes it harder to compare forecasts for very different targets, or assess average performance if the quantity of interest varies substantially over time. Average scores tend to be dominated by forecasts for targets with high absolute numbers. This may be desirable,if we care most about forecasts in situations where numbers are high, but usually it is not. LogS and DSS are more robust against this effect. Another way to address this issue by using pairwise comparisons will be introduced later.
+As the CRPS is a generalisation of the absolute error, overall scores usually depend on the order of magnitude of the quantity we try to forecast (as the variance of the data-generating distribution usually increases with the mean). This makes it harder to compare forecasts for very different targets, or assess average performance if the quantity of interest varies substantially over time. Average scores tend to be dominated by forecasts for targets with high absolute numbers. This may be desirable if we care most about forecasts in situations where numbers are high, but usually it is not. The log score is often slightly more robust against this effect, depending on the exact relationship between mean and variance of the data-generating distribution. Another way to address this issue, using pairwise comparisons, will be introduced later.

## Proper scoring rule for quantile-based forecasts (WIS) {#wis}
@@ -264,17 +228,15 @@ For forecasts in an interval or quantile format, \pkg{scoringutils} offers the w

## Proper scoring rules for binary outcomes (BS and logS)

-Binary forecasts can be scored using the Brier score (BS) \citep{brierVERIFICATIONFORECASTSEXPRESSED1950}, which corresponds to the squared difference between the given probability and the outcome (either 0 or 1).
-
-ADD EXPLANATION OF LOG SCORE FOR BINARY OUTCOMES AND WHEN TO USE WHICH
+Binary forecasts can be scored using the Brier score (BS) or the log score. The Brier score \citep{brierVERIFICATIONFORECASTSEXPRESSED1950} corresponds to the squared difference between the given probability and the outcome (either 0 or 1) and equals the ranked probability score for the case of only two possible outcomes. The log score corresponds to the negative log of the probability assigned to the observed outcome. Just as with continuous forecasts, the log score punishes overconfidence much more harshly than underconfidence. The Brier score, on the other hand, does not distinguish between over- and underconfidence \citep{macheteContrastingProbabilisticScoring2012} and is therefore more forgiving of outlier predictions.

## Pairwise comparisons

-In order to compare performance of different models fairly, we can compute relative skill scores based on pariwise comparisons \citep{cramerEvaluationIndividualEnsemble2021}. Models enter a 'pairwise tournament', where all possible pairs of models are compared based on the overlapping set of available forecasts common to both models (omitting comparisons where there is no overlapping set of forecasts). For every pair, the ratio of the mean scores of both models is computed. The relative skill score of a model is then the geometric mean of all mean score ratios which involve that model. This gives us an indicator of performance relative to all other models. The orientation depends on the score used. For the proper scoring rules described above, smaller is better and a relative skill score smaller than 1 indicates that a model is performing better than the average model.
We can also compute a scaled relative skill score using a baseline model. In that case those mean score rations which include the baseline are excluded when taking the geometric mean to obtain relative skill scores for individual models (which therefore differ slightly from relative scores without a baseline). All individual relative skill scores are then scaled by (i.e. divided by) the relative score of the baseline model.
+In order to compare performance of different models fairly, we can compute relative skill scores based on pairwise comparisons \citep{cramerEvaluationIndividualEnsemble2021}. Models enter a 'pairwise tournament', where all possible pairs of models are compared based on the overlapping set of available forecasts common to both models (omitting comparisons where there is no overlapping set of forecasts). For every pair, the ratio of the mean scores of both models is computed. The relative skill score of a model is then the geometric mean of all mean score ratios which involve that model. This gives us an indicator of performance relative to all other models. The orientation depends on the score used. For the proper scoring rules described above, smaller is better and a relative skill score smaller than 1 indicates that a model is performing better than the average model. We can also compute a scaled relative skill score using a baseline model. In that case those mean score ratios which include the baseline are excluded when taking the geometric mean to obtain relative skill scores for individual models (which therefore differ slightly from relative scores without a baseline). All individual relative skill scores are then scaled by (i.e. divided by) the relative score of the baseline model.

It is in principle possible to compute p-values to determine whether two models perform significantly differently. \pkg{scoringutils} allows the user to compute these using either the Wilcoxon rank sum test (also known as Mann-Whitney-U test) \citep{mannTestWhetherOne1947} or a permutation test. In practice, this is slightly complicated by the fact that both tests assume independent observations. In reality, however, forecasts by a model may be correlated across time or another dimension (e.g. if a forecaster has a bad day, they will likely perform badly across different targets for a given forecast date). P-values may therefore be too liberal in suggesting significant differences where there aren't any. One way to mitigate this is to aggregate observations over a category where one suspects correlation (for example averaging across all forecasts made on a given date) before making pairwise comparisons. A test that is performed on aggregate scores will likely be more conservative.

-# Evaluating forecasts using scoringutils {#evalutation-example}
+# Evaluating forecasts using scoringutils {#evaluation-example}

The \pkg{scoringutils} package offers comprehensive functionality to conduct a forecast evaluation and allows users to check inputs, score forecasts and visualise results. Most functions operate on a `data.frame`-based format, but the package also provides a set of reliable lower-level scoring metrics operating on vectors/matrices, which experienced users can use in their own evaluation pipelines. Some helper functions for data-handling, as well as example data sets and tables with additional information about available scoring metrics are also included in the package. Internally, operations are handled using \pkg{data.table} to allow for fast and efficient computation.
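As a minimal sketch of this workflow (using only functions and example data that appear elsewhere in this manuscript; not evaluated here), scoring and summarising the quantile-based example data takes two calls:

```{r workflow-sketch, eval = FALSE}
library(scoringutils)

# score every forecast in the quantile-based example data and then
# summarise the resulting scores by model and forecast target type
scores <- score(example_quantile)
summarise_scores(scores, by = c("model", "target_type"))
```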
@@ -356,7 +318,7 @@ avail_forecasts(data = example_integer, -The forecasts and observed values themselves can be visualised using the \fct{plot\_predictions} function. It offers the user an optional ad hoc way to filter both forecasts and observed values. Forecasts and observed values can be passed in separately (and are merged internally) or as a single data.frame. Conditions to filter on need to be provided as a list of strings, where each of the strings represents an expression that can be evaluated to filter the data. To display, for example, short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021 as well as 5 weeks of prior data, we can call the following. The resulting plot is shown in Figure \ref{fig:forecast-visualistion}. +The forecasts and observed values themselves can be visualised using the \fct{plot\_predictions} function. It offers the user an optional ad hoc way to filter both forecasts and observed values. Forecasts and observed values can be passed in separately (and are merged internally) or as a single data.frame. Conditions to filter on need to be provided as a list of strings, where each of the strings represents an expression that can be evaluated to filter the data. To display, for example, short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021 as well as 5 weeks of prior data, we can call the following. The resulting plot is shown in Figure \ref{fig:forecast-visualisation}. ```{r forecast-visualisation, fig.pos = "!h", fig.width = 10, fig.height = 5, fig.cap = "Short-term forecasts for COVID-19 cases and deaths made by the EuroCOVIDhub-ensemble model on June 28 2021."} plot_predictions(data = example_quantile, @@ -421,7 +383,7 @@ The process is designed to require conscious action by the user, because the est ## Pairwise comparisons -In order to obtain a model ranking, we recommend to look at the relative skill in terms of an appropriate proper scoring rule instead of the raw score. Relative skill scores can be aggregated more easily across different forecast targets as they are less influenced by the order of magnitude of the quantity to forecast than e.g. the WIS or the CRPS. +In order to obtain a model ranking, we recommend looking at the relative skill in terms of an appropriate proper scoring rule instead of the raw score. Relative skill scores can be aggregated more easily across different forecast targets as they are less influenced by the order of magnitude of the quantity to forecast than e.g. the WIS or the CRPS. @@ -434,14 +396,14 @@ score(example_quantile) |> glimpse() ``` -When a baseline is provided, pairwise comparisons involving that baseline model are excluded from the computation of relative scores for all non-baseline models. Relative skill scores with a basline included therefore differ slightly from relative scores without a baseline. Scaled relative skill scores are then computed by scaling (i.e. dividing) all relative scores by the relative score of the baseline model. +When a baseline is provided, pairwise comparisons involving that baseline model are excluded from the computation of relative scores for all non-baseline models. Relative skill scores with a baseline included therefore differ slightly from relative scores without a baseline. Scaled relative skill scores are then computed by scaling (i.e. dividing) all relative scores by the relative score of the baseline model. 
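To illustrate the mechanics described above, the following is a hand-rolled toy version of the relative skill calculation in base R (not evaluated here). The mean scores are made up, all three models are assumed to forecast exactly the same targets, and the exclusion of baseline comparisons is omitted; \fct{pairwise\_comparison} handles partially overlapping forecast sets and baseline handling automatically.

```{r relative-skill-sketch, eval = FALSE}
# mean scores of three toy models on an identical set of forecasts
mean_scores <- c(A = 10, B = 12, C = 20)

# mean score ratios for all ordered pairs of models
ratios <- outer(mean_scores, mean_scores, "/")

# relative skill: geometric mean of a model's ratios against all other models
relative_skill <- sapply(names(mean_scores), function(m) {
  others <- setdiff(names(mean_scores), m)
  exp(mean(log(ratios[m, others])))
})
round(relative_skill, 2)  # A = 0.65, B = 0.85, C = 1.83; smaller is better

# scaled relative skill, treating model C as the baseline
round(relative_skill / relative_skill["C"], 2)
```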
Pairwise comparisons should usually be made based on unsummarised scores (\fct{pairwise\_comparison} internally summarises over samples and quantiles automatically, but nothing else), as summarising can change the set of overlapping forecasts between two models and distort relative skill scores. When computing relative skill scores using \fct{summarise_scores}, this happens by default. When using \fct{pairwise\_comparison}, the function \fct{summarise\_scores} should usually not be called beforehand. One potential exception to this is when one is interested in the p-values obtained from pairwise comparisons. As forecasts are usually highly correlated (which the calculation of p-values do not account for), it may be sensible to summaries over a few categories to reduce correlation and obtain more conservative p-values. Using the function \fct{plot\_pairwise\_comparison} we can visualise the mean score ratios between all models. The output of the following code is shown in Figure \ref{fig:pairwise-plot} (in the SI?). -```{r pairwise-plot, echo=TRUE, fig.width = 8, fig.cap="Ratios of mean scores based on overlapping forecast sets. If a tile is blue, then the model on the y-axis performed better. If it is red, the model on the x-axis performed better in direct comparison. When interpreting the plot one should look at the model on the y-axis, and the model on the x-axis is the one it is compared against. In the example above, the EuroCOVIDhub-ensemble performs best (it only has values smaller 1), while the EuroCOVIDhub-baseline performs worst (and only has values larger than 1). For cases, the UMass-MechBayes model is of course excluded as there are no case forecasts available and therefore the set of overlapping forecasts is empty."} +```{r pairwise-plot, echo=TRUE, fig.width = 8, fig.cap="Ratios of mean scores based on overlapping forecast sets. If a tile is blue, then the model on the y-axis performed better. If it is red, the model on the x-axis performed better in direct comparison. When interpreting the plot one should look at the model on the y-axis, and the model on the x-axis is the one it is compared against. In the example above, the EuroCOVIDhub-ensemble performs best (it only has values smaller than one), while the EuroCOVIDhub-baseline performs worst (and only has values larger than one). For cases, the UMass-MechBayes model is of course excluded as there are no case forecasts available and therefore the set of overlapping forecasts is empty."} score(example_quantile) |> pairwise_comparison(by = c("model", "target_type"), baseline = "EuroCOVIDhub-baseline") |> @@ -456,7 +418,7 @@ The \pkg{scoringutils} package offers a variety of functions to aid the user in For example, to detect systematic patterns it may be useful to visualise a single metric across several dimensions. The following produces a heatmap of bias values across different locations and forecast targets (output shown in Figure \ref{fig:score-heatmap}). -```{r score-heatmap, fig.pos = "!h", fig.width = 8, fig.cap = "CAPTION"} +```{r score-heatmap, fig.pos = "!h", fig.width = 8, fig.cap = "Heatmap of bias values for different models across different locations and forecast targets. 
Bias values are bound between -1 (under-prediction) and 1 (over-prediction) and are 0 ideally."} score(example_continuous) |> summarise_scores(by = c("model", "location", "target_type")) |> plot_heatmap(x = "location", metric = "bias") + @@ -465,9 +427,9 @@ score(example_continuous) |> ``` -For quantile-based forecasts, it is helpful to visualise the decomposition of the weighted interval score into its components: dispersion, over-prediction and under-prediction. This can be achieved using the function \fct{plot\_wis\_components}, as shown in Figure \ref{wis-comonents} and Figure \ref{wis-components-relative} in the SI. As described in section \ref{scoring} it is possible to convert from a sample-based to a quantile-based forecast format to make use of this decomposition even for sample-based forecasts. +For quantile-based forecasts, it is helpful to visualise the decomposition of the weighted interval score into its components: dispersion, over-prediction and under-prediction. This can be achieved using the function \fct{plot\_wis\_components}, as shown in Figure \ref{fig:wis-components} and Figure \ref{fig:wis-components-relative} in the SI. As described in section \ref{scoring} it is possible to convert from a sample-based to a quantile-based forecast format to make use of this decomposition even for sample-based forecasts. -```{r wis-components, fig.pos = "!h", fig.width = 8, fig.cap = "CAPTION"} +```{r wis-components, fig.pos = "!h", fig.width = 8, fig.cap = "Decomposition of the weighted interval score (WIS) into dispersion, over-prediction and under-prediction. The WIS components measure over- and under-prediction in absolute, rather than relative terms."} score(example_quantile) |> summarise_scores(by = c("model", "target_type")) |> plot_wis(relative_contributions = FALSE) + @@ -487,7 +449,7 @@ example_continuous |> and create PIT histograms using the function \fct{plot\_pit}. The output of the following is shown in Figure \ref{fig:pit-plots} (see a quantile-based example in Figure \ref{fig:pit-plots-quantile} in the SI): -```{r pit-plots, fig.pos = "!h", fig.cap="CAPTION", fig.width = 8, fig.height=4} +```{r pit-plots, fig.pos = "!h", fig.cap="PIT histograms of all models stratified by forecast target. Histograms should ideally be uniform. A u-shape usually indicates overconfidence (forecasts are too narrow), a hump-shaped form indicates underconfidence (forecasts are too uncertain) and a triangle-shape indicates bias.", fig.width = 8, fig.height=4} example_continuous |> pit(by = c("model", "target_type")) |> plot_pit() + @@ -496,8 +458,6 @@ example_continuous |> We can also look at interval and quantile coverage plots (explained in more detail in section \ref{probabilistic-calibration}) using the functions \fct{plot\_interval\_coverage} and \fct{plot\_quantile\_coverage}. These plots require that the columns "range" and "quantile", respectively, be present in the scores to plot, and therefore need to be included in the `by` argument when summarising scores. The output of the following is shown in Figure \ref{fig:coverage}. - - ```{r coverage, fig.width = 10, fig.pos = "!h", fig.show='hold', fig.cap = "Interval coverage and quantile coverage plots."} cov_scores <- score(example_quantile) |> summarise_scores(by = c("model", "target_type", "range", "quantile")) @@ -529,7 +489,7 @@ correlations |> Forecast evaluation is invaluable to understanding and improving current forecasts. 
The \pkg{scoringutils} package aims to facilitate this process and make it easier, even for less experienced users. It provides a fast, flexible and convenient evaluation framework based on `data.frame`s, but also makes a set of scoring functions available to more experienced users to be used in other packages or pipelines. A set of visualisations and plotting functions make help with diagnosing issues with models and allows for thorough comparison between different forecasting approaches. -The package is still under active development and we warmly welcome contributions to \pkg{scoringutils}. In the future we hope to extend the number of scoring metrics supported, especially with regard to point forecasts. We also plan to expand the plotting functionality and hope to make templates available for automated scoring reports. We are also considering to provide S3 classes for the different forecast formats. +The package is still under active development and we warmly welcome contributions to \pkg{scoringutils}. In the future we hope to extend the number of scoring metrics supported. This includes spherical scoring rules, evaluation of multinomial prediction tasks, as well as a broader range of scoring metrics for point forecasts. We also plan to expand the plotting functionality and hope to make templates available for automated scoring reports. NOT SURE WHAT ELSE TO DISCUSS? @@ -588,7 +548,7 @@ data[, 1:2] |> ## Additional analyses and visualisations {#additional} -```{r wis-components-relative, fig.width = 10, fig.cap = "CAPTION"} +```{r wis-components-relative, fig.width = 10, fig.cap = "Components of the weighted interval score normalised to one to show relative contribution of different penalties."} score(example_quantile) |> summarise_scores(by = c("model", "target_type")) |> plot_wis(relative_contributions = TRUE) + @@ -601,7 +561,7 @@ score(example_quantile) |> -```{r pit-plots-quantile, fig.cap="CAPTION"} +```{r pit-plots-quantile, fig.cap="PIT histograms based on a subset of ten equally-spaced quantiles of all models stratified by forecast target. Histograms should ideally be uniform. 
A u-shape usually indicates overconfidence (forecasts are too narrow), a hump-shaped form indicates underconfidence (forecasts are too uncertain) and a triangle-shape indicates bias."} subset(example_quantile, example_quantile$quantile %in% seq(0.1, 0.9, 0.1)) |> pit(by = c("model", "target_type")) |> plot_pit() + @@ -611,7 +571,7 @@ subset(example_quantile, example_quantile$quantile %in% seq(0.1, 0.9, 0.1)) |> ### Scores by interval ranges -If you would like to see how different forecast interval ranges contribute to average scores, you can viusalise scores by interval range: +If you would like to see how different forecast interval ranges contribute to average scores, you can visualise scores by interval range: ```{r, fig.pos = "!h", fig.width=8, fig.height=4, fig.pos="!h"} example_quantile |> diff --git a/inst/manuscript/plots/score-deviation-sd-mu.png b/inst/manuscript/plots/score-deviation-sd-mu.png new file mode 100644 index 000000000..bffe7fa83 Binary files /dev/null and b/inst/manuscript/plots/score-deviation-sd-mu.png differ diff --git a/inst/manuscript/scoringutils-paper.bib b/inst/manuscript/scoringutils-paper.bib index 44b633f4c..1377c98e0 100644 --- a/inst/manuscript/scoringutils-paper.bib +++ b/inst/manuscript/scoringutils-paper.bib @@ -78,7 +78,7 @@ @article{brierVERIFICATIONFORECASTSEXPRESSED1950 file = {/mnt/data/github-synced/zotero-nikos/storage/ZCBG3Z38/Brier - 1950 - VERIFICATION OF FORECASTS EXPRESSED IN TERMS OF PR.pdf;/mnt/data/github-synced/zotero-nikos/storage/I83583N3/1520-0493_1950_078_0001_vofeit_2_0_co_2.html} } -@misc{cramerCOVID19ForecastHub2020a, +@misc{cramerCOVID19ForecastHub2020, title = {{{COVID-19 Forecast Hub}}: 4 {{December}} 2020 Snapshot}, shorttitle = {{{COVID-19 Forecast Hub}}}, author = {Cramer, Estee and Nicholas G Reich and Serena Yijin Wang and Jarad Niemi and Abdul Hannan and Katie House and Youyang Gu and Shanghong Xie and Steve Horstman and {aniruddhadiga} and Robert Walraven and {starkari} and Michael Lingzhi Li and Graham Gibson and Lauren Castro and Dean Karlen and Nutcha Wattanachit and {jinghuichen} and {zyt9lsb} and {aagarwal1996} and Spencer Woody and Evan Ray and Frost Tianjian Xu and Hannah Biegel and GuidoEspana and Xinyue X and Johannes Bracher and Elizabeth Lee and {har96} and {leyouz}}, @@ -86,7 +86,8 @@ @misc{cramerCOVID19ForecastHub2020a month = dec, publisher = {{Zenodo}}, doi = {10.5281/zenodo.3963371}, - abstract = {This update to the COVID-19 Forecast Hub repository is a snapshot as of 4 December 2020 of the data hosted by and visualized at~https://covid19forecasthub.org/.} + abstract = {This update to the COVID-19 Forecast Hub repository is a snapshot as of 4 December 2020 of the data hosted by and visualized at~https://covid19forecasthub.org/.}, + file = {/mnt/data/github-synced/zotero-nikos/storage/AVWA2UPE/4305938.html} } @article{cramerEvaluationIndividualEnsemble2021, @@ -174,6 +175,23 @@ @article{elliottForecastingEconomicsFinance2016 file = {/mnt/data/github-synced/zotero-nikos/storage/3AXCLA59/Elliott and Timmermann - 2016 - Forecasting in Economics and Finance.pdf} } +@article{epsteinScoringSystemProbability1969, + title = {A {{Scoring System}} for {{Probability Forecasts}} of {{Ranked Categories}}}, + author = {Epstein, Edward S.}, + year = {1969}, + month = dec, + journal = {Journal of Applied Meteorology}, + volume = {8}, + number = {6}, + pages = {985--987}, + publisher = {{American Meteorological Society}}, + issn = {0021-8952}, + doi = {10.1175/1520-0450(1969)008<0985:ASSFPF>2.0.CO;2}, + langid = 
{english}, + keywords = {ranked probability score,RPS}, + file = {/mnt/data/github-synced/zotero-nikos/storage/XAVX39GC/Epstein - 1969 - A Scoring System for Probability Forecasts of Rank.pdf;/mnt/data/github-synced/zotero-nikos/storage/CVK2YPKP/A-Scoring-System-for-Probability-Forecasts-of.html} +} + @misc{europeancovid-19forecasthubEuropeanCovid19Forecast2021, title = {European {{Covid-19 Forecast Hub}}}, author = {{European Covid-19 Forecast Hub}}, @@ -316,6 +334,20 @@ @article{hamillInterpretationRankHistograms2001a file = {/mnt/data/github-synced/zotero-nikos/storage/FJYU9QZH/Hamill - 2001 - Interpretation of Rank Histograms for Verifying En.pdf;/mnt/data/github-synced/zotero-nikos/storage/SH65U38N/1520-0493_2001_129_0550_iorhfv_2.0.co_2.html} } +@article{jordanEvaluatingProbabilisticForecasts2019, + title = {Evaluating {{Probabilistic Forecasts}} with {{{\textbf{scoringRules}}}}}, + author = {Jordan, Alexander and Kr{\"u}ger, Fabian and Lerch, Sebastian}, + year = {2019}, + journal = {Journal of Statistical Software}, + volume = {90}, + number = {12}, + issn = {1548-7660}, + doi = {10.18637/jss.v090.i12}, + abstract = {Probabilistic forecasts in the form of probability distributions over future events have become popular in several fields including meteorology, hydrology, economics, and demography. In typical applications, many alternative statistical models and data sources can be used to produce probabilistic forecasts. Hence, evaluating and selecting among competing methods is an important task. The scoringRules package for R provides functionality for comparative evaluation of probabilistic models based on proper scoring rules, covering a wide range of situations in applied work. This paper discusses implementation and usage details, presents case studies from meteorology and economics, and points to the relevant background literature.}, + langid = {english}, + file = {/mnt/data/github-synced/zotero-nikos/storage/DSYW6QUF/Jordan et al. - 2019 - Evaluating Probabilistic Forecasts with bscoring.pdf} +} + @article{kukkonenReviewOperationalRegionalscale2012, title = {A Review of Operational, Regional-Scale, Chemical Weather Forecasting Models in {{Europe}}}, author = {Kukkonen, J. and Olsson, T. and Schultz, D. M. and Baklanov, A. and Klein, T. and Miranda, A. I. and Monteiro, A. and Hirtl, M. and Tarvainen, V. and Boy, M. and Peuch, V.-H. and Poupkou, A. and Kioutsioukis, I. and Finardi, S. and Sofiev, M. and Sokhi, R. and Lehtinen, K. E. J. and Karatzas, K. and San Jos{\'e}, R. and Astitha, M. and Kallos, G. and Schaap, M. and Reimer, E. and Jakobs, H. and Eben, K.}, @@ -351,6 +383,22 @@ @article{liboschikTscountPackageAnalysis2017 file = {/mnt/data/github-synced/zotero-nikos/storage/NAKUKRYZ/Liboschik et al. - 2017 - tscount An R Package for Analysis of Count Time S.pdf} } +@article{macheteContrastingProbabilisticScoring2012, + title = {Contrasting {{Probabilistic Scoring Rules}}}, + author = {Machete, Reason Lesego}, + year = {2012}, + month = jul, + journal = {arXiv:1112.4530 [math, stat]}, + eprint = {1112.4530}, + eprinttype = {arxiv}, + primaryclass = {math, stat}, + abstract = {There are several scoring rules that one can choose from in order to score probabilistic forecasting models or estimate model parameters. Whilst it is generally agreed that proper scoring rules are preferable, there is no clear criterion for preferring one proper scoring rule above another. 
This manuscript compares and contrasts some commonly used proper scoring rules and provides guidance on scoring rule selection. In particular, it is shown that the logarithmic scoring rule prefers erring with more uncertainty, the spherical scoring rule prefers erring with lower uncertainty, whereas the other scoring rules are indifferent to either option.}, + archiveprefix = {arXiv}, + langid = {english}, + keywords = {62B10; 62C05; 62G05; 62G07; 62F99; 62P05; 62P12; 62P20,Mathematics - Statistics Theory}, + file = {/mnt/data/github-synced/zotero-nikos/storage/8FYPC3Y4/Machete - 2012 - Contrasting Probabilistic Scoring Rules.pdf} +} + @article{mannTestWhetherOne1947, title = {On a {{Test}} of {{Whether}} One of {{Two Random Variables}} Is {{Stochastically Larger}} than the {{Other}}}, author = {Mann, H. B. and Whitney, D. R.}, @@ -367,6 +415,43 @@ @article{mannTestWhetherOne1947 file = {/mnt/data/github-synced/zotero-nikos/storage/YTWX67GQ/Mann and Whitney - 1947 - On a Test of Whether one of Two Random Variables i.pdf;/mnt/data/github-synced/zotero-nikos/storage/8DA4G3H8/1177730491.html} } +@article{mathesonScoringRulesContinuous1976, + title = {Scoring {{Rules}} for {{Continuous Probability Distributions}}}, + author = {Matheson, James E. and Winkler, Robert L.}, + year = {1976}, + month = jun, + journal = {Management Science}, + volume = {22}, + number = {10}, + pages = {1087--1096}, + publisher = {{INFORMS}}, + issn = {0025-1909}, + doi = {10.1287/mnsc.22.10.1087}, + abstract = {Personal, or subjective, probabilities are used as inputs to many inferential and decision-making models, and various procedures have been developed for the elicitation of such probabilities. Included among these elicitation procedures are scoring rules, which involve the computation of a score based on the assessor's stated probabilities and on the event that actually occurs. The development of scoring rules has, in general, been restricted to the elicitation of discrete probability distributions. In this paper, families of scoring rules for the elicitation of continuous probability distributions are developed and discussed.}, + file = {/mnt/data/github-synced/zotero-nikos/storage/SVJ7YPP7/Matheson and Winkler - 1976 - Scoring Rules for Continuous Probability Distribut.pdf;/mnt/data/github-synced/zotero-nikos/storage/H5CNZS4U/mnsc.22.10.html} +} + +@article{reichCollaborativeMultiyearMultimodel2019, + title = {A Collaborative Multiyear, Multimodel Assessment of Seasonal Influenza Forecasting in the {{United States}}}, + author = {Reich, Nicholas G. and Brooks, Logan C. and Fox, Spencer J. and Kandula, Sasikiran and McGowan, Craig J. and Moore, Evan and Osthus, Dave and Ray, Evan L. and Tushar, Abhinav and Yamana, Teresa K. and Biggerstaff, Matthew and Johansson, Michael A. and Rosenfeld, Roni and Shaman, Jeffrey}, + year = {2019}, + month = feb, + journal = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {8}, + pages = {3146--3154}, + publisher = {{National Academy of Sciences}}, + issn = {0027-8424, 1091-6490}, + doi = {10.1073/pnas.1812594116}, + abstract = {Influenza infects an estimated 9\textendash 35 million individuals each year in the United States and is a contributing cause for between 12,000 and 56,000 deaths annually. Seasonal outbreaks of influenza are common in temperate regions of the world, with highest incidence typically occurring in colder and drier months of the year. 
Real-time forecasts of influenza transmission can inform public health response to outbreaks. We present the results of a multiinstitution collaborative effort to standardize the collection and evaluation of forecasting models for influenza in the United States for the 2010/2011 through 2016/2017 influenza seasons. For these seven seasons, we assembled weekly real-time forecasts of seven targets of public health interest from 22 different models. We compared forecast accuracy of each model relative to a historical baseline seasonal average. Across all regions of the United States, over half of the models showed consistently better performance than the historical baseline when forecasting incidence of influenza-like illness 1 wk, 2 wk, and 3 wk ahead of available data and when forecasting the timing and magnitude of the seasonal peak. In some regions, delays in data reporting were strongly and negatively associated with forecast accuracy. More timely reporting and an improved overall accessibility to novel and traditional data sources are needed to improve forecasting accuracy and its integration with real-time public health decision making.}, + chapter = {PNAS Plus}, + copyright = {Copyright \textcopyright{} 2019 the Author(s). Published by PNAS.. https://creativecommons.org/licenses/by-nc-nd/4.0/This open access article is distributed under Creative Commons Attribution-NonCommercial-NoDerivatives License 4.0 (CC BY-NC-ND).}, + langid = {english}, + pmid = {30647115}, + keywords = {forecasting,infectious disease,influenza,public health,statistics}, + file = {/mnt/data/github-synced/zotero-nikos/storage/XEKLR37W/Reich et al. - 2019 - A collaborative multiyear, multimodel assessment o.pdf;/mnt/data/github-synced/zotero-nikos/storage/QID3PLW4/3146.html} +} + @article{timmermannForecastingMethodsFinance2018, title = {Forecasting {{Methods}} in {{Finance}}}, author = {Timmermann, Allan},