Merge pull request #42 from mattheaphy/dev

exp_df and trx_df coercion
mattheaphy · Nov 25, 2023 · 4854843 · 4854843
2 parents 351e5fb + 5093881
commit 4854843
Show file tree

Hide file tree

Showing 28 changed files with 961 additions and 49 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -67,7 +67,9 @@ export(add_predictions)
 export(add_transactions)
 export(anti_join)
 export(arrange)
+export(as_exp_df)
 export(as_exposed_df)
+export(as_trx_df)
 export(autoplot)
 export(autotable)
 export(bake)
@@ -88,8 +90,10 @@ export(full_join)
 export(group_by)
 export(groups)
 export(inner_join)
+export(is_exp_df)
 export(is_exposed_df)
 export(is_split_exposed_df)
+export(is_trx_df)
 export(left_join)
 export(mutate)
 export(plot_actual_to_expected)

diff --git a/NEWS.md b/NEWS.md
@@ -18,6 +18,13 @@ when `exp_stats()` is passed a weighting variable.
 - Added a `summary()` method for `exposed_df` objects that calls `exp_stats()`.
 - The assumed default status in `expose()` functions was changed from the first
 observed status to the most common status.
+- The functions `as_exp_df()` and `as_trx_df()` were added to convert 
+pre-aggregated experience studies to the `exp_df` and `trx_df` formats, 
+respectively.
+- `agg_sim_dat` - a new simulated data set of pre-aggregated experience was 
+added for testing `as_exp_df()` and `as_trx_df()`.
+- `is_exp_df()` and `as_trx_df()` were added to test for the `exp_df` and 
+`trx_df` classes.
 
 # actxps 1.3.0
 

diff --git a/R/agg_sim_dat.R b/R/agg_sim_dat.R
@@ -0,0 +1,40 @@
+#' Aggregate simulated annuity data
+#'
+#' A pre-aggregated version of surrender and withdrawal experience from the
+#' simulated data sets `census_dat`, `withdrawals`, and `account_vals`. This
+#' data is theoretical only and does not represent the experience on any
+#' specific product.
+#'
+#' @format A data frame containing summarized experience study results grouped
+#' by policy year, income guarantee presence, tax-qualified status, and product.
+#'
+#' @details
+#'
+#' \describe{
+#'   \item{pol_yr}{Policy year}
+#'   \item{inc_guar}{Indicates whether the policy was issued with an income
+#'   guarantee}
+#'   \item{qual}{Indicates whether the policy was purchased with tax-qualified
+#'   funds}
+#'   \item{product}{Product: a, b, or c}
+#'   \item{exposure_n}{Sum of policy year exposures by count}
+#'   \item{claims_n}{Sum of claim counts}
+#'   \item{av}{Sum of account value}
+#'   \item{exposure_amt}{Sum of policy year exposures weighted by account value}
+#'   \item{claims_amt}{Sum of claims weighted by account value}
+#'   \item{av_sq}{Sum of squared account values}
+#'   \item{n}{Number of exposure records}
+#'   \item{wd}{Sum of partial withdrawal transactions}
+#'   \item{wd_n}{Count of partial withdrawal transactions}
+#'   \item{wd_flag}{Count of exposure records with partial withdrawal
+#'   transactions}
+#'   \item{wd_sq}{Sum of squared partial withdrawal transactions}
+#'   \item{av_w_wd}{Sum of account value for exposure records with partial
+#'   withdrawal transactions}
+#' }
+#' @seealso [census_dat]
+#' @name agg_sim_dat
+
+NULL
+#' @rdname agg_sim_dat
+"agg_sim_dat"
diff --git a/R/exp_df_helpers.R b/R/exp_df_helpers.R
@@ -0,0 +1,160 @@
+#' Termination summary helper functions
+#'
+#' Convert aggregate termination experience studies to the `exp_df` class.
+#'
+#' `is_exp_df()` will return `TRUE` if `x` is an `exp_df` object.
+#'
+#' `as_exp_df()` will coerce a data frame to an `exp_df` object if that
+#' data frame has columns for exposures and claims.
+#'
+#' `as_exp_df()` is most useful for working with aggregate summaries of
+#' experience that were not created by actxps where individual policy
+#' information is not available. After converting the data to the `exp_df`
+#' class, [summary()] can be used to summarize data by any grouping variables,
+#' and [autoplot()] and [autotable()] are available for reporting.
+#'
+#' If nothing is passed to `wt`, the data frame `x` must include columns
+#' containing:
+#'
+#' - Exposures (`exposure`)
+#' - Claim counts (`claims`)
+#'
+#' If `wt` is passed, the data must include columns containing:
+#'
+#' - Weighted exposures (`exposure`)
+#' - Weighted claims (`claims`)
+#' - Claim counts (`n_claims`)
+#' - The raw sum of weights **NOT** multiplied by exposures
+#' - Exposure record counts (`.weight_n`)
+#' - The raw sum of squared weights (`.weight_sq`)
+#'
+#' The names in parentheses above are expected column names. If the data
+#' frame passed to `as_exp_df()` uses different column names, these can be
+#' specified using the `col_*` arguments.
+#'
+#' When a column name is passed to `wt`, the columns `.weight`, `.weight_n`,
+#' and `.weight_sq` are used to calculate credibility and confidence intervals.
+#' If credibility and confidence intervals aren't required, then it is not
+#' necessary to pass anything to `wt`. The results of `as_exp_df()` and any
+#' downstream summaries will still be weighted as long as the exposures and
+#' claims are pre-weighted.
+#'
+#' `target_status`, `start_date`, and `end_date` are optional arguments that are
+#' only used for printing the resulting `exp_df` object.
+#'
+#' @param x An object. For `as_exp_df()`, `x` must be a data frame.
+#' @param expected A character vector containing column names in x with
+#' expected values
+#' @param wt Optional. Length 1 character vector. Name of the column in `x`
+#' containing weights to use in the calculation of claims, exposures, partial
+#' credibility, and confidence intervals.
+#' @param col_claims Optional. Name of the column in `x` containing claims. The
+#' assumed default is "claims".
+#' @param col_exposure Optional. Name of the column in `x` containing exposures.
+#' The assumed default is "exposure".
+#' @param col_n_claims Optional and only used used when `wt` is passed. Name of
+#' the column in `x` containing the number of claims.
+#' @param col_weight_sq Optional and only used used when `wt` is passed. Name of
+#' the column in `x` containing the sum of squared weights.
+#' @param col_weight_n Optional and only used used when `wt` is passed. Name of
+#' the column in `x` containing exposure record counts.
+#' @param credibility If `TRUE`, future calls to [summary()] will include
+#' partial credibility weights and credibility-weighted termination rates.
+#' @param conf_level Confidence level used for the Limited Fluctuation
+#' credibility method and confidence intervals
+#' @param cred_r Error tolerance under the Limited Fluctuation credibility
+#' method
+#' @param conf_int If `TRUE`, future calls to [summary()] will include
+#' confidence intervals around the observed termination rates and any
+#' actual-to-expected ratios.
+#' @inheritParams expose
+#'
+#' @return For `is_exp_df()`, a length-1 logical vector. For `as_exp_df()`,
+#' an `exp_df` object.
+#'
+#' @seealso [exp_stats()] for information on how `exp_df` objects are typically
+#' created from individual exposure records.
+#'
+#' @examples
+#' # convert pre-aggregated experience into an exp_df object
+#' dat <- as_exp_df(agg_sim_dat, col_exposure = "exposure_n",
+#'                  col_claims = "claims_n",
+#'                  target_status = "Surrender",
+#'                  start_date = 2005, end_date = 2019,
+#'                  conf_int = TRUE)
+#' dat
+#' is_exp_df(dat)
+#'
+#' # summary by policy year
+#' summary(dat, pol_yr)
+#'
+#' # repeat the prior exercise on a weighted basis
+#' dat_wt <- as_exp_df(agg_sim_dat, wt = "av",
+#'                     col_exposure = "exposure_amt",
+#'                     col_claims = "claims_amt",
+#'                     col_n_claims = "claims_n",
+#'                     col_weight_sq = "av_sq",
+#'                     col_weight_n = "n",
+#'                     target_status = "Surrender",
+#'                     start_date = 2005, end_date = 2019,
+#'                     conf_int = TRUE)
+#' dat_wt
+#'
+#' # summary by policy year
+#' summary(dat_wt, pol_yr)
+#'
+#'
+#' @export
+as_exp_df <- function(x, expected = NULL, wt = NULL,
+                      col_claims, col_exposure,
+                      col_n_claims, col_weight_sq, col_weight_n,
+                      target_status = NULL,
+                      start_date = as.Date("1900-01-01"), end_date = NULL,
+                      credibility = FALSE,
+                      conf_level = 0.95, cred_r = 0.05, conf_int = FALSE) {
+
+  if (is_exp_df(x)) return(x)
+
+  if (!is.data.frame(x)) {
+    rlang::abort("`x` must be a data frame.")
+  }
+
+  # column name alignment
+  if (!missing(col_exposure)) x <- x |> rename(exposure = {{col_exposure}})
+  if (!missing(col_claims)) x <- x |> rename(claims = {{col_claims}})
+
+  if (is.null(wt)) {
+    req_names <- c("exposure", "claims")
+  } else {
+    req_names <- c("exposure", "claims", "n_claims", ".weight",
+                   ".weight_sq", ".weight_n")
+    if (!missing(col_n_claims)) x <- x |> rename(n_claims = {{col_n_claims}})
+    x <- x |> rename(.weight = {{wt}})
+    if (!missing(col_weight_sq)) x <- x |>
+        rename(.weight_sq = {{col_weight_sq}})
+    if (!missing(col_weight_n)) x <- x |> rename(.weight_n = {{col_weight_n}})
+  }
+
+  # check required columns
+  verify_col_names(names(x), req_names)
+
+  if (is.null(wt)) x$n_claims <- x$claims
+
+  new_exp_df(x,
+             .groups = list(),
+             target_status = target_status,
+             start_date = start_date,
+             expected = expected,
+             end_date = end_date,
+             wt = wt,
+             credibility = credibility,
+             conf_level = conf_level, cred_r = cred_r,
+             conf_int = conf_int)
+
+}
+
+#' @export
+#' @rdname as_exp_df
+is_exp_df <- function(x) {
+  inherits(x, "exp_df")
+}
diff --git a/R/exp_stats.R b/R/exp_stats.R
@@ -332,7 +332,23 @@ finish_exp_stats <- function(.data, target_status, expected,
                .after = dplyr::last_col())
   }
 
-  tibble::new_tibble(res,
+  new_exp_df(res,
+             .groups = .groups,
+             target_status = target_status,
+             start_date = start_date,
+             expected = expected,
+             end_date = end_date,
+             wt = wt,
+             credibility = credibility,
+             conf_level = conf_level, cred_r = cred_r,
+             conf_int = conf_int)
+}
+
+# low level class constructor
+new_exp_df <- function(x, .groups, target_status, start_date, expected,
+                       end_date, wt, credibility, conf_level,
+                       cred_r = cred_r, conf_int) {
+  tibble::new_tibble(x,
                      class = "exp_df",
                      groups = .groups,
                      target_status = target_status,
@@ -341,7 +357,8 @@ finish_exp_stats <- function(.data, target_status, expected,
                      end_date = end_date,
                      wt = wt,
                      xp_params = list(credibility = credibility,
-                                      conf_level = conf_level, cred_r = cred_r,
+                                      conf_level = conf_level,
+                                      cred_r = cred_r,
                                       conf_int = conf_int))
 }
 

diff --git a/R/expose_split.R b/R/expose_split.R
@@ -37,7 +37,8 @@
 #' @examples
 #' toy_census |> expose_cy("2022-12-31") |> expose_split()
 #'
-#' @seealso [expose()]
+#' @seealso [expose()] for information on creating exposure records from census
+#' data.
 #'
 #' @export
 expose_split <- function(.data) {

diff --git a/R/exposed_df_helpers.R b/R/exposed_df_helpers.R
@@ -42,6 +42,8 @@
 #' `as_exposed_df()`, an `exposed_df` object.
 #'
 #' @importFrom vctrs vec_ptype2 vec_cast
+#' @seealso [expose()] for information on how `exposed_df` objects are typically
+#' created from census data.
 #'
 #' @export
 is_exposed_df <- function(x) {
@@ -119,16 +121,11 @@ as_exposed_df <- function(x, end_date, start_date = as.Date("1900-01-01"),
 
   # check required columns
   # pol_num, status, exposure, 2 date cols, policy period (policy expo only)
-  unmatched <- c("pol_num", "status", "exposure",
+  req_names <- c("pol_num", "status", "exposure",
                  exp_col_pol_per,
                  exp_cols_dates,
                  exp_cols_trx)
-  unmatched <- setdiff(unmatched, names(x))
-
-  if (length(unmatched) > 0) {
-    rlang::abort(c(x = glue::glue("The following columns are missing from `x`: {paste(unmatched, collapse = ', ')}."),
-                   i = "Hint: create these columns or use the `col_*` arguments to specify existing columns that should be mapped to these elements."))
-  }
+  verify_col_names(names(x), req_names)
 
   if (missing(default_status)) {
     default_status <- most_common(x$status)
@@ -612,3 +609,13 @@ verify_get_trx_types <- function(.data, required = TRUE) {
   }
   trx_types
 }
+
+# function to verify that required names exist and to send an error if not
+verify_col_names <- function(x_names, required) {
+  unmatched <- setdiff(required, x_names)
+
+  if (length(unmatched) > 0) {
+    rlang::abort(c(x = glue::glue("The following columns are missing: {paste(unmatched, collapse = ', ')}."),
+                   i = "Hint: create these columns or use the `col_*` arguments to specify existing columns that should be mapped to these elements."))
+  }
+}
diff --git a/R/sim_data.R b/R/sim_data.R
@@ -43,7 +43,7 @@
 #'   \item{av_anniv}{Account value on the policy anniversary date}
 #' }
 #'
-
+#' @seealso [census_dat]
 #' @name sim_data
 
 NULL
-Original file line number
+Diff line change
@@ Expand Up / @@ -43,7 +43,7 @@ @@
     #'   \item{av_anniv}{Account value on the policy anniversary date}
     #' }
     #'
+    #' @seealso [census_dat]
     #' @name sim_data
     NULL
@@ Expand Down @@