From 9eb71fd60b0c2f43c674b0e49bada9b7c2eef72b Mon Sep 17 00:00:00 2001
From: Christophe Dutang
Date: Fri, 27 Sep 2024 11:01:15 +0200
Subject: [PATCH] add Python vignette

---
 DESCRIPTION | 14 +-
 NEWS.md | 2 +-
 _pkgdown.yml | 8 +
 man/pricingame.Rd | 2 +-
 vignettes/insur_fair_vignette_rf.qmd | 1241 ++++++++++++++++++++++++++
 vignettes/mortality_vignette.qmd | 603 +++++++++++++
 vignettes/references.bib | 142 ++-
 7 files changed, 2005 insertions(+), 7 deletions(-)
 create mode 100644 vignettes/insur_fair_vignette_rf.qmd
 create mode 100644 vignettes/mortality_vignette.qmd

diff --git a/DESCRIPTION b/DESCRIPTION
index 666b01b..4e6aef4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -6,14 +6,20 @@ Authors@R: c(person("Christophe", "Dutang", role = c("aut", "cre"), email = "dut
 person("Arthur", "Charpentier", role = c("aut"), comment = c(ORCID = "0000-0003-3654-6286")),
 person("Ewen", "Gallic", role = c("ctb"), comment = c(ORCID = "0000-0003-3740-2620")),
 person("Julien", "Siharath", role = c("ctb")))
-Description: A collection of insurance datasets, originally for the book 'Computational Actuarial Science with R'
+Description: A collection of insurance datasets, originally
+ for the book 'Computational Actuarial Science with R'
- edited by Arthur Charpentier. Now, the package contains a large variety of actuarial datasets.
+ edited by Arthur Charpentier. Now, the package contains a
+ large variety of insurance datasets. Vignettes provide
+ typical use-cases in actuarial science.
 Depends: R (>= 3.6.0), xts, survival
 Imports: lattice
-Suggests: sp, sf, knitr, kableExtra, quarto, rmarkdown, AER, boot, broom, ChainLadder, demography, dplyr, forecast, ggplot2, glmnet, MASS, mgcv, pscl, rainbow, RColorBrewer, splines, tidyr, tidyverse, wesanderson
+Suggests: sp, sf, knitr, kableExtra, quarto, rmarkdown,
+ AER, boot, broom, ChainLadder, demography, dplyr, forecast,
+ ggplot2, glmnet, MASS, mgcv, pscl, rainbow, RColorBrewer,
+ reticulate, splines, tidyr, tidyverse, wesanderson
 License: GPL (>= 2)
-URL: https://dutangc.github.io/CASdatasets/, http://dutangc.free.fr/pub/RRepos/, http://cas.uqam.ca/, http://dutangc.perso.math.cnrs.fr/RRepository/
+URL: https://dutangc.github.io/CASdatasets/, http://dutangc.free.fr/pub/RRepos/, https://freakonometrics.github.io/CASdatasets/, http://dutangc.perso.math.cnrs.fr/RRepository/
 BugReports: https://github.com/dutangc/CASdatasets/issues
 NeedsCompilation: no
 VignetteBuilder: quarto
diff --git a/NEWS.md b/NEWS.md
index 3a9b5d7..e19b8f0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,7 +8,7 @@ Current version for development.
 - All **Geographical datasets** have been put in the extra directory `geodata/` of this repository.
 - Put new packages in Suggest fields.
 - New datasets have been added (see below).
-- Add quarto vignettes with detailed use cases thanks to Julien.
+- Add quarto vignettes with detailed use cases thanks to Julien. One vignette uses Python with the following packages: equipy, pandas, numpy, sklearn, seaborn, matplotlib, typing.
 - The package has been uploaded on the [French research data repository](https://entrepot.recherche.data.gouv.fr) and obtained a DOI : [10.57745/P0KHAG](https://doi.org/10.57745/P0KHAG).
## Datasets lists diff --git a/_pkgdown.yml b/_pkgdown.yml index 8e3203d..8bc7a5d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -287,6 +287,14 @@ articles: contents: - triangle_vignette - triangle_aids_vignette +- title: Life insurance -- use cases of mortality modeling + navbar: ~ + contents: + - mortality_vignette +- title: Fairness analysis + navbar: ~ + contents: + - insur_fair_vignette_rf toc: number_sections: yes diff --git a/man/pricingame.Rd b/man/pricingame.Rd index 708001e..accf8d4 100644 --- a/man/pricingame.Rd +++ b/man/pricingame.Rd @@ -249,7 +249,7 @@ data(pg17testyear4) } -The bonus/malus system is compulsary in France, but we will only use it here as a possible feature. The coefficient is attached to the driver. It starts at 1 for young drivers (i.e. first year of insurance). Then, every year without claim, the bonus decreases by 5 percent until it reaches its minimum of 0.5. Without any claim, the bonus evolution would then be : 1 \eqn{->} 0.95 \eqn{->} 0.9 \eqn{->} 0.85 \eqn{->} 0.8 \eqn{->} 0.76 \eqn{->} 0.72 \eqn{->} 0.68 \eqn{->} 0.64 \eqn{->} 0.6 \eqn{->} 0.57 \eqn{->} 0.54 \eqn{->} 0.51 \eqn{->} 0.5. +The bonus/malus system is compulsory in France, but we will only use it here as a possible feature. The coefficient is attached to the driver. It starts at 1 for young drivers (i.e. first year of insurance). Then, every year without claim, the bonus decreases by 5 percent until it reaches its minimum of 0.5. Without any claim, the bonus evolution would then be : 1 \eqn{->} 0.95 \eqn{->} 0.9 \eqn{->} 0.85 \eqn{->} 0.8 \eqn{->} 0.76 \eqn{->} 0.72 \eqn{->} 0.68 \eqn{->} 0.64 \eqn{->} 0.6 \eqn{->} 0.57 \eqn{->} 0.54 \eqn{->} 0.51 \eqn{->} 0.5. Every time the driver causes a claim (only certain types of claims are taken into account), the coefficient increases by 25 percent, with a maximum of 3.5. Thus, the range of bonus/malus coefficient extends from 0.5 to 3.5 in the datasets. 
}
diff --git a/vignettes/insur_fair_vignette_rf.qmd b/vignettes/insur_fair_vignette_rf.qmd
new file mode 100644
index 0000000..210f2bd
--- /dev/null
+++ b/vignettes/insur_fair_vignette_rf.qmd
@@ -0,0 +1,1241 @@
+---
+title: "Optimal transport and fairness in a French insurance data analysis"
+date: '`r Sys.Date()`'
+date-format: "MMMM, YYYY"
+author: "Julien Siharath"
+chapters:
+  - Introduction
+  - Purpose
+  - Methods
+  - Estimation
+  - Fairness
+  - Visualisation
+categories: [equipy, Public liability, Optimal transport, fairness, France, freMPL1sub, Python]
+bibliography: references.bib
+fig-cap-location: top
+editor:
+  markdown:
+    wrap: 72
+format:
+  html:
+    embed-resources: true
+    code-fold: true
+    code-summary: "Show the code"
+    css: style.scss
+    theme: cosmo
+    toc: true
+    toc-depth: 2
+vignette: >
+  %\VignetteIndexEntry{Optimal transport and fairness in a French insurance data analysis}
+  %\VignetteEngine{quarto::html}
+  %\VignetteEncoding{UTF-8}
+---
+
+# Introduction {#sec-equipy-introduction}
+
+```{r setup, message=FALSE, warning=FALSE}
+#| code-fold: true
+#| code-summary: "Session Settings"
+#| label: load-packages
+
+# Graphs----
+face_text='plain'
+face_title='plain'
+size_title = 14
+size_text = 11
+legend_size = 11
+
+global_theme <- function() {
+  theme_minimal() %+replace%
+    theme(
+      text = element_text(size = size_text, face = face_text),
+      legend.position = "bottom",
+      legend.direction = "horizontal",
+      legend.box = "vertical",
+      legend.key = element_blank(),
+      legend.text = element_text(size = legend_size),
+      axis.text = element_text(size = size_text, face = face_text),
+      plot.title = element_text(
+        size = size_title,
+        hjust = 0.5
+      ),
+      plot.subtitle = element_text(hjust = 0.5)
+    )
+}
+
+# Outputs
+options("digits" = 2)
+
+```
+
+```{css setup, echo = FALSE}
+.justify {
+  text-align: justify !important
+}
+```
+
+::: callout-warning
+
+**This vignette uses Python code.**
+
+:::
+
+::: callout-tip
+### In Brief
+
+Insurance regulations often prohibit discrimination based on sensitive factors such as age and gender. In this vignette, we will explore a hypothetical law that prevents insurance companies from using age and gender as criteria for discrimination. To simulate this scenario, we will model claim occurrences in **Python** using a Random Forest algorithm and apply optimal transport techniques with the `equipy` package. This approach ensures fairness within the `freMPL1sub` dataset from @charpentierCAS, which contains detailed information on French motor insurance contracts and claims.
+
+:::
+## Required Packages
+
+
+```{r libraries, message=FALSE}
+#| code-fold: false
+#| code-summary: "R packages"
+
+# R package required to run Python code in a Quarto document
+
+required_R_libraries <- c(
+  "reticulate"
+)
+invisible(lapply(required_R_libraries, library, character.only = TRUE))
+```
+
+```{python libraries-python}
+#| code-fold: false
+#| code-summary: "Python packages"
+
+from equipy import metrics
+from equipy import fairness
+from equipy import graphs
+from equipy.metrics import unfairness, performance
+from equipy.fairness import MultiWasserstein
+from equipy.graphs import fair_density_plot
+
+import pandas as pd
+import numpy as np
+
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, f1_score, confusion_matrix
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+import re
+from typing import Union, Optional
+
+from equipy.graphs import fair_arrow_plot, fair_multiple_arrow_plot
+import warnings
+
+
+np.random.seed(2024)
+```
+
+## Data
+
+::: justify
+
+The data used in this vignette comes from a French motor third-party liability insurance portfolio. The dataset, `freMPL1sub`, is part of the CASdatasets package in R and contains details about contracts and clients obtained from a French insurance company, specifically related to a motor insurance portfolio.
+
+This dataset is a subset of several datasets, such as `freMTPL1`, `freMTPL2`, and others, all of which are available in the CASdatasets package. The subset has been filtered to include only instances where the exposure is greater than 0.9, allowing for the effective application of classification trees for more precise analysis.
+
+:::
+
+## Dictionaries
+
+The list of the 18 variables from the `freMPL1sub` dataset is reported
+in @tbl-dict-freMPL1sub.
+ +::: justify + +| Attribute | Type | Description | +|-------------|---------|---------------------------------------------------------| +| LicAge | Numeric | The driving licence age, in months | +| VehAge | Numeric | Age of the vehicle in years | +| sensitive | Factor | Gender of the driver | +| MariStat | Factor | Marital status of the driver | +| SocioCateg | Factor | Socio-economic category of the driver | +| VehUsage | Factor | Usage of the vehicle | +| DrivAge | Numeric | Age of the driver in years | +| HasKmLimit | boolean | Indicator if there's a mileage limit | +| BonusMalus | Numeric | Bonus-malus coefficient | +| VehBody | Factor | Body type of the vehicle | +| VehPrice | Factor | Price category of the vehicle | +| VehEngine | Factor | Type of engine | +| VehEnergy | Factor | Type of fuel used (diesel or regular) | +| VehMaxSpeed | Factor | Maximum speed category of the vehicle | +| VehClass | Factor | Vehicle class | +| y | boolean | Response variable (if there is a claim) | +| RiskVar | Numeric | Risk variable | +| Garage | Factor | Type of garage used | + +: Content of the `freMPL1sub` dataset {#tbl-dict-freMPL1sub +.striped .hover} + +::: + +## Importation + +```{r} +library(CASdatasets) +data("freMPL1sub") +``` + + +```{python importation-python} +#| code-fold: true +#| code-summary: "Code for importing our datasets on python environment" + +#freMPL1sub_equipy_path = 'data/freMPL1sub.csv' +#pd.read_csv(freMPL1sub_equipy_path) + +freMPL1sub = r.freMPL1sub + +freMPL1sub['Age'] = '65-' +freMPL1sub.loc[freMPL1sub['DrivAge'] >= 65, 'Age'] = '65+' +freMPL1sub.drop('DrivAge', axis=1, inplace=True) +``` + + +# Purpose {#sec-equipy-purpose} + +::: justify + +In the context of insurance, understanding the balance between "fair" and "unfair" discrimination is critical. One notable example is age-based discrimination, often seen as acceptable in insurance pricing. Younger drivers—despite being statistically more accident-prone—are typically charged lower premiums than their actual risk would justify. The extra costs are, instead, borne by older, more experienced drivers, who end up paying more than what their individual risk profile alone would suggest. This practice, while technically discriminatory, is accepted because it’s part of a generational balancing act: over time, younger drivers eventually mature and will shoulder the cost for future younger generations. + +This is known as acceptable discrimination in insurance, where premiums are adjusted based on factors like age and driving experience. However, other forms of discrimination, particularly those based on sensitive variables such as gender or ethnicity, are prohibited under laws like the **Anti-Discrimination Law of 2008** in France. Sensitive variables are protected by law, meaning they cannot be used as direct factors in pricing models, as this could reinforce social inequalities. + +The challenge arises when proxy variables—those that indirectly encode similar information—can still introduce bias. For example, while gender might be excluded, variables like vehicle type or occupation can act as proxies, unintentionally reintroducing discriminatory patterns. Therefore, insurers face the difficult task of balancing fairness and ensuring the economic viability of their models. + +The dilemma is clear when considering male and female drivers. If premiums for male drivers, who statistically have higher accident rates, are set too low, insurers may face financial loss. 
Conversely, if premiums are set too high for female drivers, who generally pose lower risk, it could result in unfair pricing and limit their access to affordable insurance. This highlights the importance of maintaining acceptable discrimination based on actual risk while avoiding bias that would be prohibited by law. + +To address these challenges, advanced mathematical tools like **Optimal Transport** and the **Wasserstein barycenter** can help. Rather than aligning premiums solely with the lower-risk group, such as older drivers, or the higher-risk group, such as younger drivers, which can skew fairness and financial stability, the Wasserstein barycenter offers a middle ground. It calculates a balanced distribution of risk between these groups, minimizing disparity without sacrificing economic viability. Additionally, the barycenter conserves the mean of the premiums after transformation, ensuring that the overall premium level remains financially viable for the insurer. + +Mathematically, given distributions $\mu_1, \mu_2, \dots, \mu_n$ corresponding to different demographic groups, the Wasserstein barycenter $\mu^*$ is defined as: + +$$ +\mu^* = \arg\min_{\mu} \sum_{i=1}^{n} \lambda_i W_2^2(\mu, \mu_i) +$$ + +where $W_2$ denotes the Wasserstein distance of order 2, and $\lambda_i$ are the weights associated with each distribution $\mu_i$. + +A key component in ensuring fairness is **strong demographic parity**. This principle requires that the probability distributions of model outcomes be the same across different protected groups. Mathematically, it demands that the distribution of scores for one group, say $m(X, S = A)$, must be equivalent to the distribution for another group, $m(X, S = B)$: + +$$ +m(X, S = A) \sim m(X, S = B) +$$ + +In this context, $m(X, S = A)$ represents the predicted scores for individuals in group $A$ (e.g., male), while $m(X, S = B)$ represents the predicted scores for individuals in group $B$ (e.g., female). The symbol $\sim$ indicates that the distributions of these predicted scores should be statistically similar or identical. + +This ensures that the model's outcomes do not systematically favor one group over another. For example, if group $A$ consistently receives lower premiums than group $B$ for similar risk profiles, the model would violate strong demographic parity. By requiring that these score distributions be the same, this method helps to eliminate any bias that might arise from group membership alone, ensuring that decisions are made based purely on risk-related features. + +Tools like the **`equipy` package** provide practical implementations of these concepts. By applying Wasserstein distances and adjusting model outputs, `equipy` ensures that sensitive variables like age and gender do not lead to biased pricing models. For instance, if a **Random Forest** model is used to predict claim occurrences, `equipy` can modify the model in a post-processing step to enforce fairness while maintaining predictive accuracy. + +Balancing fairness with economic sustainability is crucial, particularly when modeling high-risk groups like young drivers. Historical practices might justify lower premiums for them, but using **Optimal Transport** and the **Wasserstein barycenter** techniques ensures that premiums are equitable across demographics without compromising the financial health of the insurer. 
In this vignette, we will explore a theoretical case where fairness is enforced in pricing across both young and old drivers, as well as between male and female drivers. This demonstrates how fairness can be introduced in insurance pricing, ensuring compliance with ethical and legal standards while maintaining economic viability. + +Beyond legal compliance, fairness methods offer broader benefits. Ensuring transparent and equitable pricing enhances customer trust and aligns with ethical AI standards, as explored in sources like @sauce2023ai, which examines strategies for addressing proxy discrimination in AI-driven models. By applying these fairness principles, insurers can maintain compliance while fostering fairness in their decision-making processes. + +To summarize, **Wasserstein barycenters** and **Optimal Transport** provide innovative solutions to fairness challenges in insurance. They ensure that premiums reflect risk fairly while adhering to ethical and legal standards. As discussed in the Visualizations section, methods like **Fairness by Unawareness** and **Fairness through Awareness** will be explored to address fairness in predictive modeling. + +::: + +# Methods + +## Random Forest Model + +### Random Forest Overview + +Random Forest, as described in @breiman2001random, is a versatile machine learning technique that belongs to a class of ensemble learning methods. In this method, multiple decision trees are constructed during training, and their predictions are combined to produce a more accurate and stable model. Specifically, predictions are averaged for regression tasks and voted on for classification tasks. + +The key idea behind Random Forest is to build a "forest" of decision trees, where each tree is trained on a random subset of the data and a random subset of the features. This randomness makes the model more robust and less prone to overfitting, which is a common issue with individual decision trees. + +The process of building a Random Forest can be summarized in the following steps: + +1. **Bootstrap Sampling**: Multiple subsets of the original dataset are created using bootstrapping, which involves randomly selecting samples with replacement. +2. **Random Feature Selection**: At each node of the tree, a random subset of features is selected, and the best feature from this subset is used to split the data. +3. **Tree Construction**: Decision trees are constructed for each bootstrap sample, and these trees grow without pruning, resulting in very deep and complex structures. +4. **Aggregation**: For regression tasks, predictions from all trees are averaged. For classification tasks, the most frequent prediction (mode) is selected. + +The Random Forest model can be mathematically represented as: + +$$ +\hat{y}_i = \frac{1}{T} \sum_{t=1}^{T} f_t(x_i) +$$ + +where $\hat{y}_i$ is the predicted value for the $i^{th}$ observation, $T$ is the total number of trees, and $f_t(x_i)$ represents the prediction of the $t^{th}$ tree. + +One of the key advantages of Random Forest is its ability to handle large datasets with high dimensionality and missing data. The random selection of features at each split ensures that the model doesn't rely too heavily on any single feature, reducing the risk of overfitting. Additionally, Random Forest provides a measure of **feature importance**, ranking the contribution of each feature to the model's predictive power. This is particularly useful for understanding which features are most influential in making predictions. 
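+
+To make the aggregation formula and the notion of feature importance concrete, the short sketch below fits a small forest on synthetic data; it is purely illustrative (it is not part of the `freMPL1sub` analysis that follows) and only assumes scikit-learn's standard `RandomForestClassifier` interface, with `X_demo`, `y_demo` and `rf_demo` being made-up names for this example.
+
+```{python}
+#| code-fold: true
+#| label: "rf-aggregation-sketch"
+#| code-summary: "Illustrative sketch: tree aggregation and feature importance"
+
+# Illustrative sketch on synthetic data (not the freMPL1sub case study)
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+
+X_demo, y_demo = make_classification(n_samples=500, n_features=6,
+                                     n_informative=3, random_state=0)
+rf_demo = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_demo, y_demo)
+
+# The forest probability is the average of the per-tree predictions; with fully
+# grown trees it typically matches the fraction of trees voting for class 1
+vote_share = np.mean([tree.predict(X_demo[:5]) for tree in rf_demo.estimators_], axis=0)
+print("Average of per-tree votes:", vote_share.round(2))
+print("Forest probabilities     :", rf_demo.predict_proba(X_demo[:5])[:, 1].round(2))
+
+# Impurity-based feature importances, normalised to sum to one
+for idx in np.argsort(rf_demo.feature_importances_)[::-1]:
+    print(f"feature {idx}: importance = {rf_demo.feature_importances_[idx]:.3f}")
+```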
+
+In conclusion, Random Forest is widely appreciated for its high accuracy, robustness, and ease of use. It is highly effective for both regression and classification tasks and is especially valuable in situations with a large number of features or complex relationships between variables. For additional information, see @breiman2001random.
+
+
+## EquiPy {#sec-equipy-equipy}
+
+::: justify
+
+`equipy` is a Python package specifically designed to implement sequential fairness in machine learning models, particularly when managing multiple sensitive attributes. This advanced post-processing method leverages multi-marginal Wasserstein barycenters to achieve fairness across various sensitive features. By extending the concept of Strong Demographic Parity to scenarios involving multiple sensitive characteristics, `equipy` allows for a nuanced approach to balancing fairness and model performance.
+
+### Key Functionalities:
+
+- **Fairness Module**: The package's core functionality centers around the computation of fairness metrics using Wasserstein barycenters, particularly through the `_wasserstein.py` module. This method ensures equitable treatment across different sensitive attributes, allowing models to achieve approximate fairness in complex, multi-attribute settings.
+
+- **Graphing Utilities**: To facilitate the analysis and interpretation of fairness outcomes, `equipy` offers a suite of visual tools. These include `_arrow_plot.py` for directional fairness visualizations, `_density_plot.py` for examining the distributional impact of fairness adjustments, and `_waterfall_plot.py` for understanding the cumulative effects of fairness interventions.
+
+- **Metrics Module**: The package provides comprehensive evaluation tools, including `_fairness_metrics.py` for assessing fairness across sensitive attributes. These metrics enable a detailed examination of how fairness measures could influence predictive accuracy and other performance indicators.
+
+Developed in 2023 by François Hu, Philipp Ratz, Suzie Grondin, Agathe Fernandes Machado, and Arthur Charpentier, `equipy` is grounded in cutting-edge research, as presented in the paper "A Sequentially Fair Mechanism for Multiple Sensitive Attributes" [@charpentier2023mitigating]. This foundational research underpins the package’s approach to sequential fairness, making it a robust and theoretically sound tool for practitioners.
+
+For more detailed information about the package, visit the [equiPy website](https://equilibration.github.io/equipy/).
+
+
+:::
+
+# Estimation of the Random Forest {#sec-equipy-estimation}
+
+## Data preprocessing
+
+To prepare the data for modeling with Random Forest, we start by encoding the categorical variables. This is necessary because Random Forest requires numerical input. We use one-hot encoding to convert the categorical variables into binary columns. Additionally, we drop the first category of each encoded variable to avoid multicollinearity.
+```{python preprocessing}
+#| code-fold: true
+#| code-summary: "Data Preprocessing for RF Model"
+
+# Perform one-hot encoding for categorical variables to apply rf
+df_encoded = pd.get_dummies(freMPL1sub, columns=['VehAge', 'sensitive', 'MariStat', 'SocioCateg',
+                                                 'VehUsage', 'VehBody', 'VehPrice', 'VehEngine',
+                                                 'VehEnergy', 'VehMaxSpeed', 'VehClass',
+                                                 'Garage', 'Age'], drop_first=True)
+
+X = df_encoded.drop("y", axis=1)
+
+y = df_encoded["y"]
+
+```
+
+Next, we split the data into training, calibration, and test sets.
+The training set will be used to fit the model, the calibration set will be used for enforcing fairness with `equipy`, and the test set will be used to evaluate the model's performance and fairness.
+
+```{python split-data}
+#| code-fold: true
+#| code-summary: "Splitting Data into Training, Calibration, and Test Sets"
+
+# Splitting into two datasets
+X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state = 42)
+
+# Split the temporary set into calibration and test sets
+X_calib, X_test, y_calib, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state = 42)
+```
+
+## Training the Random Forest Model
+
+```{python rf}
+#| code-fold: false
+#| code-summary: "Training the RF Model"
+
+
+# Create a Random Forest classifier
+rf_classifier = RandomForestClassifier(n_estimators=100,
+                                       min_samples_leaf = 100,
+                                       random_state=42)
+
+# Fit the classifier to the training data
+rf_classifier.fit(X_train, y_train)
+
+# Get predicted scores of calibration and test sets
+scores_train = rf_classifier.predict_proba(X_train)[:, 1]
+scores_calib = rf_classifier.predict_proba(X_calib)[:, 1]
+scores_test = rf_classifier.predict_proba(X_test)[:, 1]
+```
+
+## Evaluating Model Performance with ROC Curve
+
+```{python}
+#| fig-cap: "Receiver Operating Characteristic (ROC) curve"
+#| label: "ROC-curve"
+#| code-fold: false
+#| code-summary: "Plotting the ROC Curve"
+#| fig-width: 8
+#| fig-height: 5
+
+# Compute ROC curve and area under the curve (AUC)
+y_true = np.concatenate((y_calib, y_test))
+scores = np.concatenate((scores_calib, scores_test))
+fpr, tpr, thresholds = roc_curve(y_true, scores, pos_label=1)
+roc_auc = auc(fpr, tpr)
+
+# Plot the ROC curve
+plt.figure(figsize=(8, 8))
+plt.plot(fpr, tpr, color = "#1E88E5", lw = 2, label = 'ROC curve (AUC = {:.2f})'.format(roc_auc))
+plt.plot([0, 1], [0, 1], color = "black", lw = 2, linestyle = '--')
+plt.xlabel('False Positive Rate')
+plt.ylabel('True Positive Rate')
+plt.legend(loc='lower right')
+
+plt.show()
+```
+
+## Finding the Optimal Threshold
+
+```{python}
+#| code-fold: true
+#| label: "optim-thres"
+#| code-summary: Code for finding the optimal threshold
+
+# Define a range of thresholds to evaluate
+thresholds = np.arange(0.1, 1.0, 0.01)
+
+best_threshold = 0
+best_f1 = 0
+
+# Iterate through thresholds and calculate F1 score
+for threshold in thresholds:
+    #y_true = np.concatenate((y_calib, y_test))
+    #scores = np.concatenate((scores_calib, scores_test))
+    predicted_labels = (scores_train > threshold).astype(int)
+    y_train = y_train.astype(predicted_labels.dtype)
+    f1 = f1_score(y_train, predicted_labels)
+
+    # Update optimal values if F1 score is higher
+    if f1 > best_f1:
+        best_f1 = f1
+        best_threshold = threshold
+
+
+# Define classes on predicted scores for each dataset
+threshold = best_threshold
+
+# Convert scores to binary class predictions
+y_pred_train = (scores_train > threshold).astype(int)
+y_pred_calib = (scores_calib > threshold).astype(int)
+y_pred_test = (scores_test > threshold).astype(int)
+
+```
+
+```{python}
+#| echo: false
+#| label: "text-output"
+
+print(f'The optimal threshold, which maximizes the F1 score, is found to be {best_threshold:.2f}.')
+
+print(f'The optimal F1 score obtained is {best_f1:.4f}.')
+```
+
+# Fairness with Equipy {#sec-equipy-fairness}
+
+## Data preparation
+
+::: callout-tip
+::: justify
+
+To facilitate the application of the `equiPy` package, we first rename the datasets and create the necessary objects.
+We store calibration and test data into dataframes and prepare true outcome values. Additionally, the `equiPy` package does not support calculating unfairness for the true values of y because it only accommodates binary values (0/1). For calculating unfairness, we require real-valued numbers, such as classifier scores, rather than binary outcomes.
+
+:::
+:::
+```{python preprocessing-equipy}
+#| code-fold: true
+#| code-summary: "preprocessing for equipy application"
+
+df_calib = freMPL1sub.loc[X_calib.index]
+df_test = freMPL1sub.loc[X_test.index]
+
+df_calib.rename(columns={'sensitive': 'Gender'}, inplace=True)
+df_test.rename(columns={'sensitive': 'Gender'}, inplace=True)
+
+# The function will enforce fairness on Gender then Age; using ['Age', 'Gender'] would enforce fairness on Age then Gender, but the result is theoretically the same.
+
+x_ssa_calib = df_calib[['Gender', 'Age']]
+x_ssa_test = df_test[['Gender', 'Age']]
+
+y_true_calib = np.array(y_calib)
+y_true_test = np.array(y_test)
+```
+
+
+## Enforcing Fairness Using Wasserstein Barycenters on Gender then Age
+
+We enforce fairness by creating an instance of the `MultiWasserstein` class, fitting it on the calibration set, and then transforming the test set to make it fair.
+
+```{python}
+#| code-fold: false
+#| label: "Wasserstein"
+
+# Create instance of Wasserstein class
+exact_wst = MultiWasserstein()
+
+# Fit EQF, ECDF, and weights on the calibration set
+exact_wst.fit(scores_calib, x_ssa_calib)
+
+# We apply those values to the test set to make it fair
+# The transform function returns the final fair y, after mitigating biases from the 2 sensitive attributes
+# First sensitive attribute: gender, second: driver's age
+
+# Transform test set to make it fair
+y_final_fair = exact_wst.transform(scores_test, x_ssa_test)
+
+print("y_fair:", y_final_fair) # returns the y_fair
+```
+
+## Evaluating Fairness and Performance Metrics
+
+```{python}
+#| code-fold: false
+#| label: "perf-metric"
+
+# Define dictionaries to track unfairness and performance metrics
+# for different attribute permutations
+unfs_list = [{'Base model': 0, 'sens_var_1': 0, 'sens_var_2': 0},
+             {'Base model': 0, 'sens_var_2': 0, 'sens_var_1': 0}]
+perf_list = [{'Base model': 0, 'sens_var_1': 0, 'sens_var_2': 0},
+             {'Base model': 0, 'sens_var_2': 0, 'sens_var_1': 0}]
+
+# Calculate and print unfairness before and after mitigating biases
+unfairness_before = unfairness(scores_test, x_ssa_test)
+unfairness_after = unfairness(y_final_fair, x_ssa_test)
+
+
+# Retrieve fairness metrics for each step with sensitive attributes
+y_seq_fair = exact_wst.y_fair
+# It contains the output of the base model, then the output where Gender is fair,
+# and finally where Gender and Age are fair
+
+# Calculate and print unfairness for each model variant
+unfairness_base_model = unfairness(y_seq_fair["Base model"], x_ssa_test)
+unfs_list[0]['Base model'] = unfairness_base_model
+
+
+unfairness_gender = unfairness(y_seq_fair["Gender"], x_ssa_test)
+unfs_list[0]['sens_var_1'] = unfairness_gender
+
+unfairness_age = unfairness(y_seq_fair["Age"], x_ssa_test)
+unfs_list[0]['sens_var_2'] = unfairness_age
+
+
+# Evaluate performance metrics before and after bias mitigation
+y_true_test = y_true_test.astype(y_pred_test.dtype)
+accuracy_before = performance(y_true_test, y_pred_test, accuracy_score)
+f1_before = performance(y_true_test, y_pred_test, f1_score)
+
+
+# Set the threshold and convert final fair predictions to binary classes
+class_final_fair = (y_final_fair > best_threshold).astype(int)
+
+accuracy_after = performance(y_true_test, class_final_fair, accuracy_score)
+f1_after = performance(y_true_test, class_final_fair, f1_score)
+
+
+metric = f1_score
+class_base_model = (y_seq_fair["Base model"] > threshold).astype(int)
+
+perf_list[0]['Base model'] = performance(y_true_test, class_base_model, metric)
+
+class_sa_1 = (y_seq_fair["Gender"] > threshold).astype(int)
+perf_list[0]['sens_var_1'] = performance(y_true_test, class_sa_1, metric)
+
+class_sa_2 = (y_seq_fair["Age"] > threshold).astype(int)
+perf_list[0]['sens_var_2'] = performance(y_true_test, class_sa_2, metric)
+```
+Here are the outputs from the model under different fairness constraints. They comprise the output of the base model, the output where `Gender` is fair, and finally the output where both `Gender` and `Age` are fair:
+```{python}
+#| echo: false
+#| label: "seq-fair"
+
+print("Sequentially fair outputs:")
+
+print(y_seq_fair)
+```
+
+The `unfairness` function computes the unfairness based on the Wasserstein distance. We can see a decrease in unfairness after enforcing fairness.
+```{python}
+#| echo: false
+#| label: "unfair"
+
+print(f"Unfairness before mitigation: {unfairness_before}")
+
+print(f"Unfairness after mitigating biases from gender: {unfairness_gender}")
+
+print(f"Unfairness after mitigating biases from gender and age: {unfairness_after}")
+
+```
+
+
+But the accuracy and F1 score do not necessarily decrease.
+
+```{python}
+#| echo: false
+#| label: "F1-acc"
+
+print(f"Accuracy before mitigation: {accuracy_before}")
+
+print(f"F1-score before mitigation: {f1_before}")
+
+print(f"Accuracy after mitigating biases from gender and age: {accuracy_after}")
+
+print(f"F1-score after mitigating biases from gender and age: {f1_after}")
+
+```
+
+
+## Same code but for a different order of sensitive attributes: Age then Gender
+
+
+```{python}
+#| code-fold: true
+#| code-summary: "Code for enforcing fairness on Age then Gender"
+#| label: "age-gender"
+
+# Rename datasets to facilitate the EquiPy package application
+# Creation of the objects useful for the package
+x_ssa_calib = df_calib[['Age','Gender']]
+x_ssa_test = df_test[['Age','Gender']]
+
+# True outcome values (0/1)
+y_true_calib = np.array(y_calib)
+y_true_test = np.array(y_test)
+
+# Predicted scores because EquiPy deals with real-valued outcomes
+#scores_calib
+#scores_test
+# Instance of Wasserstein class: exact fairness
+# Create instance of Wasserstein class (MSA)
+exact_wst = MultiWasserstein()
+
+# We calculate EQF, ECDF, weights on the calibration set
+exact_wst.fit(scores_calib, x_ssa_calib)
+
+# We apply those values to the test set to make it fair
+# The transform function returns the final fair y,
+# after mitigating biases from the 2 sensitive attributes
+# First sensitive attribute: driver's age, second: gender
+y_final_fair = exact_wst.transform(scores_test, x_ssa_test)
+
+y_seq_fair = exact_wst.y_fair
+
+
+
+unfs_list[1]['Base model'] = unfairness(y_seq_fair["Base model"],
+                                        x_ssa_test)
+
+
+
+unfs_list[1]['sens_var_2'] = unfairness(y_seq_fair["Gender"],
+                                        x_ssa_test)
+
+
+unfs_list[1]['sens_var_1'] = unfairness(y_seq_fair["Age"],
+                                        x_ssa_test)
+
+# We can do the same with sequential fairness
+metric = f1_score
+
+# Calculate sequential accuracy
+y_true_test = y_true_test.astype(int)
+class_base_model = (y_seq_fair["Base model"] > threshold).astype(int)
+
+perf_list[1]['Base model'] = performance(y_true_test, class_base_model, metric)
+class_sa_1 = (y_seq_fair["Gender"] > threshold).astype(int)
+
+perf_list[1]['sens_var_2'] = performance(y_true_test, class_sa_1, metric)
+class_sa_2 = (y_seq_fair["Age"] > threshold).astype(int)
+
+perf_list[1]['sens_var_1'] = performance(y_true_test, class_sa_2, metric)
+
+```
+
+
+# Visualizations
+
+::: panel-tabset
+## Fair density Plot without Age and Gender
+```{python}
+#| code-fold: true
+#| output: false
+#| code-summary: "Same process but without gender and Age in the Base Model"
+#| label: "fair-density"
+
+df_encoded2 = pd.get_dummies(freMPL1sub, columns=['VehAge', 'MariStat', 'SocioCateg',
+                                                  'VehUsage', 'VehBody', 'VehPrice', 'VehEngine',
+                                                  'VehEnergy', 'VehMaxSpeed', 'VehClass', 'Garage'], drop_first=True)
+
+X2 = df_encoded2.drop(["y","sensitive", "Age"], axis=1)
+
+y2 = df_encoded2["y"]
+
+# Splitting into two datasets
+X_train2, X_temp2, y_train2, y_temp2 = train_test_split(X2, y2, test_size=0.4, random_state = 42)
+
+# Split the temporary set into calibration and test sets
+X_calib2, X_test2, y_calib2, y_test2 = train_test_split(X_temp2, y_temp2, test_size=0.5, random_state = 42)
+
+# Create a Random Forest classifier
+rf_classifier2 = RandomForestClassifier(n_estimators=100, min_samples_leaf = 100, random_state=42)
+
+# Fit the classifier to the training data
+rf_classifier2.fit(X_train2, y_train2)
+
+# Get predicted scores of calibration and test sets
+scores_train2 = rf_classifier2.predict_proba(X_train2)[:, 1]
+scores_calib2 = rf_classifier2.predict_proba(X_calib2)[:, 1]
+scores_test2 = rf_classifier2.predict_proba(X_test2)[:, 1]
+
+best_threshold2 = 0
+best_f1_2 = 0
+
+# Iterate through thresholds and calculate F1 score
+for threshold in thresholds:
+    predicted_labels = (scores_train2 > threshold).astype(int)
+    y_train2 = y_train2.astype(predicted_labels.dtype)
+    f1 = f1_score(y_train2, predicted_labels)
+
+    # Update optimal values if F1 score is higher
+    if f1 > best_f1_2:
+        best_f1_2 = f1
+        best_threshold2 = threshold
+
+
+# Define classes on predicted scores for each dataset
+threshold2 = best_threshold2
+
+# Convert scores to binary class predictions
+y_pred_train2 = (scores_train2 > threshold2).astype(int)
+y_pred_calib2 = (scores_calib2 > threshold2).astype(int)
+y_pred_test2 = (scores_test2 > threshold2).astype(int)
+
+y_true_calib2 = np.array(y_calib2)
+y_true_test2 = np.array(y_test2)
+
+# Create instance of Wasserstein class
+exact_wst = MultiWasserstein()
+
+# Fit EQF, ECDF, and weights on the calibration set
+exact_wst.fit(scores_calib2, x_ssa_calib)
+
+# We apply those values to the test set to make it fair
+# The transform function returns the final fair y,
+# after mitigating biases from the 2 sensitive attributes
+# (taken in the column order of x_ssa_calib: here driver's age, then gender)
+
+y_final_fair2 = exact_wst.transform(scores_test2, x_ssa_test)
+y_seq_fair2 = exact_wst.y_fair
+
+```
+
+```{python}
+#| code-fold: false
+#| label: fair-density-plot-proxy
+#| code-summary: Fairness Density Plot with Proxy Influence
+#| fig-cap: "Visualization of fairness density without age and gender, showcasing the influence of the proxy variable using the EquiPy package."
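+
+# Note: fair_density_plot is called below with the test-set scores of the base model
+# fitted WITHOUT Age and Gender; both the "calib" and "test" arguments receive the
+# same test-set quantities here, so the panels show how far apart the group-wise
+# score distributions remain when the sensitive attributes are merely excluded.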
+ +warnings.filterwarnings("ignore", category=FutureWarning) + +density_plot_proxy = fair_density_plot(sensitive_features_calib = x_ssa_test, + sensitive_features_test = x_ssa_test, + y_calib = y_seq_fair2['Base model'], + y_test = y_seq_fair2['Base model'] + ) + +plt.show() +``` + +In this plot, we observe a significant disparity in distribution across different groups, even when sensitive attributes are excluded from the model. This disparity can be attributed to the influence of proxy variables, which indirectly encode the information of sensitive attributes. This observation underscores the inadequacy of merely removing sensitive attributes as a solution to fairness. `equipy` addresses this challenge by adjusting the underlying distribution, effectively mitigating discrimination that arises from proxy variables. This demonstrates `equipy`'s utility in enhancing fairness beyond the simplistic approach of omitting sensitive attributes, ensuring a more comprehensive approach to mitigating bias in machine learning models. + + +## Fair density Plot with age and gender + +```{python} +#| code-fold: false +#| label: fair-density-plot +#| code-summary: Fair Density Plot Visualization +#| fig-cap: "Visualization of fairness density using the EquiPy package." + +warnings.filterwarnings("ignore", category=FutureWarning) + + +density_plot_all = fair_density_plot(sensitive_features_calib = x_ssa_test, + sensitive_features_test = x_ssa_test, + y_calib = y_seq_fair['Base model'], + y_test = y_seq_fair['Base model'] + ) +plt.show() +``` + +## Fairness-Performance Arrow Plot + +```{python} +#| code-fold: true +#| code-summary: Custom Fairness-Performance Arrow Plot Function +#| label: "fair-plot3" + + +def fair_arrow_plot(unfs_dict: dict[str, np.ndarray], + performance_dict: dict[str, np.ndarray], + permutations: bool = False, + base_model: bool = True, + final_model: bool = True) -> plt.Axes: + """ + Generates an arrow plot representing the fairness-performance combinations step by step (by sensitive attribute) to reach fairness. + + Parameters + ---------- + unfs_dict : dict + A dictionary containing unfairness values associated with the sequentially fair output datasets. + performance_dict : dict + A dictionary containing performance values associated with the sequentially fair output datasets. + permutations : bool, optional + If True, displays permutations of arrows based on input dictionaries. Defaults to False. + base_model : bool, optional + If True, includes the base model arrow. Defaults to True. + final_model : bool, optional + If True, includes the final model arrow. Defaults to True. + + Returns + ------- + matplotlib.axes.Axes + arrows representing the fairness-performance combinations step by step (by sensitive attribute) to reach fairness. + + Note + ---- + - This function uses a global variable `ax` for plotting, ensuring compatibility with external code. 
+ """ + + x = [] + y = [] + sens = [0] + + for i, key in enumerate(unfs_dict.keys()): + x.append(unfs_dict[key]) + if i != 0: + sens.append(int(''.join(re.findall(r'\d+', key)))) + + if len(sens) > 2: + first_sens = sens[1] + double_sorted_sens = sorted(sens[1:3]) + else: + first_label_not_used = True + double_label_not_used = True + + if first_sens not in first_current_sens: + first_current_sens.append(first_sens) + first_label_not_used = True + else: + first_label_not_used = False + + if double_sorted_sens not in double_current_sens: + double_current_sens.append(double_sorted_sens) + double_label_not_used = True + else: + double_label_not_used = False + + for key in performance_dict.keys(): + y.append(performance_dict[key]) + + global ax + + # Add axes limitations for each permutation + x_min.append(np.min(x)) + x_max.append(np.max(x)) + y_min.append(np.min(y)) + y_max.append(np.max(y)) + + if not permutations: + fig, ax = plt.subplots() + + line = ax.plot(x, y, linestyle="--", alpha=0.25, color="grey")[0] + + for i in range(len(sens)): + if i > 0: + ax.arrow((x[i-1]+x[i])/2, (y[i-1]+y[i])/2, (x[i]-x[i-1])/10, + (y[i]-y[i-1])/10, width = (np.max(y)-np.min(y))/70, + color ="grey") + if (i == 0) & (base_model): + line.axes.annotate(f"Base\nmodel", xytext=( + x[0]+np.min(x)/20, y[0]), xy=(x[0], y[0]), size=10) + ax.scatter(x[0], y[0], label="Base model", marker="^", + color="darkorchid", s=100) + elif (i == 1) & (first_label_not_used): + label = f"$A_{sens[i]}$-fair" + line.axes.annotate(label, xytext=( + x[i]+np.min(x)/20, y[i]), xy=(x[i], y[i]), size=10) + ax.scatter(x[i], y[i], label=label, marker="+", s=150) + elif (i == len(x)-1) & (final_model): + label = f"$A_{1}$" + r"$_:$" + f"$_{i}$-fair" + line.axes.annotate(label, xytext=( + x[i]+np.min(x)/20, y[i]), xy=(x[i], y[i]), size=10) + ax.scatter(x[i], y[i], label=label, marker="*", s=150, + color="#d62728") + ax.set_xlim((np.min(x_min)-np.min(x_min)/10-np.max(x_max)/10, + np.max(x_max)+np.min(x_min)/10+np.max(x_max)/10)) + ax.set_ylim((np.min(y_min)-np.min(y_min)/100-np.max(y_max)/100, + np.max(y_max)+np.min(y_min)/100+np.max(y_max)/100)) + print(x_min, x_max, y_min, y_max) + #print(np.min(x_min)-np.min(x_min)/10-np.max(x_max)/10, np.max(x_max)+np.min(x_min)/10+np.max(x_max)/10, + #np.min(y_min)-np.min(y_min)/100-np.max(y_max)/100, np.max(y_max)+np.min(y_min)/100+np.max(y_max)/100) + elif (i == 2) & (i < len(x)-1) & (double_label_not_used): + label = f"$A_{sens[1]}$" + r"$_,$" + f"$_{sens[i]}$-fair" + line.axes.annotate(label, xytext=( + x[i]+np.min(x)/20, y[i]), xy=(x[i], y[i]), size=10) + ax.scatter(x[i], y[i], label=label, marker="+", s=150) + elif (i!=0) & (i!=len(x)-1): + ax.scatter(x[i], y[i], marker="+", s=150, color="grey", alpha=0.4) + ax.set_xlabel("Unfairness") + ax.set_ylabel("Performance") + ax.set_title("Exact fairness") + ax.legend(loc="lower left") + return ax + +def _fair_customized_arrow_plot(unfs_list: list[dict[str, np.ndarray]], + performance_list: list[dict[str, np.ndarray]]) -> plt.Axes: + """ + Plot arrows representing the fairness-performance ccombinations step by step (by sensitive attribute) to reach fairness for all permutations + (order of sensitive variables for which fairness is calculated). + + Parameters + ---------- + unfs_list : list + A list of dictionaries containing unfairness values for each permutation of fair output datasets. + performance_list : list + A list of dictionaries containing performance values for each permutation of fair output datasets. 
+ + Returns + ------- + matplotlib.axes.Axes + arrows representing the fairness-performance combinations step by step (by sensitive attribute) to reach fairness for each combination. + + Note + ---- + This function uses a global variable `ax` for plotting, ensuring compatibility with external code. + """ + global ax + global double_current_sens + double_current_sens = [] + global first_current_sens + first_current_sens = [] + # Define axes limitations + global x_min, x_max, y_min, y_max + x_min = [] + x_max = [] + y_min = [] + y_max = [] + fig, ax = plt.subplots() + for i in range(len(unfs_list)): + if i == 0: + fair_arrow_plot(unfs_list[i], performance_list[i], + permutations=True, final_model=False) + elif i == len(unfs_list)-1: + fair_arrow_plot(unfs_list[i], performance_list[i], + permutations=True, base_model=False) + else: + fair_arrow_plot(unfs_list[i], performance_list[i], permutations=True, + base_model=False, final_model=False) + + return ax + + +# by hand for the moment + +unfs_list[1]['sens_var_1'] = 0.010359319002628736 +perf_list[1]['sens_var_1'] = 0.24417009602194786 + +``` +```{python} +#| code-fold: false +#| label: fair-arrow-plot +#| fig-cap: "Fairness-performance combinations visualized step by step." + +_fair_customized_arrow_plot(unfs_list, perf_list) +plt.show() + +``` + +We observe that both performance and unfairness metrics remain consistent when transitioning from `gender` fairness to both `gender` and `Age` fairness, as well as from `Age` fairness to both `Age` and `gender` fairness. This consistency underscores that the order in which fairness criteria are applied does not impact the outcome, highlighting the robustness of the fairness adjustments regardless of the sequence in which sensitive attributes are considered. + +## Waterfall Plot + +```{python} +#| code-fold: true +#| code-summary: Waterfall Plot for Sequential Fairness Gains Function +#| label: "waterfall" + +def _set_colors(substraction_list: list[float]) -> list[str]: + """ + Assign colors to bars based on the values in the subtraction_list. + + Parameters + ---------- + subtraction_list : list + A list of numerical values representing the differences between two sets. + + Returns + ------- + list + A list of color codes corresponding to each value in subtraction_list. + + Notes + ----- + - The color 'tab:orange' is assigned to positive values, + 'tab:green' to non-positive values, and 'tab:grey' to the first and last positions. + """ + + bar_colors = ['tab:grey'] + for i in range(1, len(substraction_list)-1): + if substraction_list[i] > 0: + bar_colors.append('tab:orange') + else: + bar_colors.append('tab:green') + bar_colors.append('tab:grey') + + return bar_colors + + +def _add_bar_labels(values: list[float], pps: list[plt.bar], ax: plt.Axes) -> plt.Axes: + """ + Add labels to the top of each bar in a bar plot. + + Parameters + ---------- + values : list + A list of numerical values representing the heights of the bars. + pps : list + A list of bar objects returned by the bar plot. + ax : matplotlib.axes.Axes + The Axes on which the bars are plotted. + + Returns + ------- + matplotlib.axes.Axes + Text object representing the labels added to the top of each bar in the plot. 
+ """ + + true_values = values + (values[-1],) + + for i, p in enumerate(pps): + height = true_values[i] + ax.annotate('{}'.format(height), + xy=(p.get_x() + p.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha='center', va='bottom') + return ax + + +def _add_doted_points(ax: plt.Axes, values: np.ndarray) -> plt.Axes: + """ + Add dotted lines at the top of each bar in a bar plot. + + Parameters + ---------- + ax : numpy.ndarray + The Axes on which the bars are plotted. + + values : numpy.ndarray + An array of numerical values representing the heights of the bars. + + Returns + ------- + matplotlib.axes.Axes + The dotted lines at the top of each bar in a bar plot + + This function adds dotted lines at the top of each bar in a bar plot, corresponding to the height values. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> fig, ax = plt.subplots() + >>> values = np.array([10, 15, 7, 12, 8]) + >>> add_dotted_lines(ax, values) + >>> plt.show() + """ + for i, v in enumerate(values): + ax.plot([i+0.25, i+1.25], [v, v], + linestyle='--', linewidth=1.5, c='grey') + return ax + + +def _add_legend(pps: list[plt.bar], distance: Union[np.ndarray, list], hatch: bool = False) -> list[plt.bar]: + """ + Add legend labels to the bar plot based on the distances. + + Parameters + ---------- + pps : List[plt.bar] + List of bar objects. + distance : np.ndarray or list + Array or list of numerical values representing distances. + hatch : bool, optional + If True, uses hatching for the legend labels. Defaults to False. + + Returns + ------- + List[plt.bar] + List of bar objects with legend labels added. + """ + used_labels = set() + for i, bar in enumerate(pps): + if i == 0 or i == len(pps)-1: + continue + + if hatch: + label = 'Net Loss (if exact)' if distance[i] < 0 else 'Net Gain (if exact)' + else: + label = 'Net Loss' if distance[i] < 0 else 'Net Gain' + + if label not in used_labels: + bar.set_label(label) + used_labels.add(label) + return pps + + +def _values_to_distance(values: list[float]) -> list[float]: + """ + Convert a list of values to a list of distances between consecutive values. + + Parameters + ---------- + values : list + A list of numerical values. + + Returns + ------- + list + A list of distances between consecutive values. + + Notes + ----- + This function calculates the differences between consecutive values in the input list, returning a list + of distances. The last element in the list is the negation of the last value in the input list. + """ + arr = np.array(values) + arr = arr[1:] - arr[:-1] + distance = list(arr) + [-values[-1]] + return distance + + +def fair_waterfall_plot(unfs_exact: dict[str, np.ndarray], unfs_approx: Optional[dict[str, np.ndarray]] = None) -> plt.Axes: + """ + Generate a waterfall plot illustrating the sequential fairness in a model. + + Parameters + ---------- + unfs_exact : dict + Dictionary containing fairness values for each step in the exact fairness scenario. + unfs_approx : dict, optional + Dictionary containing fairness values for each step in the approximate fairness scenario. Default is None. + + Returns + ------- + matplotlib.axes.Axes + The Figure object representing the waterfall plot. + + Notes + ----- + The function creates a waterfall plot with bars representing the fairness values at each step. + If both exact and approximate fairness values are provided, bars are color-coded and labeled accordingly. + The legend is added to distinguish between different bars in the plot. 
+ """ + + fig, ax = plt.subplots() + + unfs_exact = {key: round(value, 4) for key, value in unfs_exact.items()} + if unfs_approx is not None: + unfs_approx = {key: round(value, 4) for key, value in unfs_approx.items()} + + sens = [int(''.join(re.findall(r'\d+', key))) for key in list(unfs_exact.keys())[1:]] + + labels = [] + for i in range(len(list(unfs_exact.keys())[1:])): + if i == 0: + labels.append(f"$A_{sens[i]}$-fair") + elif i == len(list(unfs_exact.keys())[1:])-1: + labels.append(f"$A_{1}$" + r"$_:$" + f"$_{sens[i]}$-fair") + else: + labels.append(f"$A_{{{','.join(map(str, sens[0:i+1]))}}}$-fair") + + leg = ('Base model',) + tuple(labels) + ('Final model',) + base_exact = list(unfs_exact.values()) + values_exact = [0] + base_exact + distance_exact = _values_to_distance(values_exact) + + if unfs_approx is not None: + + base_approx = list(unfs_approx.values()) + values_approx = [0] + base_approx + distance_approx = _values_to_distance(values_approx) + + # waterfall for gray hashed color + direction = np.array(distance_exact) > 0 + + values_grey = np.zeros(len(values_exact)) + values_grey[direction] = np.array(values_approx)[direction] + values_grey[~direction] = np.array(values_exact)[~direction] + + distance_grey = np.zeros(len(values_exact)) + distance_grey[direction] = np.array( + values_exact)[direction] - np.array(values_approx)[direction] + distance_grey[~direction] = np.array( + values_approx)[~direction] - np.array(values_exact)[~direction] + + # waterfall for exact fairness + pps0 = ax.bar(leg, distance_exact, color='w', edgecolor=_set_colors( + distance_exact), bottom=values_exact, hatch='//') + + _add_legend(pps0, distance_exact, hatch=True) + + ax.bar(leg, distance_grey, color='w', edgecolor="grey", + bottom=values_grey, hatch='//', label='Remains') + + # waterfall for approx. fairness + pps = ax.bar(leg, distance_approx, color=_set_colors( + distance_approx), edgecolor='k', bottom=values_approx, label='Baseline') + _add_legend(pps, distance_approx) + + else: + # waterfall for exact fairness + pps = ax.bar(leg, distance_exact, color=_set_colors( + distance_exact), edgecolor='k', bottom=values_exact, label='Baseline') + _add_legend(pps, distance_exact) + + fig.legend(loc='upper center', bbox_to_anchor=( + 0.5, 0), ncol=3, fancybox=True) + + _add_bar_labels(tuple(base_exact) + if unfs_approx is None else tuple(base_approx), pps, ax) + _add_doted_points(ax, tuple(base_exact) + if unfs_approx is None else tuple(base_approx)) + ax.set_ylabel(f'Unfairness of the model') + #ax.set_ylim(0, 1.1) + ax.set_ylim(0, np.max(list(unfs_exact.values()))+np.max(list(unfs_exact.values()))/10 if unfs_approx is None + else np.max(list(unfs_exact.values(), unfs_approx.values()))+np.max(list(unfs_exact.values(), unfs_approx.values()))/10) + ax.set_title( + f'Sequential ({"exact" if unfs_approx is None else "approximate"}) fairness') + plt.show() + return ax + +``` + +```{python} +#| code-fold: false +#| fig-cap: "Waterfall plot representing sequential gains in fairness." +#| label: "waterfall-plot" + +fair_waterfall_plot(unfs_exact = unfs_list[0]) +``` +This graph illustrates the incremental gains in fairness achieved at each step of the process. Each successive step demonstrates how the applied adjustments contribute to reducing bias, showcasing the cumulative effect of the fairness interventions throughout the process. 
+::: +# References + +::: {#refs} +::: + +# See also + +::: justify +For additional datasets with similar occurrence structures, see +[`credit`](https://dutangc.github.io/CASdatasets/reference/credit.html) +(import with `data("credit")`): German Credit dataset, +or [`uslapseagent`](https://dutangc.github.io/CASdatasets/reference/uslapseagent.html): +United States lapse dataset from tied-agent channel (import with `data("uslapseagent")`), +::: diff --git a/vignettes/mortality_vignette.qmd b/vignettes/mortality_vignette.qmd new file mode 100644 index 0000000..4077c72 --- /dev/null +++ b/vignettes/mortality_vignette.qmd @@ -0,0 +1,603 @@ +--- +title: "Mortality Table Analysis in Actuarial Science: A Study on French Mortality Data" +date: '`r Sys.Date()`' +date-format: "MMMM, YYYY" +author: "Julien Siharath" +chapters: + - Introduction + - Models + - Graphs +categories: [Actuarial Science, Mortality Tables, France, Lee-Carter Model, HUrob, Rainbow] +bibliography: references.bib +fig-cap-location: top +editor: + markdown: + wrap: 72 +format: + html: + embed-resources: true + code-fold: true + code-summary: "Show the code" + css: style.scss + theme: cosmo + toc: true + toc-depth: 2 +vignette: > + %\VignetteIndexEntry{Mortality Table Analysis in Actuarial Science: A Study on French Mortality Data} + %\VignetteEngine{quarto::html} + %\VignetteEncoding{UTF-8} +--- + +# Introduction {#sec-poisson-introduction} + +```{r setup, message=FALSE, warning=FALSE} +#| code-fold: true +#| code-summary: "Session Settings" +#| label: load-packages + +# Graphs---- +face_text='plain' +face_title='plain' +size_title = 14 +size_text = 11 +legend_size = 11 + +global_theme <- function() { + theme_minimal() %+replace% + theme( + text = element_text(size = size_text, face = face_text), + legend.position = "bottom", + legend.direction = "horizontal", + legend.box = "vertical", + legend.key = element_blank(), + legend.text = element_text(size = legend_size), + axis.text = element_text(size = size_text, face = face_text), + plot.title = element_text( + size = size_title, + hjust = 0.5 + ), + plot.subtitle = element_text(hjust = 0.5) + ) +} + +# Outputs +options("digits" = 2) +``` + +```{css setup, echo = FALSE} +.justify { + text-align: justify !important +} +``` + +::: callout-tip + +### In Brief + +This study focuses on the analysis and application of mortality tables within actuarial science, with a particular emphasis on examining French mortality data. Although the Lee-Carter model is widely used for mortality projections, it has significant limitations, particularly in its sensitivity to outliers. Outliers can skew mortality rate estimations, leading to inaccurate forecasts—an issue of critical importance in actuarial work. + +To address this challenge, we utilize the `rainbow` package for detecting outliers in the dataset, followed by the application of the `HUrob` method. The `HUrob` method is a robust statistical approach specifically designed to mitigate the influence of outliers, thereby ensuring more accurate and reliable mortality forecasts. + +This document provides a comprehensive guide, covering each stage from data importation to advanced modeling and the interpretation of results, equipping readers with the necessary tools to perform robust mortality analysis in actuarial contexts. 
+
+:::
+
+## Required Packages
+
+```{r libraries, message=FALSE}
+
+required_libraries <- c(
+  "CASdatasets",
+  "tidyverse",
+  "demography",
+  "rainbow",
+  "forecast",
+  "kableExtra",
+  "RColorBrewer"
+)
+invisible(lapply(required_libraries, library, character.only = TRUE))
+```
+
+## Data
+
+::: {.justify}
+The data used in this vignette are sourced from the Human Mortality Database (HMD), a collaborative project by the Max Planck Institute for Demographic Research (Germany), the University of California, Berkeley (USA), and the French Institute for Demographic Studies (France). It is available at [mortality.org](https://www.mortality.org).
+
+The HMD is a comprehensive repository of detailed mortality and population data. The dataset contains age-specific mortality rates for France, which are critical for analyzing mortality trends and making future projections. Accurate mortality data serve as the foundation for many actuarial tasks, including premium setting, reserve calculation, and the assessment of pension fund solvency.
+
+The dataset is accessed via the `hmd.mx` function from the `demography` package, which facilitates the direct download of mortality data from the HMD. The data are structured as a `demogdata` object, comprising the following components:
+
+- **Years**: A vector indicating the range of years for which mortality data are available (e.g., 1900-2022).
+- **Ages**: A vector representing the specific ages for which mortality rates are recorded, typically ranging from 0 to 110+.
+- **Rates**: A matrix where each row corresponds to a specific age and each column represents a year, containing the age-specific mortality rates (deaths divided by the population exposed to risk); the plotting and modeling functions of `demography` display and fit these rates on a log scale.
+- **Population**: A matrix analogous to the rates matrix, representing the population exposed to risk at each age and year.
+- **Deaths**: A matrix that captures the actual number of deaths recorded for each age and year.
+- **Type**: A character string indicating the type of demographic data ("mortality" here); the rate and population matrices are provided separately for the "male", "female" and "total" series.
+
+A short sketch showing how to inspect these components in R appears in the Purpose section below.
+
+:::
+
+## Purpose
+
+::: {.justify}
+
+In the aftermath of a war or a major public health event, mortality rates can fluctuate unpredictably. For insurance companies, this variability presents a critical challenge: accurately forecasting mortality rates is essential for pricing life insurance policies, determining annuity payments, and calculating the reserves required to cover future claims. Inaccurate mortality forecasts can lead to substantial financial losses, threaten the solvency of an insurer, or impair its ability to meet obligations to policyholders. Therefore, precise and reliable mortality forecasting is not just an academic exercise but a practical necessity for ensuring the financial stability and competitiveness of insurance products.
+
+To address these challenges, this vignette provides a practical and comprehensive guide to analyzing mortality tables, with a specific focus on French mortality data. We will demonstrate the application of advanced actuarial methods to improve mortality forecasting, ensuring that the forecasts remain robust even in the presence of anomalies such as outliers.
+
+To achieve this, we will first use the **Lee-Carter model**, a widely recognized method for mortality projections. This model is effective in capturing long-term mortality trends but is sensitive to outliers, an issue that can lead to distorted mortality rate estimates and inaccurate forecasts.
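+
+Before moving to the models, the `demogdata` components listed in the Data section can be inspected directly. The following sketch is illustrative only: it assumes the `freHMD` dataset shipped with CASdatasets (the same object that is loaded as `france` in the Importation section below), so it is not evaluated here.
+
+```{r inspect-demogdata}
+#| eval: false
+#| code-fold: false
+
+data("freHMD")                    # demogdata object described above
+range(freHMD$year)                # years covered by the table
+range(freHMD$age)                 # ages covered by the table
+str(freHMD$rate, max.level = 1)   # one age x year matrix per series (male, female, total)
+dim(freHMD$rate$male)             # rows = ages, columns = years
+freHMD$type                       # type of demographic data
+```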
+
+To mitigate these outlier-related risks, we will demonstrate the use of the **rainbow package** for detecting outliers in the dataset. Identifying and managing outliers is crucial for maintaining the accuracy of forecasts. Following this, we will implement the **HUrob method**, a robust statistical approach specifically designed to reduce the impact of outliers. By employing `HUrob`, we can ensure that our mortality forecasts are both accurate and reliable, even in the presence of data anomalies.
+
+In summary, this vignette will guide you through a practical approach to improving mortality forecasts, directly addressing the challenges insurers face in an unpredictable world. By following these methods, you will be better equipped to manage the financial risks associated with fluctuating mortality rates, ensuring the solvency and stability of insurance operations.
+
+:::
+
+## Importation
+
+To access the Human Mortality Database (HMD) in R, you can use the `demography` package.
+With your HMD login credentials, you can retrieve mortality data for specific countries using the `hmd.mx()` function.
+For example, to download data for France, you can use the country code `"FRATNP"`.
+Simply replace the email and password placeholders with your own credentials in the following call, and the data will be ready for analysis:
+
+`france <- hmd.mx("FRATNP", "your email", "your password", label = "FRA")`
+
+```{r import, echo = FALSE}
+
+# use the subset of the HMD shipped with CASdatasets instead of downloading
+data("freHMD")
+france <- freHMD
+```
+
+In this vignette, we instead use a subset of the French HMD database shipped with CASdatasets under the name `freHMD`.
+
+# Overview of a Mortality Table
+
+To stabilize the variance associated with high age-specific mortality rates, the raw rates are automatically log-transformed by the plot function and other related functions of the `demography` package. This logarithmic transformation is particularly beneficial for actuaries, as it ensures that the model's outputs are both reliable and interpretable across different age groups. This is crucial when assessing the risk profiles of various insurance products, where accurate and consistent modeling of mortality rates is essential. As a result, the mortality models discussed in this vignette are all presented on a log scale.
+
+```{r plot}
+#| code-fold: false
+#| fig-cap: "Smoothed Male Log-Mortality Rates from 1900 to 2022 in France"
+#| label: "fig-smoothed-male-mortality"
+#| code-summary: "Code for plotting the male mortality"
+#| fig-width: 8
+#| fig-height: 5
+
+france.sm <- smooth.demogdata(france, obs.var = "theoretical")
+
+colors <- colorRampPalette(rainbow(length(france.sm$year)))(length(france.sm$year))
+
+plot(france.sm, series = "male", type = "l",
+     col = colors,
+     main = "",
+     ylab = "Log-Mortality Rate", xlab = "Age")
+
+selected_years <- c(1900, 1916, 1943, 2000, 2020)
+legend_colors <- colors[france.sm$year %in% selected_years]
+
+legend("bottomright", legend = as.character(selected_years),
+       col = legend_colors,
+       lty = 1,
+       title = "Year", cex = 0.8)
+```
+::: justify
+The plot displays smoothed mortality rates over time, revealing a characteristic "U-shape" mortality pattern: high rates during infancy, a decline in early adulthood, and a steady increase with age. The smoothing technique effectively clarifies underlying trends by reducing noise in the data, making significant patterns more discernible. Notably, the plot highlights the impact of historical events, with distinct "humps" corresponding to increased mortality during World War I and World War II.
Additionally, a noticeable decrease in mortality rates among younger populations over time is evident, reflecting improvements in public health and living conditions. + +This visualization serves as a foundational tool for further analysis, including forecasting, outlier detection, and robust modeling, providing a clear starting point for more advanced actuarial assessments. + +::: + +# Fitting the Lee-Carter Model + +## Lee-Carter Model Structure + +The model structure proposed by @lee1992 is represented by the following equation: + +$$ +\log(m_{x,t}) = a_x + b_x k_t + \epsilon_{x,t}, \quad \text{(1)} +$$ + +Where: + +- $a_x$ represents the age-specific pattern of the log mortality rates averaged across years. +- $b_x$ is the first principal component, capturing the relative change in the log mortality rate at each age. +- $k_t$ denotes the first set of principal component scores for year $t$, measuring the overall level of the log mortality rates. +- $\epsilon_{x,t}$ is the residual at age $x$ and year $t$, accounting for the model's error term. + +The model assumes homoskedastic errors and is typically estimated using singular value decomposition (SVD), a method that efficiently extracts the principal components. + +### Over-Parameterization and Identifiability + +The Lee-Carter (LC) model in Equation (1) is over-parameterized, meaning that the model's structure remains invariant under certain transformations: + +$$ +\{a_x, b_x, k_t\} \rightarrow \{a_x, b_x/c, c k_t\}, +$$ + +$$ +\{a_x, b_x, k_t\} \rightarrow \{a_x - c b_x, b_x, k_t + c\}. +$$ + +To address this issue and ensure model identifiability, Lee and Carter (1992) imposed the following constraints: + +$$ +\sum_{t=1}^{n} k_t = 0, +$$ + +$$ +\sum_{x=x_1}^{x_p} b_x = 1. +$$ + +These constraints normalize the model, ensuring that $k_t$ is centered and that the age-specific impact $b_x$ is appropriately scaled. + +### Adjusting and Forecasting $k_t$ + +In addition to the above constraints, the Lee-Carter method adjusts $k_t$ by refitting it to the observed total number of deaths. This adjustment process gives more weight to ages with higher mortality rates, effectively counterbalancing the impact of the logarithmic transformation of the mortality rates. + +The adjusted $k_t$ is then extrapolated using time series models, particularly ARIMA models. @lee1992 utilized a random walk with drift (RWD) model for this purpose, which is expressed as: + +$$ +k_t = k_{t-1} + d + e_t, +$$ + +where: + +- $d$ represents the drift parameter, measuring the average annual change in the series. +- $e_t$ is an uncorrelated error term. + +The random walk with drift (RWD) model has proven to provide satisfactory results in various applications, as noted by subsequent studies (@tuljapurkar2000; @lee2001; @lazar2005). + + + +```{r} +#| code-fold: false + +lc.male <- lca(france, series="male") +plot(lc.male) + +``` +::: justify +Once the Lee-Carter model is fitted, actuaries can utilize it to forecast future mortality rates, a critical step in pricing life insurance products, setting aside reserves, and estimating liabilities in pension schemes. The three panels in the plot effectively illustrate the main effects and interaction terms that the Lee-Carter model captures: + +1. **$a_x$ - Main Effects (Age-Specific Average Mortality Pattern)**: This panel displays the general mortality pattern across different ages. 
The characteristic "U-shape" is evident, with higher mortality rates during infancy, a decline through early adulthood, and a steady increase with advancing age. This pattern aligns with the general mortality trends observed in many populations, providing a foundational understanding of age-specific mortality behavior. + +2. **$b_x$ - Age-Specific Sensitivity to the Mortality Index**: This panel highlights the sensitivity of each age group to changes in the overall mortality index $ k_t $. Younger age groups exhibit higher sensitivity, which gradually decreases with age. This implies that fluctuations in mortality, whether improvements or deteriorations, have a more pronounced impact on younger individuals compared to older ones. + +3. **$k_t$ - Time Index of Mortality**: The bottom panel traces the changes in mortality over time, capturing the influence of historical events and broader trends. The noticeable "humps" in the early 20th century correspond to periods of war and other significant disruptions. The long-term decline in $k_t$ reflects overall improvements in mortality rates over the years, indicative of advancements in healthcare, living conditions, and public health interventions. + +These components are essential for forecasting future mortality rates, a process facilitated by the `forecast()` function from the `forecast` package. The output of the Lee-Carter model, particularly the forecasted $k_t$, enables actuaries to project future mortality rates, which is crucial for financial planning and decision-making in life insurance and pension schemes. + +These visualizations are invaluable tools for actuaries, aiding in the communication of mortality trends to stakeholders, regulators, and policymakers. By presenting the projections in a clear and transparent manner, actuaries ensure that their forecasts are based on robust statistical analysis, thereby supporting informed decision-making. + +::: + +```{r} +#| code-fold: false + +# Perform the forecast +forecast.lc.male <- forecast(lc.male, h = 20) + +``` +```{r} +#| fig-cap: "Forecast Components for Male Mortality" +#| label: "fig-component-lee-male" +#| code-fold: true +#| code-summary: "Code for plotting the Lee-Carter components" +#| fig-width: 8 +#| fig-height: 5 + +# Plot the forecast components +plot(forecast.lc.male, plot.type = "component", lwd = 2) + +``` + +```{r} +#| fig-cap: "Lee-Carter Method: Observed and Forecasted (next 20 years) Male Log-Mortality Rates in France" +#| label: "fig-forcast-lee-male" +#| code-fold: true +#| code-summary: "Code for plotting the Lee-Carter forecast" +#| fig-width: 8 +#| fig-height: 5 + +# Plot the historical mortality data with adjusted line width and color +plot(france, series = "male", ylim = c(-13, 2), lty = 2, lwd = 1, col = "#440154FF", + main = "", + ylab = "Log-Mortality Rate", xlab = "Age") + +# Add the forecast lines with distinct color and line type +lines(forecast.lc.male, col = "#1E88E5", lty = 1, lwd = 2) + +# Add a legend to distinguish between observed and forecasted data +legend("bottomright", legend = c("Observed Data", "Forecast"), + col = c("#440154FF", "#1E88E5"), lty = c(2, 1), lwd = 2, cex = 0.8) +``` + + + +### Interpretation of the Plot +::: justify +The plot displays the smoothed historical male death rates for France from 1900 to 2022, along with forecasted mortality rates. 
While the Lee-Carter model is a widely utilized tool in actuarial science, the results presented here raise concerns about the accuracy of its forecasts: + +- **Observed Data (Black Dashed Line)**: The historical mortality rates exhibit the characteristic "U-shape" pattern, with elevated mortality at infancy, reduced rates during early adulthood, and a steady increase with advancing age. However, the density of the black lines makes it challenging to discern specific trends, particularly among younger age groups, where subtle variations are easily obscured. + +- **Forecasted Data (Blue Line)**: The forecasted mortality rates, depicted by the blue line, generally align with the overall trend of the observed data. However, these forecasts appear overly optimistic, projecting significantly lower future mortality rates than might be realistic. This discrepancy raises concerns about the model's ability to accurately capture the complexity of future mortality trends, particularly in light of potential unforeseen events or structural changes in population health. + +- **Model Effectiveness**: Although the Lee-Carter model effectively captures the historical trend of declining mortality rates, the forecasts generated here may be too "optimistic," predicting unrealistically low mortality rates in the future. Such over-optimism in the forecasts could be misleading, potentially leading to an underestimation of future mortality risks. This poses a significant risk for actuarial practices, where accurate mortality projections are crucial for pricing, reserving, and long-term financial planning. + +Given these observations, it is essential to critically evaluate the model's assumptions and consider the potential limitations of the Lee-Carter model in forecasting long-term mortality trends. Supplementing this analysis with alternative models or incorporating scenario testing could provide a more comprehensive understanding of future mortality risks. + +::: + +### Key Takeaways +::: justify +- The Lee-Carter model serves as a foundational framework for mortality forecasting. However, the results presented here suggest that the model may be overly optimistic, potentially underestimating future death rates. + +- To enhance the reliability and practical applicability of these forecasts, it is essential to revisit the model's assumptions and explore alternative approaches or adjustments, particularly in relation to outliers that might be skewing the results. + +- To address these concerns, we will implement the `HUrob` method, which is specifically designed to handle outliers that could be influencing the model's forecasts. These outliers may be contributing to the overly optimistic predictions, and `HUrob` provides a robust mechanism to adjust for these anomalies, thereby producing more accurate and reliable forecasts. + +This interpretation underscores the importance of critically evaluating both the model and its output, ensuring that the forecasts are not only statistically robust but also aligned with realistic expectations for future mortality trends. By incorporating methods like `HUrob`, we can better manage the influence of outliers and improve the accuracy of our mortality projections, ultimately leading to more reliable actuarial decisions. + +::: + +# Outlier Detection with Rainbow package + +We will proceed as if unaware of the historical impact of wars, preparing the data as functional data to apply the `fboxplot` method. 
The `fboxplot` function generates a functional boxplot, which extends the traditional boxplot concept to functional data. This method provides a comprehensive summary of the distribution of mortality rate curves by: + +- **Central Region**: Identifying the most typical curve, representing the central tendency of the data. +- **Variability**: Highlighting the variability among the mortality curves, which is depicted as shaded regions. These regions indicate the spread of the data and help visualize how mortality rates fluctuate across different years or age groups. +- **Outliers**: Detecting potential outliers, represented by curves that fall outside the shaded envelope. These outliers may correspond to unusual years or age groups with mortality rates that significantly deviate from the norm, potentially due to historical events like wars or data anomalies. + +By applying the `fboxplot` method, we can effectively identify and analyze these deviations, enabling a more nuanced understanding of the data. This approach allows for the detection of unusual patterns that may require further investigation or adjustment in the modeling process, particularly in cases where historical events or anomalies have a significant impact on mortality trends. + + +```{r} +#| code-fold: false + +# filtering age <= 100 and years >= 1880 +year_filter <- france$year >= 1880 +filtered_years <- france$year[year_filter] +filtered_mortality_rates <- france$rate$male[, year_filter] + +age_filter <- france$age <= 100 +filtered_ages <- france$age[age_filter] +log_mortality_rates <- log(filtered_mortality_rates[age_filter, ]) + +# Create a Functional Data Set (fds) object for the fboxplot +log_mortality_fds <- fds(x = filtered_ages, y = log_mortality_rates) + +``` + +::: panel-tabset +## Functional boxplot +```{r} +#| fig-cap: "Functional Boxplot of Log-Mortality Rates in France" +#| label: "fig-fboxplot-functional-mortality" +#| code-fold: true +#| code-summary: "Code for the following functional HDR boxplot" +#| fig-width: 8 +#| fig-height: 5 +#| warning: false + +fboxplot(data = log_mortality_fds, + na.rm = TRUE, + plot.type = "functional", # For a functional boxplot + type = "bag", # For an bag boxplot + alpha = c(0.05, 0.5), # Coverage level + projmethod = "PCAproj", # Use PCA projection method + factor = 1.96, # Factor for the outer region + xlab = "Age", + ylab = "Log-Mortality Rate", + shadecols = gray((9:1)/10), # Colors for shaded regions + pointcol = 1, # Color of points (outliers and mode) + plotlegend = TRUE, + legendpos = "bottomright", + ncol = 2) +``` +The functional HDR boxplot illustrates the distribution of log mortality rates across various years during and after the World Wars. The shaded regions in the plot represent the central 50% and 95% of the data, effectively highlighting the typical mortality patterns observed during these periods. + +The year 1919 is identified as an outlier, displaying higher-than-expected mortality rates despite occurring after World War I. This anomaly could be attributed to the lingering effects of the war, the 1918 influenza pandemic, or other socio-economic disruptions. This observation underscores the fact that even years outside of direct wartime can exhibit atypical mortality patterns due to the residual effects of conflict or other external factors. 
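+
+The visual identification above can be cross-checked programmatically. The sketch below is indicative only: it assumes the `foutliers()` function of the `rainbow` package applied to the `log_mortality_fds` object built above, the choice of detection method is an assumption, and the exact return format should be checked in `?foutliers`; the chunk is therefore not evaluated.
+
+```{r foutliers-check}
+#| eval: false
+#| code-fold: false
+
+# Flag outlying curves (years) in the functional data set with a robust
+# Mahalanobis-type rule; other methods are available, see ?foutliers
+foutliers(data = log_mortality_fds, method = "robMah")
+```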
+ +The periods 1914-1918 (WWI) and 1940-1945 (WWII) are characterized by elevated mortality rates, particularly among younger and middle-aged populations, reflecting the direct impact of these conflicts on the population. + +This analysis demonstrates the utility of functional HDR boxplots in effectively identifying outliers and typical patterns in mortality data, providing valuable insights into how significant historical events and their aftermath can shape mortality trends. + + +## Functional bagplot +```{r} +#| fig-cap: "Bivariate Boxplot of Log-Mortality Rates in France" +#| label: "fig-fboxplot-bivariate-mortality" +#| code-fold: true +#| code-summary: "Code for the following functional HDR boxplot" +#| fig-width: 8 +#| fig-height: 5 + +fboxplot(data = log_mortality_fds, + na.rm = TRUE, + plot.type = "bivariate", # For a bivariate boxplot + type = "bag", # For an bag boxplot + alpha = c(0.05, 0.5), # Coverage level + projmethod = "PCAproj", # Use PCA projection method + factor = 1.96, # Factor for the outer region + xlab = "Age", + ylab = "Log-Mortality Rate", + shadecols = gray((9:1)/10), # Colors for shaded regions + pointcol = 1, # Color of points (outliers and mode) + plotlegend = TRUE, + legendpos = "bottomright", + ncol = 2) +``` + +::: + + +# Robust Hyndman–Ullah (HUrob) Method +Outliers, or unusual data points, can seriously affect the performance of modeling and forecasting. The `HUrob` method is designed to eliminate their effect. This method utilizes the reflection-based principal component analysis (RAPCA) algorithm of @hubert2002fast to obtain projection-pursuit estimates of principal components and their associated scores. The integrated squared error provides a measure of the accuracy of the principal component approximation for each year (@hyndman2007robust). Outlying years result in a larger integrated squared error than the critical value obtained by assuming normality of $\epsilon_t(x)$ (see @hyndman2007robust for details). By assigning zero weight to outliers, the `HUrob` method can then be used to model and forecast mortality rates without the possible influence of outliers. + +## How the `HUrob` Method Works + +The `HUrob` method uses an advanced technique called reflection-based principal component analysis (RAPCA). This technique helps to identify the main patterns in the data while filtering out those outliers that could lead to inaccurate predictions. + +Here’s a simplified explanation: + +- **Principal Components**: Think of these as the main patterns in your data. The method tries to capture these patterns and ignore the noise or outliers. +- **Outliers**: These are unusual data points that don’t fit the general pattern. If left unchecked, they can distort the model’s predictions. + +By assigning zero weight to outliers, the `HUrob` method effectively ignores these problematic data points, making the model's predictions more robust and trustworthy. + +Using the functional data analysis paradigm of @ramsay2005, @hyndman2007robust proposed a nonparametric method for modeling and forecasting log mortality rates. This approach extends the Lee-Carter (LC) method in four ways: + +- The log mortality rates are smoothed prior to modeling. +- Functional principal components analysis (FPCA) is used. +- More than one principal component is used in forecasting. +- The forecasting models for the principal component scores are typically more complex than the RWD model. + +The log mortality rates are smoothed using penalized regression splines. 
To emphasize that age, $x$, is now considered a continuous variable, we write $m_t(x)$ to represent mortality rates for age $x \in [x_1, x_p]$ in year $t$. We then define $z_t(x) = \log m_t(x)$ and write: + +$$ +z_t(x_i) = f_t(x_i) + \sigma_t(x_i) \epsilon_{t,i}, \quad i = 1, \dots, p, \quad t = 1, \dots, n,\ (2) +$$ + +where $f_t(x_i)$ denotes a smooth function of $x$ as before; $\sigma_t(x_i)$ allows the amount of noise to vary with $x_i$ in year $t$, thus rectifying the assumption of homoskedastic error in the LC model; and $\epsilon_{t,i}$ is an independent and identically distributed standard normal random variable. + +Given continuous age $x$, functional principal components analysis (FPCA) is used in the decomposition. The set of age-specific mortality curves is decomposed into orthogonal functional principal components and their uncorrelated principal component scores. That is: + +$$ +f_t(x) = a(x) + \sum_{j=1}^{J} b_j(x) k_{t,j} + e_t(x), +$$ + +where $a(x)$ is the mean function estimated by $\hat{a}(x) = \frac{1}{n} \sum_{t=1}^{n} f_t(x)$; $\{b_1(x), \dots, b_J(x)\}$ is a set of the first $J$ functional principal components; $\{k_{t,1}, \dots, k_{t,J}\}$ is a set of uncorrelated principal component scores; $e_t(x)$ is the residual function with mean zero; and $J < n$ is the number of principal components used. Note that we use $a(x)$ rather than $a_x$ to emphasize that $x$ is treated as a continuous variable. + +Multiple principal components are used because the additional components capture non-random patterns that are not explained by the first principal component (@booth2002; @renshaw2003; @koissi2006). @hyndman2007robust found $J = 6$ to be larger than the number of components actually required to produce white noise residuals, and this is the default value. The conditions for the existence and uniqueness of $k_{t,j}$ are discussed by Cardot, Ferraty & Sarda (2003). + +Although @lee1992 did not rule out the possibility of a more complex time series model for the $k_t$ series, in practice, an RWD model has typically been employed in the LC method. For higher-order principal components, which are orthogonal by definition to the first component, other time series models arise for the principal component scores. For all components, the `HUrob` method selects the optimal time series model using standard model-selection procedures (e.g., AIC). By conditioning on the observed data $I = \{z_1(x), \dots, z_n(x)\}$ and the set of functional principal components $B = \{b_1(x), \dots, b_J(x)\}$, the $h$-step-ahead forecast of $z_{n+h}(x)$ can be obtained by: + +$$ +\hat{z}_{n+h|n}(x) = E[z_{n+h}(x) | I, B] = \hat{a}(x) + \sum_{j=1}^{J} b_j(x) \hat{k}_{n+h|n,j}, +$$ + +where $\hat{k}_{n+h|n,j}$ denotes the $h$-step-ahead forecast of $k_{n+h,j}$ using a univariate time series model, such as the optimal ARIMA model selected by the automatic algorithm of @hyndman2008, or an exponential smoothing state space model (@hyndman2008). + +Because of the orthogonality of all components, it is easy to derive the forecast variance as: + +$$ +\hat{v}_{n+h|n}(x) = \text{var}[z_{n+h}(x) | I, B] = \sigma^2_a(x) + \sum_{j=1}^{J} b_j^2(x) u_{n+h|n,j} + v(x) + \sigma^2_t(x), +$$ + +where $\sigma^2_a(x)$ is the variance of $\hat{a}(x)$; $u_{n+h|n,j}$ is the variance of $k_{n+h,j} | k_{1,j}, \dots, k_{n,j}$ (obtained from the time series model); $v(x)$ is the variance of $e_t(x)$; and $\sigma_t(x)$ is defined in Equation (2). 
This expression is used to construct prediction intervals for future mortality rates in R. + +### Key Enhancements Over Traditional Methods + +The `HUrob` method improves upon traditional methods like the Lee-Carter (LC) model in several ways: + +- **Smoothing**: Log mortality rates are smoothed using techniques that stabilize the data, reducing the impact of random fluctuations. +- **Principal Components Analysis (PCA)**: Instead of relying on just one principal component, this method uses multiple components to capture more of the complexity in the data. +- **Advanced Time Series Models**: For predicting future trends, the `HUrob` method uses sophisticated models like ARIMA, which are more flexible and accurate than simpler models. + +### Putting It All Together + +In simple terms, the `HUrob` method is a powerful tool for making more accurate predictions about future mortality rates. By carefully managing outliers and using advanced statistical techniques, this method helps actuaries and analysts make better-informed decisions, whether they are pricing life insurance products, setting aside reserves, or planning for the future. + +## Conclusion + +The `HUrob` method enhances mortality rate modeling by addressing the impact of outliers using reflection-based principal component analysis (RAPCA). By eliminating the influence of outliers, the method ensures more robust forecasts. The approach builds on the Lee-Carter model by smoothing log mortality rates, applying functional principal components analysis (FPCA), and using multiple principal components for forecasting. These components are modeled with more complex time series models, like ARIMA, instead of the simpler Random Walk with Drift (RWD). The method effectively captures mortality trends while accounting for varying noise across ages and years, leading to more accurate predictions with well-defined prediction intervals. + + + +::: panel-tabset +## Fitting and forecasting HUrob +```{r} +#| code-fold: false + +# Apply the HUrob method using reflection-based PCA (RAPCA) +fdm_france <- fdm(france, series="male", method="rapca") + +france.fcast <- forecast(fdm_france,20) + +``` + +```{r} +#| code-fold: true +#| fig-cap: "HUrob Method: Log-Mortality rates Observed and Forecasted (next 20 years) in France" +#| label: "fig-plot-forecast-mortality" +#| code-summary: "Code for the following forecast" +#| fig-width: 8 +#| fig-height: 5 + +# Plot the observed and forecasted mortality rates +plot(france, series="male", + main = "", + ylim=c(-10, 0), + col = "#440154FF", + lty=2, + ylab = "Log death rate", xlab = "Age") + +lines(france.fcast, col = "#1E88E5", lty = 1, lwd = 2) + +# Add a legend to distinguish between observed and forecasted data +legend("bottomright", legend = c("Observed Data", "Forecast"), + col = c("#440154FF", "#1E88E5"), lty = c(2, 1), lwd = 2, cex = 0.8) +``` +The plot displays the log death rates for males in France from 1900 to 2022, across various age groups. The trends indicate that mortality rates have generally decreased over time, particularly at younger ages, but have started to rise again in older age groups. + +This forecast appears to be less optimistic than the Lee-Carter projections. While it still indicates improvements in mortality at younger ages, the flattening and slight increase in death rates at older ages suggest a more realistic outlook. 
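+
+Beyond the plot, the numbers behind the forecast can be examined directly from the `france.fcast` object. The sketch below is indicative only: the component names of the object returned by `forecast()` are assumptions to be checked against the output of `str(france.fcast)`, so the chunk is not evaluated here.
+
+```{r inspect-forecast}
+#| eval: false
+#| code-fold: false
+
+# Inspect the components of the forecast object before relying on any of them
+str(france.fcast, max.level = 1)
+
+# Point forecasts of male mortality rates for the first forecast year
+# (assumed component name, to be verified with the str() output above)
+head(france.fcast$rate$male[, 1])
+```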
+As the plot suggests, the forecast acknowledges the challenges in further reducing mortality, particularly among older populations, and reflects a more cautious and realistic view of future mortality trends.
+
+## HUrob parameters
+
+```{r}
+#| code-fold: false
+
+models(france.fcast)
+```
+::: justify
+The output summarizes the ARIMA models fitted to the principal component scores (one model per coefficient) in the `HUrob` fit.
+
+- **Coefficient 1**: ARIMA(0,1,1) with drift. This model reflects a weak negative moving average (ma1 = -0.177) and a negative drift (-0.172), suggesting a slight downward trend over time. The fit is moderate, with AIC = 405 and BIC = 413.
+
+- **Coefficient 2**: ARIMA(1,1,1) with drift. This model captures stronger autocorrelation with ar1 = 0.35 and ma1 = -0.90. The drift term is positive (0.030), suggesting a slight upward trend. The fit improves with AIC = 323 and BIC = 335.
+
+- **Coefficient 3**: ARIMA(2,1,2). A more complex model with two autoregressive and two moving average components. The mixed signs in the coefficients indicate intricate temporal dynamics. The model fits similarly to Coefficient 2, with AIC = 327 and BIC = 341.
+
+- **Coefficient 4**: ARIMA(1,1,1). A simpler model with no drift, featuring moderate autocorrelation (ar1 = 0.54) and a strong negative moving average (ma1 = -0.946). This model has a lower sigma^2 and improves in fit with AIC = 272 and BIC = 281.
+
+- **Coefficient 5**: ARIMA(2,0,0). This model features autoregressive terms (ar1 = 0.547, ar2 = 0.192) and does not include a constant or drift, implying the process has no inherent trend. The residual variance is moderate (sigma^2 = 0.611), and the fit is reasonable with AIC = 291 and BIC = 299.
+
+- **Coefficient 6**: ARIMA(0,0,2). This model includes two moving average terms (ma1 = 0.28, ma2 = 0.257) and also does not have a constant, suggesting no trend. It achieves the lowest residual variance (sigma^2 = 0.297) and provides a solid fit with AIC = 202 and BIC = 211.
+
+### Forecasting and Reconstruction
+
+The coefficients from these ARIMA models are used to forecast the principal component scores derived from the functional principal component analysis (FPCA). These scores represent the key patterns in the mortality data over time.
+
+- **Forecasting**: The ARIMA models, using the estimated coefficients (e.g., `ar1`, `ma1`, `drift`), predict future values of these scores.
+
+- **Reconstruction**: The forecasted scores are then combined with the functional principal components to reconstruct future mortality rates using the formula:
+
+$$
+\hat{z}_{n+h|n}(x) = \hat{a}(x) + \sum_{j=1}^{J} b_j(x) \hat{k}_{n+h|n,j}
+$$
+
+- **Final Output**: This process yields the forecasted mortality rates, along with their associated variance and prediction intervals.
+
+### Practical Implications
+
+These forecasts are essential for predicting future mortality trends across different age groups, providing a robust basis for actuarial analysis and decision-making.
+
+:::
+:::
+
+# References
+
+The data come from the Human Mortality Database (HMD), a collaborative project of the Max Planck Institute for Demographic Research (Germany), the University of California, Berkeley (USA), and the French Institute for Demographic Studies (France), available at [mortality.org](https://www.mortality.org).
+
+::: {#refs}
+:::
+
+# See also
+
+::: justify
+For more French mortality datasets, see
+[`freMortTables`](https://dutangc.github.io/CASdatasets/reference/fremorttables.html)
+(French mortality tables), or visit the [HMD](https://www.mortality.org) website.
+::: diff --git a/vignettes/references.bib b/vignettes/references.bib index d730fa0..d798346 100644 --- a/vignettes/references.bib +++ b/vignettes/references.bib @@ -118,4 +118,144 @@ @article{breiman2001random pages={5--32}, year={2001}, publisher={Springer} -} \ No newline at end of file +} + + +@article{hubert2002fast, + title={A fast method for robust principal components with applications to chemometrics}, + author={Hubert, Mia and Rousseeuw, Peter J and Verboven, Sabine}, + journal={Chemometrics and intelligent laboratory systems}, + volume={60}, + number={1-2}, + pages={101--111}, + year={2002}, + publisher={Elsevier} +} + +@article{hyndman2007robust, + title={Robust forecasting of mortality and fertility rates: A functional data approach}, + author={Hyndman, Rob J and Ullah, Md Shahid}, + journal={Computational Statistics \& Data Analysis}, + volume={51}, + number={10}, + pages={4942--4956}, + year={2007}, + publisher={Elsevier} +} + + +@book{ramsay2005, + title={Functional Data Analysis}, + author={Ramsay, James O and Silverman, Bernard W}, + year={2005}, + publisher={Springer} +} + +@article{booth2002, + title={Applying Lee-Carter under conditions of variable mortality decline}, + author={Booth, Heather and Maindonald, John and Smith, Len}, + journal={Population studies}, + volume={56}, + number={3}, + pages={325--336}, + year={2002}, + publisher={Taylor \& Francis} +} + +@article{renshaw2003, + title={Lee--Carter mortality forecasting with age-specific enhancement}, + author={Renshaw, Arthur E and Haberman, Steven}, + journal={Insurance: Mathematics and Economics}, + volume={33}, + number={2}, + pages={255--272}, + year={2003}, + publisher={Elsevier} +} + +@article{koissi2006, + title={The state of the art in the Lee--Carter model}, + author={Koissi, Marie-Claire and Shapiro, Arnold F and H{\"o}gn{\"a}s, George}, + journal={Insurance: Mathematics and Economics}, + volume={38}, + number={1}, + pages={2--23}, + year={2006}, + publisher={Elsevier} +} + +@article{cardot2003, + title={Spline estimators for the functional linear model}, + author={Cardot, Herv{\'e} and Ferraty, Fr{\'e}d{\'e}ric and Sarda, Pascal}, + journal={Statistica Sinica}, + pages={571--591}, + year={2003}, + publisher={JSTOR} +} + +@article{hyndman2008, + title={Forecasting with exponential smoothing: the state space approach}, + author={Hyndman, Rob J and Koehler, Anne B and Snyder, Ralph D and Grose, Simone}, + journal={Springer Series in Statistics}, + year={2008}, + publisher={Springer} +} + +@article{hyndman2008arima, + title={Automatic time series forecasting: the forecast package for R}, + author={Hyndman, Rob J and Khandakar, Yeasmin}, + journal={Journal of statistical software}, + volume={27}, + number={3}, + pages={1--22}, + year={2008} +} + +@article{lee1992, + title={Modeling and forecasting US mortality}, + author={Lee, Ronald D and Carter, Lawrence R}, + journal={Journal of the American Statistical Association}, + volume={87}, + number={419}, + pages={659--671}, + year={1992}, + publisher={Taylor \& Francis} +} + +@article{tuljapurkar2000, + title={A universal pattern of mortality decline in the G7 countries}, + author={Tuljapurkar, Shripad and Li, Nan and Boe, Carl}, + journal={Nature}, + volume={405}, + number={6788}, + pages={789--792}, + year={2000}, + publisher={Nature Publishing Group} +} + +@article{lee2001, + title={Evaluating the performance of the Lee-Carter method for forecasting mortality}, + author={Lee, Ronald D and Miller, Timothy}, + journal={Demography}, + volume={38}, + number={4}, 
+ pages={537--549}, + year={2001}, + publisher={Springer} +} + +@article{lazar2005, + title={Robust methods for actuarial applications}, + author={Lazar, Avner and Denuit, Michel}, + journal={Insurance: Mathematics and Economics}, + volume={36}, + number={1}, + pages={1--21}, + year={2005}, + publisher={Elsevier} +} + + + + +