econ-ark · llorracc · Jan 20, 2021 · Jan 15, 2021 · Jan 18, 2021 · Jan 18, 2021
diff --git a/HARK/datasets/SCF/WealthIncomeDist/README.md b/HARK/datasets/SCF/WealthIncomeDist/README.md
@@ -0,0 +1,8 @@
+# Summary statistics of wealth and permanent income in the United States
+
+# TODO
+
+The summary statistics in `WealthIncomeStats.csv` are computed from the
+Survey of Consumer Finances (SCF). The file can be replicated directly from the
+unprocessed SCF summary files using the repository [SCF-IncWealthDist](https://github.com/Mv77/SCF-IncWealthDist),
+created by [Mateo Velasquez-Giraldo](https://mv77.github.io/).
diff --git a/HARK/datasets/SCF/WealthIncomeDist/WealthIncomeStats.csv b/HARK/datasets/SCF/WealthIncomeDist/WealthIncomeStats.csv
diff --git a/HARK/datasets/SCF/WealthIncomeDist/__init__.py b/HARK/datasets/SCF/WealthIncomeDist/__init__.py
diff --git a/HARK/datasets/SCF/WealthIncomeDist/parser.py b/HARK/datasets/SCF/WealthIncomeDist/parser.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jan  8 15:36:14 2021
+
+@author: Mateo
+"""
+
+import numpy as np
+import pandas as pd
+from warnings import warn
+import os
+
+scf_sumstats_dir = os.path.dirname(os.path.abspath(__file__))
+
+
+def get_scf_distr_stats():
+    """
+    
+    """
+
+    filename = os.path.join(scf_sumstats_dir, "WealthIncomeStats.csv")
+
+    # Read csv
+    table = pd.read_csv(filename, sep=",")
+
+    return table
+
+
+def parse_scf_distr_stats(
+    age = None, education = None, year = None
+):
+
+    # Pre-process year to make it a five-year bracket as in the table
+    if age is not None:
+
+        u_bound = int(np.ceil(age/5) * 5)
+        l_bound = u_bound - 5
+        age_bracket = '(' + str(l_bound) + ',' + str(u_bound) + ']'
+
+    else:
+
+        # If no year is given, use all years.
+        age_bracket = 'All'
+
+    # Check whether education is in one of the allowed categories
+    if education is not None:
+
+        message = ("If an education level is provided, it must be one of " +
+                   "'NoHS', 'HS', or 'College'.")
+        assert education in ['NoHS','HS','College'], message
+
+    else:
+
+        education = 'All'
+
+    # Parse the year
+    year_str = 'All' if year is None else str(int(year))
+
+    # Read table
+    filename = os.path.join(scf_sumstats_dir, "WealthIncomeStats.csv")
+
+    # Read csv
+    table = pd.read_csv(filename, sep=",",
+                        index_col = ['Educ','YEAR','Age_grp'],
+                        dtype = {'Educ': str,'YEAR': str,'Age_grp': str})
+
+    # Try to access the requested combination
+    try:
+
+        row = table.loc[(education, year_str, age_bracket)]
+
+    except KeyError as e:
+
+        message = ("The summary statistics do not contain the "+
+                   "Age/Year/Education combination that was requested.")
+        raise Exception(message).with_traceback(e.__traceback__)
+
+    # Check for NAs
+    if any(row.isna()):
+        warn("There were not enough observations in the requested " + 
+             "Age/Year/Education combination to compute all summary" +
+             "statistics.")
+
+    return row.to_dict()
+
+def income_wealth_dists_from_scf(age = None, education = None, year = None):
+
+    stats = parse_scf_distr_stats(age, education, year)
+
+    param_dict = {
+        'aNrmInitMean' : stats['lnNrmWealth.mean'], # Mean of log initial assets (only matters for simulation)
+        'aNrmInitStd'  : stats['lnNrmWealth.sd'],   # Standard deviation of log initial assets (only for simulation)
+        'pLvlInitMean' : stats['lnPermIncome.mean'],# Mean of log initial permanent income (only matters for simulation)
+        'pLvlInitStd'  : stats['lnPermIncome.sd'],  # Standard deviation of log initial permanent income (only matters for simulation)
+    }
+
+    return param_dict
diff --git a/HARK/datasets/SCF/__init__.py b/HARK/datasets/SCF/__init__.py
diff --git a/examples/Calibration/SCF_distributions.py b/examples/Calibration/SCF_distributions.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jan 18 13:57:50 2021
+
+@author: Mateo
+"""
+
+from HARK.datasets.SCF.WealthIncomeDist.parser import income_wealth_dists_from_scf
+import seaborn as sns
+from itertools import product, starmap
+import pandas as pd
+
+# List the education levels and years
+educ_lvls = ['NoHS', 'HS', 'College']
+years = list(range(1995,2022,3))
+
+age = 25
+
+# %% Get the distribution of aNrm and pLvl at each year x education
+params = list(product([age],educ_lvls, years))
+age, education, year = list(zip(*params))
+
+frame = pd.DataFrame({'age': age, 'education': education, 'year': year})
+
+results = list(starmap(income_wealth_dists_from_scf, params))
+frame = pd.concat([frame, pd.DataFrame(results)], axis = 1)
+
+# %% Plot time trends at different education levels.
+
+# Formatting
+frame = frame.melt(id_vars = ['age', 'education','year'])
+aux = frame["variable"].str.split("(Mean|Std)", n = 1, expand = True) 
+frame["variable"] = aux[0]
+frame["stat"] = aux[1]
+
+# Plot
+g = sns.FacetGrid(frame, col="stat", row = "variable", hue="education", sharey = True)
+g.map(sns.scatterplot, "year", "value", alpha=.7)
+g.add_legend()