Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds household_structure.py with smoothed averages of number of <18, 18-64, and 65+ people per family sorted by head age and income bracket #30

Closed
wants to merge 19 commits into from
Closed
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions ogusa_calibrate/household_structure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove these empty lines


import pickle
import pandas as pd
import numpy as np
import microdf as mdf
import matplotlib.pyplot as plt
from scipy.stats import kde

panel_li = pickle.load(open('psid_lifetime_income.pkl', 'rb')) #created by psid_data_setup.py

panel_li.insert(len(panel_li.columns),"weight",1) #create column of only 1s which is used as weights for taking microdf average
Copy link
Contributor

@MaxGhenis MaxGhenis Mar 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

microdf doesn't require weights, you can leave the weights argument in any function empty to have it be unweighted (or for this file, skip the microdf import as it's unnecessary)

panel_li.insert(len(panel_li.columns),"nu18",0)
panel_li.insert(len(panel_li.columns),"n1864",0)
panel_li.insert(len(panel_li.columns),"n65",0)
#RData file and therefore dataset in psid_data_setup.py -> panel_li is assumed to have 'num_family' as follows the updated list of variables pulled

def add65(head_age,spouse_age):
count = 0
if(head_age >= 65):
count += 1
if(spouse_age>=65):
count += 1
return count #assumes only head or spouse of head can be 65+
panel_li['n65'] = panel_li.apply(lambda x: add65(x['head_age'],x['spouse_age']), axis=1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
panel_li['n65'] = panel_li.apply(lambda x: add65(x['head_age'],x['spouse_age']), axis=1)
panel_li['n65'] = np.where(panel_li.head_age > 64, 1, 0) + np.where(panel_li.spouse_age > 64, 1, 0)

can replace the add65 function


def add1864(head_age,spouse_age,num_family,num_18):
count = num_family-num_18
if(head_age >= 65):
count -= 1
if(spouse_age>=65):
count -= 1
return count
panel_li['n1864'] = panel_li.apply(lambda x: add1864(x['head_age'],x['spouse_age'],x['num_family'],x['num_children_under18']), axis=1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
panel_li['n1864'] = panel_li.apply(lambda x: add1864(x['head_age'],x['spouse_age'],x['num_family'],x['num_children_under18']), axis=1)
panel_li['n1864'] = panel_li.num_family - panel_li.n65 - panel_li.nu18

after moving the nu18 line above this, or just use num_children_under18


panel_li['nu18'] = panel_li.apply(lambda x: x['num_children_under18'], axis=1) #assumes only children can be <18
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
panel_li['nu18'] = panel_li.apply(lambda x: x['num_children_under18'], axis=1) #assumes only children can be <18
panel_li['nu18'] = panel_li.num_children_under18


panel_li2 = panel_li.reset_index()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
panel_li2 = panel_li.reset_index()
panel_li.reset_index(inplace=True)
panel_li_20_80 = panel_li[panel_li.head_age.isbetween(20, 80)]

replacing below lines too

panel_li3 = panel_li2[panel_li2['head_age'] <= 80]
panel_li3 = panel_li3[panel_li3['head_age'] >= 20]
panel_li4 = panel_li3.groupby(['head_age', 'li_group']).apply(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
panel_li4 = panel_li3.groupby(['head_age', 'li_group']).apply(
panel_li_group = panel_li_20_80.groupby(['head_age', 'li_group'])[["nu18", "n1864", "n65"]].mean()

This takes care of much of the below code. Don't need microdf since nothing's weighted.

lambda x: pd.Series({
'nu18': mdf.weighted_mean(x, 'nu18', 'weight')
})).unstack() #rows become head age, columns income bracket, same for each
panel_li4.dropna(inplace = True)

panel_li5 = panel_li3.groupby(['head_age', 'li_group']).apply(
lambda x: pd.Series({
'nu18': mdf.weighted_mean(x, 'n1864', 'weight')
})).unstack()
panel_li5.dropna(inplace = True)
panel_li6 = panel_li3.groupby(['head_age', 'li_group']).apply(
lambda x: pd.Series({
'nu18': mdf.weighted_mean(x, 'n65', 'weight')
})).unstack()
panel_li6.dropna(inplace = True)

temp1 = panel_li4.sum(axis=1)
panelFinal = temp1.sum()
temp18 = panel_li4[panel_li4.columns] * 1/panelFinal #scales down to sum of 1 for smoothing, same for each below

temp2 = panel_li5.sum(axis=1)
panelFinal2 = temp2.sum()
temp1864 = panel_li5[panel_li5.columns] * 1/panelFinal2

temp3 = panel_li6.sum(axis=1)
panelFinal3 = temp3.sum()
temp65 = panel_li6[panel_li6.columns] * 1/panelFinal3


def MVKDE(S, J, proportion_matrix, filename=None, plot=False, bandwidth=.25):
'''
Generates a Multivariate Kernel Density Estimator and returns a
matrix representing a probability distribution according to given
age categories, and ability type categories.
Args:
S (scalar): the number of age groups in the model
J (scalar): the number of ability type groups in the model.
proportion_matrix (Numpy array): SxJ shaped array that
represents the proportions of the total going to each
(s,j) combination
filename (str): the file name to save image to
plot (bool): whether or not to save a plot of the probability
distribution generated by the kde or the proportion matrix
bandwidth (scalar): used in the smoothing of the kernel. Higher
bandwidth creates a smoother kernel.
Returns:
estimator_scaled (Numpy array): SxJ shaped array that
that represents the smoothed distribution of proportions
going to each (s,j)
'''
proportion_matrix_income = np.sum(proportion_matrix, axis=0)
proportion_matrix_age = np.sum(proportion_matrix, axis=1)
age_probs = np.random.multinomial(70000, proportion_matrix_age)
income_probs = np.random.multinomial(70000, proportion_matrix_income)
age_frequency = np.array([])
income_frequency = np.array([])
age_mesh = complex(str(S) + 'j')
income_mesh = complex(str(J) + 'j')
j = 18
'''creating a distribution of age values'''
for i in age_probs:
listit = np.ones(i)
listit *= j
age_frequency = np.append(age_frequency, listit)
j += 1

k = 1
'''creating a distribution of ability type values'''
for i in income_probs:
listit2 = np.ones(i)
listit2 *= k
income_frequency = np.append(income_frequency, listit2)
k += 1

freq_mat = np.vstack((age_frequency, income_frequency)).T
density = kde.gaussian_kde(freq_mat.T, bw_method=bandwidth)
age_min, income_min = freq_mat.min(axis=0)
age_max, income_max = freq_mat.max(axis=0)
agei, incomei = np.mgrid[age_min:age_max:age_mesh,
income_min:income_max:income_mesh]
coords = np.vstack([item.ravel() for item in [agei, incomei]])
estimator = density(coords).reshape(agei.shape)
estimator_scaled = estimator/float(np.sum(estimator))
if plot:
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_surface(agei,incomei, estimator_scaled, rstride=5)
ax.set_xlabel("Age")
ax.set_ylabel("Ability Types")
ax.set_zlabel("Received proportion of total bequests")
plt.savefig(filename)
return estimator_scaled

result18 = MVKDE(60, 7, temp18)
result18 = result18*panelFinal #scale back up, same for each below

result1864 = MVKDE(60, 7, temp1864)
result1864 = result1864*panelFinal2

result65 = MVKDE(60, 7, temp65)
result65 = result65*panelFinal3

np.save('nu18', result18)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you include these files, as well as temp18 etc., in the PR?

np.save('n1864', result1864)
np.save('n65', result65)