from bs4 import BeautifulSoup
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import statsmodels.api as sm
# Import UN webpage for scraping
url = "http://web.archive.org/web/20110514112442/\
http://unstats.un.org/unsd/demographic/products/socind/education.htm"
# Get the content
r = requests.get(url)
# Pass the results to Beautiful Soup
soup = BeautifulSoup(r.content)
# Check for desired table(s)
for row in soup('table'):
    print(row)
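# (This printout is a manual inspection step: the nested-table indices used
# below, soup('table')[6]('table')[2], were chosen by looking through this
# output.)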
# Parse and save the table only if the request succeeded
if r.ok:
    # Put content in the parsing structure
    soup = BeautifulSoup(r.content)
    # Find all table rows within the appropriate table
    A = soup('table')[6]('table')[2].findAll('tr')
    A = A[4:-1]
    # Define and find each record, add to dataRows
    dataRows = []
    for row in A:
        record = [x.string for x in row.findAll('td', {'align': 'right'})]
        record.insert(0, row.findAll('td')[0].string)
        dataRows.append(record)
    # Create column names for dataRows
    colNames = ['Country', 'Year', 'Total', 'Men', 'Women']
    # Read into a pandas dataframe
    dfUNData = pd.DataFrame(dataRows, columns=colNames)
    # Write data to CSV file
    dfUNData.to_csv('un_data.csv', encoding='utf-8', index=False)
else:
    print(r.status_code)
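# Quick sanity check of the scraped table (optional; assumes the request above
# succeeded and dfUNData was built)
print(dfUNData.shape)
print(dfUNData.head())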
# Examine a histogram of male school life expectancy
plt.figure()
(dfUNData['Men'].astype(int)).hist()
plt.title('Male School Life Expectancy')
plt.xlabel('Years of schooling')
plt.ylabel('Count')
plt.show()
# Examine a histogram of female school life expectancy
plt.close()
plt.figure()
(dfUNData['Women'].astype(int)).hist()
plt.title('Female School Life Expectancy')
plt.xlabel('Years of schooling')
plt.ylabel('Count')
plt.show()
# Print summary statistics by gender
print "The mean age for men is: "
print (dfUNData['Men'].astype(int)).mean()
print "\n"
print "The median age for men is: "
print (dfUNData['Men'].astype(int)).median()
print "\n"
print "The mean age for women is: "
print (dfUNData['Women'].astype(int)).mean()
print "\n"
print "The median age for women is: "
print (dfUNData['Women'].astype(int)).median()
print "\n"
# The UN Education Dataset
# The mean and median school life expectancy for women are greater than the
# mean and median school life expectancy for men (i.e., girls appear to stay
# in school longer than boys). This seems counterintuitive given accounts of
# gender and education in developing countries, where families routinely
# invest more in the education of sons than daughters. However, these figures
# span 182 countries, including many, such as the U.S., where women stay in
# school longer than men. Some countries, such as Afghanistan and Chad, do
# clearly show an educational bias toward male children in the country-level
# data in this dataset.
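# A rough check of the country-level pattern mentioned above (a sketch; 'gap'
# is positive where boys are expected to stay in school longer than girls)
gap = dfUNData['Men'].astype(int) - dfUNData['Women'].astype(int)
print(dfUNData['Country'][gap > 0])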
# Import World Bank data on GDP
data = pd.read_csv('ny.gdp.mktp.cd_indicator_en_csv_v2.csv', skiprows=3)
data.head()
# Read into a pandas dataframe
dfGDPData = pd.DataFrame(data, columns=['Country Code', '1999', '2000', '2001',
                                        '2002', '2003', '2004', '2005', '2006',
                                        '2007', '2008', '2009', '2010'])
# Remove scientific notation on GDP figures
pd.set_option('display.float_format', lambda x: '%.3f' % x)
dfGDPData.head()
# Remove rows/countries that are missing most of their GDP data
# (keep only rows with at least 11 non-null values)
dfGDPData = dfGDPData.dropna(thresh=11)
# Eliminate the few remaining NaN values
dfGDPData = dfGDPData.dropna()
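# Optional check of how many countries remain after dropping missing values
print(len(dfGDPData))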
# Since there is variation in the country names between the UN Education
# Dataset and the World Bank GDP Dataset, we will use the country code to
# merge the datasets together. In order to do this, we will need to scrape
# a third dataframe that has UN country codes.
# Import UN code page
url = "http://unstats.un.org/unsd/methods/m49/m49alpha.htm"
# Get the content
r = requests.get(url)
# Pass the results to Beautiful Soup
soup = BeautifulSoup(r.content)
# Parse and save the table only if the request succeeded
if r.ok:
    # Put content in the parsing structure
    soup = BeautifulSoup(r.content)
    # Find all table rows within the appropriate table
    A = soup('table')[3]('table')[2].findAll('tr')
    A = A[1:]
    # Define and find each record, add to dataRows
    dataRows = []
    for row in A:
        record = [x.string for x in row.findAll('p')]
        dataRows.append(record)
    # Create column names for dataRows
    colNames = ['NumericalCode', 'Country', 'ISOCode']
    # Read into a pandas dataframe
    dfUNCodes = pd.DataFrame(dataRows, columns=colNames)
    # Clean up the dataframe: drop empty rows and strip stray whitespace
    dfUNCodes.dropna(inplace=True)
    dfUNCodes['NumericalCode'] = dfUNCodes.apply(
        lambda x: x['NumericalCode'].rstrip(), axis=1)
    dfUNCodes['Country'] = dfUNCodes.apply(
        lambda x: x['Country'].rstrip(), axis=1)
    dfUNCodes['ISOCode'] = dfUNCodes.apply(
        lambda x: x['ISOCode'].lstrip(), axis=1)
    dfUNCodes['ISOCode'] = dfUNCodes.apply(
        lambda x: x['ISOCode'].rstrip(), axis=1)
    # Write data to CSV file
    dfUNCodes.to_csv('un_codes.csv', encoding='utf-8', index=False)
else:
    print(r.status_code)
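# (The same whitespace cleanup could be written more concisely with the
# vectorised string accessor, e.g. dfUNCodes['ISOCode'].str.strip(); the
# per-column apply form above gives the same result.)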
# Merge UN education dataframe and UN country codes dataframe on country name
dfUN_merged = pd.merge(dfUNData, dfUNCodes, on='Country', how='inner')
# Rename columns so the dataframe columns align
dfGDPData.columns = dfGDPData.columns.str.replace('Country Code', 'ISOCode')
# Merge updated UN data with the World Bank dataframe on the country code
dfAll_merged = pd.merge(dfUN_merged, dfGDPData, on='ISOCode', how='inner')
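# Optional check: the inner joins keep only countries present in all three
# sources, so the merged frame can be shorter than any single input
print(len(dfUNData), len(dfGDPData), len(dfAll_merged))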
# Verify data types are compatible for running regression
dfAll_merged.info()
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 162 entries, 0 to 161
# Data columns (total 19 columns):
# Country 162 non-null object
# Year 162 non-null object
# Total 162 non-null object
# Men 162 non-null object
# Women 162 non-null object
# NumericalCode 162 non-null object
# ISOCode 162 non-null object
# 1999 162 non-null float64
# 2000 162 non-null float64
# 2001 162 non-null float64
# 2002 162 non-null float64
# 2003 162 non-null float64
# 2004 162 non-null float64
# 2005 162 non-null float64
# 2006 162 non-null float64
# 2007 162 non-null float64
# 2008 162 non-null float64
# 2009 162 non-null float64
# 2010 162 non-null float64
# dtypes: float64(12), object(7)
# memory usage: 25.3+ KB
# Convert 'Men', 'Women', and 'Total' to floats
dfAll_merged[['Men', 'Women', 'Total']] = dfAll_merged[['Men', 'Women', 'Total']].astype(float)
# Log-transform GDP so values across countries are on a comparable scale
years = ['1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
         '2007', '2008', '2009', '2010']
for year in years:
    dfAll_merged[year + 'L'] = dfAll_merged[year].apply(lambda x: np.log(x))
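# (For example, np.log(1e9) is about 20.7 and np.log(1e12) is about 27.6, so a
# 1000-fold difference in GDP becomes an additive gap of roughly 6.9 on the
# log scale.)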
# Create a linear model with log GDP (dependent) and education (independent)
loggdp = dfAll_merged[['1999L', '2000L', '2001L', '2002L', '2003L', '2004L',
                       '2005L', '2006L', '2007L', '2008L', '2009L',
                       '2010L']].mean(axis=1)
education = dfAll_merged['Total']
y = np.matrix(loggdp).transpose()
x = np.matrix(education).transpose()
X = sm.add_constant(x)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
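# As a cross-check (a sketch, not part of the pipeline above; dfReg is a
# throwaway dataframe created only for the formula interface), the same model
# can be fit with the statsmodels formula API:
import statsmodels.formula.api as smf
dfReg = pd.DataFrame({'loggdp': loggdp, 'education': education})
results_formula = smf.ols('loggdp ~ education', data=dfReg).fit()
print(results_formula.summary())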