-
Notifications
You must be signed in to change notification settings - Fork 4
/
parse_data.py
210 lines (188 loc) · 7.84 KB
/
parse_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import csv
from pprint import pprint
"""
General Notes:
LIVING_QTRS:
Type of Living Quarters
01 House, apartment, flat, condo
02 HU in nontransient hotel, motel
03 HU-permanent in transient hotel, motel
04 HU in rooming house
05 Mobile home/trailer w/no permanent rooms added
06 Mobile home/trailer w/1+ permanent rooms added
07 HU not specified above
08 Quarters not HU in room or board house
09 Unit not permanent-transient hotel, motel
10 Unoccupied site for mobile home/trailer/tent
11 Student quarters in college dormitory
12 Group quarter unit not specified above
98 Not ascertained
NON_RESP:
Category of type A non-response
1 Refused
2 No one home - repeated calls
3 Temporarily absent
4 Language problem
5 Other
"""
def try_int(value):
    """
    Convert ``value`` with ``int()``, substituting 0 when that fails.

    Only ``ValueError`` is handled, so text that ``int()`` cannot parse
    (for example an empty or all-blank field) becomes 0; any other
    exception type propagates to the caller.
    """
    try:
        parsed = int(value)
    except ValueError:
        # Unparseable text is treated as "no value" and reported as zero.
        parsed = 0
    return parsed
def parse_family(filename='familyxx.dat'):
    """
    Parse the fixed-width family/household file into a list of dicts.

    Each non-blank line of ``filename`` becomes one dict whose values are
    integers cut from fixed column positions.  The layout below is the one
    the slices in the code follow (locations are 1-based and inclusive):

    Question ID FinalDocName Processing Variable Label Location Length
    IDN.000_00.000 RECTYPE File type identifier 1 - 2 2
    IDN.000_02.000 SRVY_YR Year of National Health Interview Survey 3 - 6 4
    IDN.000_04.000 HHX Household Number 7 - 12 6
    IDN.000_25.000 INTV_QRT Interview Quarter 13 1
    IDN.000_30.000 INTV_MON Interview Month 14 - 15 2
    IDN.000_65.000 WTIA_HH Weight - Interim Annual 16 - 21 6
    IDN.000_70.000 WTFA_HH Weight - Final Annual 22 - 27 6
    COV.260_00.000 LIVQRT Type of Living Quarters 28 - 29 2
    MHH.000_00.000 NON_INTV Category of type A non-response 30 1
    MHH.000_00.000 ACPT_FAM Number of families in HH responding 31 - 32 2
    MHH.000_00.000 REJ_FAM Number of families in HH not responding 33 - 34 2
    MHH.000_00.000 ACPT_PER Number of persons in HH responding 35 - 36 2
    MHH.000_00.000 REJ_PER Number of persons in HH not responding 37 - 38 2
    MHH.000_00.000 ACPTCHLD Number of children in HH that responded 39 - 40 2
    UCF.000_00.000 REGION Region 41 1
    UCF.000_00.000 STRAT_P Pseudo-stratum for public use file variance estimation 42 - 44 3
    UCF.000_00.000 PSU_P Pseudo-PSU for public use file variance estimation 45 - 46 2

    Args:
        filename: path of the fixed-width file to read.  Defaults to
            'familyxx.dat' in the current working directory.

    Returns:
        A list with one dict per record, in file order.

    Raises:
        ValueError: if a field parsed with plain ``int()`` is not numeric.
        IndexError: if a line is too short for a single-column field.
    """
    records = []  # one dict per record, in file order
    # The context manager closes the file even if a record fails to parse;
    # iterating the handle avoids loading the whole file at once.
    with open(filename, 'r') as family_file:
        for line in family_file:
            if not line.strip():
                # A blank (typically trailing) line holds no record.
                continue
            records.append({
                'RECTYPE': int(line[0:2]),
                'SURVEY_YR': int(line[2:6]),  # TODO: USE WITH INTERVW_MONTH for datetime object
                'HOUSE_NUM': int(line[6:12]),  # household number (HHX)
                'INTERVW_QTR': int(line[12]),
                'INTERVW_MONTH': int(line[13:15]),
                'WEIGHT_INTERIM': int(line[15:21]),
                'WEIGHT_FINAL': int(line[21:27]),
                'LIVING_QTRS': int(line[27:29]),  # living-quarters code
                # The remaining household counts go through try_int so that
                # a field int() cannot parse is recorded as 0.
                'NON_RESP': try_int(line[29]),
                'NUM_FAM_RESP': try_int(line[30:32]),
                'NUM_FAM_NON_RESP': try_int(line[32:34]),
                'NUM_PPL_RESP': try_int(line[34:36]),
                'NUM_PPL_NON_RESP': try_int(line[36:38]),
                'NUM_CHILD_RESP': try_int(line[38:40]),
                'REGION': int(line[40]),
                'STRAT_P': int(line[41:44]),  # pseudo-stratum (variance estimation)
                'PSU_P': int(line[44:46]),  # pseudo-PSU (variance estimation)
            })
    return records
def parse_person_text(small_desc='person_desc_small.txt',
                      full_desc='person_desc.txt'):
    """
    Build the variable names and column positions for the person file.

    The person file has a very large number of variables, so the layout
    is read from two description files instead of being typed by hand.

    ``small_desc`` is read in blocks of 24 lines.  Only the first two
    complete blocks are used: the first holds 24 variable names, the
    second the 24 matching locations, each either a single column
    ("13") or a range ("3 - 6").

    ``full_desc`` holds one variable per line.  After whitespace
    splitting, the last token is treated as the field length: when it is
    1 the column is the second-to-last token, otherwise the range start
    and end are the fourth-to-last and second-to-last tokens.  The name
    is the fourth token when the second token is three characters long
    and starts with 'R'; otherwise it is the third token.  Blank lines
    are skipped.

    Locations are 1-based in the description files and are converted to
    0-based values usable for indexing/slicing a record string: a single
    column N becomes the int N - 1 and a range "A - B" becomes the tuple
    (A - 1, B).

    Args:
        small_desc: path of the block-formatted description file.
        full_desc: path of the one-variable-per-line description file.

    Returns:
        A ``(variable_names, indices)`` tuple of two lists; ``indices``
        entries are ints or ``(start, stop)`` tuples as described above.

    Raises:
        ValueError: if a location token is not numeric.
        IndexError: if a line has fewer tokens than the layout requires.
    """
    block_len = 24  # lines per block in the small description file
    variable_names = []
    indices = []

    with open(small_desc, 'r') as small_file:
        small_lines = [row.strip() for row in small_file]

    # Only complete blocks count: a partial trailing block is ignored.
    if len(small_lines) >= block_len:
        variable_names.extend(small_lines[:block_len])
    if len(small_lines) >= 2 * block_len:
        for location in small_lines[block_len:2 * block_len]:
            try:
                index = int(location) - 1
            except ValueError:
                # Not a lone number, so expect "start - end".
                parts = location.split()
                index = (int(parts[0]) - 1, int(parts[2]))
            indices.append(index)

    with open(full_desc, 'r') as full_file:
        for row in full_file:
            tokens = row.split()
            if not tokens:
                # Blank lines carry no variable description.
                continue
            if int(tokens[-1]) == 1:
                indices.append(int(tokens[-2]) - 1)
            else:
                indices.append((int(tokens[-4]) - 1, int(tokens[-2])))
            if len(tokens[1]) == 3 and tokens[1][0] == 'R':
                variable_names.append(tokens[3])
            else:
                variable_names.append(tokens[2])

    return variable_names, indices
def try_types(elem):
    """
    Coerce one raw text field to the most specific value available.

    Returns None when ``elem`` consists solely of whitespace, the integer
    value when ``int()`` accepts it, and the unchanged string otherwise
    (which includes the empty string, since ``"".isspace()`` is False).
    """
    if not elem.isspace():
        try:
            converted = int(elem)
        except ValueError:
            # Non-numeric text is passed through untouched.
            converted = elem
        return converted
    return None
def parse_persons():
    """
    Convert the fixed-width person file 'personsx.dat' into 'persons.csv'.

    Check variable descriptions here: ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NHIS/2014/personsx_summary.pdf

    The column layout comes from parse_person_text().  Every non-blank
    input line is cut into fields at those positions, each field is
    converted with try_types(), and the result is written as one CSV row
    under a header of the variable names.

    Rows are written as they are parsed rather than collected first, so
    memory use does not grow with the size of the input file.  As a
    consequence, if parsing fails part-way the output file is left
    partially written.

    Returns:
        None.  The result is the 'persons.csv' file.

    Raises:
        IndexError: if there are fewer positions than variable names, or a
            record is too short for a single-column field.
    """
    variable_names, indices = parse_person_text()
    if len(indices) < len(variable_names):
        # zip() below would silently drop the unmatched names; fail loudly
        # with the same exception type the indexed lookup used to raise.
        raise IndexError('fewer column positions than variable names')
    layout = list(zip(variable_names, indices))

    # The input is opened first so a missing source file does not create
    # or truncate the output.
    with open('personsx.dat', 'r') as persons_file, \
            open('persons.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=variable_names)
        writer.writeheader()
        for line in persons_file:
            if not line.strip():
                # Blank lines hold no record.
                continue
            record = {}
            for name, position in layout:
                if isinstance(position, int):
                    # Single-column field.
                    record[name] = try_types(line[position])
                else:
                    # (start, stop) slice bounds for a multi-column field.
                    record[name] = try_types(line[position[0]:position[1]])
            writer.writerow(record)
def get_variable_list(filename='predictors.txt'):
    """
    Get list of variables for reducing csv file.

    Reads ``filename`` one name per line, strips surrounding whitespace and
    drops lines that are empty after stripping, so a stray blank line does
    not become an empty column name.

    Args:
        filename: path of the predictor list.  Defaults to
            'predictors.txt' in the current working directory.

    Returns:
        A list of variable-name strings in file order.
    """
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as predictors_file:
        stripped = (row.strip() for row in predictors_file)
        return [name for name in stripped if name]
def get_reduced_file(variables, filename='feature_set.txt', big_file = 'persons.csv'):
    """
    Write a copy of a CSV file that keeps only the requested columns.

    Args:
        variables: iterable of column names to keep; the output columns
            appear in this order.
        filename: path of the reduced CSV to write.
        big_file: path of the source CSV, whose first row is the header.

    Returns:
        None.  The result is the file written to ``filename``.

    Raises:
        KeyError: if a requested column is missing from a source row.
    """
    # Materialise once so a one-shot iterable can serve both the header
    # and every data row.
    fieldnames = list(variables)
    # Both handles are closed on exit, including when a row raises.  The
    # source is opened first so a missing input does not create the output.
    with open(big_file, 'r') as source, open(filename, 'w') as csvfile:
        reader = csv.DictReader(source)
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in reader:
            writer.writerow({name: row[name] for name in fieldnames})
# Executed at module level, so it runs on import as well as when the file is
# run as a script: reads the predictor names with get_variable_list() and
# writes the reduced CSV using get_reduced_file()'s default file names.
get_reduced_file(get_variable_list())