-
Notifications
You must be signed in to change notification settings - Fork 2
/
survived_titanic.js
241 lines (182 loc) · 6.5 KB
/
survived_titanic.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
// Title: Titanic survival prediction
// Date: August 13, 2019
// Created by: Lydia Jessup
// Using an example from TensorFlow.js here:
// https://www.tensorflow.org/js/tutorials/training/nodejs_training
// Description: This example uses pre-cleaned data to train a neural network
// to predict survival on the titanic
//load tensorflow
const tf = require('@tensorflow/tfjs');
///////////////////////////////////////////////
// Import, normalize and transform data
///////////////////////////////////////////////
/**
 * Min-max scales a value using the supplied range.
 *
 * @param {number} value - Raw value to scale.
 * @param {number} [min] - Lower bound of the source range.
 * @param {number} [max] - Upper bound of the source range.
 * @returns {number} `(value - min) / (max - min)`, or `value` unchanged when
 *     either bound is `undefined`.
 */
function normalize(value, min, max) {
  const boundsKnown = min !== undefined && max !== undefined;
  if (!boundsKnown) {
    return value;
  }
  const span = max - min;
  return (value - min) / span;
}
// Data can be loaded from URLs or local file paths when running in Node.js.
// Both files are pre-cleaned and already split into training and test sets.
const TRAIN_DATA_PATH = 'file://titanic_train.csv';
const TEST_DATA_PATH = 'file://titanic_test.csv';

// Min/max bounds (taken from the training data) used to normalize the
// continuous features, age and fare. Sex is already a 0/1 column; passenger
// class would need one-hot encoding and is not used yet.
const AGE_MIN = 0;
const AGE_MAX = 80;
const FARE_MIN = 0;
const FARE_MAX = 512;

// Number of outcome classes for survival.
const NUM_SURVIVED_CLASSES = 2;

// Dataset lengths; used below as the shuffle buffer size and as batch sizes.
const TRAINING_DATA_LENGTH = 916;
const TEST_DATA_LENGTH = 393;
/**
 * Converts one parsed CSV row into a feature vector and its label.
 *
 * Age and fare are min-max normalized with the training-data constants;
 * `is_female` is passed through as-is because it is already a 0/1 value.
 * Passenger class is intentionally left out for now.
 *
 * @param {{xs: Object, ys: Object}} row - `xs` holds the feature columns and
 *     `ys` holds the label column(s) of the row.
 * @returns {{xs: Array, ys: *}} `xs` is `[age, fare, is_female]` and `ys` is
 *     the row's `survived` value.
 */
const csvTransform = ({xs, ys}) => {
  const {age, fare, is_female: isFemale} = xs;
  const features = [
    normalize(age, AGE_MIN, AGE_MAX),
    normalize(fare, FARE_MIN, FARE_MAX),
    isFemale,
  ];
  return {xs: features, ys: ys.survived};
};
// Opens a Titanic CSV file as a dataset: the `survived` column is flagged as
// the label and every row is passed through csvTransform.
const loadSurvivalCsv = (path) =>
  tf.data
    .csv(path, {columnConfigs: {survived: {isLabel: true}}})
    .map(csvTransform);

// Training data: shuffled with a buffer of TRAINING_DATA_LENGTH rows and
// served in batches of 100.
const trainingData = loadSurvivalCsv(TRAIN_DATA_PATH)
  .shuffle(TRAINING_DATA_LENGTH)
  .batch(100);

// Training-validation data: the training file again, unshuffled, batched with
// TRAINING_DATA_LENGTH rows per batch.
const trainingValidationData = loadSurvivalCsv(TRAIN_DATA_PATH)
  .batch(TRAINING_DATA_LENGTH);

// Test-validation data: the test file, batched with TEST_DATA_LENGTH rows per
// batch.
const testValidationData = loadSurvivalCsv(TEST_DATA_PATH)
  .batch(TEST_DATA_LENGTH);
///////////////////////////////////////////////
// Create model
///////////////////////////////////////////////
// Sequential model kept comparable to the earlier Colab version
// (128 relu -> 128 relu -> output), except that the output layer here is a
// softmax over NUM_SURVIVED_CLASSES units.
const model = tf.sequential();

// Two hidden ReLU layers followed by the softmax output layer. The input shape
// is 3 because three values are used for each person.
const layerConfigs = [
  {units: 128, activation: 'relu', inputShape: [3]},
  {units: 128, activation: 'relu'},
  {units: NUM_SURVIVED_CLASSES, activation: 'softmax'},
];
for (const layerConfig of layerConfigs) {
  model.add(tf.layers.dense(layerConfig));
}

// Compile with the Adam optimizer (default settings) and the
// sparseCategoricalCrossentropy loss, tracking accuracy.
model.compile({
  optimizer: tf.train.adam(),
  loss: 'sparseCategoricalCrossentropy',
  metrics: ['accuracy'],
});
///////////////////////////////////////////////
// Evaluate model
///////////////////////////////////////////////
/**
 * Evaluates the model per survival class.
 *
 * For every class the score is the mean probability the model assigns to that
 * class over the samples whose true label is that class. Scores are computed
 * for the training-validation dataset and, optionally, for the test dataset.
 *
 * The previous implementation assumed the samples were sorted by class into
 * equal-sized contiguous groups (`LENGTH / NUM_SURVIVED_CLASSES` each). That
 * does not hold for this data, and with an odd number of test rows it read
 * past the end of the prediction buffer and produced NaN. The true labels of
 * each batch are used instead.
 *
 * @param {boolean} useTestData - When truthy, also evaluate the test dataset
 *     and add a `validation` score to every class entry.
 * @returns {Promise<Object>} Object keyed by the class label string from
 *     `survivedCode`, each value being `{training: number}` plus
 *     `validation: number` when `useTestData` is truthy.
 */
async function evaluate(useTestData) {
  const results = {};

  const trainingScores = await meanTrueClassProbabilities(trainingValidationData);
  for (let i = 0; i < NUM_SURVIVED_CLASSES; i++) {
    results[survivedCode(i)] = {training: trainingScores[i]};
  }

  if (useTestData) {
    const validationScores = await meanTrueClassProbabilities(testValidationData);
    for (let i = 0; i < NUM_SURVIVED_CLASSES; i++) {
      results[survivedCode(i)].validation = validationScores[i];
    }
  }

  return results;
}

/**
 * Computes, for each class, the mean predicted probability of that class over
 * the samples in `dataset` whose label is that class.
 *
 * Assumes each batch is `{xs, ys}` where `ys` is a tensor holding one class
 * index per sample (what csvTransform followed by batch() yields) — confirm if
 * the pipeline changes.
 *
 * @param {Object} dataset - Batched dataset exposing `forEachAsync`.
 * @returns {Promise<number[]>} One score per class index; 0 for a class that
 *     has no samples in the dataset.
 */
async function meanTrueClassProbabilities(dataset) {
  const sums = new Array(NUM_SURVIVED_CLASSES).fill(0);
  const counts = new Array(NUM_SURVIVED_CLASSES).fill(0);

  // Accumulate over all batches so the result does not depend on batch size.
  await dataset.forEachAsync(({xs, ys}) => {
    // tidy() disposes the prediction tensor once its data has been copied out.
    const probabilities = tf.tidy(() => model.predict(xs).dataSync());
    const labels = ys.dataSync();

    for (let row = 0; row < labels.length; row++) {
      const label = Math.round(labels[row]);
      // Skip labels outside the known classes (also catches NaN).
      if (!(label >= 0 && label < NUM_SURVIVED_CLASSES)) {
        continue;
      }
      // Predictions are flattened row-major: NUM_SURVIVED_CLASSES per sample.
      sums[label] += probabilities[row * NUM_SURVIVED_CLASSES + label];
      counts[label] += 1;
    }
  });

  return sums.map((sum, i) => (counts[i] === 0 ? 0 : sum / counts[i]));
}
///////////////////////////////////////////////
// Predict on sample
///////////////////////////////////////////////
/**
 * Predicts the survival class for a single sample.
 *
 * @param {number[]} sample - Feature values for one person, in the order the
 *     model was trained on; sent to the model as a `[1, sample.length]` tensor.
 * @returns {Promise<string>} Label from `survivedCode` for the class with the
 *     highest predicted probability, or the label for an out-of-range class
 *     when no class has a probability above 0.
 */
async function predictSample(sample) {
  // tidy() frees the input and output tensors once the values are copied out.
  const [probabilities] = tf.tidy(() =>
    model.predict(tf.tensor(sample, [1, sample.length])).arraySync()
  );

  let maxValue = 0;
  // Out-of-range sentinel: survivedCode maps it to its default label.
  let predictedSurvival = NUM_SURVIVED_CLASSES;
  for (let i = 0; i < NUM_SURVIVED_CLASSES; i++) {
    if (probabilities[i] > maxValue) {
      // Track the running maximum; without this the last class with a
      // non-zero probability always won.
      maxValue = probabilities[i];
      predictedSurvival = i;
    }
  }
  return survivedCode(predictedSurvival);
}
///////////////////////////////////////////////
// Calculate accuracy
///////////////////////////////////////////////
/**
 * Averages the predicted probability of one survival class over one group of
 * samples in a flattened prediction buffer.
 *
 * `values` is read as consecutive rows of NUM_SURVIVED_CLASSES probabilities.
 * Rows are treated as grouped by class: group number `survivedIndex` is taken
 * to hold the samples of that class, and within each of its rows the entry at
 * column `survivedIndex` is summed.
 * NOTE(review): this only measures per-class performance when the samples
 * really are sorted by class into equal-sized groups — confirm for the data
 * in use.
 *
 * A fractional `classSize` is rounded down and reads never go past the end of
 * `values`. Previously a fractional size shifted the start offset onto the
 * wrong column and ran past the buffer, yielding NaN.
 *
 * @param {number} survivedIndex - Class index (also the group number).
 * @param {number} classSize - Number of samples per class group.
 * @param {ArrayLike<number>} values - Flattened model output.
 * @returns {number} Mean of the summed probabilities, or 0 when no sample
 *     could be read.
 */
function calcSurvivedEval(survivedIndex, classSize, values) {
  const samplesPerClass = Math.floor(classSize);
  // Start of this class's group, offset to this class's column.
  let index =
    survivedIndex * samplesPerClass * NUM_SURVIVED_CLASSES + survivedIndex;
  let total = 0;
  let counted = 0;
  for (let i = 0; i < samplesPerClass && index < values.length; i++) {
    total += values[index];
    counted += 1;
    // Jump to the same column in the next row.
    index += NUM_SURVIVED_CLASSES;
  }
  return counted === 0 ? 0 : total / counted;
}
/**
 * Converts a survival class index into its display label.
 *
 * @param {number} classNum - Class index produced by the model.
 * @returns {string} 'Did not survive' for 0, 'Survived!' for 1, and 'Unknown'
 *     for anything else.
 */
function survivedCode(classNum) {
  if (classNum === 0) {
    return 'Did not survive';
  }
  if (classNum === 1) {
    return 'Survived!';
  }
  return 'Unknown';
}
// Public API of this module (CommonJS): the evaluation and prediction
// functions, the compiled model, the class-label helper, the training and
// test-validation datasets, and the test dataset length.
module.exports = {
evaluate,
model,
survivedCode,
predictSample,
testValidationData,
trainingData,
TEST_DATA_LENGTH
}
// end