diff --git a/test_gpt2.c b/test_gpt2.c index e49b73fad..3db8dff7a 100644 --- a/test_gpt2.c +++ b/test_gpt2.c @@ -6,7 +6,7 @@ int check_tensor(float *a, float *b, int n, const char* label) { int print_upto = 5; int ok = 1; float maxdiff = 0.0f; - float tol = 2e-2; + float tol = 2e-2f; printf("%s\n", label); for (int i = 0; i < n; i++) { // look at the diffence at position i of these two tensors @@ -52,7 +52,7 @@ int main(int argc, char *argv[]) { FILE *state_file = fopen("gpt2_124M_debug_state.bin", "rb"); if (state_file == NULL) { printf("Error opening state file\n"); return 1; } int state_header[256]; - fread(state_header, sizeof(int), 256, state_file); + freadCheck(state_header, sizeof(int), 256, state_file); if (state_header[0] != 20240327) { printf("Bad magic state file\n"); return 1; } if (state_header[1] != 2) { printf("Bad version in state file\n"); @@ -75,28 +75,28 @@ int main(int argc, char *argv[]) { float* expected_loss = (float*) malloc(1 * sizeof(float)); // read reference information from Python - fread(x, sizeof(int), B*T, state_file); - fread(y, sizeof(int), B*T, state_file); - fread(expected_logits, sizeof(float), B*T*V, state_file); - fread(expected_loss, sizeof(float), 1, state_file); - fread(expected_grads_memory, sizeof(float), model.num_parameters, state_file); - fclose(state_file); + freadCheck(x, sizeof(int), B*T, state_file); + freadCheck(y, sizeof(int), B*T, state_file); + freadCheck(expected_logits, sizeof(float), B*T*V, state_file); + freadCheck(expected_loss, sizeof(float), 1, state_file); + freadCheck(expected_grads_memory, sizeof(float), model.num_parameters, state_file); + fcloseCheck(state_file); // overall OK signal for the test int allok = 1; // let's do 10 training iterations, following the pytorch code float expected_losses[10] = { - 5.270007133483887, - 4.059706687927246, - 3.3751230239868164, - 2.8007826805114746, - 2.315382242202759, - 1.8490285873413086, - 1.3946564197540283, - 0.9991465210914612, - 0.6240804195404053, - 0.37651097774505615 + 5.270007133483887f, + 4.059706687927246f, + 3.3751230239868164f, + 2.8007826805114746f, + 2.315382242202759f, + 1.8490285873413086f, + 1.3946564197540283f, + 0.9991465210914612f, + 0.6240804195404053f, + 0.37651097774505615f }; for (int step = 0; step < 10; step++) { diff --git a/test_gpt2.cu b/test_gpt2.cu index f35f3ed23..30612fdda 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -304,16 +304,16 @@ int main(int argc, char *argv[]) { // expected losses are as follows, from Python float expected_losses[10] = { - 5.270009, - 4.060681, - 3.320085, - 2.717550, - 2.181066, - 1.653923, - 1.168050, - 0.736873, - 0.401021, - 0.187493 + 5.270009f, + 4.060681f, + 3.320085f, + 2.717550f, + 2.181066f, + 1.653923f, + 1.168050f, + 0.736873f, + 0.401021f, + 0.187493f }; // compare diff --git a/test_gpt2_fp32.cu b/test_gpt2_fp32.cu index 954981c7b..a3a677067 100644 --- a/test_gpt2_fp32.cu +++ b/test_gpt2_fp32.cu @@ -102,7 +102,7 @@ int main(int argc, char *argv[]) { // at this point, target should be equal to expected_logits, let's compare // copy logits to CPU so we can compare them float* logits_cpu = (float*)mallocCheck(B * T * Vp * sizeof(float)); - cudaMemcpy(logits_cpu, model.acts.output, B * T * Vp * sizeof(float), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(logits_cpu, model.acts.output, B * T * Vp * sizeof(float), cudaMemcpyDeviceToHost)); // compare the output logits from the forward pass // also careful that we don't access and compare the padded columns of logits @@ -200,16 +200,16 @@ int main(int argc, char *argv[]) { // expected losses are as follows, from Python float expected_losses[10] = { - 5.270007133483887, - 4.059706687927246, - 3.3751230239868164, - 2.8007826805114746, - 2.315382242202759, - 1.8490285873413086, - 1.3946564197540283, - 0.9991465210914612, - 0.6240804195404053, - 0.37651097774505615 + 5.270007133483887f, + 4.059706687927246f, + 3.3751230239868164f, + 2.8007826805114746f, + 2.315382242202759f, + 1.8490285873413086f, + 1.3946564197540283f, + 0.9991465210914612f, + 0.6240804195404053f, + 0.37651097774505615f }; // compare diff --git a/train_gpt2.c b/train_gpt2.c index 799a7a854..655ecacb3 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -352,7 +352,7 @@ void attention_backward(float* dinp, float* dpreatt, float* datt, // dout is (B, T, C) int C3 = C*3; int hs = C / NH; // head size - float scale = 1.0 / sqrtf(hs); + float scale = 1.f / sqrtf(hs); for (int b = 0; b < B; b++) { for (int t = 0; t < T; t++) { diff --git a/train_gpt2.cu b/train_gpt2.cu index 52d0eef49..a4617cbb5 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1638,8 +1638,8 @@ int main(int argc, char *argv[]) { // build an EvalLoader for HellaSwag EvalLoader eval_loader; const char* hellaswag_path = "dev/data/hellaswag/hellaswag_val.bin"; - const char hellaswag_available = access(hellaswag_path, F_OK) == 0; - const char run_hellaswag = hellaswag_eval && hellaswag_available; + const bool hellaswag_available = access(hellaswag_path, F_OK) == 0; + const bool run_hellaswag = hellaswag_eval && hellaswag_available; if (run_hellaswag) { evalloader_init(&eval_loader, hellaswag_path, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); }