diff --git a/test_gpt2.c b/test_gpt2.c
index e49b73fad..3db8dff7a 100644
--- a/test_gpt2.c
+++ b/test_gpt2.c
@@ -6,7 +6,7 @@ int check_tensor(float *a, float *b, int n, const char* label) {
     int print_upto = 5;
     int ok = 1;
     float maxdiff = 0.0f;
-    float tol = 2e-2;
+    float tol = 2e-2f;
     printf("%s\n", label);
     for (int i = 0; i < n; i++) {
         // look at the diffence at position i of these two tensors
@@ -52,7 +52,7 @@ int main(int argc, char *argv[]) {
     FILE *state_file = fopen("gpt2_124M_debug_state.bin", "rb");
     if (state_file == NULL) { printf("Error opening state file\n"); return 1; }
     int state_header[256];
-    fread(state_header, sizeof(int), 256, state_file);
+    freadCheck(state_header, sizeof(int), 256, state_file);
     if (state_header[0] != 20240327) { printf("Bad magic state file\n"); return 1; }
     if (state_header[1] != 2) {
         printf("Bad version in state file\n");
@@ -75,28 +75,28 @@ int main(int argc, char *argv[]) {
     float* expected_loss = (float*) malloc(1 * sizeof(float));
 
     // read reference information from Python
-    fread(x, sizeof(int), B*T, state_file);
-    fread(y, sizeof(int), B*T, state_file);
-    fread(expected_logits, sizeof(float), B*T*V, state_file);
-    fread(expected_loss, sizeof(float), 1, state_file);
-    fread(expected_grads_memory, sizeof(float), model.num_parameters, state_file);
-    fclose(state_file);
+    freadCheck(x, sizeof(int), B*T, state_file);
+    freadCheck(y, sizeof(int), B*T, state_file);
+    freadCheck(expected_logits, sizeof(float), B*T*V, state_file);
+    freadCheck(expected_loss, sizeof(float), 1, state_file);
+    freadCheck(expected_grads_memory, sizeof(float), model.num_parameters, state_file);
+    fcloseCheck(state_file);
 
     // overall OK signal for the test
     int allok = 1;
 
     // let's do 10 training iterations, following the pytorch code
     float expected_losses[10] = {
-        5.270007133483887,
-        4.059706687927246,
-        3.3751230239868164,
-        2.8007826805114746,
-        2.315382242202759,
-        1.8490285873413086,
-        1.3946564197540283,
-        0.9991465210914612,
-        0.6240804195404053,
-        0.37651097774505615
+        5.270007133483887f,
+        4.059706687927246f,
+        3.3751230239868164f,
+        2.8007826805114746f,
+        2.315382242202759f,
+        1.8490285873413086f,
+        1.3946564197540283f,
+        0.9991465210914612f,
+        0.6240804195404053f,
+        0.37651097774505615f
     };
     for (int step = 0; step < 10; step++) {
 
diff --git a/test_gpt2.cu b/test_gpt2.cu
index f35f3ed23..30612fdda 100644
--- a/test_gpt2.cu
+++ b/test_gpt2.cu
@@ -304,16 +304,16 @@ int main(int argc, char *argv[]) {
 
     // expected losses are as follows, from Python
     float expected_losses[10] = {
-        5.270009,
-        4.060681,
-        3.320085,
-        2.717550,
-        2.181066,
-        1.653923,
-        1.168050,
-        0.736873,
-        0.401021,
-        0.187493
+        5.270009f,
+        4.060681f,
+        3.320085f,
+        2.717550f,
+        2.181066f,
+        1.653923f,
+        1.168050f,
+        0.736873f,
+        0.401021f,
+        0.187493f
     };
 
     // compare
diff --git a/test_gpt2_fp32.cu b/test_gpt2_fp32.cu
index 954981c7b..a3a677067 100644
--- a/test_gpt2_fp32.cu
+++ b/test_gpt2_fp32.cu
@@ -102,7 +102,7 @@ int main(int argc, char *argv[]) {
     // at this point, target should be equal to expected_logits, let's compare
     // copy logits to CPU so we can compare them
     float* logits_cpu = (float*)mallocCheck(B * T * Vp * sizeof(float));
-    cudaMemcpy(logits_cpu, model.acts.output, B * T * Vp * sizeof(float), cudaMemcpyDeviceToHost);
+    cudaCheck(cudaMemcpy(logits_cpu, model.acts.output, B * T * Vp * sizeof(float), cudaMemcpyDeviceToHost));
 
     // compare the output logits from the forward pass
     // also careful that we don't access and compare the padded columns of logits
@@ -200,16 +200,16 @@ int main(int argc, char *argv[]) {
 
     // expected losses are as follows, from Python
     float expected_losses[10] = {
-        5.270007133483887,
-        4.059706687927246,
-        3.3751230239868164,
-        2.8007826805114746,
-        2.315382242202759,
-        1.8490285873413086,
-        1.3946564197540283,
-        0.9991465210914612,
-        0.6240804195404053,
-        0.37651097774505615
+        5.270007133483887f,
+        4.059706687927246f,
+        3.3751230239868164f,
+        2.8007826805114746f,
+        2.315382242202759f,
+        1.8490285873413086f,
+        1.3946564197540283f,
+        0.9991465210914612f,
+        0.6240804195404053f,
+        0.37651097774505615f
     };
 
     // compare
diff --git a/train_gpt2.c b/train_gpt2.c
index 799a7a854..655ecacb3 100644
--- a/train_gpt2.c
+++ b/train_gpt2.c
@@ -352,7 +352,7 @@ void attention_backward(float* dinp, float* dpreatt, float* datt,
     // dout is (B, T, C)
     int C3 = C*3;
     int hs = C / NH; // head size
-    float scale = 1.0 / sqrtf(hs);
+    float scale = 1.f / sqrtf(hs);
 
     for (int b = 0; b < B; b++) {
         for (int t = 0; t < T; t++) {
diff --git a/train_gpt2.cu b/train_gpt2.cu
index 52d0eef49..a4617cbb5 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1638,8 +1638,8 @@ int main(int argc, char *argv[]) {
     // build an EvalLoader for HellaSwag
     EvalLoader eval_loader;
     const char* hellaswag_path = "dev/data/hellaswag/hellaswag_val.bin";
-    const char hellaswag_available = access(hellaswag_path, F_OK) == 0;
-    const char run_hellaswag = hellaswag_eval && hellaswag_available;
+    const bool hellaswag_available = access(hellaswag_path, F_OK) == 0;
+    const bool run_hellaswag = hellaswag_eval && hellaswag_available;
     if (run_hellaswag) {
         evalloader_init(&eval_loader, hellaswag_path, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes);
     }