From dc837074c4ca73583c3541ea54438d7fda84fdf9 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Thu, 11 Apr 2024 11:04:42 -0700
Subject: [PATCH] Add .npy support to debug_to_file() (#8177)

* Add .npy support to halide_image_io

The .npy format is NumPy's native format for storing multidimensional arrays (aka tensors/buffers). Being able to load/save in this format makes it (potentially) a lot easier to interchange data with the Python ecosystem, as well as providing a file format that supports floating-point data more robustly than any of the others that we currently support.

This adds load/save support for a useful subset:
- We support the int/uint/float types common in Halide (except for f16/bf16 for now)
- We don't support reading or writing files that are in `fortran_order`
- We don't support any object/struct/etc files, only numeric primitives
- We only support loading files that are in the host's endianness (typically little-endian)

Note that at present this doesn't support f16 / bf16 formats, but that could likely be added with minimal difficulty.

The tricky bit of this is that the reading code has to parse a (limited) Python dict in text form. Please review that part carefully.

TODO: we could probably add this as an option for `debug_to_file()` without too much pain in a followup PR.

* clang-tidy

* clang-tidy

* Address review comments

* Allow for "keys" as well as 'keys'

* Add .npy support to debug_to_file()

Built on top of https://github.com/halide/Halide/pull/8175, this adds .npy as an option. This is actually pretty great because it's easy to do something like

```
ss = numpy.load("my_file.npy")
print(ss)
```

in Python and get nicely-formatted output, which can sometimes be a lot easier for debugging than inserting lots of print() statements (see https://github.com/halide/Halide/issues/8176)

Did a drive-by change to the correctness test to use this format instead of .mat.

* Add float16 support

* Add support for Float16 images in npy

* Assume little-endian

* Remove redundant halide_error()

* naming convention

* naming convention

* Test both mat and npy

* Don't call halide_error()

* Use old-school parser

* clang-tidy
---
 src/DebugToFile.cpp                |   4 +
 src/runtime/write_debug_image.cpp  | 140 ++++++++++++++++++++++++---
 test/correctness/debug_to_file.cpp | 147 +++++++++++++++--------------
 3 files changed, 207 insertions(+), 84 deletions(-)

diff --git a/src/DebugToFile.cpp b/src/DebugToFile.cpp
index 8147e4cfe7f1..8510b806a132 100644
--- a/src/DebugToFile.cpp
+++ b/src/DebugToFile.cpp
@@ -42,6 +42,8 @@ class DebugToFile : public IRMutator {
                 num_elements *= bound.extent;
             }
 
+            // TODO: why do we bother with this? halide_debug_to_file()
+            // can infer the type-and-size it needs from the buffer's type field.
             int type_code = 0;
             Type t = op->types[0];
             if (t == Float(32)) {
@@ -64,6 +66,8 @@ class DebugToFile : public IRMutator {
                 type_code = 8;
             } else if (t == Int(64)) {
                 type_code = 9;
+            } else if (t == Float(16)) {
+                type_code = 10;
             } else {
                 user_error << "Type " << t << " not supported for debug_to_file\n";
             }
diff --git a/src/runtime/write_debug_image.cpp b/src/runtime/write_debug_image.cpp
index f51017c1fbb4..a5f8816db2c7 100644
--- a/src/runtime/write_debug_image.cpp
+++ b/src/runtime/write_debug_image.cpp
@@ -1,13 +1,16 @@
 #include "HalideRuntime.h"
 
-// We support three formats, tiff, mat, and tmp.
+// We support four formats, npy, tiff, mat, and tmp.
 //
 // All formats support arbitrary types, and are easy to write in a
 // small amount of code.
 //
+// npy:
+// - Arbitrary dimensionality, type
+// - Readable by NumPy and other Python tools
 // TIFF:
 // - 2/3-D only
-// - Readable by the most tools
+// - Readable by a lot of tools
 // mat:
 // - Arbitrary dimensionality, type
 // - Readable by matlab, ImageStack, and many other tools
@@ -26,20 +29,22 @@ namespace Internal {
 // Mappings from the type_code passed in to the type codes of the
 // formats. See "type_code" in DebugToFile.cpp
 
+constexpr int kNumTypeCodes = 11;
+
 // TIFF sample type values are:
 //     1 => Unsigned int
 //     2 => Signed int
 //     3 => Floating-point
-WEAK int16_t pixel_type_to_tiff_sample_type[] = {
+WEAK int16_t pixel_type_to_tiff_sample_type[kNumTypeCodes] = {
     // float, double, uint8, int8, ... uint64, int64
-    3, 3, 1, 2, 1, 2, 1, 2, 1, 2};
+    3, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0};
 
 // See the .mat level 5 documentation for matlab class codes.
-WEAK uint8_t pixel_type_to_matlab_class_code[] = {
-    7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+WEAK uint8_t pixel_type_to_matlab_class_code[kNumTypeCodes] = {
+    7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0};
 
-WEAK uint8_t pixel_type_to_matlab_type_code[] = {
-    7, 9, 2, 1, 4, 3, 6, 5, 13, 12};
+WEAK uint8_t pixel_type_to_matlab_type_code[kNumTypeCodes] = {
+    7, 9, 2, 1, 4, 3, 6, 5, 13, 12, 0};
 
 #pragma pack(push)
 #pragma pack(2)
@@ -125,6 +130,39 @@ struct ScopedFile {
     }
 };
 
+// Halide runtime has lots of assumptions that we are always little-endian,
+// so we'll hardcode this here; leaving in the logic to make it clear.
+constexpr bool host_is_big_endian = false;
+constexpr char little_endian_char = '<';
+constexpr char big_endian_char = '>';
+constexpr char no_endian_char = '|';
+constexpr char host_endian_char = (host_is_big_endian ? big_endian_char : little_endian_char);
+
+struct npy_dtype_info_t {
+    char byte_order;
+    char kind;
+    size_t item_size;
+};
+
+struct htype_to_dtype {
+    halide_type_t htype;
+    npy_dtype_info_t dtype;
+};
+
+WEAK htype_to_dtype npy_dtypes[] = {
+    {halide_type_t(halide_type_float, 16), {host_endian_char, 'f', 2}},
+    {halide_type_of<float>(), {host_endian_char, 'f', sizeof(float)}},
+    {halide_type_of<double>(), {host_endian_char, 'f', sizeof(double)}},
+    {halide_type_of<int8_t>(), {no_endian_char, 'i', sizeof(int8_t)}},
+    {halide_type_of<int16_t>(), {host_endian_char, 'i', sizeof(int16_t)}},
+    {halide_type_of<int32_t>(), {host_endian_char, 'i', sizeof(int32_t)}},
+    {halide_type_of<int64_t>(), {host_endian_char, 'i', sizeof(int64_t)}},
+    {halide_type_of<uint8_t>(), {no_endian_char, 'u', sizeof(uint8_t)}},
+    {halide_type_of<uint16_t>(), {host_endian_char, 'u', sizeof(uint16_t)}},
+    {halide_type_of<uint32_t>(), {host_endian_char, 'u', sizeof(uint32_t)}},
+    {halide_type_of<uint64_t>(), {host_endian_char, 'u', sizeof(uint64_t)}},
+};
+
 }  // namespace Internal
 }  // namespace Runtime
 }  // namespace Halide
@@ -142,11 +180,15 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam
         return halide_error_code_bad_dimensions;
     }
 
-    if (auto result = halide_copy_to_host(user_context, buf);
-        result != halide_error_code_success) {
+    if (auto result = halide_copy_to_host(user_context, buf); result != halide_error_code_success) {
+        // halide_error() has already been called
         return result;
     }
 
+    // Note: all calls to this function are wrapped in an assert that identifies
+    // the function that failed, so calling halide_error() anywhere after this is redundant
+    // and actually unhelpful.
+
     ScopedFile f(filename, "wb");
     if (!f.open()) {
         return halide_error_code_debug_to_file_failed;
@@ -167,7 +209,73 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam
 
     uint32_t final_padding_bytes = 0;
 
-    if (ends_with(filename, ".tiff") || ends_with(filename, ".tif")) {
+    if (ends_with(filename, ".npy")) {
+        npy_dtype_info_t di = {0, 0, 0};
+        for (const auto &d : npy_dtypes) {
+            if (d.htype == buf->type) {
+                di = d.dtype;
+                break;
+            }
+        }
+        if (di.byte_order == 0) {
+            return halide_error_code_debug_to_file_failed;
+        }
+
+        constexpr int max_dict_string_size = 1024;
+        char dict_string_buf[max_dict_string_size];
+        char *dst = dict_string_buf;
+        char *end = dict_string_buf + max_dict_string_size - 1;
+
+        dst = halide_string_to_string(dst, end, "{'descr': '");
+        *dst++ = di.byte_order;
+        *dst++ = di.kind;
+        dst = halide_int64_to_string(dst, end, di.item_size, 1);
+        dst = halide_string_to_string(dst, end, "', 'fortran_order': False, 'shape': (");
+        for (int d = 0; d < buf->dimensions; ++d) {
+            if (d > 0) {
+                dst = halide_string_to_string(dst, end, ",");
+            }
+            dst = halide_int64_to_string(dst, end, buf->dim[d].extent, 1);
+            if (buf->dimensions == 1) {
+                dst = halide_string_to_string(dst, end, ",");  // special-case for single-element tuples
+            }
+        }
+        dst = halide_string_to_string(dst, end, ")}\n");
+        if (dst >= end) {
+            // bloody unlikely, but just in case
+            return halide_error_code_debug_to_file_failed;
+        }
+
+        const char *npy_magic_string_and_version = "\x93NUMPY\x01\x00";
+
+        const size_t unpadded_length = 8 + 2 + (dst - dict_string_buf);
+        const size_t padded_length = (unpadded_length + 64 - 1) & ~(64 - 1);
+        const size_t padding = padded_length - unpadded_length;
+        memset(dst, ' ', padding);
+        dst += padding;
+
+        const size_t header_len = dst - dict_string_buf;
+        if (header_len > 65535) {
+            return halide_error_code_debug_to_file_failed;
+        }
+        const uint8_t header_len_le[2] = {
+            (uint8_t)((header_len >> 0) & 0xff),
+            (uint8_t)((header_len >> 8) & 0xff)};
+
+        if (!f.write(npy_magic_string_and_version, 8)) {
+            return halide_error_code_debug_to_file_failed;
+        }
+        if (!f.write(header_len_le, 2)) {
+            return halide_error_code_debug_to_file_failed;
+        }
+        if (!f.write(dict_string_buf, dst - dict_string_buf)) {
+            return halide_error_code_debug_to_file_failed;
+        }
+    } else if (ends_with(filename, ".tiff") || ends_with(filename, ".tif")) {
+        if (type_code == 10) {
+            return halide_error_code_debug_to_file_failed;
+        }
+
         int32_t channels;
         int32_t width = shape[0].extent;
         int32_t height = shape[1].extent;
@@ -243,6 +351,10 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam
             }
         }
     } else if (ends_with(filename, ".mat")) {
+        if (type_code == 10) {
+            return halide_error_code_debug_to_file_failed;
+        }
+
         // Construct a name for the array from the filename
         const char *end = filename;
         while (*end) {
@@ -279,7 +391,6 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam
         // level 5 .mat files have a size limit. (Padding itself should never cause the overflow.
         // Code written this way for safety.)
         if (((uint64_t)payload_bytes + final_padding_bytes) >> 32) {
-            halide_error(user_context, "Can't debug_to_file to a .mat file greater than 4GB\n");
             return halide_error_code_debug_to_file_failed;
         }
 
@@ -325,6 +436,10 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam
             return halide_error_code_debug_to_file_failed;
         }
     } else {
+        if (type_code == 10) {
+            return halide_error_code_debug_to_file_failed;
+        }
+
         int32_t header[] = {shape[0].extent,
                             shape[1].extent,
                             shape[2].extent,
@@ -370,7 +485,6 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam
     const uint64_t zero = 0;
     if (final_padding_bytes) {
         if (final_padding_bytes > sizeof(zero)) {
-            halide_error(user_context, "Unexpectedly large final_padding_bytes");
             return halide_error_code_debug_to_file_failed;
         }
         if (!f.write(&zero, final_padding_bytes)) {
diff --git a/test/correctness/debug_to_file.cpp b/test/correctness/debug_to_file.cpp
index 2b0aee28e8c0..780428c3389f 100644
--- a/test/correctness/debug_to_file.cpp
+++ b/test/correctness/debug_to_file.cpp
@@ -15,88 +15,93 @@ int main(int argc, char **argv) {
         return 0;
     }
 
-    std::string f_mat = Internal::get_test_tmp_dir() + "f.mat";
-    std::string g_mat = Internal::get_test_tmp_dir() + "g.mat";
-    std::string h_mat = Internal::get_test_tmp_dir() + "h.mat";
-
-    Internal::ensure_no_file_exists(f_mat);
-    Internal::ensure_no_file_exists(g_mat);
-    Internal::ensure_no_file_exists(h_mat);
-
-    {
-        Func f, g, h, j;
-        Var x, y, z;
-        f(x, y, z) = cast<int32_t>(x + y + z);
-        g(x, y) = cast<float>(f(x, y, 0) + f(x + 1, y, 1));
-        h(x, y) = cast<int32_t>(f(x, y, -1) + g(x, y));
-
-        Target target = get_jit_target_from_environment();
-        if (target.has_gpu_feature()) {
-            Var xi, yi;
-            f.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(f_mat);
-            g.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(g_mat);
-            h.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(h_mat);
-        } else {
-            f.compute_root().debug_to_file(f_mat);
-            g.compute_root().debug_to_file(g_mat);
-            h.compute_root().debug_to_file(h_mat);
-        }
+    std::vector<std::string> formats = {"npy", "mat"};
+    for (const auto &format : formats) {
+        std::cout << "Testing format " << format << "...\n";
+
+        std::string f_path = Internal::get_test_tmp_dir() + "f." + format;
+        std::string g_path = Internal::get_test_tmp_dir() + "g." + format;
+        std::string h_path = Internal::get_test_tmp_dir() + "h." + format;
+
+        Internal::ensure_no_file_exists(f_path);
+        Internal::ensure_no_file_exists(g_path);
+        Internal::ensure_no_file_exists(h_path);
+
+        {
+            Func f, g, h, j;
+            Var x, y, z;
+            f(x, y, z) = cast<int32_t>(x + y + z);
+            g(x, y) = cast<float>(f(x, y, 0) + f(x + 1, y, 1));
+            h(x, y) = cast<int32_t>(f(x, y, -1) + g(x, y));
+
+            Target target = get_jit_target_from_environment();
+            if (target.has_gpu_feature()) {
+                Var xi, yi;
+                f.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(f_path);
+                g.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(g_path);
+                h.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(h_path);
+            } else {
+                f.compute_root().debug_to_file(f_path);
+                g.compute_root().debug_to_file(g_path);
+                h.compute_root().debug_to_file(h_path);
+            }
 
-        Buffer<int32_t> im = h.realize({10, 10}, target);
-    }
+            Buffer<int32_t> im = h.realize({10, 10}, target);
+        }
 
-    {
-        Internal::assert_file_exists(f_mat);
-        Internal::assert_file_exists(g_mat);
-        Internal::assert_file_exists(h_mat);
+        {
+            Internal::assert_file_exists(f_path);
+            Internal::assert_file_exists(g_path);
+            Internal::assert_file_exists(h_path);
+
+            Buffer<int32_t> f = Tools::load_image(f_path);
+            assert(f.dimensions() == 3 &&
+                   f.dim(0).extent() == 11 &&
+                   f.dim(1).extent() == 10 &&
+                   f.dim(2).extent() == 3);
+
+            for (int z = 0; z < 3; z++) {
+                for (int y = 0; y < 10; y++) {
+                    for (int x = 0; x < 11; x++) {
+                        int32_t val = f(x, y, z);
+                        // The min coord gets lost on debug_to_file, so f should be shifted up by one.
+                        if (val != x + y + z - 1) {
+                            printf("f(%d, %d, %d) = %d instead of %d\n", x, y, z, val, x + y);
+                            return 1;
+                        }
+                    }
+                }
+            }
 
-        Buffer<int32_t> f = Tools::load_image(f_mat);
-        assert(f.dimensions() == 3 &&
-               f.dim(0).extent() == 11 &&
-               f.dim(1).extent() == 10 &&
-               f.dim(2).extent() == 3);
+            Buffer<float> g = Tools::load_image(g_path);
+            assert(g.dimensions() == 2 &&
+                   g.dim(0).extent() == 10 &&
+                   g.dim(1).extent() == 10);
 
-        for (int z = 0; z < 3; z++) {
             for (int y = 0; y < 10; y++) {
-                for (int x = 0; x < 11; x++) {
-                    int32_t val = f(x, y, z);
-                    // The min coord gets lost on debug_to_file, so f should be shifted up by one.
-                    if (val != x + y + z - 1) {
-                        printf("f(%d, %d, %d) = %d instead of %d\n", x, y, z, val, x + y);
+                for (int x = 0; x < 10; x++) {
+                    float val = g(x, y);
+                    float correct = (float)(f(x, y, 1) + f(x + 1, y, 2));
+                    if (val != correct) {
+                        printf("g(%d, %d) = %f instead of %f\n", x, y, val, correct);
                         return 1;
                     }
                 }
             }
-        }
 
-        Buffer<float> g = Tools::load_image(g_mat);
-        assert(g.dimensions() == 2 &&
-               g.dim(0).extent() == 10 &&
-               g.dim(1).extent() == 10);
-
-        for (int y = 0; y < 10; y++) {
-            for (int x = 0; x < 10; x++) {
-                float val = g(x, y);
-                float correct = (float)(f(x, y, 1) + f(x + 1, y, 2));
-                if (val != correct) {
-                    printf("g(%d, %d) = %f instead of %f\n", x, y, val, correct);
-                    return 1;
-                }
-            }
-        }
+            Buffer<int32_t> h = Tools::load_image(h_path);
+            assert(h.dimensions() == 2 &&
+                   h.dim(0).extent() == 10 &&
+                   h.dim(1).extent() == 10);
 
-        Buffer<int32_t> h = Tools::load_image(h_mat);
-        assert(h.dimensions() == 2 &&
-               h.dim(0).extent() == 10 &&
-               h.dim(1).extent() == 10);
-
-        for (int y = 0; y < 10; y++) {
-            for (int x = 0; x < 10; x++) {
-                int32_t val = h(x, y);
-                int32_t correct = f(x, y, 0) + g(x, y);
-                if (val != correct) {
-                    printf("h(%d, %d) = %d instead of %d\n", x, y, val, correct);
-                    return 1;
+            for (int y = 0; y < 10; y++) {
+                for (int x = 0; x < 10; x++) {
+                    int32_t val = h(x, y);
+                    int32_t correct = f(x, y, 0) + g(x, y);
+                    if (val != correct) {
+                        printf("h(%d, %d) = %d instead of %d\n", x, y, val, correct);
+                        return 1;
+                    }
                 }
             }
         }