Dev/zhzhang/searchsorted fwd kernel (#19)

* Added searchsorted forward kernel * Added the searchsorted bwd kernel. * Added search sorted fwd kernels and unit test * Added pytorch custom op for searchsorted * Added README
HabanaAI · Aug 15, 2023 · abb8094 · abb8094
1 parent 27e0bd3
commit abb8094
Show file tree

Hide file tree

Showing 15 changed files with 858 additions and 2 deletions.
diff --git a/kernels/gaudi/searchsorted_fwd_f32.c b/kernels/gaudi/searchsorted_fwd_f32.c
@@ -0,0 +1,152 @@
+/**********************************************************************
+Copyright (c) 2023 Habana Labs.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+*   Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+*   Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+#define bv_cmp_eq_v_v(a, b) from_bool64(v_f32_cmp_eq_b(a, b))
+/*
+ * bv_cmp_eq_v_v(a, b): shorthand for from_bool64(v_f32_cmp_eq_b(a, b)); main()
+ * uses it to turn a per-lane float equality test into a bool256 predicate.
+ *
+ * main: searchsorted forward kernel for float32 data.
+ *   ifm_seq - sequences to search; the scan runs along dimension 1 (width).
+ *   ifm_val - values to locate; dimension 1 enumerates the values per sequence.
+ *   ofm_idx - output indices (stored with v_i32_st_tnsr), written at the same
+ *             coordinates the corresponding value was loaded from.
+ *   side    - 1 takes the "right" branch (uses sel_leq), any other value the
+ *             "left" branch (uses sel_less).
+ * For every value the kernel walks the whole width of ifm_seq and keeps w+1
+ * for the last position w whose comparison matched (0 if none matched).
+ * NOTE(review): that equals the insertion index only when the sequence is
+ * sorted ascending along width, and the per-line notes below assume the usual
+ * TPC meaning of the sel_* (select on compare) and mov_vb (predicated move)
+ * intrinsics - confirm against the TPC intrinsics reference.
+ */
+void main(
+    tensor ifm_seq,
+    tensor ifm_val,
+    tensor ofm_idx,
+    bool   side
+)
+{
+    const int depth  = 0;
+    const int width  = 1;
+    const int height = 2;
+    const int batch  = 3;
+    const int fifdim  = 4;
+    // Slice of the index space handled by this invocation: [start, end) per dimension.
+    const int5 index_space_start = get_index_space_offset();
+    const int5 index_space_end = get_index_space_size() + index_space_start;
+
+    // depth: one index-space unit spans depthStep (64) elements of dim 0, the stride of the d loops below
+    const int depthStep  = 64;
+    const int depthStart = index_space_start[depth] * depthStep;
+    const int depthEnd   = index_space_end[depth] * depthStep;
+
+    // width: the searched dimension; not taken from the index space - every invocation scans all of dim 1 of ifm_seq
+    const int widthStep  = 1;
+    const int widthStart = 0;
+    const int widthEnd   = get_dim_size(ifm_seq, 1);
+
+    // height
+    const int heightStep  = 1;
+    const int heightStart = index_space_start[height] * heightStep;
+    const int heightEnd   = index_space_end[height]   * heightStep;
+
+    // batch
+    const int batchStep  = 1;
+    const int batchStart = index_space_start[batch] * batchStep;
+    const int batchEnd   = index_space_end[batch]   * batchStep;
+
+    // fifdim
+    const int fifdimStep  = 1;
+    const int fifdimStart = index_space_start[fifdim] * fifdimStep;
+    const int fifdimEnd   = index_space_end[fifdim]   * fifdimStep;
+
+    // value width: dim 1 of ifm_val, also walked in full by every invocation
+    const int valueWidthStep  = 1;
+    const int valueWidthStart  = 0;
+    // Size of dimension 1 of ifm_val, i.e. how many values are looked up per sequence
+    const int valueWidthEnd   = get_dim_size(ifm_val, 1);
+
+    int64 one = 0; // NOTE: despite its name this holds 0 - the marker the sel_* calls below emit for matching lanes
+    // ifmCoords addresses ifm_seq; ofmCoords addresses both ifm_val (load) and ofm_idx (store).
+    int5 ifmCoords = { depthStart, widthStart, heightStart, batchStart, fifdimStart };
+    int5 ofmCoords = { depthStart, valueWidthStart, heightStart, batchStart, fifdimStart };
+
+    // side is right: positions with sequence <= value count as matches
+    if(side == 1)
+    {
+        for (int f = fifdimStart; f < fifdimEnd; f += fifdimStep)
+        {
+            ifmCoords[fifdim] = ofmCoords[fifdim] = f;
+
+            for (int b = batchStart; b < batchEnd; b += batchStep)
+            {
+                ifmCoords[batch] = ofmCoords[batch] = b;
+
+                for (int h = heightStart; h < heightEnd; h += heightStep)
+                {
+                    ifmCoords[height] = ofmCoords[height] = h;
+
+                    for (int d = depthStart; d < depthEnd; d += depthStep)
+                    {
+                        ifmCoords[depth] = ofmCoords[depth] = d;
+
+                        for (int vw = valueWidthStart; vw < valueWidthEnd; vw += valueWidthStep)
+                        {
+                            ofmCoords[width] = vw;
+                            float64 value = v_f32_ld_tnsr_b(ofmCoords, ifm_val);
+                            int64 index = 0;
+                            // Linear scan over the full sequence width; index ends up as w+1 of the last match.
+                            for (int w = widthStart; w < widthEnd; w += widthStep)    
+                            {
+                                ifmCoords[width] = w;
+                                float64 sequence = v_f32_ld_tnsr_b(ifmCoords, ifm_seq);
+                                /* cmps:  0 in lanes where sequence <= value, 1 elsewhere (presumed sel_leq semantics).
+                                 * pred:  lanes where cmps equals `one` (which is 0), i.e. where the comparison held.
+                                 * index: overwritten with w+1 in those lanes, left untouched in the others. */
+                                float64 cmps = v_f32_sel_leq_f32_b(sequence, value, 0, 1);
+                                bool256 pred = bv_cmp_eq_v_v(cmps, (float64) one);
+                                index = v_i32_mov_vb(w+1, 0, index, to_bool64(pred),0);
+                            }
+                            v_i32_st_tnsr(ofmCoords, ofm_idx, index);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // side is left: only positions with sequence < value count as matches
+    else
+    {
+        for (int f = fifdimStart; f < fifdimEnd; f += fifdimStep)
+        {
+            ifmCoords[fifdim] = ofmCoords[fifdim] = f;
+
+            for (int b = batchStart; b < batchEnd; b += batchStep)
+            {
+                ifmCoords[batch] = ofmCoords[batch] = b;
+
+                for (int h = heightStart; h < heightEnd; h += heightStep)
+                {
+                    ifmCoords[height] = ofmCoords[height] = h;
+
+                    for (int d = depthStart; d < depthEnd; d += depthStep)
+                    {
+                        ifmCoords[depth] = ofmCoords[depth] = d;
+
+                        for (int vw = valueWidthStart; vw < valueWidthEnd; vw += valueWidthStep)
+                        {
+                            ofmCoords[width] = vw;
+                            float64 value = v_f32_ld_tnsr_b(ofmCoords, ifm_val);
+                            int64 index = 0;
+                            // Same scan as the right-side branch; only the comparison differs.
+                            for (int w = widthStart; w < widthEnd; w += widthStep)    
+                            {
+                                ifmCoords[width] = w;
+                                float64 sequence = v_f32_ld_tnsr_b(ifmCoords, ifm_seq);
+                                /* cmps:  0 in lanes where sequence < value, 1 elsewhere (presumed sel_less semantics).
+                                 * pred:  lanes where cmps equals `one` (which is 0), i.e. where the comparison held.
+                                 * index: overwritten with w+1 in those lanes, left untouched in the others. */
+                                float64 cmps = v_f32_sel_less_f32_b(sequence, value, 0, 1);
+                                bool256 pred = bv_cmp_eq_v_v(cmps, (float64) one);
+                                index = v_i32_mov_vb(w+1, 0, index, to_bool64(pred),0);
+                            }
+                            v_i32_st_tnsr(ofmCoords, ofm_idx, index);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/scripts/pytorch_custom_op/README.md b/scripts/pytorch_custom_op/README.md
@@ -0,0 +1,65 @@
+# Create searchsorted custom op in PyTorch
+
+This README provides an example of how to write custom PyTorch Ops using a TPC Kernel supported on an HPU device. For more details, refer to [PyTorch CustomOP API](https://docs.habana.ai/en/latest/PyTorch/PyTorch_CustomOp_API/page_index.html) documentation. 
+
+
+
+## Table of Contents
+
+* [Prerequisites](#prerequisites)
+* [Content](#content)
+* [Build and Run with Custom Kernels](#build-and-run-with-custom-kernels)
+* [Important to Know](#important-to-know)
+* [Applying CustomSearchsorted to a Real Training Model Example](#applying-customsearchsorted-to-a-real-training-model-example)
+
+
+## Prerequisites
+
+- A TPC kernel on which the HpuKernel will run. To write a CustomOp, you must first define the TPC kernel that the HpuKernel will run on. This document provides the required steps for using the custom TPC kernel `searchsorted_fwd_f32` to implement CustomSearchsortedOp. For further information on how to write TPC kernels, refer to the [Habana Custom Kernel GitHub page](https://github.com/HabanaAI/Habana_Custom_Kernel).
+
+- **habana-torch-plugin** Python package must be installed. Make sure to install by following the instructions detailed in the [Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html).
+
+## Content
+
+- C++ file with the **custom_op::custom_searchsorted** definition and kernel implementation on HPU:
+    - `custom_searchsorted` performs searchsorted on sorted input.
+- `setup.py` file for building the solution:
+    - To compile the Op, run ```python setup.py build```.
+- Python test to run and validate `CustomSearchSorted`:
+    - ```python hpu_custom_op_searchsorted_test.py```
+
+## Build and Run with Custom Kernels 
+
+To build and run `custom_searchsorted`, run the following: 
+```python setup.py build```
+
+## Important to Know
+
+To use a custom op in the training process, both the forward and the backward op usually have to be implemented. Because the searchsorted op returns integer indices, which are not differentiable, no backward op is required at this time.
+
+## Applying CustomSearchsorted to a Real Training Model Example
+
+This section provides an example for applying CustomOps to a real training model NeuS. 
+Follow the below steps:
+
+1. Build the `custom_searchsorted` Op with the custom kernel `searchsorted_fwd_f32` as described above. 
+2. If the build steps are successful, run the unit test to make sure the `custom_searchsorted` op passes.
+3. Make sure to add the custom TPC kernel library to `GC_KERNEL_PATH`, i.e., `export GC_KERNEL_PATH=/your/path/to/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH`.
+4. Add the custom_searchsorted path to `PYTHONPATH`, i.e., `export PYTHONPATH=/your/path/to/pytorch_custom_op:$PYTHONPATH`.
+5. Replace `inds = torch.searchsorted(cdf, u, right=True)` with the following to train the model.
+    ```
+    from custom_searchsorted import CustomSearchSorted
+
+    cdf = cdf.to('hpu').detach()
+    u = u.to('hpu').detach()
+    cdf_h=cdf.transpose(0,1).unsqueeze(0)
+    u_h=u.transpose(0,1).unsqueeze(0)
+
+    sop_hpu = CustomSearchSorted()
+
+    inds_h = sop_hpu(cdf_h, u_h, 1) # 1(right), 0(left)
+    inds = inds_h.squeeze(0).transpose(0,1)
+    ```
+
+
+
diff --git a/scripts/pytorch_custom_op/__init__.py b/scripts/pytorch_custom_op/__init__.py
@@ -0,0 +1,8 @@
+###############################################################################
+# Copyright (C) 2020-2023 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+from .custom_tpc import CustomSearchSorted
+
+__all__ = [CustomSearchSorted]
+
diff --git a/scripts/pytorch_custom_op/custom_searchsorted.py b/scripts/pytorch_custom_op/custom_searchsorted.py
@@ -0,0 +1,33 @@
+###############################################################################
+# Copyright (C) 2020-2023 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+import torch
+import os
+import habana_frameworks.torch.core
+
+custom_searchsorted_op_lib_path = "./build/lib.linux-x86_64-cpython-38/hpu_custom_searchsorted.cpython-38-x86_64-linux-gnu.so"
+my_dir = os.path.realpath(__file__)
+my_len = my_dir.rfind('/')
+base_dir = my_dir[:my_len]
+torch.ops.load_library(os.path.join(base_dir, custom_searchsorted_op_lib_path))
+
+class CustomSearchSortedFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, sequence, value, side):
+        # ctx is a context object that can be used to stash information
+        # for backward computation
+        tensor = torch.ops.custom_op.custom_searchsorted(sequence, value, side)
+        ctx.tensor = tensor
+        return tensor
+
+class CustomSearchSorted(torch.nn.Module):
+    def __init__(self):
+        super(CustomSearchSorted, self).__init__()
+
+    def forward(self, sequence, value, side):
+        return CustomSearchSortedFunction.apply(sequence, value, side)
+
+    def extra_repr(self):
+        return 'CustomSearchSorted for float32 only'
+
diff --git a/scripts/pytorch_custom_op/hpu_custom_op_searchsorted_test.py b/scripts/pytorch_custom_op/hpu_custom_op_searchsorted_test.py
@@ -0,0 +1,31 @@
+###############################################################################
+# Copyright (C) 2020-2023 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+import torch
+from custom_searchsorted import CustomSearchSorted
+
+def test_custom_searchsorted_op_function():
+    print(torch.ops.custom_op.custom_searchsorted)
+    input = torch.tensor([[1.0, 3.0, 5.0, 7.0, 9.0], [2.0, 4.0, 6.0, 8.0, 10.0]], requires_grad=True)
+    value = torch.tensor([[3.0, 6.0, 9.0], [3.0, 6.0, 9.0]], requires_grad=True)
+
+    output_cpu = torch.searchsorted(input, value, side='right')
+    print(output_cpu)
+
+    input_h=input.transpose(0,1).unsqueeze(0)
+    value_h=value.transpose(0,1).unsqueeze(0)
+
+    input_hpu = input_h.to('hpu').detach()
+    value_hpu = value_h.to('hpu').detach()
+
+    input_hpu.requires_grad = True
+    sop_hpu = CustomSearchSorted()
+    output = sop_hpu(input_hpu, value_hpu, 1)
+    output_hpu = output.squeeze(0).transpose(0,1)
+    print(output_hpu)
+    assert(torch.equal(output_hpu.detach().cpu(), output_cpu.detach()))
+    print("Searchsorted forward passed!!")
+
+test_custom_searchsorted_op_function()
+
diff --git a/scripts/pytorch_custom_op/hpu_custom_searchsorted.cpp b/scripts/pytorch_custom_op/hpu_custom_searchsorted.cpp
@@ -0,0 +1,85 @@
+/******************************************************************************
+###############################################################################
+# Copyright (C) 2020-2023 Habana Labs, Ltd. an Intel Company
+###############################################################################
+*******************************************************************************/
+
+#include "hpu_custom_op.h"
+#include <torch/extension.h>
+#include <perf_lib_layer_params.h>
+typedef struct sParam{ // user-parameter block filled by the user-params callback of register_custom_searchsorted()
+    int side; // 1 = "right" search, 0 = "left" search; NOTE(review): the kernel shown in this change declares `bool side` - confirm the layouts agree
+}searchParam;
+
+/**
+ * Registers custom_op::custom_searchsorted and ties it to the TPC kernel
+ * guid "searchsorted_fwd_f32".
+ *
+ * Describes the three schema inputs (sequence tensor, value tensor, side
+ * scalar), the single Int output shaped like the value tensor, and the
+ * callback that packs `side` into a searchParam block.
+ *
+ * @return always true, so the result can initialise a function-local static.
+ */
+bool register_custom_searchsorted() {
+    // Registering custom_op::custom_searchsorted
+    // inputs desc: {kind, position of the argument in the op schema}
+    habana::custom_op::InputDesc input_a_desc{
+        habana::custom_op::input_type::TENSOR, 0}; // sequence
+    habana::custom_op::InputDesc input_b_desc{
+        habana::custom_op::input_type::TENSOR, 1}; // value
+    habana::custom_op::InputDesc input_c_desc{
+        habana::custom_op::input_type::SCALAR, 2}; // side
+
+    std::vector<habana::custom_op::InputDesc> inputs_desc{
+        input_a_desc, input_b_desc, input_c_desc};
+
+    // output desc
+    // output shape callback: one index per searched value, so the output
+    // takes the shape of the value tensor (schema input 1)
+    auto output_size_lambda =
+        [](const at::Stack& inputs) -> std::vector<int64_t> {
+      auto value = inputs[1].toTensor();
+      return value.sizes().vec();
+    };
+
+    habana::custom_op::OutputDesc output_desc{
+        0, c10::ScalarType::Int, output_size_lambda};
+
+    std::vector<habana::custom_op::OutputDesc> outputs_desc{
+        output_desc};
+
+    // user param callback: HPU_PARAMS_STUB declares `params` (a searchParam)
+    // and relies on the callback parameter being named `size`
+    auto user_params_lambda = [](const at::Stack& inputs, size_t& size) {
+      HPU_PARAMS_STUB(searchParam);
+      params->side = inputs[2].toInt(); // side flag: 1 = right, 0 = left
+      return params;
+    };
+
+    // actual register
+    REGISTER_CUSTOM_OP_ATTRIBUTES(
+        "custom_op::custom_searchsorted", //schema name
+        "searchsorted_fwd_f32", // guid
+        inputs_desc,
+        outputs_desc,
+        user_params_lambda);
+    std::cout << "cpp registered custom_op::custom_searchsorted\n";
+    return true;
+}
+
+/**
+ * HPU implementation of custom_op::custom_searchsorted.
+ *
+ * Validates the arguments, registers the op descriptor on first use and runs
+ * the op through the Habana custom-op registry.
+ *
+ * @param sequence Float tensor holding the sequences to search.
+ * @param value    Float tensor holding the values to locate.
+ * @param side     0 or 1; any other value is rejected.
+ * @return the first (and only) output tensor produced by the op.
+ * @throws c10::Error (via TORCH_CHECK) when a precondition is violated.
+ */
+at::Tensor custom_searchsorted_execute(
+    torch::Tensor sequence,
+    torch::Tensor value,
+    c10::Scalar side) {
+  TORCH_CHECK(sequence.scalar_type() == c10::ScalarType::Float, "Input sequence expected to be Float tensor");
+  TORCH_CHECK(value.scalar_type() == c10::ScalarType::Float, "Input value expected to be Float tensor");
+  // Convert once and reuse the result for both comparisons.
+  const int side_flag = side.to<int>();
+  TORCH_CHECK(side_flag == 0 || side_flag == 1, "side values other than 0 or 1 are not supported");
+  // Registering the custom op, need to be called only once
+  static bool registered = register_custom_searchsorted();
+  TORCH_CHECK(registered, "custom_searchsorted kernel not registered" );
+  std::vector<c10::IValue> inputs{sequence, value, side};
+  // Get custom op descriptor from registry
+  auto op_desc = habana::custom_op::HabanaCustomOpDescriptor::getCustomOpDescriptor("custom_op::custom_searchsorted");
+  // Actual call for op execution
+  std::vector<at::Tensor> output = op_desc.execute(inputs);
+  // op_desc.execute will always return a vector
+  return output[0];
+}
+
+TORCH_LIBRARY(custom_op, m) { // declares the schema of custom_op::custom_searchsorted
+  m.def("custom_searchsorted(Tensor self, Tensor value, Scalar side) -> Tensor"); // `self` is received as `sequence` by custom_searchsorted_execute
+}
+TORCH_LIBRARY_IMPL(custom_op, HPU, m) { // implementation registered for the HPU dispatch key
+  m.impl("custom_searchsorted", custom_searchsorted_execute);
+}
+