Skip to content

Commit

Permalink
Dev/zhzhang/searchsorted fwd kernel (#19)
Browse files Browse the repository at this point in the history
* Added searchsorted forward kernel

* Added the searchsorted bwd kernel.

* Added search sorted fwd kernels and unit test

* Added pytorch custom op for searchsorted

* Added README
  • Loading branch information
zzhang37 authored Aug 15, 2023
1 parent 27e0bd3 commit abb8094
Show file tree
Hide file tree
Showing 15 changed files with 858 additions and 2 deletions.
152 changes: 152 additions & 0 deletions kernels/gaudi/searchsorted_fwd_f32.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
/**********************************************************************
Copyright (c) 2023 Habana Labs.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
// Wraps v_f32_cmp_eq_b and converts its result with from_bool64; the callers
// below store the result in a bool256 predicate.
#define bv_cmp_eq_v_v(a, b) from_bool64(v_f32_cmp_eq_b(a, b))

// searchsorted forward kernel for f32 data.
//
// ifm_seq : sequences that are scanned along dimension 1 (width)
// ifm_val : values to look up; dimension 1 enumerates the values
// ofm_idx : i32 output, written at the same coordinates the value was read from
// side    : 1 selects the "right" variant (sequence <= value comparison),
//           anything else the "left" variant (sequence < value comparison)
//
// For every value the kernel walks the whole width of ifm_seq and, for each
// position w where the comparison holds, overwrites the running index with
// w + 1, so the stored result is (last matching position + 1), or 0 if no
// position matched.
// NOTE(review): this equals the searchsorted insertion index only if the
// sequence is sorted ascending along dimension 1 - confirm with callers.
void main(
tensor ifm_seq,
tensor ifm_val,
tensor ofm_idx,
bool side
)
{
// Symbolic names for the five tensor dimensions.
const int depth = 0;
const int width = 1;
const int height = 2;
const int batch = 3;
const int fifdim = 4;

// Slice of the index space assigned to this invocation: [start, end).
const int5 index_space_start = get_index_space_offset();
const int5 index_space_end = get_index_space_size() + index_space_start;

// depth
// One index-space step along depth covers 64 elements
// (presumably the lane count of a float64/int64 vector - TODO confirm).
const int depthStep = 64;
const int depthStart = index_space_start[depth] * depthStep;
const int depthEnd = index_space_end[depth] * depthStep;

// width
// Width is not partitioned by the index space: every invocation scans the
// complete dimension 1 of ifm_seq.
const int widthStep = 1;
const int widthStart = 0;
const int widthEnd = get_dim_size(ifm_seq, 1);

// height
const int heightStep = 1;
const int heightStart = index_space_start[height] * heightStep;
const int heightEnd = index_space_end[height] * heightStep;

// batch
const int batchStep = 1;
const int batchStart = index_space_start[batch] * batchStep;
const int batchEnd = index_space_end[batch] * batchStep;

// fifdim
const int fifdimStep = 1;
const int fifdimStart = index_space_start[fifdim] * fifdimStep;
const int fifdimEnd = index_space_end[fifdim] * fifdimStep;

// value width
const int valueWidthStep = 1;
const int valueWidthStart = 0;
// Size of ifm_val along dimension 1; bounds the vw loop over the values.
const int valueWidthEnd = get_dim_size(ifm_val, 1);

// NOTE: despite its name this vector holds 0, not 1. It is the constant the
// select result is compared against to build the predicate below.
int64 one = 0;

// ifmCoords addresses ifm_seq; ofmCoords addresses both ifm_val and ofm_idx.
int5 ifmCoords = { depthStart, widthStart, heightStart, batchStart, fifdimStart };
int5 ofmCoords = { depthStart, valueWidthStart, heightStart, batchStart, fifdimStart };

// side is right
if(side == 1)
{
for (int f = fifdimStart; f < fifdimEnd; f += fifdimStep)
{
ifmCoords[fifdim] = ofmCoords[fifdim] = f;

for (int b = batchStart; b < batchEnd; b += batchStep)
{
ifmCoords[batch] = ofmCoords[batch] = b;

for (int h = heightStart; h < heightEnd; h += heightStep)
{
ifmCoords[height] = ofmCoords[height] = h;

for (int d = depthStart; d < depthEnd; d += depthStep)
{
ifmCoords[depth] = ofmCoords[depth] = d;

// One pass per value position: load the value vector, then scan the sequence.
for (int vw = valueWidthStart; vw < valueWidthEnd; vw += valueWidthStep)
{
ofmCoords[width] = vw;
float64 value = v_f32_ld_tnsr_b(ofmCoords, ifm_val);
// Result stays 0 when no sequence position satisfies the comparison.
int64 index = 0;

for (int w = widthStart; w < widthEnd; w += widthStep)
{
ifmCoords[width] = w;
float64 sequence = v_f32_ld_tnsr_b(ifmCoords, ifm_seq);

// Presumably yields 0 where sequence <= value and 1 elsewhere
// (TODO confirm against the TPC intrinsics reference).
float64 cmps = v_f32_sel_leq_f32_b(sequence, value, 0, 1);
// Predicate is set where cmps equals the zero constant held in `one`.
bool256 pred = bv_cmp_eq_v_v(cmps, (float64) one);
// Predicated move: presumably writes w+1 where pred is set and keeps the
// previous index elsewhere, so the last matching w wins.
index = v_i32_mov_vb(w+1, 0, index, to_bool64(pred),0);
}
v_i32_st_tnsr(ofmCoords, ofm_idx, index);
}
}
}
}
}
}
// side is left
// Same loop nest as above; only the comparison intrinsic differs
// (v_f32_sel_less_f32_b instead of v_f32_sel_leq_f32_b).
else
{
for (int f = fifdimStart; f < fifdimEnd; f += fifdimStep)
{
ifmCoords[fifdim] = ofmCoords[fifdim] = f;

for (int b = batchStart; b < batchEnd; b += batchStep)
{
ifmCoords[batch] = ofmCoords[batch] = b;

for (int h = heightStart; h < heightEnd; h += heightStep)
{
ifmCoords[height] = ofmCoords[height] = h;

for (int d = depthStart; d < depthEnd; d += depthStep)
{
ifmCoords[depth] = ofmCoords[depth] = d;

for (int vw = valueWidthStart; vw < valueWidthEnd; vw += valueWidthStep)
{
ofmCoords[width] = vw;
float64 value = v_f32_ld_tnsr_b(ofmCoords, ifm_val);
// Result stays 0 when no sequence position satisfies the comparison.
int64 index = 0;

for (int w = widthStart; w < widthEnd; w += widthStep)
{
ifmCoords[width] = w;
float64 sequence = v_f32_ld_tnsr_b(ifmCoords, ifm_seq);

// Presumably yields 0 where sequence < value and 1 elsewhere
// (TODO confirm against the TPC intrinsics reference).
float64 cmps = v_f32_sel_less_f32_b(sequence, value, 0, 1);
// Predicate is set where cmps equals the zero constant held in `one`.
bool256 pred = bv_cmp_eq_v_v(cmps, (float64) one);
index = v_i32_mov_vb(w+1, 0, index, to_bool64(pred),0);
}
v_i32_st_tnsr(ofmCoords, ofm_idx, index);
}
}
}
}
}
}
}
65 changes: 65 additions & 0 deletions scripts/pytorch_custom_op/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Create searchsorted custom op in PyTorch

This README provides an example of how to write custom PyTorch Ops using a TPC Kernel supported on an HPU device. For more details, refer to [PyTorch CustomOP API](https://docs.habana.ai/en/latest/PyTorch/PyTorch_CustomOp_API/page_index.html) documentation.



## Table of Contents

* [Prerequisites](#prerequisites)
* [Content](#content)
* [Build and Run with Custom Kernels](#build-and-run-with-custom-kernels)
* [Important to Know](#important-to-know)
* [Applying CustomSearchsorted to a Real Training Model Example](#applying-customsearchsorted-to-a-real-training-model-example)


## Prerequisites

- A TPC kernel on which the HpuKernel will run. To write a CustomOp, you must first define the TPC kernel that the HpuKernel will run on. This document provides the required steps for using the custom TPC kernel `searchsorted_fwd_f32` to implement CustomSearchsortedOp. For further information on how to write TPC kernels, refer to the [Habana Custom Kernel GitHub page](https://github.com/HabanaAI/Habana_Custom_Kernel).

- **habana-torch-plugin** Python package must be installed. Make sure to install by following the instructions detailed in the [Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html).

## Content

- C++ file with **custom_op::custom_searchsorted**, definition and Kernel implementation on HPU:
- `custom_searchsorted` performs searchsorted on sorted input.
- `setup.py` file for building the solution:
- To compile the Op, run ```python setup.py build```.
- Python test to run and validate `CustomSearchSorted`:
- ```python hpu_custom_op_searchsorted_test.py```

## Build and Run with Custom Kernels

To build and run `custom_searchsorted`, run the following:
```python setup.py build```

## Important to Know

To make a custom op usable in training, both the forward and the backward op usually need to be implemented. However, because the searchsorted op returns integer indices, no backward op is required at this time.

## Applying CustomSearchsorted to a Real Training Model Example

This section provides an example of applying the custom op to a real training model, NeuS.
Follow the below steps:

1. Build the `custom_searchsorted` Op with the custom kernel `searchsorted_fwd_f32` as described above.
2. If the build steps are successful, run the unit test to make sure the `custom_searchsorted` op passes.
3. Make sure to add the custom TPC kernel library to `GC_KERNEL_PATH`, i.e., `export GC_KERNEL_PATH=/your/path/to/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH`.
4. Add the `custom_searchsorted` path to `PYTHONPATH`, i.e., `export PYTHONPATH=/your/path/to/pytorch_custom_op:$PYTHONPATH`.
5. Replace `inds = torch.searchsorted(cdf, u, right=True)` with the following to train the model.
```
from custom_searchsorted import CustomSearchSorted
cdf = cdf.to('hpu').detach()
u = u.to('hpu').detach()
cdf_h=cdf.transpose(0,1).unsqueeze(0)
u_h=u.transpose(0,1).unsqueeze(0)
sop_hpu = CustomSearchSorted()
inds_h = sop_hpu(cdf_h, u_h, 1) # 1(right), 0(left)
inds = inds_h.squeeze(0).transpose(0,1)
```
8 changes: 8 additions & 0 deletions scripts/pytorch_custom_op/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
###############################################################################
# Copyright (C) 2020-2023 Habana Labs, Ltd. an Intel Company
###############################################################################

# Re-export the module wrapper so it can be imported from the package root.
# The class is defined in custom_searchsorted.py in this directory.
from .custom_searchsorted import CustomSearchSorted

# __all__ must contain the exported names as strings; putting the class object
# itself here makes `from <package> import *` raise a TypeError.
__all__ = ["CustomSearchSorted"]

33 changes: 33 additions & 0 deletions scripts/pytorch_custom_op/custom_searchsorted.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
###############################################################################
# Copyright (C) 2020-2023 Habana Labs, Ltd. an Intel Company
###############################################################################

import torch
import os
import habana_frameworks.torch.core

# Path of the compiled extension, relative to this file.
# NOTE(review): the directory and file names embed the CPython 3.8 /
# linux-x86_64 build tags, so a build made with another interpreter will not
# be found here - adjust the path to match the local build output.
custom_searchsorted_op_lib_path = "./build/lib.linux-x86_64-cpython-38/hpu_custom_searchsorted.cpython-38-x86_64-linux-gnu.so"
my_dir = os.path.realpath(__file__)
# os.path.dirname replaces slicing at the last '/': the slice silently dropped
# the final character whenever the resolved path contained no '/'.
base_dir = os.path.dirname(my_dir)
# Loading the shared library registers torch.ops.custom_op.custom_searchsorted.
torch.ops.load_library(os.path.join(base_dir, custom_searchsorted_op_lib_path))

class CustomSearchSortedFunction(torch.autograd.Function):
    """Autograd entry point for the ``custom_op::custom_searchsorted`` operator.

    Only ``forward`` is implemented; no ``backward`` is defined for this
    function.
    """

    @staticmethod
    def forward(ctx, sequence, value, side):
        """Run the custom searchsorted operator.

        Args:
            ctx: autograd context object (unused, see below).
            sequence: tensor holding the sequences to search in.
            value: tensor holding the values to look up.
            side: scalar forwarded to the operator to select the search side.

        Returns:
            The tensor produced by ``torch.ops.custom_op.custom_searchsorted``.
        """
        # Nothing is stashed on ctx: there is no backward pass that could
        # consume it, and keeping the output there only retained an extra
        # reference to the result tensor.
        return torch.ops.custom_op.custom_searchsorted(sequence, value, side)

class CustomSearchSorted(torch.nn.Module):
    """Module wrapper that dispatches to ``CustomSearchSortedFunction``."""

    def __init__(self):
        super().__init__()

    def forward(self, sequence, value, side):
        """Apply the custom searchsorted function to the given arguments."""
        result = CustomSearchSortedFunction.apply(sequence, value, side)
        return result

    def extra_repr(self):
        """Return the extra text shown in the module's printed representation."""
        description = 'CustomSearchSorted for float32 only'
        return description

31 changes: 31 additions & 0 deletions scripts/pytorch_custom_op/hpu_custom_op_searchsorted_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
###############################################################################
# Copyright (C) 2020-2023 Habana Labs, Ltd. an Intel Company
###############################################################################

import torch
from custom_searchsorted import CustomSearchSorted

def test_custom_searchsorted_op_function():
    """Compare the HPU custom searchsorted op with ``torch.searchsorted`` on CPU.

    The CPU reference works on (rows, width) tensors; the custom op is fed the
    same data transposed with a leading unit dimension, and its result is
    transformed back before the comparison.
    """
    print(torch.ops.custom_op.custom_searchsorted)
    # `sequence` instead of `input` so the builtin is not shadowed.
    sequence = torch.tensor([[1.0, 3.0, 5.0, 7.0, 9.0], [2.0, 4.0, 6.0, 8.0, 10.0]], requires_grad=True)
    value = torch.tensor([[3.0, 6.0, 9.0], [3.0, 6.0, 9.0]], requires_grad=True)

    output_cpu = torch.searchsorted(sequence, value, side='right')
    print(output_cpu)

    sequence_h = sequence.transpose(0, 1).unsqueeze(0)
    value_h = value.transpose(0, 1).unsqueeze(0)

    sequence_hpu = sequence_h.to('hpu').detach()
    value_hpu = value_h.to('hpu').detach()

    sequence_hpu.requires_grad = True
    sop_hpu = CustomSearchSorted()
    output = sop_hpu(sequence_hpu, value_hpu, 1)  # 1 selects side='right'
    output_hpu = output.squeeze(0).transpose(0, 1)
    print(output_hpu)
    # The custom op's output is registered as int32 (see
    # hpu_custom_searchsorted.cpp) while torch.searchsorted returns int64 by
    # default; align the dtypes so only the index values are compared.
    result = output_hpu.detach().cpu().to(output_cpu.dtype)
    assert torch.equal(result, output_cpu.detach())
    print("Searchsorted forward passed!!")


# Guarded so that importing the module (e.g. test collection) does not run the
# test a second time; `python hpu_custom_op_searchsorted_test.py` still runs it.
if __name__ == "__main__":
    test_custom_searchsorted_op_function()

85 changes: 85 additions & 0 deletions scripts/pytorch_custom_op/hpu_custom_searchsorted.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/******************************************************************************
###############################################################################
# Copyright (C) 2020-2023 Habana Labs, Ltd. an Intel Company
###############################################################################
*******************************************************************************/

#include "hpu_custom_op.h"
#include <torch/extension.h>
#include <perf_lib_layer_params.h>
// User parameters handed to the searchsorted kernel: `side` carries the
// search-side selector taken from the op's third argument.
struct sParam {
  int side;
};
using searchParam = sParam;

/**
 * Registers custom_op::custom_searchsorted with the Habana custom-op registry
 * and binds it to the TPC kernel GUID "searchsorted_fwd_f32".
 *
 * The op takes two tensors (sequence, value) and one scalar (side) and
 * produces a single Int tensor shaped like the value tensor.
 *
 * @return true once registration has been issued.
 */
bool register_custom_searchsorted() {
  // Input descriptors. The second field is the argument's position in the op
  // schema (0, 1, 2), matching the Habana CustomOp API examples; previously
  // all three inputs were declared with index 0.
  habana::custom_op::InputDesc input_a_desc{
      habana::custom_op::input_type::TENSOR, 0};  // sequence
  habana::custom_op::InputDesc input_b_desc{
      habana::custom_op::input_type::TENSOR, 1};  // value
  habana::custom_op::InputDesc input_c_desc{
      habana::custom_op::input_type::SCALAR, 2};  // side

  std::vector<habana::custom_op::InputDesc> inputs_desc{
      input_a_desc, input_b_desc, input_c_desc};

  // Output shape callback: the index tensor has the shape of the value
  // tensor, which is the second op argument (inputs[1]).
  auto output_size_lambda =
      [](const at::Stack& inputs) -> std::vector<int64_t> {
    return inputs[1].toTensor().sizes().vec();
  };

  // Single output at index 0 holding 32-bit integer indices.
  habana::custom_op::OutputDesc output_desc{
      0, c10::ScalarType::Int, output_size_lambda};

  std::vector<habana::custom_op::OutputDesc> outputs_desc{
      output_desc};

  // User-params callback: HPU_PARAMS_STUB provides `params` (a searchParam)
  // that is filled from the scalar `side` argument (inputs[2]).
  auto user_params_lambda = [](const at::Stack& inputs, size_t& size) {
    HPU_PARAMS_STUB(searchParam);
    params->side = inputs[2].toInt();  // 0 or 1, validated by the caller
    return params;
  };

  // Actual registration.
  REGISTER_CUSTOM_OP_ATTRIBUTES(
      "custom_op::custom_searchsorted", // schema name
      "searchsorted_fwd_f32", // guid
      inputs_desc,
      outputs_desc,
      user_params_lambda);
  std::cout << "cpp registered custom_op::custom_searchsorted\n";
  return true;
}

/**
 * HPU implementation of custom_op::custom_searchsorted.
 *
 * @param sequence Float tensor holding the sequences to search in.
 * @param value    Float tensor holding the values to look up.
 * @param side     Scalar selecting the search side; must convert to 0 or 1.
 * @return The first (and only) tensor returned by the custom op execution.
 *
 * Fails via TORCH_CHECK if a tensor is not Float, if side is neither 0 nor 1,
 * or if the op registration did not succeed.
 */
at::Tensor custom_searchsorted_execute(
    torch::Tensor sequence,
    torch::Tensor value,
    c10::Scalar side) {
  TORCH_CHECK(sequence.scalar_type() == c10::ScalarType::Float, "Input sequence expected to be Float tensor");
  TORCH_CHECK(value.scalar_type() == c10::ScalarType::Float, "Input value expected to be Float tensor");
  // Convert once and reuse; the original statement also lacked its ';'.
  const int side_value = side.to<int>();
  TORCH_CHECK(side_value == 0 || side_value == 1, "side values other than 0 or 1 are not supported");
  // Registering the custom op; the function-local static makes this happen
  // only on the first call.
  static bool registered = register_custom_searchsorted();
  TORCH_CHECK(registered, "custom_searchsorted kernel not registered" );
  std::vector<c10::IValue> inputs{sequence, value, side};
  // Get custom op descriptor from registry
  auto op_desc = habana::custom_op::HabanaCustomOpDescriptor::getCustomOpDescriptor("custom_op::custom_searchsorted");
  // Actual call for op execution
  std::vector<at::Tensor> output = op_desc.execute(inputs);
  // op_desc.execute always returns a vector; this op has a single output.
  return output[0];
}

// Declares the operator schema in the `custom_op` namespace: two tensors and a
// scalar in, one tensor out.
TORCH_LIBRARY(custom_op, m) {
m.def("custom_searchsorted(Tensor self, Tensor value, Scalar side) -> Tensor");
}
// Binds the HPU dispatch key of that operator to custom_searchsorted_execute.
TORCH_LIBRARY_IMPL(custom_op, HPU, m) {
m.impl("custom_searchsorted", custom_searchsorted_execute);
}

Loading

0 comments on commit abb8094

Please sign in to comment.