Merge remote-tracking branch 'upstream/main' into parallel-matmul
chentong319 committed Feb 14, 2024
2 parents 74d14e5 + ffbdb25 commit 339d15e
Showing 17 changed files with 805 additions and 634 deletions.
30 changes: 14 additions & 16 deletions docs/UsingPyRuntime.md
@@ -11,28 +11,26 @@ Both utilities have an associated Python binding generated by [pybind library](h
## Configuring the Python interfaces

Using pybind, a C/C++ binary can be directly imported by the Python interpreter.
For onnx-mlir, there are three such libraries, one to compile onnx-mlir models,
one to run the models and the other one is to compile and run the models.

1. The library to compile onnx-mlir models is generated
by `PyOMCompileSession` (src/Compiler/PyOMCompileSession.hpp) and build as a shared
library to `build/Debug/lib/PyCompile.cpython-<target>.so`.
2. The library to run onnx-mlir models is generated
by by `PyExecutionSession` (src/Runtime/PyExecutionSession.hpp) and built
as a shared library to `build/Debug/lib/PyRuntime.cpython-<target>.so`.
3. The library to compile and run onnx-mlir models is generated
by by `PyOMCompileExecutionSession` (src/Runtime/PyOMCompileExecutionSession.hpp) and built
as a shared library to `build/Debug/lib/PyCompileAndRuntime.cpython-<target>.so`.
This library takes an .onnx file and the options as inputs, it will load it and then compile and run it.
For onnx-mlir, there are five such libraries: one to compile onnx-mlir models,
two to run the models, and two to compile and run the models.

1. The shared library to compile onnx-mlir models is generated by `PyOMCompileSession` (src/Compiler/PyOMCompileSession.hpp) and built as a shared library to `build/Debug/lib/PyCompile.cpython-<target>.so`.
2. The shared library to run onnx-mlir models is generated by `PyExecutionSession` (src/Runtime/PyExecutionSession.hpp) and built as a shared library to `build/Debug/lib/PyRuntimeC.cpython-<target>.so`.
3. The Python library to run onnx-mlir models (src/Runtime/python/PyRuntime.py).
4. The shared library to compile and run onnx-mlir models is generated by `PyOMCompileExecutionSession` (src/Runtime/PyOMCompileExecutionSession.hpp) and built as a shared library to `build/Debug/lib/PyCompileAndRuntimeC.cpython-<target>.so`.
5. The Python library to compile and run onnx-mlir models (src/Runtime/python/PyCompileAndRuntime.py). This library takes an .onnx file and compile options as inputs; it loads the model, then compiles and runs it.


These modules can be imported normally by the Python interpreter as long as they are in your
PYTHONPATH. Alternatively, create symbolic links to them in your working directory:

```shell
cd <working directory>
ln -s <path to PyRuntime>
ln -s <path to PyCompile>
ln -s <path to PyCompileAndRuntime>
ln -s <path to the shared library to compile onnx-mlir models>(e.g. `build/Debug/lib/PyCompile.cpython-<target>.so`) .
ln -s <path to the shared library to run onnx-mlir models>(e.g. `build/Debug/lib/PyRuntimeC.cpython-<target>.so`) .
ln -s <path to the Python library to run onnx-mlir models>(e.g. src/Runtime/python/PyRuntime.py) .
ln -s <path to the shared library to compile and run onnx-mlir models>(e.g. `build/Debug/lib/PyCompileAndRuntimeC.cpython-<target>.so`) .
ln -s <path to the Python library to compile and run onnx-mlir models>(e.g. src/Runtime/python/PyCompileAndRuntime.py) .
python3
```
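
For orientation, here is a minimal sketch of how the run-only and compile-and-run modules might be used once they are importable. The class names (`OMExecutionSession`, `OMCompileExecutionSession`), the constructor arguments, and the example compile options below are assumptions based on the PyRuntime.py and PyCompileAndRuntime.py wrappers listed above, not a definitive API reference; check those files for the exact signatures.

```python
import numpy as np

# These imports assume the symlinks created above (or a PYTHONPATH entry)
# make PyRuntime.py and PyCompileAndRuntime.py importable.
from PyRuntime import OMExecutionSession                    # run a pre-compiled model
from PyCompileAndRuntime import OMCompileExecutionSession   # compile and run an .onnx file

# Run a model that onnx-mlir already compiled into a shared library.
session = OMExecutionSession("model.so")           # path is illustrative
x = np.ones((1, 3, 224, 224), dtype=np.float32)    # input shape is model-specific
outputs = session.run([x])                         # list of arrays in, list of arrays out

# Compile an .onnx file and run it in one step; the option string is illustrative.
compile_run_session = OMCompileExecutionSession("model.onnx", "-O3")
outputs = compile_run_session.run([x])
```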

240 changes: 88 additions & 152 deletions src/Accelerators/NNPA/Runtime/zDNNExtension/Elementwise.c
@@ -33,88 +33,64 @@ extern "C" {

typedef enum ElemementwiseOp {
// Binary
ZDNN_ADD_EXT,
ZDNN_DIV_EXT,
ZDNN_MAX_EXT,
ZDNN_MIN_EXT,
ZDNN_MUL_EXT,
ZDNN_SUB_EXT,
ZDNN_ADD_EXT = 0,
ZDNN_DIV_EXT = 1,
ZDNN_MAX_EXT = 2,
ZDNN_MIN_EXT = 3,
ZDNN_MUL_EXT = 4,
ZDNN_SUB_EXT = 5,
// Unary
ZDNN_EXP_EXT,
ZDNN_LOG_EXT,
ZDNN_RELU_EXT,
ZDNN_TANH_EXT,
ZDNN_SIGMOID_EXT,
ZDNN_EXP_EXT = 50,
ZDNN_LOG_EXT = 51,
ZDNN_RELU_EXT = 52,
ZDNN_TANH_EXT = 53,
ZDNN_SIGMOID_EXT = 54,
} ElemementwiseOp;

static SplitAxis selectSplitAxis(const zdnn_ztensor *t) {
// We prefer to split E1 over E2 if E1 >= E2, because we can reuse the full
// buffer in case of E1.
UnmappedShape unmappedShape;
getUnmappedShape(t, &unmappedShape);
if (unmappedShape.e1 >= unmappedShape.e2)
return E1;
return E2;
}

static zdnn_status zdnn_unary_elementwise_common(const zdnn_ztensor *input,
const void *clippingValue, zdnn_ztensor *output, ElemementwiseOp opType) {
// Verify that e4, e3, e1 do not exceed the maximum dimension size. Thus, we
// will split e2 safely.
OrigShape origShapeOfX;
getOrigShape(input, &origShapeOfX);
uint32_t maxDimSize = zdnn_get_nnpa_max_dim_idx_size();
if ((origShapeOfX.e4 > maxDimSize) || (origShapeOfX.e3 > maxDimSize) ||
(origShapeOfX.e1 > maxDimSize)) {
printf("[UnaryElementwise] The input tensor dimension exceeds maximum "
"dimension index "
"size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
maxDimSize, origShapeOfX.e4, origShapeOfX.e3, origShapeOfX.e1);
return ZDNN_EXCEEDS_MDIS;
}

// We split e2 in (e4, e3, e2, e1).
SplitInfo splitInfoX = {
.origZTensor = input, .axis = 2, .chunkSize = OMZTensorSplitSize};
SplitInfo splitInfoY = {
.origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};

// Dim is small or ztensor split is disabled.
if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoX) ||
!initSplitInfo(&splitInfoY)) {
if (OMZTensorSplitDebug)
printf("[UnaryElementwise] Not split zTensor ...\n");
if (opType == ZDNN_EXP_EXT)
return zdnn_exp(input, output);
else if (opType == ZDNN_LOG_EXT)
return zdnn_log(input, output);
else if (opType == ZDNN_RELU_EXT)
return zdnn_relu(input, clippingValue, output);
else if (opType == ZDNN_SIGMOID_EXT)
return zdnn_sigmoid(input, output);
else if (opType == ZDNN_TANH_EXT)
return zdnn_tanh(input, output);
else
return ZDNN_UNAVAILABLE_FUNCTION;
}
double splitTime = 0., computeTime = 0., mergeTime = 0.;
clock_t start_time = 0, end_time = 0;

// Split input.
if (OMZTensorSplitDebug)
printf("[UnaryElementwise] Split the input ztensor along e2 into %d chunks "
"of %d elements \n",
splitInfoX.numOfChunks, splitInfoX.chunkSize);

double splitTime = 0.;
double mmTime = 0.;
double mergeTime = 0.;
clock_t start_time, end_time;

// Split input into chunks.
printf("[UnaryElementwise opType %d]\n", opType);

// We split e1 or e2 in (e4, e3, e2, e1).
SplitAxis axis = selectSplitAxis(input);
SplitInfo splitInfoX = {.fullZTensor = input,
.axis = axis,
.numOfElemsPerTile = OMZTensorSplitSize};
SplitInfo splitInfoY = {.fullZTensor = output,
.axis = axis,
.numOfElemsPerTile = OMZTensorSplitSize};
initSplitInfo(&splitInfoX, true, "UnaryElementwise X");
initSplitInfo(&splitInfoY, true, "UnaryElementwise Y");

// Copy data from input to tiles.
if (OMZTensorSplitDebug)
start_time = clock();
splitZTensor(&splitInfoX, /*copyData=*/true);
splitZTensor(&splitInfoY, /*copyData=*/false);
copyData(&splitInfoX, FULL_TO_TILES);
if (OMZTensorSplitDebug) {
end_time = clock();
splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
}

// Call zdnn op on each chunk.
// Call zdnn op on each tile.
if (OMZTensorSplitDebug)
start_time = clock();
for (uint32_t i = 0; i < splitInfoX.numOfChunks; ++i) {
zdnn_ztensor *zxTensor = (splitInfoX.chunks + i)->ztensor;
zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
for (uint32_t i = 0; i < splitInfoX.numOfTiles; ++i) {
zdnn_ztensor *zxTensor = splitInfoX.tiles + i;
zdnn_ztensor *zyTensor = splitInfoY.tiles + i;
zdnn_status status;
if (opType == ZDNN_EXP_EXT)
status = zdnn_exp(zxTensor, zyTensor);
@@ -132,112 +108,70 @@ static zdnn_status zdnn_unary_elementwise_common(const zdnn_ztensor *input,
}
if (OMZTensorSplitDebug) {
end_time = clock();
mmTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
computeTime =
((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
}

// Merging the chunks into the output.
// Copy data from tiles to the output.
if (OMZTensorSplitDebug)
start_time = clock();
mergeZTensors(&splitInfoY);
copyData(&splitInfoY, TILES_TO_FULL);
if (OMZTensorSplitDebug) {
end_time = clock();
mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
}

freeSplitInfoBuffer(&splitInfoX);
freeSplitInfoBuffer(&splitInfoY);
FreeSplitInfoData(&splitInfoX);
FreeSplitInfoData(&splitInfoY);

if (OMZTensorSplitDebug)
printf("[UnaryElementwise] split, %f, mm, %f, merge, %f (milliseconds)\n",
splitTime, mmTime, mergeTime);
printf(
"[UnaryElementwise] split, %f, compute, %f, merge, %f (milliseconds)\n",
splitTime, computeTime, mergeTime);

return ZDNN_OK;
}

static zdnn_status zdnn_binary_elementwise_common(const zdnn_ztensor *inputA,
const zdnn_ztensor *inputB, zdnn_ztensor *output, ElemementwiseOp opType) {
// Verify that e4, e3, e1 do not exceed the maximum dimension size. Thus, we
// will split e2 safely.
OrigShape origShapeOfA, origShapeOfB;
getOrigShape(inputA, &origShapeOfA);
getOrigShape(inputB, &origShapeOfB);
uint32_t maxDimSize = zdnn_get_nnpa_max_dim_idx_size();
if ((origShapeOfA.e4 > maxDimSize) || (origShapeOfA.e3 > maxDimSize) ||
(origShapeOfA.e1 > maxDimSize)) {
printf("[BinaryElementwise] The 1st tensor dimension exceeds maximum "
"dimension index "
"size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
maxDimSize, origShapeOfA.e4, origShapeOfA.e3, origShapeOfA.e1);
return ZDNN_EXCEEDS_MDIS;
}
if ((origShapeOfB.e4 > maxDimSize) || (origShapeOfB.e3 > maxDimSize) ||
(origShapeOfB.e1 > maxDimSize)) {
printf("[BinaryElementwise] The 2nd tensor dimension exceeds maximum "
"dimension index "
"size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
maxDimSize, origShapeOfB.e4, origShapeOfB.e3, origShapeOfB.e1);
return ZDNN_EXCEEDS_MDIS;
}

// We split e2 in (e4, e3, e2, e1).
SplitInfo splitInfoA = {
.origZTensor = inputA, .axis = 2, .chunkSize = OMZTensorSplitSize};
SplitInfo splitInfoB = {
.origZTensor = inputB, .axis = 2, .chunkSize = OMZTensorSplitSize};
SplitInfo splitInfoY = {
.origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};

// Dim is small or ztensor split is disabled.
if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoA) ||
!initSplitInfo(&splitInfoB) || !initSplitInfo(&splitInfoY)) {
if (OMZTensorSplitDebug)
printf("[BinaryElementwise] Not split zTensor ...\n");
if (opType == ZDNN_ADD_EXT)
return zdnn_add(inputA, inputB, output);
else if (opType == ZDNN_SUB_EXT)
return zdnn_sub(inputA, inputB, output);
else if (opType == ZDNN_MUL_EXT)
return zdnn_mul(inputA, inputB, output);
else if (opType == ZDNN_DIV_EXT)
return zdnn_div(inputA, inputB, output);
else if (opType == ZDNN_MAX_EXT)
return zdnn_max(inputA, inputB, output);
else if (opType == ZDNN_MIN_EXT)
return zdnn_min(inputA, inputB, output);
else
return ZDNN_UNAVAILABLE_FUNCTION;
}
double splitTime = 0., computeTime = 0., mergeTime = 0.;
clock_t start_time = 0, end_time = 0;

// Split input.
if (OMZTensorSplitDebug)
printf(
"[BinaryElementwise] Split the input ztensors along e2 into %d chunks "
"of %d elements \n",
splitInfoA.numOfChunks, splitInfoA.chunkSize);

double splitTime = 0.;
double mmTime = 0.;
double mergeTime = 0.;
clock_t start_time, end_time;

// Split input into chunks.
printf("[BinaryElementwise opType %d]\n", opType);

// We split e1 or e2 in (e4, e3, e2, e1).
SplitAxis axis = selectSplitAxis(inputA);
SplitInfo splitInfoA = {.fullZTensor = inputA,
.axis = axis,
.numOfElemsPerTile = OMZTensorSplitSize};
SplitInfo splitInfoB = {.fullZTensor = inputB,
.axis = axis,
.numOfElemsPerTile = OMZTensorSplitSize};
SplitInfo splitInfoY = {.fullZTensor = output,
.axis = axis,
.numOfElemsPerTile = OMZTensorSplitSize};
initSplitInfo(&splitInfoA, true, "BinaryElementwise A");
initSplitInfo(&splitInfoB, true, "BinaryElementwise B");
initSplitInfo(&splitInfoY, true, "BinaryElementwise Y");

// Copy data from inputs into tiles.
if (OMZTensorSplitDebug)
start_time = clock();
splitZTensor(&splitInfoA, /*copyData=*/true);
splitZTensor(&splitInfoB, /*copyData=*/true);
splitZTensor(&splitInfoY, /*copyData=*/false);
copyData(&splitInfoA, FULL_TO_TILES);
copyData(&splitInfoB, FULL_TO_TILES);
if (OMZTensorSplitDebug) {
end_time = clock();
splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
}

// Call zdnn op on each chunk.
// Call zdnn op on each tile.
if (OMZTensorSplitDebug)
start_time = clock();
for (uint32_t i = 0; i < splitInfoA.numOfChunks; ++i) {
zdnn_ztensor *zaTensor = (splitInfoA.chunks + i)->ztensor;
zdnn_ztensor *zbTensor = (splitInfoB.chunks + i)->ztensor;
zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
for (uint32_t i = 0; i < splitInfoA.numOfTiles; ++i) {
zdnn_ztensor *zaTensor = splitInfoA.tiles + i;
zdnn_ztensor *zbTensor = splitInfoB.tiles + i;
zdnn_ztensor *zyTensor = splitInfoY.tiles + i;
zdnn_status status;
if (opType == ZDNN_ADD_EXT)
status = zdnn_add(zaTensor, zbTensor, zyTensor);
@@ -257,25 +191,27 @@ static zdnn_status zdnn_binary_elementwise_common(const zdnn_ztensor *inputA,
}
if (OMZTensorSplitDebug) {
end_time = clock();
mmTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
computeTime =
((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
}

// Merging the chunks into the output.
// Copy data from tiles to the output.
if (OMZTensorSplitDebug)
start_time = clock();
mergeZTensors(&splitInfoY);
copyData(&splitInfoY, TILES_TO_FULL);
if (OMZTensorSplitDebug) {
end_time = clock();
mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
}

freeSplitInfoBuffer(&splitInfoA);
freeSplitInfoBuffer(&splitInfoB);
freeSplitInfoBuffer(&splitInfoY);
FreeSplitInfoData(&splitInfoA);
FreeSplitInfoData(&splitInfoB);
FreeSplitInfoData(&splitInfoY);

if (OMZTensorSplitDebug)
printf("[BinaryElementwise] split, %f, mm, %f, merge, %f (milliseconds)\n",
splitTime, mmTime, mergeTime);
printf("[BinaryElementwise] split, %f, compute, %f, merge, %f "
"(milliseconds)\n",
splitTime, computeTime, mergeTime);

return ZDNN_OK;
}