From 59c88d1ecd15c0651a5bd406e25f9e65c07acf46 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Wed, 24 Jun 2020 13:58:55 +0800 Subject: [PATCH] Revert commit --- docs/conf.py | 1 - include/tvm/relay/attrs/transform.h | 13 - include/tvm/relay/transform.h | 14 - include/tvm/runtime/c_runtime_api.h | 23 - include/tvm/runtime/device_api.h | 3 +- include/tvm/runtime/ndarray.h | 12 +- scripts/common.py | 1034 ----------------- scripts/shape_configs.py | 247 ---- scripts/tune_network.py | 405 ------- scripts/tune_op_subgraph.py | 602 ---------- scripts/tune_test.py | 394 ------- src/arith/rewrite_simplify.cc | 71 +- src/relay/analysis/type_solver.cc | 1 - src/relay/op/tensor/transform.cc | 54 - src/relay/transforms/defuse_ops.cc | 91 -- .../transforms/kernel_layout_transform.cc | 66 -- .../transforms/kernel_layout_transform.h | 102 -- src/relay/transforms/pattern_util.h | 2 - src/runtime/cuda/cuda_device_api.cc | 4 - src/runtime/ndarray.cc | 80 +- src/runtime/opencl/opencl_device_api.cc | 3 - src/runtime/rpc/rpc_module.cc | 30 - src/runtime/threading_backend.cc | 9 +- src/te/schedule/schedule_dataflow_rewrite.cc | 66 +- src/tir/analysis/verify_gpu_code.cc | 44 +- src/tir/transforms/unroll_loop.cc | 20 +- tests/python/unittest/test_ansor_feature.py | 150 --- .../unittest/test_ansor_relay_integration.py | 114 -- .../unittest/test_ansor_task_scheduler.py | 52 - .../test_tir_transform_unroll_loop.py | 24 - topi/include/topi/transform.h | 69 -- topi/python/topi/nn/conv2d.py | 39 +- tutorials/ansor/README.txt | 4 - tutorials/ansor/tune_conv2d_cuda.py | 179 --- tutorials/ansor/tune_simple_subgraph.py | 193 --- tutorials/autotvm/README.txt | 4 +- 36 files changed, 31 insertions(+), 4188 deletions(-) delete mode 100644 scripts/common.py delete mode 100644 scripts/shape_configs.py delete mode 100644 scripts/tune_network.py delete mode 100644 scripts/tune_op_subgraph.py delete mode 100644 scripts/tune_test.py delete mode 100644 src/relay/transforms/defuse_ops.cc delete mode 100644 src/relay/transforms/kernel_layout_transform.cc delete mode 100644 src/relay/transforms/kernel_layout_transform.h delete mode 100644 tests/python/unittest/test_ansor_feature.py delete mode 100644 tests/python/unittest/test_ansor_relay_integration.py delete mode 100644 tests/python/unittest/test_ansor_task_scheduler.py delete mode 100644 tutorials/ansor/README.txt delete mode 100644 tutorials/ansor/tune_conv2d_cuda.py delete mode 100644 tutorials/ansor/tune_simple_subgraph.py diff --git a/docs/conf.py b/docs/conf.py index 5826526d55b0..7ece63bd7aa8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -198,7 +198,6 @@ '../tutorials/language', '../tutorials/optimize', '../tutorials/autotvm', - '../tutorials/ansor', '../tutorials/dev', '../tutorials/topi', '../tutorials/deployment', diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 95476ed61bdd..750a8a43163c 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -296,19 +296,6 @@ struct LayoutTransformAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes for KernelLayoutTransform operator */ -struct KernelLayoutTransformAttrs : public tvm::AttrsNode { - std::string src_layout; - std::string dst_layout; - - TVM_DECLARE_ATTRS(KernelLayoutTransformAttrs, "relay.attrs.KernelLayoutTransformAttrs") { - TVM_ATTR_FIELD(src_layout) - .describe("The source layout of the tensor. (e.g. 1N32C112H112W)"); - TVM_ATTR_FIELD(dst_layout) - .describe("The destination layout of the tensor. (e.g. 1N2C112H112W16c)"); - } -}; - /*! \brief Attributes for ShapeOf operator */ struct ShapeOfAttrs : public tvm::AttrsNode { DataType dtype; diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 5f5d9b643633..1b8b31aee5d1 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -277,20 +277,6 @@ TVM_DLL Pass CanonicalizeOps(); */ TVM_DLL Pass AlterOpLayout(); -/*! - * \brief Alternate the layouts of kernels. - * - * \return The pass. - */ -TVM_DLL Pass KernelLayoutTransform(); - -/*! - * \brief The reverse of FuseOps. - * - * \return The pass. - */ -TVM_DLL Pass DeFuseOps(); - /*! * \brief Given a dest layout, this pass transforms the expr such that most of the ops input data * layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms, one diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 5a32ac7d3d9f..213c7059a5f9 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -384,29 +384,6 @@ TVM_DLL int TVMFuncListGlobalNames(int* out_size, const char*** out_array); TVM_DLL int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_bits, int dtype_lanes, int device_type, int device_id, TVMArrayHandle* out); -/*! - * \brief Allocate a nd-array's memory of non-empty values, - * including space of shape, of given spec. - * - * \param shape The shape of the array, the data content will be copied to out - * \param ndim The number of dimension of the array. - * \param dtype_code The type code of the dtype - * \param dtype_bits The number of bits of dtype - * \param dtype_lanes The number of lanes in the dtype. - * \param device_type The device type of context - * \param device_id The device id of context. - * \param out The output handle. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMArrayAllocNonEmpty(const tvm_index_t* shape, - int ndim, - int dtype_code, - int dtype_bits, - int dtype_lanes, - int device_type, - int device_id, - TVMArrayHandle* out); - /*! * \brief Free the TVM Array. * \param handle The array handle to be freed. diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 9b2eb6be2160..421811a52c3b 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -44,8 +44,7 @@ enum DeviceAttrKind : int { kMaxClockRate = 6, kMultiProcessorCount = 7, kMaxThreadDimensions = 8, - kGcnArch = 9, - kMaxRegistersPerBlock = 10 + kGcnArch = 9 }; /*! \brief Number of bytes each allocation must align to */ diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 9cc66a371974..e69d802652fd 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -138,17 +138,7 @@ class NDArray : public ObjectRef { * \param ctx The context of the Array. * \return The created Array */ - TVM_DLL static NDArray Empty(std::vector shape, - DLDataType dtype, DLContext ctx); - /*! - * \brief Create an NDArray with non-empty values. - * \param shape The shape of the new array. - * \param dtype The data type of the new array. - * \param ctx The context of the Array. - * \return The created Array - */ - TVM_DLL static NDArray NonEmpty(std::vector shape, - DLDataType dtype, DLContext ctx); + TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx); /*! * \brief Create a NDArray backed by a dlpack tensor. * diff --git a/scripts/common.py b/scripts/common.py deleted file mode 100644 index e9cf58e128bb..000000000000 --- a/scripts/common.py +++ /dev/null @@ -1,1034 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Common utility for scripts""" -import argparse -import math -import os -import re -import time -from collections import defaultdict, namedtuple -from typing import Dict, List, Tuple - -import numpy as np -import matplotlib.pyplot as plt - -import topi -import tvm -from tvm import te -from tvm.ansor import (LogReader, make_workload_key_func, - register_workload_func, - write_measure_records_to_file) -from tvm.contrib import ndk, util - -############################################################ -###################### Test Workloads #################### -############################################################ - -@register_workload_func -def min_mn(M, N): - A = te.placeholder((M, N), name='A') - B = topi.min(A, axis=1) - - return [A, B] - -@register_workload_func -def argmin_mn(M, N): - A = te.placeholder((M, N), name='A') - B = topi.argmin(A, axis=1) - - return [A, B] - -@register_workload_func -def softmax_mn(M, N): - A = te.placeholder((M, N), name='A') - B = topi.nn.softmax(A, axis=1) - - return [A, B] - -@register_workload_func -def norm_bmn(B, M, N): - A = te.placeholder((B, M, N), name='A') - i = te.reduce_axis((0, M)) - j = te.reduce_axis((0, N)) - C = te.compute((B,), lambda b: te.sum(A[b][i][j] * A[b][i][j], axis=[i, j]), name='C') - D = te.compute((B,), lambda b: te.sqrt(C[b]), name='D') - - return [A, D] - -@register_workload_func -def add_mn(M, N): - A = te.placeholder((M, N), name='A') - B = te.placeholder((M, N), name='B') - C = te.compute((M, N), lambda i, j: A[i][j] + B[i][j], name='C') - - return [A, B, C] - -@register_workload_func -def matmul_nkkm(N, M, K, in_type='float32', out_type='float32', - tensor_core_support=False): - if tensor_core_support: - A = te.placeholder((N // 16, K // 16, 16, 16), name='A', dtype=in_type) - B = te.placeholder((K // 16, M // 16, 16, 16), name='B', dtype=in_type) - k = te.reduce_axis((0, K // 16), name='k') - kk = te.reduce_axis((0, 16), name='kk') - if not ((in_type == 'float16' and out_type == 'float32') or \ - (in_type == 'int8' and out_type == 'int32')): - raise ValueError - C = te.compute((N // 16, M // 16, 16, 16), - lambda i, j, ii, jj: te.sum(A[i][k][ii][kk].astype(out_type) * B[k][j][kk][jj].astype(out_type), - axis=[k, kk]), - name='C') - else: - A = te.placeholder((N, K), name='A', dtype=in_type) - B = te.placeholder((K, M), name='B', dtype=in_type) - k = te.reduce_axis((0, K), name='k') - C = te.compute((N, M), - lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), - name='C') - - return [A, B, C] - -@register_workload_func -def dense_layer(batch, in_dim, out_dim): - A = te.placeholder((batch, in_dim), name='A') - B = te.placeholder((out_dim, in_dim), name='B') - k = te.reduce_axis((0, in_dim), name='k') - C = te.compute((batch, out_dim), lambda i, j: te.sum(A[i][k] * B[j][k], axis=[k]), name='C') - - return [A, B, C] - -@register_workload_func -def max_pool_2d_nchw(N, C, H, W): - data = te.placeholder((N, C, H, W), name='data') - out = topi.nn.pool(data, (2, 2), (1, 1), (0, 0, 0, 0), pool_type='max', ceil_mode=True, - layout="NCHW", count_include_pad=True) - - return [data, out] - -@register_workload_func -def add_min_relu(M, N): - A = te.placeholder((M, N), name='A') - B = te.placeholder((M, N), name='B') - C = topi.add(A, B) - D = topi.min(C, axis=1) - out = topi.nn.relu(D) - return [A, B, out] - -@register_workload_func -def conv2d_relu_softmax_min(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, CI, H, W), name='data') - kernel = te.placeholder((CO, CI, KH, KW), name='kernel') - conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation) - relu = topi.nn.relu(conv) - softmax = topi.nn.softmax(relu, axis=1) - out = topi.min(softmax, axis=1) - - return [data, kernel, out] - -@register_workload_func -def conv2d_nchw_bias(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, CI, H, W), name='data') - kernel = te.placeholder((CO, CI, KH, KW), name='kernel') - bias = te.placeholder((CO, 1, 1), name='bias') - conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation) - #out = topi.nn.relu(conv) - out = topi.add(conv, bias) - return [data, kernel, bias, out] - -def conv2d_nhwc_without_layout_rewrite(Input, Filter, stride, padding, dilation, out_dtype='float32'): - """A copy of `topi.nn.conv2d_nhwc` but without the 'layout_free` attribute. - We use this in single op and subgraph evaluation because we don't want to introduce graph level optimization. - """ - assert isinstance(stride, int) or len(stride) == 2 - assert isinstance(dilation, int) or len(dilation) == 2 - - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - - if isinstance(dilation, int): - dilation_h = dilation_w = dilation - else: - dilation_h, dilation_w = dilation - - batch, in_height, in_width, in_channel = Input.shape - if len(Filter.shape) == 10: - kernel_h = Filter.shape[2] * Filter.shape[6] - kernel_w = Filter.shape[3] * Filter.shape[7] - channel = Filter.shape[4] * Filter.shape[8] - num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[5] * Filter.shape[9] - #Filter = te.placeholder([kernel_h, kernel_w, channel, num_filter], Filter.dtype, Filter.name) - elif len(Filter.shape) == 11: - kernel_h = Filter.shape[3] * Filter.shape[7] - kernel_w = Filter.shape[4] * Filter.shape[8] - channel = Filter.shape[5] * Filter.shape[9] - num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[2] * Filter.shape[6] * Filter.shape[10] - else: - kernel_h, kernel_w, channel, num_filter = Filter.shape - - # compute the output shape - dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - pad_top, pad_left, pad_down, pad_right = topi.nn.get_pad_tuple( - padding, (dilated_kernel_h, dilated_kernel_w)) - out_channel = num_filter - out_height = topi.util.simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = topi.util.simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) - pad_before = [0, pad_top, pad_left, 0] - pad_after = [0, pad_down, pad_right, 0] - PaddedInput = topi.nn.pad(Input, pad_before, pad_after, name="PaddedInput") - rc = te.reduce_axis((0, in_channel), name='rc') - ry = te.reduce_axis((0, kernel_h), name='ry') - rx = te.reduce_axis((0, kernel_w), name='rx') - Output = te.compute( - (batch, out_height, out_width, out_channel), - lambda nn, yy, xx, ff: te.sum( - PaddedInput[nn, yy * stride_h + ry * dilation_h, - xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * - Filter[ry, rx, rc, ff].astype(out_dtype) - , axis=[ry, rx, rc]), - name="Conv2dOutput", tag="conv2d_nhwc") - return Output - - -@register_workload_func -def conv2d_nhwc_bias_with_rewrite(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, H, W, CI), name='data') - kernel = te.placeholder((KH, KW, CI, CO), name='kernel') - bias = te.placeholder((CO, ), name='bias') - conv = topi.nn.conv2d_nhwc(data, kernel, strides, padding, dilation) - out = topi.add(conv, bias) - return [data, kernel, bias, out] - -@register_workload_func -def depthwise_conv2d_nhwc_bias_with_rewrite(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, H, W, CI), name='data') - kernel = te.placeholder((KH, KW, CI, 1), name='kernel') - bias = te.placeholder((CO, ), name='bias') - conv = topi.nn.depthwise_conv2d_nhwc(data, kernel, strides, padding, dilation) - out = topi.add(conv, bias) - return [data, kernel, bias, out] - -@register_workload_func -def conv2d_nhwc_bias(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, H, W, CI), name='data') - kernel = te.placeholder((KH, KW, CI, CO), name='kernel') - bias = te.placeholder((CO, ), name='bias') - conv = conv2d_nhwc_without_layout_rewrite(data, kernel, strides, padding, dilation) - out = topi.add(conv, bias) - return [data, kernel, bias, out] - - -@register_workload_func -def conv2d_nchw_bn_relu(N, H, W, CI, CO, kernel_size, strides, padding, dilation=1): - data = te.placeholder((N, CI, H, W), name='data') - kernel = te.placeholder((CO, CI, kernel_size, kernel_size), name='kernel') - bias = te.placeholder((CO, 1, 1), name='bias') - bn_scale = te.placeholder((CO, 1, 1), name='bn_scale') - bn_offset = te.placeholder((CO, 1, 1), name='bn_offset') - - OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1 - OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1 - - conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation) - conv = te.compute((N, CO, OH, OW), - lambda i, j, k, l: conv[i, j, k, l] + bias[j, 0, 0], - name='bias_add') - conv = te.compute((N, CO, OH, OW), - lambda i, j, k, l: conv[i, j, k, l] * bn_scale[j, 0, 0], - name='bn_mul') - conv = te.compute((N, CO, OH, OW), - lambda i, j, k, l: conv[i, j, k, l] + bn_offset[j, 0, 0], - name='bn_add') - out = topi.nn.relu(conv) - - return [data, kernel, bias, bn_offset, bn_scale, out] - -@register_workload_func -def conv2d_nhwc_bn_relu(N, H, W, CI, CO, kernel_size, strides, padding, dilation=1): - data = te.placeholder((N, H, W, CI), name='data') - kernel = te.placeholder((kernel_size, kernel_size, CI, CO), name='kernel') - bias = te.placeholder((CO,), name='bias') - bn_scale = te.placeholder((CO,), name='bn_scale') - bn_offset = te.placeholder((CO,), name='bn_offset') - - OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1 - OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1 - - conv = conv2d_nhwc_without_layout_rewrite(data, kernel, strides, padding, dilation) - conv = te.compute((N, OH, OW, CO), - lambda i, j, k, l: conv[i, j, k, l] + bias[l], - name='bias_add') - conv = te.compute((N, OH, OW, CO), - lambda i, j, k, l: conv[i, j, k, l] * bn_scale[l], - name='bn_mul') - conv = te.compute((N, OH, OW, CO), - lambda i, j, k, l: conv[i, j, k, l] + bn_offset[l], - name='bn_add') - out = topi.nn.relu(conv) - - return [data, kernel, bias, bn_offset, bn_scale, out] - -resnet_conv2d_configs = { - # format : N, H, W, CI, CO, KH, KW, strides, padding, dilation - '18': [ - (1, 224, 224, 3, 64, 7, 7, (2, 2), (3, 3), (1, 1)), - (1, 56, 56, 64, 128, 3, 3, (2, 2), (1, 1), (1, 1)), - (1, 56, 56, 64, 128, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 56, 56, 64, 64, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 56, 56, 64, 64, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 28, 28, 128, 256, 3, 3, (2, 2), (1, 1), (1, 1)), - (1, 28, 28, 128, 256, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 28, 28, 128, 128, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 14, 14, 256, 512, 3, 3, (2, 2), (1, 1), (1, 1)), - (1, 14, 14, 256, 512, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 14, 14, 256, 256, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)), - ], - '50': [ - (1, 224, 224, 3, 64, 7, 7, (2, 2), (3, 3), (1, 1)), - (1, 56, 56, 256, 512, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 56, 56, 256, 128, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 56, 56, 256, 64, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 56, 56, 64, 256, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 56, 56, 64, 64, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 56, 56, 64, 64, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 28, 28, 512, 1024, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 28, 28, 512, 256, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 28, 28, 512, 128, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 28, 28, 128, 512, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 28, 28, 128, 128, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 14, 14, 1024, 2048, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 14, 14, 1024, 512, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 14, 14, 1024, 256, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 14, 14, 256, 1024, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 14, 14, 256, 256, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 7, 7, 2048, 512, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 7, 7, 512, 2048, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)), - ], -} - -# number of appearance for all conv2ds in resnet -resnet_conv2d_weights = { - '18': [1, 1, 1, 4, 1, 1, 1, 3, 1, 1, 3, 3], - '50': [1, 1, 1, 2, 4, 3, 1, 1, 1, 3, 4, 4, 1, 1, 5, 6, 6, 2, 3, 3], -} - - -def parse_workload_name(name: str) -> List[str]: - """Parse workload name with wildcard character and abbreviation to standard names""" - if name.startswith('matmul-'): # e.g. matmul-512, matmul-1024, matmul-+ - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [256, 512, 1024] - else: - cfg_list = [N] - return ["matmul-%s" % x for x in cfg_list] - elif name.startswith('dense-'): # e.g. dense-1-512-1024, dense-16-512-512 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = ["1-512-512", "16-512-512"] - else: - cfg_list = [N] - return ["dense-%s" % x for x in cfg_list] - elif name.startswith('min-'): # e.g. min-4096 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["min-%s" % x for x in cfg_list] - elif name.startswith('argmin-'): # e.g. argmin-4096 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["argmin-%s" % x for x in cfg_list] - elif name.startswith('softmax-'): # e.g. softmax-4096 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["softmax-%s" % x for x in cfg_list] - elif name.startswith('add-'): # e.g. add-4096 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["add-%s" % x for x in cfg_list] - elif name.startswith('norm-'): # e.g. norm-1024 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["norm-%s" % x for x in cfg_list] - elif name.startswith('add-min-relu'): # e.g. add-min-relu-4096 - N = name.split('-', maxsplit=3)[3] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["add-min-relu-%s" % x for x in cfg_list] - elif name.startswith('nhwc-resnet-'): # e.g. nhwc-resnet-50.C1 - res = re.match(r'nhwc-resnet-(\d+).C([\d\+]+)(.B(\d+))?', name) - n_layers = res.group(1) - if res.group(2) == '+': - idx_list = range(len(resnet_conv2d_configs[n_layers])) - else: - idx_list = [int(res.group(2))] - - batch_size = 1 if res.group(4) is None else int(res.group(4)) - return ['nhwc-resnet-%s.C%d.B%d' % (n_layers, i, batch_size) for i in idx_list] - elif name.startswith('resnet-'): # e.g. resnet-50.C1, resnet-50.C1.B2, resnet-50.C+.B2 - res = re.match(r'resnet-(\d+).C([\d\+]+)(.B(\d+))?', name) - n_layers = res.group(1) - if res.group(2) == '+': - idx_list = range(len(resnet_conv2d_configs[n_layers])) - else: - idx_list = [int(res.group(2))] - - batch_size = 1 if res.group(4) is None else int(res.group(4)) - return ['resnet-%s.C%d.B%d' % (n_layers, i, batch_size) for i in idx_list] - elif name in ['conv2d-bn-relu', 'conv2d-relu-softmax-min', 'max-pool-2d', 'conv2d-rewrite', 'depthwise-conv2d-rewrite']: - return [name] - else: - raise ValueError("Invalid workload " + name) - - -def get_workload_keys(name: str) -> List[str]: - """Parse workload name and return the workload keys""" - normalized_names = parse_workload_name(name) - - ret = [] - for name in normalized_names: - if name.startswith('matmul-'): - name_split = name.split('-') - in_type = out_type = 'float32' - tensor_core_support = False - if len(name_split) == 2: # e.g. matmul-512 - N = K = M = int(name_split[1]) - elif len(name_split) == 4: # e.g. matmul-32-256-512 - N = int(name_split[1]) - K = int(name_split[2]) - M = int(name_split[3]) - elif len(name_split) == 6: # e.g. matmul-32-512-512-float16-float32 - N = int(name_split[1]) - K = int(name_split[2]) - M = int(name_split[3]) - in_type = name_split[4] - out_type = name_split[5] - elif len(name_split) == 7: # e.g. matmul-32-512-512-float16-float32-tc - N = int(name_split[1]) - K = int(name_split[2]) - M = int(name_split[3]) - in_type = name_split[4] - out_type = name_split[5] - tensor_core_support = name_split[6] == "tc" - else: - raise ValueError("Invalid matmul workload") - ret.append(make_workload_key_func(matmul_nkkm, - (N, M, K, in_type, out_type, tensor_core_support))) - elif name.startswith('dense-'): # e.g. dense-1-512-1024, dense-16-512-512 - name_split = name.split('-') - assert len(name_split) == 4 - batch = int(name_split[1]) - in_dim = int(name_split[2]) - out_dim = int(name_split[3]) - ret.append(make_workload_key_func(dense_layer, (batch, in_dim, out_dim))) - elif name.startswith('min-'): # e.g. min-4096 - name_split = name.split('-') - if len(name_split) == 2: - M = 64 - N = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid min workload") - ret.append(make_workload_key_func(min_mn, (M, N))) - elif name.startswith('argmin-'): # e.g. argmin-4096 - name_split = name.split('-') - if len(name_split) == 2: - M = 64 - N = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid argmin workload") - ret.append(make_workload_key_func(argmin_mn, (M, N))) - elif name.startswith('softmax-'): # e.g. softmax-4096 - name_split = name.split('-') - if len(name_split) == 2: - M = 64 - N = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid softmax workload") - ret.append(make_workload_key_func(softmax_mn, (M, N))) - elif name.startswith('add-min-relu'): # e.g. add-min-relu-4096 - name_split = name.split('-') - if len(name_split) == 4: - M = 64 - N = int(name_split[3]) - elif len(name_split) == 5: - M = int(name_split[3]) - N = int(name_split[4]) - else: - raise ValueError("Invalid workload") - ret.append(make_workload_key_func(add_min_relu, (M, N))) - elif name.startswith('add-'): # e.g. add-4096 - name_split = name.split('-') - if len(name_split) == 2: - N = M = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid add workload") - ret.append(make_workload_key_func(add_mn, (M, N))) - elif name.startswith('norm-'): # e.g. norm-4096 - name_split = name.split('-') - B = 2 - if len(name_split) == 2: - N = M = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid norm workload") - ret.append(make_workload_key_func(norm_bmn, (B, M, N))) - elif name.startswith('nhwc-resnet-'): # e.g. nhwc-resnet-50.C1.B2 - res = re.match(r'nhwc-resnet-(\d+).C(\d+).B(\d+)', name) - n_layers = res.group(1) - idx = int(res.group(2)) - batch_size = 1 if res.group(3) is None else int(res.group(3)) - args = list(resnet_conv2d_configs[n_layers][idx]) - args[0] = batch_size - ret.append(make_workload_key_func(conv2d_nhwc_bias, args)) - elif name.startswith('resnet-'): # e.g. resnet-50.C1.B2 - res = re.match(r'resnet-(\d+).C(\d+).B(\d+)', name) - n_layers = res.group(1) - idx = int(res.group(2)) - batch_size = 1 if res.group(3) is None else int(res.group(3)) - args = list(resnet_conv2d_configs[n_layers][idx]) - args[0] = batch_size - ret.append(make_workload_key_func(conv2d_nchw_bias, args)) - elif name == 'max-pool-2d': - return [make_workload_key_func(max_pool_2d_nchw, (2, 512, 7, 7))] - elif name == 'conv2d-bn-relu': - return [make_workload_key_func(conv2d_nhwc_bn_relu, - (1, 7, 7, 512, 512, 3, 1, 1, 1)) ] - elif name == 'conv2d-rewrite': - return [ make_workload_key_func(conv2d_nhwc_bias_with_rewrite, - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))] - elif name == 'depthwise-conv2d-rewrite': - return [ make_workload_key_func(depthwise_conv2d_nhwc_bias_with_rewrite, - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))] - elif name == 'conv2d-relu-softmax-min': - return [make_workload_key_func(conv2d_relu_softmax_min, - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))] - else: - raise ValueError("Invalid workload " + name) - - return ret - - -def get_workload_weights(name: str) -> List[float]: - """Return weights for workload name""" - if name.startswith('resnet-'): - res = re.match(r'resnet-(\d+).C+', name) - n_layers = res.group(1) - return np.array(resnet_conv2d_weights[n_layers]) - else: - return np.ones(len(get_workload_keys(name))) - - -############################################################ -###################### Measure Tools #################### -############################################################ - - -def measure_schedule(s, - bufs, - target, - target_host=None, - remote=None, - ndk_cc=None, - number=10, - repeat=3, - min_repeat_ms=500): - """Measure the time cost of a schedule""" - func = tvm.build(s, bufs, target=target, target_host=target_host) - if remote: - ctx = remote.context(str(target), 0) - temp = util.tempdir() - remote_path = temp.relpath("tmp_deploy_lib.so") - os.environ['TVM_NDK_CC'] = ndk_cc - func.export_library(remote_path, ndk.create_shared) - remote.upload(remote_path) - func = remote.load_module("tmp_deploy_lib.so") - else: - ctx = tvm.context(str(target), 0) - - if os.environ.get('TVM_AUTO_CACHE_FLUSH', '0') == '1': - min_repeat_ms = 0 - number = 1 - - time_f = func.time_evaluator(func.entry_name, - ctx, - number=number, - repeat=repeat, - min_repeat_ms=min_repeat_ms) - - np_args = [np.ones(topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs] - args = [tvm.nd.array(x, ctx=ctx) for x in np_args] - ctx.sync() - - costs = time_f(*args).results - - return costs - -def check_correctness(s, bufs, s_ref, buf_ref, target, target_host=None, remote=None, ndk_cc=None): - """Check the correctness of a schedule against a reference schedule""" - func = tvm.build(s, bufs, target=target, target_host=target_host) - func_ref = tvm.build(s_ref, buf_ref, target='llvm') - - if remote: - raise NotImplemented - else: - ctx = tvm.context(str(target), 0) - ctx_ref = tvm.cpu() - - np_args = [np.ones(topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs] - args = [tvm.nd.array(x, ctx=ctx) for x in np_args] - args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args] - ctx.sync() - - func(*args) - func_ref(*args_ref) - - for arr, arr_ref in zip(args, args_ref): - np.testing.assert_allclose(arr.asnumpy(), arr_ref.asnumpy()) - - -############################################################ -##################### Other Utilities #################### -############################################################ - - -def geomean(xs): - """Compute geometric mean""" - return math.exp(math.fsum(math.log(x) for x in xs) / len(xs)) - - -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - - -global last_tic -last_tic = None - - -def PRINT_TIME(msg): - """Print time interval between differnt calls. This is for debug so we make the name letters capital""" - global last_tic - now = time.time() - - if last_tic is None: - last_tic = now - - print(msg, now - last_tic) - last_tic = now - - -############################################################ -###################### I/O Utilities ##################### -############################################################ - -# The format for a line in resulst file -BenchmarkRecord = namedtuple("BenchmarkRecord", [ - 'device', 'backend', 'workload_type', 'workload_name', 'library', 'algorithm', 'value', - 'time_stamp' -]) - - -class BaselineDatabase: - """A class for query records in baseline database""" - def __init__(self, filename): - self.filename = filename - - self.lines = [] - for line in open(filename): - if line.startswith('#') or line.isspace(): - continue - self.lines.append(line.split('\t')) - - def filter_records(self, devices=None, backends=None, wkl_names=None, libraries=None): - ret = [] - for line in self.lines: - line = BenchmarkRecord(*line) - - if devices is not None and line.device not in devices: - continue - if backends is not None and line.backend not in backends: - continue - if wkl_names is not None and line.workload_name not in wkl_names: - continue - if libraries is not None and line.library not in libraries: - continue - - ret.append(line) - return ret - - def get_data_dict(self, device, target, wkl_names) -> Tuple[Dict, List]: - """Return a data dict s.t. data[wkl][library] = cost""" - data = defaultdict(lambda: defaultdict(lambda: 1e10)) - - all_libraries = set() - - if "cpu" in target.keys: - backends = ['cpu'] - elif "gpu" in target.keys: - backends = ['gpu'] - else: - raise ValueError("Invalid target: " + target) - - # Read costs for baselines - records = self.filter_records(devices=[device], backends=backends, wkl_names=wkl_names) - for record in records: - # use min over (possible) multiple algorithms - all_libraries.add(record.library) - data[record.workload_name][record.library] = \ - min(data[record.workload_name][record.library], - np.mean(eval(record.value)['costs'])) - - return data, list(all_libraries) - - -class LogFileDatabase: - """A class for indexing best records in a log file""" - def __init__(self, filename: str, n_lines: int = -1): - inputs, results = LogReader(filename).read_lines(n_lines) - - # best records, search by (target_key, workload_key). e.g. ('gpu', 'conv2d...') - self.best_by_targetkey = {} - - # best according to (model, workload_key). e.g. ('1080ti', 'conv2d...')) - self.best_by_model = {} - - # find best records and build the index - for inp, res in zip(inputs, results): - if res.error_no != 0: - continue - - # use target keys in tvm target system as key to build best map - for target_key in inp.task.target.keys: - key = (target_key, inp.task.workload_key) - if key not in self.best_by_targetkey: - self.best_by_targetkey[key] = (inp, res) - else: - _, other_res = self.best_by_targetkey[key] - if np.mean([x.value for x in other_res.costs]) > \ - np.mean([x.value for x in res.costs]): - self.best_by_targetkey[key] = (inp, res) - - # use model as key to build best map - key = (inp.task.target.model, inp.task.workload_key) - if key not in self.best_by_model: - if inp.task.target.model != 'unknown': - self.best_by_model[key] = (inp, res) - else: - _, other_res = self.best_by_model[key] - if np.mean([x.value for x in other_res.costs]) > \ - np.mean([x.value for x in res.costs]): - self.best_by_model[key] = (inp, res) - - def write_best(self, filename: str): - best_records = list(self.best_by_targetkey.values()) - inputs = [x[0] for x in best_records] - results = [x[1] for x in best_records] - write_measure_records_to_file(filename, inputs, results) - - -############################################################ -###################### Plot Utilities #################### -############################################################ - -def max_curve(raw_curve): - """Return b[i] = max(a[:i]) """ - ret = [] - cur_max = -np.inf - for x in raw_curve: - cur_max = max(cur_max, x) - ret.append(cur_max) - return ret - -def min_curve(raw_curve): - """Return b[i] = min(a[:i]) """ - ret = [] - cur_min = np.inf - for x in raw_curve: - cur_min = min(cur_min, x) - ret.append(cur_min) - return ret - -def mean_curve(raw_curve, window_size=None): - """Return b[i] = mean(a[:i]) """ - ret = [] - mean = 0 - if window_size is None: - for i, x in enumerate(raw_curve): - mean = (mean * i + x) / (i + 1) - ret.append(mean) - else: - for i, x in enumerate(raw_curve): - if i >= window_size: - mean = (mean * window_size + x - raw_curve[i - window_size]) / window_size - else: - mean = (mean * i + x) / (i + 1) - ret.append(mean) - return ret - - -def enhance_color(color, h=1, l=1, s=1): - """Make color looks better for pyplot""" - import matplotlib.colors as mc - import colorsys - try: - c = mc.cnames[color] - except: - c = color - c = np.array(colorsys.rgb_to_hls(*mc.to_rgb(c))) - - h, l, s = h * c[0], l * c[1], s * c[2] - h, l, s = [max(min(x, 1), 0) for x in [h, l, s]] - - return colorsys.hls_to_rgb(h, l, s) - - -method_color_dict = { - 'ours': 'C0', - 'AutoTVM': 'C1', - - 'tensorflow': 'C2', - 'tensorflow-tensorrt': 'C9', - 'tflite': 'C2', - - 'pytorch': enhance_color('C3', l=1.1, s=0.9), - - 'FlexTensor': enhance_color('C5'), - 'halide': enhance_color('teal', l=1.25), - - 'Limit space': 'C7', - 'No fine-tuning': 'C8', - 'No task scheduler': 'C1', -} - -def method2color(method): - if '-batch-' in method: - method, batch_size = method.split('-batch-') - #return enhance_color(method_color_dict[method], s=1.1, l=1.5) - return method_color_dict[method] - else: - return method_color_dict[method] - -method_order_list = [ - 'pytorch', 'tensorflow', 'tensorflow-xla', 'tensorflow-tensorrt', - 'tflite', 'halide', 'FlexTensor', 'AutoTVM', - - 'Limit space', 'No fine-tuning', - 'ours', -] - -def method2order(method): - if '-batch-' in method: - method, batch_size = method.split('-batch-') - batch_size = int(batch_size) - return method_order_list.index(method) + batch_size / 100 - else: - return method_order_list.index(method) - -show_name_replace_dict = { - 'pytorch': "PyTorch", - 'tensorflow-tensorrt': 'TensorRT-TF', - 'tensorflow': 'TensorFlow', - 'tflite': 'TensorFlow Lite', - 'halide': 'Halide', - - 'ours': 'Ansor (ours)', - 'batch-16': 'batch', - - 'resnet_50': 'ResNet-50', - 'mobilenet_v2': 'Mobilenet V2', - 'resnet_18_3d': '3D-ResNet', - 'dcgan': 'DCGAN', - 'dqn': 'DQN', - 'bert': 'BERT', -} - -def show_name(name): - # if name.startswith('resnet-'): - # return name.split('.')[1] - for key, value in show_name_replace_dict.items(): - name = name.replace(key, value) - - return name - -def draw_grouped_bar_chart(data, baseline='pytorch', output='out.png', - yscale_log=False, yticks=None, y_max=None, - legend_bbox_to_anchor=None, legend_nrow=None, - figure_size=None, figax=None, draw_ylabel=True, draw_legend=True): - width = 1 - gap = 1.5 - fontsize = 19 - xticks_font_size = fontsize - 2 - - figure_size = figure_size or (11, 4) - legend_bbox_to_anchor = legend_bbox_to_anchor or (0.45, 1.35) - - all_methods = set() - legend_set = {} - - if figax is None: - fig, ax = plt.subplots() - axes = [] - axes.append(ax) - else: - ax = figax - - x0 = 0 - xticks = [] - xlabels = [] - - workloads = list(data.keys()) - for wkl in workloads: - ys = [] - colors = [] - - methods = list(data[wkl].keys()) - - if baseline in data[wkl]: - baseline_cost = data[wkl][baseline] - else: - # normalize to best library - baseline_cost = 1e10 - for method in methods: - if data[wkl][method] < baseline_cost: - baseline_cost = data[wkl][method] - - methods.sort(key=lambda x: method2order(x)) - for method in methods: - relative_speedup = baseline_cost / data[wkl][method] - if yticks is None: - ys.append(relative_speedup) - else: - ys.append(max(relative_speedup, yticks[0] * 1.1)) - colors.append(method2color(method)) - - # draw the bars - xs = np.arange(x0, x0 + len(ys)) - bars = ax.bar(xs, ys, width=width, color=colors) - - for method, bar_obj in zip(methods, bars): - all_methods.add(method) - if method not in legend_set: - legend_set[method] = bar_obj - - # tick and label - x0 += len(ys) + gap - - xticks.append(x0 - gap - len(ys)*width/2.0 - width/2.0) - xlabels.append(show_name(wkl)) - - ax.set_xticks(xticks) - ax.set_xticklabels(xlabels, fontsize=xticks_font_size) - plt.tick_params(axis='x', which='both', bottom='off', top='off') - - if draw_ylabel is True: - ax.set_ylabel('Relative Speedup', fontsize=fontsize) - elif isinstance(draw_ylabel, str): - ax.set_ylabel(draw_ylabel, fontsize=fontsize) - - if yscale_log: - ax.set_yscale('log', basey=2) - if yticks is not None: - ax.set_yticks(yticks) - if y_max: - ax.set_ylim(top=y_max) - - from matplotlib.ticker import FormatStrFormatter - ax.set_yticklabels(ax.get_yticks(), fontsize=fontsize) - ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f')) - ax.yaxis.grid(linewidth=0.4, linestyle='dotted') # draw grid line - ax.set_axisbelow(True) # grid lines are behind the rest - ax.tick_params(bottom=False, top=False, right=False) - - # put legend outside the plot - all_methods = list(all_methods) - all_methods.sort(key=lambda x : method2order(x)) - - if draw_legend: - legend_nrow = legend_nrow or 2 - ncol = (len(all_methods) + legend_nrow - 1)// legend_nrow - ax.legend([legend_set[x] for x in all_methods], - [show_name(x) for x in all_methods], - fontsize=fontsize-1, - loc='upper center', - bbox_to_anchor=legend_bbox_to_anchor, - ncol=ncol, - handlelength=1.0, - handletextpad=0.5, - columnspacing=1.1) - - if figax is None: - fig.set_size_inches(figure_size) - fig.savefig(output, bbox_inches='tight') - print("Output the plot to %s" % output) - - -def to_str_round(x, decimal=6): - if isinstance(x, str): - return x - if isinstance(x, (list, tuple)) or isinstance(x, np.ndarray): - return "[" + ", ".join([to_str_round(y, decimal=decimal) - for y in x]) + "]" - if isinstance(x, dict): - return str({k: eval(to_str_round(v)) for k, v in x.items()}) - if isinstance(x, int): - return str(x) - if isinstance(x, float): - format_str = "%%.%df" % decimal - return format_str % x - raise ValueError("Invalid value: " + str(x)) - diff --git a/scripts/shape_configs.py b/scripts/shape_configs.py deleted file mode 100644 index db6b3b9dc9aa..000000000000 --- a/scripts/shape_configs.py +++ /dev/null @@ -1,247 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" Shape configurations for single operator / subgraph evaluation -This file is shared by tune_op_subgraph.py and scripts in scripts/baseline/ -""" - -matmul_shapes = [ - (1, 128, 128, 128), - (1, 512, 32, 512), - (1, 512, 512, 512), - (1, 1024, 1024, 1024), -] - -conv1d_shapes = [ - # derived from conv2d_shapes - (1, 256, 64, 128, 3, 2, 1), -# (1, 256, 64, 128, 1, 2, 0), -# (1, 256, 64, 64, 1, 1, 0), -# (1, 128, 128, 256, 3, 2, 1), - (1, 128, 128, 256, 1, 2, 0), -# (1, 128, 128, 128, 3, 1, 1), -# (1, 64, 256, 512, 3, 2, 1), -# (1, 64, 256, 512, 1, 2, 0), - (1, 64, 256, 256, 5, 1, 2), - (1, 32, 512, 512, 3, 1, 1), -] - -conv2d_shapes = [ - # all conv2d layers in resnet-18 - (1, 224, 224, 3, 64, 7, 2, 3), -# (1, 56, 56, 64, 128, 3, 2, 1), -# (1, 56, 56, 64, 128, 1, 2, 0), -# (1, 56, 56, 64, 64, 3, 1, 1), - (1, 56, 56, 64, 64, 1, 1, 0), -# (1, 28, 28, 128, 256, 3, 2, 1), -# (1, 28, 28, 128, 256, 1, 2, 0), -# (1, 28, 28, 128, 128, 3, 1, 1), -# (1, 14, 14, 256, 512, 3, 2, 1), -# (1, 14, 14, 256, 512, 1, 2, 0), - (1, 14, 14, 256, 256, 3, 1, 1), - (1, 7, 7, 512, 512, 3, 1, 1), -] - -conv3d_shapes = [ - # Derived from cnov2d_shapes. Use depth=16 for all configurations - (1, 16, 224, 224, 3, 64, 7, 2, 3), -# (1, 16, 56, 56, 64, 128, 3, 2, 1), -# (1, 16, 56, 56, 64, 128, 1, 2, 0), -# (1, 16, 56, 56, 64, 64, 3, 1, 1), - (1, 16, 56, 56, 64, 64, 1, 1, 0), -# (1, 16, 28, 28, 128, 256, 3, 2, 1), -# (1, 16, 28, 28, 128, 256, 1, 2, 0), -# (1, 16, 28, 28, 128, 128, 3, 1, 1), -# (1, 16, 14, 14, 256, 512, 3, 2, 1), -# (1, 16, 14, 14, 256, 512, 1, 2, 0), - (1, 16, 14, 14, 256, 256, 3, 1, 1), - (1, 16, 7, 7, 512, 512, 3, 1, 1), -] - -group_conv2d_shapes = [ - # Derived from cnov2d_shapes. Use group=4 for all configurations - (1, 56, 56, 64, 128, 3, 2, 1 , 1, 4), -# (1, 56, 56, 64, 128, 1, 2, 0 , 1, 4), -# (1, 56, 56, 64, 64, 3, 1, 1 , 1, 4), - (1, 56, 56, 64, 64, 1, 1, 0 , 1, 4), -# (1, 28, 28, 128, 256, 3, 2, 1, 1, 4), -# (1, 28, 28, 128, 256, 1, 2, 0, 1, 4), -# (1, 28, 28, 128, 128, 3, 1, 1, 1, 4), -# (1, 14, 14, 256, 512, 3, 2, 1, 1, 4), -# (1, 14, 14, 256, 512, 1, 2, 0, 1, 4), - (1, 14, 14, 256, 256, 3, 1, 1, 1, 4), - (1, 7, 7, 512, 512, 3, 1, 1 , 1, 4), -] - -dilation_conv2d_shapes = [ - # Derived from cnov2d_shapes. Use dilation=2 for all configurations - (1, 224, 224, 3, 64, 7, 2, 3 , 2), -# (1, 56, 56, 64, 128, 3, 2, 1 , 2), -# (1, 56, 56, 64, 128, 1, 2, 0 , 2), -# (1, 56, 56, 64, 64, 3, 1, 1 , 2), - (1, 56, 56, 64, 64, 1, 1, 0 , 2), -# (1, 28, 28, 128, 256, 3, 2, 1, 2), -# (1, 28, 28, 128, 256, 1, 2, 0, 2), -# (1, 28, 28, 128, 128, 3, 1, 1, 2), -# (1, 14, 14, 256, 512, 3, 2, 1, 2), -# (1, 14, 14, 256, 512, 1, 2, 0, 2), - (1, 14, 14, 256, 256, 3, 1, 1, 2), - (1, 7, 7, 512, 512, 3, 1, 1 , 2), -] - -depthwise_conv2d_shapes = [ - # all depthwise conv2d layers in mobilenet - (1, 112, 112, 32, 3, 1, 1), - (1, 112, 112, 64, 3, 2, 1), -# (1, 56, 56, 128, 3, 1, 1), -# (1, 56, 56, 128, 3, 2, 1), -# (1, 28, 28, 256, 3, 1, 1), -# (1, 28, 28, 256, 3, 2, 1), -# (1, 14, 14, 512, 3, 1, 1), - (1, 14, 14, 512, 3, 2, 1), - (1, 7, 7, 1024, 3, 1, 1), -] - -conv2d_transpose_shapes = [ - # all conv2d tranpose layers in DCGAN - (1, 4, 4, 512, 256, 4, 2, 1), - (1, 8, 8, 256, 128, 4, 2, 1), - (1, 16, 16, 128, 64, 4, 2, 1), - (1, 32, 32, 64, 3, 4, 2, 1), -] - -conv2d_capsule_shapes = [ - # all conv2d capsule layers in matrix capsules withemrouting (ICLR 2018) - (1, 16, 16, 32, 32, 3, 2, 1), - (1, 8, 8, 32, 32, 3, 1, 1), - (1, 16, 16, 8, 16, 3, 2, 1), - (1, 8, 8, 16, 16, 3, 1, 1), -] - -conv2d_winograd_nhwc_shapes = [ - (1, 56, 56, 64, 64, 3, 1, 1), - (1, 28, 28, 128, 128, 3, 1, 1), - (1, 14, 14, 256, 256, 3, 1, 1), - (1, 7, 7, 512, 512, 3, 1, 1), -] - -conv2d_winograd_nchw_shapes = [ - (1, 64, 56, 56, 64, 3, 1, 1), - (1, 128, 28, 28, 128, 3, 1, 1), - (1, 256, 14, 14, 256, 3, 1, 1), - (1, 512, 7, 7, 512, 3, 1, 1), -] - -matmul_tensor_core_shapes = [ - (16, 512, 512, 'float16', 'float32', True), - (32, 512, 512, 'float16', 'float32', True), - (512, 512, 512, 'float16', 'float32', True), -] - -norm_shapes = [ - (1, 256, 256), - (1, 512, 512), - (1, 1024, 1024), - (1, 4096, 1024), -] - -single_op_shape_dict = { - 'C1D': conv1d_shapes, - 'C2D': conv2d_shapes, - 'C3D': conv3d_shapes, - 'GMM': matmul_shapes, - 'GRP': group_conv2d_shapes, - 'DIL': dilation_conv2d_shapes, - 'DEP': depthwise_conv2d_shapes, - 'T2D': conv2d_transpose_shapes, - 'CAP': conv2d_capsule_shapes, - 'NRM': norm_shapes, - -# The following workloads are not in our sinle op evaluation plan. -# They should be moved to `common.py` and be used by `tune_wkl.py`. -# 'C2D_NCHW': conv2d_nchw_shapes, -# 'C2DWG_NHWC': conv2d_winograd_nhwc_shapes, -# 'C2DWG_NCHW': conv2d_winograd_nchw_shapes, -# 'GMM_TC': matmul_tensor_core_shapes, -} - -conv2d_bn_relu_shapes = [ - (1, 224, 224, 3, 64, 7, 2, 3), - (1, 56, 56, 64, 128, 3, 2, 1), - (1, 28, 28, 128, 256, 1, 2, 0), - (1, 7, 7, 512, 512, 3, 1, 1, 1), - (16, 224, 224, 3, 64, 7, 2, 3), - (16, 56, 56, 64, 128, 3, 2, 1), - (16, 28, 28, 128, 256, 1, 2, 0), - (16, 7, 7, 512, 512, 3, 1, 1, 1), -] - -transpose_batch_matmul_shapes = [ - (1, 128, 12, 64), - (1, 128, 16, 64), - (1, 64, 12, 128), - (1, 128, 12, 128), - (16, 128, 12, 64), - (16, 128, 16, 64), - (16, 64, 12, 128), - (16, 128, 12, 128), -] - -subgraph_shape_dict = { - "conv2d_bn_relu": conv2d_bn_relu_shapes, - "transpose_batch_matmul": transpose_batch_matmul_shapes, -} - -resnet_shapes = [ - (1, ), - (16, ), -] - -mobilenet_v2_shapes = [ - (1, ), - (16, ), -] - -dcgan_shapes = [ - (1, ), - (16, ), -] - -dqn_shapes = [ - (1, ), - (16, ), -] - -bert_shapes = [ - (1, ), - (16, ), -] - -resnet18_3d_shapes = [ - (1, ), - (16, ), -] - -network_shape_dict = { - 'resnet_50': resnet_shapes, - 'mobilenet_v2': mobilenet_v2_shapes, - 'dcgan': dcgan_shapes, - 'dqn': dqn_shapes, - 'bert': bert_shapes, - 'resnet_18_3d': resnet18_3d_shapes, -} - diff --git a/scripts/tune_network.py b/scripts/tune_network.py deleted file mode 100644 index 188da6cbe6e6..000000000000 --- a/scripts/tune_network.py +++ /dev/null @@ -1,405 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Tune a whole neural network""" -import argparse -import logging -import random -import os -import numpy as np - -import tvm -from tvm import ansor, relay -import tvm.contrib.graph_runtime as runtime -from tvm.contrib.debugger import debug_runtime -from tvm.contrib import util, ndk -from tvm.relay import testing -from tvm.ansor.utils import request_remote -#from baseline.utils import log_line, BenchmarkRecord - -from common import str2bool -from tune_test import create_tune_option - -dtype = "float32" - -def get_network(name, network_path, batch_size, layout): - """Get the relay module and random weights for a network""" - input_shape = (batch_size, 3, 224, 224) - output_shape = (batch_size, 1000) - input_name = 'data' - - if name.startswith("resnet3d"): - n_layer = int(name.split('-')[1]) - layout = "NDHWC" - image_shape = (16, 112, 112, 3) - input_shape = (batch_size, *image_shape) - mod, params = relay.testing.resnet3d.get_workload(num_layers=n_layer, batch_size=batch_size, image_shape=image_shape, dtype=dtype, layout=layout) - elif name.startswith("resnet"): - n_layer = int(name.split('-')[1]) - image_shape = (224, 224, 3) if layout == 'NHWC' else (3, 224, 224) - input_shape = (batch_size, *image_shape) - mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, layout=layout, image_shape=image_shape, dtype=dtype) - elif "lstm" in name: - mod, params = relay.testing.lstm.get_workload(iterations=10, num_hidden=512, batch_size=batch_size, dtype=dtype) - elif "mlp" in name: - input_shape = (batch_size, 1, 28, 28) - mod, params = relay.testing.mlp.get_workload(batch_size=batch_size, dtype=dtype) - elif "vgg" in name: - n_layer = int(name.split('-')[1]) - mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype) - elif name == 'dcgan': - input_shape = (batch_size, 100) - mod, params = relay.testing.dcgan.get_workload(batch_size=batch_size) - elif name == 'dqn': - layout = "NHWC" - image_shape = (84, 84, 4) - input_shape = (batch_size, *image_shape) - mod, params = relay.testing.dqn.get_workload(batch_size=batch_size, image_shape=image_shape, dtype=dtype, layout=layout) - elif name == 'mobilenet': - image_shape = (224, 224, 3) if layout == 'NHWC' else (3, 224, 224) - input_shape = (batch_size, *image_shape) - mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, layout=layout, image_shape=image_shape, dtype=dtype) - elif name == 'r3d_18': - import torch - import torchvision - - model = getattr(torchvision.models.video, name)(pretrained=False) - model = model.eval() - - # We grab the TorchScripted model via tracing - input_shape = [batch_size, 3, 16, 112, 112] - input_data = torch.randn(input_shape) - scripted_model = torch.jit.trace(model, input_data).eval() - - input_name = 'input0' # only one input, set it to this name - shape_list = {input_name: input_shape} - mod, params = relay.frontend.from_pytorch(scripted_model, - shape_list) - elif name == 'squeezenet_v1.1': - mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype) - elif name == 'inception_v3': - input_shape = (batch_size, 3, 299, 299) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == 'mxnet': - # an example for mxnet model - from mxnet.gluon.model_zoo.vision import get_model - block = get_model('resnet18_v1', pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"input_name": input_shape}, dtype=dtype) - net = mod["main"] - net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs) - mod = relay.Module.from_expr(net) - elif name == 'tflite-mobilenet-v2' or name == 'tflite-resnet-v2-50': - try: - import tflite.Model - except ImportError: - raise ImportError("The tflite package must be installed") - input_name = "input" - input_shape = (1, 224, 224, 3) - output_shape = (1, 1001) - input_dtype = "float32" - tflite_model_buf = open(network_path, "rb").read() - tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) - mod, params = relay.frontend.from_tflite(tflite_model, - shape_dict={input_name: input_shape}, - dtype_dict={input_name: input_dtype}) - elif name == 'pytorch-mobilenet-v2': - import torch - - model = torch.hub.load('pytorch/vision:v0.5.0', 'mobilenet_v2', pretrained=False) - model.eval() - - input_shape = [batch_size, 3, 224, 224] - input_data = torch.randn(input_shape) - scripted_model = torch.jit.trace(model, input_data).eval() - - input_name = 'input0' - shape_list = {input_name: input_shape} - mod, params = relay.frontend.from_pytorch(scripted_model, - shape_list) - elif name == 'bert': - import tensorflow as tf - - bert_pb = './baseline/tensorflow/tf_models/bert/bert-B%d.pb' % batch_size - try: - with tf.compat.v1.gfile.GFile(bert_pb, 'rb') as f: - graph_def = tf.compat.v1.GraphDef() - graph_def.ParseFromString(f.read()) - except: - raise ValueError("Need to run ./baseline/tensorflow/bert/generate_bert_pb.py to get model first") - - input_shape = (batch_size, 128) - input_name = ['input'] - shape_dict = { - 'input': input_shape - } - out_names = [ - 'bert/pooler/dense/Tanh' - ] - - mod, params = relay.frontend.from_tensorflow(graph_def, - shape=shape_dict, - outputs=out_names) - else: - raise ValueError("Unsupported network: " + name) - - return mod, params, input_name, input_shape, output_shape - - -def create_module(data_shape, graph, lib, target, input_name, params, debug_profile, - local_measure, ndk_cc, rpc_device_key, rpc_host, rpc_port, rpc_num_threads, seed=43): - if local_measure: - if target.target_name == "cuda": - ctx = tvm.gpu() - else: - ctx = tvm.cpu() - else: - print("=============== Request Remote ===============") - if 'TVM_NDK_CC' not in os.environ: - os.environ['TVM_NDK_CC'] = ndk_cc - remote = request_remote(rpc_device_key, rpc_host, rpc_port) - - print("=============== Export ===============") - ctx = remote.cpu() - temp = util.tempdir() - path_lib = temp.relpath("deploy_lib.so") - lib.export_library(path_lib, ndk.create_shared) - - print("=============== Upload ===============") - remote.upload(path_lib) - - print("=============== Load ===============") - lib = remote.load_module("deploy_lib.so") - - if rpc_num_threads: - config_threadpool = remote.get_function('runtime.config_threadpool') - config_threadpool(0, rpc_num_threads) - - np.random.seed(seed) - data_tvm = tvm.nd.array(100 * (np.random.uniform(size=data_shape)).astype(dtype), ctx=ctx) - if debug_profile: - module = debug_runtime.create(graph, lib, ctx) - else: - module = runtime.create(graph, lib, ctx) - - if type(input_name) == list: - for name in input_name: - module.set_input(name, data_tvm) - else: - module.set_input(input_name, data_tvm) - for k, v in params.items(): - module.set_input(k, v) - - return module, ctx - - -def tune_and_evaluate(network_arguments, target, target_host, - search_policy, task_scheduler_arguments, tune_option_arguments, - tune, debug_profile, check_correctness, log_n_lines): - # Extract tasks from relay program - mod, params, input_name, data_shape, out_shape = get_network(**network_arguments) - - # Tune all - if tune: - print("=============== Extract Workloads ===============") - workloads, wkl_weights = ansor.extract_from_program(mod, target=target, params=params) - print("Extract %d workloads in total" % (len(workloads))) - - # Tune workloads with auto scheduler - print("=============== Tune ===============") - tasks = [] - for i, wkl_key in enumerate(workloads): - dag = ansor.workload_key_to_dag(wkl_key) - print("[========= Task %d =========]\n" % i, dag) - tasks.append(ansor.SearchTask(dag, wkl_key, target, target_host)) - - tuner = ansor.SimpleTaskScheduler(tasks, - lambda costs: sum(c * w for c, w in zip(costs, wkl_weights)), - **task_scheduler_arguments) - tune_option, measure_ctx = create_tune_option(target, **tune_option_arguments) - - if tune_option_arguments['local_measure'] and target.target_name != 'cuda': - os.environ['TVM_BIND_MASTER_CORE_0'] = "1" - tuner.tune(tune_option, search_policy) - - if measure_ctx: - del measure_ctx - - kernel_layout_rewrite = True - - # Compile graph with best states found by auto-scheduler - print("=============== Compile ===============") - with ansor.apply_history_best(tune_option_arguments['log_file'], log_n_lines): - os.environ['TVM_AUTO_CACHE_FLUSH'] = "0" - - if kernel_layout_rewrite: - ansor.prepare_layout_rewrite(mod, target=target, params=params) - else: - # disable layout rewrite - ansor.LayoutRewriteLevel.BOTH_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE - ansor.LayoutRewriteLevel.COMPUTE_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE - - with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - graph, lib, opt_params = relay.build_module.build( - mod, target=target, params=params) - - ansor.finish_layout_rewrite() - print("=============== Compile Finish ===============") - - module, ctx = create_module(data_shape, graph, lib, target, input_name, - opt_params, debug_profile, **common_measure_parameters) - - # Evaluate - print("========== Evaluate ==========") - ftimer = module.module.time_evaluator("run", ctx, number=10, repeat=3) - prof_res = np.array(ftimer().results) - - # display profile information - if debug_profile or check_correctness: - module.run() - if check_correctness: - actual_output = module.get_output(0).asnumpy() - print(actual_output) - - print("Mean inference time (std dev): %.2f ms (%.2f ms)" % - (np.mean(prof_res) * 1000, np.std(prof_res) * 1000)) - #log_line(BenchmarkRecord(target.target_name, 'gpu' if target.target_name == 'cuda' else 'cpu', 'network', - # "%s.B%d" % (network_name, batch_size), 'AutoSchedule', layout, - # {"costs": prof_res}, time.time()), record_file) - - if check_correctness: - print("========== Check Correctness ==========") - # clean relay cache - relay.backend.compile_engine.get().clear() - - # disable layout rewrite - ansor.LayoutRewriteLevel.BOTH_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE - ansor.LayoutRewriteLevel.COMPUTE_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE - target = tvm.target.create('llvm') - with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - graph, lib, opt_params = relay.build_module.build( - mod, target=target, params=params) - - module, _ = create_module(data_shape, graph, lib, target, input_name, - opt_params, debug_profile, **common_measure_parameters) - module.run() - - expected_output = module.get_output(0).asnumpy() - np.testing.assert_allclose(actual_output, expected_output, rtol=1e-3, atol=1e-3) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Search task related arguments - parser.add_argument("--network", type=str, required=True) - parser.add_argument("--network-path", type=str, default=None, help="The path of tflite model") - parser.add_argument("--batch-size", type=int, default=1) - parser.add_argument("--layout", type=str, default='NHWC') - parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2') - parser.add_argument("--target-host", type=str, default=None) - parser.add_argument("--check-correctness", type=str2bool, nargs='?', const=True, default=False) - parser.add_argument("--debug-profile", type=str2bool, nargs='?', const=True, default=False) - parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True) - - # Search strategy related arguments - parser.add_argument("--n-trials", type=int, default=1000) - parser.add_argument("--policy", type=str, choices=['sketch'], default='sketch') - parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb') - parser.add_argument("--task-scheduler", type=str, default='gradient', - choices=['no', 'gradient', 'round-robin'], - help='The strategy of task scheduler') - parser.add_argument("--seed", type=int, default=0, help='random seed') - - # Log file related arguments - parser.add_argument("--log-file", type=str, help="Write measurement records to this log file") - parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search") - parser.add_argument("--log-n-lines", type=int, help="Only load the first n lines for history log") - parser.add_argument("--load-model", type=str, help="Load pre trained cost model file") - - # Measurement related and other arguments - parser.add_argument("--num-measure-per-iter", type=int, default=48, - help="The number of programs to be measured at each iteration") - parser.add_argument("--build-timeout", type=int, default=10) - parser.add_argument("--run-timeout", type=int, default=10) - parser.add_argument("--early-stopping", type=int, default=-1) - parser.add_argument("--verbose", type=int, default=1) - parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True) - parser.add_argument("--rpc-device-key", type=str, default=None) - parser.add_argument("--rpc-host", type=str, default='0.0.0.0') - parser.add_argument("--rpc-port", type=int, default=9190) - parser.add_argument("--rpc-num-threads", type=int, default=None) - parser.add_argument("--n-parallel", type=int, default=1) - parser.add_argument("--ndk-cc", type=str, default=None) - args = parser.parse_args() - - np.random.seed(args.seed) - random.seed(args.seed) - logging.basicConfig() - logging.getLogger('ansor').setLevel(logging.DEBUG) - os.environ["TOPHUB_LOCATION"] = "NONE" # disable autotvm - - target = tvm.target.create(args.target) - log_file = args.log_file or "%s-B%d-%s.json" % (args.network, args.batch_size, - target.target_name) - load_log_file = args.load_log or log_file - search_policy = "%s.%s" % (args.policy, args.model_type) - if args.layout: - layout = args.layout - elif target.target_name == "cuda": - layout = "NCHW" - else: - layout = "NHWC" - - network_arguments = { - 'name': args.network, - 'network_path': args.network_path, - 'batch_size': args.batch_size, - 'layout': layout - } - - task_scheduler_parameters = { - 'strategy': args.task_scheduler, - 'load_log_file': load_log_file, - 'load_model_file': args.load_model, - 'verbose': args.verbose, - } - - common_measure_parameters = { - 'local_measure': args.local_measure, - 'rpc_device_key': args.rpc_device_key, - 'rpc_host': args.rpc_host, - 'rpc_port': args.rpc_port, - 'rpc_num_threads': args.rpc_num_threads, - 'ndk_cc': args.ndk_cc, - } - - tune_option_arguments = { - 'log_file': log_file, - 'n_trials': args.n_trials, - 'num_measure_per_iter': args.num_measure_per_iter, - 'verbose': args.verbose, - 'n_parallel': args.n_parallel, - 'build_timeout': args.build_timeout, - 'run_timeout': args.run_timeout, - 'early_stopping': args.early_stopping, - **common_measure_parameters - } - - tune_and_evaluate(network_arguments, target, args.target_host, - search_policy, task_scheduler_parameters, tune_option_arguments, - args.tune, args.debug_profile, args.check_correctness, - args.log_n_lines) diff --git a/scripts/tune_op_subgraph.py b/scripts/tune_op_subgraph.py deleted file mode 100644 index d3e70501873e..000000000000 --- a/scripts/tune_op_subgraph.py +++ /dev/null @@ -1,602 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Tune all workloads for single op & subgraph evaluation""" -import argparse -import logging -import random - -import numpy as np - -import tvm -from tvm import te, ansor -import topi -from topi.nn.winograd_util import winograd_transform_matrices -from topi.util import get_const_tuple - -from common import measure_schedule, str2bool, norm_bmn, conv2d_nhwc_bn_relu, conv2d_nchw_bn_relu -from shape_configs import single_op_shape_dict, subgraph_shape_dict -from tune_test import tune_workloads_jointly, replay_workload, create_tune_option - -# ========================== Single Ops ========================== - -@ansor.register_workload_func -def batch_matmul_nkkm(B, N, M, K): - X = te.placeholder((B, N, K), name='A') - Y = te.placeholder((B, K, M), name='B') - k = te.reduce_axis((0, K), name='k') - Z = te.compute((B, N, M), lambda b, i, j: te.sum(X[b][i][k] * Y[b][k][j], axis=[k]), name='C') - return [X, Y, Z] - -@ansor.register_workload_func -def conv1d_nlc(N, L, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1): - inputs = te.placeholder((N, L, CI), name='inputs') - weight = te.placeholder((kernel_size, CI//groups, CO), name='weight') - - batch_size, in_len, in_channel = inputs.shape - k_len, channel_per_group, out_channel = weight.shape - out_channel_per_group = out_channel // groups - out_len = (in_len + 2 * padding - dilation * (k_len - 1) - 1) // stride + 1 - rc = te.reduce_axis((0, channel_per_group), name='rc') - rl = te.reduce_axis((0, k_len), name='rl') - - padded = topi.nn.pad(inputs, [0, padding, 0]) - output = te.compute( - (batch_size, out_len, out_channel), - lambda n, l, co: te.sum( - (padded[n, l * stride + rl * dilation, co // out_channel_per_group * channel_per_group + rc] * - weight[rl, rc, co]), axis=[rl, rc]), - name='conv1d_nlc' - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def conv2d_nhwc(N, H, W, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1): - inputs = te.placeholder((N, H, W, CI), name='inputs') - weight = te.placeholder((kernel_size, kernel_size, CI//groups, CO), name='weight') - batch_size, in_h, in_w, in_channel = inputs.shape - k_h, k_w, channel_per_group, out_channel = weight.shape - out_channel_per_group = out_channel // groups - - out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 - out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 - rh = te.reduce_axis((0, k_h), name="rh") - rw = te.reduce_axis((0, k_w), name="rw") - rc = te.reduce_axis((0, channel_per_group), name="rc") - - padded = topi.nn.pad(inputs, [0, padding, padding, 0]) - output = te.compute( - (batch_size, out_h, out_w, out_channel), - lambda n, h, w, co: te.sum( - (padded[n, h * stride + rh * dilation, w * stride + rw * dilation, - co // out_channel_per_group * channel_per_group + rc] - * weight[rh, rw, rc, co]), axis=[rh, rw, rc] - ), - name='conv2d_nhwc' - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def conv2d_nchw(N, CI, H, W, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1): - inputs = te.placeholder((N, CI, H, W), name='inputs') - weight = te.placeholder((CO, CI//groups, kernel_size, kernel_size), name='weight') - batch_size, in_channel, in_h, in_w = inputs.shape - out_channel, channel_per_group, k_h, k_w, = weight.shape - out_channel_per_group = out_channel // groups - - out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 - out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 - rc = te.reduce_axis((0, channel_per_group), name="rc") - rh = te.reduce_axis((0, k_h), name="rh") - rw = te.reduce_axis((0, k_w), name="rw") - - padded = topi.nn.pad(inputs, [0, 0, padding, padding]) - output = te.compute( - (batch_size, out_channel, out_h, out_w), - lambda n, co, h, w: te.sum( - (padded[n, co // out_channel_per_group * channel_per_group + rc, - h * stride + rh * dilation, w * stride + rw * dilation] - * weight[co, rc, rh, rw]), axis=[rc, rh, rw] - ), - name='conv2d_nchw' - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def conv3d_ndhwc(N, D, H, W, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1): - inputs = te.placeholder((N, D, H, W, CI)) - weight = te.placeholder((kernel_size, kernel_size, kernel_size, CI//groups, CO)) - batch_size, in_d, in_h, in_w, in_channel = inputs.shape - k_d, k_h, k_w, channel_per_group, out_channel = weight.shape - out_channel_per_group = out_channel // groups - - out_d = (in_d + 2 * padding - dilation * (k_d - 1) - 1) // stride + 1 - out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 - out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 - rd = te.reduce_axis((0, k_d), name='rd') - rh = te.reduce_axis((0, k_h), name='rh') - rw = te.reduce_axis((0, k_w), name='rw') - rc = te.reduce_axis((0, channel_per_group), name='rc') - - padded = topi.nn.pad(inputs, [0, padding, padding, padding, 0]) - output = te.compute( - (batch_size, out_d, out_h, out_w, out_channel), - lambda n, d, h, w, co: te.sum( - (padded[n, d * stride + rd * dilation, - h * stride + rh * dilation, w * stride + rw * dilation, - co // out_channel_per_group * channel_per_group + rc] - * weight[rd, rh, rw, rc, co]), - axis=[rd, rh, rw, rc] - ), - name='conv3d_ndhwc' - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def depthwise_conv2d_nhwc(N, H, W, C, kernel_size, stride=1, padding=0, dilation=1, factor=1): - inputs = te.placeholder((N, H, W, C)) - weight = te.placeholder((factor, kernel_size, kernel_size, C)) - - batch_size, in_h, in_w, in_channel = inputs.shape - factor, k_h, k_w, in_channel = weight.shape - out_channel = in_channel * factor - - assert factor.value == 1, "Not optimized for factor != 1" - - out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 - out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 - rh = te.reduce_axis((0, k_h), name='rh') - rw = te.reduce_axis((0, k_w), name='rw') - - padded = topi.nn.pad(inputs, [0, padding, padding, 0]) - output = te.compute( - (batch_size, out_h, out_w, out_channel), - lambda n, h, w, c: te.sum( - (padded[n, h * stride + rh * dilation, w * stride + rw * dilation, c // factor] - * weight[c % factor, rh, rw, c // factor]), - axis=[rh, rw] - ), - name="depth_conv2d_nhwc" - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def conv2d_transpose_nhwc(N, H, W, CI, CO, kernel_size, stride=1, padding=0): - inputs = te.placeholder((N, H, W, CI), name='inputs') - weight = te.placeholder((kernel_size, kernel_size, CI, CO), name='weight') - - batch, in_h, in_w, in_c = inputs.shape - filter_h, filter_w, in_c, out_c = weight.shape - stride_h, stride_w = (stride, stride) - - # compute padding - fpad_top, fpad_left, fpad_bottom, fpad_right = topi.nn.get_pad_tuple(padding, (filter_h, filter_w)) - bpad_top = filter_h - 1 - fpad_top - bpad_bottom = filter_h - 1 - fpad_bottom - bpad_left = filter_w - 1 - fpad_left - bpad_right = filter_w - 1 - fpad_right - - # padding stage - padded = topi.nn.pad(inputs, - [0, (bpad_top + stride_h - 1) // stride_h, - (bpad_left + stride_w - 1) // stride_w, 0], - [0, (bpad_bottom + stride_h - 1) // stride_h, - (bpad_right + stride_w - 1) // stride_w, 0]) - - # remove extra padding introduced by dilatation - idxdiv = te.indexdiv - idxmod = te.indexmod - border_h = idxmod(stride_h - idxmod(bpad_top, stride_h), stride_h) - border_w = idxmod(stride_w - idxmod(bpad_left, stride_w), stride_w) - - # dilation stage - strides = [1, stride_h, stride_w, 1] - n = len(padded.shape) - - # We should embed this dilation directly into te.compute rather than creating a new te.compute. - # Only in this way can we use unroll to eliminate the multiplication of zeros. - def _dilate(*indices): - not_zero = [] - index_tuple = [] - for i in range(n): - if not strides[i] == 1: - index_tuple.append(idxdiv(indices[i], strides[i])) - not_zero.append(idxmod(indices[i], strides[i]).equal(0)) - else: - index_tuple.append(indices[i]) - if not_zero: - not_zero = te.all(*not_zero) - return te.if_then_else(not_zero, padded(*index_tuple), tvm.tir.const(0.0, padded.dtype)) - return padded(*index_tuple) - - # convolution stage - out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h - out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w - rc = te.reduce_axis((0, in_c), name='rc') - rh = te.reduce_axis((0, filter_h), name='rh') - rw = te.reduce_axis((0, filter_w), name='rw') - - output = te.compute( - (batch, out_h, out_w, out_c), - lambda n, h, w, co: te.sum( - _dilate(n, h + rh + border_h, w + rw + border_w, rc) * - weight[filter_h - 1 - rh, filter_w - 1 - rw, rc, co], - axis=[rh, rw, rc]), - name="conv2d_transpose_nhwc", - attrs={"ansor_always_unroll_inner": ["h", "w", "rh", "rw", "h_c", "w_c"]}) - # todo(lmzheng): add constraints on the tile size of h and w - - return [inputs, weight, output] - -@ansor.register_workload_func -def conv2d_capsule_nhwijc(N, H, W, CI, CO, kernel_size, stride=1, padding=0, capsule_size=4): - inputs = te.placeholder((N, H, W, capsule_size, capsule_size, CI), name='inputs') - weight = te.placeholder((kernel_size, kernel_size, capsule_size, capsule_size, CI, CO), name='weight') - batch_size, in_h, in_w, _, _, in_channel = inputs.shape - k_h, k_w, _, _, _, out_channel = weight.shape - - out_h = (in_h + 2 * padding - kernel_size) // stride + 1 - out_w = (in_w + 2 * padding - kernel_size) // stride + 1 - - rh = te.reduce_axis((0, k_h), name="rh") - rw = te.reduce_axis((0, k_w), name="rw") - cap_k = te.reduce_axis((0, capsule_size), name='cap_k') - rc = te.reduce_axis((0, in_channel), name="rc") - - padded = topi.nn.pad(inputs, [0, padding, padding, 0, 0, 0]) - output = te.compute( - (batch_size, out_h, out_w, capsule_size, capsule_size, out_channel), - lambda n, h, w, cap_i, cap_j, co: te.sum( - (padded[n, h * stride + rh, w * stride + rw, cap_i, cap_k, rc] - * weight[rh, rw, cap_k, cap_j, rc, co]), axis=[rh, rw, cap_k, rc] - ), - name='conv2d_capsule_nhwijc' - ) - return [inputs, weight, output] - - -@ansor.register_workload_func -def conv2d_winograd_nhwc(N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1): - # TODO: implement tile_size - tile_size = 4 #_infer_tile_size(data, kernel) - inputs = te.placeholder((N, H, W, CI), name='inputs') - #weight = te.placeholder((kernel_size, kernel_size, CI, CO), name='weight') - N, H, W, CI = get_const_tuple(inputs.shape) - if isinstance(dilation, int): - dilation_h = dilation_w = dilation - else: - dilation_h, dilation_w = dilation - # if dilation_h != 1 or dilation_w != 1: - # weight = topi.nn.dilate(weight, (1, 1, dilation_h, dilation_w)) - KH = KW = kernel_size - HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW)) - HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride - assert HSTR == 1 and WSTR == 1 and KH == KW - - data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0), name="data_pad") - - r = KW - m = tile_size - alpha = m + r - 1 - A, B, G = winograd_transform_matrices(m, r, 'float32') - - H = (H + 2 * HPAD - KH) // HSTR + 1 - W = (W + 2 * WPAD - KW) // WSTR + 1 - nH, nW = (H + m - 1) // m, (W + m - 1) // m - P = N * nH * nW - r_kh = te.reduce_axis((0, KH), name='r_kh') - r_kw = te.reduce_axis((0, KW), name='r_kw') - # kernel_pack = te.compute((alpha, alpha, CO, CI), lambda eps, nu, co, ci: - # weight[0][0][0][0], - # name='kernel_pack') - kshape = (alpha, alpha, CO, CI) - kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight") - - idxdiv = te.indexdiv - idxmod = te.indexmod - # pack input tile - input_tile = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci: - data_pad[idxdiv(p, (nH * nW))][idxmod(idxdiv(p, nW), nH) * m + eps] - [idxmod(p, nW) * m + nu][ci], name='input_tile',) - - # transform data - r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_b') - data_pack = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci: - te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], - axis=[r_a, r_b]), name='data_pack', - attrs={"ansor_no_split_at_inner": ["eps", "nu", "r_a", "r_b"], - "ansor_last_split_is_one": ["ci", "p"], - "ansor_always_unroll": ["eps", "nu", "r_a", "r_b"], - "ansor_no_cache_write": "True", - }) - - # do batch gemm - ci = te.reduce_axis((0, CI), name='ci') - bgemm = te.compute((alpha, alpha, P, CO), lambda eps, nu, p, co: - te.sum(data_pack[eps][nu][p][ci] * - kernel_pack[eps][nu][co][ci], - axis=[ci]), name='bgemm') - - # inverse transform - r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_b') - inverse = te.compute((m, m, P, CO), lambda vh, vw, p, co: - te.sum(bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], - axis=[r_a, r_b]), name='inverse', - attrs={"ansor_no_split_at_inner": ["vh", "vw", "r_a", "r_b"], - "ansor_always_unroll": ["vh", "vw", "r_a", "r_b"], - "ansor_last_split_is_one": ["co", "p"], - "ansor_no_cache_write": "True", - }) - - # output - output = te.compute((N, H, W, CO), lambda n, h, w, co: - inverse[idxmod(h, m), - idxmod(w, m), - n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), - co], - name='conv2d_winograd', - tag='conv2d_winograd_nhwc', - attrs={"ansor_no_split_at_outer": ["n", "h", "w", "co"],}) - return [inputs, kernel_pack, output] - -@ansor.register_workload_func -def conv2d_winograd_nchw(N, CI, H, W, CO, kernel_size=3, stride=1, padding=0, dilation=1, precompute=False): - # TODO: implement tile_size - tile_size = 4 #_infer_tile_size(data, kernel) - inputs = te.placeholder((N, CI, H, W), name='inputs') - #weight = te.placeholder((CO, CI, kernel_size, kernel_size), name='weight') - N, CI, H, W = get_const_tuple(inputs.shape) - # if isinstance(dilation, int): - # dilation_h = dilation_w = dilation - # else: - # dilation_h, dilation_w = dilation - # if dilation_h != 1 or dilation_w != 1: - # weight = topi.nn.dilate(weight, (1, 1, dilation_h, dilation_w)) - KH = KW = kernel_size - HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW)) - HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride - assert HSTR == 1 and WSTR == 1 and KH == KW - - data_pad = topi.nn.pad(inputs, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad") - - r = KW - m = tile_size - alpha = m + r - 1 - A, B, G = winograd_transform_matrices(m, r, 'float32') - - H = (H + 2 * HPAD - KH) // HSTR + 1 - W = (W + 2 * WPAD - KW) // WSTR + 1 - nH, nW = (H + m - 1) // m, (W + m - 1) // m - P = N * nH * nW - r_kh = te.reduce_axis((0, KH), name='r_kh') - r_kw = te.reduce_axis((0, KW), name='r_kw') - # kernel_pack = te.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co: - # weight[0][0][0][0], - # name='kernel_pack') - kshape = (alpha, alpha, CI, CO) - kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight") - - idxdiv = te.indexdiv - idxmod = te.indexmod - # pack input tile - input_tile = te.compute((CI, P, alpha, alpha), lambda ci, p, eps, nu: - data_pad[idxdiv(p, (nH * nW))][ci][idxmod(idxdiv(p, nW), nH) * m + eps] - [idxmod(p, nW) * m + nu], name='input_tile') - - # transform data - r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_b') - data_pack = te.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p: - te.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], - axis=[r_a, r_b]), name='data_pack', - attrs={"ansor_no_split_at_inner": ["eps", "nu", "r_a", "r_b"], - "ansor_no_split_at_outer": ["ci", "p"], - "ansor_always_unroll": ["eps", "nu", "r_a", "r_b"], - "ansor_no_cache_write": "True", - }) - - # do batch gemm - ci = te.reduce_axis((0, CI), name='ci') - bgemm = te.compute((alpha, alpha, CO, P), lambda eps, nu, co, p: - te.sum(data_pack[eps][nu][ci][p] * - kernel_pack[eps][nu][ci][co], - axis=[ci]), name='bgemm') - - # inverse transform - r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_b') - inverse = te.compute((CO, P, m, m), lambda co, p, vh, vw: - te.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], - axis=[r_a, r_b]), name='inverse', - attrs={"ansor_no_split_at_outer": ["co", "p", "vh", "vw", "r_a", "r_b"], - "ansor_always_unroll": ["vh", "vw", "r_a", "r_b"], - "ansor_no_cache_write": "True"}) - - # output - output = te.compute((N, CO, H, W), lambda n, co, h, w: - inverse[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), - idxmod(h, m), - idxmod(w, m)], - name='conv2d_winograd', - attrs={"ansor_no_split_at_outer": ["n", "co", "h", "w"],}) - return [inputs, kernel_pack, output] - -# ========================== Subgraphs ========================== - -@ansor.register_workload_func -def transpose_batch_matmul(batch, seq_len, n_head, n_dim): - query = te.placeholder((batch, seq_len, n_head, n_dim), name='query') - value = te.placeholder((batch, seq_len, n_head, n_dim), name='value') - query_T = te.compute((batch, n_head, seq_len, n_dim), - lambda b, h, l, d: query[b, l, h, d], name="query_T") - value_T = te.compute((batch, n_head, n_dim, seq_len), - lambda b, h, d, l: value[b, l, h, d], name="value_T") - k = te.reduce_axis((0, n_dim), name='k') - out = te.compute((batch, n_head, seq_len, seq_len), - lambda b, h, i, j: te.sum(query_T[b][h][i][k] * value_T[b][h][k][j], axis=[k]), - name='C') - return [query, value, out] - -# ========================== Tune function & Task dicts ========================== - -def tune_wkl(task_func_dict, shape_dict, wkl_type, args): - target = tvm.target.create(args.target) - - for wkl_meta_name, func in task_func_dict.items(): - if not args.wkl in ["all", wkl_type, wkl_meta_name]: - continue - - log_file = args.log_file or wkl_meta_name + ".json" - wkl_keys = [] - for shape in shape_dict[wkl_meta_name]: - if shape[0] == 1: - shape = list(shape) - shape[0] = args.batch_size - - wkl_key = ansor.make_workload_key_func(func, shape) - wkl_keys.append(wkl_key) - if args.fast_check: - break - - if not args.tune: - cost, gflops = replay_workload( - wkl_key, target, args.target_host, log_file, - args.local_measure, args.rpc_device_key, args.rpc_host, - args.rpc_port, args.rpc_num_threads, args.ndk_cc, False) - # log_line(BenchmarkRecord(target.name, 'gpu' if target.name == 'cuda' else 'cpu', 'subgraph', - # workload_name, "AutoSchedule", "default", - # {"costs": [cost]}, time.time()), args.out_file) - - if args.tune: - print("========== Tune for %s (%d shapes) ========== " % (wkl_meta_name, len(wkl_keys))) - - load_log_file = args.load_log or log_file - n_trials = args.n_trials_per_shape * len(wkl_keys) - - tune_option, measure_ctx = create_tune_option(target, log_file, - n_trials, args.num_measure_per_iter, args.verbose, - args.n_parallel, args.build_timeout, args.local_measure, - args.rpc_device_key, args.rpc_host, args.rpc_port, - args.rpc_num_threads, args.ndk_cc) - - # tune workloads jointly using JointTuner - tune_workloads_jointly(wkl_keys, np.ones(len(wkl_keys)), args.task_scheduler, - target, args.target_host, args.policy, args.model_type, - args.load_model, load_log_file, tune_option) - - if measure_ctx: - del measure_ctx - - -single_op_task_func_dict = { - 'GMM': batch_matmul_nkkm, - 'C1D': conv1d_nlc, - 'C2D': conv2d_nhwc, - 'C3D': conv3d_ndhwc, - 'GRP': conv2d_nhwc, - 'DIL': conv2d_nhwc, - 'DEP': depthwise_conv2d_nhwc, - 'T2D': conv2d_transpose_nhwc, - 'CAP': conv2d_capsule_nhwijc, - 'NRM': norm_bmn, - #'SMX': softmax_mn, - -# The following workloads are not in our sinle op evaluation plan. -# They should be moved to `common.py` and be used by `tune_wkl.py`. -# 'C2D_NCHW': conv2d_nchw, -# 'C2DWG_NHWC': conv2d_winograd_nhwc, -# 'C2DWG_NCHW': conv2d_winograd_nchw, -# 'GMM_TC': matmul_nkkm, -} - -subgraph_task_func_dict = { - 'conv2d_bn_relu': conv2d_nhwc_bn_relu, - #'conv2d_bn_relu': conv2d_nchw_bn_relu, # some old log uses conv2d_nchw_bn_relu - 'transpose_batch_matmul': transpose_batch_matmul, -} - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Search task related arguments - parser.add_argument("--wkl", type=str, required=True, - help="all - Tune all workloads; \ - op - Tune all single ops; \ - subgraph - Tune all subgraphs; \ - specific wkl name - Tune a specific workload") - parser.add_argument("--batch-size", type=int, default=1) - parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2') - parser.add_argument("--target-host", type=str, default=None) - parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True) - parser.add_argument("--fast-check", action='store_true', - help='Only run one shape for each workload. This is used for fast checking') - - # Search strategy related arguments - parser.add_argument("--n-trials-per-shape", type=int, default=1000) - parser.add_argument("--policy", type=str, choices=['sketch', 'beam-search'], default='sketch') - parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb') - parser.add_argument("--task-scheduler", type=str, default='round-robin', - choices=['no', 'gradient', 'round-robin'], help='The strategy of task scheduler') - parser.add_argument("--seed", type=int, default=0, help='random seed') - - # Log file related arguments - parser.add_argument("--log-file", type=str, help="Write measurement records to this log file") - parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search") - parser.add_argument("--load-model", type=str, help="Load pre-trained cost model from this file") - - # Measurement related and other arguments - parser.add_argument("--num-measure-per-iter", type=int, default=48, - help="The number of programs to be measured at each iteration") - parser.add_argument("--build-timeout", type=int, default=10) - parser.add_argument("--run-timeout", type=int, default=60) - parser.add_argument("--verbose", type=int, default=1) - parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True) - parser.add_argument("--rpc-device-key", type=str, default=None) - parser.add_argument("--rpc-host", type=str, default='0.0.0.0') - parser.add_argument("--rpc-port", type=int, default=9190) - parser.add_argument("--rpc-num-threads", type=int, default=None) - parser.add_argument("--n-parallel", type=int, default=1) - parser.add_argument("--ndk-cc", type=str, default=None) - args = parser.parse_args() - - np.random.seed(args.seed) - random.seed(args.seed) - logging.basicConfig() - logging.getLogger('ansor').setLevel(logging.DEBUG) - - # compute the number of tasks - num_tasks = 0 - for wkl_meta_name in single_op_task_func_dict: - if not args.wkl in ["all", "op", wkl_meta_name]: - continue - if args.fast_check: - num_tasks += 1 - else: - num_tasks += len(single_op_shape_dict[wkl_meta_name]) - for wkl_meta_name in subgraph_task_func_dict: - if not args.wkl in ["all", "subgraph", wkl_meta_name]: - continue - if args.fast_check: - num_tasks += 1 - else: - num_tasks += len(subgraph_shape_dict[wkl_meta_name]) - print("Number of tasks: %d\tTotal trials: %d" % (num_tasks, num_tasks * args.n_trials_per_shape)) - - # tune for tasks - tune_wkl(single_op_task_func_dict, single_op_shape_dict, "op", args) - tune_wkl(subgraph_task_func_dict, subgraph_shape_dict, "subgraph", args) diff --git a/scripts/tune_test.py b/scripts/tune_test.py deleted file mode 100644 index 6b39cf5e7865..000000000000 --- a/scripts/tune_test.py +++ /dev/null @@ -1,394 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Use auto scheduler to tune workloads""" -import argparse -import logging -import os -import random - -import numpy as np - -import tvm -from tvm import ansor, te -from tvm.ansor.utils import request_remote - -from common import get_workload_keys, get_workload_weights, measure_schedule, str2bool - -def tensor_core_meet_condition(meta_policy, state, stage_id): - pass - -def intrin_wmma_load_matrix(scope): - n = 16 - A = te.placeholder((n, n), name='A', dtype='float16') - BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=256) - C = te.compute((n, n), lambda i, j: A[i, j], name='C') - BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - - BA = ins[0] - BC = outs[0] - ib.emit(tvm.tir.call_intrin('handle', 'tvm_load_matrix_sync', - BC.data, n, n, n, BC.elem_offset // 256, - BA.access_ptr('r'), n, 'row_major')) - return ib.get() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) - -@tvm._ffi.register_func -def intrin_wmma_load_matrix_a(): - return intrin_wmma_load_matrix("wmma.matrix_a") - -@tvm._ffi.register_func -def intrin_wmma_load_matrix_b(): - return intrin_wmma_load_matrix("wmma.matrix_b") - -@tvm._ffi.register_func -def intrin_wmma_gemm(): - n = 16 - A = te.placeholder((n, n), name='A', dtype='float16') - B = te.placeholder((n, n), name='B', dtype='float16') - k = te.reduce_axis((0, n), name="k") - C = te.compute((n, n), - lambda ii, jj: - te.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k), - name='C') - BA = tvm.tir.decl_buffer(A.shape, A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=256) - BB = tvm.tir.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=256) - BC = tvm.tir.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=256) - - def intrin_func(ins, outs): - BA, BB = ins - BC, = outs - - def init(): - ib = tvm.tir.ir_builder.create() - ib.emit(tvm.tir.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, n, n, BC.elem_offset // 256, 0.0)) - return ib.get() - - def update(): - ib = tvm.tir.ir_builder.create() - ib.emit(tvm.tir.call_intrin('handle', 'tvm_mma_sync', - BC.data, BC.elem_offset // 256, - BA.data, BA.elem_offset // 256, - BB.data, BB.elem_offset // 256, - BC.data, BC.elem_offset // 256)) - return ib.get() - - return update(), init(), update() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC}) - -@tvm._ffi.register_func -def intrin_wmma_store_matrix(): - n = 16 - A = te.placeholder((n, n), name='A', dtype='float32') - BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=256) - C = te.compute((n, n), lambda i, j: A[i, j], name='C') - BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=256) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - BA = ins[0] - BC = outs[0] - ib.emit(tvm.tir.call_intrin('handle', 'tvm_store_matrix_sync', - BA.data, n, n, n, BA.elem_offset // 256, - BC.access_ptr('w'), n, 'row_major')) - return ib.get() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) - -def tensor_core_apply(meta_policy, state, stage_id): - ret = [] - state = ansor.loop_state.State(state, meta_policy.cur_task.compute_dag) - - A, B, C = meta_policy.cur_task.compute_dag.ops - - C_local = state.cache_write(C, "wmma.accumulator") - - its0 = state.split(C_local, state[C_local].iters[0], [None, None]) - split_step0 = state.transform_steps_size() - 1 - its1 = state.split(C_local, state[C_local].iters[3], [None, None]) - split_step1 = state.transform_steps_size() - 1 - its2 = state.split(C_local, state[C_local].iters[8], [None]) - - state.reorder(C_local, [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2], - its2[0], its2[1], - state[C_local].iters[6], - state[C_local].iters[7], - state[C_local].iters[10]]) - state.fuse(C_local, [state[C_local].iters[0], state[C_local].iters[1]]) - state.fuse(C_local, [state[C_local].iters[1], state[C_local].iters[2]]) - state.fuse(C_local, [state[C_local].iters[2], state[C_local].iters[3]]) - - its0 = state.follow_split(C, state[C].iters[0], split_step0, 2) - its1 = state.follow_split(C, state[C].iters[3], split_step1, 2) - state.reorder(C, [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2], - state[C].iters[6], state[C].iters[7]]) - state.fuse(C, [state[C].iters[0], state[C].iters[1]]) - state.fuse(C, [state[C].iters[1], state[C].iters[2]]) - local_write_pos = state.fuse(C, [state[C].iters[2], state[C].iters[3]]) - state.compute_at(C_local, C, local_write_pos) - shared_read_pos = state[C_local].iters[3] - local_read_pos = state[C_local].iters[4] - state.bind_thread(C, state[C].iters[0], "blockIdx.x") - state.bind_thread(C, state[C].iters[1], "vthread") - state.bind_thread(C, state[C].iters[2], "threadIdx.x") - - B_shared = state.cache_read(B, "shared", [C_local]) - B_local = state.cache_read(B_shared, "wmma.matrix_b", [C_local]) - state.compute_at(B_shared, C_local, shared_read_pos) - state.compute_at(B_local, C_local, local_read_pos) - - it = state.fuse(B_shared, state[B_shared].iters[:]) - its = state.split(B_shared, it, [4]) # vectorize add a callback check function - state.vectorize(B_shared, its[1]) - its = state.follow_fused_split(B_shared, its[0], [split_step0, split_step1], 1, True) - state.bind_thread(B_shared, its[1], "threadIdx.x") - - A_shared = state.cache_read(A, "shared", [C_local]) - A_local = state.cache_read(A_shared, "wmma.matrix_a", [C_local]) - state.compute_at(A_shared, C_local, shared_read_pos) - state.compute_at(A_local, C_local, local_read_pos) - - it = state.fuse(A_shared, state[A_shared].iters[:]) - its = state.split(A_shared, it, [4]) # vectorize add a callback check function - state.vectorize(A_shared, its[1]) - its = state.follow_fused_split(A_shared, its[0], [split_step0, split_step1], 1, True) - state.bind_thread(A_shared, its[1], "threadIdx.x") - - state.tensorize(A_local, state[A_local].iters[-2], "intrin_wmma_load_matrix_a") - state.tensorize(B_local, state[B_local].iters[-2], "intrin_wmma_load_matrix_b") - state.tensorize(C_local, state[C_local].iters[-3], "intrin_wmma_gemm") - state.tensorize(C, state[C].iters[-2], "intrin_wmma_store_matrix") - - print(state) - - ret.append([state.state_object, -1]) - return ret - -def create_tune_option(target, log_file, n_trials, num_measure_per_iter, verbose, - n_parallel, build_timeout, local_measure, rpc_device_key, rpc_host, - rpc_port, rpc_num_threads, ndk_cc, early_stopping=-1, run_timeout=10, - tensor_core_matmul=False): - builder = runner = measure_ctx = None - if local_measure: - builder = ansor.LocalBuilder(timeout=build_timeout) - if target.target_name == "cuda": - measure_ctx = ansor.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400) - runner = measure_ctx.runner - else: - os.environ['TVM_AUTO_CACHE_FLUSH'] = "1" - runner = ansor.LocalRunner(repeat=10, number=1, min_repeat_ms=0, timeout=run_timeout) - else: - os.environ['TVM_NDK_CC'] = ndk_cc - builder = ansor.LocalBuilder(timeout=build_timeout, build_func='ndk') - runner = ansor.RPCRunner(key=rpc_device_key, host=rpc_host, port=rpc_port, - timeout=run_timeout, n_parallel=n_parallel, - repeat=1, min_repeat_ms=200) - remote = request_remote(rpc_device_key, rpc_host, rpc_port) - if rpc_num_threads: - config_threadpool = remote.get_function('runtime.config_threadpool') - config_threadpool(0, rpc_num_threads) - - pre_search_callbacks = [ansor.PreloadMeasuredStates(log_file)] - if tensor_core_matmul: - pre_search_callbacks.append(ansor.PreloadCustomSketchRule(tensor_core_meet_condition, tensor_core_apply)) - tune_option = ansor.TuneOption(n_trials=n_trials, early_stopping=early_stopping, - num_measure_per_iter=num_measure_per_iter, - verbose=verbose, - builder=builder, - runner=runner, - measure_callbacks=[ansor.LogToFile(log_file)], - pre_search_callbacks=pre_search_callbacks) - - return tune_option, measure_ctx - - -def replay_workload(wkl_key, target, target_host, log_file, - local_measure=True, rpc_device_key=None, rpc_host="0.0.0.0", - rpc_port=9190, rpc_num_threads=None, ndk_cc=None, - show_lower_result=True): - cost = gflops = None - - inp, res = ansor.best_measure_pair_in_file(log_file, wkl_key, target) - if inp is None: - print("Cannot find log for: %s" % wkl_key) - else: - dag = ansor.workload_key_to_dag(inp.task.workload_key) - print("Found schedule for: %s" % wkl_key) - - s, bufs = dag.apply_steps_from_state(inp.state) - if show_lower_result: - print(tvm.lower(s, bufs, simple_mode=True)) - - if local_measure: - remote = None - else: - remote = request_remote(rpc_device_key, rpc_host, rpc_port) - if rpc_num_threads: - config_threadpool = remote.get_function('runtime.config_threadpool') - config_threadpool(0, rpc_num_threads) - - cost = np.mean((measure_schedule(s, bufs, target, target_host, - remote=remote, ndk_cc=ndk_cc))) - gflops = ansor.ComputeDAG(bufs).flop_ct / cost / 1e9 - print("Best schedule: %.2f GFLOPS\tcost: %.3f ms" % (gflops, cost * 1e3)) - - return cost, gflops - - -def tune_workload(wkl_key, target, target_host, policy, model_type, - load_model_file, load_log_file, tune_option): - """Tune a workload""" - - if False: - # Debug info. Print static analysis results from the access analyzer - dag = ansor.workload_key_to_dag(wkl_key) - print(dag.access_analyzer) - exit() - - if model_type == 'xgb': - model = ansor.XGBModel() - if load_model_file: - print("Load pretrained model...") - model.load(load_model_file) - elif load_log_file: - model.load_log_file(load_log_file) - elif model_type == "random": - model = ansor.RandomModel() - else: - raise ValueError("Invalid model: " + model_type) - - if policy == 'sketch': - policy = ansor.SketchSearchPolicy(program_cost_model=model) - elif policy == 'beam-search': - policy = ansor.SketchSearchPolicy(program_cost_model=model, - params={'use_beam_search': 1}) - else: - raise ValueError("Invalid search policy: " + policy) - - s, bufs = ansor.auto_schedule(wkl_key, - target=target, target_host=target_host, - search_policy=policy, - tune_option=tune_option) - -def tune_workloads_jointly(wkl_keys, weights, task_scheduler, target, target_host, - search_policy, model_type, load_model_file, load_log_file, - tune_option): - """Tune for multiple workloads together with TaksScheduler""" - tasks = [] - for wkl_key in wkl_keys: - dag = ansor.workload_key_to_dag(wkl_key) - tasks.append(ansor.SearchTask(dag, wkl_key, target, target_host)) - - def objective_func(costs): - return sum(c * w for c, w in zip(costs, weights)) - - tuner = ansor.SimpleTaskScheduler(tasks, objective_func, strategy=task_scheduler, - load_log_file=load_log_file, load_model_file=load_model_file) - search_policy = "%s.%s" % (search_policy, model_type) - tuner.tune(tune_option, search_policy) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Search task related arguments - parser.add_argument("--wkl", type=str, required=True) - parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2') - parser.add_argument("--target-host", type=str, default=None) - parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True) - - # Search strategy related arguments - parser.add_argument("--n-trials", type=int, default=1000) - parser.add_argument("--policy", type=str, choices=['sketch', 'beam-search'], default='sketch') - parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb') - parser.add_argument("--task-scheduler", type=str, default='no', - choices=['no', 'gradient', 'round-robin'], - help='The strategy of task scheduler') - parser.add_argument("--seed", type=int, default=0, help='random seed') - - # Log file related arguments - parser.add_argument("--log-file", type=str, help="Write measurement records to this log file") - parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search") - parser.add_argument("--load-model", type=str, help="Load pre-trained cost model from this file") - - # Measurement related and other arguments - parser.add_argument("--num-measure-per-iter", type=int, default=48, - help="The number of programs to be measured at each iteration") - parser.add_argument("--build-timeout", type=int, default=10) - parser.add_argument("--run-timeout", type=int, default=60) - parser.add_argument("--verbose", type=int, default=1) - parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True) - parser.add_argument("--rpc-device-key", type=str, default=None) - parser.add_argument("--rpc-host", type=str, default='0.0.0.0') - parser.add_argument("--rpc-port", type=int, default=9190) - parser.add_argument("--rpc-num-threads", type=int, default=None) - parser.add_argument("--n-parallel", type=int, default=1) - parser.add_argument("--ndk-cc", type=str, default=None) - args = parser.parse_args() - - np.random.seed(args.seed) - random.seed(args.seed) - logging.basicConfig() - logging.getLogger('ansor').setLevel(logging.DEBUG) - - wkl_keys = get_workload_keys(args.wkl) - target = tvm.target.create(args.target) - log_file = args.log_file or args.wkl + ".json" - - # Tune workloads - if args.tune: - load_log_file = args.load_log or log_file - weights = get_workload_weights(args.wkl) - - # Special check for tensor core - wkl_key = args.wkl - wkl_key = wkl_key.split("-") - tensor_core_matmul = False - if wkl_key[0] == "matmul" and wkl_key[6] == "tc": - tensor_core_matmul = True - - tune_option, measure_ctx = create_tune_option(target, log_file, - args.n_trials, args.num_measure_per_iter, args.verbose, - args.n_parallel, args.build_timeout, args.local_measure, - args.rpc_device_key, args.rpc_host, args.rpc_port, args.rpc_num_threads, - args.ndk_cc, tensor_core_matmul=tensor_core_matmul) - - if args.task_scheduler == 'no': - # tune workloads one by one - for wkl_key in wkl_keys: - tune_workload(wkl_key, target, args.target_host, args.policy, - args.model_type, args.load_model, load_log_file, - tune_option) - else: - # tune workloads jointly with TaskScheduler - tune_workloads_jointly(wkl_keys, weights, args.task_scheduler, - target, args.target_host, args.policy, - args.model_type, args.load_model, load_log_file, - tune_option) - if measure_ctx: - del measure_ctx - - # Replay the best found schedule - if len(wkl_keys) == 1 or not args.tune: - for wkl_key in wkl_keys: - replay_workload(wkl_key, target, args.target_host, log_file, - args.local_measure, args.rpc_device_key, args.rpc_host, - args.rpc_port, args.rpc_num_threads, args.ndk_cc) diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index d3af64a4f576..4887ef0ee47d 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -132,13 +132,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AddNode* op) { TVM_TRY_REWRITE(ramp(b1, s1, lanes) + broadcast(x, lanes), ramp(b1 + x, s1, lanes)); TVM_TRY_REWRITE(broadcast(x, lanes) + ramp(b1, s1, lanes), ramp(x + b1, s1, lanes)); TVM_TRY_REWRITE(broadcast(x, lanes) + broadcast(y, lanes), broadcast(x + y, lanes)); - if ((x + broadcast(y, lanes)).Match(ret)) { - if (auto ps = y.Eval().as()) { - if (ps->value == 0.0) { - return x.Eval(); - } - } - } } if (IsIndexType(op->dtype)) { @@ -429,13 +422,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) { TVM_TRY_REWRITE(broadcast(x, lanes) * broadcast(y, lanes), broadcast(x * y, lanes)); TVM_TRY_REWRITE(ramp(b1, s1, lanes) * broadcast(x, lanes), ramp(b1 * x, s1 * x, lanes)); TVM_TRY_REWRITE(broadcast(x, lanes) * ramp(b1, s1, lanes), ramp(b1 * x, s1 * x, lanes)); - if ((broadcast(x, lanes) * y).Match(ret)) { - if (auto ps = x.Eval().as()) { - if (ps->value == 0.0) { - return make_const(op->dtype, 0.0); - } - } - } } if (IsIndexType(op->dtype)) { @@ -714,9 +700,9 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { PrimExpr const_res = TryConstFold(op->a, op->b); if (const_res.defined()) return const_res; // Pattern var to match any expression - PVar w, x, y, z, b1; + PVar x, y, z, b1; // Pattern var match IntImm - PVar c1, c2, c3, c4; + PVar c1, c2, c3; // Pattern var for lanes in broadcast and ramp PVar lanes; @@ -781,11 +767,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { TVM_TRY_REWRITE_IF(floordiv(max(y, x * c1), c2), max(floordiv(y, c2), x * floordiv(c1, c2)), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); - TVM_TRY_REWRITE_IF(floordiv(x * c1 + y, c2), floordiv(x * c1, c2), - c1.Eval()->value > 0 && c2.Eval()->value > 0 && - c2.Eval()->value % c1.Eval()->value == 0 && - CanProveGreaterEqual(-y.Eval(), -c1.Eval()->value + 1)); - // Rules involving 3-operands. TVM_TRY_REWRITE_IF(floordiv(x * c1 + y + z, c2), x * floordiv(c1, c2) + floordiv(y + z, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); @@ -802,13 +783,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { TVM_TRY_REWRITE_IF(floordiv(x + c1, c2), floordiv(x, c2) + floordiv(c1, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); - TVM_TRY_REWRITE_IF(floordiv(x * c1 + y * c2 + z, c3), floordiv(x * c1 + y * c2, c3), - c1.Eval()->value > 0 && c2.Eval()->value > 0 && c3.Eval()->value > 0 && - c3.Eval()->value % c1.Eval()->value == 0 && - c3.Eval()->value % c2.Eval()->value == 0 && - CanProveGreaterEqual(-z.Eval(), - std::max(-c1.Eval()->value, -c2.Eval()->value) + 1)); - TVM_TRY_REWRITE_IF(floordiv(x + y, x), floordiv(y, x) + 1, CanProveGreaterEqual(x.Eval(), 0)); TVM_TRY_REWRITE_IF(floordiv(y + x, x), floordiv(y, x) + 1, CanProveGreaterEqual(x.Eval(), 0)); @@ -833,18 +807,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { CanProveGreaterEqual(z.Eval(), 0)); TVM_TRY_REWRITE_IF(floordiv(y + z * x, z), floordiv(y, z) + x, CanProveGreaterEqual(z.Eval(), 0)); - - // Rules involving 4-operands - TVM_TRY_REWRITE_IF(floordiv(w * c1 + x * c2 + y * c3 + z, c4), - floordiv(w * c1 + x * c2 + y * c3, c4), - c1.Eval()->value > 0 && c2.Eval()->value > 0 && - c3.Eval()->value > 0 && c4.Eval()->value > 0 && - c4.Eval()->value % c1.Eval()->value == 0 && - c4.Eval()->value % c2.Eval()->value == 0 && - c4.Eval()->value % c3.Eval()->value == 0 && - CanProveGreaterEqual(-z.Eval(), - std::max(-c1.Eval()->value, - std::max(-c2.Eval()->value, -c3.Eval()->value)) + 1)); } return ret; } @@ -856,9 +818,9 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { if (const_res.defined()) return const_res; // Pattern var to match any expression - PVar w, x, y, z, b1; + PVar x, y, z, b1; // Pattern var match IntImm - PVar c1, c2, c3, c4; + PVar c1, c2; // Pattern var for lanes in broadcast and ramp PVar lanes; @@ -902,31 +864,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { TVM_TRY_REWRITE_IF(floormod(x + y * c1, c2), floormod(x, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); - TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(x, floordiv(c2, c1)) * c1 + y, - c1.Eval()->value > 0 && c2.Eval()->value > 0 && - c2.Eval()->value % c1.Eval()->value == 0 && - CanProveGreaterEqual(-y.Eval(), -c1.Eval()->value + 1)); - - // TODO(jcf94): For the next three rules, better use the max common factor - // of c1, c2, c3 to do the simplify - TVM_TRY_REWRITE_IF(floormod(x * c1 + y * c2 + z, c3), - floormod(x * floordiv(c1, c2) + y, floordiv(c3, c2)) * c2 + z, - c1.Eval()->value > 0 && c2.Eval()->value > 0 && - c3.Eval()->value > 0 && - c3.Eval()->value % c2.Eval()->value == 0 && - c1.Eval()->value % c2.Eval()->value == 0 && - CanProveGreaterEqual(-z.Eval(), -c2.Eval()->value + 1)); - - TVM_TRY_REWRITE_IF(floormod(w * c1 + x * c2 + y * c3 + z, c4), - floormod(w * floordiv(c1, c3) + x * floordiv(c2, c3) + y, - floordiv(c4, c3)) * c3 + z, - c1.Eval()->value > 0 && c2.Eval()->value > 0 && - c3.Eval()->value > 0 && c4.Eval()->value > 0 && - c4.Eval()->value % c3.Eval()->value == 0 && - c1.Eval()->value % c3.Eval()->value == 0 && - c2.Eval()->value % c3.Eval()->value == 0 && - CanProveGreaterEqual(-z.Eval(), -c3.Eval()->value + 1)); - // try modular analysis if (floormod(x, c1).Match(ret)) { ModularSet mod = analyzer_->modular_set(x.Eval()); diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc index 5b063eca4337..a192002825e6 100644 --- a/src/relay/analysis/type_solver.cc +++ b/src/relay/analysis/type_solver.cc @@ -219,7 +219,6 @@ class TypeSolver::Unifier : public TypeFunctor { return Type(nullptr); } - tt1 = tt2; tvm::Array shape; if (tt1->shape.size() != tt2->shape.size()) { this->solver_->ReportError(ErrorBuilder() << "tensor type `" << PrettyPrint(tt1) << "` has " diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 30269b85795f..ee5e291e3d53 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -2455,60 +2455,6 @@ the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w] .set_support_level(5) .set_attr("FTVMCompute", LayoutTransformCompute); -// relay.kernel_layout_transform -TVM_REGISTER_NODE_TYPE(KernelLayoutTransformAttrs); - -Array KernelLayoutTransformCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type) { - const auto* param = attrs.as(); - CHECK(param != nullptr); - return Array{ - topi::kernel_layout_transform(inputs[0], param->src_layout, param->dst_layout) - }; -} - -bool KernelLayoutTransformRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { - const auto* data = types[0].as(); - CHECK(data != nullptr); - const KernelLayoutTransformAttrs* params = attrs.as(); - - Array dst_shape; - std::vector dst_axes; - - topi::parse_kernel_layout(params->dst_layout, &dst_shape, &dst_axes); - - reporter->Assign(types[1], TensorType(dst_shape, data->dtype)); - return true; -} - -Expr MakeKernelLayoutTransform(Expr data, - String src_layout, - String dst_layout) { - auto attrs = make_object(); - attrs->src_layout = std::move(src_layout); - attrs->dst_layout = std::move(dst_layout); - static const Op& op = Op::Get("kernel_layout_transform"); - return Call(op, {data}, Attrs(attrs), {}); -} - -TVM_REGISTER_GLOBAL("relay.op._make.kernel_layout_transform") -.set_body_typed(MakeKernelLayoutTransform); - -RELAY_REGISTER_OP("kernel_layout_transform") - .describe(R"code(Transform the input kernel layout. -)code" TVM_ADD_FILELINE) - .set_attrs_type() - .set_num_inputs(1) - .add_argument("data", "Tensor", "The input tensor.") - .add_type_rel("kernel_layout_transform", KernelLayoutTransformRel) - .set_support_level(5) - .set_attr("FTVMCompute", KernelLayoutTransformCompute); - - /* relay._contrib_reverse_reshape */ Expr MakeReverseReshape(Expr data, Array newshape) { auto attrs = make_object(); diff --git a/src/relay/transforms/defuse_ops.cc b/src/relay/transforms/defuse_ops.cc deleted file mode 100644 index 1a108fb08888..000000000000 --- a/src/relay/transforms/defuse_ops.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "pattern_util.h" - -namespace tvm { -namespace relay { - -class DefuseOpsMutator : public ExprMutator { - public: - class FuncBodyMutator : public ExprMutator { - public: - Array args_; - - explicit FuncBodyMutator(const Array& args) : ExprMutator() { args_ = args; } - - Expr VisitExpr_(const VarNode* n) { - const std::string& name = n->name_hint(); - CHECK_EQ(name[0], 'p'); - std::string id_str = name.substr(1); - int id = atoi(id_str.c_str()); - CHECK(id >= 0 && size_t(id) < args_.size()); - return args_[id]; - } - }; - - Expr VisitExpr_(const CallNode* n) { - auto new_n = ExprMutator::VisitExpr_(n); - - const auto* call = new_n.as(); - if (call) { - const auto* func = call->op.as(); - if (func) { - const auto& func_call = func->body.as(); - if (func_call) { - return FuncBodyMutator(call->args).Mutate(func->body); - } - } - } - return new_n; - } -}; - -Expr DeFuseOps(const Expr& expr) { return DefuseOpsMutator().Mutate(expr); } - -namespace transform { - -Pass DeFuseOps() { - runtime::TypedPackedFunc pass_func = - [=](Function f, IRModule m, PassContext pc) { - return Downcast(relay::DeFuseOps(f)); - }; - return CreateFunctionPass(pass_func, 3, "DeFuseOps", {"InferType"}); -} - -TVM_REGISTER_GLOBAL("relay._transform.DeFuseOps").set_body_typed(DeFuseOps); - -} // namespace transform - -} // namespace relay -} // namespace tvm diff --git a/src/relay/transforms/kernel_layout_transform.cc b/src/relay/transforms/kernel_layout_transform.cc deleted file mode 100644 index 421968b8a6b9..000000000000 --- a/src/relay/transforms/kernel_layout_transform.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "kernel_layout_transform.h" - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace tvm { -namespace relay { - -// Todo: do not use global variables -std::deque KernelLayoutVisitor::global_ori_layouts_queue; -std::deque KernelLayoutVisitor::global_new_layouts_queue; - -Expr KernelLayoutTransform(const Expr& expr) { - KernelLayoutVisitor visitor; - - // Do a pre-order DFS to gather the optimal kernel layouts for all conv2d nodes. - // These layouts were written to global static variables in python function - // `prepare_layout_rewrite` - visitor.VisitExpr(expr); - - // Do a post-order DSF to mutate layout for all conv2d nodes - return KernelLayoutTransformer(&visitor).Mutate(expr); -} - -namespace transform { - -Pass KernelLayoutTransform() { - runtime::TypedPackedFunc pass_func = - [=](Function f, IRModule m, PassContext pc) { - return Downcast(relay::KernelLayoutTransform(f)); - }; - return CreateFunctionPass(pass_func, 3, "KernelLayoutTransform", {"InferType"}); -} - -TVM_REGISTER_GLOBAL("relay._transform.KernelLayoutTransform").set_body_typed(KernelLayoutTransform); - -} // namespace transform - -} // namespace relay -} // namespace tvm diff --git a/src/relay/transforms/kernel_layout_transform.h b/src/relay/transforms/kernel_layout_transform.h deleted file mode 100644 index c6c38fb71cf4..000000000000 --- a/src/relay/transforms/kernel_layout_transform.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_RELAY_TRANSFORMS_KERNEL_LAYOUT_TRANSFORM_H_ -#define TVM_RELAY_TRANSFORMS_KERNEL_LAYOUT_TRANSFORM_H_ - -#include -#include - -#include -#include -#include -#include -#include - -#include "../../ansor/compute_dag.h" -#include "pattern_util.h" - -namespace tvm { -namespace relay { - -/*! \brief A visitor to gather the optimal kernel layout for all conv2d nodes. */ -class KernelLayoutVisitor : public ExprVisitor { - public: - void VisitExpr_(const CallNode* n) { - if (n && n->op.as() && - (std::find(op_white_lists.begin(), op_white_lists.end(), n->op.as()->name) != - op_white_lists.end()) && - n->args[1]->type_as()->shape[3].as()->value > 1 && - !global_ori_layouts_queue.empty() && !global_new_layouts_queue.empty()) { - ori_layouts_map[n] = global_ori_layouts_queue.front(); - new_layouts_map[n] = global_new_layouts_queue.front(); - // std::cout << "ori_layout " << global_ori_layouts_queue.front() - // << " Filter_shape " << n->args[1]->type_as()->shape << std::endl; - global_ori_layouts_queue.pop_front(); - global_new_layouts_queue.pop_front(); - } - ExprVisitor::VisitExpr_(n); - } - - std::unordered_map ori_layouts_map; - std::unordered_map new_layouts_map; - std::vector op_white_lists{"nn.contrib_conv2d_winograd_without_weight_transform", - "nn.conv2d", "nn.conv3d"}; - - static std::deque global_ori_layouts_queue; - static std::deque global_new_layouts_queue; -}; - -/*! \brief A mutator to rewrite kernel layout for all conv2d nodes */ -class KernelLayoutTransformer : public ExprMutator { - public: - explicit KernelLayoutTransformer(KernelLayoutVisitor* visitor) - : ExprMutator(), visitor_(visitor) {} - - Expr VisitExpr_(const CallNode* n) { - auto new_n = ExprMutator::VisitExpr_(n); - - const auto* call = new_n.as(); - std::vector op_white_lists{"nn.contrib_conv2d_winograd_without_weight_transform", - "nn.conv2d", "nn.conv3d"}; - if (call && call->op.as() && - (std::find(op_white_lists.begin(), op_white_lists.end(), n->op.as()->name) != - op_white_lists.end() && - n->args[1]->type_as()->shape[3].as()->value > 1)) { - auto ori_layout_iter = visitor_->ori_layouts_map.find(n); - auto new_layout_iter = visitor_->new_layouts_map.find(n); - if (ori_layout_iter != visitor_->ori_layouts_map.end() && - new_layout_iter != visitor_->new_layouts_map.end()) { - const std::string& ori_layout = ori_layout_iter->second; - const std::string& new_layout = new_layout_iter->second; - Expr updated_kernel = MakeKernelLayoutTransform(call->args[1], ori_layout, new_layout); - Array updated_args = {call->args[0], updated_kernel}; - new_n = Call(call->op, updated_args, call->attrs); - } - } - return new_n; - } - - private: - KernelLayoutVisitor* visitor_; -}; - -} // namespace relay -} // namespace tvm - -#endif // TVM_RELAY_TRANSFORMS_KERNEL_LAYOUT_TRANSFORM_H_ diff --git a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_util.h index a9d3b5168e47..7518eb9ac81a 100644 --- a/src/relay/transforms/pattern_util.h +++ b/src/relay/transforms/pattern_util.h @@ -685,8 +685,6 @@ Expr MakeExpandDims(Expr data, int axis, int num_newaxis); Expr MakeLayoutTransform(Expr data, String src_layout, String dst_layout); -Expr MakeKernelLayoutTransform(Expr data, String src_layout, String dst_layout); - Expr StopFusion(Expr data); Expr CastHint(Expr data, DataType dtype); diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 4e71383cc1bb..a6d4a5499469 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -94,10 +94,6 @@ class CUDADeviceAPI final : public DeviceAPI { } case kGcnArch: return; - case kMaxRegistersPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, ctx.device_id)); - break; - } } *rv = value; } diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index 714535ecc8a6..800a9167dadc 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -26,9 +26,6 @@ #include #include -#include -#include - #include "runtime_base.h" extern "C" { @@ -183,8 +180,7 @@ NDArray NDArray::CreateView(std::vector shape, DLDataType dtype) { DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); } -NDArray NDArray::Empty(std::vector shape, DLDataType dtype, - DLContext ctx) { +NDArray NDArray::Empty(std::vector shape, DLDataType dtype, DLContext ctx) { NDArray ret = Internal::Create(shape, dtype, ctx); // setup memory content size_t size = GetDataSize(ret.get_mutable()->dl_tensor); @@ -194,59 +190,6 @@ NDArray NDArray::Empty(std::vector shape, DLDataType dtype, return ret; } - -NDArray NDArray::NonEmpty(std::vector shape, DLDataType dtype, - DLContext ctx) { - NDArray ret = Internal::Create(shape, dtype, ctx); - NDArray dummy_cpu_arr = Internal::Create(shape, dtype, {kDLCPU, 0}); - - // setup memory content - size_t size = GetDataSize(ret.get_mutable()->dl_tensor); - size_t alignment = GetDataAlignment(ret.get_mutable()->dl_tensor); - dummy_cpu_arr.get_mutable()->dl_tensor.data = - DeviceAPI::Get(dummy_cpu_arr->ctx)->AllocDataSpace( - {kDLCPU, 0}, size, alignment, dummy_cpu_arr->dtype); - size_t elem_cnt = 1; - for (tvm_index_t i = 0; i < dummy_cpu_arr->ndim; ++i) { - elem_cnt *= static_cast(dummy_cpu_arr->shape[i]); - } - - // TODO(..): maybe we could have better solution for assigning values - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<> dis(1.0, 10.0); - // Use float representation could make us work well on float / int type too. - for (size_t i = 0; i < elem_cnt; ++i) { - if (dummy_cpu_arr->dtype.bits == 1) { - (reinterpret_cast( - dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = dis(gen); - } else if (dummy_cpu_arr->dtype.bits == 8) { - (reinterpret_cast( - dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = dis(gen); - } else if (dummy_cpu_arr->dtype.bits == 16) { - (reinterpret_cast( - dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = - __truncXfYf2__( - static_cast(dis(gen))); - } else if (dummy_cpu_arr->dtype.bits == 32) { - (reinterpret_cast( - dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = dis(gen); - } else if (dummy_cpu_arr->dtype.bits == 64) { - (reinterpret_cast( - dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = dis(gen); - } else { - LOG(FATAL) << "Doesn't support dtype code " << dtype.code - << " dtype bits " << dtype.bits; - } - } - ret.get_mutable()->dl_tensor.data = - DeviceAPI::Get(ret->ctx)->AllocDataSpace( - ret->ctx, size, alignment, ret->dtype); - CopyFromTo(&(dummy_cpu_arr.get_mutable()->dl_tensor), - &(ret.get_mutable()->dl_tensor)); - return ret; -} - NDArray NDArray::FromDLPack(DLManagedTensor* tensor) { NDArray::Container* data = new NDArray::Container(); // construct header @@ -314,9 +257,8 @@ int TVMArrayGetTypeIndex(TVMArrayHandle handle, unsigned* out_tindex) { API_END(); } -int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, - int dtype_bits, int dtype_lanes, int device_type, - int device_id, TVMArrayHandle* out) { +int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_bits, + int dtype_lanes, int device_type, int device_id, TVMArrayHandle* out) { API_BEGIN(); DLDataType dtype; dtype.code = static_cast(dtype_code); @@ -330,22 +272,6 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, API_END(); } -int TVMArrayAllocNonEmpty(const tvm_index_t* shape, int ndim, int dtype_code, - int dtype_bits, int dtype_lanes, int device_type, - int device_id, TVMArrayHandle* out) { - API_BEGIN(); - DLDataType dtype; - dtype.code = static_cast(dtype_code); - dtype.bits = static_cast(dtype_bits); - dtype.lanes = static_cast(dtype_lanes); - DLContext ctx; - ctx.device_type = static_cast(device_type); - ctx.device_id = device_id; - *out = NDArray::Internal::MoveToFFIHandle( - NDArray::NonEmpty(std::vector(shape, shape + ndim), dtype, ctx)); - API_END(); -} - int TVMArrayFree(TVMArrayHandle handle) { API_BEGIN(); NDArray::Internal::FFIDecRef(handle); diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 71d3232ca4d5..6d9835e6231c 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -109,9 +109,6 @@ void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* } case kGcnArch: return; - default: { - LOG(WARNING) << "Attr not implemented."; - } } } diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index d58130d700f4..89f3e7c6c7f8 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -24,14 +24,9 @@ #include #include -#include #include #include -#if defined(_M_X64) || defined(__x86_64__) -#include -#endif - #include "rpc_endpoint.h" #include "rpc_session.h" @@ -305,22 +300,6 @@ std::shared_ptr RPCModuleGetSession(Module mod) { return rmod->sess(); } -inline void CacheFlush(const char* p, unsigned int allocation_size) { -// TODO(FrozenGene): Support ARM. -#if (defined(_M_X64) || defined(__x86_64__)) - size_t cache_line = 64; - - if (p == nullptr || allocation_size <= 0) { - return; - } - - for (size_t i = 0; i < allocation_size; i += cache_line) { - _mm_clflush(static_cast(&p[i])); - } - -#endif -} - PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat, int min_repeat_ms) { CHECK(pf != nullptr); @@ -334,21 +313,12 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; - const char* cache_flush = std::getenv("TVM_AUTO_CACHE_FLUSH"); // skip first time call, to activate lazy compilation components. pf.CallPacked(args, &temp); DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); for (int i = 0; i < repeat; ++i) { - if (cache_flush && std::atoi(cache_flush) != 0) { - CHECK_EQ(number, 1); - // we want to keep input data - for (int j = 1; j < args.size(); j++) { - CacheFlush(reinterpret_cast(args[j].operator DLTensor*()->data), - GetDataSize(*(args[j].operator DLTensor*()))); - } - } std::chrono::time_point tbegin, tend; double duration_ms = 0.0; diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 3b1889aed8ef..e5520efe30a6 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -166,13 +166,8 @@ class ThreadGroup::Impl { #if defined(_M_X64) || defined(__x86_64__) big_count /= 2; // ignore hyper-threading #endif - const char* bind_master_core_0 = getenv("TVM_BIND_MASTER_CORE_0"); - if (bind_master_core_0 && atoi(bind_master_core_0) != 0) { - CPU_SET(sorted_order_[0], &cpuset); - } else { - for (int i = 0; i < big_count; ++i) { - CPU_SET(sorted_order_[i], &cpuset); - } + for (int i = 0; i < big_count; ++i) { + CPU_SET(sorted_order_[i], &cpuset); } } #if defined(__ANDROID__) diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc index 04a3f0b25bee..af72d3b1a1df 100644 --- a/src/te/schedule/schedule_dataflow_rewrite.cc +++ b/src/te/schedule/schedule_dataflow_rewrite.cc @@ -461,7 +461,7 @@ void RebaseNonZeroMinLoop(const Schedule& sch) { for (IterVar iv : root_iter_vars) { size_t idx = FindNodeRef(leaf_vars, iv); auto it = s->iter_var_attrs.find(iv); - // don't need to rebase path that are binded. + // don;t need to rebase path that are binded. if (it != s->iter_var_attrs.end() && (*it).second->bind_thread.defined()) { continue; } @@ -614,74 +614,10 @@ void InjectInline(ScheduleNode* sch) { } } -void LegalizeInvalidAttach(ScheduleNode* sch) { - std::unordered_map replace_map; - - for (Stage stage : sch->stages) { - for (Stage s = stage; s.defined();) { - Stage spec = s.GetAttachSpec(); - if (spec->attach_type != kScope) { - break; - } - bool start_attach = false; - IterVar attach_ivar = spec->attach_ivar; - s = spec->attach_stage; - CHECK(attach_ivar.defined()); - CHECK(s.defined()); - - for (size_t i = s->leaf_iter_vars.size(); i != 0; --i) { - IterVar iv = s->leaf_iter_vars[i - 1]; - if (!start_attach && iv.same_as(attach_ivar)) { - start_attach = true; - } - } - if (!start_attach) { - // If the attach_var is fused into another iter_var, update the - // attach_var to be the fused one - // Do this recursively. - IterVar new_attach_ivar = attach_ivar;; - bool updated = true; - while (updated) { - updated = false; - for (const auto& rel : s->relations) { - if (const FuseNode* r = rel.as()) { - if (new_attach_ivar.same_as(r->inner)) { - new_attach_ivar = r->fused; - updated = true; - } - } else if (const SplitNode* r = rel.as()) { - if (new_attach_ivar.same_as(r->parent)) { - new_attach_ivar = r->inner; - updated = true; - } - } - } - replace_map[attach_ivar] = new_attach_ivar; - } - } - } - } - - // remap the parent relation - for (Stage s : sch->stages) { - if (s->attach_type != kScope) continue; - if (replace_map.count(s->attach_ivar)) { - s->attach_ivar = replace_map.at(s->attach_ivar); - } - } - for (Stage s : sch->groups) { - if (s->attach_type != kScope) continue; - if (replace_map.count(s->attach_ivar)) { - s->attach_ivar = replace_map.at(s->attach_ivar); - } - } -} - Schedule Schedule::normalize() { Schedule sn = copy(); InjectInline(sn.operator->()); RebaseNonZeroMinLoop(sn); - LegalizeInvalidAttach(sn.operator->()); return sn; } diff --git a/src/tir/analysis/verify_gpu_code.cc b/src/tir/analysis/verify_gpu_code.cc index f6a8ad034aa5..1fbae0fd2dcd 100644 --- a/src/tir/analysis/verify_gpu_code.cc +++ b/src/tir/analysis/verify_gpu_code.cc @@ -33,22 +33,20 @@ namespace tvm { namespace tir { -class GPUCodeVerifier : public StmtExprVisitor { +class GPUCodeVerifier : public StmtVisitor { public: bool Verify(Stmt stmt, int64_t max_local_memory_per_block, int64_t max_shared_memory_per_block, int64_t max_threads_per_block, int64_t max_thread_x, int64_t max_thread_y, - int64_t max_thread_z, int64_t max_vector_bytes) { + int64_t max_thread_z) { max_local_memory_per_block_ = static_cast(max_local_memory_per_block); max_shared_memory_per_block_ = static_cast(max_shared_memory_per_block); max_threads_per_block_ = static_cast(max_threads_per_block); max_thread_x_ = static_cast(max_thread_x); max_thread_y_ = static_cast(max_thread_y); max_thread_z_ = static_cast(max_thread_z); - max_vector_bytes_ = static_cast(max_vector_bytes); Reset_(); - // TODO(jcf94): Add support of detecting CUDA Misaligned Address error this->VisitStmt(stmt); return valid_; @@ -64,10 +62,6 @@ class GPUCodeVerifier : public StmtExprVisitor { size_t size = static_cast(op->constant_allocation_size()); shared_memory_per_block_ += size * op->dtype.bytes() * op->dtype.lanes(); } - - if (op->dtype.lanes() > 1) { - valid_ &= op->dtype.lanes() * op->dtype.bytes() <= static_cast(max_vector_bytes_); - } } void VisitStmt_(const AttrStmtNode* op) final { @@ -135,18 +129,6 @@ class GPUCodeVerifier : public StmtExprVisitor { } } - void VisitExpr_(const LoadNode* op) { - // Currently not able to check: - // if the index expression failed to be simplified to a Ramp - if (op->index->IsInstance()) { - if (op->dtype.lanes() > 1) { - valid_ &= op->dtype.lanes() * op->dtype.bytes() <= - static_cast(max_vector_bytes_); - } - } - ExprVisitor::VisitExpr_(op); - } - private: int nest_level_{0}; @@ -164,7 +146,6 @@ class GPUCodeVerifier : public StmtExprVisitor { size_t max_shared_memory_per_block_; size_t max_threads_per_block_; size_t max_thread_x_, max_thread_y_, max_thread_z_; - size_t max_vector_bytes_; bool valid_{true}; @@ -188,32 +169,27 @@ bool VerifyGPUCode(const PrimFunc& func, Map constraints) { int64_t max_thread_x = INT64_MAX; int64_t max_thread_y = INT64_MAX; int64_t max_thread_z = INT64_MAX; - int64_t max_vector_bytes = INT64_MAX; for (auto iter : constraints) { const IntImmNode* val = iter.second.as(); - if (iter.first == "max_local_memory_per_block") { + if (iter.first == "max_local_memory_per_block") max_local_memory_per_block = val->value; - } else if (iter.first == "max_shared_memory_per_block") { + else if (iter.first == "max_shared_memory_per_block") max_shared_memory_per_block = val->value; - } else if (iter.first == "max_threads_per_block") { + else if (iter.first == "max_threads_per_block") max_threads_per_block = val->value; - } else if (iter.first == "max_thread_x") { + else if (iter.first == "max_thread_x") max_thread_x = val->value; - } else if (iter.first == "max_thread_y") { + else if (iter.first == "max_thread_y") max_thread_y = val->value; - } else if (iter.first == "max_thread_z") { + else if (iter.first == "max_thread_z") max_thread_z = val->value; - } else if (iter.first == "max_vector_bytes") { - max_vector_bytes = val->value; - } else { + else LOG(FATAL) << "Invalid check item: " << iter.first; - } } return verifier.Verify(func->body, max_local_memory_per_block, max_shared_memory_per_block, - max_threads_per_block, max_thread_x, max_thread_y, max_thread_z, - max_vector_bytes); + max_threads_per_block, max_thread_x, max_thread_y, max_thread_z); } TVM_REGISTER_GLOBAL("tir.analysis.verify_gpu_code").set_body_typed(VerifyGPUCode); diff --git a/src/tir/transforms/unroll_loop.cc b/src/tir/transforms/unroll_loop.cc index 4f1078165f34..a15190665949 100644 --- a/src/tir/transforms/unroll_loop.cc +++ b/src/tir/transforms/unroll_loop.cc @@ -43,7 +43,6 @@ struct UnrollLoopConfigNode : public tvm::AttrsNode { int auto_max_depth; int auto_max_extent; int explicit_unroll; - int explicit_unroll_max_extent; TVM_DECLARE_ATTRS(UnrollLoopConfigNode, "tir.transform.UnrollLoopConfig") { TVM_ATTR_FIELD(auto_max_step) @@ -58,9 +57,6 @@ struct UnrollLoopConfigNode : public tvm::AttrsNode { TVM_ATTR_FIELD(explicit_unroll) .describe("Whether to explicitly unroll the loop instead of setting a pragma") .set_default(true); - TVM_ATTR_FIELD(explicit_unroll_max_extent) - .describe("The maximum extent of a loop that can be unrolled explicitly (-1 for infinite)") - .set_default(32); } }; @@ -75,12 +71,11 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.UnrollLoop", UnrollLoopConfig); class LoopUnroller : public StmtExprMutator { public: explicit LoopUnroller(int auto_max_step, int auto_max_depth, int auto_max_extent, - bool explicit_unroll, int explicit_unroll_max_extent) + bool explicit_unroll) : auto_max_step_(auto_max_step), auto_max_depth_(auto_max_depth), auto_max_extent_(auto_max_extent), - explicit_unroll_(explicit_unroll), - explicit_unroll_max_extent_(explicit_unroll_max_extent) {} + explicit_unroll_(explicit_unroll) {} Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == "pragma_auto_unroll_max_step") { @@ -170,12 +165,6 @@ class LoopUnroller : public StmtExprMutator { // For loop must have a constant integer extent CHECK_NE(value, -1) << "loop doesn't have a constant integer extent"; if (value == 0) return Evaluate(0); - if (explicit_unroll_max_extent_ > 0 && value > explicit_unroll_max_extent_ && - explicit_unroll_) { - // Do not unroll too long loops - ForType for_type = op->for_type == ForType::Unrolled ? ForType::Serial : op->for_type; - return For(op->loop_var, op->min, op->extent, for_type, op->device_api, op->body); - } Stmt body = op->body; Map vmap; Array unrolled; @@ -208,10 +197,7 @@ class LoopUnroller : public StmtExprMutator { // max extent of loop to auto unroll // this not not count the total steps, only count the number of loops int auto_max_extent_; - // Whether to explicitly unroll the loop instead of setting a pragma bool explicit_unroll_; - // The maximum extent of a loop that can be unrolled explicitly (-1 means infinite) - int explicit_unroll_max_extent_; // Number of normal loops in scope int normal_loop_depth_{0}; // number of unrolled cases in current scope. @@ -224,7 +210,7 @@ class LoopUnroller : public StmtExprMutator { Stmt UnrollLoop(Stmt stmt, UnrollLoopConfig cfg) { Stmt ret = LoopUnroller(cfg->auto_max_step, cfg->auto_max_depth, cfg->auto_max_extent, - cfg->explicit_unroll, cfg->explicit_unroll_max_extent)(stmt); + cfg->explicit_unroll)(stmt); if (!ret.same_as(stmt)) { return ConvertSSA(ret); } else { diff --git a/tests/python/unittest/test_ansor_feature.py b/tests/python/unittest/test_ansor_feature.py deleted file mode 100644 index 705556c65edf..000000000000 --- a/tests/python/unittest/test_ansor_feature.py +++ /dev/null @@ -1,150 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Test feature extraction""" - -import math -import tempfile - -import tvm -from tvm import te, ansor - -from test_ansor_common import matmul_ansor_test - - -def fequal(a, b): - return math.fabs(a - b) < 1e-6 - - -def test_cpu_matmul(): - dag = ansor.ComputeDAG(matmul_ansor_test(512, 512, 512)) - s = dag.get_init_state() - C = s.stage_ops[2] - - i, j, k = s[C].iters - io, ii = s.split(C, i, [16]) - jo, ji = s.split(C, j, [8]) - s.reorder(C, [io, jo, k, ji, ii]) - s.vectorize(C, ji) - s.parallel(C, io) - s.parallel(C, jo) - s.unroll(C, k) - - target = tvm.target.create('llvm') - task = ansor.SearchTask(dag, "test", target) - names = ansor.feature.get_per_stmt_feature_names() - fea = ansor.feature.get_per_stmt_features_from_states([s], task)[0] - - stage_0 = fea[0] - assert len(stage_0) == len(names), "%d vs %d" % (len(stage_0), len(names)) - fea_dict = {} - for name, value in zip(names, stage_0): - fea_dict[name] = value - - for name in ["B0", "B1", "B2"]: - if fequal(fea_dict[name + ".acc_type.kReadWrite"], 1.0): - c_name = name - if fequal(fea_dict[name + ".acc_type.kRead"], 1.0): - if fequal(fea_dict[name + ".stride"], 0.0): - b_name = name - else: - a_name = name - - assert fequal(fea_dict[c_name + ".bytes"], math.log2(512 ** 3 * 4 + 1)) - assert fequal(fea_dict[b_name + ".unique_bytes"], math.log2(512 ** 2 * 4 + 1)) - assert fequal(fea_dict[c_name + ".reuse_dis_iter"], math.log2(8 * 16 + 1)) - assert fequal(fea_dict[c_name + ".reuse_dis_bytes"], math.log2((8 * 16 + 8 + 16) * 4 + 1)) - assert fequal(fea_dict[c_name + ".reuse_ct"], math.log2(512 + 1)) - - assert fequal(fea_dict["unroll_num"], math.log2(1 + 1)) - # assert fequal(fea_dict["unroll_type.kPosInnerReduce"], 1.0) - assert fequal(fea_dict["vec_num"], math.log2(1 + 1)) - assert fequal(fea_dict["parallel_num"], math.log2(2 + 1)) - assert fequal(fea_dict["parallel_prod"], math.log2((512 * 512 / 16 / 8) + 1)) - - -def test_cpu_fusion(): - def fusion_test(N, M): - A = te.placeholder((N, M), name='A') - B = te.compute((N, M), lambda i, j: A[i][j], name='B') - C = te.compute((N, M), lambda i, j: B[i][j], name='C') - return [A, B, C] - - dag = ansor.ComputeDAG(fusion_test(64, 32)) - s = dag.get_init_state() - s.compute_at(1, 2, s.stages[2].iters[1]) - - target = tvm.target.create('llvm') - task = ansor.SearchTask(dag, "test", target) - names = ansor.feature.get_per_stmt_feature_names() - fea = ansor.feature.get_per_stmt_features_from_states([s], task)[0] - - found = False - for stage_fea in fea: - for i, (name, value) in enumerate(zip(names, stage_fea)): - if 'reuse_type.kSerialMultipleReadWrite' in name and value > 0.5: - assert fequal(stage_fea[i + 2], 1.0) - assert fequal(stage_fea[i + 3], math.log2(16 + 1)) - found = True - assert found - - -def test_gpu_feature(): - ctx = tvm.context("cuda", 0) - if not ctx.exist: - return - - json_records = "\n".join(( - """{"i": [["[\\"matmul_ansor_test\\", 512, 512, 512]", "cuda"], [[], [["CHW", 2, "local"], ["SP", 2, 0, 512, [1, 16, 32, 1], 1], ["SP", 2, 5, 512, [4, 1, 1, 16], 1], ["SP", 2, 10, 512, [1, 2], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 3, 0, 1, 3], ["FSP", 3, 4, 2, 3], ["RE", 3, [0, 4, 1, 5, 2, 6, 3, 7]], ["FU", 2, [0, 1]], ["FU", 3, [0, 1]], ["FU", 2, [1, 2]], ["FU", 3, [1, 2]], ["FU", 2, [2, 3]], ["FU", 3, [2, 3]], ["CA", 2, 3, 2], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 3], ["FU", 2, [0, 1]], ["FFSP", 2, 0, [1, 2], 1, 1], ["AN", 2, 1, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 3], ["FU", 1, [0, 1]], ["FFSP", 1, 0, [1, 2], 1, 1], ["AN", 1, 1, 6], ["AN", 5, 0, 5], ["AN", 5, 1, 4], ["AN", 5, 2, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00536798], 0, 2.49277, 1585564852], "v": "v0.1"}""", - )) - - # load states - with tempfile.NamedTemporaryFile(mode='w') as f: - f.write(json_records) - f.flush() - inputs, results = ansor.LogReader(f.name).read_lines() - - inp = inputs[0] - dag = ansor.workload_key_to_dag(inp.task.workload_key) - task = ansor.SearchTask(dag, inp.task.workload_key, inp.task.target, None, ansor.HardwareParams(100000, 16, 64, 4, 64)) - - state = ansor.serialization.get_states_from_measure_inputs(inputs, task)[0] - state = dag.infer_bound_from_state(state) - fea = ansor.feature.get_per_stmt_features_from_states([state], task)[0] - names = ansor.feature.get_per_stmt_feature_names() - - # build feature dict - fea_dicts = [] - for i in range(len(fea)): - tmp_dict = {} - for j in range(len(names)): - tmp_dict[names[j]] = fea[i][j] - fea_dicts.append(tmp_dict) - - # check values - assert fequal(fea_dicts[0]['blockIdx_x_len'], math.log2(8 + 1)) - assert fequal(fea_dicts[0]['vthread_len'], math.log2(4 + 1)) - assert fequal(fea_dicts[1]['threadIdx_x_len'], math.log2(16 + 1)) - assert fequal(fea_dicts[0]['threadIdx_y_len'], math.log2(1 + 1)) - assert fequal(fea_dicts[2]['blockIdx_z_len'], math.log2(1 + 1)) - assert fequal(fea_dicts[0]['is_gpu'], 1.0) - - -if __name__ == "__main__": - test_cpu_matmul() - test_cpu_fusion() - test_gpu_feature() diff --git a/tests/python/unittest/test_ansor_relay_integration.py b/tests/python/unittest/test_ansor_relay_integration.py deleted file mode 100644 index 1ad507e2f371..000000000000 --- a/tests/python/unittest/test_ansor_relay_integration.py +++ /dev/null @@ -1,114 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" Test Relay Integration """ - -import tempfile -import numpy as np - -import tvm -from tvm import ansor, relay -import tvm.contrib.graph_runtime as runtime -from tvm.relay.testing import dqn - -def test_tune_dense_graph(): - def dense_graph(N, dtype="float32"): - ori_data = relay.var("data", shape=(N, N), dtype=dtype) - weight = relay.var("weight", shape=(N, N), dtype=dtype) - data = relay.multiply(ori_data, relay.const(2, dtype=dtype)) - dense = relay.nn.dense(data, weight, out_dtype=dtype) - dense = relay.add(dense, weight) - dense = relay.nn.dense(dense, weight, out_dtype=dtype) - return ori_data, weight, dense - - N = 128 - data, weight, dense = dense_graph(N) - mod = relay.Function([data, weight], dense) - mod = tvm.IRModule.from_expr(mod) - - ctx = tvm.context("llvm") - target = tvm.target.create("llvm") - d = tvm.nd.array(np.random.uniform(size=(N, N)).astype(data.type_annotation.dtype), ctx) - w = tvm.nd.array(np.random.uniform(size=(N, N)).astype(weight.type_annotation.dtype), ctx) - wkl_keys, wkl_weights = ansor.extract_from_program(mod, {}, target=target) - - assert len(wkl_keys) == 2 - assert len(wkl_weights) == 2 - - tasks = [] - for wkl_key in wkl_keys: - dag = ansor.workload_key_to_dag(wkl_key) - tasks.append(ansor.SearchTask(dag, wkl_key, target)) - - tuner = ansor.SimpleTaskScheduler(tasks) - measure_ctx = ansor.LocalRPCMeasureContext() - with tempfile.NamedTemporaryFile() as fp: - tuner.tune(ansor.TuneOption(n_trials=2, runner=measure_ctx.runner, - measure_callbacks=[ansor.LogToFile(fp.name)])) - with ansor.apply_history_best(fp.name): - with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - graph, lib, opt_params = relay.build_module.build( - mod, target=target) - - m = runtime.create(graph, lib, ctx) - m.set_input('data', d) - m.set_input('weight', w) - m.run() - res = m.get_output(0) - - del measure_ctx - - d = d.asnumpy() - d = d * 2 - w = w.asnumpy() - d = np.dot(d, np.transpose(w)) - d = d + w - d = np.dot(d, np.transpose(w)) - - tvm.testing.assert_allclose(res.asnumpy(), d, rtol=1e-5) - - -def test_tune_dqn(): - mod, params = dqn.get_workload(1, image_shape=(84, 84, 4), layout='NHWC') - target = tvm.target.create('llvm') - - wkl_keys, wkl_weights = ansor.extract_from_program(mod, params, target) - - tasks = [] - for wkl_key in wkl_keys: - dag = ansor.workload_key_to_dag(wkl_key) - tasks.append(ansor.SearchTask(dag, wkl_key, target)) - - assert len(tasks) == 5 - - tuner = ansor.SimpleTaskScheduler(tasks) - measure_ctx = ansor.LocalRPCMeasureContext() - with tempfile.NamedTemporaryFile() as fp: - tuner.tune(ansor.TuneOption(n_trials=len(tasks), runner=measure_ctx.runner, - measure_callbacks=[ansor.LogToFile('tmp.json')]), - search_policy='sketch.random') - with ansor.apply_history_best('tmp.json'): - ansor.prepare_layout_rewrite(mod, params, target) - with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - graph, lib, opt_params = relay.build_module.build(mod, target=target) - ansor.finish_layout_rewrite() - - del measure_ctx - -if __name__ == "__main__": - test_tune_dense_graph() - test_tune_dqn() - diff --git a/tests/python/unittest/test_ansor_task_scheduler.py b/tests/python/unittest/test_ansor_task_scheduler.py deleted file mode 100644 index 53cf2059c1f3..000000000000 --- a/tests/python/unittest/test_ansor_task_scheduler.py +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Test the task scheduler """ - -import threading - -import tvm -from tvm import ansor - -from test_ansor_common import matmul_ansor_test - -def test_task_scheduler_basic(): - N = 128 - A, B, C = matmul_ansor_test(N, N, N) - dag = ansor.ComputeDAG([A, B, C]) - tgt = tvm.target.create("llvm") - task1 = ansor.SearchTask(dag, "test", tgt) - task2 = ansor.SearchTask(dag, "test", tgt) - - def basic_test_func(task1, task2): - def objective(costs): - return sum(costs) - - task_scheduler = ansor.SimpleTaskScheduler([task1, task2], objective) - tune_option = ansor.TuneOption(n_trials=3, runner='local') - task_scheduler.tune(tune_option) - - # Ansor search process with local runner has some modification on thread - # binding, wrap this to a subprocess to eliminate the impacts to other tests - t = threading.Thread(target=basic_test_func, - kwargs={'task1': task1, 'task2': task2}) - t.start() - t.join() - - -if __name__ == "__main__": - test_task_scheduler_basic() diff --git a/tests/python/unittest/test_tir_transform_unroll_loop.py b/tests/python/unittest/test_tir_transform_unroll_loop.py index 12c686634548..68639940bb05 100644 --- a/tests/python/unittest/test_tir_transform_unroll_loop.py +++ b/tests/python/unittest/test_tir_transform_unroll_loop.py @@ -110,31 +110,7 @@ def test_unroll_single_count_loops(): ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body assert ret == stmt -def test_unroll_explicitly_max_extent(): - n = 64 - A = te.placeholder((n,), name='A') - B = te.compute((n,), lambda *i: A(*i), name='B') - s = te.create_schedule(B.op) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt)) - - with tvm.transform.PassContext(config={ - "tir.UnrollLoop": {"explicit_unroll_max_extent": n-1} - }): - ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body - assert tvm.ir.structural_equal(ret, stmt) - - with tvm.transform.PassContext(config={ - "tir.UnrollLoop": {"explicit_unroll_max_extent": n} - }): - ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body - assert not tvm.ir.structural_equal(ret, stmt) - - if __name__ == "__main__": test_unroll_loop() test_unroll_fake_loop() test_unroll_single_count_loops() - test_unroll_explicitly_max_extent() diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 7dd782f5b622..e0e455667889 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -1295,75 +1295,6 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, name, tag); } -/*! - * \brief utility function for kernel_layout_transform - */ -inline void parse_kernel_layout(const String& layout, - Array* shape, - std::vector* axes) { - int32_t factor = 0; - std::string axis = ""; - for (char c : std::string(layout)) { - if (c >= 'A' && c <= 'z') { - axis += c; - if (factor != 0) { - shape->push_back(factor); - factor = 0; - } - } else if (c >= '0' && c <= '9') { - factor = factor * 10 + c - '0'; - if (!axis.empty()) { - axes->push_back(axis); - axis = ""; - } - } else { - LOG(FATAL) << "Invalid layout " << layout; - } - } - if (!axis.empty()) { - axes->push_back(axis); - } -} - -/*! - * \brief Transform the kernel layout according to \p src_layout and \p dst_layout - * \param src the source input. - * \param src_layout the source layout. - * \param dst_layout the destination layout. - * \param name output tensor name. - * \param tag output tensor tag. - * \return A tensor with shape in \p dst_layout - */ -inline Tensor kernel_layout_transform(const Tensor& src, - const String& src_layout, - const String& dst_layout, - const String name = "T_kernel_layout_trans", - const String tag = kInjective) { - Array src_shape; - std::vector src_axes; - Array dst_shape; - std::vector dst_axes; - - parse_kernel_layout(src_layout, &src_shape, &src_axes); - parse_kernel_layout(dst_layout, &dst_shape, &dst_axes); - return compute( - dst_shape, [&](const Array& dst_indices) { - Array dst_indices_expr(dst_indices.begin(), dst_indices.end()); - Array src_indices; - for (const std::string& src_axis : src_axes) { - PrimExpr src_index = 0; - CHECK_EQ(dst_indices_expr.size(), dst_axes.size()); - for (size_t i = 0; i < dst_axes.size(); ++i) { - if (dst_axes[i] == src_axis) { - src_index = src_index * dst_shape[i] + dst_indices_expr[i]; - } - } - src_indices.push_back(src_index); - } - return src(src_indices); - }, name, tag); -} - /*! * \brief Get the shape of input tensor. * \param src the input tensor. diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 6800129c12aa..4c7941b49692 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -20,7 +20,7 @@ from __future__ import absolute_import as _abs from collections import namedtuple import tvm -from tvm import te, ansor +from tvm import te from .pad import pad from .util import get_pad_tuple @@ -342,37 +342,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): dilation_h, dilation_w = dilation batch, in_height, in_width, in_channel = Input.shape - if ansor.GLOBAL_SCOPE.topi_in_compute_rewrite_mode: - # infer shape for the rewritten layout - if len(Filter.shape) >= 10: - # For cpu tile structure SSRSRS - base = len(Filter.shape) - 10 - kernel_h = Filter.shape[2 + base] * Filter.shape[6 + base] - kernel_w = Filter.shape[3 + base] * Filter.shape[7 + base] - channel = Filter.shape[4 + base] * Filter.shape[8 + base] - num_filter = Filter.shape[5 + base] * Filter.shape[9 + base] - for i in range(base + 2): - num_filter *= Filter.shape[i] - elif len(Filter.shape) == 6: - # For cpu tile structure SRS - num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[5] - kernel_h = Filter.shape[2] - kernel_w = Filter.shape[3] - channel = Filter.shape[4] - elif len(Filter.shape) == 5: - # For cpu tile structure SRS - num_filter = Filter.shape[0] * Filter.shape[4] - kernel_h = Filter.shape[1] - kernel_w = Filter.shape[2] - channel = Filter.shape[3] - elif len(Filter.shape) == 4: - num_filter, kernel_h, kernel_w, channel = Filter.shape - else: - raise ValueError("Don't know how to infer layout for filter shape: %s. " \ - "You can add a new branch for it to fix this." % str(Filter)) - else: - kernel_h, kernel_w, channel, num_filter = Filter.shape - + kernel_h, kernel_w, channel, num_filter = Filter.shape # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 @@ -392,9 +362,8 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): lambda nn, yy, xx, ff: te.sum( PaddedInput[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * - Filter[ry, rx, rc, ff].astype(out_dtype) - , axis=[ry, rx, rc]), - name="Conv2dOutput", tag="conv2d_nhwc", attrs={"layout_free_placeholders": [Filter]}) + Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]), + name="Conv2dOutput", tag="conv2d_nhwc") return Output diff --git a/tutorials/ansor/README.txt b/tutorials/ansor/README.txt deleted file mode 100644 index 85b6ba401dae..000000000000 --- a/tutorials/ansor/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. _tutorial-ansor-auto-schedule: - -Ansor: Template Free Auto Scheduling ------------------------------------- diff --git a/tutorials/ansor/tune_conv2d_cuda.py b/tutorials/ansor/tune_conv2d_cuda.py deleted file mode 100644 index 03f1b24a768e..000000000000 --- a/tutorials/ansor/tune_conv2d_cuda.py +++ /dev/null @@ -1,179 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling High Performance Convolution on NVIDIA GPUs -=========================================================== -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_, \ - `Minmin Sun `_, \ - `Zhao Wu `_ - -This is an tutorial for searching high performance schedule for NVIDIA GPU using -Ansor auto-scheduler. By running Ansor on this template, we can outperform the -vendor provided library CuDNN in many cases. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use autotvm package in tvm, we need to install some extra dependencies. -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost tornado -# -# To make TVM run faster in tuning, it is recommended to use cython -# as FFI of tvm. In the root directory of tvm, execute -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. - -import random -import sys - -import numpy as np -import tvm -import topi -from topi.testing import conv2d_nchw_python -from tvm import te - -# the module is called `ansor` -from tvm import ansor - -###################################################################### -# Step 1: Define the search task -# ------------------------------- -# There are plenty of useful schedule primitives in tvm. You can also find -# some tutorials that describe them in more details, such as -# (1). :ref:`opt-conv-gpu` -# (2). `Optimizing DepthwiseConv on NVIDIA GPU `_ -# -# It's usually a hard job if one wants to get a high performance schedule for a -# specific workload. Even writing an AutoTVM tunable template needs user to have -# expertises on how each schedule primitive works as well as how they finally -# reflect on the hardward architecture. -# -# However, with Ansor this will be quite simple. Firstly, define the target workload. -# Both :code:`tvm.te` API or topi op API are fine to be used. -# -# We can use the retuned :code:`Tensors` to create a ComputeDAG just like what we do -# in :ref:`ansor-simple-subgraph`, while the way to use workload registry is more -# recommended. - -# Use an extra function decorator to regist this workload -@ansor.register_workload_func -def conv2d_nchw(N, H, W, CO, CI, KH, KW, stride, padding): - data = te.placeholder((N, CI, H, W), name='data') - kernel = te.placeholder((CO, CI, KH, KW), name='kernel') - conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32') - - return [data, kernel, conv] - -###################################################################### -# Step 2: Search through the schedule space -# ------------------------------------------ -# We pick the last layer on resnet as test case. -# Since our space is very large, :code:`XGBModel` is most suitable -# for our case. Here we only do 20 trials for demonstration. -# In practice, making 1000 trials usually can find some good kernels -# for this workload. - -tgt = tvm.target.cuda() - -# The last layer in resnet -N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) -# Generate workload key with the ansor API -wkl_key = ansor.make_workload_key_func(conv2d_nchw, (N, H, W, CO, CI, KH, KW, strides, padding)) -# Generate ComputeDAG using the workload key -dag = ansor.workload_key_to_dag(wkl_key) -task = ansor.SearchTask(dag, wkl_key, target=tgt) - -log_file = "conv2d_nchw.json" -seed = 0 -random.seed(seed) -cost_model = ansor.XGBModel(seed=seed) -search_policy = ansor.SketchSearchPolicy(cost_model, seed=seed) - -######################################################################### -# The :code:`ansor.LocalRPCMeasureContext` is used to create a RPC runner environment. -# -# Use local gpu, measure 10 times for every schedule to reduce variance. The timeout -# for each running is set to 4 seconds. -# -# During the searching process, we may generate several invalid schedules and they -# will be filtered out. It's fine to see "Encountered errors during feature extraction." -# in the tuning logs. -# :code:`ansor.LogToFile` callback will log the tuning results into a -# log file, which can be used to get the best config later. -# :code:`ansor.PreloadMeasuredStates` callback will load measured states -# from history log before schedule search, we can add this callback to make -# sure a same schedule will never be measured for multiple times. - -measure_ctx = ansor.LocalRPCMeasureContext(repeat=3, min_repeat_ms=100, timeout=4) -tune_option = ansor.TuneOption(n_trials=20, - runner=measure_ctx.runner, - measure_callbacks=[ansor.LogToFile(log_file)], - pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)]) -s, arg_bufs = ansor.auto_schedule(task, search_policy=search_policy, tune_option=tune_option) - -print("==== Get Lowered Stmt ====") -print(tvm.lower(s, arg_bufs, simple_mode=True)) - -# Release the RPC runner environment -del measure_ctx - -######################################################################### -# From the example lower result showed above, we can see that Ansor has tried -# techniques such as `Shared Memory Cooperative Fetching`, `Kernel Fusion`, -# `Axis unroll`, `Axis Vectorize` and so on. There is no need for users to care -# about the details, and Ansor will catch them well. -# -# Finally we can directly use the returned result to get the generated schedule, -# while in the following tutorial we'll show how to inspect the best config from -# log file, check correctness, and measure running time. - -# Get history best from log file -inp, res = ansor.best_measure_pair_in_file(log_file) -# Get the task ComputeDAG from log result -dag = ansor.workload_key_to_dag(inp.task.workload_key) -# Apply log result to TVM schedule -s, arg_bufs = dag.apply_steps_from_state(inp.state) -func = tvm.build(s, arg_bufs, target=tgt) - -# check correctness -a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) -w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) -c_np = conv2d_nchw_python(a_np, w_np, strides, padding) - -ctx = tvm.gpu() -a_tvm = tvm.nd.array(a_np, ctx=ctx) -w_tvm = tvm.nd.array(w_np, ctx=ctx) -c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx) -func(a_tvm, w_tvm, c_tvm) - -tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) - -# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise -# and the overhead of kernel launch. You can also use nvprof to validate the result. -evaluator = func.time_evaluator(func.entry_name, ctx, number=400) -print('Time cost of this operator: %f s' % evaluator(a_tvm, w_tvm, c_tvm).mean) - diff --git a/tutorials/ansor/tune_simple_subgraph.py b/tutorials/ansor/tune_simple_subgraph.py deleted file mode 100644 index 00bef82cf855..000000000000 --- a/tutorials/ansor/tune_simple_subgraph.py +++ /dev/null @@ -1,193 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _ansor-simple-subgraph: - -Writing compute expression and Using Ansor auto-scheduler -========================================================= -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_, \ - `Minmin Sun `_, \ - `Zhao Wu `_ - -This is an introduction tutorial to the auto-scheduler module in TVM. - -There are two steps in auto-scheduling. -The first step is defining the target task. -The second step is running a search algorithm to auto explore the schedule. -In this tutorial, you can learn how to perform these two steps in TVM. -The whole workflow is illustrated by a matrix multiplication with bias add example. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use Ansor package in TVM, we need to install some extra dependencies. -# This step (installing xgboost) can be skipped as it doesn't need XGBoost -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost -# -# To make TVM run faster in tuning, it is recommended to use cython -# as FFI of TVM. In the root directory of TVM, execute -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. - -import random -import sys - -import numpy as np -import tvm -from tvm import te - -# the module is called `ansor` -from tvm import ansor - -###################################################################### -# Step 1: Define the target compute subgraph -# ------------------------------------------- -# In this section, we will write a deterministic TVM compute expression code -# to a compute subgraph. -# -# .. note:: Comparing to :ref:`tutorials-autotvm-sec` -# -# In Ansor, we do not need users to provide a schedule template, the only input -# is the compute expression writing by :code:`tvm.te` API or topi op API. -# -# Here is how we implement a matrix multiplication subgraph in TVM. - -# Matmul with bias add -def matmul_add(N, L, M, dtype): - A = te.placeholder((N, L), name='A', dtype=dtype) - B = te.placeholder((L, M), name='B', dtype=dtype) - C = te.placeholder((N, M), name='C', dtype=dtype) - - k = te.reduce_axis((0, L), name='k') - mul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), - name='Mul') - D = te.compute((N, M), lambda i, j: C[i, j] + mul[i, j], name='D') - - return [A, B, C, D] - -###################################################################### -# Step 2: Search through the schedule space -# ------------------------------------------ -# In step 1, we build the compute subgraph. -# The next step is to pick a cost model as well as a search policy and explore the -# possible schedule. -# -# Auto-scheduler in TVM -# ^^^^^^^^^^^^^^^^^^^^^ -# The job for the Ansor auto-scheduler can be described by following pseudo code -# -# .. code-block:: c -# -# ct = 0 -# while ct < max_number_of_trials: -# auto generate a batch of schedules -# measure this batch of schedules on real hardware and get results -# ct += batch_size -# -# When proposing the next batch of schedules, Ansor can take different cost models to -# guide the schedule generating process. -# -# * :code:`RandomModel`: Generate and take new schedule randomly -# * :code:`XGBModel`: Use XGBoost model to estimate the performance of potential schedules, try to pick schedules with better performance in each step -# -# XGBModel can explore more efficiently and find better schedules. - -################################################################ -# Begin tuning -# ^^^^^^^^^^^^ -# Here we continue our matrix multiplication example. -# -# The :code:`ansor.ComputeDAG` takes the Tensor list as input, and generates -# a dag structure. During which process, :code:`ansor.ComputeDAG` will -# do some analyzes with the target subgraph and the results will be used in -# search policy later. -# -# Then we create the :code:`tvm.target` and a tuning task. - -N, L, M = 128, 128, 128 -A, B, C, D = matmul_add(N, L, M, 'float32') -dag = ansor.ComputeDAG([A, B, C, D]) - -print(dag) -print(dag.access_analyzer) - -tgt = tvm.target.create("llvm") -task = ansor.SearchTask(dag, "test", tgt) - -################################################################ -# Next, we choose random model and create a default search policy: -# :code:`ansor.SketchSearchPolicy`. -# -# We only make 5 trials in this tutorial for demonstration. In practice, -# you can do more trials according to your time budget. -# :code:`ansor.LogToFile` callback will log the tuning results into a -# log file, which can be used to get the best config later. -# :code:`ansor.PreloadMeasuredStates` callback will load measured states -# from history log before schedule search, we can add this callback to make -# sure a same schedule will never be measured for multiple times. - -log_file = "matmul_add.json" - -seed = 0 -random.seed(seed) -cost_model = ansor.RandomModel() -search_policy = ansor.SketchSearchPolicy(cost_model, seed=seed) - -tune_option = ansor.TuneOption(n_trials=5, - measure_callbacks=[ansor.LogToFile(log_file)], - pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)]) - -################################################################ -# Then just call :code:`ansor.auto_schedule` and Ansor will try to find a high -# performance schedule for the target subgraph automatically. -# -# The returned result will be a :code:`te.schedule` and a list of :code:`te.Tensor`, -# which can be used as the input of :code:`tvm.lower` or :code:`tvm.build`. - -s, arg_bufs = ansor.auto_schedule(task, search_policy=search_policy, - tune_option=tune_option) - -print("==== Get Lowered Stmt ====") -print(tvm.lower(s, arg_bufs, simple_mode=True)) - -######################################################################### -# Check the correctness to make sure we generate a right schedule. - -func = tvm.build(s, arg_bufs) - -# check correctness -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = np.random.uniform(size=(N, M)).astype(np.float32) -d_np = a_np.dot(b_np) + c_np - -d_tvm = tvm.nd.empty(d_np.shape) -func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm) - -tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-2) diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt index 4ad36c000e3c..38e3b3343f4e 100644 --- a/tutorials/autotvm/README.txt +++ b/tutorials/autotvm/README.txt @@ -1,4 +1,4 @@ .. _tutorials-autotvm-sec: -AutoTVM: Template Based Auto Tuning ------------------------------------ +Auto tuning +-----------