From b03f11dfde4566ffeed2b473c3d6e8bd8aea557f Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 17 May 2022 09:48:04 -0700 Subject: [PATCH 01/59] [Hexagon]Use requires_hexagon instead of requires_hexagon_toolchain if running on hexagon target (#11294) * refactor requires_hexagon_toolchain * trigger * lint --- python/tvm/testing/utils.py | 4 ++++ .../contrib/test_hexagon/benchmark_hexagon.py | 12 +++--------- .../test_hexagon/test_2d_physical_buffers.py | 10 +++++++--- .../python/contrib/test_hexagon/test_launcher.py | 16 +++++++--------- tests/python/contrib/test_hexagon/test_models.py | 6 ++---- .../contrib/test_hexagon/test_run_unit_tests.py | 9 +++++---- .../contrib/test_hexagon/test_thread_pool.py | 13 +++---------- .../test_hexagon/topi/test_batch_matmul.py | 6 +++--- .../test_hexagon/topi/test_cache_read_write.py | 7 +++---- .../test_hexagon/topi/test_conv2d_nchw.py | 4 +--- .../test_hexagon/topi/test_conv2d_nhwc.py | 5 +---- .../test_hexagon/topi/test_conv2d_transpose.py | 4 +--- .../contrib/test_hexagon/topi/test_dense.py | 4 +--- .../test_hexagon/topi/test_depthwise_conv2d.py | 3 +-- .../contrib/test_hexagon/topi/test_pooling.py | 16 +++++++--------- .../contrib/test_hexagon/topi/test_reduce.py | 4 +--- .../contrib/test_hexagon/topi/test_softmax.py | 4 +--- 17 files changed, 51 insertions(+), 76 deletions(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index b86596feed6b..8be5cc8ec471 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -404,6 +404,10 @@ def _get_targets(target_str=None): if target_kind == "cuda" and "cudnn" in tvm.target.Target(target).attrs.get("libs", []): is_enabled = tvm.support.libinfo()["USE_CUDNN"].lower() in ["on", "true", "1"] is_runnable = is_enabled and cudnn.exists() + elif target_kind == "hexagon": + is_enabled = tvm.support.libinfo()["USE_HEXAGON"].lower() in ["on", "true", "1"] + # If Hexagon has compile-time support, we can always fall back + is_runnable = 
is_enabled and "ANDROID_SERIAL_NUMBER" in os.environ else: is_enabled = tvm.runtime.enabled(target_kind) is_runnable = is_enabled and tvm.device(target_kind).exist diff --git a/tests/python/contrib/test_hexagon/benchmark_hexagon.py b/tests/python/contrib/test_hexagon/benchmark_hexagon.py index f17530c3efdc..979bd111707b 100644 --- a/tests/python/contrib/test_hexagon/benchmark_hexagon.py +++ b/tests/python/contrib/test_hexagon/benchmark_hexagon.py @@ -27,13 +27,7 @@ import tvm.testing from tvm import te -from tvm import relay -from tvm.relay.backend import Executor, Runtime -from tvm.contrib import utils, ndk -from tvm.contrib.hexagon.build import HexagonLauncher -import tvm.contrib.hexagon as hexagon - -from .conftest import requires_hexagon_toolchain +from tvm.contrib.hexagon.build import HexagonLauncherRPC RPC_SERVER_PORT = 7070 @@ -47,8 +41,8 @@ # server to bind to the same port until the wait time elapses. -@requires_hexagon_toolchain -def test_elemwise_add(android_serial_number, hexagon_launcher): +@tvm.testing.requires_hexagon +def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC): """ Starting with an elementwise-add computation, try various schedules / optimizations to see the impact they have on performance. diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py index 9de55996b031..78e1eb11ad9f 100644 --- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py +++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py @@ -19,8 +19,6 @@ import contextlib import sys -import tempfile -import pathlib import pytest import numpy as np @@ -272,6 +270,12 @@ def test_lower(self, schedule_args): @requires_hexagon_toolchain def test_build(self, schedule_args, target_host, input_layout, working_layout, output_layout): + """Testing build success/failure + + * On Hexagon targets, build must succeed for both 1-d and 2-d memory. 
+ * On non-Hexagon targets, build must succeed for 1-d memory. + * On non-Hexagon targets, build must fail and report an error for 2-d memory. + """ # contextlib.nullcontext wasn't added until python3.7, and the # CI currently runs on python3.6. Therefore, using ExitStack # to manage an optional context instead. @@ -292,7 +296,7 @@ def runtime_module(self, schedule_args, target_host): return tvm.build(*schedule_args, target=target_host) - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_execute( self, runtime_module, diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py index 7dadc8f2f4ab..5c5e8f6c39f1 100644 --- a/tests/python/contrib/test_hexagon/test_launcher.py +++ b/tests/python/contrib/test_hexagon/test_launcher.py @@ -25,10 +25,8 @@ from tvm.relay.backend import Executor, Runtime from tvm.contrib.hexagon.session import Session -from .conftest import requires_hexagon_toolchain - -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_add(hexagon_session: Session): dtype = "int8" A = tvm.te.placeholder((2,), dtype=dtype) @@ -53,7 +51,7 @@ def test_add(hexagon_session: Session): assert (C_data.numpy() == np.array([6, 7])).all() -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_add_vtcm(hexagon_session: Session): dtype = "int8" A = tvm.te.placeholder((2,), dtype=dtype) @@ -87,7 +85,7 @@ class TestMatMul: N = tvm.testing.parameter(32) K = tvm.testing.parameter(32) - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_matmul(self, hexagon_session, M, N, K): X = te.placeholder((M, K), dtype="float32") Y = te.placeholder((K, N), dtype="float32") @@ -122,7 +120,7 @@ def test_matmul(self, hexagon_session, M, N, K): tvm.testing.assert_allclose(zt.numpy(), ztcpu.numpy(), rtol=1e-4) -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_graph_executor(hexagon_session: Session): dtype = "float32" data = relay.var("data", 
relay.TensorType((1, 64, 64, 3), dtype)) @@ -178,7 +176,7 @@ def test_graph_executor(hexagon_session: Session): tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5) -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_graph_executor_multiple_conv2d(hexagon_session: Session): dtype = "float32" input_shape = (1, 8, 8, 3) @@ -255,7 +253,7 @@ def test_graph_executor_multiple_conv2d(hexagon_session: Session): tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5) -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_aot_executor(hexagon_session: Session, aot_host_target, aot_target): dtype = "float32" input_shape = (1, 128, 128, 3) @@ -314,7 +312,7 @@ def test_aot_executor(hexagon_session: Session, aot_host_target, aot_target): tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5) -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_aot_executor_multiple_conv2d(hexagon_session: Session, aot_host_target, aot_target): dtype = "float32" input_shape = (1, 8, 8, 3) diff --git a/tests/python/contrib/test_hexagon/test_models.py b/tests/python/contrib/test_hexagon/test_models.py index 649cc5b3f4dd..74f52f20d97c 100644 --- a/tests/python/contrib/test_hexagon/test_models.py +++ b/tests/python/contrib/test_hexagon/test_models.py @@ -24,8 +24,6 @@ from tvm.relay.backend import Executor, Runtime from tvm.contrib.hexagon.session import Session -from .conftest import requires_hexagon_toolchain - def get_mobilenet(): """Download and import mobilenet model with ONNX""" @@ -38,7 +36,7 @@ def get_mobilenet(): return onnx.load(model_path) -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_mobilenet(hexagon_session: Session): dtype = "float32" onnx_model = get_mobilenet() @@ -88,7 +86,7 @@ def test_mobilenet(hexagon_session: Session): enable_usmp = tvm.testing.parameter(False, True) -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon 
def test_mobilenet_aot(hexagon_session: Session, aot_host_target, aot_target, enable_usmp): if hexagon_session._launcher._serial_number == "simulator": pytest.skip(msg="Skip on simulator due to long runtime.") diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py index 3a383d30e5f4..010c79b8f554 100644 --- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py +++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py @@ -18,20 +18,21 @@ import os import pytest import numpy as np -from tvm.contrib.hexagon.build import HexagonLauncher -from .conftest import requires_hexagon_toolchain + +import tvm +from tvm.contrib.hexagon.session import Session # use pytest -sv to observe gtest output # use --gtest_args to pass arguments to gtest # for example to run all "foo" tests twice and observe gtest output run # pytest -sv --gtests_args="--gtest_filter=*foo* --gtest_repeat=2" -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon @pytest.mark.skipif( os.environ.get("HEXAGON_GTEST") == None, reason="Test requires environment variable HEXAGON_GTEST set with a path to a Hexagon gtest version normally located at /path/to/hexagon/sdk/utils/googletest/gtest", ) -def test_run_unit_tests(hexagon_session, gtest_args): +def test_run_unit_tests(hexagon_session: Session, gtest_args): try: func = hexagon_session._rpc.get_function("hexagon.run_unit_tests") except: diff --git a/tests/python/contrib/test_hexagon/test_thread_pool.py b/tests/python/contrib/test_hexagon/test_thread_pool.py index 8a35bff7e7c9..d95c4120b775 100644 --- a/tests/python/contrib/test_hexagon/test_thread_pool.py +++ b/tests/python/contrib/test_hexagon/test_thread_pool.py @@ -25,7 +25,6 @@ import tvm.testing from tvm import te -from .conftest import requires_hexagon_toolchain from tvm.script import tir as T @@ -67,11 +66,8 @@ def benchmark_func(mod, name, args, hexagon_session): return evaluator(a, b, c, n).mean 
-@requires_hexagon_toolchain -def test_speedup(hexagon_session, capsys): - if hexagon_session is None: - pytest.skip(msg="Skip hardware test, ANDROID_SERIAL_NUMBER is not set.") - +@tvm.testing.requires_hexagon +def test_speedup(hexagon_session: Session, capsys): target_hexagon = tvm.target.hexagon("v68", link_params=True) func = tvm.build( ElemwiseSumIRModule, target=tvm.target.Target(target_hexagon, host=target_hexagon) @@ -85,11 +81,8 @@ def test_speedup(hexagon_session, capsys): print("... speedup of {:.2f}".format(serial_mean / parallel_mean), end=" ") -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_elemwise_sum_parallel(hexagon_session: Session): - if hexagon_session is None: - pytest.skip(msg="Skip hardware test, ANDROID_SERIAL_NUMBER is not set.") - target_hexagon = tvm.target.hexagon("v68", link_params=True) func = tvm.build( ElemwiseSumIRModule, target=tvm.target.Target(target_hexagon, host=target_hexagon) diff --git a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py index 2816322b6d43..093ce37e5efa 100644 --- a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py +++ b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py @@ -25,8 +25,8 @@ from tvm.contrib.hexagon.session import Session import tvm.topi.testing from tvm.topi.utils import get_const_tuple +from tvm.contrib.hexagon.session import Session -from ..conftest import requires_hexagon_toolchain dtype = tvm.testing.parameter( "float32", @@ -46,7 +46,7 @@ class TestMatMulFloat: ) # TODO(mehrdadh): add dynamic testing - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_batch_matmul(self, hexagon_session: Session, x_batch, y_batch, M, N, K, dtype): if dtype == "float16": pytest.xfail("float16 is not supported.") @@ -98,7 +98,7 @@ class TestMatMulInt8: (5, 1, 16, 16, 32), ) - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_batch_matmul_int8(self, 
hexagon_session: Session, x_batch, y_batch, M, N, K): dtype = "int8" out_dtype = "int8" diff --git a/tests/python/contrib/test_hexagon/topi/test_cache_read_write.py b/tests/python/contrib/test_hexagon/topi/test_cache_read_write.py index bfb597f7b7f3..435ab7190752 100644 --- a/tests/python/contrib/test_hexagon/topi/test_cache_read_write.py +++ b/tests/python/contrib/test_hexagon/topi/test_cache_read_write.py @@ -21,8 +21,7 @@ import tvm.testing from tvm import te - -from ..conftest import requires_hexagon_toolchain +from tvm.contrib.hexagon.session import Session def intrin_mem_copy(shape, dtype, dst_scope, src_scope): @@ -98,7 +97,7 @@ def verify(hexagon_session: Session, s, x, y, z, size): np.testing.assert_equal(zt.numpy(), ref) -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_cache_read_write(hexagon_session: Session): size = 128 outer_shape = (size,) @@ -140,7 +139,7 @@ def layout_transform_2d(n): return [n // 16, te.AXIS_SEPARATOR, n % 16] -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_cache_read_write_2d(hexagon_session: Session): size = 128 outer_shape = (size,) diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py index b3d6832ffaa9..7f530a5c4d80 100644 --- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py @@ -27,8 +27,6 @@ from tvm.topi.utils import get_const_tuple from tvm.topi.nn.utils import get_pad_tuple -from ..conftest import requires_hexagon_toolchain - dtype = tvm.testing.parameter("float32") random_seed = tvm.testing.parameter(0) @@ -91,7 +89,7 @@ class BaseConv2DTests: dilation = tvm.testing.parameter(1) batch = tvm.testing.parameter(1) - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_conv2d_nchw( self, hexagon_session: Session, diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py 
b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py index 30b54d51348d..74a3f8dafa3e 100644 --- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py @@ -25,9 +25,6 @@ from tvm.contrib.hexagon.session import Session import tvm.topi.testing from tvm.topi.utils import get_const_tuple -from tvm.topi.nn.utils import get_pad_tuple - -from ..conftest import requires_hexagon_toolchain dtype = tvm.testing.parameter("float32") @@ -46,7 +43,7 @@ def ref_data(dtype, batch, in_channel, in_size, num_filter, kernel, stride, padd class BaseConv2DTests: - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_conv2d_nhwc( self, hexagon_session: Session, diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py index 0da740614f9d..629403965eae 100644 --- a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py @@ -22,9 +22,7 @@ from tvm import te from tvm import topi import tvm.topi.testing -from tvm.contrib.pickle_memoize import memoize from tvm.topi.utils import get_const_tuple -from ..conftest import requires_hexagon_toolchain # TODO Should add kernal to tvm.testing.fixture @@ -68,7 +66,7 @@ def shift_shape(output_padding): class BaseConv2DTransposeTests: - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_conv2d( self, hexagon_session: Session, diff --git a/tests/python/contrib/test_hexagon/topi/test_dense.py b/tests/python/contrib/test_hexagon/topi/test_dense.py index c63873a62d96..189b05fcaade 100644 --- a/tests/python/contrib/test_hexagon/topi/test_dense.py +++ b/tests/python/contrib/test_hexagon/topi/test_dense.py @@ -26,8 +26,6 @@ import tvm.topi.testing from tvm.topi.utils import get_const_tuple -from ..conftest import requires_hexagon_toolchain - random_seed = tvm.testing.parameter(0) use_bias = 
tvm.testing.parameter(True, False) @@ -68,7 +66,7 @@ def dense_ref_data(random_seed, batch_size, in_dim, out_dim, use_bias, in_dtype, return (a_np, b_np, c_np, d_np) -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_dense( hexagon_session: Session, batch_size, diff --git a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py index ab2ce36e1f82..63ae0e7b3253 100644 --- a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py +++ b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py @@ -28,7 +28,6 @@ from tvm import te, topi from tvm.topi.utils import get_const_tuple from tvm.topi.nn.utils import get_pad_tuple -from ..conftest import requires_hexagon_toolchain random_seed = tvm.testing.parameter(0) @@ -155,7 +154,7 @@ class BaseDepthwiseConv2D: (e.g. implemented only for llvm). """ - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_conv2d( self, hexagon_session: Session, diff --git a/tests/python/contrib/test_hexagon/topi/test_pooling.py b/tests/python/contrib/test_hexagon/topi/test_pooling.py index 38b7f387e5c6..9ce54bf9a6eb 100644 --- a/tests/python/contrib/test_hexagon/topi/test_pooling.py +++ b/tests/python/contrib/test_hexagon/topi/test_pooling.py @@ -26,8 +26,6 @@ import tvm.topi.testing from tvm.topi.utils import get_const_tuple -from ..conftest import requires_hexagon_toolchain - class TestAdaptivePool: dshape, out_size, pool_type, layout = tvm.testing.parameters( @@ -57,7 +55,7 @@ class TestAdaptivePool: ((1, 16, 32, 32, 32), (2, 4, 4), "max", "NDHWC"), ) - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_adaptive_pool(self, hexagon_session: Session, dshape, out_size, pool_type, layout): dtype = "float32" np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype) @@ -233,10 +231,10 @@ class TestPool1D: ([1, 31, 16], [3], [3], [3], [3, 0], "max", True, True, "NWC"), ) - 
@requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_pool1d( self, - hexagon_session, + hexagon_session: Session, input_shape, kernel, stride, @@ -310,10 +308,10 @@ class TestPool2D: ([1, 31, 31, 16], [3, 3], [3, 3], [2, 2], [3, 2, 1, 0], "max", True, True, "NHWC"), ) - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_pool2d( self, - hexagon_session, + hexagon_session: Session, input_shape, kernel, stride, @@ -708,10 +706,10 @@ class TestPool3D: ), ) - @requires_hexagon_toolchain + @tvm.testing.requires_hexagon def test_pool3d( self, - hexagon_session, + hexagon_session: Session, input_shape, kernel, stride, diff --git a/tests/python/contrib/test_hexagon/topi/test_reduce.py b/tests/python/contrib/test_hexagon/topi/test_reduce.py index beacb8cd1800..203a2bd31d6e 100644 --- a/tests/python/contrib/test_hexagon/topi/test_reduce.py +++ b/tests/python/contrib/test_hexagon/topi/test_reduce.py @@ -25,8 +25,6 @@ from tvm.contrib.hexagon.session import Session import tvm.topi.testing -from ..conftest import requires_hexagon_toolchain - in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters( ((32,), 0, False, "argmax", "float32"), @@ -101,7 +99,7 @@ def ref_data(in_shape, axis, keepdims, reduce_type, dtype): return in_npy, in_npy_map, out_npy -@requires_hexagon_toolchain +@tvm.testing.requires_hexagon def test_reduce_map( hexagon_session: Session, ref_data, in_shape, axis, keepdims, reduce_type, dtype ): diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax.py b/tests/python/contrib/test_hexagon/topi/test_softmax.py index 6857decabf95..7e734af7e026 100644 --- a/tests/python/contrib/test_hexagon/topi/test_softmax.py +++ b/tests/python/contrib/test_hexagon/topi/test_softmax.py @@ -26,8 +26,6 @@ import tvm.topi.testing from tvm.topi.utils import get_const_tuple -from ..conftest import requires_hexagon_toolchain - dtype = tvm.testing.parameter( "float16", "float32", @@ -54,7 +52,7 @@ ) -@requires_hexagon_toolchain 
+@tvm.testing.requires_hexagon def test_softmax(hexagon_session: Session, shape, dtype, softmax_operation): if dtype == "float16": pytest.xfail("float16 is not supported.") From 82086ed6bf347f61b58bac7e6bf93586c85fe9a6 Mon Sep 17 00:00:00 2001 From: Alan MacDonald Date: Tue, 17 May 2022 10:32:58 -0700 Subject: [PATCH 02/59] [docs][microtvm] fix command path in microTVM Reference Virtual Machines Running Tests documentation (#11333) --- gallery/how_to/work_with_microtvm/micro_reference_vm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery/how_to/work_with_microtvm/micro_reference_vm.py b/gallery/how_to/work_with_microtvm/micro_reference_vm.py index 773329405282..9eacd9a963e1 100644 --- a/gallery/how_to/work_with_microtvm/micro_reference_vm.py +++ b/gallery/how_to/work_with_microtvm/micro_reference_vm.py @@ -138,12 +138,12 @@ Running tests ============= -Once the VM has been provisioned, tests can executed using ``poetry``: +Once the VM has been provisioned, tests can be executed using ``poetry``: .. 
code-block:: bash $ cd apps/microtvm/reference-vm/zephyr - $ poetry run python3 ../../../../tests/micro/qemu/test_zephyr.py --zephyr-board=stm32f746g_disco + $ poetry run python3 ../../../../tests/micro/zephyr/test_zephyr.py --zephyr-board=stm32f746g_disco If you do not have physical hardware attached, but wish to run the tests using the local QEMU emulator running within the VM, run the following commands instead: @@ -152,7 +152,7 @@ $ cd /Users/yourusername/path/to/tvm $ cd apps/microtvm/reference-vm/zephyr/ - $ poetry run pytest ../../../../tests/micro/qemu/test_zephyr.py --zephyr-board=qemu_x86 + $ poetry run pytest ../../../../tests/micro/zephyr/test_zephyr.py --zephyr-board=qemu_x86 From 1c63c3db86e2c67948189579b71c35af1566edd3 Mon Sep 17 00:00:00 2001 From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com> Date: Tue, 17 May 2022 22:50:38 +0300 Subject: [PATCH 03/59] [Runtime][ThreadPool] Enhance CPU Affinity configuration for OpenMP case. (#11343) This commit allows to pin threads to cores when we use OMP. It enhances `tvm::runtime::threading::Configure` method to work with OMP and "kSpecify" affinity mode. --- src/runtime/thread_pool.cc | 55 +++++++++++++++++++++++++++++ tests/cpp/threading_backend_test.cc | 2 +- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index ef1369c7496f..4692e0673427 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -398,6 +398,57 @@ TVM_REGISTER_GLOBAL("runtime.NumThreads").set_body_typed([]() -> int32_t { }); namespace threading { + +#if TVM_THREADPOOL_USE_OPENMP +/*! + * \brief Helper function that allows to pin threads to cores in case of multi instance execution + * when we use OpenMP thread pool. + * + * \param mode Affinity mode (now supports only kSpecifyOneCorePerThread and + * kSpecifyThreadShareAllCore). + * \param nthreads The number of threads to use (0 = use all). 
+ * \param cpus A list of CPU ids to set 'cpu affinity'. + * + */ +static void ConfigureOMP(tvm::runtime::threading::ThreadGroup::AffinityMode mode, int nthreads, + const std::vector& cpus) { +#if defined(__linux__) || defined(__ANDROID__) + const int num_workers = MaxConcurrency(); + + if (mode == ThreadGroup::kSpecifyOneCorePerThread) { +#pragma omp parallel num_threads(num_workers) + { + int core_id = cpus[omp_get_thread_num()]; + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); +#if defined(__ANDROID__) + sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset); +#else + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); +#endif + } + } else if (mode == ThreadGroup::kSpecifyThreadShareAllCore) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + for (auto id : cpus) { + CPU_SET(id, &cpuset); + } + +#pragma omp parallel num_threads(num_workers) + { +#if defined(__ANDROID__) + sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset); +#else + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); +#endif + } + } +#endif +} + +#endif + void ResetThreadPool() { tvm::runtime::ThreadPool::ThreadLocal()->Reset(); } /*! 
* \brief configure the CPU id affinity @@ -410,7 +461,11 @@ void ResetThreadPool() { tvm::runtime::ThreadPool::ThreadLocal()->Reset(); } void Configure(tvm::runtime::threading::ThreadGroup::AffinityMode mode, int nthreads, std::vector cpus) { tvm::runtime::threading::SetMaxConcurrency(cpus.size()); +#if !TVM_THREADPOOL_USE_OPENMP tvm::runtime::ThreadPool::ThreadLocal()->UpdateWorkerConfiguration(mode, nthreads, cpus); +#else + ConfigureOMP(mode, nthreads, cpus); +#endif } int32_t NumThreads() { return tvm::runtime::ThreadPool::ThreadLocal()->NumThreads(); } } // namespace threading diff --git a/tests/cpp/threading_backend_test.cc b/tests/cpp/threading_backend_test.cc index db32623531b8..5adf1f9ae36c 100644 --- a/tests/cpp/threading_backend_test.cc +++ b/tests/cpp/threading_backend_test.cc @@ -169,7 +169,7 @@ TEST(ThreadingBackend, TVMBackendAffinityConfigure) { std::atomic acc(0); AffinityCheck ac(thread_pool_index, sys_max_concurrency, &acc); std::vector cpus; - std::cout << affinity_mode << std::endl; + LOG(INFO) << affinity_mode << std::endl; for (int k = 0; k < cpus_num_per_thread; k++) { cpus.push_back(thread_pool_index * cpus_num_per_thread + k); } From 2f7d732972f3605bd094609ab9ce5b7d5d80eac9 Mon Sep 17 00:00:00 2001 From: apeskov Date: Tue, 17 May 2022 22:51:03 +0300 Subject: [PATCH 04/59] [BYOC] Threadsafe initialization of JSONRuntime module (#11339) Signed-off-by: Alexander Peskov --- src/runtime/contrib/json/json_runtime.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h index 0c6d0f6d7136..374a440e2902 100644 --- a/src/runtime/contrib/json/json_runtime.h +++ b/src/runtime/contrib/json/json_runtime.h @@ -88,8 +88,11 @@ class JSONRuntimeBase : public ModuleNode { // The function to initialize constant tensors. 
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { ICHECK_EQ(args.size(), 1U); - this->Init(args[0]); - this->initialized_ = true; + std::lock_guard guard(this->initialize_mutex_); + if (!this->initialized_) { + this->Init(args[0]); + this->initialized_ = true; + } *rv = 0; }); } else { @@ -270,6 +273,8 @@ class JSONRuntimeBase : public ModuleNode { std::vector const_idx_; /*! \brief Indicate if the engine has been initialized. */ bool initialized_{false}; + /*! \brief Initializer mutex*/ + std::mutex initialize_mutex_; }; } // namespace json From 9b66f66f63a264b6a7a1f50ace29bf1f9e53d43e Mon Sep 17 00:00:00 2001 From: Christian Convey Date: Tue, 17 May 2022 16:24:06 -0400 Subject: [PATCH 05/59] [build] Fix/simplify `ccache` logic (#11189) - Remove TVM's `USE_CCACHE` option in favor of CMake's built-in `CMAKE_C_COMPILER_LAUNCHER` and `CMAKE_CXX_COMPILER_LAUNCHER` variables. This eliminates a significant source of complexity, especially: - TVM's CI scripts, which use `sccache` instead of `ccache`, and - calls to `ExternalProject_add` in TVM's CMake logic. - Ensure that `CMAKE_C[XX]_COMPILER_LAUNCHER` variables are passed through in all `ExternalProject_add` calls. - Update user documentation. --- CMakeLists.txt | 29 ----------------------------- apps/hexagon_api/CMakeLists.txt | 7 +++++++ cmake/config.cmake | 12 ------------ docs/install/from_source.rst | 16 ++++++++++++++-- 4 files changed, 21 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7023caf97eb5..5352eddd2598 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -766,35 +766,6 @@ if(BUILD_FOR_HEXAGON) endif() endif() -#Caches the build. -#Note that ccache-3.x doesn't support nvcc well, so CUDA kernels may never hit the cache and still -#need to be re-compiled every time. Using ccache 4.0+ can resolve this issue. 
- -if(USE_CCACHE) # True for AUTO, ON, /path/to/ccache - if("${USE_CCACHE}" STREQUAL "AUTO") # Auto mode - find_program(CCACHE_FOUND ccache) - if(CCACHE_FOUND) - message(STATUS "Found the path to ccache, enabling ccache") - set(PATH_TO_CCACHE ccache) - else() - message(STATUS "Didn't find the path to CCACHE, disabling ccache") - endif(CCACHE_FOUND) - elseif("${USE_CCACHE}" MATCHES ${IS_TRUE_PATTERN}) - find_program(CCACHE_FOUND ccache) - if(CCACHE_FOUND) - message(STATUS "Found the path to ccache, enabling ccache") - set(PATH_TO_CCACHE ccache) - else() - message(FATAL_ERROR "Cannot find ccache. Set USE_CCACHE mode to AUTO or OFF to build without ccache. USE_CCACHE=" "${USE_CCACHE}") - endif(CCACHE_FOUND) - else() # /path/to/ccache - set(PATH_TO_CCACHE USE_CCACHE) - message(STATUS "Setting ccache path to " "${PATH_TO_CCACHE}") - endif() - # Set the flag for ccache - set(CXX_COMPILER_LAUNCHER PATH_TO_CCACHE) -endif(USE_CCACHE) - find_and_set_linker(${USE_ALTERNATIVE_LINKER}) if(${SUMMARIZE}) diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt index 0725b87913a0..feafff3f98da 100644 --- a/apps/hexagon_api/CMakeLists.txt +++ b/apps/hexagon_api/CMakeLists.txt @@ -42,6 +42,8 @@ ExternalProject_Add(x86_tvm_runtime_rpc CMAKE_ARGS "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}" + "-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}" "-DUSE_HEXAGON_TOOLCHAIN=${USE_HEXAGON_TOOLCHAIN}" "-DCMAKE_CXX_STANDARD=14" "-DUSE_LIBBACKTRACE=OFF" @@ -70,6 +72,8 @@ ExternalProject_Add(android_tvm_runtime_rpc SOURCE_DIR "${TVM_SOURCE_DIR}" BUILD_COMMAND $(MAKE) runtime tvm_rpc CMAKE_ARGS + "-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}" + "-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}" "-DCMAKE_TOOLCHAIN_FILE=${USE_ANDROID_TOOLCHAIN}" "-DANDROID_PLATFORM=${ANDROID_PLATFORM}" "-DANDROID_ABI=${ANDROID_ABI}" @@ -86,6 +90,7 @@ 
ExternalProject_Add(android_tvm_runtime_rpc INSTALL_COMMAND "" BUILD_ALWAYS ON ) + ExternalProject_Get_Property(android_tvm_runtime_rpc BINARY_DIR) ExternalProject_Add_Step(android_tvm_runtime_rpc copy_runtime COMMAND ${CMAKE_COMMAND} -E copy_if_different @@ -109,6 +114,8 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc SOURCE_DIR "${TVM_SOURCE_DIR}" BUILD_COMMAND $(MAKE) runtime hexagon_rpc_sim CMAKE_ARGS + "-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}" + "-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}" "-DCMAKE_C_COMPILER=${USE_HEXAGON_TOOLCHAIN}/bin/hexagon-clang" "-DCMAKE_CXX_COMPILER=${USE_HEXAGON_TOOLCHAIN}/bin/hexagon-clang++" "-DUSE_HEXAGON_SDK=${USE_HEXAGON_SDK}" diff --git a/cmake/config.cmake b/cmake/config.cmake index dc2512175b42..c436c3feaa9f 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -334,18 +334,6 @@ set(USE_LIBBACKTRACE AUTO) # runtime functions to be unavailable to the program. set(BUILD_STATIC_RUNTIME OFF) - -# Caches the build so that building is faster when switching between branches. -# If you switch branches, build and then encounter a linking error, you may -# need to regenerate the build tree through "make .." (the cache will -# still provide significant speedups). -# Possible values: -# - AUTO: search for path to ccache, disable if not found. -# - ON: enable ccache by searching for the path to ccache, report an error if not found -# - OFF: disable ccache -# - /path/to/ccache: use specific path to ccache -set(USE_CCACHE AUTO) - # Whether to enable PAPI support in profiling. PAPI provides access to hardware # counters while profiling. 
# Possible values: diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 5fb48cb0e54f..8597de224cd9 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -109,7 +109,7 @@ The configuration of TVM can be modified by editing `config.cmake` and/or by pas export TVM_LOG_DEBUG="ir/transform.cc=1;relay/ir/transform.cc=1" -- TVM requires LLVM for for CPU codegen. We highly recommend you to build with the LLVM support on. +- TVM requires LLVM for CPU codegen. We highly recommend you to build with the LLVM support on. - LLVM 4.0 or higher is needed for build with LLVM. Note that version of LLVM from default apt may lower than 4.0. - Since LLVM takes long time to build from source, you can download pre-built version of LLVM from @@ -126,6 +126,18 @@ The configuration of TVM can be modified by editing `config.cmake` and/or by pas - If you are a PyTorch user, it is recommended to set ``(USE_LLVM "/path/to/llvm-config --link-static")`` and ``set(HIDE_PRIVATE_SYMBOLS ON)`` to avoid potential symbol conflicts between different versions LLVM used by TVM and PyTorch. + - On supported platforms, the `Ccache compiler wrapper <https://ccache.dev/>`_ may be helpful for + reducing TVM's build time. There are several ways to enable Ccache in TVM builds: + + - Ccache's Masquerade mode. This is typically enabled during the Ccache installation process. + To have TVM use Ccache in masquerade, simply specify the appropriate C/C++ compiler + paths when configuring TVM's build system. For example: + ``cmake -DCMAKE_CXX_COMPILER=/usr/lib/ccache/c++ ...``. + + - Ccache as CMake's C++ compiler prefix. When configuring TVM's build system, + set the CMake variable ``CMAKE_CXX_COMPILER_LAUNCHER`` to an appropriate value. + E.g. ``cmake -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ...``. + - We can then build tvm and related libraries. .. code:: bash @@ -315,7 +327,7 @@ configuration. 
A workaround for this is to do the following commands: brew install openblas gfortran - pip install pybind11 cython pythran   + pip install pybind11 cython pythran export OPENBLAS=/opt/homebrew/opt/openblas/lib/ From 1bde845814dd751d11659c3ba6781a6ffc4ede45 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 17 May 2022 13:37:36 -0700 Subject: [PATCH 06/59] [ci] Use r5.large nodes for hexagon build and some tests (#11120) * PR #11314 - [ci][docker] Update images to include sccache changes * [ci] Use r5.large nodes for less-intensive jobs This uses the `CPU-SMALL` label for certain jobs in CI, which is backed by r5.large instances in EC2 rather than c4.4xlarge instances which are much more expensive Co-authored-by: driazati --- Jenkinsfile | 22 +++++++++++----------- jenkins/Jenkinsfile.j2 | 21 ++++++++++++--------- tests/scripts/ci.py | 14 +++++++++++--- tests/scripts/task_build.py | 12 +++++++----- tests/scripts/task_config_build_hexagon.sh | 2 +- tests/scripts/task_lint.sh | 3 --- 6 files changed, 42 insertions(+), 32 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index c0fb3f5df20c..6fcdc3cd4a15 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-17T10:32:14.621387 +// Generated at 2022-05-17T09:16:58.363027 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -688,7 +688,7 @@ stage('Build') { }, 'BUILD: Hexagon': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-hexagon") { init_git() sh ( @@ -845,7 +845,7 @@ stage('Test') { }, 'unittest: CPU': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") { timeout(time: max_time, unit: 'MINUTES') { try { @@ -873,7 +873,7 @@ stage('Test') { }, 'python: i386 1 of 3': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { init_git() @@ -904,7 +904,7 @@ stage('Test') { }, 'python: i386 2 of 3': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { init_git() @@ -934,7 +934,7 @@ stage('Test') { }, 'python: i386 3 of 3': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { init_git() @@ -964,7 +964,7 @@ stage('Test') { }, 'test: Hexagon 1 of 4': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { init_git() @@ -997,7 +997,7 @@ stage('Test') { }, 'test: Hexagon 2 of 4': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { init_git() @@ -1029,7 +1029,7 @@ stage('Test') { }, 'test: Hexagon 3 of 4': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { init_git() @@ -1061,7 +1061,7 @@ stage('Test') { }, 'test: Hexagon 4 of 4': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + 
node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { init_git() @@ -1093,7 +1093,7 @@ stage('Test') { }, 'test: QEMU': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-qemu") { timeout(time: max_time, unit: 'MINUTES') { try { diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2 index 3b2ca5d71103..0264a526e7b5 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/jenkins/Jenkinsfile.j2 @@ -605,7 +605,7 @@ stage('Build') { }, 'BUILD: Hexagon': { if (!skip_ci && is_docs_only_build != 1) { - node('CPU') { + node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-hexagon') }}) { init_git() sh ( @@ -681,7 +681,8 @@ stage('Test') { {% endcall %} {% call m.test_step( name="unittest: CPU", - node="CPU", ws="tvm/ut-python-cpu", + node="CPU-SMALL", + ws="tvm/ut-python-cpu", platform="cpu", ) %} unpack_lib('cpu', tvm_multilib_tsim) @@ -696,11 +697,11 @@ stage('Test') { {% endcall %} {% call(shard_index, num_shards) m.sharded_test_step( name="python: i386", - node="CPU", - num_shards=3, - ws="tvm/integration-python-i386", - platform="i386", - ) %} + node="CPU-SMALL", + num_shards=3, + ws="tvm/integration-python-i386", + platform="i386", + ) %} unpack_lib('i386', tvm_multilib) ci_setup(ci_i386) {% if shard_index == 1 %} @@ -715,7 +716,8 @@ stage('Test') { {% endcall %} {% call(shard_index, num_shards) m.sharded_test_step( name="test: Hexagon", - node="CPU", ws="tvm/test-hexagon", + node="CPU-SMALL", + ws="tvm/test-hexagon", platform="hexagon", num_shards=4, ) %} @@ -735,7 +737,8 @@ stage('Test') { {% endcall %} {% call m.test_step( name="test: QEMU", - node="CPU", ws="tvm/test-qemu", + node="CPU-SMALL", + ws="tvm/test-qemu", platform="qemu", ) %} unpack_lib('qemu', tvm_lib) diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index 22e6690beb11..d45c3b1ae9cb 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -561,11 +561,14 @@ def add_subparser( return 
subparser +CPP_UNITTEST = ("run c++ unitests", ["./tests/scripts/task_cpp_unittest.sh"]) + generated = [ generate_command( name="gpu", help="Run GPU build and test(s)", options={ + "cpp": CPP_UNITTEST, "topi": ("run topi tests", ["./tests/scripts/task_python_topi.sh"]), "unittest": ( "run unit tests", @@ -582,6 +585,7 @@ def add_subparser( name="cpu", help="Run CPU build and test(s)", options={ + "cpp": CPP_UNITTEST, "integration": ( "run integration tests", ["./tests/scripts/task_python_integration.sh"], @@ -601,6 +605,7 @@ def add_subparser( name="i386", help="Run i386 build and test(s)", options={ + "cpp": CPP_UNITTEST, "integration": ( "run integration tests", [ @@ -619,26 +624,28 @@ def add_subparser( name="qemu", help="Run QEMU build and test(s)", options={ + "cpp": CPP_UNITTEST, "test": ( "run microTVM tests", [ "./tests/scripts/task_python_microtvm.sh", "./tests/scripts/task_demo_microtvm.sh", ], - ) + ), }, ), generate_command( name="hexagon", help="Run Hexagon build and test(s)", options={ + "cpp": CPP_UNITTEST, "test": ( "run Hexagon API/Python tests", [ "./tests/scripts/task_build_hexagon_api.sh", "./tests/scripts/task_python_hexagon.sh", ], - ) + ), }, ), generate_command( @@ -646,13 +653,14 @@ def add_subparser( help="Run ARM build and test(s) (native or via QEMU on x86)", precheck=check_arm_qemu, options={ + "cpp": CPP_UNITTEST, "python": ( "run full Python tests", [ "./tests/scripts/task_python_unittest.sh", "./tests/scripts/task_python_arm_compute_library.sh", ], - ) + ), }, ), ] diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py index 52b7dd421b46..e4583fe6af04 100755 --- a/tests/scripts/task_build.py +++ b/tests/scripts/task_build.py @@ -37,21 +37,22 @@ env = {"VTA_HW_PATH": str(Path(os.getcwd()) / "3rdparty" / "vta-hw")} sccache_exe = shutil.which("sccache") - use_sccache = sccache_exe is not None and args.sccache_bucket is not None + use_sccache = sccache_exe is not None build_dir = Path(os.getcwd()) / args.build_dir 
build_dir = build_dir.relative_to(REPO_ROOT) if use_sccache: - env["SCCACHE_BUCKET"] = args.sccache_bucket + if args.sccache_bucket: + env["SCCACHE_BUCKET"] = args.sccache_bucket + logging.info(f"Using sccache bucket: {args.sccache_bucket}") + else: + logging.info(f"No sccache bucket set, using local cache") env["CXX"] = "/opt/sccache/c++" env["CC"] = "/opt/sccache/cc" - logging.info(f"Using sccache bucket: {args.sccache_bucket}") else: if sccache_exe is None: reason = "'sccache' executable not found" - elif args.sccache_bucket is None: - reason = "'sccache' executable not found" else: reason = "" logging.info(f"Not using sccache, reason: {reason}") @@ -71,6 +72,7 @@ num_cpus = max(available_cpus, 1) sh.run("cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo ..", cwd=build_dir) + target = "" if args.cmake_target: target = args.cmake_target diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh index c298800fcd4e..7bce64cddb5a 100755 --- a/tests/scripts/task_config_build_hexagon.sh +++ b/tests/scripts/task_config_build_hexagon.sh @@ -29,7 +29,7 @@ echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake echo set\(USE_LLVM "${CLANG_LLVM_HOME}/bin/llvm-config"\) >> config.cmake -echo set\(CMAKE_CXX_COMPILER "${CLANG_LLVM_HOME}/bin/clang++"\) >> config.cmake +echo set\(CMAKE_CXX_COMPILER "/opt/sccache/clang++"\) >> config.cmake echo set\(USE_HEXAGON "ON"\) >> config.cmake echo set\(USE_HEXAGON_SDK "${HEXAGON_SDK_PATH}"\) >> config.cmake echo set\(USE_CCACHE OFF\) >> config.cmake diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index e0c953d61841..8fbba52662de 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -31,9 +31,6 @@ function shard1 { echo "Convert scripts to Python..." 
tests/scripts/task_convert_scripts_to_python.sh - # TODO: Remove this ad-hoc pip install once https://github.com/apache/tvm/pull/11265 - # is added to the ci_lint Docker image - python3 -m pip install --user -r jenkins/requirements.txt echo "Check Jenkinsfile generation" python3 jenkins/generate.py --check From 0705bd765037088eca803b7ac80c8e9d83c06ab2 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 17 May 2022 13:53:20 -0700 Subject: [PATCH 07/59] [Hexagon][Docker] Update image version (#11332) --- Jenkinsfile | 2 +- jenkins/Jenkinsfile.j2 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6fcdc3cd4a15..4db9a45e2e5c 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -56,7 +56,7 @@ ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e' ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e' ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e' ci_arm = 'tlcpack/ci-arm:20220513-055910-fa834f67e' -ci_hexagon = 'tlcpack/ci-hexagon:20220513-055910-fa834f67e' +ci_hexagon = 'tlcpack/ci-hexagon:20220516-190055-672ce3365' // <--- End of regex-scanned config. // Parameters to allow overriding (in Jenkins UI), the images diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2 index 0264a526e7b5..88ced73a8f97 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/jenkins/Jenkinsfile.j2 @@ -58,7 +58,7 @@ ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e' ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e' ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e' ci_arm = 'tlcpack/ci-arm:20220513-055910-fa834f67e' -ci_hexagon = 'tlcpack/ci-hexagon:20220513-055910-fa834f67e' +ci_hexagon = 'tlcpack/ci-hexagon:20220516-190055-672ce3365' // <--- End of regex-scanned config. 
// Parameters to allow overriding (in Jenkins UI), the images From 0e2f869eeadbb349f849ed2add86a622e97053cd Mon Sep 17 00:00:00 2001 From: czh978 <41666381+czh978@users.noreply.github.com> Date: Wed, 18 May 2022 05:08:08 +0800 Subject: [PATCH 08/59] logsoftmax reusing the softmax function (#11141) Co-authored-by: caizihua <978497756@qq.com> --- python/tvm/relay/frontend/onnx.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 81f12c2d8103..e68daca4c4f0 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2412,30 +2412,18 @@ class LogSoftmax(OnnxOpConverter): """Operator converter for Softmax.""" @classmethod - def run_calculation(cls, x, axes): + def run_calculation(cls, inputs, attr, params, opset): """Run the calculation for Log Softmax calculation.""" - m = _op.max(x, axes, keepdims=True) - e = _op.exp(x - m) - s = _op.sum(e, axes, keepdims=True) - return x - m - _op.log(s) + res = Softmax.get_converter(opset)(inputs, attr, params) + return _op.log(res) @classmethod def _impl_v1(cls, inputs, attr, params): - axis = attr.get("axis", 1) - ndim = len(infer_shape(inputs[0])) - if axis < 0: - axis += ndim - axes = list(range(axis, ndim)) - return cls.run_calculation(inputs[0], axes) + return cls.run_calculation(inputs, attr, params, opset=1) @classmethod def _impl_v13(cls, inputs, attr, params): - axis = attr.get("axis", -1) - ndim = len(infer_shape(inputs[0])) - if axis < 0: - axis += ndim - axes = [axis] - return cls.run_calculation(inputs[0], axes) + return cls.run_calculation(inputs, attr, params, opset=13) class Hardmax(OnnxOpConverter): @@ -4852,7 +4840,8 @@ def _impl_v13(cls, inputs, attr, params): weight_tensor = None get_log_prob = attr["tvm_custom"]["num_outputs"] == 2 - log_softmax_tensor = LogSoftmax.run_calculation(input_tensor, axes=[1]) + log_softmax_attr = {"axis": 1} + log_softmax_tensor = 
LogSoftmax.get_converter(13)([input_tensor], log_softmax_attr, None) loss, weight_total = NegativeLogLikelihoodLoss.run_calculation( log_softmax_tensor, From 75c31cae75fe31af9e0901210ba7fa597e6f153a Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 17 May 2022 16:17:48 -0700 Subject: [PATCH 09/59] [Relay] Bug fix when applying history using an iterator or records. (#11306) * Bug fix when applying history using an iterator or records. * I forgot strings are iterables. --- python/tvm/auto_scheduler/dispatcher.py | 3 ++- python/tvm/autotvm/task/dispatcher.py | 5 +++-- tests/python/relay/test_auto_scheduler_tuning.py | 7 +++++++ tests/python/unittest/test_autotvm_record.py | 5 +++++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py index eceeba38e081..98566f863650 100644 --- a/python/tvm/auto_scheduler/dispatcher.py +++ b/python/tvm/auto_scheduler/dispatcher.py @@ -25,6 +25,7 @@ import logging import pathlib +from collections.abc import Iterable import numpy as np @@ -199,7 +200,7 @@ def load(self, records, n_lines=None): if it is not None, only load the first `n_lines` lines of log """ joint_records = [] - if not isinstance(records, (list, tuple)): + if not isinstance(records, Iterable) or isinstance(records, str): records = [records] for rec in records: diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index ffff50b9dc0b..6c072dc1fa17 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -31,6 +31,7 @@ from __future__ import absolute_import as _abs import logging +from collections.abc import Iterable import numpy as np @@ -212,7 +213,7 @@ def load(self, records): Collection of tuning records. If is str, then it should be the filename of a records log file. Each row of this file is an encoded record pair. 
If it is a list - it can either be a list of paths to logs that will loaded jointly or + it can either be a list of paths to logs that will be loaded jointly or an iterator of measurement results. """ # pylint: disable=import-outside-toplevel @@ -220,7 +221,7 @@ def load(self, records): from ..record import load_from_file joint_records = [] - if not isinstance(records, (list, tuple)): + if not isinstance(records, Iterable) or isinstance(records, str): records = [records] for rec in records: diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index c9ce5b59ff09..735486ef27c6 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -62,6 +62,13 @@ def tune_network(network, target): best, auto_scheduler.dispatcher.ApplyHistoryBest ), "Unable to load multiple log files jointly." + # Confirm iterables can be directly loaded. + loaded_recs = auto_scheduler.dispatcher.load_records(log_file) + with auto_scheduler.ApplyHistoryBest(iter(loaded_recs)) as best: + assert isinstance( + best, auto_scheduler.dispatcher.ApplyHistoryBest + ), "Unable to ingest logs from an interator." + # Sample a schedule when missing with auto_scheduler.ApplyHistoryBestOrSample(None, num_measure=2): with tvm.transform.PassContext( diff --git a/tests/python/unittest/test_autotvm_record.py b/tests/python/unittest/test_autotvm_record.py index 2ee75cf18c0e..147122ff10d6 100644 --- a/tests/python/unittest/test_autotvm_record.py +++ b/tests/python/unittest/test_autotvm_record.py @@ -91,6 +91,11 @@ def test_apply_history_best(): x = hist_best.query(target, tsk.workload) assert str(x) == str(tsk.config_space.get(2)) + # Confirm same functionality for iterators. 
+ hist_best = ApplyHistoryBest(iter(records)) + x = hist_best.query(target, tsk.workload) + assert str(x) == str(tsk.config_space.get(2)) + if __name__ == "__main__": test_load_dump() From f755c97492c7e851277b9fc52854afeb18e14952 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 17 May 2022 16:24:48 -0700 Subject: [PATCH 10/59] [skip ci][ci][docker] Pin Pillow version (#11348) A recent release depends on some things we don't have installed, so don't use it. e.g. https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-11319/5/pipeline/ Co-authored-by: driazati --- docker/install/ubuntu_install_python_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index 4f99f1784238..0353814efcb8 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -30,7 +30,7 @@ pip3 install --upgrade \ numpy~=1.19.5 \ orderedset \ packaging \ - Pillow \ + Pillow==9.1.0 \ psutil \ pytest \ tlcpack-sphinx-addon==0.2.1 \ From 9c27ff5e58bb5ceccbc8a5855689da0cb59dac79 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 17 May 2022 23:22:54 -0700 Subject: [PATCH 11/59] [ci] Bump job timeout to 3 hours (#11350) This is intended to be temporary to avoid timeouts on jobs while we work on getting some things under control like artifact upload time and shards for various jobs. 
Co-authored-by: driazati --- Jenkinsfile | 4 ++-- jenkins/Jenkinsfile.j2 | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4db9a45e2e5c..424f97494d76 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-17T09:16:58.363027 +// Generated at 2022-05-17T17:26:21.660243 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -83,7 +83,7 @@ upstream_revision = null docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM' docker_build = 'docker/build.sh' // timeout in minutes -max_time = 120 +max_time = 180 rebuild_docker_images = false def per_exec_ws(folder) { diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2 index 88ced73a8f97..f250ff12feed 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/jenkins/Jenkinsfile.j2 @@ -80,7 +80,7 @@ upstream_revision = null docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM' docker_build = 'docker/build.sh' // timeout in minutes -max_time = 120 +max_time = 180 rebuild_docker_images = false def per_exec_ws(folder) { From b5e1fdd3ddb47b097be36c44a8c8de2b305ecd2b Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Wed, 18 May 2022 01:16:36 -0700 Subject: [PATCH 12/59] Improve error messages with TVM_LOG_DEBUG and add docs (#11344) * Improve error messages with TVM_LOG_DEBUG and add docs. * Fix requirement to prepend "src" with /. 
--- docs/dev/how_to/debugging_tvm.rst | 72 +++++++++++++++++++++++++++++++ docs/dev/how_to/how_to.rst | 1 + src/runtime/logging.cc | 26 +++++++++-- tests/cpp/runtime/logging_test.cc | 29 +++++++++++-- 4 files changed, 121 insertions(+), 7 deletions(-) create mode 100644 docs/dev/how_to/debugging_tvm.rst diff --git a/docs/dev/how_to/debugging_tvm.rst b/docs/dev/how_to/debugging_tvm.rst new file mode 100644 index 000000000000..6060f797b3e4 --- /dev/null +++ b/docs/dev/how_to/debugging_tvm.rst @@ -0,0 +1,72 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _debugging-tvm: + +Debuggging TVM +============== + +**NOTE**: This page is a work in-progress. Everyone is welcomed to add suggestions and tips via +sending a PR to modify this page. The goal with this page is to centralize the commonly-used +techniques being used to debug TVM and to spread awareness to the community. To that end, we may +seek to promote more broadly-used techniques to the top of this doc. + +VLOGging +-------- + +TVM provides a verbose-logging facility that allows you to commit trace-level debugging messages +without impacting the binary size or runtime of TVM in production. You can use VLOG in your code +as follows: + +.. 
code-block:: c++ + + void Foo(const std::string& bar) { + VLOG(2) << "Running Foo(" << bar << ")"; + // ... + } + +In this example, the integer ``2`` passed to ``VLOG()`` indicates a verbosity level. The higher the +level, the more logs printed. In general, TVM levels range from 0 to 2, with 3 being used only for +extremely low-level core runtime properties. The VLOG system is configured at startup time to print +VLOG statements between ``0`` and some integer ``N``. ``N`` can be set per-file or globally. + +VLOGs don't print or impact binary size or runtime by default (when compiled with proper +optimization). To enable VLOGging, do the following: + +1. In ``config/cmake``, ensure you ``set(USE_RELAY_DEBUG ON)``. This flag is used to enable + VLOGging. +2. Launch Python passing ``TVM_LOG_DEBUG=``, where ``>`` is a comma-separated list of + level assignments of the form ``=``. Here are some specializations: + + - The special filename ``DEFAULT`` sets the VLOG level setting for all files. + - ``>`` can be set to ``-1`` to disable VLOG in that file. + - ```` is the name of the c++ source file (e.g. ``.cc``, not ``.h``) relative to the + ``src/`` directory in the TVM repo. You do not need to supply ``src/`` when specifying the + file path, but if you do, VLOG will still interpret the path correctly. + +Examples: + +.. code-block: shell + + # enable VLOG(0), VLOG(1), VLOG(2) in all files. + $ TVM_LOG_DEBUG=DEFAULT=2 python3 -c 'import tvm' + + # enable VLOG(0), VLOG(1), VLOG(2) in all files, except not VLOG(2) in src/bar/baz.cc. + $ TVM_LOG_DEBUG=DEFAULT=2,bar/baz.cc=1 python3 -c 'import tvm' + + # enable VLOG(0), VLOG(1), VLOG(2) in all files, except not in src/foo/bar.cc. + $ TVM_LOG_DEBUG=DEFAULT=2,src/foo/bar.cc=-1 python3 -c 'import tvm' diff --git a/docs/dev/how_to/how_to.rst b/docs/dev/how_to/how_to.rst index 844ae0ad527e..67bb94b007c4 100644 --- a/docs/dev/how_to/how_to.rst +++ b/docs/dev/how_to/how_to.rst @@ -25,6 +25,7 @@ various areas of the TVM stack. .. 
toctree:: :maxdepth: 1 + debugging_tvm relay_add_op relay_add_pass relay_bring_your_own_codegen diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc index 0f614a7eaff1..c6c756d85c7e 100644 --- a/src/runtime/logging.cc +++ b/src/runtime/logging.cc @@ -197,6 +197,12 @@ std::string FileToVLogMapKey(const std::string& filename) { // Canonicalize the filename. // TODO(mbs): Not Windows friendly. size_t last_src = filename.rfind(kSrcPrefix, std::string::npos, kSrcPrefixLength); + if (last_src == std::string::npos) { + std::string no_slash_src{kSrcPrefix + 1}; + if (filename.substr(0, no_slash_src.size()) == no_slash_src) { + return filename.substr(no_slash_src.size()); + } + } // Strip anything before the /src/ prefix, on the assumption that will yield the // TVM project relative filename. If no such prefix fallback to filename without // canonicalization. @@ -222,6 +228,15 @@ TvmLogDebugSettings TvmLogDebugSettings::ParseSpec(const char* opt_spec) { return settings; } std::istringstream spec_stream(spec); + auto tell_pos = [&](const std::string& last_read) { + int pos = spec_stream.tellg(); + if (pos == -1) { + LOG(INFO) << "override pos: " << last_read; + // when pos == -1, failbit was set due to std::getline reaching EOF without seeing delimiter. 
+ pos = spec.size() - last_read.size(); + } + return pos; + }; while (spec_stream) { std::string name; if (!std::getline(spec_stream, name, '=')) { @@ -229,7 +244,7 @@ TvmLogDebugSettings TvmLogDebugSettings::ParseSpec(const char* opt_spec) { break; } if (name.empty()) { - LOG(FATAL) << "TVM_LOG_DEBUG ill-formed, empty name"; + LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(name) << ": empty filename"; return settings; } @@ -237,18 +252,21 @@ TvmLogDebugSettings TvmLogDebugSettings::ParseSpec(const char* opt_spec) { std::string level; if (!std::getline(spec_stream, level, ',')) { - LOG(FATAL) << "TVM_LOG_DEBUG ill-formed, expecting level"; + LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level) + << ": expecting \"=\" after \"" << name << "\""; return settings; } if (level.empty()) { - LOG(FATAL) << "TVM_LOG_DEBUG ill-formed, empty level"; + LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level) + << ": empty level after \"" << name << "\""; return settings; } // Parse level, default to 0 if ill-formed which we don't detect. char* end_of_level = nullptr; int level_val = static_cast(strtol(level.c_str(), &end_of_level, 10)); if (end_of_level != level.c_str() + level.size()) { - LOG(FATAL) << "TVM_LOG_DEBUG ill-formed, invalid level"; + LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level) + << ": invalid level: \"" << level << "\""; return settings; } LOG(INFO) << "TVM_LOG_DEBUG enables VLOG statements in '" << name << "' up to level " << level; diff --git a/tests/cpp/runtime/logging_test.cc b/tests/cpp/runtime/logging_test.cc index ae5140ed1815..e707606843bf 100644 --- a/tests/cpp/runtime/logging_test.cc +++ b/tests/cpp/runtime/logging_test.cc @@ -17,6 +17,7 @@ * under the License. 
*/ +#include #include #include @@ -60,17 +61,39 @@ TEST(TvmLogDebugSettings, VLogEnabledComplex) { EXPECT_FALSE(settings.VerboseEnabled("my/filesystem/src/baz.cc", 0)); } +#define MATCH_THROW(stmt, err_type, matcher) \ + try { \ + stmt; \ + } catch (const err_type& e) { \ + EXPECT_THAT(e.what(), matcher); \ + } catch (...) { \ + EXPECT_FALSE("stmt threw an unexpected exception"); \ + } + TEST(TvmLogDebugSettings, IllFormed) { - EXPECT_THROW(TvmLogDebugSettings::ParseSpec("foo/bar.cc=bogus;"), InternalError); + MATCH_THROW( + TvmLogDebugSettings::ParseSpec("foo/bar.cc=bogus;"), InternalError, + ::testing::HasSubstr("TVM_LOG_DEBUG ill-formed at position 11: invalid level: \"bogus;\"")); + + MATCH_THROW(TvmLogDebugSettings::ParseSpec("DEFAULT=2;bar/baz.cc=2"), InternalError, + ::testing::HasSubstr( + "TVM_LOG_DEBUG ill-formed at position 8: invalid level: \"2;bar/baz.cc=2\"")); + + MATCH_THROW(TvmLogDebugSettings::ParseSpec("DEFAULT=2,bar/baz.cc+2"), InternalError, + ::testing::HasSubstr("TVM_LOG_DEBUG ill-formed at position 22: expecting " + "\"=\" after \"bar/baz.cc+2\"")); } TEST(TvmLogDebugSettings, SpecPrefix) { TvmLogDebugSettings settings = TvmLogDebugSettings::ParseSpec( - "../src/foo/bar.cc=3,src/baz.cc=-1,foo/bar/src/another/file.cc=4"); + "../src/foo/bar.cc=3,src/baz.cc=3,foo/bar/src/another/file.cc=4"); EXPECT_TRUE(settings.dlog_enabled()); EXPECT_TRUE(settings.VerboseEnabled("my/filesystem/src/foo/bar.cc", 3)); - EXPECT_FALSE(settings.VerboseEnabled("my/filesystem/src/baz.cc", 0)); + EXPECT_TRUE(settings.VerboseEnabled("foo/bar.cc", 3)); + EXPECT_TRUE(settings.VerboseEnabled("my/filesystem/src/baz.cc", 3)); + EXPECT_TRUE(settings.VerboseEnabled("baz.cc", 3)); EXPECT_TRUE(settings.VerboseEnabled("my/filesystem/src/another/file.cc", 4)); + EXPECT_TRUE(settings.VerboseEnabled("another/file.cc", 4)); } } // namespace From a4be2ed9046a97fa826da9beba64c791e2c36ccf Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 18 May 2022 17:56:10 +0900 Subject: 
[PATCH 13/59] [TVMScript] Support inlined function call as a sugar (#11324) * [TVMScript] Support function call to help construct AST * add test * update test * more comment * fix for avoiding Buffer.vload(...) case * update parse error msg * wrap func call with try / catch, emit error msg * silence pylint --- python/tvm/script/parser.py | 44 +++++++++- .../unittest/test_tvmscript_syntax_sugar.py | 81 +++++++++++++++++++ 2 files changed, 121 insertions(+), 4 deletions(-) diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index fe71b064320f..daeb018ea989 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -20,7 +20,8 @@ different python versions. Synr also provides an error handling context that we use for error reporting. """ -# pylint: disable=invalid-name, inconsistent-return-statements, no-else-return +# pylint: disable=invalid-name, inconsistent-return-statements, no-else-return, broad-except +import types import json import operator import inspect @@ -543,7 +544,7 @@ def transform_Assign(self, node): AST abstract grammar: Assign(expr* targets, expr value, string? type_comment) - By now 3 patterns of Assign is supported: + By now 5 patterns of Assign is supported: 1. special stmts with return value 1.1 Buffer = T.match_buffer()/T.buffer_decl() 1.2 Var = T.var() @@ -552,6 +553,9 @@ def transform_Assign(self, node): 3. (Store) Var[PrimExpr] = PrimExpr 4. with scope handlers with concise scoping and var def 4.1 var = T.allocate() + 5. A call to a pure python function, consuming and producing TVMScript values. + The outputs are inlined into the following body (no variable is created). + x, y = f(...) 
""" if isinstance(node.rhs, ast.Call): @@ -577,6 +581,35 @@ def transform_Assign(self, node): arg_list = self.parse_arg_list(func, node.rhs) func.handle(node, self.context, arg_list, node.rhs.func_name.span) return self.parse_body(node) + elif isinstance(func, types.FunctionType): + # Pattern 5 + args = [self.transform(arg) for arg in node.rhs.params] + try: + out = func(*args) + except Exception as e: + self.report_error( + "Error occured when invoking the function " + + func.__name__ + + ": \n" + + str(e), + node.rhs.span, + ) + + if len(node.lhs) == 1 and not isinstance(out, list): + out = [out] + + assert len(out) == len(node.lhs) + + for var, value in zip(node.lhs, out): + self.context.update_symbol(var.id.name, value, node) + + body = self.parse_body(node) + + for var, value in zip(node.lhs, out): + self.context.remove_symbol(var.id.name) + + return body + if isinstance(node.rhs, (ast.Call, ast.Constant)): # Pattern 4 of let binding value = self.transform(node.rhs) @@ -606,7 +639,7 @@ def transform_Assign(self, node): return tvm.tir.LetStmt(var, value, body, span=tvm_span_from_synr(node.span)) self.report_error( - """Assignments should be either + """Assignments should be one of: 1. A "special statement" with return value 1.1 Buffer = T.match_buffer()/T.buffer_decl() 1.2 Var = T.var() @@ -614,7 +647,10 @@ def transform_Assign(self, node): 2. A store into a buffer: Buffer[PrimExpr, PrimExpr, ..., PrimExpr] = PrimExpr 3. A store into a variable: Var[PrimExpr] = PrimExpr 4. A with scope handler with concise scoping and var def - 4.1 var = T.allocate()""", + 4.1 var = T.allocate() + 5. The right-hand side being a call to a pure python function, consuming and + producing TVMScript values. 
+ x, y = f(...)""", node.span, ) diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py index a0964ea4d77c..b3fe5674a873 100644 --- a/tests/python/unittest/test_tvmscript_syntax_sugar.py +++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py @@ -265,5 +265,86 @@ def constant_binds_wrapped(): assert_structural_equal(constant_binds, constant_binds_wrapped) +def test_func_call(): + def shared_16x16_to_ldmatrix_32x8_layout(i, j): + thread_id = (i % 8) * 4 + (j % 8) // 2 + return thread_id, (j // 8) * 4 + (i // 8) * 2 + (j % 2) + + @T.prim_func + def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + + with T.block("root"): + T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8]) + T.writes(C[0:32, 0:8]) + for i, j, k in T.grid(16, 16, 16): + with T.block("C"): + i, j, k = T.axis.remap("SSR", [i, j, k]) + thread_id_C, local_id_C = shared_16x16_to_ldmatrix_32x8_layout(i, j) + thread_id_A, local_id_A = shared_16x16_to_ldmatrix_32x8_layout(i, k) + thread_id_B, local_id_B = shared_16x16_to_ldmatrix_32x8_layout(k, j) + + T.reads( + C[thread_id_C, local_id_C], + A[thread_id_A, local_id_A], + B[thread_id_B, local_id_B], + ) + T.writes(C[thread_id_C, local_id_C]) + + C[thread_id_C, local_id_C] += ( + A[thread_id_A, local_id_A] * B[thread_id_B, local_id_B] + ) + + @T.prim_func + def mma_sync_m16n16k16_desc_manual(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + + with 
T.block("root"): + T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8]) + T.writes(C[0:32, 0:8]) + for i, j, k in T.grid(16, 16, 16): + with T.block("C"): + i, j, k = T.axis.remap("SSR", [i, j, k]) + T.reads( + C[i % 8 * 4 + j % 8 // 2, j // 8 * 4 + i // 8 * 2 + j % 2], + A[i % 8 * 4 + k % 8 // 2, k // 8 * 4 + i // 8 * 2 + k % 2], + B[k % 8 * 4 + j % 8 // 2, j // 8 * 4 + k // 8 * 2 + j % 2], + ) + T.writes(C[i % 8 * 4 + j % 8 // 2, j // 8 * 4 + i // 8 * 2 + j % 2]) + C[i % 8 * 4 + j % 8 // 2, j // 8 * 4 + i // 8 * 2 + j % 2] = ( + C[i % 8 * 4 + j % 8 // 2, j // 8 * 4 + i // 8 * 2 + j % 2] + + A[i % 8 * 4 + k % 8 // 2, k // 8 * 4 + i // 8 * 2 + k % 2] + * B[k % 8 * 4 + j % 8 // 2, j // 8 * 4 + k // 8 * 2 + j % 2] + ) + + assert_structural_equal(mma_sync_m16n16k16_desc, mma_sync_m16n16k16_desc_manual) + + # The following is an example of an error message from calling an invalid function + + # error: Error occured when invoking the function sqrt: + # loop of ufunc does not support argument 0 of type Var which has no callable sqrt method + # --> test_tvmscript_syntax_sugar.py:334:19 + # | + # 334 | ind = sqrt(i) + # | ^^^^^^^ + # note: run with `TVM_BACKTRACE=1` environment variable to display a backtrace. + + # Uncomment to see the error above. 
+ # def sqrt(x): + # import numpy as np + # return np.sqrt(x) + + # @T.prim_func + # def loop(a: T.handle) -> None: + # A = T.match_buffer(a, (128,)) + # for i in T.serial(128): + # ind = sqrt(i) + # A[i] = A[ind] + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) From dd986fd989cf002ba7c2665867b4212cbebf26dc Mon Sep 17 00:00:00 2001 From: Ziqang XU Date: Wed, 18 May 2022 18:56:41 +0800 Subject: [PATCH 14/59] [Runtime]Considering DLTensor's byte_offset in ZeroCopy function (#11340) --- src/runtime/graph_executor/graph_executor.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index f713671317b8..8ae98d930f13 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -165,7 +165,9 @@ void GraphExecutor::CheckExternalDLTensor(const DLTensor* external, uint32_t eid const DLTensor* internal = data_entry_[eid].operator->(); ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*external)); - ICHECK_EQ(reinterpret_cast(external->data) % kAllocAlignment, 0); + ICHECK_EQ(reinterpret_cast(static_cast(external->data) + external->byte_offset) % + kAllocAlignment, + 0); ICHECK_EQ(internal->ndim, static_cast(external->ndim)); ICHECK_EQ(internal->device.device_type, external->device.device_type); ICHECK_EQ(internal->device.device_id, external->device.device_id); @@ -185,7 +187,7 @@ void GraphExecutor::SetInputZeroCopy(int index, DLTensor* data_ref) { CheckExternalDLTensor(data_ref, eid); // Update the data pointer for each argument of each op for (DLTensor* t : input_dltensors_[eid]) { - t->data = data_ref->data; + t->data = static_cast(data_ref->data) + data_ref->byte_offset; } } /*! 
@@ -204,12 +206,12 @@ void GraphExecutor::SetOutputZeroCopy(int index, DLTensor* data_ref) { // Update the data pointer for output op for (DLTensor* t : output_dltensors_[output_node_eid]) { - t->data = data_ref->data; + t->data = static_cast(data_ref->data) + data_ref->byte_offset; } // Update the input of the op connected to the output for (DLTensor* t : both_output_opinput_dltensors_[output_node_eid]) { - t->data = data_ref->data; + t->data = static_cast(data_ref->data) + data_ref->byte_offset; } } /*! From 7f1c54f96ae4099c178f45402f3c156a565dedce Mon Sep 17 00:00:00 2001 From: Andrey Malyshev Date: Wed, 18 May 2022 14:00:07 +0300 Subject: [PATCH 15/59] Fix eltwise alter op layout for broadcast axis (#11337) * Fix eltwise alter op layout for broadcast axis * Add tests on boradcast blocking over already blocked layout --- src/relay/transforms/infer_layout_utils.cc | 3 +- .../python/relay/test_pass_alter_op_layout.py | 200 ++++++++++++++++++ 2 files changed, 202 insertions(+), 1 deletion(-) diff --git a/src/relay/transforms/infer_layout_utils.cc b/src/relay/transforms/infer_layout_utils.cc index 32838e09a441..efe886c29d23 100644 --- a/src/relay/transforms/infer_layout_utils.cc +++ b/src/relay/transforms/infer_layout_utils.cc @@ -64,7 +64,8 @@ Layout AdjustSubordinateFactors(const Layout& src_layout, const Layout& old_layo // 4) a) Check if this shape element is 1. 
if (auto* shape_int = shape_val.as()) { - if (shape_int->value == 1) { + // We can treat 1 as broadcast only if axis was not split before + if (shape_int->value == 1 && old_layout.IndexOf(LayoutAxis::Get(axis)) == -1) { new_layout += "1"; is_shape_one = true; } diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index cffc33b0bc24..5aff77ad36f5 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -1602,6 +1602,206 @@ def alter_conv2d(attrs, inputs, tinfos, out_type): np.testing.assert_allclose(res.numpy(), res1.numpy()) +def test_alter_layout_blocked_no_broadcast(): + """Test boradcast operators working on already blocked layout""" + + def before(): + dtype = "float32" + input_shape = (1, 8, 16, 16, 4) + filter_shape = (1, 8, 4, 4, 4, 4) + bias_shape = (1, 1, 1, 1, 4) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + C = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW4c", + kernel_layout="OIHW4i4o", + padding=[3, 3, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=4, + kernel_size=(4, 4), + ) + bias = relay.op.add(conv, C) + bias = relay.Function(analysis.free_vars(bias), bias) + return bias + + def expected(): + return before() + + def alter_conv2d(attrs, inputs, tinfos, out_type): + data, weight = inputs + new_attrs = dict(attrs) + new_attrs["data_layout"] = "NCHW4c" + new_attrs["kernel_layout"] = "OIHW4i4o" + return relay.nn.conv2d(data, weight, **new_attrs) + + with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d): + a = run_opt_pass(before(), transform.AlterOpLayout()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\nExpected = \n" + str(b) + + inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32) + weight = 
np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32) + z = np.random.uniform(size=(1, 1, 1, 1, 4)).astype(np.float32) + mod = tvm.IRModule.from_expr(before()) + with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d): + with tvm.transform.PassContext(opt_level=4): + res = relay.build_module.create_executor( + "graph", mod, target="llvm", device=tvm.cpu() + ).evaluate()(inp, weight, z) + with tvm.transform.PassContext(opt_level=0): + res1 = relay.build_module.create_executor( + "debug", mod, target="llvm", device=tvm.cpu() + ).evaluate()(inp, weight, z) + np.testing.assert_allclose(res.numpy(), res1.numpy()) + + +def test_alter_layout_blocked_broadcast(): + """Test boradcast operators working on already blocked layout""" + + def before(): + dtype = "float32" + input_shape = (1, 8, 16, 16, 4) + filter_shape = (1, 8, 4, 4, 4, 4) + bias_shape = (1, 1, 1, 1, 1) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + C = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW4c", + kernel_layout="OIHW4i4o", + padding=[3, 3, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=4, + kernel_size=(4, 4), + ) + bias = relay.op.add(conv, C) + bias = relay.Function(analysis.free_vars(bias), bias) + return bias + + def expected(): + return before() + + def alter_conv2d(attrs, inputs, tinfos, out_type): + data, weight = inputs + new_attrs = dict(attrs) + new_attrs["data_layout"] = "NCHW4c" + new_attrs["kernel_layout"] = "OIHW4i4o" + return relay.nn.conv2d(data, weight, **new_attrs) + + with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d): + a = run_opt_pass(before(), transform.AlterOpLayout()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\nExpected = \n" + str(b) + + inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32) + weight = 
np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32) + z = np.random.uniform(size=(1, 1, 1, 1, 1)).astype(np.float32) + mod = tvm.IRModule.from_expr(before()) + with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d): + with tvm.transform.PassContext(opt_level=4): + res = relay.build_module.create_executor( + "graph", mod, target="llvm", device=tvm.cpu() + ).evaluate()(inp, weight, z) + with tvm.transform.PassContext(opt_level=0): + res1 = relay.build_module.create_executor( + "debug", mod, target="llvm", device=tvm.cpu() + ).evaluate()(inp, weight, z) + np.testing.assert_allclose(res.numpy(), res1.numpy()) + + +def test_alter_layout_re_blocking_broadcast(): + """Test of re-blocking shapes with boradcast operators""" + + def before(): + dtype = "float32" + input_shape = (1, 8, 16, 16, 4) + filter_shape = (1, 8, 4, 4, 4, 4) + bias_shape = (1, 1, 1, 1, 4) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + C = relay.var("bias", shape=bias_shape, dtype=dtype) + + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW4c", + kernel_layout="OIHW4i4o", + padding=[3, 3, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=4, + kernel_size=(4, 4), + ) + bias = relay.op.add(conv, C) + bias = relay.Function(analysis.free_vars(bias), bias) + return bias + + def expected(): + dtype = "float32" + input_shape = (1, 8, 16, 16, 4) + filter_shape = (1, 8, 4, 4, 4, 4) + bias_shape = (1, 1, 1, 1, 4) + A = relay.var("data", shape=input_shape, dtype=dtype) + B = relay.var("weight", shape=filter_shape, dtype=dtype) + C = relay.var("bias", shape=bias_shape, dtype=dtype) + + A = relay.layout_transform(A, src_layout="NCHW4c", dst_layout="NCHW2c") + B = relay.layout_transform(B, src_layout="OIHW4i4o", dst_layout="OIHW2i2o") + + conv = relay.nn.conv2d( + A, + B, + data_layout="NCHW2c", + kernel_layout="OIHW2i2o", + padding=[3, 3, 0, 0], + strides=[2, 2], + out_dtype=dtype, + channels=4, + kernel_size=(4, 4), + 
) + C = relay.layout_transform(C, src_layout="NCHW4c", dst_layout="NCHW2c") + bias = relay.op.add(conv, C) + bias = relay.layout_transform(bias, src_layout="NCHW2c", dst_layout="NCHW4c") + bias = relay.Function(analysis.free_vars(bias), bias) + return bias + + def alter_conv2d(attrs, inputs, tinfos, out_type): + data, weight = inputs + new_attrs = dict(attrs) + new_attrs["data_layout"] = "NCHW2c" + new_attrs["kernel_layout"] = "OIHW2i2o" + return relay.nn.conv2d(data, weight, **new_attrs) + + with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d): + a = run_opt_pass(before(), transform.AlterOpLayout()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\nExpected = \n" + str(b) + + inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32) + weight = np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32) + z = np.random.uniform(size=(1, 1, 1, 1, 4)).astype(np.float32) + mod = tvm.IRModule.from_expr(before()) + with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d): + with tvm.transform.PassContext(opt_level=4): + res = relay.build_module.create_executor( + "graph", mod, target="llvm", device=tvm.cpu() + ).evaluate()(inp, weight, z) + with tvm.transform.PassContext(opt_level=0): + res1 = relay.build_module.create_executor( + "debug", mod, target="llvm", device=tvm.cpu() + ).evaluate()(inp, weight, z) + np.testing.assert_allclose(res.numpy(), res1.numpy(), rtol=1e-5, atol=1e-5) + + def test_broadcast_non_adaptable(): """NCHW4c + [x, x, 4] and NCHW4c is being altered to NCHW""" From 99caa6533fde8e7264e6659575c03e5ecf54cd6b Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 18 May 2022 12:17:47 +0100 Subject: [PATCH 16/59] [TVMC][ETHOSN] Improve target string to avoid duplication (#11272) * [TVMC][ETHOSN] Improve target string to avoid duplication Improves the TVMC target string to avoid duplication of the NPU variant. 
The new target string will require the just the NPU name followed by -variant=n78. The old target string is deprecated and will be removed in a subsequent version of TVM. Change-Id: I4638f36788df3f478435ac13d3531aad2b23f204 * fix linting Change-Id: I76a9da511899f24a163be669877605cd1a440022 * fix make variant functions and update test error message Change-Id: Iff553d4b255c0ce0b86bad42eaa94ee9b1c62508 --- python/tvm/driver/tvmc/composite_target.py | 18 +++++++++++++++--- python/tvm/relay/op/contrib/ethosn.py | 18 +++++++++++++++--- src/relay/backend/contrib/ethosn/codegen.cc | 11 ++++++++--- .../backend/contrib/ethosn/codegen_ethosn.h | 2 +- .../test_ethosn/test_partition_params.py | 14 +++++++------- tests/python/driver/tvmc/test_compiler.py | 4 +--- .../driver/tvmc/test_composite_target.py | 2 +- tests/python/driver/tvmc/test_target.py | 4 ++-- 8 files changed, 50 insertions(+), 23 deletions(-) diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py index de743799f01c..88bea9980014 100644 --- a/python/tvm/driver/tvmc/composite_target.py +++ b/python/tvm/driver/tvmc/composite_target.py @@ -18,12 +18,13 @@ Provides support to composite target on TVMC. 
""" import logging +import warnings # Make sure Vitis AI codegen is registered import tvm.contrib.target.vitis_ai # pylint: disable=unused-import from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib -from tvm.relay.op.contrib.ethosn import partition_for_ethosn78 +from tvm.relay.op.contrib.ethosn import partition_for_ethosn from tvm.relay.op.contrib.cmsisnn import partition_for_cmsisnn from tvm.relay.op.contrib.ethosu import partition_for_ethosu from tvm.relay.op.contrib.bnns import partition_for_bnns @@ -55,9 +56,9 @@ "config_key": "relay.ext.cmsisnn.options", "pass_pipeline": partition_for_cmsisnn, }, - "ethos-n78": { + "ethos-n": { "config_key": "relay.ext.ethos-n.options", - "pass_pipeline": partition_for_ethosn78, + "pass_pipeline": partition_for_ethosn, }, "ethos-u": { "config_key": "relay.ext.ethos-u.options", @@ -71,6 +72,11 @@ "config_key": "relay.ext.vitis_ai.options", "pass_pipeline": partition_for_vitis_ai, }, + # Deprecated in favour of "ethos-n". + "ethos-n78": { + "config_key": "relay.ext.ethos-n.options", + "pass_pipeline": partition_for_ethosn, + }, } @@ -99,6 +105,12 @@ def get_codegen_by_target(name): requested target codegen information """ try: + if name == "ethos-n78": + warnings.warn( + "Please use 'ethos-n' instead of the deprecated 'ethos-n78' target, " + "which will be removed in a later release of TVM.", + DeprecationWarning, + ) return REGISTERED_CODEGEN[name] except KeyError: raise TVMCException("Composite target %s is not defined in TVMC." 
% name) diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index a1a3e2dccc4c..17038e749f8e 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-argument """Arm(R) Ethos(TM)-N NPU supported operators.""" from enum import Enum +import warnings import tvm.ir from tvm.relay import transform @@ -46,7 +47,7 @@ def ethosn_available(): return Available.SW_AND_HW if hw else Available.SW_ONLY -def partition_for_ethosn78(mod, params=None, **opts): +def partition_for_ethosn(mod, params=None, **opts): """Partition the graph greedily offloading supported operators to Arm Ethos-N NPU. @@ -61,8 +62,19 @@ def partition_for_ethosn78(mod, params=None, **opts): ------- ret : annotated and partitioned module. """ - if not opts or opts.get("variant", "").lower() != "ethos-n78": - raise ValueError("When targeting Ethos(TM)-N78, -variant=Ethos-N78 should be set.") + opts = opts or {} + if "variant" not in opts: + raise ValueError("Please specify a variant in the target string, e.g. 
-variant=n78.") + + # -variant=ethos-n78 deprecated in favour of -variant=n78 + if opts["variant"].lower() == "ethos-n78": + warnings.warn( + "Please use '-variant=n78' instead of the deprecated " + "'-variant=ethos-n78', which will be removed in TVM v0.9.", + DeprecationWarning, + ) + elif opts["variant"] != "n78": + raise ValueError("When targeting Ethos(TM)-N78, -variant=n78 should be set.") if params: mod["main"] = bind_params_by_name(mod["main"], params) diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index d9f7b84b2f76..fc8a4c48dfef 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -213,9 +213,14 @@ String MakeVariant(Optional configuration) { String variant = configuration.value()->variant; // Transform variant string to lowercase for comparison std::string variant_string = variant.c_str(); - std::transform(variant_string.begin(), variant_string.end(), variant_string.begin(), ::tolower); - std::string variant_n78 = "ethos-n78"; - if (variant_string == variant_n78) { + + // Checking deprecated variant format. Support for specifying + // the variant in this way only remains for backwards compatibility + // and will be removed in a later release of TVM. 
+ std::string deprecated_variant_string = variant_string; + std::transform(deprecated_variant_string.begin(), deprecated_variant_string.end(), + deprecated_variant_string.begin(), ::tolower); + if (variant_string == "n78" || deprecated_variant_string == "ethos-n78") { String tops = configuration.value()->tops; String ple_ratio = configuration.value()->ple_ratio; variant = "Ethos-N78_" + tops + "TOPS_" + ple_ratio + "PLE_RATIO"; diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h index cca96c044c84..9da4e5b18bd5 100644 --- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h +++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h @@ -251,7 +251,7 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode 0 diff --git a/tests/python/driver/tvmc/test_target.py b/tests/python/driver/tvmc/test_target.py index eb3ffdea42b3..b842618efccd 100644 --- a/tests/python/driver/tvmc/test_target.py +++ b/tests/python/driver/tvmc/test_target.py @@ -153,10 +153,10 @@ def test_parse_quotes_and_separators_on_options(): def test_parse_multiple_target_with_opts_ethos_n78(): - targets = parse_target("ethos-n78 -myopt=value, llvm -device=arm_cpu --system-lib") + targets = parse_target("ethos-n -myopt=value, llvm -device=arm_cpu --system-lib") assert len(targets) == 2 - assert "ethos-n78" == targets[0]["name"] + assert "ethos-n" == targets[0]["name"] assert "myopt" in targets[0]["opts"] assert "value" == targets[0]["opts"]["myopt"] assert "llvm" == targets[1]["name"] From 1b32245f0bb4a76ff10b34c37e01413bda6a4021 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Wed, 18 May 2022 14:22:30 +0200 Subject: [PATCH 17/59] [microNPU] Add a pass to reorder copy and compute nodes (#10959) --- .../backend/contrib/ethosu/tir/compiler.py | 1 + .../backend/contrib/ethosu/tir/passes.py | 25 + src/tir/contrib/ethosu/passes.cc | 108 ++++ .../cascader/test_memory_reduction.py | 16 +- .../test_copy_compute_reordering.py | 472 
++++++++++++++++++ .../test_ethosu/test_encode_constants.py | 247 +++++---- .../contrib/test_ethosu/test_networks.py | 18 +- .../contrib/test_ethosu/test_replace_copy.py | 6 +- .../contrib/test_ethosu/test_scheduler.py | 43 +- 9 files changed, 768 insertions(+), 168 deletions(-) create mode 100644 tests/python/contrib/test_ethosu/test_copy_compute_reordering.py diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py index f2c294cfed1a..db216e43e2d1 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -90,6 +90,7 @@ def lower_ethosu(sch, args, const_dict, name="main"): mod = tvm.tir.transform.RemoveNoOp()(mod) mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod) mod = ethosu_passes.HoistAllocates()(mod) + mod = ethosu_passes.CopyComputeReordering()(mod) disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False) if not disable_storage_rewrite: mod = tvm.tir.transform.StorageRewrite()(mod) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py index baadede08d66..76726132e05d 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-argument, no-else-return, inconsistent-return-statements, too-many-nested-blocks """The TIR passes to be run on Arm(R) Ethos(TM)-U NPU TIR Compiler.""" from collections import namedtuple +from typing import Optional import numpy as np # type: ignore import tvm @@ -913,3 +914,27 @@ def HoistAllocates() -> tvm.IRModule: The new module with hoisted allocate nodes. 
""" return _ffi_api.HoistAllocates() + + +def CopyComputeReordering(max_copy_movements: Optional[int] = None) -> tvm.IRModule: + """ + Reorders copy and compute nodes in such a way that independent DMA copies, + and computes happen in parallel. + Copies to buffers with local scope are not reordered, indeed they copy LUT + into the SHRAM which already happens in parallel with copying weights into + the weights encoder. + + Parameters + ---------- + max_copy_movements: Optional[int] + The maximum number of movements allowed for a copy. + If None, the pass context option + tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements + is used if provided, otherwise the default value will be 1. + + Returns + ------- + tvm.IRModule + The new module with copy and compute nodes reordered. + """ + return _ffi_api.CopyComputeReordering(max_copy_movements) diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc index 45161499f5be..2b7b2b4741e6 100644 --- a/src/tir/contrib/ethosu/passes.cc +++ b/src/tir/contrib/ethosu/passes.cc @@ -27,7 +27,17 @@ #include #include +#include + namespace tvm { + +/*! + * \brief The maximum number of movements allowed for a copy in the CopyComputeReordering pass. + */ +constexpr const char* kCopyComputeReorderingMaxCopyMovements = + "tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements"; +TVM_REGISTER_PASS_CONFIG_OPTION(kCopyComputeReorderingMaxCopyMovements, Integer); + namespace tir { namespace contrib { namespace ethosu { @@ -110,6 +120,104 @@ tvm::transform::Pass HoistAllocates() { TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates); +/*! + * \brief Reorders copy and compute nodes in such a way that independent DMA copies, + * and computes happen in parallel. + * Copies to buffers with local scope are not reordered, indeed they copy LUT + * into the SHRAM which already happens in parallel with copying weights into + * the weights encoder. 
+ */ +class CopyComputeReorderingMutator : public StmtExprMutator { + public: + explicit CopyComputeReorderingMutator(int max_copy_movements) + : _max_copy_movements{max_copy_movements} {} + + PrimFunc operator()(PrimFunc main_func) { + if (_max_copy_movements > 0) { + auto prim_func_node{main_func.CopyOnWrite()}; + prim_func_node->body = this->VisitStmt(main_func->body); + return GetRef(prim_func_node); + } + return main_func; + } + + private: + Stmt VisitStmt_(const SeqStmtNode* op) override { + if (op->size() <= 1) { + return StmtExprMutator::VisitStmt_(op); + } + + auto seq_stmt{GetRef(op)}; + std::vector new_seq(seq_stmt->size()); + std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin()); + + // Each copy statement to a buffer with global scope is moved up + // at most `_max_copy_movements` times. + for (size_t index = 0; index < new_seq.size(); ++index) { + if (stmt_is_global_copy(new_seq[index])) { + int lower = std::max(0, static_cast(index) - _max_copy_movements); + for (int i = index; i > lower && !stmt_is_copy(new_seq[i - 1]); --i) { + std::swap(new_seq[i - 1], new_seq[i]); + } + } + } + + auto seq_stmt_node{CopyOnWrite(op)}; + seq_stmt_node->seq = std::move(new_seq); + return Stmt{seq_stmt_node}; + } + + tvm::runtime::Array get_stmt_args(const Stmt& stmt) { + auto eval_node{stmt.as()}; + ICHECK(eval_node) << "Expected statement to be an evaluate node, but was " + << stmt->GetTypeKey(); + auto call_node{eval_node->value.as()}; + ICHECK(call_node) << "Expected expression to be a call node, but was " + << eval_node->value->GetTypeKey(); + return call_node->args; + } + + bool stmt_is_copy(const Stmt& stmt) { + auto args{get_stmt_args(stmt)}; + return args[0].as()->value == "ethosu_copy"; + } + + bool stmt_is_global_copy(const Stmt& stmt) { + auto args{get_stmt_args(stmt)}; + return args[0].as()->value == "ethosu_copy" && + args[3].as()->buffer.scope() == "global"; + } + + /*! The maximum number of movements allowed for a copy. 
*/ + int _max_copy_movements; +}; + +/*! + * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies, + * and computes happen in parallel. + * + * \param max_copy_movements: The maximum number of movements allowed for a copy. + * If None, the pass context option tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements + * is used if provided, otherwise the default value will be 1. + * \return tvm::transform::Pass + */ +tvm::transform::Pass CopyComputeReordering(Optional max_copy_movements) { + auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) { + ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main")) + << "Expected a single primitive function called 'main'. Please run the " + "CopyComputeReordering " + "pass in conjunction with the LowerToTIR() pass."; + auto value = max_copy_movements.value_or( + ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value()); + return CopyComputeReorderingMutator(value)(f); + }; + return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0, + "tir.contrib.ethos-u.CopyComputeReordering", {}); +} + +TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.CopyComputeReordering") + .set_body_typed(CopyComputeReordering); + } // namespace ethosu } // namespace contrib } // namespace tir diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py index 5e4117e50f8e..01545217beb4 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py +++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py @@ -91,10 +91,10 @@ def _get_ethosu_workspace_size( @pytest.mark.parametrize( "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping", [ - ("ethos-u55-256", 1067408, 14096), - ("ethos-u55-128", 1067408, 3968), - ("ethos-u55-64", 1067408, 3968), - ("ethos-u55-32", 1067392, 3952), + ("ethos-u55-256", 1067520, 14208), 
+ ("ethos-u55-128", 1067520, 4080), + ("ethos-u55-64", 1067520, 4080), + ("ethos-u55-32", 1067504, 4064), ], ) def test_double_conv2d( @@ -161,10 +161,10 @@ def tf_graph(x): @pytest.mark.parametrize( "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping", [ - ("ethos-u55-256", 180096, 15008), - ("ethos-u55-128", 180096, 14240), - ("ethos-u55-64", 180096, 14240), - ("ethos-u55-32", 180096, 14240), + ("ethos-u55-256", 180288, 15200), + ("ethos-u55-128", 180288, 14432), + ("ethos-u55-64", 180288, 14432), + ("ethos-u55-32", 180272, 14416), ], ) def test_depthwise2d_conv2d_pooling( diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py new file mode 100644 index 000000000000..eebaa3b816b4 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py @@ -0,0 +1,472 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pytest + +pytest.importorskip("ethosu.vela") + +import tvm +from tvm.script import tir as T +from tvm.relay.backend.contrib.ethosu.tir.passes import CopyComputeReordering + +# fmt: off +@tvm.script.ir_module +class AllOperatorsWithWeights: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer2 = T.buffer_decl([128], "uint8") + buffer3 = T.buffer_decl([32], "uint8") + buffer4 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + buffer6 = T.buffer_decl([112], "uint8") + buffer7 = T.buffer_decl([32], "uint8") + buffer8 = T.buffer_decl([112], "uint8") + buffer9 = T.buffer_decl([32], "uint8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 
2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) +# fmt: on + + +def test_all_operators_with_weights_max_copy_movements_0(): + test_mod = CopyComputeReordering(0)(AllOperatorsWithWeights) + reference_mod = AllOperatorsWithWeights + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_all_operators_with_weights_max_copy_movements_1(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer2 = T.buffer_decl([128], "uint8") + buffer3 = T.buffer_decl([32], "uint8") + buffer4 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + buffer6 = T.buffer_decl([112], "uint8") + buffer7 = 
T.buffer_decl([32], "uint8") + buffer8 = T.buffer_decl([112], "uint8") + buffer9 = T.buffer_decl([32], "uint8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 
16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(1)(AllOperatorsWithWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_all_operators_with_weights_max_copy_movements_2(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer2 = T.buffer_decl([128], "uint8") + buffer3 = T.buffer_decl([32], "uint8") + buffer4 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + buffer6 = T.buffer_decl([112], "uint8") + buffer7 = T.buffer_decl([32], "uint8") + buffer8 = T.buffer_decl([112], "uint8") + buffer9 = T.buffer_decl([32], "uint8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], 
dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(2)(AllOperatorsWithWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +# fmt: off 
+@tvm.script.ir_module +class AllOperatorsWithoutWeights: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([36], "int8") + buffer2 = T.buffer_decl([9], "int8") + # body + p1 = T.allocate([96], "int8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 4, 3, 3, 0, 4, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 12, 3, 1, "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "int8", 3, 1, 3, 3, 0, 1, buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 3, 1, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) +# fmt: on + + +@pytest.mark.parametrize("max_copy_movements", [0, 1, 2]) +def test_all_operators_without_weights(max_copy_movements): + test_mod = CopyComputeReordering(max_copy_movements)(AllOperatorsWithoutWeights) + reference_mod = AllOperatorsWithoutWeights + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +# fmt: off +@tvm.script.ir_module +class OperatorsWithAndWithoutWeights: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 
2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) +# fmt: on + + +def test_operators_with_and_without_weights_max_copy_movements_0(): + test_mod = CopyComputeReordering(0)(OperatorsWithAndWithoutWeights) + reference_mod = OperatorsWithAndWithoutWeights + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_operators_with_and_without_weights_max_copy_movements_1(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], 
"uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(1)(OperatorsWithAndWithoutWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_operators_with_and_without_weights_max_copy_movements_2(): + # fmt: off + @tvm.script.ir_module + class 
ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + 
# fmt: on + + test_mod = CopyComputeReordering(2)(OperatorsWithAndWithoutWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +# fmt: off +@tvm.script.ir_module +class CopyToBufferWithLocalScope: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([64], "uint8") + buffer2 = T.buffer_decl([48], "uint8") + buffer3 = T.buffer_decl([48], "uint8") + buffer4 = T.buffer_decl([256], "uint8") + buffer5 = T.buffer_decl([16], "uint8") + buffer6 = T.buffer_decl([48], "uint8") + buffer7 = T.buffer_decl([256], "uint8") + buffer8 = T.buffer_decl([64], "uint8") + # body + p1 = T.allocate([48], "uint8", "global") + p2 = T.allocate([48], "uint8", "global") + p3 = T.allocate([256], "int8", "local") + p4 = T.allocate([256], "int8", "global") + p5 = T.allocate([16], "uint8", "global") + p6 = T.allocate([48], "uint8", "global") + p7 = T.allocate([256], "int8", "local") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 
1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) +# fmt: on + + +def test_copy_to_buffer_with_local_scope_max_copy_movements_0(): + test_mod = CopyComputeReordering(0)(CopyToBufferWithLocalScope) + reference_mod = CopyToBufferWithLocalScope + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +@pytest.mark.parametrize("max_copy_movements", [1, 2]) +def test_copy_to_buffer_with_local_scope_max_copy_movements_n(max_copy_movements): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([64], "uint8") + buffer2 = T.buffer_decl([48], "uint8") + buffer3 = T.buffer_decl([48], "uint8") + buffer4 = T.buffer_decl([256], "uint8") + buffer5 = T.buffer_decl([16], "uint8") + buffer6 = T.buffer_decl([48], "uint8") + buffer7 = T.buffer_decl([256], "uint8") + buffer8 = T.buffer_decl([64], "uint8") + # body + p1 = T.allocate([48], "uint8", "global") + p2 = T.allocate([48], "uint8", "global") + p3 = T.allocate([256], "int8", "local") + p4 = T.allocate([256], "int8", "global") + p5 = T.allocate([16], "uint8", "global") + p6 = T.allocate([48], "uint8", "global") + p7 = T.allocate([256], "int8", "local") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 
4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(max_copy_movements)(CopyToBufferWithLocalScope) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_multiple_prim_funcs(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(): + T.evaluate(0) + + @T.prim_func + def abc(): + T.evaluate(0) + # fmt: on + + err_rgx = ( + r"Expected a single primitive function called 'main'. " + r"Please run the CopyComputeReordering pass in conjunction with the LowerToTIR\(\) pass." + ) + with pytest.raises(tvm.TVMError, match=err_rgx): + CopyComputeReordering(1)(InputModule) + + +def test_no_main_prim_func(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def abs(): + T.evaluate(0) + # fmt: on + + err_rgx = ( + r"Expected a single primitive function called 'main'. " + r"Please run the CopyComputeReordering pass in conjunction with the LowerToTIR\(\) pass." 
+ ) + with pytest.raises(tvm.TVMError, match=err_rgx): + CopyComputeReordering(1)(InputModule) + + +def test_default_max_copy_movements(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, 
buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering()(OperatorsWithAndWithoutWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_pass_context_option_max_copy_movements(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 
1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + with tvm.transform.PassContext( + config={"tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements": 2} + ): + test_mod = CopyComputeReordering()(OperatorsWithAndWithoutWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index 92e6cd3e19cb..15b719f33c3f 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -37,33 +37,34 @@ class WeightStreamOnlyU55: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([128], "uint8") - buffer_1 = T.buffer_decl([32], "uint8") - buffer_2 = T.buffer_decl([112], "uint8") - buffer_3 = T.buffer_decl([32], "uint8") - buffer_4 = T.buffer_decl([112], "uint8") - buffer_5 = T.buffer_decl([32], "uint8") - buffer_6 = T.buffer_decl([112], "uint8") - buffer_7 = T.buffer_decl([32], "uint8") + buffer1 = T.buffer_decl([128], "uint8") + buffer2 = T.buffer_decl([32], "uint8") + buffer3 = T.buffer_decl([112], "uint8") + buffer4 = T.buffer_decl([32], "uint8") + buffer5 = T.buffer_decl([112], "uint8") + buffer6 = T.buffer_decl([32], "uint8") + buffer7 = T.buffer_decl([112], 
"uint8") + buffer8 = T.buffer_decl([32], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - p1_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1_global_1 = T.buffer_decl([112], dtype="uint8", data=p1_global.data) - p2_global_1 = T.buffer_decl([32], dtype="uint8", data=p2_global.data) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 128, p1_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 32, p2_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global[0], 128, T.int8(-1), T.int8(-1), 12, p2_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 
0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + buffer9 = T.buffer_decl([112], "uint8", data=p1.data) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), 
T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 112, buffer9[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, T.int8(-1), T.int8(-1), 12, p4[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 112, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, T.int8(-1), T.int8(-1), 12, p4[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -74,37 +75,34 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - buffer_encoded = T.buffer_decl([160], dtype="uint8") - buffer_encoded_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_2 = 
T.buffer_decl([160], dtype="uint8") - buffer_encoded_3 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_4 = T.buffer_decl([176], dtype="uint8") - buffer_encoded_5 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_6 = T.buffer_decl([160], dtype="uint8") - buffer_encoded_7 = T.buffer_decl([32], dtype="uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) + buffer_encoded_1 = T.buffer_decl([160], dtype="uint8") + buffer_encoded_1_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_2_1 = T.buffer_decl([160], dtype="uint8") + buffer_encoded_3_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_4_1 = T.buffer_decl([176], dtype="uint8") + buffer_encoded_5_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_6_1 = T.buffer_decl([160], dtype="uint8") + buffer_encoded_7_1 = T.buffer_decl([32], dtype="uint8") # body placeholder_global = T.allocate([176], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_1 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) - placeholder_global_2 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) - placeholder_global_3 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_1 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - placeholder_d_global_2 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - placeholder_d_global_3 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 160, placeholder_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) + placeholder_global_2 = T.allocate([160], "uint8", 
"global", annotations={"disable_lower_builtin":True}) + placeholder_d_global_2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_global_1 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 160, placeholder_global_1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1_1[0], 32, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 160, placeholder_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3_1[0], 32, placeholder_d_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 80, placeholder_global_1[80], 80, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 160, placeholder_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_1[0], 16, placeholder_d_global_1[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 176, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, 
placeholder_d_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 96, placeholder_global[96], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 160, placeholder_global_3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 80, placeholder_global_3[80], 80, 12, placeholder_d_global_3[0], 16, placeholder_d_global_3[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 176, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5_1[0], 32, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 160, placeholder_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7_1[0], 
32, placeholder_d_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 96, placeholder_global[96], 80, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -172,19 +170,21 @@ class RereadWeightsU55: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([304], "uint8") - buffer_1 = T.buffer_decl([80], "uint8") + buffer1 = T.buffer_decl([304], "uint8") + buffer2 = T.buffer_decl([80], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 304, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 80, placeholder_d_global[0], dtype="handle")) - 
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 304, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 80, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, 
T.int8(-1), T.int8(-1), 12, p2[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 304, T.int8(-1), T.int8(-1), 12, p4[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -195,20 +195,20 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder_encoded = T.buffer_decl([368], dtype="uint8") - placeholder_encoded_1 = T.buffer_decl([96], dtype="uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) + placeholder_encoded_1 = T.buffer_decl([368], "uint8") + placeholder_encoded_1_2 = T.buffer_decl([96], "uint8") # body placeholder_global = T.allocate([368], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_1 = T.buffer_decl([368], dtype="uint8", data=placeholder_global.data) placeholder_d_global = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_1 = T.buffer_decl([96], dtype="uint8", data=placeholder_d_global.data) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 368, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 96, placeholder_d_global[0], dtype="handle")) + placeholder_global_1 = T.allocate([368], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_d_global_1 = T.allocate([96], "uint8", "global", 
annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 368, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1_2[0], 96, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 368, placeholder_global_1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1_2[0], 96, placeholder_d_global_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 192, placeholder_global[192], 176, 12, placeholder_d_global[0], 48, placeholder_d_global[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 368, placeholder_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 96, placeholder_d_global_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 192, placeholder_global_1[192], 176, 12, placeholder_d_global_1[0], 48, placeholder_d_global_1[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -374,35 +374,37 @@ class MixedReadU55: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([592], "uint8") - buffer_1 = T.buffer_decl([160], "uint8") - buffer_2 = 
T.buffer_decl([80], "uint8") - buffer_3 = T.buffer_decl([32], "uint8") - buffer_4 = T.buffer_decl([80], "uint8") - buffer_5 = T.buffer_decl([32], "uint8") - buffer_6 = T.buffer_decl([80], "uint8") - buffer_7 = T.buffer_decl([32], "uint8") - buffer_8 = T.buffer_decl([80], "uint8") - buffer_9 = T.buffer_decl([32], "uint8") + buffer1 = T.buffer_decl([80], "uint8") + buffer2 = T.buffer_decl([32], "uint8") + buffer3 = T.buffer_decl([80], "uint8") + buffer4 = T.buffer_decl([32], "uint8") + buffer5 = T.buffer_decl([80], "uint8") + buffer6 = T.buffer_decl([32], "uint8") + buffer7 = T.buffer_decl([80], "uint8") + buffer8 = T.buffer_decl([32], "uint8") + buffer9 = T.buffer_decl([592], "uint8") + buffer10 = T.buffer_decl([160], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 
16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_8[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_9[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, 
placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p5 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 80, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 80, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, 
"NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, T.int8(-1), T.int8(-1), 12, p5[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, T.int8(-1), T.int8(-1), 12, p5[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -412,42 +414,37 @@ class MixedReadU65: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) + T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) # buffer definition - buffer_encoded = T.buffer_decl([96], dtype="uint8") - buffer_encoded_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_2 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_3 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_4 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_5 = T.buffer_decl([32], 
dtype="uint8") - buffer_encoded_6 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_7 = T.buffer_decl([32], dtype="uint8") - placeholder_encoded = T.buffer_decl([608], dtype="uint8") - placeholder_encoded_1 = T.buffer_decl([160], dtype="uint8") - T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) - # body - ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + buffer_encoded_1 = T.buffer_decl([96], dtype="uint8") + buffer_encoded_1_2 = T.buffer_decl([32], dtype="uint8") + placeholder_encoded_1 = T.buffer_decl([608], dtype="uint8") + placeholder_encoded_1_2 = T.buffer_decl([160], dtype="uint8") + buffer_encoded_2_1 = T.buffer_decl([96], dtype="uint8") + buffer_encoded_3_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_4_1 = T.buffer_decl([96], dtype="uint8") + buffer_encoded_5_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_6_1 = T.buffer_decl([96], dtype="uint8") + buffer_encoded_7_1 = T.buffer_decl([32], dtype="uint8") placeholder_global = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_1 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data) - placeholder_global_2 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data) - placeholder_global_3 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data) placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_1 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - placeholder_d_global_2 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - placeholder_d_global_3 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, 
T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded[0], 304, placeholder_encoded[304], 304, 12, placeholder_encoded_1[0], 80, placeholder_encoded_1[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 96, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) + ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + placeholder_global_2 = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_d_global_2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 96, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1_2[0], 32, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded_1[0], 304, placeholder_encoded_1[304], 304, 12, placeholder_encoded_1_2[0], 80, placeholder_encoded_1_2[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 96, placeholder_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3_1[0], 32, placeholder_d_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, 
"NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 48, placeholder_global[48], 48, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 96, placeholder_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 48, placeholder_global_1[48], 48, 12, placeholder_d_global_1[0], 16, placeholder_d_global_1[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 96, placeholder_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 96, placeholder_global_3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 
14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 48, placeholder_global_3[48], 48, 12, placeholder_d_global_3[0], 16, placeholder_d_global_3[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 96, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5_1[0], 32, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 96, placeholder_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7_1[0], 32, placeholder_d_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 48, placeholder_global[48], 48, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 
0, dtype="handle")) __tvm_meta__ = None # fmt: on diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index e04cabe79d2f..f64263ca0623 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -43,13 +43,13 @@ @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1423344), - ("ethos-u65-256", MOBILENET_V2_URL, 2185584), - ("ethos-u55-256", MOBILENET_V1_URL, 1423344), - ("ethos-u55-256", MOBILENET_V2_URL, 2185584), - ("ethos-u55-128", MOBILENET_V2_URL, 2185584), - ("ethos-u55-64", MOBILENET_V2_URL, 2185584), - ("ethos-u55-32", MOBILENET_V2_URL, 2185584), + ("ethos-u65-256", MOBILENET_V1_URL, 1892704), + ("ethos-u65-256", MOBILENET_V2_URL, 2257984), + ("ethos-u55-256", MOBILENET_V1_URL, 1892704), + ("ethos-u55-256", MOBILENET_V2_URL, 2257984), + ("ethos-u55-128", MOBILENET_V2_URL, 2257984), + ("ethos-u55-64", MOBILENET_V2_URL, 2257984), + ("ethos-u55-32", MOBILENET_V2_URL, 2258000), ], ) def test_networks_without_usmp(accel_type, model_url, workspace_size): @@ -71,8 +71,8 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size): @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1205872), - ("ethos-u55-256", MOBILENET_V2_URL, 1507152), + ("ethos-u65-256", MOBILENET_V1_URL, 1206880), + ("ethos-u55-256", MOBILENET_V2_URL, 1509408), ], ) def test_networks_with_usmp(accel_type, model_url, workspace_size): diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index 4f06695b25b1..932df71d2402 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -88,14 +88,14 @@ def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(409 T.preflattened_buffer(ethosu_write_1, [1, 
16, 16, 16], dtype="int8", data=ethosu_write_1.data) # body placeholder_global_unrolled_iter_0 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_global_unrolled_iter_1 = T.buffer_decl([272], "uint8", data=placeholder_global_unrolled_iter_0.data) placeholder_d_global_unrolled_iter_0 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global_unrolled_iter_1 = T.buffer_decl([64], dtype="uint8", data=placeholder_d_global_unrolled_iter_0.data) + placeholder_global_unrolled_iter_1 = T.allocate([272], "uint8", "global", annotations={"disable_lower_builtin": True}) + placeholder_d_global_unrolled_iter_1 = T.allocate([64], "uint8", "global", annotations={"disable_lower_builtin": True}) T.evaluate(T.call_extern("ethosu_copy", buffer[0], 416, placeholder_global_unrolled_iter_0[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 112, placeholder_d_global_unrolled_iter_0[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_0[0], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 272, placeholder_global_unrolled_iter_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 64, placeholder_d_global_unrolled_iter_1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, 
placeholder_global_unrolled_iter_0[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_0[0], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_1[0], 272, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_1[0], 64, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index 8a83e769141d..4baea26e591e 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -180,29 +180,27 @@ def test_schedule_cache_reads(): @tvm.script.ir_module class DiamondGraphTir: @T.prim_func - def main(input_buffer: T.Buffer[(301056,), "int8"], output_buffer: T.Buffer[(75264,), "int8"]) -> None: + def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264,), "int8"]) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - T.preflattened_buffer(input_buffer, [1, 56, 56, 96], dtype='int8', data=input_buffer.data) - T.preflattened_buffer(output_buffer, [1, 56, 56, 24], dtype='int8', data=output_buffer.data) - - weight_buffer = T.buffer_decl([2608], "uint8") - bias_buffer = T.buffer_decl([240], "uint8") - weight_buffer2 = T.buffer_decl([736], "uint8") - bias_buffer2 = T.buffer_decl([240], "uint8") - - weight_global = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True}) - weight_global2 = T.buffer_decl([736], "uint8", data=weight_global.data) - bias_global = T.allocate([240], "uint8", 
"global", annotations={"disable_lower_builtin":True}) - featuremap_buffer = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True}) - featuremap_buffer2 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True}) - - T.evaluate(T.call_extern("ethosu_copy", weight_buffer[0], 2608, weight_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", bias_buffer[0], 240, bias_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, input_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global[0], 2608, T.int8(-1), T.int8(-1), 12, bias_global[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", weight_buffer2[0], 736, weight_global2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", bias_buffer2[0], 240, bias_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global2[0], 736, T.int8(-1), T.int8(-1), 12, bias_global[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, output_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle")) + T.preflattened_buffer(placeholder, [1, 56, 56, 96], 
dtype='int8', data=placeholder.data) + T.preflattened_buffer(ethosu_write, [1, 56, 56, 24], dtype='int8', data=ethosu_write.data) + buffer1 = T.buffer_decl([2608], "uint8") + buffer2 = T.buffer_decl([240], "uint8") + buffer3 = T.buffer_decl([736], "uint8") + buffer4 = T.buffer_decl([240], "uint8") + p1 = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([736], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) + p5 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2608, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 240, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 736, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 240, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, T.int8(-1), T.int8(-1), 12, p2[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p3[0], 736, T.int8(-1), T.int8(-1), 12, p4[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_binary_elementwise", 
"int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0,T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -218,7 +216,6 @@ def test_schedule_diamond_graph(): test_mod, _ = _lower_to_tir(func, copy_constants()) reference_mod = DiamondGraphTir - tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) From f88a10fb00419c51a116a63f931a98d8286b23de Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Wed, 18 May 2022 14:04:24 +0100 Subject: [PATCH 18/59] [TFLite] Add support to int16 data type in TFLite frontend (#10915) * [TFLite] Add support to int16 data type in TFLite frontend Add support for int16 data type and int64 biases/accumulators in the TFLite frontend. Adjusts TFLite tests to cover int16 convolutions and element-wise; Fixes a minor typo negtive->negative in the element-wise tests. 
* Update src/relay/qnn/op/convolution.cc Co-authored-by: Elen Kalda Co-authored-by: Elen Kalda --- python/tvm/relay/frontend/tflite.py | 11 +- src/relay/qnn/op/convolution.cc | 48 ++-- src/relay/qnn/op/dequantize.cc | 4 +- src/relay/qnn/op/quantize.cc | 4 +- src/relay/qnn/op/requantize.cc | 8 +- tests/python/frontend/tflite/test_forward.py | 257 ++++++++++++++----- 6 files changed, 235 insertions(+), 97 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 8d18cc2962ae..b696bd6d056b 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -390,6 +390,7 @@ def get_tensor_type_as_numpy(self, tensor_wrapper): return { TensorType.UINT8: np.uint8, TensorType.INT8: np.int8, + TensorType.INT16: np.int16, TensorType.FLOAT16: np.float16, TensorType.FLOAT32: np.float32, TensorType.INT32: np.int32, @@ -430,6 +431,8 @@ def get_tensor_type_str(self, tensor_type): if tensor_type == TensorType.INT8: return "int8" + if tensor_type == TensorType.INT16: + return "int16" if tensor_type == TensorType.UINT8: return "uint8" if tensor_type == TensorType.FLOAT16: @@ -2149,7 +2152,9 @@ def convert_conv(self, op, conv_type): qnn_conv2d_params = dict(params) qnn_conv2d_params["input_zero_point"] = input_tensor.qnn_params["zero_point"] qnn_conv2d_params["kernel_zero_point"] = weight_tensor.qnn_params["zero_point"] - qnn_conv2d_params["out_dtype"] = "int32" + qnn_conv2d_params["out_dtype"] = ( + "int64" if output_tensor_type_str == "int16" else "int32" + ) qnn_conv2d_params["input_scale"] = input_tensor.qnn_params["scale"] qnn_conv2d_params["kernel_scale"] = weight_tensor.qnn_params["scale"] out = _qnn.op.conv2d(in_expr, weight_expr, **qnn_conv2d_params) @@ -2160,8 +2165,8 @@ def convert_conv(self, op, conv_type): if len(input_tensors) == 3: bias_tensor = input_tensors[2] bias_tensor_type = bias_tensor.tensor.Type() - # bias tensor type should be INT32 (quantization) or FLOAT32 - assert bias_tensor_type in 
(TensorType.INT32, TensorType.FLOAT32) + # bias tensor type should be INT32 (int8 qnn) or INT64 (int16 qnn) or FLOAT32 + assert bias_tensor_type in (TensorType.INT32, TensorType.INT64, TensorType.FLOAT32) bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type) if self.has_expr(bias_tensor.tensor_idx): bias_expr = self.get_expr(bias_tensor.tensor_idx) diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc index 8a7521e8ee50..42e4540f0f2c 100644 --- a/src/relay/qnn/op/convolution.cc +++ b/src/relay/qnn/op/convolution.cc @@ -50,12 +50,14 @@ bool QnnConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, if (data == nullptr || weight == nullptr) return false; const auto* param = attrs.as(); ICHECK(param != nullptr) << "Conv2DAttrs cannot be nullptr."; - ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) - << "Expected qnn conv2d type(int8, uint8) for input but was " << data->dtype; + ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8) || + data->dtype == DataType::Int(16)) + << "Expected qnn conv2d type(int8, uint8, int16) for input but was " << data->dtype; ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8)) << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype; - ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32)) - << "Expected qnn conv2d type(int32, int16) for output but was " << param->out_dtype; + ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32) || + param->out_dtype == DataType::Int(64)) + << "Expected qnn conv2d type(int16, int32, int64) for output but was " << param->out_dtype; ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0."; // Check the types of scale and zero points. 
@@ -190,19 +192,21 @@ WorkloadType GetWorkload(const Array& arg_types, const Conv2DA */ Expr Conv2DFallBack(const Expr& data, const Expr& weight, const Expr& input_zero_point, const Expr& kernel_zero_point, const Conv2DAttrs* param) { - // Upcast the zero point to Int16. - auto zp_data = Cast(input_zero_point, DataType::Int(16)); - auto zp_kernel = Cast(kernel_zero_point, DataType::Int(16)); + // Upcast the parameters to be at least int32 to avoid overflow + auto upcast_bits = param->out_dtype.bits() < 32 ? 32 : param->out_dtype.bits(); - auto shifted_data = Cast(data, DataType::Int(16)); - auto zero_scalar = MakeConstantScalar(DataType::Int(32), 0); + auto zp_data = Cast(input_zero_point, DataType::Int(upcast_bits)); + auto zp_kernel = Cast(kernel_zero_point, DataType::Int(upcast_bits)); + + auto shifted_data = Cast(data, DataType::Int(upcast_bits)); + auto zero_scalar = MakeConstantScalar(DataType::Int(upcast_bits), 0); if (!IsEqualScalar(input_zero_point, zero_scalar)) { - shifted_data = Subtract(Cast(data, DataType::Int(16)), zp_data); + shifted_data = Subtract(Cast(data, DataType::Int(upcast_bits)), zp_data); } - auto shifted_kernel = Cast(weight, DataType::Int(16)); + auto shifted_kernel = Cast(weight, DataType::Int(upcast_bits)); if (!IsEqualScalar(kernel_zero_point, zero_scalar)) { - shifted_kernel = Subtract(Cast(weight, DataType::Int(16)), zp_kernel); + shifted_kernel = Subtract(Cast(weight, DataType::Int(upcast_bits)), zp_kernel); } return Conv2D(shifted_data, shifted_kernel, param->strides, param->padding, param->dilation, @@ -557,6 +561,7 @@ Expr Conv2DThirdTerm(const Expr& weight, const Expr& input_zero_point, const Con * \param in_channels The number of input channels. * \param kernel_h The height of kernel. * \param kernel_w The width of kernel. + * \param param The qnn conv2d attributes. * \return The sequence of Relay operators for term4. 
* \note The term4 looks like this * @@ -564,10 +569,11 @@ Expr Conv2DThirdTerm(const Expr& weight, const Expr& input_zero_point, const Con * */ Expr Conv2DFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int in_channels, - int kernel_h, int kernel_w) { + int kernel_h, int kernel_w, const Conv2DAttrs* param) { + auto upcast_bits = param->out_dtype.bits() < 32 ? 32 : param->out_dtype.bits(); int scalar_term4 = input_zero_point_int * kernel_zero_point_int * in_channels * kernel_h * kernel_w; - return MakeConstantScalar(DataType::Int(32), scalar_term4); + return MakeConstantScalar(DataType::Int(upcast_bits), scalar_term4); } /* @@ -578,6 +584,7 @@ Expr Conv2DFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int i * \param in_channels The number of input channels. * \param kernel_h The height of kernel. * \param kernel_w The width of kernel. + * \param param The qnn conv2d attributes. * \return The sequence of Relay operators for term4. * \note The term4 looks like this * @@ -585,8 +592,10 @@ Expr Conv2DFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int i * */ Expr Conv2DFourthTerm(const Expr& input_zero_point, const Expr& kernel_zero_point, int in_channels, - int kernel_h, int kernel_w) { - Expr scalar_term4 = MakeConstantScalar(DataType::Int(32), in_channels * kernel_h * kernel_w); + int kernel_h, int kernel_w, const Conv2DAttrs* param) { + auto upcast_bits = param->out_dtype.bits() < 32 ? 
32 : param->out_dtype.bits(); + Expr scalar_term4 = + MakeConstantScalar(DataType::Int(upcast_bits), in_channels * kernel_h * kernel_w); Expr variable_term4 = Multiply(input_zero_point, kernel_zero_point); return Multiply(scalar_term4, variable_term4); } @@ -791,10 +800,11 @@ Expr QnnConv2DCanonicalize(const Attrs& attrs, const Array& new_args, auto term3 = Conv2DThirdTerm(weight, input_zero_point, param, out_channels); Expr term4; if (dynamic_zp) { - term4 = Conv2DFourthTerm(input_zero_point, kernel_zero_point, in_channels, kernel_h, kernel_w); + term4 = Conv2DFourthTerm(input_zero_point, kernel_zero_point, in_channels, kernel_h, kernel_w, + param); } else { term4 = Conv2DFourthTerm(input_zero_point_int, kernel_zero_point_int, in_channels, kernel_h, - kernel_w); + kernel_w, param); } return Conv2DCombineTerms(term1, term2, term3, term4, input_zero_point_int, kernel_zero_point_int); @@ -829,7 +839,7 @@ This operator convolves quantized weight with quantized data. The scale of the output quantized tensor is the product of the weight_scale and input_scale of the input quantized tensors. The zero point of the output quantized tensor is 0. By default, the dtype of output is int32. Please also refer to Requantize -operator to understand how to scale back the int32 output to (u)int8. +operator to understand how to scale back the int32 output to (u)int8 or (u)int16. - **data**: This depends on the `layout` parameter. Input is 4D array of shape (batch_size, in_channels, height, width) if `layout` is `NCHW`. 
- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc index 9a9c60d9ea6f..1ddcde81234d 100644 --- a/src/relay/qnn/op/dequantize.cc +++ b/src/relay/qnn/op/dequantize.cc @@ -47,8 +47,8 @@ bool DequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const auto input_dtype = data->dtype; ICHECK(input_dtype == DataType::Int(8) || input_dtype == DataType::UInt(8) || - input_dtype == DataType::Int(32)) - << "Input type should be one of the quantized types [unit8, int8, int32] but was " + input_dtype == DataType::Int(16) || input_dtype == DataType::Int(32)) + << "Input type should be one of the quantized types [unit8, int8, int16, int32] but was " << input_dtype; const auto* dequantize_attrs = attrs.as(); diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index 1a4c853d8929..06a73ee91cbf 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -76,8 +76,8 @@ bool QuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const Array oshape = data->shape; const DataType out_dtype = quantize_attrs->out_dtype; ICHECK(out_dtype == DataType::Int(8) || out_dtype == DataType::UInt(8) || - out_dtype == DataType::Int(32)) - << "Output type should be one of [int8, unit8, int32] but was " << out_dtype; + out_dtype == DataType::Int(16) || out_dtype == DataType::Int(32)) + << "Output type should be one of [int8, unit8, int16, int32] but was " << out_dtype; // assign output type reporter->Assign(types[3], TensorType(oshape, out_dtype)); return true; diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index ea143fe41713..8601264f5313 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -480,8 +480,8 @@ bool RequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, } const auto in_dtype = data->dtype; ICHECK(in_dtype == DataType::Int(8) || in_dtype == 
DataType::UInt(8) || - in_dtype == DataType::Int(32)) - << "Input type should be one of [int8, uint8, int32] but was " << in_dtype; + in_dtype == DataType::Int(32) || in_dtype == DataType::Int(64)) + << "Input type should be one of [int8, uint8, int32, int64] but was " << in_dtype; const RequantizeAttrs* requantize_attrs = attrs.as(); int axis = requantize_attrs->axis; @@ -507,8 +507,8 @@ bool RequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, // assign output type auto out_dtype = requantize_attrs->out_dtype; ICHECK(out_dtype == DataType::Int(8) || out_dtype == DataType::UInt(8) || - out_dtype == DataType::Int(32)) - << "Output type should be one of [int8, uint8, int32] but was " << out_dtype; + out_dtype == DataType::Int(16) || out_dtype == DataType::Int(32)) + << "Output type should be one of [int8, uint8, int16, int32] but was " << out_dtype; reporter->Assign(types[5], TensorType(oshape, out_dtype)); return true; } diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 80cdcf327f4b..8c8ca0eab2ff 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -139,19 +139,38 @@ def vmobj_to_list(o): def _quantize_keras_model( - keras_model, representative_data_gen, is_float_input=False, is_float_output=False + keras_model, + representative_data_gen, + is_float_input=False, + is_float_output=False, + int_quant_dtype=tf.int8, ): """Utility function to quantize a Keras model using TFLite converter.""" converter = interpreter_wrapper.TFLiteConverter.from_keras_model(keras_model) - converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE] - converter.representative_dataset = representative_data_gen - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + if int_quant_dtype == tf.int8: + converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE] + converter.representative_dataset = representative_data_gen + 
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + inference_dtype = tf.uint8 + elif int_quant_dtype == tf.int16: + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_data_gen + converter.target_spec.supported_ops = [ + tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ] + inference_dtype = tf.uint16 + else: + raise RuntimeError( + f"Invalid quantized dtype {int_quant_dtype}. Supported types: int8, int16." + ) + # NOTE: If representative dataset is provided, and inference input type is not set, # then converter will self add quant & dequant Op accordingly. if not is_float_input: - converter.inference_input_type = tf.uint8 + converter.inference_input_type = inference_dtype if not is_float_output: - converter.inference_output_type = tf.uint8 + converter.inference_output_type = inference_dtype + return converter.convert() @@ -271,6 +290,7 @@ def compare_tflite_with_tvm( mode="graph_executor", experimental_new_converter=False, fp16_quantized=False, + int_quant_dtype=tf.int8, ): """Generic function to generate and compare TFLite and TVM output""" in_data = convert_to_list(in_data) @@ -287,7 +307,15 @@ def compare_tflite_with_tvm( converter = tf.lite.TFLiteConverter.from_session(sess, input_tensors, output_tensors) converter.experimental_new_converter = experimental_new_converter if quantized: - converter.inference_type = tf.lite.constants.QUANTIZED_UINT8 + if int_quant_dtype == tf.int16: + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.target_spec.supported_ops = [ + tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ] + else: + # default to int8 quantization + converter.inference_type = tf.lite.constants.QUANTIZED_UINT8 + input_arrays = converter.get_input_arrays() input_stats = {} # calculate the mean and quantization scale for every input tensor, @@ -875,7 +903,7 @@ def test_forward_l2_pool2d(): def 
_test_tflite2_quantized_convolution( - input_shape, kernel_shape, dilations, strides, padding, data_format + input_shape, kernel_shape, filters, padding="valid", data_format=None, int_quant_dtype=tf.int8 ): """One iteration of TFLite2 quantized convolution with given shapes and attributes""" data_format = "channels_last" if "NHWC" else "channels_first" @@ -884,23 +912,26 @@ def _test_tflite2_quantized_convolution( data_in = tf.keras.layers.Input(shape=data.shape[1:]) conv = tf.keras.layers.Conv2D( - filters=kernel_shape[3], + filters=filters, kernel_size=(kernel_shape[0], kernel_shape[1]), - strides=strides, + activation=tf.nn.relu, padding=padding, data_format=data_format, - activation="relu", - use_bias=False, )(data_in) keras_model = tf.keras.models.Model(data_in, conv) - keras_model.layers[1].set_weights([kernel]) # To create quantized values with dynamic range of activations, needs representative dataset def representative_data_gen(): for i in range(1): yield [data] - tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen) + tflite_model_quant = _quantize_keras_model( + keras_model, + representative_data_gen, + is_float_input=True, + is_float_output=True, + int_quant_dtype=int_quant_dtype, + ) tflite_output = run_tflite_graph(tflite_model_quant, data) tvm_output = run_tvm_graph(tflite_model_quant, data, data_in.name.replace(":0", "")) @@ -909,6 +940,25 @@ def representative_data_gen(): ) +def test_forward_quantized_convolution(): + for int_quant_dtype in [tf.int8, tf.int16]: + _test_tflite2_quantized_convolution( + (1, 28, 28, 1), + (1, 1), + 12, + data_format="NHWC", + int_quant_dtype=int_quant_dtype, + ) + + _test_tflite2_quantized_convolution( + (1, 1, 28, 28), + (1, 1), + 12, + data_format="NCWH", + int_quant_dtype=int_quant_dtype, + ) + + def _test_tflite2_quantized_depthwise_convolution( input_shape, kernel_shape, dilations, strides, padding, data_format, depth_multiplier ): @@ -1046,7 +1096,6 @@ def _test_convolution( 
quantized=quantized, input_range=input_range, experimental_new_converter=True, - fp16_quantized=fp16_quantized, ) else: data_array = np.reshape(data_array, tensor_in_sizes).astype("float32") @@ -1765,7 +1814,7 @@ def test_forward_concatenation(): # -------------- -def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6]): +def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6], int_quant_dtype=tf.int8): """One iteration of unary elemwise""" if quantized: with tf.Graph().as_default(): @@ -1787,6 +1836,7 @@ def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6]): quantized=True, input_range=input_range, experimental_new_converter=True, + int_quant_dtype=int_quant_dtype, ) else: with tf.Graph().as_default(): @@ -1795,14 +1845,20 @@ def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6]): compare_tflite_with_tvm(data, ["in:0"], [in_data], [out]) -def _unary_elewise_create_model(math_op, data, offset=0): +def _unary_elewise_create_model(math_op, data, offset=0, int_quant_dtype=tf.int8): class Model(tf.Module): @tf.function def tf_function(self, x): op = math_op(x) return op - dtype = "int8" + if int_quant_dtype in (tf.int8, tf.uint8): + dtype = "int8" + elif int_quant_dtype in (tf.int16, tf.uint16): + dtype = "int16" + else: + raise Exception(f"Unsupported dtype '{int_quant_dtype}' for unary elementwise test.") + model = Model() # Save the model @@ -1824,9 +1880,17 @@ def representative_dataset(): converter = tf.lite.TFLiteConverter.from_saved_model(export_dir) converter.optimizations = [tf.lite.Optimize.DEFAULT] converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 + + if int_quant_dtype in (tf.int16, tf.uint16): + converter.target_spec.supported_ops = [ + tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ] + else: 
+ converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + + converter.inference_input_type = int_quant_dtype + converter.inference_output_type = int_quant_dtype + tflite_model = converter.convert() return tflite_model @@ -1836,24 +1900,28 @@ def representative_dataset(): # ---- -def _test_abs(data, quantized): +def _test_abs(data, quantized, int_quant_dtype=tf.int8): """One iteration of abs""" if quantized: - tflite_model_quant = _unary_elewise_create_model(tf.math.abs, data, offset=1) + tflite_model_quant = _unary_elewise_create_model( + tf.math.abs, data, offset=1, int_quant_dtype=int_quant_dtype + ) tflite_output = run_tflite_graph(tflite_model_quant, data) # TFLite 2.6.x upgrade support if tf.__version__ < LooseVersion("2.6.1"): in_node = ["serving_default_input_int8"] else: - in_node = ["tfl.quantize"] + in_node = ( + ["serving_default_input_int16"] if int_quant_dtype == tf.int16 else ["tfl.quantize"] + ) tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) tvm.testing.assert_allclose( np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 ) else: - return _test_unary_elemwise(math_ops.abs, data, quantized) + return _test_unary_elemwise(math_ops.abs, data, quantized, int_quant_dtype=int_quant_dtype) ####################################################################### @@ -1861,14 +1929,18 @@ def _test_abs(data, quantized): # ---- -def _test_rsqrt(data, quantized): +def _test_rsqrt(data, quantized, int_quant_dtype=tf.int8): """One iteration of rsqrt""" # tensorflow version upgrade support if tf.__version__ < LooseVersion("2.6.1") or not quantized: - return _test_unary_elemwise(math_ops.rsqrt, data, quantized, quant_range=[1, 6]) + return _test_unary_elemwise( + math_ops.rsqrt, data, quantized, quant_range=[1, 6], int_quant_dtype=int_quant_dtype + ) else: - tflite_model_quant = _unary_elewise_create_model(tf.math.rsqrt, data) + tflite_model_quant = _unary_elewise_create_model( + tf.math.rsqrt, data, 
int_quant_dtype=int_quant_dtype + ) tflite_output = run_tflite_graph(tflite_model_quant, data) in_node = ["tfl.quantize"] @@ -1883,9 +1955,9 @@ def _test_rsqrt(data, quantized): # ---- -def _test_ceil(data, quantized): +def _test_ceil(data, quantized, int_quant_dtype=tf.int8): """One iteration of ceil""" - return _test_unary_elemwise(math_ops.ceil, data, quantized) + return _test_unary_elemwise(math_ops.ceil, data, quantized, int_quant_dtype=int_quant_dtype) ####################################################################### @@ -1893,9 +1965,9 @@ def _test_ceil(data, quantized): # ----- -def _test_floor(data, quantized): +def _test_floor(data, quantized, int_quant_dtype=tf.int8): """One iteration of floor""" - return _test_unary_elemwise(math_ops.floor, data, quantized) + return _test_unary_elemwise(math_ops.floor, data, quantized, int_quant_dtype=int_quant_dtype) ####################################################################### @@ -1903,9 +1975,9 @@ def _test_floor(data, quantized): # ----- -def _test_round(data, quantized): +def _test_round(data, quantized, int_quant_dtype=tf.int8): """One iteration of round""" - return _test_unary_elemwise(math_ops.round, data, quantized) + return _test_unary_elemwise(math_ops.round, data, quantized, int_quant_dtype=int_quant_dtype) ####################################################################### @@ -1913,9 +1985,9 @@ def _test_round(data, quantized): # --- -def _test_exp(data, quantized): +def _test_exp(data, quantized, int_quant_dtype=tf.int8): """One iteration of exp""" - return _test_unary_elemwise(math_ops.exp, data, quantized) + return _test_unary_elemwise(math_ops.exp, data, quantized, int_quant_dtype=int_quant_dtype) ####################################################################### @@ -1923,9 +1995,11 @@ def _test_exp(data, quantized): # --- -def _test_log(data, quantized): +def _test_log(data, quantized, int_quant_dtype=tf.int8): """One iteration of log""" - return 
_test_unary_elemwise(math_ops.log, data, quantized, quant_range=[1, 6]) + return _test_unary_elemwise( + math_ops.log, data, quantized, quant_range=[1, 6], int_quant_dtype=int_quant_dtype + ) ####################################################################### @@ -1933,9 +2007,9 @@ def _test_log(data, quantized): # --- -def _test_sin(data, quantized): +def _test_sin(data, quantized, int_quant_dtype=tf.int8): """One iteration of sin""" - return _test_unary_elemwise(math_ops.sin, data, quantized) + return _test_unary_elemwise(math_ops.sin, data, quantized, int_quant_dtype=int_quant_dtype) ####################################################################### @@ -1943,10 +2017,12 @@ def _test_sin(data, quantized): # --- -def _test_cos(data, quantized): +def _test_cos(data, quantized, int_quant_dtype=tf.int8): """One iteration of cos""" if quantized: - tflite_model_quant = _unary_elewise_create_model(tf.math.cos, data) + tflite_model_quant = _unary_elewise_create_model( + tf.math.cos, data, int_quant_dtype=int_quant_dtype + ) tflite_output = run_tflite_graph(tflite_model_quant, data) in_node = ["tfl.quantize"] tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) @@ -1962,9 +2038,9 @@ def _test_cos(data, quantized): # --- -def _test_tan(data, quantized): +def _test_tan(data, quantized, int_quant_dtype=tf.int8): """One iteration of tan""" - return _test_unary_elemwise(math_ops.tan, data, quantized) + return _test_unary_elemwise(math_ops.tan, data, quantized, int_quant_dtype=int_quant_dtype) ####################################################################### @@ -1972,9 +2048,9 @@ def _test_tan(data, quantized): # ------ -def _test_square(data, quantized): +def _test_square(data, quantized, int_quant_dtype=tf.int8): """One iteration of square""" - return _test_unary_elemwise(math_ops.square, data, quantized) + return _test_unary_elemwise(math_ops.square, data, quantized, int_quant_dtype=int_quant_dtype) 
####################################################################### @@ -1982,19 +2058,21 @@ def _test_square(data, quantized): # ------ -def _test_neg(data, quantized): +def _test_neg(data, quantized, int_quant_dtype=tf.int8): """One iteration of neg""" - return _test_unary_elemwise(math_ops.neg, data, quantized) + return _test_unary_elemwise(math_ops.neg, data, quantized, int_quant_dtype=int_quant_dtype) ####################################################################### -# Neg +# Sqrt # ------ -def _test_sqrt(data, quantized): +def _test_sqrt(data, quantized, int_quant_dtype=tf.int8): """One iteration of sqrt""" - return _test_unary_elemwise(math_ops.sqrt, data, quantized, quant_range=[1, 6]) + return _test_unary_elemwise( + math_ops.sqrt, data, quantized, quant_range=[1, 6], int_quant_dtype=int_quant_dtype + ) ####################################################################### @@ -2002,28 +2080,29 @@ def _test_sqrt(data, quantized): # --- -def _test_elu(data, quantized): +def _test_elu(data, quantized, int_quant_dtype=tf.int8): """One iteration of elu""" - return _test_unary_elemwise(nn_ops.elu, data, quantized) + return _test_unary_elemwise(nn_ops.elu, data, quantized, int_quant_dtype=int_quant_dtype) -def _test_forward_unary_elemwise(test_op, quant_dtype=None, quantized=True, negtive=True): +def _test_forward_unary_elemwise(test_op, int_quant_dtype=None, quantized=True, negative=True): # input data in_data, inq_data = [], [] + np_dtype = int_quant_dtype.as_numpy_dtype if int_quant_dtype else np.uint8 + # quantized input data if quantized: - quant_dtype = quant_dtype or np.uint8 - inq_data.append(np.arange(1, 240, 40, dtype=quant_dtype)) - inq_data.append(np.arange(1, 240, 40, dtype=quant_dtype).reshape((2, 1, 3))) - if quant_dtype == np.int8: + inq_data.append(np.arange(1, 240, 40, dtype=np_dtype)) + inq_data.append(np.arange(1, 240, 40, dtype=np_dtype).reshape((2, 1, 3))) + if int_quant_dtype == np.int8: inq_data.append(np.arange(-128, 127, 45, 
dtype=np.int8)) for data in inq_data: - test_op(data, quantized=True) + test_op(data, quantized=True, int_quant_dtype=int_quant_dtype) # normal input data - if negtive: + if negative: in_data.append(np.arange(-2.0, 4.0, dtype=np.float32)) in_data.append(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3))) else: @@ -2031,30 +2110,31 @@ def _test_forward_unary_elemwise(test_op, quant_dtype=None, quantized=True, negt in_data.append(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3))) for data in in_data: - test_op(data, quantized=False) + test_op(data, quantized=False, int_quant_dtype=int_quant_dtype) def test_all_unary_elemwise(): - _test_forward_unary_elemwise(_test_abs, quant_dtype=np.int8) + _test_forward_unary_elemwise(_test_abs, int_quant_dtype=tf.int8) + _test_forward_unary_elemwise(_test_abs, int_quant_dtype=tf.int16) _test_forward_unary_elemwise(_test_floor) _test_forward_unary_elemwise(_test_exp) - _test_forward_unary_elemwise(_test_log, negtive=False) + _test_forward_unary_elemwise(_test_log, negative=False) _test_forward_unary_elemwise(_test_square) _test_forward_unary_elemwise(_test_sin) _test_forward_unary_elemwise(_test_neg) - _test_forward_unary_elemwise(_test_sqrt, negtive=False) + _test_forward_unary_elemwise(_test_sqrt, negative=False) # tensorflow version upgrade support if tf.__version__ < LooseVersion("2.6.1"): - _test_forward_unary_elemwise(_test_rsqrt, negtive=False, quant_dtype=np.uint8) + _test_forward_unary_elemwise(_test_rsqrt, negative=False, int_quant_dtype=tf.uint8) else: - _test_forward_unary_elemwise(_test_rsqrt, negtive=False, quant_dtype=np.int8) + _test_forward_unary_elemwise(_test_rsqrt, negative=False, int_quant_dtype=tf.int8) # ceil and cos come with TFLite 1.14.0.post1 fbs schema if package_version.parse(tf.VERSION) >= package_version.parse("1.14.0"): _test_forward_unary_elemwise(_test_ceil) if tf.__version__ < LooseVersion("2.6.1"): _test_forward_unary_elemwise(_test_cos, quantized=False) else: - 
_test_forward_unary_elemwise(_test_cos, quant_dtype=np.int8) + _test_forward_unary_elemwise(_test_cos, int_quant_dtype=tf.int8) _test_forward_unary_elemwise(_test_round) # This fails with TF and Tflite 1.15.2, this could not have been tested # in CI or anywhere else. The failure mode is that we see a backtrace @@ -4572,6 +4652,47 @@ def test_forward_tflite_float16(): tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels) +def test_forward_mobilenet_int16(): + """Test int16 quantized model""" + # MobilenetV2 + model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz", + "mobilenet_v1_0.25_128_frozen.pb", + ) + + # Test image. Checking the labels because the requantize implementation is different between + # TFLite and Relay. This cause final output numbers to mismatch. So, testing accuracy via + # labels. Also, giving a real image, instead of random inputs. + # + # According to TFLite documentation, despite the quantization being done to make this model + # use int16 types, inputs and outputs are kept float32 by default. 
+ # https://www.tensorflow.org/lite/performance/post_training_integer_quant_16x8 + data = get_real_image(128, 128, quantized=False) + + converter = tf.lite.TFLiteConverter.from_frozen_graph( + model_file, ["input"], ["MobilenetV1/Predictions/Reshape_1"] + ) + + def representative_dataset(): + for _ in range(1): + yield [data] + + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.target_spec.supported_ops = [ + tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ] + converter.representative_dataset = representative_dataset + tflite_model_buf = converter.convert() + + tflite_output = run_tflite_graph(tflite_model_buf, data) + tflite_predictions = np.squeeze(tflite_output) + tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1] + tvm_output = run_tvm_graph(tflite_model_buf, data, "input") + tvm_predictions = np.squeeze(tvm_output) + tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1] + tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels) + + ####################################################################### # Quantized SSD Mobilenet # ----------------------- @@ -4867,3 +4988,5 @@ def test_prevent_tensorflow_dynamic_range(): test_forward_tflite2_qnn_mobilenet_v2() test_forward_tflite_float16() + + test_forward_tflite_int16() From 2b1e5ce8dc2a23810f47b2b89e36a61c497f5c7f Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Wed, 18 May 2022 16:37:05 +0100 Subject: [PATCH 19/59] [microNPU] Fix bug in channels extraction in the matcher (#11335) * [microNPU] Fix bug in channels extraction in the matcher If the input tensor layout is in NHCWB16, we were passing W value instead of the channels to get_valid_block_configs. 
* Add test for conv2d --- .../backend/contrib/ethosu/te/convolution.py | 4 +- .../backend/contrib/ethosu/te/depthwise.py | 7 +- .../backend/contrib/ethosu/te/pooling.py | 8 +- .../cascader/test_ethosu_conv2d_matcher.py | 80 +++++++++++++++++++ 4 files changed, 90 insertions(+), 9 deletions(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py index e309ab5a2af4..645a0d58221c 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py @@ -287,7 +287,9 @@ def match_ethosu_conv2d(output_tensor, device_config): ifm_dtype = input_tensors[0].dtype ofm_dtype = output_tensor.dtype - ifm_channels = int(input_tensors[0].shape[3]) + # Use channels from the weights tensor since that its shape doesn't change during layout + # conversion + ifm_channels = int(input_tensors[1].shape[3]) ofm_channels, kernel_height, kernel_width = (int(axis) for axis in input_tensors[1].shape[0:3]) kernel_elements = kernel_height * kernel_width diff --git a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py index 03ce0e534964..344cd64a323d 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py @@ -279,8 +279,7 @@ def match_ethosu_depthwise_conv2d(output_tensor, device_config): ifm_dtype = input_tensors[0].dtype ofm_dtype = output_tensor.dtype - ifm_channels = int(input_tensors[0].shape[3]) - ofm_channels, kernel_height, kernel_width = (int(axis) for axis in input_tensors[1].shape[0:3]) + channels, kernel_height, kernel_width = (int(axis) for axis in input_tensors[1].shape[0:3]) subkernels = len( device_config.get_kernel_steps(depthwise2d.op.name, kernel_height, kernel_width, ifm_dtype) @@ -294,8 +293,8 @@ def match_ethosu_depthwise_conv2d(output_tensor, device_config): propagators[0], depthwise2d.op.attrs, 
output_tensor.shape, - ofm_channels, - ifm_channels, + channels, + channels, output_layout, input_layout, ifm_dtype, diff --git a/python/tvm/relay/backend/contrib/ethosu/te/pooling.py b/python/tvm/relay/backend/contrib/ethosu/te/pooling.py index 8c20ea716526..ca8c2ec9b395 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/pooling.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/pooling.py @@ -239,8 +239,8 @@ def match_ethosu_pooling(output_tensor, device_config): ifm_dtype = input_tensors[0].dtype ofm_dtype = output_tensor.dtype - ifm_channels = int(input_tensors[0].shape[3]) - ofm_channels = ifm_channels + # Use channels from a stage of TE graph where the IFM is always NHWC + channels = int(pool2d.shape[3]) pool_shape_h = int(pool2d.op.attrs["pool_shape_h"]) pool_shape_w = int(pool2d.op.attrs["pool_shape_w"]) @@ -256,8 +256,8 @@ def match_ethosu_pooling(output_tensor, device_config): propagators[0], pool2d.op.attrs, output_tensor.shape, - ofm_channels, - ifm_channels, + channels, + channels, output_layout, input_layout, ifm_dtype, diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py index 17b41cbaf511..76adb0b4cbd4 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py @@ -98,5 +98,85 @@ def test_ethosu_conv2d_matcher( assert part.propagators[2].offset == scale_bias_offset +@pytest.mark.parametrize( + "ifm_layout, ofm_layout, ifm_channels, expected_cycles", + [ + ("NHWC", "NHWC", 24, 2304), + ("NHCWB16", "NHWC", 12, 2352), + ("NHWC", "NHCWB16", 38, 7056), + ("NHCWB16", "NHCWB16", 55, 4608), + ], +) +def test_ethosu_conv2d_block_config_from_matcher( + ifm_layout, ofm_layout, ifm_channels, expected_cycles +): + ofm_channels = 10 + ifm_height = 123 + ifm_width = 155 + + ifm_shape = ( + (1, ifm_height, ifm_width, ifm_channels) + if ifm_layout == 
"NHWC" + else (1, ifm_height, 1 + ((ifm_channels - 1) // 16), ifm_width, 16) + ) + weight_shape = (ofm_channels, 3, 3, ifm_channels) + scale_bias_shape = (ofm_channels, 10) + + ifm = te.placeholder(ifm_shape, dtype="int8") + weight = te.placeholder(weight_shape, dtype="int8") + scale_bias = te.placeholder(scale_bias_shape, dtype="uint8") + lut = te.placeholder((), dtype="uint8") + out = conv2d_compute( + ifm=ifm, + weight=weight, + scale_bias=scale_bias, + lut=lut, + ifm_scale=1, + ifm_zero_point=0, + ofm_scale=1, + ofm_zero_point=0, + weight_zero_point=0, + strides=(1, 1), + padding=(0, 0, 0, 0), + dilation=(1, 1), + activation="NONE", + clip_min=0, + clip_max=0, + upscale="NONE", + rounding_mode="TFL", + ifm_layout=ifm_layout, + ofm_layout=ofm_layout, + ) + + device_config = cs.EthosuDeviceConfig("ethos-u55-256") + part = match_ethosu_conv2d(out, device_config) + + ofm_shape = [int(i) for i in part.subgraph.output_tensor.shape] + + # Add inputs and outputs to the part + input_tensor = cs.Tensor(ifm_shape, "int8") + part.set_input(0, input_tensor) + weight_tensor = cs.Tensor(weight_shape, "int8") + part.set_input(1, weight_tensor) + scale_bias_tensor = cs.Tensor(scale_bias_shape, "int8") + part.set_input(2, scale_bias_tensor) + output_tensor = cs.Tensor(ofm_shape, "int8") + part.set_output(output_tensor) + + # Create a stripe of a size of the output tensor + order = [1, 2, 3, 4] if ofm_layout == "NHWC" else [1, 2, 4, 3, 0] + stripes = [1] * len(order) + offset = [0] * len(order) + + stripe_config = cs.StripeConfig(ofm_shape, ofm_shape, ofm_shape, order, stripes, offset) + + block = part.get_block_config(stripe_config) + + # Since we dont know the values of the variables we passed to the get_valid_block_configs in + # the matcher, best we can do is to verify the compute cycle count since the channels have a + # significant effect on it + assert block.compute_cycles == expected_cycles + + if __name__ == "__main__": pytest.main([__file__]) From 
fe1090e8aa6b6307f150f46ab968451765a6a079 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 18 May 2022 11:38:55 -0500 Subject: [PATCH 20/59] [TIR] IndexMap Simplification Constraints (#11342) * [TIR] Added optional arith::Analyzer argument to IndexMap methods Simplifications done when applying a transformation may require iteration bounds from the caller scope. This is a C++ only feature, because `arith::Analyzer` doesn't inherit from `ObjectRef`, and cannot be passed through the FFI. * [TIR] Pass analyzer from TransformLayoutRewriter to IndexMap Avoid needing to simplify twice, now that IndexMap can accept the analyzer from the calling scope. * [TIR] Added BlockNode handling to IRMutatorWithAnalyzer Iteration variables defined in `BlockNode::iter_vars` may be useful for simplifications. This functionality was extracted from `TransformLayoutRewriter`. --- include/tvm/tir/index_map.h | 22 ++++++++-- src/arith/ir_mutator_with_analyzer.cc | 7 ++++ src/arith/ir_mutator_with_analyzer.h | 1 + src/tir/ir/index_map.cc | 42 ++++++++++++------- .../primitive/layout_transformation.cc | 28 +++++++------ 5 files changed, 70 insertions(+), 30 deletions(-) diff --git a/include/tvm/tir/index_map.h b/include/tvm/tir/index_map.h index b6faa67ab53a..315bda259993 100644 --- a/include/tvm/tir/index_map.h +++ b/include/tvm/tir/index_map.h @@ -33,6 +33,12 @@ #include +namespace tvm { +namespace arith { +class Analyzer; +} +} // namespace tvm + namespace tvm { namespace tir { @@ -78,10 +84,14 @@ class IndexMapNode : public Object { * \param indices The indices in the input space. Should contain * one value for each variable in `initial_indices`. * + * \param analyzer An optional analyzer to be used to simplify the + * resulting expressions. If null, will use a fresh analyzer. + * * \returns The indices in the output space. Contains one value for * each expression in `final_indices`. 
*/ - Array MapIndices(const Array& indices) const; + Array MapIndices(const Array& indices, + arith::Analyzer* analyzer = nullptr) const; /*! \brief Map a memory range to the output space * @@ -93,20 +103,26 @@ class IndexMapNode : public Object { * \param ranges The ranges in the input space. Should contain one * value for each variable in `initial_indices`. * + * \param analyzer An optional analyzer to be used to simplify the + * resulting expressions. If null, will use a fresh analyzer. + * * \returns The ranges in the output space. Contains one value for * each expression in `final_indices`. */ - Array MapRanges(const Array& ranges) const; + Array MapRanges(const Array& ranges, arith::Analyzer* analyzer = nullptr) const; /*! \brief Map a buffer shape to the output space * * \param shape The buffer shape in the input space. Should contain * one value for each variable in `initial_indices`. * + * \param analyzer An optional analyzer to be used to simplify the + * resulting expressions. If null, will use a fresh analyzer. + * * \returns The buffer shape in the output space. Contains one * value for each expression in `final_indices`. */ - Array MapShape(const Array& shape) const; + Array MapShape(const Array& shape, arith::Analyzer* analyzer = nullptr) const; /*! * \brief Convert to string representation in Python. 
diff --git a/src/arith/ir_mutator_with_analyzer.cc b/src/arith/ir_mutator_with_analyzer.cc index 7bc0d946ade7..9cae3b7a6ac8 100644 --- a/src/arith/ir_mutator_with_analyzer.cc +++ b/src/arith/ir_mutator_with_analyzer.cc @@ -35,6 +35,13 @@ Stmt IRMutatorWithAnalyzer::VisitStmt_(const ForNode* op) { return StmtExprMutator::VisitStmt_(op); } +Stmt IRMutatorWithAnalyzer::VisitStmt_(const BlockNode* op) { + for (const auto& iter_var : op->iter_vars) { + analyzer_->Bind(iter_var->var, iter_var->dom); + } + return StmtExprMutator::VisitStmt_(op); +} + Stmt IRMutatorWithAnalyzer::VisitStmt_(const LetStmtNode* op) { PrimExpr value = this->VisitExpr(op->value); if (SideEffect(value) <= CallEffectKind::kPure) { diff --git a/src/arith/ir_mutator_with_analyzer.h b/src/arith/ir_mutator_with_analyzer.h index 004265bbe50a..3bd3a98a8445 100644 --- a/src/arith/ir_mutator_with_analyzer.h +++ b/src/arith/ir_mutator_with_analyzer.h @@ -50,6 +50,7 @@ class IRMutatorWithAnalyzer : public tir::StmtExprMutator { // override functions that need to populate the context information. 
tir::Stmt VisitStmt_(const tir::ForNode* op) override; + tir::Stmt VisitStmt_(const tir::BlockNode* op) override; tir::Stmt VisitStmt_(const tir::LetStmtNode* op) override; tir::Stmt VisitStmt_(const tir::IfThenElseNode* op) override; tir::Stmt VisitStmt_(const tir::AttrStmtNode* op) override; diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc index 4c0a7d3508c1..77678d829a8e 100644 --- a/src/tir/ir/index_map.cc +++ b/src/tir/ir/index_map.cc @@ -159,24 +159,29 @@ IndexMap IndexMap::Inverse(Array initial_ranges) const { return IndexMap(output_vars, inverse_exprs); } -Array IndexMapNode::MapIndices(const Array& indices) const { +Array IndexMapNode::MapIndices(const Array& indices, + arith::Analyzer* analyzer) const { ICHECK_EQ(indices.size(), initial_indices.size()); - arith::Analyzer analyzer; + Map vmap; for (size_t i = 0; i < initial_indices.size(); i++) { - analyzer.Bind(initial_indices[i], indices[i]); + vmap.Set(initial_indices[i], indices[i]); } - Array output; - for (const auto& output_dim : final_indices) { - output.push_back(analyzer.Simplify(output_dim)); + arith::Analyzer local_analyzer; + if (!analyzer) { + analyzer = &local_analyzer; } + Array output = final_indices; + output.MutateByApply( + [&](const PrimExpr& index) { return analyzer->Simplify(Substitute(index, vmap)); }); + return output; } -Array IndexMapNode::MapRanges(const Array& ranges) const { +Array IndexMapNode::MapRanges(const Array& ranges, arith::Analyzer* analyzer) const { ICHECK_EQ(ranges.size(), initial_indices.size()); Map input_iters; @@ -189,25 +194,30 @@ Array IndexMapNode::MapRanges(const Array& ranges) const { dom_map[initial_indices[i].get()] = arith::IntSet::FromRange(ranges[i]); } + arith::Analyzer local_analyzer; + if (!analyzer) { + analyzer = &local_analyzer; + } + Array output; - arith::Analyzer analyzer; for (const auto& final_index : final_indices) { auto int_set = arith::EvalSet(final_index, dom_map); - 
output.push_back(Range::FromMinExtent(analyzer.Simplify(int_set.min()), - analyzer.Simplify(int_set.max() - int_set.min() + 1))); + output.push_back(Range::FromMinExtent(analyzer->Simplify(int_set.min()), + analyzer->Simplify(int_set.max() - int_set.min() + 1))); } return output; } -Array IndexMapNode::MapShape(const Array& shape) const { +Array IndexMapNode::MapShape(const Array& shape, + arith::Analyzer* analyzer) const { ICHECK_EQ(shape.size(), initial_indices.size()); Array ranges; for (auto& dim : shape) { ranges.push_back(Range(0, dim)); } - Array mapped = MapRanges(std::move(ranges)); + Array mapped = MapRanges(std::move(ranges), analyzer); Array output; for (auto& range : mapped) { @@ -265,8 +275,12 @@ TVM_REGISTER_GLOBAL("tir.IndexMap") return IndexMap(initial_indices, final_indices); }); -TVM_REGISTER_GLOBAL("tir.IndexMapMapIndices").set_body_method(&IndexMapNode::MapIndices); -TVM_REGISTER_GLOBAL("tir.IndexMapMapShape").set_body_method(&IndexMapNode::MapShape); +TVM_REGISTER_GLOBAL("tir.IndexMapMapIndices") + .set_body_typed([](IndexMap map, Array indices) { return map->MapIndices(indices); }); + +TVM_REGISTER_GLOBAL("tir.IndexMapMapShape").set_body_typed([](IndexMap map, Array shape) { + return map->MapShape(shape); +}); TVM_REGISTER_GLOBAL("tir.IndexMapInverse").set_body_method(&IndexMap::Inverse); TVM_REGISTER_GLOBAL("tir.IndexMapNonSurjectiveInverse") diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc index 87e09505f502..fb63b1b289b1 100644 --- a/src/tir/schedule/primitive/layout_transformation.cc +++ b/src/tir/schedule/primitive/layout_transformation.cc @@ -16,12 +16,13 @@ * specific language governing permissions and limitations * under the License. 
*/ +#include "../../../arith/ir_mutator_with_analyzer.h" #include "../utils.h" namespace tvm { namespace tir { -class TransformLayoutRewriter : private StmtExprMutator { +class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer { public: /*! * \brief Rewrite the access to the buffer after the transformation @@ -36,27 +37,32 @@ class TransformLayoutRewriter : private StmtExprMutator { const Buffer& old_buffer, const Buffer& new_buffer, const IndexMap& index_map) { - TransformLayoutRewriter rewriter(old_buffer, new_buffer, index_map); + arith::Analyzer analyzer; + TransformLayoutRewriter rewriter(old_buffer, new_buffer, index_map, &analyzer); Stmt result = rewriter(scope_stmt); return {result, rewriter.block_sref_reuse_}; } private: TransformLayoutRewriter(const Buffer& old_buffer, const Buffer& new_buffer, - const IndexMap& index_map) - : old_buffer_(old_buffer), + const IndexMap& index_map, arith::Analyzer* analyzer) + : IRMutatorWithAnalyzer(analyzer), + old_buffer_(old_buffer), new_buffer_(new_buffer), index_map_(index_map), buffer_data_to_buffer_{{new_buffer->data, new_buffer}} {} void RewriteBufferAccess(Buffer* buffer, Array* indices) { *buffer = new_buffer_; - *indices = index_map_->MapIndices(*indices); - (*indices).MutateByApply([this](const PrimExpr& index) { return analyzer_.Simplify(index); }); + *indices = index_map_->MapIndices(*indices, analyzer_); } + using Parent = arith::IRMutatorWithAnalyzer; + using Parent::VisitExpr_; + using Parent::VisitStmt_; + PrimExpr VisitExpr_(const BufferLoadNode* op) final { - BufferLoad buffer_load = Downcast(StmtExprMutator::VisitExpr_(op)); + BufferLoad buffer_load = Downcast(Parent::VisitExpr_(op)); if (buffer_load->buffer.same_as(old_buffer_)) { auto* n = buffer_load.CopyOnWrite(); RewriteBufferAccess(&n->buffer, &n->indices); @@ -65,7 +71,7 @@ class TransformLayoutRewriter : private StmtExprMutator { } Stmt VisitStmt_(const BufferStoreNode* op) final { - BufferStore buffer_store = 
Downcast(StmtExprMutator::VisitStmt_(op)); + BufferStore buffer_store = Downcast(Parent::VisitStmt_(op)); if (buffer_store->buffer.same_as(old_buffer_)) { auto* n = buffer_store.CopyOnWrite(); RewriteBufferAccess(&n->buffer, &n->indices); @@ -86,10 +92,7 @@ class TransformLayoutRewriter : private StmtExprMutator { } Stmt VisitStmt_(const BlockNode* op) final { - for (const auto& iter_var : op->iter_vars) { - analyzer_.Bind(iter_var->var, iter_var->dom); - } - Block block = Downcast(StmtExprMutator::VisitStmt_(op)); + Block block = Downcast(Parent::VisitStmt_(op)); auto infered_access_regions = GetBlockReadWriteRegion(block, buffer_data_to_buffer_); auto* n = block.CopyOnWrite(); RewriteAccessRegion(&n->reads, infered_access_regions[0]); @@ -101,7 +104,6 @@ class TransformLayoutRewriter : private StmtExprMutator { const Buffer& old_buffer_; const Buffer& new_buffer_; const IndexMap& index_map_; - arith::Analyzer analyzer_; Map buffer_data_to_buffer_; Map block_sref_reuse_; }; From 95509eed2650d58463c7b1d89c969bd17770864f Mon Sep 17 00:00:00 2001 From: ah cheng Date: Thu, 19 May 2022 01:10:00 +0800 Subject: [PATCH 21/59] fix matmul broadcast (#11242) --- python/tvm/relay/frontend/onnx.py | 38 +++++++++++++++------- tests/python/frontend/onnx/test_forward.py | 1 + 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index e68daca4c4f0..1294852ba197 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -259,23 +259,39 @@ def flatten_to_nd(x, x_shape, nd=3): return out # Determine the output batch dimension. 
+ new_a_shape = a_shape + new_b_shape = b_shape if a_rank > b_rank: - out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2]) + rank_diff = a_rank - b_rank + new_b_shape = _op.concatenate( + [ + _expr.const([1] * rank_diff, dtype=infer_type(b_shape).checked_type.dtype), + b_shape, + ], + 0, + ) elif a_rank < b_rank: - out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2]) - # If its unclear how broadcasting should be applied, the output - # shape is determined by choosing the maximum value from each input. - else: - out_batch = _op.concatenate( + rank_diff = b_rank - a_rank + new_a_shape = _op.concatenate( [ - _op.maximum( - _op.strided_slice(a_shape, [i], [i + 1]), - _op.strided_slice(b_shape, [i], [i + 1]), - ) - for i in range(a_rank - 2) + _expr.const([1] * rank_diff, dtype=infer_type(a_shape).checked_type.dtype), + a_shape, ], 0, ) + else: + pass + + out_batch = _op.concatenate( + [ + _op.maximum( + _op.strided_slice(new_b_shape, [i], [i + 1]), + _op.strided_slice(new_a_shape, [i], [i + 1]), + ) + for i in range(max(a_rank, b_rank) - 2) + ], + 0, + ) b_type = infer_type(inputs[1]) # Convert to dense if the second matrix is 2d and non-dynamic diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 643dfe820b91..6fac7f2f20aa 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -1286,6 +1286,7 @@ def verify_batch_matmul(a_shape, b_shape, out_shape, convert_config=None): verify_batch_matmul((4, 32, 16), (16, 32), (4, 32, 32)) verify_batch_matmul((4, 32, 16, 32), (32, 16), (4, 32, 16, 16)) verify_batch_matmul((4, 32, 16, 32), (1, 32, 32, 16), (4, 32, 16, 16)) + verify_batch_matmul((4, 1, 16, 32), (1, 32, 32, 16), (4, 32, 16, 16)) # Test transb=False verify_batch_matmul( (2, 3, 4, 3), From f34bd22ddc4e7064eabe9fac42c4c04f54ede399 Mon Sep 17 00:00:00 2001 From: A1245967 Date: Thu, 19 May 2022 02:58:35 +0800 Subject: [PATCH 22/59] Fix function number datatype 
from char to uint16_t (#10014) rewrite the modified part to pass lint check Use 2 bytes for func num in fun_registry Fix errors in linter Add the declaration of the helper functions set 2 bytes for func num in func_registry test units pass num_func by value This commit change the datatype of the number of the function from 1 Byte to 2 Bytes. Besides, I use some helper functions to access the number of function and the first function name. --- include/tvm/runtime/crt/func_registry.h | 27 ++++++++++++++++- src/runtime/crt/common/func_registry.c | 39 ++++++++++++++++++------- src/target/func_registry_generator.cc | 8 ++++- tests/crt/func_registry_test.cc | 7 +++-- 4 files changed, 66 insertions(+), 15 deletions(-) diff --git a/include/tvm/runtime/crt/func_registry.h b/include/tvm/runtime/crt/func_registry.h index 4f8a19af591e..50737f871798 100644 --- a/include/tvm/runtime/crt/func_registry.h +++ b/include/tvm/runtime/crt/func_registry.h @@ -42,7 +42,7 @@ typedef struct TVMFuncRegistry { /*! \brief Names of registered functions, concatenated together and separated by \0. * An additional \0 is present at the end of the concatenated blob to mark the end. * - * Byte 0 is the number of functions in `funcs`. + * Byte 0 and 1 are the number of functions in `funcs`. */ const char* names; @@ -50,6 +50,31 @@ typedef struct TVMFuncRegistry { const TVMBackendPackedCFunc* funcs; } TVMFuncRegistry; +/*! + * \brief Get the of the number of functions from registry. + * + * \param reg TVMFunctionRegistry instance that contains the function. + * \return The number of functions from registry. + */ +uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg); + +/*! + * \brief Set the number of functions to registry. + * + * \param reg TVMFunctionRegistry instance that contains the function. + * \param num_funcs The number of functions + * \return 0 when successful. + */ +int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs); + +/*! 
+ * \brief Get the address of 0th function from registry. + * + * \param reg TVMFunctionRegistry instance that contains the function. + * \return the address of 0th function from registry + */ +const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg); + /*! * \brief Get packed function from registry by name. * diff --git a/src/runtime/crt/common/func_registry.c b/src/runtime/crt/common/func_registry.c index 116a5c496f1b..49cef8fd70eb 100644 --- a/src/runtime/crt/common/func_registry.c +++ b/src/runtime/crt/common/func_registry.c @@ -60,14 +60,29 @@ int strcmp_cursor(const char** cursor, const char* name) { return return_value; } +uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg) { + uint16_t num_funcs; + memcpy(&num_funcs, reg->names, sizeof(num_funcs)); + return num_funcs; +} + +int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs) { + memcpy((char*)reg->names, &num_funcs, sizeof(num_funcs)); + return 0; +} + +const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg) { + // NOTE: first function name starts at index 2 to skip num_funcs. + return (reg->names + sizeof(uint16_t)); +} + tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* name, tvm_function_index_t* function_index) { tvm_function_index_t idx; - const char* reg_name_ptr; + const char* reg_name_ptr = TVMFuncRegistry_Get0thFunctionName(reg); idx = 0; - // NOTE: reg_name_ptr starts at index 1 to skip num_funcs. 
- for (reg_name_ptr = reg->names + 1; *reg_name_ptr != '\0'; reg_name_ptr++) { + for (; *reg_name_ptr != '\0'; reg_name_ptr++) { if (!strcmp_cursor(&reg_name_ptr, name)) { *function_index = idx; return kTvmErrorNoError; @@ -82,9 +97,9 @@ tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* n tvm_crt_error_t TVMFuncRegistry_GetByIndex(const TVMFuncRegistry* reg, tvm_function_index_t function_index, TVMBackendPackedCFunc* out_func) { - uint8_t num_funcs; + uint16_t num_funcs; - num_funcs = reg->names[0]; + num_funcs = TVMFuncRegistry_GetNumFuncs(reg); if (function_index >= num_funcs) { return kTvmErrorFunctionIndexInvalid; } @@ -101,7 +116,8 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8 reg->registry.names = (const char*)buffer; buffer[0] = 0; // number of functions present in buffer. - buffer[1] = 0; // end of names list marker. + buffer[1] = 0; // note that we combine the first two elements to form a 16-bit function index. + buffer[2] = 0; // end of names list marker. // compute a guess of the average size of one entry: // - assume average function name is around ~10 bytes @@ -117,13 +133,12 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8 tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const char* name, TVMBackendPackedCFunc func, int override) { size_t idx; - char* reg_name_ptr; + char* reg_name_ptr = (char*)TVMFuncRegistry_Get0thFunctionName(&(reg->registry)); idx = 0; // NOTE: safe to discard const qualifier here, since reg->registry.names was set from // TVMMutableFuncRegistry_Create above. - // NOTE: reg_name_ptr starts at index 1 to skip num_funcs. 
- for (reg_name_ptr = (char*)reg->registry.names + 1; *reg_name_ptr != 0; reg_name_ptr++) { + for (; *reg_name_ptr != 0; reg_name_ptr++) { if (!strcmp_cursor((const char**)&reg_name_ptr, name)) { if (override == 0) { return kTvmErrorFunctionAlreadyDefined; @@ -149,7 +164,11 @@ tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const ch reg_name_ptr += name_len + 1; *reg_name_ptr = 0; ((TVMBackendPackedCFunc*)reg->registry.funcs)[idx] = func; - ((char*)reg->registry.names)[0]++; // increment num_funcs. + + uint16_t num_funcs; + // increment num_funcs. + num_funcs = TVMFuncRegistry_GetNumFuncs(&(reg->registry)) + 1; + TVMFuncRegistry_SetNumFuncs(&(reg->registry), num_funcs); return kTvmErrorNoError; } diff --git a/src/target/func_registry_generator.cc b/src/target/func_registry_generator.cc index 7c948d50cbb9..d679bf379b62 100644 --- a/src/target/func_registry_generator.cc +++ b/src/target/func_registry_generator.cc @@ -31,7 +31,13 @@ namespace target { std::string GenerateFuncRegistryNames(const Array& function_names) { std::stringstream ss; - ss << (unsigned char)(function_names.size()); + + unsigned char function_nums[sizeof(uint16_t)]; + *reinterpret_cast(function_nums) = function_names.size(); + for (auto f : function_nums) { + ss << f; + } + for (auto f : function_names) { ss << f << '\0'; } diff --git a/tests/crt/func_registry_test.cc b/tests/crt/func_registry_test.cc index 9f0e7f8d1a5a..5962a3acee39 100644 --- a/tests/crt/func_registry_test.cc +++ b/tests/crt/func_registry_test.cc @@ -82,7 +82,7 @@ TEST(StrCmpScan, Test) { } TEST(FuncRegistry, Empty) { - TVMFuncRegistry registry{"\000", NULL}; + TVMFuncRegistry registry{"\000\000", NULL}; EXPECT_EQ(kTvmErrorFunctionNameNotFound, TVMFuncRegistry_Lookup(&registry, "foo", NULL)); EXPECT_EQ(kTvmErrorFunctionIndexInvalid, @@ -101,7 +101,7 @@ static int Bar(TVMValue* args, int* type_codes, int num_args, TVMValue* out_ret_ } // Matches the style of registry defined in generated C modules. 
-const char* kBasicFuncNames = "\002Foo\0Bar\0"; // NOTE: final \0 +const char* kBasicFuncNames = "\002\000Foo\0Bar\0"; // NOTE: final \0 const TVMBackendPackedCFunc funcs[2] = {&Foo, &Bar}; const TVMFuncRegistry kConstRegistry = {kBasicFuncNames, (const TVMBackendPackedCFunc*)funcs}; @@ -111,7 +111,8 @@ TEST(FuncRegistry, ConstGlobalRegistry) { // Foo EXPECT_EQ(kBasicFuncNames[0], 2); - EXPECT_EQ(kBasicFuncNames[1], 'F'); + EXPECT_EQ(kBasicFuncNames[1], 0); + EXPECT_EQ(kBasicFuncNames[2], 'F'); EXPECT_EQ(kTvmErrorNoError, TVMFuncRegistry_Lookup(&kConstRegistry, "Foo", &func_index)); EXPECT_EQ(0, func_index); From dd3262fa0438182944794f87ee7dbe8768c89269 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 18 May 2022 12:17:46 -0700 Subject: [PATCH 23/59] [ci][docker] Conditionally link sccache to clang (#11316) This was causing errors with #11314 since it was making it appear as if `clang` was available when it was only the sccache wrapper. Co-authored-by: driazati --- docker/install/ubuntu_install_sccache.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docker/install/ubuntu_install_sccache.sh b/docker/install/ubuntu_install_sccache.sh index dff7d977860b..5ef78643a741 100755 --- a/docker/install/ubuntu_install_sccache.sh +++ b/docker/install/ubuntu_install_sccache.sh @@ -26,8 +26,14 @@ cargo install sccache mkdir /opt/sccache ln "$(which sccache)" /opt/sccache/cc ln "$(which sccache)" /opt/sccache/c++ -ln "$(which sccache)" /opt/sccache/clang -ln "$(which sccache)" /opt/sccache/clang++ + +# Only add clang if it's on the PATH +if command -v clang &> /dev/null +then + ln "$(which sccache)" /opt/sccache/clang + ln "$(which sccache)" /opt/sccache/clang++ +fi + # make rust usable by all users after install during container build chmod -R a+rw /opt/rust From 3fbd9b66b745eb59021c265a9708b6ac08f700d0 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 18 May 2022 14:19:43 -0500 Subject: [PATCH 
24/59] [CI] Added message if test is running on another shard (#11331) --- conftest.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index 9768b6cc528d..3c04f0680a11 100644 --- a/conftest.py +++ b/conftest.py @@ -58,9 +58,9 @@ } -def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool: +def find_shard_index(nodeid: str, num_shards: int) -> int: """ - Return true if this test should run on this shard + Return the index of the shard that should run this test """ for prefix, target_shard_idx in FIXED_ALLOCATION_PREFIXES.items(): if nodeid.startswith(prefix): @@ -68,7 +68,7 @@ def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool: raise RuntimeError( f"Cannot collect sharded tests, {nodeid} has hardcoded shard index {target_shard_idx} among only {num_shards} shards" ) - return target_shard_idx == shard_index + return target_shard_idx if nodeid in HARDCODED_ALLOCATIONS: hash = HARDCODED_ALLOCATIONS[nodeid] @@ -76,7 +76,7 @@ def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool: hash = hashlib.md5(nodeid.encode()) hash = int(hash.hexdigest(), 16) - return hash % num_shards == shard_index + return hash % num_shards def pytest_collection_modifyitems(config, items): @@ -89,5 +89,10 @@ def pytest_collection_modifyitems(config, items): print(f"Marking tests for shard {shard_index} of {num_shards}") for item in items: - if not should_run(item.nodeid, num_shards=num_shards, shard_index=shard_index): - item.add_marker(pytest.mark.skip()) + item_shard_index = find_shard_index(item.nodeid, num_shards=num_shards) + item.add_marker( + pytest.mark.skipif( + item_shard_index != shard_index, + reason=f"Test running on shard {item_shard_index} of {num_shards}", + ) + ) From fb0938a5410ad91594cb4b56fdb5a84845197cb0 Mon Sep 17 00:00:00 2001 From: Youlei Yang Date: Thu, 19 May 2022 03:23:42 +0800 Subject: [PATCH 25/59] [CI] update oneDNN to v2.6 (#11140) * enable CI to get and 
build latest oneDNN release * remove the source code after installed * fix wget error and improve naming * refine the cmake/make commands Co-authored-by: driazati <9407960+driazati@users.noreply.github.com> * pinned to v2.6 by default * simplify the logic and install to /usr/lib Co-authored-by: driazati <9407960+driazati@users.noreply.github.com> --- docker/install/ubuntu_install_dnnl.sh | 33 +++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/docker/install/ubuntu_install_dnnl.sh b/docker/install/ubuntu_install_dnnl.sh index 34f917ce6606..3654d140f55b 100755 --- a/docker/install/ubuntu_install_dnnl.sh +++ b/docker/install/ubuntu_install_dnnl.sh @@ -20,10 +20,29 @@ set -e set -u set -o pipefail -cd /usr/local/ -wget -q https://github.com/oneapi-src/oneDNN/releases/download/v2.2/dnnl_lnx_2.2.0_cpu_gomp.tgz -tar -xzf dnnl_lnx_2.2.0_cpu_gomp.tgz -mv dnnl_lnx_2.2.0_cpu_gomp/include/* /usr/local/include/ -mv dnnl_lnx_2.2.0_cpu_gomp/lib/libdnnl* /usr/local/lib/ -rm -rf dnnl_lnx_2.2.0_cpu_gomp.tgz dnnl_lnx_2.2.0_cpu_gomp -cd - +pre_dir=`pwd` +tmpdir=$(mktemp -d) + +rls_tag="v2.6" + +dnnl_ver=`echo ${rls_tag} | sed 's/v//g'` +echo "Using oneDNN release version ${dnnl_ver} with tag '${rls_tag}'" + +archive_name="${rls_tag}.tar.gz" +archive_url="https://github.com/oneapi-src/oneDNN/archive/refs/tags/${archive_name}" +archive_folder="${tmpdir}/oneDNN-${dnnl_ver}" +archive_hash="4cb7b80bfe16920bc096e18e7d8caa56b9ab7a4dab2a091a230bcf562c09533392f4a4ccd4db22754a10293670efdea20382db0994dc47949005a4c77f14b64c" + +cd "${tmpdir}" + +curl -sL "${archive_url}" -o "${archive_name}" +echo "$archive_hash" ${archive_name} | sha512sum -c +tar xf "${archive_name}" + +cd "${archive_folder}" +cmake . 
-DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_LIBDIR=lib +make -j"$(nproc)" +make install + +cd ${pre_dir} +rm -rf "${tmpdir}" From 89a439ed4c8c392f0f144bef325aed64889e91a4 Mon Sep 17 00:00:00 2001 From: Adam Straw Date: Wed, 18 May 2022 13:05:23 -0700 Subject: [PATCH 26/59] [Hexagon] Add unit tests for Hexagon Device API (#11319) * [Hexagon] Add unit tests for Hexagon Device API * add scalar alloc for Hexagon + cleanup --- docker/Dockerfile.ci_hexagon | 1 - src/runtime/hexagon/hexagon_device_api.cc | 34 ++-- src/runtime/hexagon/hexagon_device_api.h | 10 ++ .../hexagon/hexagon_device_api_tests.cc | 148 ++++++++++++++++++ .../test_hexagon/test_run_unit_tests.py | 6 +- tests/scripts/task_build_hexagon_api.sh | 5 +- tests/scripts/task_python_hexagon.sh | 3 - 7 files changed, 177 insertions(+), 30 deletions(-) create mode 100644 tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon index 20b185ab6456..ddca5c6c2e66 100644 --- a/docker/Dockerfile.ci_hexagon +++ b/docker/Dockerfile.ci_hexagon @@ -63,7 +63,6 @@ ENV CLANG_LLVM_HOME /opt/clang-llvm ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/opt/clang-llvm/lib ENV PATH /opt/clang-llvm/bin:$PATH ENV HEXAGON_TOOLCHAIN "${HEXAGON_SDK_ROOT}/tools/HEXAGON_Tools/8.5.08/Tools" -ENV HEXAGON_GTEST "${HEXAGON_SDK_ROOT}/utils/googletest/gtest" # sccache COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index db3ef3faa4f7..c9c1586008e3 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -55,10 +55,15 @@ void HexagonDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) // DataSpace: static allocations for Hexagon void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) { + CHECK(shape) << "shape array is null"; + 
CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type; + if (!mem_scope.defined() || mem_scope.value() == "global") { return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope); } + // must be Hexagon device and VTCM scope after this point + CHECK_EQ(mem_scope.value(), "global.vtcm"); CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type; size_t typesize = (dtype.bits / 8) * dtype.lanes; @@ -68,7 +73,9 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap alignment = kHexagonAllocAlignment; } - if (ndim == 1) { + if (ndim == 0) { + return AllocateHexagonBuffer(typesize, alignment, mem_scope); + } else if (ndim == 1) { size_t nbytes = shape[0] * typesize; return AllocateHexagonBuffer(nbytes, alignment, mem_scope); } else if (ndim == 2) { @@ -84,10 +91,9 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) { - // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU; - bool is_valid_device = (TVMDeviceExtType(dev.device_type) == kDLHexagon) || - (DLDeviceType(dev.device_type) == kDLCPU); - CHECK(is_valid_device) << "dev.device_type: " << dev.device_type; + CHECK(nbytes) << "number of bytes is zero"; + CHECK(alignment) << "alignment is zero"; + CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type; if (alignment < kHexagonAllocAlignment) { alignment = kHexagonAllocAlignment; } @@ -95,10 +101,8 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme } void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) { - // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU; - bool is_valid_device = (TVMDeviceExtType(dev.device_type) == kDLHexagon) || - (DLDeviceType(dev.device_type) == kDLCPU); - CHECK(is_valid_device) << 
"dev.device_type: " << dev.device_type; + CHECK(ptr) << "buffer pointer is null"; + CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type; FreeHexagonBuffer(ptr); } @@ -109,18 +113,12 @@ struct HexagonWorkspacePool : public WorkspacePool { }; void* HexagonDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { - // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU; - bool is_valid_device = (TVMDeviceExtType(dev.device_type) == kDLHexagon) || - (DLDeviceType(dev.device_type) == kDLCPU); - CHECK(is_valid_device) << "dev.device_type: " << dev.device_type; + CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type; return dmlc::ThreadLocalStore::Get()->AllocWorkspace(dev, size); } void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) { - // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU; - bool is_valid_device = (TVMDeviceExtType(dev.device_type) == kDLHexagon) || - (DLDeviceType(dev.device_type) == kDLCPU); - CHECK(is_valid_device) << "dev.device_type: " << dev.device_type; + CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type; CHECK(hexagon_buffer_map_.count(data) != 0) << "Attempt made to free unknown or already freed workspace allocation"; dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); @@ -128,12 +126,14 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) { void* HexagonDeviceAPI::AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) { + // must be Hexagon device (not CPU) CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type; CHECK((ndim == 1 || ndim == 2) && "Hexagon Device API supports only 1d and 2d allocations"); return AllocDataSpace(dev, ndim, shape, dtype, mem_scope); } void HexagonDeviceAPI::FreeVtcmWorkspace(Device dev, void* ptr) { + // must be Hexagon device (not CPU) 
CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type; FreeDataSpace(dev, ptr); } diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h index cc71adfb7794..6f65bf402757 100644 --- a/src/runtime/hexagon/hexagon_device_api.h +++ b/src/runtime/hexagon/hexagon_device_api.h @@ -138,6 +138,16 @@ class HexagonDeviceAPI final : public DeviceAPI { hexagon_buffer_map_.insert({ptr, std::move(buf)}); return ptr; } + + /*! \brief Helper to check if the device type is valid for the Hexagon Device API + * \return Boolean indicating whether the device type is valid + */ + bool IsValidDevice(DLDevice dev) { + // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU + return (TVMDeviceExtType(dev.device_type) == kDLHexagon) || + (DLDeviceType(dev.device_type) == kDLCPU); + } + /*! \brief Helper to free a HexagonBuffer and unregister the result * from the owned buffer map. */ diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc new file mode 100644 index 000000000000..fbcee37cb154 --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "../src/runtime/hexagon/hexagon_device_api.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::hexagon; + +class HexagonDeviceAPITest : public ::testing::Test { + protected: + void SetUp() override { + hexapi = HexagonDeviceAPI::Global(); + cpu_dev.device_type = DLDeviceType(kDLCPU); + hex_dev.device_type = DLDeviceType(kDLHexagon); + invalid_dev.device_type = DLDeviceType(kDLExtDev); + int8.bits = 8; + int8.code = 0; + int8.lanes = 1; + } + DLDevice cpu_dev; + DLDevice hex_dev; + DLDevice invalid_dev; + DLDataType int8; + HexagonDeviceAPI* hexapi; + size_t nbytes{256}; + size_t alignment{64}; + int64_t shape1d[1]{256}; + int64_t shape2d[2]{256, 256}; + int64_t shape3d[3]{256, 256, 256}; + Optional default_scope; + Optional invalid_scope{"invalid"}; + Optional global_scope{"global"}; + Optional global_vtcm_scope{"global.vtcm"}; +}; + +TEST_F(HexagonDeviceAPITest, global) { CHECK(hexapi != nullptr); } + +TEST_F(HexagonDeviceAPITest, alloc_free_cpu) { + void* buf = hexapi->AllocDataSpace(cpu_dev, nbytes, alignment, int8); + CHECK(buf != nullptr); + hexapi->FreeDataSpace(cpu_dev, buf); +} + +TEST_F(HexagonDeviceAPITest, alloc_free_hex) { + void* buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8); + CHECK(buf != nullptr); + hexapi->FreeDataSpace(hex_dev, buf); +} + +TEST_F(HexagonDeviceAPITest, alloc_errors) { + // invalid device + EXPECT_THROW(hexapi->AllocDataSpace(invalid_dev, nbytes, alignment, int8), InternalError); + // 0 size + EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, 0, alignment, int8), InternalError); + // 0 alignment + EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, nbytes, 0, int8), InternalError); +} + +TEST_F(HexagonDeviceAPITest, free_errors) { + void* buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8); + + // invalid device + EXPECT_THROW(hexapi->FreeDataSpace(invalid_dev, buf), 
InternalError); + // invalid pointer + EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, &buf), InternalError); + // nullptr + EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, nullptr), InternalError); + // double free + hexapi->FreeDataSpace(hex_dev, buf); + EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, buf), InternalError); +} + +TEST_F(HexagonDeviceAPITest, allocnd_free_cpu) { + void* buf = hexapi->AllocDataSpace(cpu_dev, 3, shape3d, int8, global_scope); + CHECK(buf != nullptr); + hexapi->FreeDataSpace(cpu_dev, buf); +} + +TEST_F(HexagonDeviceAPITest, allocnd_free_hex) { + void* buf = hexapi->AllocDataSpace(hex_dev, 3, shape3d, int8, global_scope); + CHECK(buf != nullptr); + hexapi->FreeDataSpace(hex_dev, buf); +} + +TEST_F(HexagonDeviceAPITest, allocnd_free_hex_vtcm) { + void* buf1d = hexapi->AllocDataSpace(hex_dev, 1, shape1d, int8, global_vtcm_scope); + CHECK(buf1d != nullptr); + hexapi->FreeDataSpace(hex_dev, buf1d); + + void* buf2d = hexapi->AllocDataSpace(hex_dev, 2, shape2d, int8, global_vtcm_scope); + CHECK(buf2d != nullptr); + hexapi->FreeDataSpace(hex_dev, buf2d); +} + +TEST_F(HexagonDeviceAPITest, allocnd_erros) { + // invalid device + EXPECT_THROW(hexapi->AllocDataSpace(invalid_dev, 2, shape2d, int8, global_vtcm_scope), + InternalError); + + // Hexagon VTCM allocations must have 0 (scalar) 1 or 2 dimensions + EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, 3, shape3d, int8, global_vtcm_scope), InternalError); + + // null shape + EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, 2, nullptr, int8, global_vtcm_scope), InternalError); + + // null shape + EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, 2, shape2d, int8, invalid_scope), InternalError); + + // cpu & global.vtcm scope + EXPECT_THROW(hexapi->AllocDataSpace(cpu_dev, 2, shape2d, int8, global_vtcm_scope), InternalError); +} + +TEST_F(HexagonDeviceAPITest, alloc_scalar) { + void* cpuscalar = hexapi->AllocDataSpace(cpu_dev, 0, new int64_t, int8, global_scope); + CHECK(cpuscalar != nullptr); + + void* hexscalar = 
hexapi->AllocDataSpace(hex_dev, 0, new int64_t, int8, global_vtcm_scope); + CHECK(hexscalar != nullptr); +} + +// alloc and free of the same buffer on different devices should throw +// but it currently works with no error +// hexagon and cpu device types may merge long term which would make this test case moot +// disabling this test case, for now +// TODO(HWE): Re-enable or delete this test case once we land on device type strategy +TEST_F(HexagonDeviceAPITest, DISABLED_alloc_free_diff_dev) { + void* buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8); + CHECK(buf != nullptr); + EXPECT_THROW(hexapi->FreeDataSpace(cpu_dev, buf), InternalError); +} diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py index 010c79b8f554..6a60b8fa81b9 100644 --- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py +++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py @@ -28,16 +28,12 @@ # for example to run all "foo" tests twice and observe gtest output run # pytest -sv --gtests_args="--gtest_filter=*foo* --gtest_repeat=2" @tvm.testing.requires_hexagon -@pytest.mark.skipif( - os.environ.get("HEXAGON_GTEST") == None, - reason="Test requires environment variable HEXAGON_GTEST set with a path to a Hexagon gtest version normally located at /path/to/hexagon/sdk/utils/googletest/gtest", -) def test_run_unit_tests(hexagon_session: Session, gtest_args): try: func = hexagon_session._rpc.get_function("hexagon.run_unit_tests") except: print( - "Test requires TVM Runtime to be built with a Hexagon gtest version using Hexagon API cmake flag -DUSE_HEXAGON_GTEST=${HEXAGON_GTEST}" + "This test requires TVM Runtime to be built with a Hexagon gtest version using Hexagon API cmake flag -DUSE_HEXAGON_GTEST=/path/to/hexagon/sdk/utils/googletest/gtest" ) raise diff --git a/tests/scripts/task_build_hexagon_api.sh b/tests/scripts/task_build_hexagon_api.sh index c5d05eaad80c..a3b501d9c554 100755 --- 
a/tests/scripts/task_build_hexagon_api.sh +++ b/tests/scripts/task_build_hexagon_api.sh @@ -37,9 +37,6 @@ cd build output_binary_directory=$(realpath ${PWD}/../../../build/hexagon_api_output) rm -rf ${output_binary_directory} -# should be removed after Hexagon Docker update -export HEXAGON_GTEST="${HEXAGON_SDK_PATH}/utils/googletest/gtest" - cmake -DANDROID_ABI=arm64-v8a \ -DANDROID_PLATFORM=android-28 \ -DUSE_ANDROID_TOOLCHAIN="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \ @@ -47,6 +44,6 @@ cmake -DANDROID_ABI=arm64-v8a \ -DUSE_HEXAGON_SDK="${HEXAGON_SDK_PATH}" \ -DUSE_HEXAGON_TOOLCHAIN="${HEXAGON_TOOLCHAIN}" \ -DUSE_OUTPUT_BINARY_DIR="${output_binary_directory}" \ - -DUSE_HEXAGON_GTEST="${HEXAGON_GTEST}" .. + -DUSE_HEXAGON_GTEST="${HEXAGON_SDK_PATH}/utils/googletest/gtest" .. make -j$(nproc) diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh index b639ac02a695..274b348f0935 100755 --- a/tests/scripts/task_python_hexagon.sh +++ b/tests/scripts/task_python_hexagon.sh @@ -43,9 +43,6 @@ if [[ "${device_serial}" == "simulator" ]]; then export HEXAGON_SDK_ROOT=${HEXAGON_SDK_PATH} fi -# should be removed after Hexagon Docker update -export HEXAGON_GTEST="${HEXAGON_SDK_PATH}/utils/googletest/gtest" - export ANDROID_SERIAL_NUMBER=${device_serial} run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon From 9273ea5e49ca05404293cb651ced6d0bc0c0f206 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Wed, 18 May 2022 13:06:11 -0700 Subject: [PATCH 27/59] [Hexagon]Refactor Hexagon_SDK_PATH (#11282) * refactor HEXAGON_SDK_PATH and remove HEXAGON_GTEST --- python/tvm/contrib/hexagon/tools.py | 16 ++++++++-------- tests/python/contrib/test_hexagon/README.md | 4 ++-- tests/scripts/task_build_hexagon_api.sh | 2 +- tests/scripts/task_config_build_hexagon.sh | 2 +- tests/scripts/task_python_hexagon.sh | 3 --- 5 files changed, 12 insertions(+), 15 deletions(-) diff --git a/python/tvm/contrib/hexagon/tools.py 
b/python/tvm/contrib/hexagon/tools.py index edf2821d3136..1aec8c7d565b 100644 --- a/python/tvm/contrib/hexagon/tools.py +++ b/python/tvm/contrib/hexagon/tools.py @@ -41,7 +41,7 @@ # Subsequent calls to 'link_shared' will use the newly registered linker. HEXAGON_TOOLCHAIN = os.environ.get("HEXAGON_TOOLCHAIN", default="") # pylint: disable=invalid-name -HEXAGON_SDK_PATH = os.environ.get("HEXAGON_SDK_PATH", default="") # pylint: disable=invalid-name +HEXAGON_SDK_ROOT = os.environ.get("HEXAGON_SDK_ROOT", default="") # pylint: disable=invalid-name HEXAGON_LINK_MAIN = ( pathlib.Path(HEXAGON_TOOLCHAIN) / "bin" / "hexagon-link" ) # pylint: disable=invalid-name @@ -49,8 +49,8 @@ pathlib.Path(HEXAGON_TOOLCHAIN) / "bin" / "hexagon-clang++" ) # pylint: disable=invalid-name HEXAGON_SDK_INCLUDE_DIRS = [ # pylint: disable=invalid-name - pathlib.Path(HEXAGON_SDK_PATH) / "incs", - pathlib.Path(HEXAGON_SDK_PATH) / "incs" / "stddef", + pathlib.Path(HEXAGON_SDK_ROOT) / "incs", + pathlib.Path(HEXAGON_SDK_ROOT) / "incs" / "stddef", ] @@ -154,10 +154,10 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], files, hexagon_arch: st " The environment variable HEXAGON_TOOLCHAIN is unset. Please export " + "HEXAGON_TOOLCHAIN in your environment." ) - if not HEXAGON_SDK_PATH: + if not HEXAGON_SDK_ROOT: raise Exception( - " The environment variable HEXAGON_SDK_PATH is unset. Please export " - + "HEXAGON_SDK_PATH in your environment." + " The environment variable HEXAGON_SDK_ROOT is unset. Please export " + + "HEXAGON_SDK_ROOT in your environment." 
) # The AOT C codegen uses TVM runtime functions @@ -180,8 +180,8 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], files, hexagon_arch: st f"-I{tvm_dir / 'include'}", f"-I{tvm_dir / '3rdparty' / 'dlpack' / 'include'}", f"-I{tvm_dir / '3rdparty' / 'dmlc-core' / 'include'}", - f"-I{pathlib.Path(HEXAGON_SDK_PATH) / 'rtos' / 'qurt' / compute_arch / 'include'/ 'posix'}", - f"-I{pathlib.Path(HEXAGON_SDK_PATH) / 'rtos' / 'qurt' / compute_arch / 'include' / 'qurt'}", + f"-I{pathlib.Path(HEXAGON_SDK_ROOT) / 'rtos' / 'qurt' / compute_arch / 'include'/ 'posix'}", + f"-I{pathlib.Path(HEXAGON_SDK_ROOT) / 'rtos' / 'qurt' / compute_arch / 'include' / 'qurt'}", f"-DDMLC_USE_LOGGING_LIBRARY=", f"-D_MACH_I32=int", ] diff --git a/tests/python/contrib/test_hexagon/README.md b/tests/python/contrib/test_hexagon/README.md index ce854bb0ab23..a2b108f7a4ed 100644 --- a/tests/python/contrib/test_hexagon/README.md +++ b/tests/python/contrib/test_hexagon/README.md @@ -33,7 +33,7 @@ First, ensure to export Clang libraries to `LD_LIBRARY_PATH` and Hexagon toolcha ```bash export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"Path to `llvm-clang/lib` sub-directory. Currently we use LLVM-13 in TVM CI." -export HEXAGON_TOOLCHAIN="Path to Hexagon toolchain. It can be the Hexagon toolchain included in the SDK, for example `HEXAGON_SDK_PATH/tools/HEXAGON_Tools/x.y.z/Tools`. The `x.y.z` in the path is the toolchain version number, which is specific to the version of the SDK." +export HEXAGON_TOOLCHAIN="Path to Hexagon toolchain. It can be the Hexagon toolchain included in the SDK, for example `HEXAGON_SDK_ROOT/tools/HEXAGON_Tools/x.y.z/Tools`. The `x.y.z` in the path is the toolchain version number, which is specific to the version of the SDK." ``` You can find more information about downloading [Hexagon SDK](https://developer.qualcomm.com/software/hexagon-dsp-sdk). 
@@ -104,7 +104,7 @@ You have the options of running Hexagon test on real hardware or on Hexagon simu ```bash export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"path to `llvm-clang/lib` sub-directory" -export HEXAGON_TOOLCHAIN="Path to Hexagon toolchain. It can be the Hexagon toolchain included in the HexagonSDK, for example `HEXAGON_SDK_PATH/tools/HEXAGON_Tools/x.y.z/Tools`. The `x.y.z` in the path is the toolchain version number, which is specific to the version of the SDK." +export HEXAGON_TOOLCHAIN="Path to Hexagon toolchain. It can be the Hexagon toolchain included in the HexagonSDK, for example `HEXAGON_SDK_ROOT/tools/HEXAGON_Tools/x.y.z/Tools`. The `x.y.z` in the path is the toolchain version number, which is specific to the version of the SDK." export PYTHONPATH=$PYTHONPATH:"path to `tvm/python`" ``` diff --git a/tests/scripts/task_build_hexagon_api.sh b/tests/scripts/task_build_hexagon_api.sh index a3b501d9c554..8e8397a424db 100755 --- a/tests/scripts/task_build_hexagon_api.sh +++ b/tests/scripts/task_build_hexagon_api.sh @@ -41,7 +41,7 @@ cmake -DANDROID_ABI=arm64-v8a \ -DANDROID_PLATFORM=android-28 \ -DUSE_ANDROID_TOOLCHAIN="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \ -DUSE_HEXAGON_ARCH=v68 \ - -DUSE_HEXAGON_SDK="${HEXAGON_SDK_PATH}" \ + -DUSE_HEXAGON_SDK="${HEXAGON_SDK_ROOT}" \ -DUSE_HEXAGON_TOOLCHAIN="${HEXAGON_TOOLCHAIN}" \ -DUSE_OUTPUT_BINARY_DIR="${output_binary_directory}" \ -DUSE_HEXAGON_GTEST="${HEXAGON_SDK_PATH}/utils/googletest/gtest" .. 
diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh index 7bce64cddb5a..a38180a2d971 100755 --- a/tests/scripts/task_config_build_hexagon.sh +++ b/tests/scripts/task_config_build_hexagon.sh @@ -31,6 +31,6 @@ echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake echo set\(USE_LLVM "${CLANG_LLVM_HOME}/bin/llvm-config"\) >> config.cmake echo set\(CMAKE_CXX_COMPILER "/opt/sccache/clang++"\) >> config.cmake echo set\(USE_HEXAGON "ON"\) >> config.cmake -echo set\(USE_HEXAGON_SDK "${HEXAGON_SDK_PATH}"\) >> config.cmake +echo set\(USE_HEXAGON_SDK "${HEXAGON_SDK_ROOT}"\) >> config.cmake echo set\(USE_CCACHE OFF\) >> config.cmake echo set\(SUMMARIZE ON\) >> config.cmake diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh index 274b348f0935..883c296c5056 100755 --- a/tests/scripts/task_python_hexagon.sh +++ b/tests/scripts/task_python_hexagon.sh @@ -38,9 +38,6 @@ if [[ "${device_serial}" == "simulator" ]]; then # Temporary workaround for symbol visibility export HEXAGON_SHARED_LINK_FLAGS="-Lbuild/hexagon_api_output -lhexagon_rpc_sim" - - # HEXAGON_TOOLCHAIN is already set - export HEXAGON_SDK_ROOT=${HEXAGON_SDK_PATH} fi export ANDROID_SERIAL_NUMBER=${device_serial} From ab8dfa151dfc965672bb4af6b752ddb50c9176ff Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Wed, 18 May 2022 23:06:24 +0200 Subject: [PATCH 28/59] use libtorch c++ distribution with c++11 strings in gpu image (#11346) * use libtorch c++ distribution with c++11 strings in gpu image * libtorch path * don't activate libtorch before merging the image --- docker/Dockerfile.ci_gpu | 3 +++ docker/install/ubuntu_install_libtorch.sh | 27 +++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100755 docker/install/ubuntu_install_libtorch.sh diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index 5d0a642d3f20..73d13007f1d0 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -85,6 
+85,9 @@ RUN bash /install/ubuntu_install_darknet.sh COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh RUN bash /install/ubuntu_install_onnx.sh +COPY install/ubuntu_install_libtorch.sh /install/ubuntu_install_libtorch.sh +RUN bash /install/ubuntu_install_libtorch.sh + COPY install/ubuntu_install_tflite.sh /install/ubuntu_install_tflite.sh RUN bash /install/ubuntu_install_tflite.sh diff --git a/docker/install/ubuntu_install_libtorch.sh b/docker/install/ubuntu_install_libtorch.sh new file mode 100755 index 000000000000..d7eddc85402a --- /dev/null +++ b/docker/install/ubuntu_install_libtorch.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -e +set -u +set -o pipefail + +pushd /usr/local/ +wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcpu.zip +unzip libtorch-cxx11-abi-shared-with-deps-1.11.0+cpu.zip +# now it is in /usr/local/libtorch +popd From 9aaf96ef13ec2f13fe677c023a10c5b81d1f5d8a Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 18 May 2022 14:07:34 -0700 Subject: [PATCH 29/59] [ci][actions] Add more HTTP retries for conda (#11360) Co-authored-by: driazati --- .github/actions/setup/action.yml | 1 + conda/condarc | 42 ++++++++++++++++++++++++++++++++ tests/lint/check_file_type.py | 1 + 3 files changed, 44 insertions(+) create mode 100644 conda/condarc diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index 0ce2023ae4e0..81a0d4d48a8d 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -15,6 +15,7 @@ runs: auto-activate-base: false use-only-tar-bz2: true python-version: 3.7 + condarc-file: conda/condarc - name: Conda info shell: pwsh run: | diff --git a/conda/condarc b/conda/condarc new file mode 100644 index 000000000000..eef4967f90fe --- /dev/null +++ b/conda/condarc @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# See https://docs.conda.io/projects/conda/en/latest/configuration.html for details + +# remote_connect_timeout_secs (float) +# The number seconds conda will wait for your client to establish a +# connection to a remote url resource. +# +remote_connect_timeout_secs: 10 + +# remote_max_retries (int) +# The maximum number of retries each HTTP connection should attempt. +# +remote_max_retries: 6 + +# remote_backoff_factor (int) +# The factor determines the time HTTP connection should wait for +# attempt. +# +remote_backoff_factor: 5 + +# remote_read_timeout_secs (float) +# Once conda has connected to a remote resource and sent an HTTP +# request, the read timeout is the number of seconds conda will wait for +# the server to send a response. +# +remote_read_timeout_secs: 60.0 diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index b01174bfee4c..4dc0109bdef8 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -100,6 +100,7 @@ "Makefile", "Doxyfile", "pylintrc", + "condarc", "rat-excludes", "log4j.properties", ".clang-format", From c32224f314cf6128ddc2801a120232d9ffa80a54 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 18 May 2022 15:18:33 -0700 Subject: [PATCH 30/59] [skip ci] Revert "Fix function number datatype from char to uint16_t (#10014)" (#11363) This reverts commit f34bd22ddc4e7064eabe9fac42c4c04f54ede399. 
Co-authored-by: driazati --- include/tvm/runtime/crt/func_registry.h | 27 +---------------- src/runtime/crt/common/func_registry.c | 39 +++++++------------------ src/target/func_registry_generator.cc | 8 +---- tests/crt/func_registry_test.cc | 7 ++--- 4 files changed, 15 insertions(+), 66 deletions(-) diff --git a/include/tvm/runtime/crt/func_registry.h b/include/tvm/runtime/crt/func_registry.h index 50737f871798..4f8a19af591e 100644 --- a/include/tvm/runtime/crt/func_registry.h +++ b/include/tvm/runtime/crt/func_registry.h @@ -42,7 +42,7 @@ typedef struct TVMFuncRegistry { /*! \brief Names of registered functions, concatenated together and separated by \0. * An additional \0 is present at the end of the concatenated blob to mark the end. * - * Byte 0 and 1 are the number of functions in `funcs`. + * Byte 0 is the number of functions in `funcs`. */ const char* names; @@ -50,31 +50,6 @@ typedef struct TVMFuncRegistry { const TVMBackendPackedCFunc* funcs; } TVMFuncRegistry; -/*! - * \brief Get the of the number of functions from registry. - * - * \param reg TVMFunctionRegistry instance that contains the function. - * \return The number of functions from registry. - */ -uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg); - -/*! - * \brief Set the number of functions to registry. - * - * \param reg TVMFunctionRegistry instance that contains the function. - * \param num_funcs The number of functions - * \return 0 when successful. - */ -int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs); - -/*! - * \brief Get the address of 0th function from registry. - * - * \param reg TVMFunctionRegistry instance that contains the function. - * \return the address of 0th function from registry - */ -const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg); - /*! * \brief Get packed function from registry by name. 
* diff --git a/src/runtime/crt/common/func_registry.c b/src/runtime/crt/common/func_registry.c index 49cef8fd70eb..116a5c496f1b 100644 --- a/src/runtime/crt/common/func_registry.c +++ b/src/runtime/crt/common/func_registry.c @@ -60,29 +60,14 @@ int strcmp_cursor(const char** cursor, const char* name) { return return_value; } -uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg) { - uint16_t num_funcs; - memcpy(&num_funcs, reg->names, sizeof(num_funcs)); - return num_funcs; -} - -int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs) { - memcpy((char*)reg->names, &num_funcs, sizeof(num_funcs)); - return 0; -} - -const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg) { - // NOTE: first function name starts at index 2 to skip num_funcs. - return (reg->names + sizeof(uint16_t)); -} - tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* name, tvm_function_index_t* function_index) { tvm_function_index_t idx; - const char* reg_name_ptr = TVMFuncRegistry_Get0thFunctionName(reg); + const char* reg_name_ptr; idx = 0; - for (; *reg_name_ptr != '\0'; reg_name_ptr++) { + // NOTE: reg_name_ptr starts at index 1 to skip num_funcs. 
+ for (reg_name_ptr = reg->names + 1; *reg_name_ptr != '\0'; reg_name_ptr++) { if (!strcmp_cursor(®_name_ptr, name)) { *function_index = idx; return kTvmErrorNoError; @@ -97,9 +82,9 @@ tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* n tvm_crt_error_t TVMFuncRegistry_GetByIndex(const TVMFuncRegistry* reg, tvm_function_index_t function_index, TVMBackendPackedCFunc* out_func) { - uint16_t num_funcs; + uint8_t num_funcs; - num_funcs = TVMFuncRegistry_GetNumFuncs(reg); + num_funcs = reg->names[0]; if (function_index >= num_funcs) { return kTvmErrorFunctionIndexInvalid; } @@ -116,8 +101,7 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8 reg->registry.names = (const char*)buffer; buffer[0] = 0; // number of functions present in buffer. - buffer[1] = 0; // note that we combine the first two elements to form a 16-bit function index. - buffer[2] = 0; // end of names list marker. + buffer[1] = 0; // end of names list marker. // compute a guess of the average size of one entry: // - assume average function name is around ~10 bytes @@ -133,12 +117,13 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8 tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const char* name, TVMBackendPackedCFunc func, int override) { size_t idx; - char* reg_name_ptr = (char*)TVMFuncRegistry_Get0thFunctionName(&(reg->registry)); + char* reg_name_ptr; idx = 0; // NOTE: safe to discard const qualifier here, since reg->registry.names was set from // TVMMutableFuncRegistry_Create above. - for (; *reg_name_ptr != 0; reg_name_ptr++) { + // NOTE: reg_name_ptr starts at index 1 to skip num_funcs. 
+ for (reg_name_ptr = (char*)reg->registry.names + 1; *reg_name_ptr != 0; reg_name_ptr++) { if (!strcmp_cursor((const char**)®_name_ptr, name)) { if (override == 0) { return kTvmErrorFunctionAlreadyDefined; @@ -164,11 +149,7 @@ tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const ch reg_name_ptr += name_len + 1; *reg_name_ptr = 0; ((TVMBackendPackedCFunc*)reg->registry.funcs)[idx] = func; - - uint16_t num_funcs; - // increment num_funcs. - num_funcs = TVMFuncRegistry_GetNumFuncs(&(reg->registry)) + 1; - TVMFuncRegistry_SetNumFuncs(&(reg->registry), num_funcs); + ((char*)reg->registry.names)[0]++; // increment num_funcs. return kTvmErrorNoError; } diff --git a/src/target/func_registry_generator.cc b/src/target/func_registry_generator.cc index d679bf379b62..7c948d50cbb9 100644 --- a/src/target/func_registry_generator.cc +++ b/src/target/func_registry_generator.cc @@ -31,13 +31,7 @@ namespace target { std::string GenerateFuncRegistryNames(const Array& function_names) { std::stringstream ss; - - unsigned char function_nums[sizeof(uint16_t)]; - *reinterpret_cast(function_nums) = function_names.size(); - for (auto f : function_nums) { - ss << f; - } - + ss << (unsigned char)(function_names.size()); for (auto f : function_names) { ss << f << '\0'; } diff --git a/tests/crt/func_registry_test.cc b/tests/crt/func_registry_test.cc index 5962a3acee39..9f0e7f8d1a5a 100644 --- a/tests/crt/func_registry_test.cc +++ b/tests/crt/func_registry_test.cc @@ -82,7 +82,7 @@ TEST(StrCmpScan, Test) { } TEST(FuncRegistry, Empty) { - TVMFuncRegistry registry{"\000\000", NULL}; + TVMFuncRegistry registry{"\000", NULL}; EXPECT_EQ(kTvmErrorFunctionNameNotFound, TVMFuncRegistry_Lookup(®istry, "foo", NULL)); EXPECT_EQ(kTvmErrorFunctionIndexInvalid, @@ -101,7 +101,7 @@ static int Bar(TVMValue* args, int* type_codes, int num_args, TVMValue* out_ret_ } // Matches the style of registry defined in generated C modules. 
-const char* kBasicFuncNames = "\002\000Foo\0Bar\0"; // NOTE: final \0 +const char* kBasicFuncNames = "\002Foo\0Bar\0"; // NOTE: final \0 const TVMBackendPackedCFunc funcs[2] = {&Foo, &Bar}; const TVMFuncRegistry kConstRegistry = {kBasicFuncNames, (const TVMBackendPackedCFunc*)funcs}; @@ -111,8 +111,7 @@ TEST(FuncRegistry, ConstGlobalRegistry) { // Foo EXPECT_EQ(kBasicFuncNames[0], 2); - EXPECT_EQ(kBasicFuncNames[1], 0); - EXPECT_EQ(kBasicFuncNames[2], 'F'); + EXPECT_EQ(kBasicFuncNames[1], 'F'); EXPECT_EQ(kTvmErrorNoError, TVMFuncRegistry_Lookup(&kConstRegistry, "Foo", &func_index)); EXPECT_EQ(0, func_index); From ddfa1da691bacbb0018b53fca8409c5cfd6dbf3a Mon Sep 17 00:00:00 2001 From: Mohamad Katanbaf Date: Wed, 18 May 2022 16:09:10 -0700 Subject: [PATCH 31/59] [bug fix] skip "__nop" functions in graph_executor_debug (#11353) * bug fix, skip __nop functions in running operation over RPC Co-authored-by: Mohamad --- src/runtime/graph_executor/debug/graph_executor_debug.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 97d89206f5dc..bd3b0db0403f 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -140,6 +140,11 @@ class GraphExecutorDebug : public GraphExecutor { return 0; } + if (nodes_[index].param.func_name == "__nop") { + LOG_INFO << "Skipping __nop function"; + return 0; + } + const Device& dev = data_entry_[entry_id(index, 0)]->device; TVMOpParam param = nodes_[index].param; std::string name = param.func_name; From ffc0443913d837c6b7a6ec55375ea29cf3d1fa7c Mon Sep 17 00:00:00 2001 From: heliqi <1101791222@qq.com> Date: Thu, 19 May 2022 03:53:03 -0500 Subject: [PATCH 32/59] [Frontend] [PaddlePaddle] Add split operator (#11354) * suuport split op of paddlepaddle * black formatting --- python/tvm/relay/frontend/paddlepaddle.py | 45 +++++++++++++++++++ 
.../frontend/paddlepaddle/test_forward.py | 38 ++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py index 7f2460d66eeb..7042154709ae 100644 --- a/python/tvm/relay/frontend/paddlepaddle.py +++ b/python/tvm/relay/frontend/paddlepaddle.py @@ -1920,6 +1920,50 @@ def convert_softsign(g, op, block): g.add_node(op.output("Out")[0], out) +def convert_split(g, op, block): + """Operator converter for split.""" + + x = g.get_node(op.input("X")[0]) + axis = op.input("AxisTensor") + if axis: + axis = g.get_node(axis[0]) + axis, infered = try_infer_value(axis, g.get_params()) + if infered: + axis = axis.tolist()[0] + else: + axis = op.attr("axis") + + sections = op.input("SectionsTensorList") + if sections: + tmp_section = [] + for i in sections: + i = g.get_node(i) + i, infered = try_infer_value(i, g.get_params()) + if infered: + i = i.tolist() + else: + raise ValueError("Dynamic Split not yet supported.") + tmp_section.extend(i) + sections = tmp_section + else: + sections = op.attr("sections") + if sections: + indices = [] + split_index = 0 + for i in sections[:-1]: + if i == -1: + input_shape = infer_shape(x)[axis] + i = input_shape - np.sum(sections) - 1 + split_index += i + indices.append(split_index) + else: + indices = op.attr("num") + + out = _op.split(x, indices, axis) + for i, out_i in enumerate(out): + g.add_node(op.output("Out")[i], out_i) + + def convert_square(g, op, block): """Operator converter for square.""" @@ -2092,6 +2136,7 @@ def convert_unsqueeze(g, op, block): "softmax": convert_softmax, "softplus": convert_softplus, "softsign": convert_softsign, + "split": convert_split, "strided_slice": convert_slice, "sqrt": convert_unary_op, "square": convert_square, diff --git a/tests/python/frontend/paddlepaddle/test_forward.py b/tests/python/frontend/paddlepaddle/test_forward.py index 9fa4063755f7..0f243e0ea02c 100644 --- 
a/tests/python/frontend/paddlepaddle/test_forward.py +++ b/tests/python/frontend/paddlepaddle/test_forward.py @@ -782,6 +782,44 @@ def full2(inputs): verify_model(full2, input_data=[input_data]) +@tvm.testing.uses_gpu +def test_forward_split(): + class Split(nn.Layer): + def __init__( + self, axis=None, num_or_sections=None, axis_is_tensor=False, num_is_tensor=False + ): + super(Split, self).__init__() + self.axis = axis + self.num_or_sections = num_or_sections + self.axis_is_tensor = axis_is_tensor + self.num_is_tensor = num_is_tensor + + @paddle.jit.to_static + def forward(self, inputs): + axis = self.axis + if self.axis_is_tensor: + axis = paddle.to_tensor(axis, dtype="int32") + num_or_sections = self.num_or_sections + if self.num_is_tensor: + new_num_or_sections = [] + for i in num_or_sections: + if isinstance(i, list): + i = paddle.to_tensor(i, dtype="int32") + new_num_or_sections.append(i) + num_or_sections = new_num_or_sections + return paddle.split(inputs, num_or_sections=num_or_sections, axis=axis) + + input_shape = [3, 6, 2] + input_data = paddle.rand(input_shape, dtype="float32") + verify_model(Split(axis=1, num_or_sections=3), input_data=input_data) + verify_model( + Split(axis=[1], num_or_sections=[2, 3, 1], axis_is_tensor=True), input_data=input_data + ) + verify_model( + Split(axis=1, num_or_sections=[2, -1, [3]], num_is_tensor=True), input_data=input_data + ) + + @tvm.testing.uses_gpu def test_forward_squeeze(): class Squeeze(nn.Layer): From 534c38bef3c98f8094bce6780cabdeedb017645b Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Thu, 19 May 2022 04:37:41 -0700 Subject: [PATCH 33/59] [Relay] Support i16, f16 scalars in Relay text (#11224) While testing fp16 models for Collage discovered the Relay text format did not support f16. While adding that cleaned up scalar handling in general. 
However I left two inlined tests for 'is simple const' in place (fuse_ops.cc and memory_alloc.cc) since it's not clear whether they should remain specific to just {i,f}{32,64} or whether they can be replaced with the support::IsSimpleScalar central predicate. --- src/parser/parser.cc | 45 +---- src/parser/tokenizer.h | 104 +++++++---- src/printer/doc.cc | 7 +- src/printer/relay_text_printer.cc | 80 ++++---- src/printer/text_printer.h | 7 - src/support/scalars.cc | 202 +++++++++++++++++++++ src/support/scalars.h | 67 +++++++ tests/cpp/support/scalars_test.cc | 63 +++++++ tests/python/relay/test_ir_parser.py | 41 ++++- tests/python/relay/test_ir_text_printer.py | 37 ++-- 10 files changed, 505 insertions(+), 148 deletions(-) create mode 100644 src/support/scalars.cc create mode 100644 src/support/scalars.h create mode 100644 tests/cpp/support/scalars_test.cc diff --git a/src/parser/parser.cc b/src/parser/parser.cc index 9b15893092f7..f51e3e5c9737 100644 --- a/src/parser/parser.cc +++ b/src/parser/parser.cc @@ -35,10 +35,12 @@ #include +#include "../support/scalars.h" #include "./meta_ref.h" #include "./op_table.h" #include "./span_check.h" #include "./tokenizer.h" +#include "tvm/runtime/builtin_fp16.h" namespace tvm { namespace parser { @@ -534,49 +536,15 @@ class Parser { /*! \brief Convert a numeric token to an NDArray for embedding into the Relay program. */ NDArray NumberToNDArray(const Token& token) { if (token->token_type == TokenType::kInteger) { - DLDevice dev = {DLDeviceType::kDLCPU, 0}; - int64_t i = Downcast(token->data); - if (i > std::numeric_limits::max()) { - auto dtype = String2DLDataType("int64"); - auto data = NDArray::Empty({}, dtype, dev); - auto array = reinterpret_cast(data->data); - // revisit this, literal node issue. - array[0] = i; - return data; - } else { - auto dtype = String2DLDataType("int32"); - auto data = NDArray::Empty({}, dtype, dev); - auto array = reinterpret_cast(data->data); - // revisit this, literal node issue. 
- array[0] = i; - return data; - } + return support::IntImmToNDArray(Downcast(token->data)); } else if (token->token_type == TokenType::kFloat) { - DLDevice dev = {DLDeviceType::kDLCPU, 0}; - auto float_imm = Downcast(token->data); - auto data = NDArray::Empty({}, float_imm->dtype, dev); - auto array = reinterpret_cast(data->data); - // revisit this, literal node issue. - // TODO(@jroesch): bounds checking - float value = float_imm->value; - array[0] = value; - return data; + return support::FloatImmToNDArray(Downcast(token->data)); } else { LOG(FATAL) << "internal error: should only call this function on numeric tokens"; - return NDArray(); + return {}; } } - /*! \brief Convert a boolean value to an NDArray for embedding into the Relay program. */ - NDArray BooleanToNDarray(bool value) { - DLDevice dev = {DLDeviceType::kDLCPU, 0}; - auto dtype = String2DLDataType("bool"); - auto data = NDArray::Empty({}, dtype, dev); - auto array = reinterpret_cast(data->data); - array[0] = value; - return data; - } - [[noreturn]] void ParseError(const Token& token, const std::string& msg) { throw std::runtime_error(msg); } @@ -1573,8 +1541,7 @@ class Parser { case TokenType::kBoolean: { Consume(TokenType::kBoolean); int64_t value = Downcast(next->data); - auto boolean = BooleanToNDarray(value); - Expr e = Constant(boolean, next->span); + Expr e = Constant(support::BoolToNDArray(value), next->span); ICHECK(e->span.defined()) << "constant spans must be defined"; return e; } diff --git a/src/parser/tokenizer.h b/src/parser/tokenizer.h index f8098cf94100..4ac1ceef26dc 100644 --- a/src/parser/tokenizer.h +++ b/src/parser/tokenizer.h @@ -34,6 +34,7 @@ #include #include +#include "../support/scalars.h" #include "./meta_ref.h" #include "./token.h" @@ -174,35 +175,16 @@ struct Tokenizer { Token ParseNumber(bool is_pos, bool is_float, std::string number) { ICHECK(number.size() > 0) << "an empty string is an invalid number"; - if (!is_float) { - auto token = NewToken(TokenType::kInteger); - 
size_t index = 0; - int64_t value = 0; - try { - value = std::stoll(number, &index); - } catch (const std::invalid_argument& err) { - this->diag_ctx.Emit(Diagnostic::Error(token->span) << "invalid number `" << number << "`"); - } catch (const std::out_of_range& err) { - this->diag_ctx.Emit(Diagnostic::Error(token->span) << "invalid number `" << number << "`"); - } - if (number.size() <= index) { - value = is_pos ? value : -value; - if (value > std::numeric_limits::max()) { - token->data = tvm::IntImm(DataType::Int(64), value); - } else { - token->data = tvm::IntImm(DataType::Int(32), value); - } - return token; - } + Token token = NewToken(is_float ? TokenType::kFloat : TokenType::kInteger); + size_t suffix_pos = number.rfind(is_float ? 'f' : 'i'); + if (suffix_pos == std::string::npos) { + suffix_pos = number.size(); + } + std::string literal_text = number.substr(0, suffix_pos); + std::string suffix; + if (suffix_pos < number.size()) { + suffix = number.substr(suffix_pos + 1, number.size() - suffix_pos); } - auto token = NewToken(TokenType::kFloat); - - auto suffix_pos = number.rfind("f"); - - auto literal_text = number.substr(0, suffix_pos); - - auto suffix = number.substr(suffix_pos + 1, number.size() - suffix_pos); - int width = 32; if (suffix.size()) { @@ -217,9 +199,62 @@ struct Tokenizer { } } - double value = stod(literal_text); - value = is_pos ? 
value : -value; - token->data = tvm::FloatImm(DataType::Float(width), value); + if (is_float) { + double value = 0.0; + size_t index = 0; + try { + value = stod(literal_text, &index); + } catch (const std::invalid_argument& err) { + this->diag_ctx.Emit(Diagnostic::Error(token->span) + << "invalid floating point number `" << literal_text << "`"); + } catch (const std::out_of_range& err) { + this->diag_ctx.Emit(Diagnostic::Error(token->span) + << "invalid floating point number `" << literal_text << "`"); + } + if (index < literal_text.size()) { + this->diag_ctx.Emit(Diagnostic::Error(token->span) + << "invalid floating point number `" << literal_text << "`"); + } + value = is_pos ? value : -value; + token->data = support::ValueToFloatImm(value, width); + if (!token->data.defined()) { + this->diag_ctx.Emit(Diagnostic::Error(token->span) + << "floating point number `" << literal_text + << "` unrepresentable in width " << width); + token->data = support::ValueToFloatImm(0.0, width); + } + } else { + int64_t value = 0; + size_t index = 0; + try { + value = std::stoll(literal_text, &index); + } catch (const std::invalid_argument& err) { + this->diag_ctx.Emit(Diagnostic::Error(token->span) + << "invalid integer number `" << literal_text << "`"); + } catch (const std::out_of_range& err) { + this->diag_ctx.Emit(Diagnostic::Error(token->span) + << "invalid integer number `" << literal_text << "`"); + } + if (index < literal_text.size()) { + this->diag_ctx.Emit(Diagnostic::Error(token->span) + << "invalid integer number `" << literal_text << "`"); + } + value = is_pos ? value : -value; + token->data = support::ValueToIntImm(value, width); + if (!token->data.defined() && suffix.empty()) { + // Without any i suffix the legacy behavior was to default to int64 if out of range + // for int32. 
+ width = 64; + token->data = support::ValueToIntImm(value, width); + } + if (!token->data.defined()) { + this->diag_ctx.Emit(Diagnostic::Error(token->span) + << "integer number `" << literal_text << "` unrepresentable in width " + << width); + token->data = support::ValueToIntImm(0, width); + } + } + return token; } @@ -230,14 +265,13 @@ struct Tokenizer { } bool is_float = false; - - // Remove trailing floating point prefix. - if (More() && Peek() == 'f') { + if (More() && (Peek() == 'f' || Peek() == 'i')) { + is_float = Peek() == 'f'; + // Capture trailing width suffix ss << Next(); while (More() && IsNumeric(Peek())) { ss << Next(); } - is_float = true; } return ParseNumber(is_pos, is_float, ss.str()); } diff --git a/src/printer/doc.cc b/src/printer/doc.cc index f7d9fdfd7dfb..b06995fb1286 100644 --- a/src/printer/doc.cc +++ b/src/printer/doc.cc @@ -52,12 +52,7 @@ TVM_REGISTER_OBJECT_TYPE(DocTextNode); class DocText : public DocAtom { public: - explicit DocText(std::string str) { - if (str.find_first_of("\t\n") != str.npos) { - LOG(WARNING) << "text node: '" << str << "' should not have tab or newline."; - } - data_ = runtime::make_object(str); - } + explicit DocText(std::string str) { data_ = runtime::make_object(str); } TVM_DEFINE_OBJECT_REF_METHODS(DocText, DocAtom, DocTextNode); }; diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc index 97231931ad88..35daf588fbeb 100644 --- a/src/printer/relay_text_printer.cc +++ b/src/printer/relay_text_printer.cc @@ -43,9 +43,11 @@ #include "../ir/attr_functor.h" #include "../parser/meta_ref.h" #include "../relay/analysis/dependency_graph.h" +#include "../support/scalars.h" #include "doc.h" #include "meta_data.h" #include "text_printer.h" +#include "tvm/runtime/builtin_fp16.h" namespace tvm { namespace relay { @@ -61,8 +63,17 @@ Doc RelayTextPrinter::PrintOptionalInfo(const Expr& expr) { } // default annotations if (annotate_ == nullptr) { - if ((expr.as() || expr.as()) && 
expr->checked_type_.defined()) { - doc << " /* ty=" << Print(expr->checked_type()) << " */"; + if ((expr.as() || expr.as() || expr.as() || + expr.as() || expr.as() || expr.as()) && + (expr->checked_type_.defined() || expr->span.defined())) { + doc << " /*"; + if (expr->checked_type_.defined()) { + doc << " ty=" << Print(expr->checked_type()); + } + if (expr->span.defined()) { + doc << " span=" << PrintSpan(expr->span); + } + doc << " */"; } } else { std::string annotated_expr = annotate_(expr); @@ -219,7 +230,7 @@ Doc RelayTextPrinter::AllocVar(const Var& var) { name = "v" + name; } Doc val = GetUniqueName("%" + name); - memo_[var] = val; + memo_[var] = val; // Referential occurrences will not include the following. if (!var->virtual_device()->IsFullyUnconstrained()) { val << " {" << kVirtualDevice << "=" << PrintAttributeValue(var->virtual_device()) << "}"; } @@ -335,51 +346,17 @@ Doc RelayTextPrinter::PrintExpr(const Expr& expr, bool meta, bool try_inline, bo // first time. Doc RelayTextPrinter::VisitExpr_(const VarNode* op) { return AllocVar(GetRef(op)); } -/*! - * \brief special method to print out const scalar - * \param dtype The data type - * \param value The value to be printed. - */ -template -Doc RelayTextPrinter::ScalarLiteral(DataType dtype, const T& value) { - std::ostringstream os; - if (dtype == DataType::Int(32)) { - os << value; - } else if (dtype == DataType::Float(32)) { - os << value << 'f'; - } else if (dtype == DataType::Float(64)) { - os << value << "f64"; - } else if (dtype == DataType::Bool()) { - return Doc::PyBoolLiteral(value != 0); - } else { - os << value; - } - return Doc::Text(os.str()); -} - Doc RelayTextPrinter::VisitExpr_(const ConstantNode* op) { // Print out simple scalars directly. 
- if (op->is_scalar()) { - std::ostringstream os; - DataType dtype = DataType(op->data->dtype); - ICHECK_EQ(op->data->device.device_type, kDLCPU); - if (dtype == DataType::Int(32)) { - return ScalarLiteral(dtype, static_cast(op->data->data)[0]); - } else if (dtype == DataType::Int(64)) { - return ScalarLiteral(dtype, static_cast(op->data->data)[0]); - } else if (dtype == DataType::Float(32)) { - return ScalarLiteral(dtype, static_cast(op->data->data)[0]); - } else if (dtype == DataType::Float(64)) { - return ScalarLiteral(dtype, static_cast(op->data->data)[0]); - } else if (dtype == DataType::Bool()) { - return ScalarLiteral(dtype, static_cast(op->data->data)[0]); - } + if (support::IsSimpleScalar(op)) { + return Doc::Text(support::NDArrayScalarToString(op->data)); } - // default fall-back, record it as meta node. + // Fallbock: record it as a meta node. Doc doc; // Don't append optional_info. Because the entry function is Print, // and it will append the optional_info afterwards. - return doc << PrintExpr(GetRef(op), true, false, false); + return doc << PrintExpr(GetRef(op), /*meta=*/true, /*try_inline=*/false, + /*optional_info=*/false); } Doc RelayTextPrinter::VisitExpr_(const TupleNode* op) { @@ -540,9 +517,6 @@ Doc RelayTextPrinter::VisitExpr_(const CallNode* op) { return doc; } else { doc << "(" << Doc::Concat(args) << ")"; - if (op->span.defined()) { - doc << " /* " << PrintSpan(op->span) << " */"; - } return doc; } } @@ -799,11 +773,21 @@ Doc RelayTextPrinter::VisitAttr_(const ArrayNode* op) { } Doc RelayTextPrinter::VisitAttr_(const tir::IntImmNode* op) { - return ScalarLiteral(op->dtype, op->value); + if (support::IsSimpleScalarDtype(op->dtype)) { + return Doc::Text(support::IntImmToString(GetRef(op))); + } else { + // Fallback: Print int64_t without width suffix. 
+ return Doc::Text(std::to_string(op->value)); + } } Doc RelayTextPrinter::VisitAttr_(const tir::FloatImmNode* op) { - return ScalarLiteral(op->dtype, op->value); + if (support::IsSimpleScalarDtype(op->dtype)) { + return Doc::Text(support::FloatImmToString(GetRef(op))); + } else { + // Fallbock: Print double without width suffix. + return Doc::Text(std::to_string(op->value)); + } } Doc RelayTextPrinter::VisitAttr_(const tir::StringImmNode* op) { @@ -977,7 +961,7 @@ Doc RelayTextPrinter::PrintSpan(const Span& span) { Doc doc; const auto* span_node = span.as(); ICHECK(span_node); - doc << span_node->source_name->name; + doc << span_node->source_name->name << ":" << span_node->line << ":" << span_node->column; return doc; } diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index c34c4a5b6dbe..05a00e3305e1 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -152,13 +152,6 @@ class RelayTextPrinter : public ExprFunctor, // Should only be triggered when op is a free variable being visited for the // first time. Doc VisitExpr_(const VarNode* op) final; - /*! - * \brief special method to print out const scalar - * \param dtype The data type - * \param value The value to be printed. - */ - template - static Doc ScalarLiteral(DataType dtype, const T& value); Doc VisitExpr_(const ConstantNode* op) final; Doc VisitExpr_(const TupleNode* op) final; Doc VisitExpr_(const TupleGetItemNode* op) final; diff --git a/src/support/scalars.cc b/src/support/scalars.cc new file mode 100644 index 000000000000..9caa7ca58915 --- /dev/null +++ b/src/support/scalars.cc @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/support/scalars.cc + * \brief Helpers for converting between scalars in native, text, TIR immediate and NDArray forms. + */ + +#include "./scalars.h" + +#include "tvm/relay/expr.h" +#include "tvm/runtime/builtin_fp16.h" + +namespace tvm { +namespace support { + +/*! \brief The standard scalar dtypes. */ +static const DataType kInt16 = DataType::Int(16); +static const DataType kInt32 = DataType::Int(32); +static const DataType kInt64 = DataType::Int(64); +static const DataType kFloat16 = DataType::Float(16); +static const DataType kFloat32 = DataType::Float(32); +static const DataType kFloat64 = DataType::Float(64); +static const DataType kBool = DataType::Bool(); + +bool IsSimpleScalarDtype(DataType dtype) { + return dtype == kInt16 || dtype == kInt32 || dtype == kInt64 || dtype == kFloat16 || + dtype == kFloat32 || dtype == kFloat64 || dtype == kBool; +} + +bool IsSimpleScalar(const relay::ConstantNode* constant_node) { + return constant_node->is_scalar() && IsSimpleScalarDtype(DataType(constant_node->data->dtype)); +} + +runtime::NDArray IntImmToNDArray(const IntImm& int_imm) { + DLDevice dev = {DLDeviceType::kDLCPU, 0}; + auto data = runtime::NDArray::Empty({}, int_imm->dtype, dev); + if (int_imm.dtype() == kInt16) { + auto* array = reinterpret_cast(data->data); + array[0] = static_cast(int_imm->value); + } else if (int_imm.dtype() == kInt32) { + auto* 
array = reinterpret_cast(data->data); + array[0] = static_cast(int_imm->value); + } else if (int_imm.dtype() == kInt64) { + auto* array = reinterpret_cast(data->data); + array[0] = int_imm->value; + } else { + LOG(FATAL) << "Unrecognized numeric literal dtype: " << DLDataType2String(int_imm.dtype()); + } + return data; +} + +runtime::NDArray FloatImmToNDArray(const FloatImm& float_imm) { + DLDevice dev = {DLDeviceType::kDLCPU, 0}; + auto data = runtime::NDArray::Empty({}, float_imm->dtype, dev); + if (float_imm.dtype() == kFloat16) { + auto* array = reinterpret_cast(data->data); + array[0] = __gnu_f2h_ieee(static_cast(float_imm->value)); + } else if (float_imm.dtype() == kFloat32) { + auto* array = reinterpret_cast(data->data); + array[0] = static_cast(float_imm->value); + } else if (float_imm.dtype() == kFloat64) { + auto* array = reinterpret_cast(data->data); + array[0] = float_imm->value; + } else { + LOG(FATAL) << "Unrecognized numeric literal dtype: " << DLDataType2String(float_imm.dtype()); + } + return data; +} + +runtime::NDArray BoolToNDArray(bool value) { + DLDevice dev = {DLDeviceType::kDLCPU, 0}; + auto data = runtime::NDArray::Empty({}, kBool, dev); + auto array = reinterpret_cast(data->data); + array[0] = value; + return data; +} + +std::string NDArrayScalarToString(const runtime::NDArray& data) { + std::ostringstream os; + DataType dtype(data->dtype); + ICHECK_EQ(data->device.device_type, kDLCPU) << "Scalars must reside on the CPU to be printed"; + if (dtype == kInt16) { + auto value = static_cast(data->data)[0]; + os << value << "i16"; + } else if (dtype == kInt32) { + auto value = static_cast(data->data)[0]; + os << value; + } else if (dtype == kInt64) { + auto value = static_cast(data->data)[0]; + os << value << "i64"; + } else if (dtype == kFloat16) { + auto value = __gnu_h2f_ieee(static_cast(data->data)[0]); + os << value << "f16"; + } else if (dtype == kFloat32) { + auto value = static_cast(data->data)[0]; + os << value << "f"; + } else if 
(dtype == kFloat64) { + auto value = static_cast(data->data)[0]; + os << value << "f64"; + } else if (dtype == kBool) { + auto value = static_cast(data->data)[0]; + os << (value ? "True" : "False"); + } else { + LOG(FATAL) << "Unrecognized NDArray scalar dtype: " << DLDataType2String(dtype); + } + return os.str(); +} + +std::string IntImmToString(const IntImm& int_imm) { + std::ostringstream os; + if (int_imm->dtype == kInt16) { + os << int_imm->value << "i16"; + } else if (int_imm->dtype == kInt32) { + os << int_imm->value; + } else if (int_imm->dtype == kInt64) { + os << int_imm->value << "i64"; + } else if (int_imm->dtype == kBool) { + os << (int_imm->value ? "True" : "False"); + } else { + LOG(FATAL) << "Unrecognised IntImm dtype: " << DLDataType2String(int_imm->dtype); + } + return os.str(); +} + +std::string FloatImmToString(const FloatImm& float_imm) { + std::ostringstream os; + if (float_imm->dtype == kFloat16) { + os << float_imm->value << "f16"; + } else if (float_imm->dtype == kFloat32) { + os << float_imm->value << "f"; + } else if (float_imm->dtype == kFloat64) { + os << float_imm->value << "f64"; + } else { + LOG(FATAL) << "Unrecognised FloatImm dtype: " << DLDataType2String(float_imm->dtype); + } + return os.str(); +} + +IntImm ValueToIntImm(int64_t value, int width) { + if (width == 16) { + if (value < std::numeric_limits::min() || + value > std::numeric_limits::max()) { + return {}; + } + return IntImm(kInt16, value); + } else if (width == 32) { + if (value < std::numeric_limits::min() || + value > std::numeric_limits::max()) { + return {}; + } + return IntImm(kInt32, value); + } else if (width == 64) { + return IntImm(kInt64, value); + } else { + LOG(FATAL) << "Unrecognized int scalar width: " << width; + return {}; + } +} + +// 2^15 * (1 + 1023/1024) +// See https://en.wikipedia.org/wiki/Half-precision_floating-point_format +constexpr double kMaxFloat16 = 65504.0; + +FloatImm ValueToFloatImm(double value, int width) { + if (width == 16) { + if 
(!std::isinf(value) && (value < -kMaxFloat16 || value > kMaxFloat16)) { + return {}; + } + return FloatImm(kFloat16, value); + } else if (width == 32) { + if (!std::isinf(value) && + (value < -std::numeric_limits::max() || value > std::numeric_limits::max())) { + return {}; + } + return FloatImm(kFloat32, value); + } else if (width == 64) { + return FloatImm(kFloat64, value); + } else { + LOG(FATAL) << "Unrecognized float scalar width: " << width; + return {}; + } +} + +} // namespace support +} // namespace tvm diff --git a/src/support/scalars.h b/src/support/scalars.h new file mode 100644 index 000000000000..60b8fc40a8de --- /dev/null +++ b/src/support/scalars.h @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/support/scalars.h + * \brief Helpers for converting between scalars in native, text, TIR immediate and NDArray forms. + */ + +#ifndef TVM_SUPPORT_SCALARS_H_ +#define TVM_SUPPORT_SCALARS_H_ + +#include +#include + +#include "tvm/ir/expr.h" +#include "tvm/relay/expr.h" +#include "tvm/runtime/ndarray.h" + +namespace tvm { +namespace support { + +/*! \brief Returns true if a tensor of empty shape and given dtype is considered a Relay scalar. 
*/ +bool IsSimpleScalarDtype(DataType dtype); + +/*! \brief Returns true if \p constant_node is a float/int/bool scalar. */ +bool IsSimpleScalar(const relay::ConstantNode* constant_node); + +/*! \brief Returns NDArray 'scalar' for given TIR immediate. */ +runtime::NDArray IntImmToNDArray(const IntImm& int_imm); +runtime::NDArray FloatImmToNDArray(const FloatImm& float_imm); +runtime::NDArray BoolToNDArray(bool value); + +/*! \brief Returns Relay literal text for NDArray 'scalar'. */ +std::string NDArrayScalarToString(const runtime::NDArray& data); + +/*! \brief Returns Relay literal text for given TIR immediate. */ +std::string IntImmToString(const IntImm& int_imm); +std::string FloatImmToString(const FloatImm& float_imm); + +/*! + * \brief Returns TIR immediate for given value and width. Result will be null if value is + * out of range in width. Note however for floating point we don't check if the value is + * representable without loss of precision. + */ +IntImm ValueToIntImm(int64_t value, int width); +FloatImm ValueToFloatImm(double value, int width); + +} // namespace support +} // namespace tvm + +#endif // TVM_SUPPORT_SCALARS_H_ diff --git a/tests/cpp/support/scalars_test.cc b/tests/cpp/support/scalars_test.cc new file mode 100644 index 000000000000..d55f0541fa40 --- /dev/null +++ b/tests/cpp/support/scalars_test.cc @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "../../../src/support/scalars.h" + +#include +#include + +namespace tvm { +namespace support { +namespace { + +// Note that functional testing is via test_ir_parser.py and test_ir_text_printer.py. +// Here we just check handling which is difficult to test via the standard Python API. + +TEST(Scalars, IntImmToNDArray_Unsupported) { + ASSERT_THROW(IntImmToNDArray(IntImm(DataType::Int(15), 42)), runtime::InternalError); +} + +TEST(Scalars, FloatImmtoNDArray_Unsupported) { + ASSERT_THROW(FloatImmToNDArray(FloatImm(DataType::Float(15), 42.0)), runtime::InternalError); +} + +TEST(Scalars, NDArrayScalarToString_Unsupported) { + auto ndarray = runtime::NDArray::Empty({}, DataType::Int(8), {DLDeviceType::kDLCPU, 0}); + ASSERT_THROW(NDArrayScalarToString(ndarray), runtime::InternalError); +} + +TEST(Scalars, IntImmToString_Unsupported) { + ASSERT_THROW(IntImmToString(IntImm(DataType::Int(15), 42)), runtime::InternalError); +} + +TEST(Scalars, FloatImmToString_Unsupported) { + ASSERT_THROW(FloatImmToString(FloatImm(DataType::Float(15), 42.0)), runtime::InternalError); +} + +TEST(Scalars, ValueToIntImm_Unsupported) { + ASSERT_THROW(ValueToIntImm(42, 15), runtime::InternalError); +} + +TEST(SCalars, ValueToFloatImm_Unsupported) { + ASSERT_THROW(ValueToFloatImm(42.0, 15), runtime::InternalError); +} + +} // namespace +} // namespace support +} // namespace tvm diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py index fdbd3924ffb7..7a283461e0bd 100644 --- a/tests/python/relay/test_ir_parser.py +++ 
b/tests/python/relay/test_ir_parser.py @@ -15,11 +15,11 @@ # specific language governing permissions and limitations # under the License. import numpy as np +import pytest import tvm from tvm import relay import tvm.relay.testing -import pytest from numpy import isclose from typing import Union @@ -172,6 +172,26 @@ def test_int_literal(): assert get_scalar(parse_text("-05")) == -5 assert get_scalar(parse_text("9223372036854775807")) == 9223372036854775807 + assert get_scalar(parse_text("-42i")) == -42 + assert get_scalar(parse_text("-42i16")) == -42 + assert get_scalar(parse_text("-42i32")) == -42 + assert get_scalar(parse_text("-42i64")) == -42 + + assert_parses_as("-42i16", relay.const(-42, "int16")) + assert_parses_as("-42i32", relay.const(-42, "int32")) + assert_parses_as("-42i", relay.const(-42, "int32")) + assert_parses_as("-42", relay.const(-42, "int32")) + assert_parses_as("-42i64", relay.const(-42, "int64")) + assert_parses_as("2147483647", relay.const(2147483647, "int32")) + assert_parses_as("2147483648", relay.const(2147483648, "int64")) + + with pytest.raises(tvm.error.DiagnosticError): + # Unrepresentable + parse_text("2147483648i32") + with pytest.raises(tvm.error.DiagnosticError): + # Unrepresentable + parse_text("32768i16") + def test_float_literal(): assert get_scalar(parse_text("1.0f")) == 1.0 @@ -189,11 +209,28 @@ def test_float_literal(): assert isclose(get_scalar(parse_text("1.0E-1f")), 1.0e-1) assert get_scalar(parse_text("1.0E+1f")) == 1.0e1 + assert get_scalar(parse_text("3f16")) == 3.0 + assert get_scalar(parse_text("3f32")) == 3.0 + + assert_parses_as("3f16", relay.const(3.0, "float16")) + assert_parses_as("3f32", relay.const(3.0, "float32")) + assert_parses_as("3f", relay.const(3.0, "float32")) + assert_parses_as("3f64", relay.const(3.0, "float64")) + + with pytest.raises(tvm.error.DiagnosticError): + # Unrepresentable + parse_text("3.40283e+38f32") + with pytest.raises(tvm.error.DiagnosticError): + # Unrepresentable + 
parse_text("65505f16") + def test_bool_literal(): assert get_scalar(parse_text("True")) == True assert get_scalar(parse_text("False")) == False + assert_parses_as("True", relay.const(True, "bool")) + def test_negative(): # need to handle parsing non-literal operations @@ -993,4 +1030,4 @@ def @main(%x: Tensor[(2, 3), float32]) { if __name__ == "__main__": import sys - pytest.main(sys.argv) + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index 54e0e4c7ca44..60f611998649 100644 --- a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -47,16 +47,28 @@ def show(text): print(text) -# Commented due to weird memory allocation error -# def test_large_graph(): -# x = relay.var("x", shape=(3, 2)) -# y = relay.var("y") -# one = relay.const(10e10, dtype="float32") -# z = relay.add(x, one) -# for i in range(int(9e5)): -# z = relay.add(z, one) -# f = relay.Function([x, y], z) -# show(astext(f)) +def assert_prints_as(expr, str): + assert astext(expr) == SEMVER + str + + +def test_scalars(): + assert_prints_as(relay.const(42, "int16"), "42i16") + assert_prints_as(relay.const(42, "int32"), "42") + assert_prints_as(relay.const(42, "int64"), "42i64") + assert_prints_as(relay.const(3.0, "float16"), "3f16") + assert_prints_as(relay.const(3.0, "float32"), "3f") + assert_prints_as(relay.const(3.0, "float64"), "3f64") + + +def test_large_graph(): + x = relay.var("x", shape=(3, 2)) + y = relay.var("y") + one = relay.const(10e10, dtype="float32") + z = relay.add(x, one) + for i in range(int(9e4)): + z = relay.add(z, one) + f = relay.Function([x, y], z) + show(astext(f)) def test_func(): @@ -295,4 +307,7 @@ def test_slash_in_identifier(): if __name__ == "__main__": - pytest.main([__file__]) + import sys + import pytest + + sys.exit(pytest.main([__file__] + sys.argv[1:])) From 16c4faf86c584b22dbeaf304108cee5103ac23c2 Mon Sep 17 00:00:00 2001 From: 
Altan Haan <3124994+altanh@users.noreply.github.com> Date: Thu, 19 May 2022 09:15:18 -0700 Subject: [PATCH 34/59] nn.batch_flatten is a reshape op (#11367) --- src/relay/op/nn/nn.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 27f295b8b39d..234cafdca150 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -540,10 +540,12 @@ Example:: .add_argument("data", "Tensor", "The input tensor.") .set_support_level(2) .add_type_rel("BatchFlatten", BatchFlattenRel) - .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type) { - return Array{topi::nn::flatten(inputs[0])}; - }); + .set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_type) { + return Array{topi::nn::flatten(inputs[0])}; + }) + .set_attr("TReshapeOp", true); // relu TVM_REGISTER_GLOBAL("relay.op.nn._make.relu").set_body_typed([](Expr data) { From 8d0da24f12bdccd8b7d0d953c1280142c8600b4d Mon Sep 17 00:00:00 2001 From: Farshid Salemi Parizi Date: Thu, 19 May 2022 09:40:01 -0700 Subject: [PATCH 35/59] [Hexagon] moves conftest.py to tvm.contrib.hexagon so outside repos can access the testing fixtures (#11277) * adding pytest_plugin to python so other repos can access * import requires_hexagon_toolchain from tvm.contrib.hexagon.pytest_plugin --- python/tvm/contrib/hexagon/pytest_plugin.py | 236 ++++++++++++++++++ tests/python/contrib/test_hexagon/conftest.py | 212 +--------------- .../test_hexagon/test_2d_physical_buffers.py | 2 +- .../python/contrib/test_hexagon/test_usmp.py | 2 +- 4 files changed, 242 insertions(+), 210 deletions(-) create mode 100644 python/tvm/contrib/hexagon/pytest_plugin.py diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py new file mode 100644 index 000000000000..2c62a0a0b569 --- /dev/null +++ b/python/tvm/contrib/hexagon/pytest_plugin.py @@ -0,0 +1,236 @@ +# Licensed to the 
Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=invalid-name,redefined-outer-name +""" Hexagon testing fixtures used to deduce testing argument + values from testing parameters """ + +import os +import random +from typing import Optional, Union + +import pytest + +import tvm +import tvm.rpc.tracker +from tvm.contrib.hexagon.build import HexagonLauncher, HexagonLauncherRPC +from tvm.contrib.hexagon.session import Session + +HEXAGON_TOOLCHAIN = "HEXAGON_TOOLCHAIN" +TVM_TRACKER_HOST = "TVM_TRACKER_HOST" +TVM_TRACKER_PORT = "TVM_TRACKER_PORT" +ANDROID_REMOTE_DIR = "ANDROID_REMOTE_DIR" +ANDROID_SERIAL_NUMBER = "ANDROID_SERIAL_NUMBER" +ADB_SERVER_SOCKET = "ADB_SERVER_SOCKET" + + +@tvm.testing.fixture +def shape_nhwc(batch, in_channel, in_size): + return (batch, in_size, in_size, in_channel) + + +def _compose(args, decs): + """Helper to apply multiple markers""" + if len(args) > 0: + func = args[0] + for dec in reversed(decs): + func = dec(func) + return func + return decs + + +def requires_hexagon_toolchain(*args): + _requires_hexagon_toolchain = [ + pytest.mark.skipif( + os.environ.get(HEXAGON_TOOLCHAIN) is None, + reason=f"Missing environment variable {HEXAGON_TOOLCHAIN}.", + ), + ] + + return _compose(args, 
_requires_hexagon_toolchain) + + +@tvm.testing.fixture +def android_serial_number() -> Optional[str]: + serial = os.getenv(ANDROID_SERIAL_NUMBER, default="") + # Setting ANDROID_SERIAL_NUMBER to an empty string should be + # equivalent to having it unset. + if not serial.strip(): + serial = None + return serial + + +# NOTE on server ports: +# These tests use different port numbers for the RPC server (7070 + ...). +# The reason is that an RPC session cannot be gracefully closed without +# triggering TIME_WAIT state on the server socket. This prevents another +# server to bind to the same port until the wait time elapses. + +LISTEN_PORT_MIN = 2000 # Well above the privileged ports (1024 or lower) +LISTEN_PORT_MAX = 9000 # Below the search range end (port_end=9199) of RPC server +PREVIOUS_PORT = None + + +def get_free_port() -> int: + """Return the next port that is available to listen on""" + global PREVIOUS_PORT + if PREVIOUS_PORT is None: + port = random.randint(LISTEN_PORT_MIN, LISTEN_PORT_MAX) + else: + port = PREVIOUS_PORT + 1 + + while tvm.contrib.hexagon.build._is_port_in_use(port): + port = port + 1 if port < LISTEN_PORT_MAX else LISTEN_PORT_MIN + + PREVIOUS_PORT = port + return port + + +@pytest.fixture(scope="session") +def _tracker_info() -> Union[str, int]: + env_tracker_host = os.getenv(TVM_TRACKER_HOST, default="") + env_tracker_port = os.getenv(TVM_TRACKER_PORT, default="") + + if env_tracker_host or env_tracker_port: + # A tracker is already running, and we should connect to it + # when running tests. 
+ assert env_tracker_host, "TVM_TRACKER_PORT is defined, but TVM_TRACKER_HOST is not" + assert env_tracker_port, "TVM_TRACKER_HOST is defined, but TVM_TRACKER_PORT is not" + env_tracker_port = int(env_tracker_port) + + try: + tvm.rpc.connect_tracker(env_tracker_host, env_tracker_port) + except RuntimeError as exc: + message = ( + "Could not connect to external tracker " + "specified by $TVM_TRACKER_HOST and $TVM_TRACKER_PORT " + f"({env_tracker_host}:{env_tracker_port})" + ) + raise RuntimeError(message) from exc + + yield (env_tracker_host, env_tracker_port) + + else: + # No tracker is provided to the tests, so we should start one + # for the tests to use. + tracker = tvm.rpc.tracker.Tracker("127.0.0.1", get_free_port()) + try: + yield (tracker.host, tracker.port) + finally: + tracker.terminate() + + +@pytest.fixture(scope="session") +def tvm_tracker_host(_tracker_info) -> str: + host, _ = _tracker_info + return host + + +@pytest.fixture(scope="session") +def tvm_tracker_port(_tracker_info) -> int: + _, port = _tracker_info + return port + + +@tvm.testing.fixture +def rpc_server_port() -> int: + return get_free_port() + + +@tvm.testing.fixture +def adb_server_socket() -> str: + return os.getenv(ADB_SERVER_SOCKET, default="tcp:5037") + + +@tvm.testing.fixture +def hexagon_launcher( + request, android_serial_number, rpc_server_port, adb_server_socket +) -> HexagonLauncherRPC: + """Initials and returns hexagon launcher if ANDROID_SERIAL_NUMBER is defined""" + if android_serial_number is None: + yield None + else: + # Requesting these fixtures sets up a local tracker, if one + # hasn't been provided to us. Delaying the evaluation of + # these fixtures avoids starting a tracker unless necessary. 
+ tvm_tracker_host = request.getfixturevalue("tvm_tracker_host") + tvm_tracker_port = request.getfixturevalue("tvm_tracker_port") + + rpc_info = { + "rpc_tracker_host": tvm_tracker_host, + "rpc_tracker_port": tvm_tracker_port, + "rpc_server_port": rpc_server_port, + "adb_server_socket": adb_server_socket, + } + launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) + launcher.start_server() + try: + yield launcher + finally: + launcher.stop_server() + + +@tvm.testing.fixture +def hexagon_session(hexagon_launcher) -> Session: + if hexagon_launcher is None: + yield None + else: + with hexagon_launcher.start_session() as session: + yield session + + +# If the execution aborts while an RPC server is running, the python +# code that is supposed to shut it down will never execute. This will +# keep pytest from terminating (indefinitely), so add a cleanup +# fixture to terminate any still-running servers. +@pytest.fixture(scope="session", autouse=True) +def terminate_rpc_servers(): + # Since this is a fixture that runs regardless of whether the + # execution happens on simulator or on target, make sure the + # yield happens every time. + serial = os.environ.get(ANDROID_SERIAL_NUMBER) + yield [] + if serial == "simulator": + os.system("ps ax | grep tvm_rpc_x86 | awk '{print $1}' | xargs kill") + + +aot_host_target = tvm.testing.parameter( + "c", + "llvm -keys=hexagon -link-params=0 " + "-mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp " + "-mcpu=hexagonv68 -mtriple=hexagon", +) + + +@tvm.testing.fixture +def aot_target(aot_host_target): + if aot_host_target == "c": + yield tvm.target.hexagon("v68") + elif aot_host_target.startswith("llvm"): + yield aot_host_target + else: + assert False, "Incorrect AoT host target: {aot_host_target}. Options are [c, llvm]." 
+ + +def pytest_addoption(parser): + parser.addoption("--gtest_args", action="store", default="") + + +def pytest_generate_tests(metafunc): + option_value = metafunc.config.option.gtest_args + if "gtest_args" in metafunc.fixturenames and option_value is not None: + metafunc.parametrize("gtest_args", [option_value]) diff --git a/tests/python/contrib/test_hexagon/conftest.py b/tests/python/contrib/test_hexagon/conftest.py index f76181e06d0e..3b057384df37 100644 --- a/tests/python/contrib/test_hexagon/conftest.py +++ b/tests/python/contrib/test_hexagon/conftest.py @@ -18,216 +18,12 @@ """ Hexagon testing fixtures used to deduce testing argument values from testing parameters """ -import os -import random -import socket -from typing import Optional, Union import pytest import tvm -import tvm.rpc.tracker -from tvm.contrib.hexagon.build import HexagonLauncher, HexagonLauncherRPC -from tvm.contrib.hexagon.session import Session +import tvm.testing -HEXAGON_TOOLCHAIN = "HEXAGON_TOOLCHAIN" -TVM_TRACKER_HOST = "TVM_TRACKER_HOST" -TVM_TRACKER_PORT = "TVM_TRACKER_PORT" -ANDROID_REMOTE_DIR = "ANDROID_REMOTE_DIR" -ANDROID_SERIAL_NUMBER = "ANDROID_SERIAL_NUMBER" -ADB_SERVER_SOCKET = "ADB_SERVER_SOCKET" - - -@tvm.testing.fixture -def shape_nhwc(batch, in_channel, in_size): - return (batch, in_size, in_size, in_channel) - - -def _compose(args, decs): - """Helper to apply multiple markers""" - if len(args) > 0: - f = args[0] - for d in reversed(decs): - f = d(f) - return f - return decs - - -def requires_hexagon_toolchain(*args): - _requires_hexagon_toolchain = [ - pytest.mark.skipif( - os.environ.get(HEXAGON_TOOLCHAIN) == None, - reason=f"Missing environment variable {HEXAGON_TOOLCHAIN}.", - ), - ] - - return _compose(args, _requires_hexagon_toolchain) - - -@tvm.testing.fixture -def android_serial_number() -> Optional[str]: - serial = os.getenv(ANDROID_SERIAL_NUMBER, default="") - # Setting ANDROID_SERIAL_NUMBER to an empty string should be - # equivalent to having it unset. 
- if not serial.strip(): - serial = None - return serial - - -# NOTE on server ports: -# These tests use different port numbers for the RPC server (7070 + ...). -# The reason is that an RPC session cannot be gracefully closed without -# triggering TIME_WAIT state on the server socket. This prevents another -# server to bind to the same port until the wait time elapses. - -listen_port_min = 2000 # Well above the privileged ports (1024 or lower) -listen_port_max = 9000 # Below the search range end (port_end=9199) of RPC server -previous_port = None - - -def get_free_port() -> int: - - global previous_port - if previous_port is None: - port = random.randint(listen_port_min, listen_port_max) - else: - port = previous_port + 1 - - while tvm.contrib.hexagon.build._is_port_in_use(port): - port = port + 1 if port < listen_port_max else listen_port_min - - previous_port = port - return port - - -@pytest.fixture(scope="session") -def _tracker_info() -> Union[str, int]: - env_tracker_host = os.getenv(TVM_TRACKER_HOST, default="") - env_tracker_port = os.getenv(TVM_TRACKER_PORT, default="") - - if env_tracker_host or env_tracker_port: - # A tracker is already running, and we should connect to it - # when running tests. - assert env_tracker_host, "TVM_TRACKER_PORT is defined, but TVM_TRACKER_HOST is not" - assert env_tracker_port, "TVM_TRACKER_HOST is defined, but TVM_TRACKER_PORT is not" - env_tracker_port = int(env_tracker_port) - - try: - tvm.rpc.connect_tracker(env_tracker_host, env_tracker_port) - except RuntimeError as exc: - message = ( - "Could not connect to external tracker " - "specified by $TVM_TRACKER_HOST and $TVM_TRACKER_PORT " - f"({env_tracker_host}:{env_tracker_port})" - ) - raise RuntimeError(message) from exc - - yield (env_tracker_host, env_tracker_port) - - else: - # No tracker is provided to the tests, so we should start one - # for the tests to use. 
- tracker = tvm.rpc.tracker.Tracker("127.0.0.1", get_free_port()) - try: - yield (tracker.host, tracker.port) - finally: - tracker.terminate() - - -@pytest.fixture(scope="session") -def tvm_tracker_host(_tracker_info) -> str: - host, port = _tracker_info - return host - - -@pytest.fixture(scope="session") -def tvm_tracker_port(_tracker_info) -> int: - host, port = _tracker_info - return port - - -@tvm.testing.fixture -def rpc_server_port() -> int: - return get_free_port() - - -@tvm.testing.fixture -def adb_server_socket() -> str: - return os.getenv(ADB_SERVER_SOCKET, default="tcp:5037") - - -@tvm.testing.fixture -def hexagon_launcher( - request, android_serial_number, rpc_server_port, adb_server_socket -) -> HexagonLauncherRPC: - if android_serial_number is None: - yield None - else: - # Requesting these fixtures sets up a local tracker, if one - # hasn't been provided to us. Delaying the evaluation of - # these fixtures avoids starting a tracker unless necessary. - tvm_tracker_host = request.getfixturevalue("tvm_tracker_host") - tvm_tracker_port = request.getfixturevalue("tvm_tracker_port") - - rpc_info = { - "rpc_tracker_host": tvm_tracker_host, - "rpc_tracker_port": tvm_tracker_port, - "rpc_server_port": rpc_server_port, - "adb_server_socket": adb_server_socket, - } - launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) - launcher.start_server() - try: - yield launcher - finally: - launcher.stop_server() - - -@tvm.testing.fixture -def hexagon_session(hexagon_launcher) -> Session: - if hexagon_launcher is None: - yield None - else: - with hexagon_launcher.start_session() as session: - yield session - - -# If the execution aborts while an RPC server is running, the python -# code that is supposed to shut it dowm will never execute. This will -# keep pytest from terminating (indefinitely), so add a cleanup -# fixture to terminate any still-running servers. 
-@pytest.fixture(scope="session", autouse=True) -def terminate_rpc_servers(): - # Since this is a fixture that runs regardless of whether the - # execution happens on simulator or on target, make sure the - # yield happens every time. - serial = os.environ.get(ANDROID_SERIAL_NUMBER) - yield [] - if serial == "simulator": - os.system("ps ax | grep tvm_rpc_x86 | awk '{print $1}' | xargs kill") - - -aot_host_target = tvm.testing.parameter( - "c", - "llvm -keys=hexagon -link-params=0 -mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp -mcpu=hexagonv68 -mtriple=hexagon", -) - - -@tvm.testing.fixture -def aot_target(aot_host_target): - if aot_host_target == "c": - yield tvm.target.hexagon("v68") - elif aot_host_target.startswith("llvm"): - yield aot_host_target - else: - assert False, "Incorrect AoT host target: {aot_host_target}. Options are [c, llvm]." - - -def pytest_addoption(parser): - parser.addoption("--gtest_args", action="store", default="") - - -def pytest_generate_tests(metafunc): - option_value = metafunc.config.option.gtest_args - if "gtest_args" in metafunc.fixturenames and option_value is not None: - metafunc.parametrize("gtest_args", [option_value]) +pytest_plugins = [ + "tvm.contrib.hexagon.pytest_plugin", +] diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py index 78e1eb11ad9f..787d71fa1713 100644 --- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py +++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py @@ -29,7 +29,7 @@ from tvm.tir.stmt_functor import post_order_visit from tvm.contrib.hexagon.build import HexagonLauncher -from .conftest import requires_hexagon_toolchain +from tvm.contrib.hexagon.pytest_plugin import requires_hexagon_toolchain from .infrastructure import allocate_hexagon_array # Needed to register the link_shared packedfunc. 
diff --git a/tests/python/contrib/test_hexagon/test_usmp.py b/tests/python/contrib/test_hexagon/test_usmp.py index 116ecb4154dd..03badfb655d9 100644 --- a/tests/python/contrib/test_hexagon/test_usmp.py +++ b/tests/python/contrib/test_hexagon/test_usmp.py @@ -26,7 +26,7 @@ from tvm.contrib.hexagon.session import Session from tvm.testing.usmp import is_tvm_backendallocworkspace_calls -from .conftest import requires_hexagon_toolchain +from tvm.contrib.hexagon.pytest_plugin import requires_hexagon_toolchain usmp_enabled = tvm.testing.parameter(False, True) From cd269101b7c508f5432ad4aee3c1ff8d07a89142 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 19 May 2022 13:52:18 -0700 Subject: [PATCH 36/59] [ci] Use S3 for artifacts (#11349) Co-authored-by: driazati --- Jenkinsfile | 604 +++++++++++++++++++++++++++++++++++------ jenkins/Jenkinsfile.j2 | 125 ++++----- jenkins/macros.j2 | 32 +++ 3 files changed, 598 insertions(+), 163 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 424f97494d76..024b920ac676 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-17T17:26:21.660243 +// Generated at 2022-05-19T11:41:58.421857 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -482,53 +482,9 @@ def make(docker_type, path, make_flag) { } } -// Specifications to Jenkins "stash" command for use with various pack_ and unpack_ functions. -tvm_runtime = 'build/libtvm_runtime.so, build/config.cmake' // use libtvm_runtime.so. -tvm_lib = 'build/libtvm.so, ' + tvm_runtime // use libtvm.so to run the full compiler. 
-// LLVM upstream lib -tvm_multilib = 'build/libtvm.so, ' + - 'build/libvta_fsim.so, ' + - tvm_runtime - -tvm_multilib_tsim = 'build/libvta_tsim.so, ' + - tvm_multilib - -microtvm_tar_gz = 'build/microtvm_template_projects.tar.gz' - -// pack libraries for later use -def pack_lib(name, libs) { - sh (script: """ - echo "Packing ${libs} into ${name}" - echo ${libs} | sed -e 's/,/ /g' | xargs md5sum - """, label: 'Stash libraries and show md5') - stash includes: libs, name: name -} +// Filenames for stashing between build and test steps +s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}" -// unpack libraries saved before -def unpack_lib(name, libs) { - unstash name - sh (script: """ - echo "Unpacked ${libs} from ${name}" - echo ${libs} | sed -e 's/,/ /g' | xargs md5sum - """, label: 'Unstash libraries and show md5') -} - -// compress microtvm template projects and pack the tar. -def pack_microtvm_template_projects(name) { - sh( - script: 'cd build && tar -czvf microtvm_template_projects.tar.gz microtvm_template_projects/', - label: 'Compress microtvm_template_projects' - ) - pack_lib(name + '-microtvm-libs', microtvm_tar_gz) -} - -def unpack_microtvm_template_projects(name) { - unpack_lib(name + '-microtvm-libs', microtvm_tar_gz) - sh( - script: 'cd build && tar -xzvf microtvm_template_projects.tar.gz', - label: 'Unpack microtvm_template_projects' - ) -} def ci_setup(image) { sh ( @@ -565,24 +521,63 @@ def cpp_unittest(image) { ) } + +def add_microtvm_permissions() { + sh( + script: 'find build/microtvm_template_projects -type f | xargs chmod +x', + label: 'Add execute permissions for microTVM files', + ) +} + + def build() { stage('Build') { environment { SKIP_SLOW_TESTS = "${skip_slow_tests}" } - parallel 'BUILD: GPU': { + parallel( + 'BUILD: GPU': { if (!skip_ci) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-gpu") { init_git() sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh 
build" make("${ci_gpu} --no-gpu", 'build', '-j2') - pack_lib('gpu', tvm_multilib) - pack_microtvm_template_projects('gpu') + sh( + script: """ + set -eux + md5sum build/libtvm.so + aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu/build/libtvm.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/gpu/build/libvta_fsim.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/gpu/build/libtvm_runtime.so + md5sum build/config.cmake + aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/gpu/build/config.cmake + aws s3 cp --no-progress build/microtvm_template_projects s3://${s3_prefix}/gpu/build/microtvm_template_projects --recursive + """, + label: 'Upload artifacts to S3', + ) + + // compiler test sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2" make("${ci_gpu} --no-gpu", 'build2', '-j2') - pack_lib('gpu2', tvm_multilib) + sh( + script: """ + set -eux + md5sum build/libtvm.so + aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu2/build/libtvm.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/gpu2/build/libvta_fsim.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/gpu2/build/libtvm_runtime.so + md5sum build/config.cmake + aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/gpu2/build/config.cmake + """, + label: 'Upload artifacts to S3', + ) + } } } @@ -597,7 +592,23 @@ stage('Build') { label: 'Create CPU cmake config', ) make(ci_cpu, 'build', '-j2') - pack_lib('cpu', tvm_multilib_tsim) + sh( + script: """ + set -eux + md5sum build/libvta_tsim.so + aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/cpu/build/libvta_tsim.so + md5sum build/libtvm.so + aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cpu/build/libtvm.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress 
build/libvta_fsim.so s3://${s3_prefix}/cpu/build/libvta_fsim.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/cpu/build/libtvm_runtime.so + md5sum build/config.cmake + aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/cpu/build/config.cmake + """, + label: 'Upload artifacts to S3', + ) + timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_cpu) // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" @@ -644,7 +655,23 @@ stage('Build') { label: 'Create i386 cmake config', ) make(ci_i386, 'build', '-j2') - pack_lib('i386', tvm_multilib_tsim) + sh( + script: """ + set -eux + md5sum build/libvta_tsim.so + aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/i386/build/libvta_tsim.so + md5sum build/libtvm.so + aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/i386/build/libtvm.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/i386/build/libvta_fsim.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/i386/build/libtvm_runtime.so + md5sum build/config.cmake + aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/i386/build/config.cmake + """, + label: 'Upload artifacts to S3', + ) + } } } else { @@ -661,7 +688,21 @@ stage('Build') { label: 'Create ARM cmake config', ) make(ci_arm, 'build', '-j4') - pack_lib('arm', tvm_multilib) + sh( + script: """ + set -eux + md5sum build/libtvm.so + aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/arm/build/libtvm.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/arm/build/libvta_fsim.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/arm/build/libtvm_runtime.so + md5sum build/config.cmake + aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/arm/build/config.cmake + """, + label: 'Upload artifacts to S3', + ) + } } } else 
{ @@ -678,8 +719,20 @@ stage('Build') { label: 'Create QEMU cmake config', ) make(ci_qemu, 'build', '-j2') - pack_lib('qemu', tvm_lib) - pack_microtvm_template_projects('qemu') + sh( + script: """ + set -eux + md5sum build/libtvm.so + aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/qemu/build/libtvm.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/qemu/build/libtvm_runtime.so + md5sum build/config.cmake + aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/qemu/build/config.cmake + aws s3 cp --no-progress build/microtvm_template_projects s3://${s3_prefix}/qemu/build/microtvm_template_projects --recursive + """, + label: 'Upload artifacts to S3', + ) + } } } else { @@ -696,13 +749,26 @@ stage('Build') { label: 'Create Hexagon cmake config', ) make(ci_hexagon, 'build', '-j2') - pack_lib('hexagon', tvm_lib) + sh( + script: """ + set -eux + md5sum build/libtvm.so + aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/hexagon/build/libtvm.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/hexagon/build/libtvm_runtime.so + md5sum build/config.cmake + aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/hexagon/build/config.cmake + """, + label: 'Upload artifacts to S3', + ) + } } } else { Utils.markStageSkippedForConditional('BUILD: Hexagon') } - } + }, + ) } } @@ -726,10 +792,38 @@ stage('Test') { 'PLATFORM=gpu', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=0'], { - unpack_lib('gpu2', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress 
s3://${s3_prefix}/gpu2/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + cpp_unittest(ci_gpu) - unpack_lib('gpu', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_gpu) cpp_unittest(ci_gpu) sh ( @@ -762,7 +856,21 @@ stage('Test') { 'PLATFORM=gpu', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=1'], { - unpack_lib('gpu', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh", @@ -798,7 +906,23 @@ stage('Test') { 'PLATFORM=cpu', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=0'], { - unpack_lib('cpu', tvm_multilib_tsim) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so + md5sum build/libvta_tsim.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress 
s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_cpu) sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", @@ -826,7 +950,23 @@ stage('Test') { 'PLATFORM=cpu', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=1'], { - unpack_lib('cpu', tvm_multilib_tsim) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so + md5sum build/libvta_tsim.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_cpu) sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", @@ -851,7 +991,23 @@ stage('Test') { try { init_git() withEnv(['PLATFORM=cpu'], { - unpack_lib('cpu', tvm_multilib_tsim) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so + md5sum build/libvta_tsim.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum 
build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_cpu) cpp_unittest(ci_cpu) python_unittest(ci_cpu) @@ -882,7 +1038,21 @@ stage('Test') { 'PLATFORM=i386', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=0'], { - unpack_lib('i386', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_i386) cpp_unittest(ci_i386) python_unittest(ci_i386) @@ -913,7 +1083,21 @@ stage('Test') { 'PLATFORM=i386', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=1'], { - unpack_lib('i386', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_i386) python_unittest(ci_i386) sh ( @@ -943,7 +1127,21 @@ stage('Test') { 'PLATFORM=i386', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=2'], { - unpack_lib('i386', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so + md5sum 
build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_i386) python_unittest(ci_i386) sh ( @@ -973,7 +1171,19 @@ stage('Test') { 'PLATFORM=hexagon', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=0'], { - unpack_lib('hexagon', tvm_lib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_hexagon) cpp_unittest(ci_hexagon) sh ( @@ -1006,7 +1216,19 @@ stage('Test') { 'PLATFORM=hexagon', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=1'], { - unpack_lib('hexagon', tvm_lib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_hexagon) sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh", @@ -1038,7 +1260,19 @@ stage('Test') { 'PLATFORM=hexagon', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=2'], { - unpack_lib('hexagon', tvm_lib) + sh( + script: """ + set -eux + aws s3 cp --no-progress 
s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_hexagon) sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh", @@ -1070,7 +1304,19 @@ stage('Test') { 'PLATFORM=hexagon', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=3'], { - unpack_lib('hexagon', tvm_lib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_hexagon) sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh", @@ -1099,8 +1345,21 @@ stage('Test') { try { init_git() withEnv(['PLATFORM=qemu'], { - unpack_lib('qemu', tvm_lib) - unpack_microtvm_template_projects('qemu') + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/qemu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/qemu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/qemu/build/config.cmake build/config.cmake + md5sum build/config.cmake + aws s3 cp --no-progress s3://${s3_prefix}/qemu/build/microtvm_template_projects build/microtvm_template_projects --recursive + """, + label: 'Download artifacts from S3', + ) + + add_microtvm_permissions() ci_setup(ci_qemu) cpp_unittest(ci_qemu) sh ( @@ -1130,7 +1389,21 @@ 
stage('Test') { try { init_git() withEnv(['PLATFORM=arm'], { - unpack_lib('arm', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_arm) cpp_unittest(ci_arm) sh ( @@ -1163,7 +1436,21 @@ stage('Test') { 'PLATFORM=arm', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=0'], { - unpack_lib('arm', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_arm) python_unittest(ci_arm) sh ( @@ -1192,7 +1479,21 @@ stage('Test') { 'PLATFORM=arm', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=1'], { - unpack_lib('arm', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress 
s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_arm) python_unittest(ci_arm) sh ( @@ -1221,7 +1522,21 @@ stage('Test') { 'PLATFORM=gpu', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=0'], { - unpack_lib('gpu', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", @@ -1249,7 +1564,21 @@ stage('Test') { 'PLATFORM=gpu', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=1'], { - unpack_lib('gpu', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", @@ -1277,7 +1606,21 @@ stage('Test') { 'PLATFORM=gpu', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=0'], { - unpack_lib('gpu', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + md5sum 
build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", @@ -1305,7 +1648,21 @@ stage('Test') { 'PLATFORM=gpu', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=1'], { - unpack_lib('gpu', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", @@ -1333,7 +1690,21 @@ stage('Test') { 'PLATFORM=gpu', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=2'], { - unpack_lib('gpu', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + 
ci_setup(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", @@ -1358,7 +1729,21 @@ stage('Test') { try { init_git() withEnv(['PLATFORM=cpu'], { - unpack_lib('cpu', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_cpu) sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh", @@ -1383,7 +1768,21 @@ stage('Test') { try { init_git() withEnv(['PLATFORM=arm'], { - unpack_lib('arm', tvm_multilib) + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + md5sum build/config.cmake + """, + label: 'Download artifacts from S3', + ) + ci_setup(ci_arm) sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh", @@ -1405,8 +1804,23 @@ stage('Test') { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/docs-python-gpu") { init_git() - unpack_lib('gpu', tvm_multilib) - unpack_microtvm_template_projects('gpu') + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + md5sum build/libtvm.so + aws s3 cp 
--no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + md5sum build/libtvm_runtime.so + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + md5sum build/config.cmake + aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/microtvm_template_projects build/microtvm_template_projects --recursive + """, + label: 'Download artifacts from S3', + ) + + add_microtvm_permissions() timeout(time: 180, unit: 'MINUTES') { ci_setup(ci_gpu) sh ( @@ -1414,7 +1828,15 @@ stage('Test') { label: 'Build docs', ) } - pack_lib('docs', 'docs.tgz') + sh( + script: """ + set -eux + md5sum docs.tgz + aws s3 cp --no-progress docs.tgz s3://${s3_prefix}/docs/docs.tgz + """, + label: 'Upload artifacts to S3', + ) + archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true) } } @@ -1489,7 +1911,15 @@ stage('Deploy') { if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') { node('CPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") { - unpack_lib('docs', 'docs.tgz') + sh( + script: """ + set -eux + aws s3 cp --no-progress s3://${s3_prefix}/docs/docs.tgz docs.tgz + md5sum docs.tgz + """, + label: 'Download artifacts from S3', + ) + deploy_docs() } } diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2 index f250ff12feed..8742d0724485 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/jenkins/Jenkinsfile.j2 @@ -399,53 +399,14 @@ def make(docker_type, path, make_flag) { } } -// Specifications to Jenkins "stash" command for use with various pack_ and unpack_ functions. -tvm_runtime = 'build/libtvm_runtime.so, build/config.cmake' // use libtvm_runtime.so. -tvm_lib = 'build/libtvm.so, ' + tvm_runtime // use libtvm.so to run the full compiler. 
-// LLVM upstream lib -tvm_multilib = 'build/libtvm.so, ' + - 'build/libvta_fsim.so, ' + - tvm_runtime - -tvm_multilib_tsim = 'build/libvta_tsim.so, ' + - tvm_multilib - -microtvm_tar_gz = 'build/microtvm_template_projects.tar.gz' - -// pack libraries for later use -def pack_lib(name, libs) { - sh (script: """ - echo "Packing ${libs} into ${name}" - echo ${libs} | sed -e 's/,/ /g' | xargs md5sum - """, label: 'Stash libraries and show md5') - stash includes: libs, name: name -} - -// unpack libraries saved before -def unpack_lib(name, libs) { - unstash name - sh (script: """ - echo "Unpacked ${libs} from ${name}" - echo ${libs} | sed -e 's/,/ /g' | xargs md5sum - """, label: 'Unstash libraries and show md5') -} +// Filenames for stashing between build and test steps +{% set tvm_runtime = ['build/libtvm_runtime.so', 'build/config.cmake'] %} +{% set tvm_lib = ['build/libtvm.so'] + tvm_runtime %} +{% set tvm_multilib = ['build/libtvm.so', 'build/libvta_fsim.so'] + tvm_runtime %} +{% set tvm_multilib_tsim = ['build/libvta_tsim.so'] + tvm_multilib %} +{% set microtvm_template_projects = ['build/microtvm_template_projects',] %} +s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}" -// compress microtvm template projects and pack the tar. 
-def pack_microtvm_template_projects(name) { - sh( - script: 'cd build && tar -czvf microtvm_template_projects.tar.gz microtvm_template_projects/', - label: 'Compress microtvm_template_projects' - ) - pack_lib(name + '-microtvm-libs', microtvm_tar_gz) -} - -def unpack_microtvm_template_projects(name) { - unpack_lib(name + '-microtvm-libs', microtvm_tar_gz) - sh( - script: 'cd build && tar -xzvf microtvm_template_projects.tar.gz', - label: 'Unpack microtvm_template_projects' - ) -} def ci_setup(image) { sh ( @@ -482,24 +443,36 @@ def cpp_unittest(image) { ) } + +def add_microtvm_permissions() { + {% for folder in microtvm_template_projects %} + sh( + script: 'find {{ folder }} -type f | xargs chmod +x', + label: 'Add execute permissions for microTVM files', + ) + {% endfor %} +} + + def build() { stage('Build') { environment { SKIP_SLOW_TESTS = "${skip_slow_tests}" } - parallel 'BUILD: GPU': { + parallel( + 'BUILD: GPU': { if (!skip_ci) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-gpu') }}) { init_git() sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build" make("${ci_gpu} --no-gpu", 'build', '-j2') - pack_lib('gpu', tvm_multilib) - pack_microtvm_template_projects('gpu') + {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} + // compiler test sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2" make("${ci_gpu} --no-gpu", 'build2', '-j2') - pack_lib('gpu2', tvm_multilib) + {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }} } } } @@ -514,7 +487,7 @@ stage('Build') { label: 'Create CPU cmake config', ) make(ci_cpu, 'build', '-j2') - pack_lib('cpu', tvm_multilib_tsim) + {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_cpu) // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" @@ -561,7 +534,7 @@ stage('Build') { label: 'Create i386 cmake config', ) make(ci_i386, 
'build', '-j2') - pack_lib('i386', tvm_multilib_tsim) + {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }} } } } else { @@ -578,7 +551,7 @@ stage('Build') { label: 'Create ARM cmake config', ) make(ci_arm, 'build', '-j4') - pack_lib('arm', tvm_multilib) + {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }} } } } else { @@ -595,8 +568,7 @@ stage('Build') { label: 'Create QEMU cmake config', ) make(ci_qemu, 'build', '-j2') - pack_lib('qemu', tvm_lib) - pack_microtvm_template_projects('qemu') + {{ m.upload_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }} } } } else { @@ -613,13 +585,14 @@ stage('Build') { label: 'Create Hexagon cmake config', ) make(ci_hexagon, 'build', '-j2') - pack_lib('hexagon', tvm_lib) + {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib) }} } } } else { Utils.markStageSkippedForConditional('BUILD: Hexagon') } - } + }, + ) } } @@ -640,14 +613,14 @@ stage('Test') { platform="gpu", ) %} {% if shard_index == 1 %} - unpack_lib('gpu2', tvm_multilib) + {{ m.download_artifacts(tag='gpu2', filenames=tvm_multilib) }} cpp_unittest(ci_gpu) - unpack_lib('gpu', tvm_multilib) + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} ci_setup(ci_gpu) cpp_unittest(ci_gpu) {% else %} - unpack_lib('gpu', tvm_multilib) + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} ci_setup(ci_gpu) {% endif %} {% if shard_index == 2 or num_shards < 2 %} @@ -672,7 +645,7 @@ stage('Test') { ws="tvm/integration-python-cpu", platform="cpu", ) %} - unpack_lib('cpu', tvm_multilib_tsim) + {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} ci_setup(ci_cpu) sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", @@ -685,7 +658,7 @@ stage('Test') { ws="tvm/ut-python-cpu", platform="cpu", ) %} - unpack_lib('cpu', tvm_multilib_tsim) + {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} ci_setup(ci_cpu) cpp_unittest(ci_cpu) python_unittest(ci_cpu) @@ -702,7 
+675,7 @@ stage('Test') { ws="tvm/integration-python-i386", platform="i386", ) %} - unpack_lib('i386', tvm_multilib) + {{ m.download_artifacts(tag='i386', filenames=tvm_multilib) }} ci_setup(ci_i386) {% if shard_index == 1 %} cpp_unittest(ci_i386) @@ -721,7 +694,7 @@ stage('Test') { platform="hexagon", num_shards=4, ) %} - unpack_lib('hexagon', tvm_lib) + {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib) }} ci_setup(ci_hexagon) {% if shard_index == 1 %} cpp_unittest(ci_hexagon) @@ -741,8 +714,8 @@ stage('Test') { ws="tvm/test-qemu", platform="qemu", ) %} - unpack_lib('qemu', tvm_lib) - unpack_microtvm_template_projects('qemu') + {{ m.download_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }} + add_microtvm_permissions() ci_setup(ci_qemu) cpp_unittest(ci_qemu) sh ( @@ -760,7 +733,7 @@ stage('Test') { ws="tvm/ut-python-arm", platform="arm", ) %} - unpack_lib('arm', tvm_multilib) + {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} ci_setup(ci_arm) cpp_unittest(ci_arm) sh ( @@ -778,7 +751,7 @@ stage('Test') { node="ARM", ws="tvm/ut-python-arm", platform="arm", ) %} - unpack_lib('arm', tvm_multilib) + {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} ci_setup(ci_arm) python_unittest(ci_arm) sh ( @@ -793,7 +766,7 @@ stage('Test') { ws="tvm/topi-python-gpu", platform="gpu", ) %} - unpack_lib('gpu', tvm_multilib) + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} ci_setup(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", @@ -806,7 +779,7 @@ stage('Test') { ws="tvm/frontend-python-gpu", platform="gpu", ) %} - unpack_lib('gpu', tvm_multilib) + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} ci_setup(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", @@ -819,7 +792,7 @@ stage('Test') { ws="tvm/frontend-python-cpu", platform="cpu", ) %} - unpack_lib('cpu', tvm_multilib) + {{ m.download_artifacts(tag='cpu', 
filenames=tvm_multilib) }} ci_setup(ci_cpu) sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh", @@ -832,7 +805,7 @@ stage('Test') { ws="tvm/frontend-python-arm", platform="arm", ) %} - unpack_lib('arm', tvm_multilib) + {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} ci_setup(ci_arm) sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh", @@ -844,8 +817,8 @@ stage('Test') { node('GPU') { ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) { init_git() - unpack_lib('gpu', tvm_multilib) - unpack_microtvm_template_projects('gpu') + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} + add_microtvm_permissions() timeout(time: 180, unit: 'MINUTES') { ci_setup(ci_gpu) sh ( @@ -853,7 +826,7 @@ stage('Test') { label: 'Build docs', ) } - pack_lib('docs', 'docs.tgz') + {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }} archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true) } } @@ -928,7 +901,7 @@ stage('Deploy') { if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') { node('CPU') { ws({{ m.per_exec_ws('tvm/deploy-docs') }}) { - unpack_lib('docs', 'docs.tgz') + {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }} deploy_docs() } } diff --git a/jenkins/macros.j2 b/jenkins/macros.j2 index de33a203f603..2ce005a128ef 100644 --- a/jenkins/macros.j2 +++ b/jenkins/macros.j2 @@ -89,3 +89,35 @@ } }, {% endmacro %} + +{% macro upload_artifacts(tag, filenames, folders=[]) %} +sh( + script: """ + set -eux + {% for filename in filenames %} + md5sum {{ filename }} + aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }} + {% endfor %} + {% for folder in (folders or []) %} + aws s3 cp --no-progress {{ folder }} s3://${s3_prefix}/{{ tag }}/{{ folder }} --recursive + {% endfor %} + """, + label: 'Upload artifacts to S3', + ) +{% endmacro %} + +{% macro download_artifacts(tag, filenames, folders=None) %} +sh( + 
script: """ + set -eux + {% for filename in filenames %} + aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ filename }} {{ filename }} + md5sum {{ filename }} + {% endfor %} + {% for folder in (folders or []) %} + aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ folder }} {{ folder }} --recursive + {% endfor %} + """, + label: 'Download artifacts from S3', + ) +{% endmacro %} From 5e29dddd02193a440c18a1d98fef9023cb008788 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Thu, 19 May 2022 16:09:51 -0700 Subject: [PATCH 37/59] [microTVM][ARM] Add Relay tests for conv2d registered schedules (#11250) * Added conv2d relay test for each schedule * Enable relay tests in qemu * split aot test utils --- python/tvm/autotvm/__init__.py | 1 + python/tvm/autotvm/task/__init__.py | 1 + python/tvm/autotvm/task/dispatcher.py | 53 +++ python/tvm/micro/testing/aot_test_utils.py | 105 ++++ .../micro/{testing.py => testing/utils.py} | 0 .../tvm/testing/aot.py | 450 ++++++++---------- tests/micro/zephyr/test_utils.py | 2 +- tests/micro/zephyr/test_zephyr.py | 2 +- tests/micro/zephyr/test_zephyr_aot.py | 1 - .../contrib/test_cmsisnn/test_binary_ops.py | 6 +- .../contrib/test_cmsisnn/test_conv2d.py | 10 +- .../test_cmsisnn/test_fully_connected.py | 9 +- .../test_cmsisnn/test_invalid_graphs.py | 7 +- .../contrib/test_cmsisnn/test_networks.py | 10 +- .../contrib/test_cmsisnn/test_pooling.py | 11 +- .../contrib/test_cmsisnn/test_softmax.py | 9 +- tests/python/contrib/test_ethosu/infra.py | 2 +- .../contrib/test_ethosu/test_codegen.py | 2 +- .../contrib/test_ethosu/test_networks.py | 2 +- .../integration/test_arm_mprofile_dsp.py | 8 +- tests/python/relay/aot/test_c_device_api.py | 8 +- tests/python/relay/aot/test_cpp_aot.py | 3 +- tests/python/relay/aot/test_crt_aot.py | 6 +- tests/python/relay/aot/test_crt_aot_usmp.py | 5 +- .../strategy/arm_cpu/test_conv2d_nchw.py | 110 +++++ .../strategy/arm_cpu/test_conv2d_nhwc.py | 154 ++++++ .../strategy/arm_cpu/test_depthwise_conv2d.py | 
153 ++++++ .../strategy/arm_cpu/test_group_conv2d.py | 151 ++++++ tests/python/relay/utils/external_codegen.py | 3 +- tests/python/unittest/test_crt.py | 2 +- tests/scripts/task_python_microtvm.sh | 2 + 31 files changed, 961 insertions(+), 327 deletions(-) create mode 100644 python/tvm/micro/testing/aot_test_utils.py rename python/tvm/micro/{testing.py => testing/utils.py} (100%) rename tests/python/relay/aot/aot_test_utils.py => python/tvm/testing/aot.py (72%) create mode 100644 tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py create mode 100644 tests/python/relay/strategy/arm_cpu/test_group_conv2d.py diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py index a3c59252b01a..5a7d00960ecd 100644 --- a/python/tvm/autotvm/__init__.py +++ b/python/tvm/autotvm/__init__.py @@ -60,5 +60,6 @@ FallbackContext, ApplyHistoryBest as apply_history_best, ApplyGraphBest as apply_graph_best, + ApplyFixedConfig as apply_fixed_config, ) from .env import GLOBAL_SCOPE diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py index 6eea62264d7d..3949d324c4df 100644 --- a/python/tvm/autotvm/task/__init__.py +++ b/python/tvm/autotvm/task/__init__.py @@ -36,6 +36,7 @@ from .dispatcher import ( DispatchContext, ApplyConfig, + ApplyFixedConfig, ApplyHistoryBest, FallbackContext, clear_fallback_cache, diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index 6c072dc1fa17..11a608d4cbbf 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -31,6 +31,8 @@ from __future__ import absolute_import as _abs import logging +import typing +from typing import Union from collections.abc import Iterable import numpy as np @@ -179,6 +181,57 @@ def update(self, target, workload, cfg): self._config = cfg +class 
ApplyFixedConfig(DispatchContext): + """Apply a config of a deterministic schedule. + This is used for building a single Relay operator with deterministic schedule + for testing schedules at Relay level. + + Parameters + ---------- + tasks : list[tvm.autotvm.task.task.Task] + List of autoTVM tasks. + schedule_names : str, List[str] + Name of schedules to use. + """ + + def __init__(self, tasks, schedule_names: Union[str, typing.List[str]]): + super(ApplyFixedConfig, self).__init__() + if isinstance(schedule_names, str): + self._schedule_names = list(schedule_names) + elif isinstance(schedule_names, list): + self._schedule_names = schedule_names + else: + raise RuntimeError("Incorrect type: " + schedule_names) + self._tasks = tasks + self.workload = None + + def _query_inside(self, target, workload): + """Override query""" + self.workload = workload + + # Create a config from correct task + for task in self._tasks: + if task.name == workload[0]: + config = task.config_space.get(0) + break + + if not config: + raise RuntimeError( + "workload: %s does not exist in %s" % (str(workload), str(self._tasks)) + ) + # Add low cost to the target schedule and high cost to others. + if workload[0] in self._schedule_names: + config.cost = 1e-6 + else: + config.cost = 100000 + return config + + def update(self, target, workload, cfg): + """Override update""" + self.workload = workload + self._config = cfg + + class ApplyHistoryBest(DispatchContext): """ Apply the history best config diff --git a/python/tvm/micro/testing/aot_test_utils.py b/python/tvm/micro/testing/aot_test_utils.py new file mode 100644 index 000000000000..82ac1ac68e9d --- /dev/null +++ b/python/tvm/micro/testing/aot_test_utils.py @@ -0,0 +1,105 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import logging +import itertools +import shutil + +import pytest + +pytest.importorskip("tvm.micro") + +import tvm +from tvm.testing.aot import AOTTestRunner + +_LOG = logging.getLogger(__name__) + + +AOT_DEFAULT_RUNNER = AOTTestRunner() + +# AOT Test Runner using the Arm® Corstone™-300 Reference Systems +# see: https://developer.arm.com/ip-products/subsystem/corstone/corstone-300 +AOT_CORSTONE300_RUNNER = AOTTestRunner( + makefile="corstone300", + prologue=""" + uart_init(); + """, + includes=["uart.h"], + pass_config={ + "relay.ext.cmsisnn.options": { + "mcpu": "cortex-m55", + } + }, +) + +AOT_USMP_CORSTONE300_RUNNER = AOTTestRunner( + makefile="corstone300", + prologue=""" + uart_init(); + """, + includes=["uart.h"], + pass_config={ + "relay.ext.cmsisnn.options": { + "mcpu": "cortex-m55", + }, + "tir.usmp.enable": True, + }, +) + + +def parametrize_aot_options(test): + """Parametrize over valid option combinations""" + + requires_arm_eabi = pytest.mark.skipif( + shutil.which("arm-none-eabi-gcc") is None, reason="ARM embedded toolchain unavailable" + ) + + interface_api = ["packed", "c"] + use_unpacked_api = [True, False] + test_runner = [AOT_DEFAULT_RUNNER, AOT_CORSTONE300_RUNNER] + + all_combinations = itertools.product(interface_api, use_unpacked_api, test_runner) + + # Filter out packed operators with c interface + valid_combinations = filter( + lambda parameters: not (parameters[0] == 
"c" and not parameters[1]), + all_combinations, + ) + + # Only use reference system for C interface and unpacked API calls + valid_combinations = filter( + lambda parameters: not ( + parameters[2] == AOT_CORSTONE300_RUNNER + and (parameters[0] == "packed" or not parameters[1]) + ), + valid_combinations, + ) + + # Skip reference system tests if running in i386 container + marked_combinations = map( + lambda parameters: pytest.param(*parameters, marks=[requires_arm_eabi]) + if parameters[2] == AOT_CORSTONE300_RUNNER + else parameters, + valid_combinations, + ) + + fn = pytest.mark.parametrize( + ["interface_api", "use_unpacked_api", "test_runner"], + marked_combinations, + )(test) + + return tvm.testing.skip_if_32bit(reason="Reference system unavailable in i386 container")(fn) diff --git a/python/tvm/micro/testing.py b/python/tvm/micro/testing/utils.py similarity index 100% rename from python/tvm/micro/testing.py rename to python/tvm/micro/testing/utils.py diff --git a/tests/python/relay/aot/aot_test_utils.py b/python/tvm/testing/aot.py similarity index 72% rename from tests/python/relay/aot/aot_test_utils.py rename to python/tvm/testing/aot.py index 2c4262a3d2be..f8f170366ac5 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/python/tvm/testing/aot.py @@ -14,39 +14,41 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +"""Common functions for AOT test cases""" import sys import datetime -import itertools -import json -import logging import os import pathlib -import platform import re import shutil import subprocess import tarfile import tempfile +import logging from typing import Any, NamedTuple, Union, Optional, List, Dict - -import pytest import numpy as np -pytest.importorskip("tvm.micro") - import tvm from tvm import relay -from tvm import te +from tvm import autotvm from tvm.contrib import utils, graph_executor -from tvm.relay.backend import te_compiler, Executor, Runtime -from tvm.relay.backend.te_compiler import TECompiler +from tvm.relay.backend import Executor, Runtime from tvm.relay.backend.utils import mangle_module_name from tvm.micro import export_model_library_format -from tvm.micro.testing import mlf_extract_workspace_size_bytes +from tvm.micro.testing.utils import mlf_extract_workspace_size_bytes _LOG = logging.getLogger(__name__) +NP_TYPE_TO_C = { + "int8": "int8_t", + "uint8": "uint8_t", + "int16": "int16_t", + "uint16": "uint16_t", + "int32": "int32_t", + "uint32": "uint32_t", + "float32": "float", +} + AOT_SUCCESS_TOKEN = "AOT_TEST_SUCCESS" AOT_FAILURE_TOKEN = "AOT_TEST_FAILURE" @@ -138,119 +140,7 @@ class AOTTestRunner(NamedTuple): pass_config: Dict[str, Any] = {} -AOT_DEFAULT_RUNNER = AOTTestRunner() - -# AOT Test Runner using the Arm® Corstone™-300 Reference Systems -# see: https://developer.arm.com/ip-products/subsystem/corstone/corstone-300 -AOT_CORSTONE300_RUNNER = AOTTestRunner( - makefile="corstone300", - prologue=""" - uart_init(); - """, - includes=["uart.h"], - pass_config={ - "relay.ext.cmsisnn.options": { - "mcpu": "cortex-m55", - } - }, -) - -AOT_USMP_CORSTONE300_RUNNER = AOTTestRunner( - makefile="corstone300", - prologue=""" - uart_init(); - """, - includes=["uart.h"], - pass_config={ - "relay.ext.cmsisnn.options": { - "mcpu": "cortex-m55", - }, - "tir.usmp.enable": True, - }, -) - -NP_TYPE_TO_C = { - "int8": "int8_t", - "uint8": "uint8_t", 
- "int16": "int16_t", - "uint16": "uint16_t", - "int32": "int32_t", - "uint32": "uint32_t", - "float32": "float", -} - - -def mangle_name(mod_name, name): - mod_name = mangle_module_name(mod_name) - return mod_name + "_" + name - - -def convert_to_relay( - tflite_model_buf, -): - """Convert a tflite model buffer in a Relay module""" - # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 - try: - import tflite.Model - - tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) - except AttributeError: - import tflite - - tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) - except ImportError: - raise ImportError("The tflite package must be installed") - - mod, params = relay.frontend.from_tflite(tflite_model) - mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params) - return mod, params - - -def parametrize_aot_options(test): - """Parametrize over valid option combinations""" - - requires_arm_eabi = pytest.mark.skipif( - shutil.which("arm-none-eabi-gcc") is None, reason="ARM embedded toolchain unavailable" - ) - - interface_api = ["packed", "c"] - use_unpacked_api = [True, False] - test_runner = [AOT_DEFAULT_RUNNER, AOT_CORSTONE300_RUNNER] - - all_combinations = itertools.product(interface_api, use_unpacked_api, test_runner) - - # Filter out packed operators with c interface - valid_combinations = filter( - lambda parameters: not (parameters[0] == "c" and not parameters[1]), - all_combinations, - ) - - # Only use reference system for C interface and unpacked API calls - valid_combinations = filter( - lambda parameters: not ( - parameters[2] == AOT_CORSTONE300_RUNNER - and (parameters[0] == "packed" or not parameters[1]) - ), - valid_combinations, - ) - - # Skip reference system tests if running in i386 container - marked_combinations = map( - lambda parameters: pytest.param(*parameters, marks=[requires_arm_eabi]) - if parameters[2] == AOT_CORSTONE300_RUNNER - else parameters, - valid_combinations, - ) - - fn = 
pytest.mark.parametrize( - ["interface_api", "use_unpacked_api", "test_runner"], - marked_combinations, - )(test) - - return tvm.testing.skip_if_32bit(reason="Reference system unavailable in i386 container")(fn) - - -def subprocess_check_log_output(cmd, cwd, logfile): +def _subprocess_check_log_output(cmd, cwd, logfile): """ This method runs a process and logs the output to both a log file and stdout """ @@ -290,15 +180,21 @@ def subprocess_check_log_output(cmd, cwd, logfile): raise RuntimeError(f"Subprocess failed: {cmd}\nstdout:\n{stdout}") +def _mangle_name(mod_name, name): + mod_name = mangle_module_name(mod_name) + return mod_name + "_" + name + + # TODO: Move to linker script with list of symbols rather than coding into source -def emit_data_linkage(output_file, data_linkage): +def _emit_data_linkage(output_file, data_linkage): if data_linkage is not None: output_file.write( - f'__attribute__((section("{data_linkage.section}"), aligned({data_linkage.alignment}))) ' + f'__attribute__((section("{data_linkage.section}"), ' + f"aligned({data_linkage.alignment}))) " ) -def emit_main_prologue( +def _emit_main_prologue( main_file, custom_prologue, workspace_bytes, @@ -316,16 +212,14 @@ def emit_main_prologue( # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment. 
workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n" main_file.write(workspace_define) - emit_data_linkage(main_file, data_linkage) + _emit_data_linkage(main_file, data_linkage) main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") main_file.write("tvm_workspace_t app_workspace;\n") main_file.write( - """ - + """\n tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr); } - tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { return StackMemoryManager_Free(&app_workspace,ptr); } @@ -334,30 +228,24 @@ def emit_main_prologue( else: # An implementation is not needed for these if the stack allocator is not used main_file.write( - """ - + """\n tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { return kTvmErrorFunctionCallNotImplemented; } - tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { return kTvmErrorFunctionCallNotImplemented; } - """ ) main_file.write( - """ - + """\n void TVMPlatformAbort(tvm_crt_error_t code) { exit(-1); } - void TVMLogf(const char* msg, ...) 
{ va_list args; va_start(args, msg); vfprintf(stdout, msg, args); va_end(args); -} - +}\n TVM_DLL int TVMFuncRegisterGlobal(const char* name, TVMFunctionHandle f, int override) {} int main(){\n """ @@ -365,105 +253,105 @@ def emit_main_prologue( main_file.write(custom_prologue) -def emit_main_data(main_file, input_map, output_map, mod_name): +def _emit_main_data(main_file, input_map, output_map, mod_name): for key in input_map: sanitized_tensor_name = re.sub(r"\W", "_", key) main_file.write( - f'#include "{mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}.h"\n' + f'#include "{_mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}.h"\n' ) for key in output_map: sanitized_tensor_name = re.sub(r"\W", "_", key) main_file.write( - f'#include "{mangle_name(mod_name,"expected_output_data")}_{sanitized_tensor_name}.h"\n' - f'#include "{mangle_name(mod_name,"output_data")}_{sanitized_tensor_name}.h"\n' + f'#include "{_mangle_name(mod_name,"expected_output_data")}_' + f'{sanitized_tensor_name}.h"\n' + f'#include "{_mangle_name(mod_name,"output_data")}_' + f'{sanitized_tensor_name}.h"\n' ) -def emit_main_device_structs(main_file, devices, mod_name): +def _emit_main_device_structs(main_file, devices, mod_name): if devices: main_file.write( - f"struct {mangle_name(mod_name, 'devices')} {mangle_name(mod_name, 'devices')} = {{" + f"struct {_mangle_name(mod_name, 'devices')} {_mangle_name(mod_name, 'devices')} = {{" ) for device in devices: main_file.write(f"\t.{device} = {device},\n") main_file.write("};\n") -def emit_main_workspace_pool_structs(main_file, workspace_pool_names, mod_name): +def _emit_main_workspace_pool_structs(main_file, workspace_pool_names, mod_name): if workspace_pool_names and len(workspace_pool_names) > 0: main_file.write( - f"struct {mangle_name(mod_name, 'workspace_pools')} {mangle_name(mod_name, 'workspace_pools')} = {{" + f"struct {_mangle_name(mod_name, 'workspace_pools')} " + f"{_mangle_name(mod_name, 'workspace_pools')} = {{" ) for 
workspace_pool_name in workspace_pool_names: main_file.write(f"\t.{workspace_pool_name} = {workspace_pool_name},\n") main_file.write("};\n") -def emit_main_data_structs(main_file, input_map, output_map, mod_name): +def _emit_main_data_structs(main_file, input_map, output_map, mod_name): main_file.write( - f"struct {mangle_name(mod_name, 'inputs')} {mangle_name(mod_name, 'inputs')} = {{" + f"struct {_mangle_name(mod_name, 'inputs')} {_mangle_name(mod_name, 'inputs')} = {{" ) for key in input_map: sanitized_tensor_name = re.sub(r"\W", "_", key) main_file.write( - f"\t.{sanitized_tensor_name} = {mangle_name(mod_name, 'input_data')}_{sanitized_tensor_name},\n" + f"\t.{sanitized_tensor_name} = " + f"{_mangle_name(mod_name, 'input_data')}_{sanitized_tensor_name},\n" ) main_file.write("};\n") main_file.write( - f"struct {mangle_name(mod_name, 'outputs')} {mangle_name(mod_name, 'outputs')} = {{" + f"struct {_mangle_name(mod_name, 'outputs')} {_mangle_name(mod_name, 'outputs')} = {{" ) for key in output_map: sanitized_tensor_name = re.sub(r"\W", "_", key) main_file.write( - f"\t.{sanitized_tensor_name} = {mangle_name(mod_name, 'output_data')}_{sanitized_tensor_name},\n" + f"\t.{sanitized_tensor_name} = {_mangle_name(mod_name, 'output_data')}_" + f"{sanitized_tensor_name},\n" ) main_file.write("};\n") -def emit_main_data_setup(main_file, input_map, output_map, mod_name): +def _emit_main_data_setup(main_file, input_map, output_map, mod_name): num_outputs = len(output_map) num_inputs = len(input_map) - - main_file.write(f'void* {mangle_name(mod_name,"inputs")}[{num_inputs}] = {{ ') + main_file.write(f'void* {_mangle_name(mod_name,"inputs")}[{num_inputs}] = {{ ') for key in input_map: sanitized_tensor_name = re.sub(r"\W", "_", key) - main_file.write(f'{mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}, ') + main_file.write(f'{_mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}, ') main_file.write("};\n") - - main_file.write(f'void* 
{mangle_name(mod_name,"outputs")}[{num_outputs}] = {{ ') + main_file.write(f'void* {_mangle_name(mod_name,"outputs")}[{num_outputs}] = {{ ') for key in output_map: sanitized_tensor_name = re.sub(r"\W", "_", key) - main_file.write(f'{mangle_name(mod_name, "output_data")}_{sanitized_tensor_name}, ') + main_file.write(f'{_mangle_name(mod_name, "output_data")}_{sanitized_tensor_name}, ') main_file.write("};\n") -def emit_main_c_interface_call( +def _emit_main_c_interface_call( main_file, devices, workspace_pool_names, mod_name, use_workspace_io ): sub_strings = list() - sub_strings.append(f'{mangle_name(mod_name,"run")}(') + sub_strings.append(f'{_mangle_name(mod_name,"run")}(') if not use_workspace_io: - sub_strings.append(f'&{mangle_name(mod_name,"inputs")}, ') - sub_strings.append(f'&{mangle_name(mod_name,"outputs")}, ') + sub_strings.append(f'&{_mangle_name(mod_name,"inputs")}, ') + sub_strings.append(f'&{_mangle_name(mod_name,"outputs")}, ') if workspace_pool_names: - sub_strings.append(f'&{mangle_name(mod_name,"workspace_pools")}, ') + sub_strings.append(f'&{_mangle_name(mod_name,"workspace_pools")}, ') if devices: - sub_strings.append(f'&{mangle_name(mod_name,"devices")}, ') + sub_strings.append(f'&{_mangle_name(mod_name,"devices")}, ') # Removing the last two characters that is a comma and a space sub_strings[-1] = sub_strings[-1][:-2] # Adding brackets and newline instead sub_strings[-1] = sub_strings[-1] + ");\n" - main_file_string = "" - for sub_string in sub_strings: - main_file_string += sub_string - + main_file_string = "".join(sub_strings) main_file.write(main_file_string) -def emit_main_fake_packed_values(main_file): +def _emit_main_fake_packed_values(main_file): main_file.write( """ static DLDevice fake_device = {kDLCPU, 0}; @@ -473,10 +361,10 @@ def emit_main_fake_packed_values(main_file): ) -def emit_main_packed_call(main_file, input_map, output_list, mod_name): - tensors_name = mangle_name(mod_name, "tensors") - values_name = mangle_name(mod_name, 
"values") - typeids_name = mangle_name(mod_name, "typeids") +def _emit_main_packed_call(main_file, input_map, output_list, mod_name): + tensors_name = _mangle_name(mod_name, "tensors") + values_name = _mangle_name(mod_name, "values") + typeids_name = _mangle_name(mod_name, "typeids") def fake_tensor(source, source_index, packed_index): main_file.write( @@ -503,20 +391,20 @@ def fake_tensor(source, source_index, packed_index): ) for i in range(0, num_inputs): - fake_tensor(mangle_name(mod_name, "inputs"), i, i) + fake_tensor(_mangle_name(mod_name, "inputs"), i, i) for i in range(0, num_outputs): - fake_tensor(mangle_name(mod_name, "outputs"), i, i + num_inputs) + fake_tensor(_mangle_name(mod_name, "outputs"), i, i + num_inputs) main_file.write( - f'{mangle_name(mod_name, "run")}({values_name}, {typeids_name}, 0, NULL, 0, NULL);\n' + f'{_mangle_name(mod_name, "run")}({values_name}, {typeids_name}, 0, NULL, 0, NULL);\n' ) main_file.write("\n") -def emit_main_compare(main_file, outputs, output_tolerance, mod_name, use_interface_c=False): +def _emit_main_compare(main_file, outputs, output_tolerance, mod_name, use_interface_c=False): for key in outputs: sanitized_tensor_name = re.sub(r"\W", "_", key) - expected_data_name = mangle_name(mod_name, f"expected_output_data_{sanitized_tensor_name}") + expected_data_name = _mangle_name(mod_name, f"expected_output_data_{sanitized_tensor_name}") is_float_dtype = outputs[key].dtype == "float32" comparison_function = "abs" @@ -526,40 +414,39 @@ def emit_main_compare(main_file, outputs, output_tolerance, mod_name, use_interf tolerance = output_tolerance or 0.001 data_length_var_name = ( - mangle_name(mod_name, f"output_data_{sanitized_tensor_name}") + "_len" + _mangle_name(mod_name, f"output_data_{sanitized_tensor_name}") + "_len" ) if use_interface_c: c_type = NP_TYPE_TO_C[str(outputs[key].dtype)] - actual_data_name = f"(({c_type}*)" + mangle_name( + actual_data_name = f"(({c_type}*)" + _mangle_name( mod_name, 
f"outputs.{sanitized_tensor_name})" ) else: - actual_data_name = mangle_name(mod_name, f"output_data_{sanitized_tensor_name}") + actual_data_name = _mangle_name(mod_name, f"output_data_{sanitized_tensor_name}") main_file.write( - f""" - for (int i = 0; i<{data_length_var_name}; i++) {{ - if ({comparison_function}({actual_data_name}[i]-{expected_data_name}[i]) > {tolerance}) {{ - printf("{AOT_FAILURE_TOKEN}\\n"); - return -1; - }} - }} - """ + f"for (int i = 0; i<{data_length_var_name}; i++) {{\n" + f"\tif ({comparison_function}({actual_data_name}[i]-" + f"{expected_data_name}[i]) > {tolerance}) {{\n" + f'\t\tprintf("{AOT_FAILURE_TOKEN}\\n");\n' + f"\t\treturn -1;\n" + f"\t}}\n" + f"}}" ) -def emit_main_init_memory_manager(main_file): +def _emit_main_init_memory_manager(main_file): main_file.write("StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE);") main_file.write("\n") -def emit_main_epilogue(main_file, custom_epilogue): +def _emit_main_epilogue(main_file, custom_epilogue): main_file.write(custom_epilogue) main_file.write(f'printf("{AOT_SUCCESS_TOKEN}\\n");') main_file.write("return 0;") main_file.write("}\n") -def emit_main_common_includes(main_file, custom_includes): +def _emit_main_common_includes(main_file, custom_includes): main_file.write("#include \n") main_file.write("#include \n") main_file.write("#include \n") @@ -570,11 +457,11 @@ def emit_main_common_includes(main_file, custom_includes): main_file.write(f'#include "{include}"\n') -def emit_main_micro_include(main_file, mod_name): +def _emit_main_micro_include(main_file, mod_name): main_file.write(f"#include <{mangle_module_name(mod_name)}.h>\n") -def create_main( +def _create_main( test_name, compiled_models, output_path, @@ -591,17 +478,17 @@ def create_main( # create header file raw_path = file_path.with_suffix(".c").resolve() with open(raw_path, "w") as main_file: - emit_main_common_includes(main_file, custom_includes) + _emit_main_common_includes(main_file, custom_includes) if 
interface_api == "c": for compiled_model in compiled_models: model = compiled_model.model - emit_main_micro_include(main_file, model.name) + _emit_main_micro_include(main_file, model.name) for compiled_model in compiled_models: model = compiled_model.model - emit_main_data(main_file, model.inputs, model.outputs, model.name) + _emit_main_data(main_file, model.inputs, model.outputs, model.name) - emit_main_prologue( + _emit_main_prologue( main_file, custom_prologue, workspace_bytes, @@ -611,7 +498,7 @@ def create_main( use_stack_allocator, ) if use_stack_allocator: - emit_main_init_memory_manager(main_file) + _emit_main_init_memory_manager(main_file) if interface_api == "c": for compiled_model in compiled_models: @@ -627,32 +514,33 @@ def create_main( for allocated_pool in dict(executor_codegen_metadata.pool_inputs).values() if not allocated_pool.pool_info.is_internal ] - emit_main_device_structs(main_file, devices, model.name) + _emit_main_device_structs(main_file, devices, model.name) if not use_workspace_io: - emit_main_workspace_pool_structs(main_file, workspace_pool_names, model.name) - emit_main_data_structs(main_file, model.inputs, model.outputs, model.name) - emit_main_c_interface_call( + _emit_main_workspace_pool_structs(main_file, workspace_pool_names, model.name) + _emit_main_data_structs(main_file, model.inputs, model.outputs, model.name) + _emit_main_c_interface_call( main_file, devices, workspace_pool_names, model.name, use_workspace_io ) else: - emit_main_fake_packed_values(main_file) + _emit_main_fake_packed_values(main_file) for compiled_model in compiled_models: model = compiled_model.model - emit_main_data_setup(main_file, model.inputs, model.outputs, model.name) - emit_main_packed_call(main_file, model.inputs, model.outputs, model.name) + _emit_main_data_setup(main_file, model.inputs, model.outputs, model.name) + _emit_main_packed_call(main_file, model.inputs, model.outputs, model.name) for compiled_model in compiled_models: model = 
compiled_model.model - emit_main_compare( + _emit_main_compare( main_file, model.outputs, model.output_tolerance, model.name, interface_api == "c" ) - emit_main_epilogue(main_file, custom_epilogue) + _emit_main_epilogue(main_file, custom_epilogue) -def create_header_file(tensor_name, npy_data, output_path, data_linkage): +def _create_header_file(tensor_name, npy_data, output_path, data_linkage): """ This method generates a header file containing the data contained in the numpy array provided. - It is used to capture the tensor data (for both inputs and expected outputs) to be bundled into the standalone application. + It is used to capture the tensor data (for both inputs and expected outputs) + to be bundled into the standalone application. """ file_path = pathlib.Path(f"{output_path}/" + tensor_name).resolve() # create header file @@ -663,7 +551,7 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage): header_file.write("#include \n") header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") - emit_data_linkage(header_file, data_linkage) + _emit_data_linkage(header_file, data_linkage) header_file.write(f"{NP_TYPE_TO_C[str(npy_data.dtype)]} {tensor_name}[] =") @@ -673,6 +561,27 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage): header_file.write("};\n\n") +def convert_to_relay( + tflite_model_buf, +): + """Convert a tflite model buffer in a Relay module""" + # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 + try: + import tflite.Model # pylint: disable=import-outside-toplevel + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + except AttributeError: + import tflite # pylint: disable=import-outside-toplevel + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) + except ImportError: + raise ImportError("The tflite package must be installed") + + mod, params = relay.frontend.from_tflite(tflite_model) + mod["main"] = 
relay.build_module.bind_params_by_name(mod["main"], params) + return mod, params + + def compile_models( models: Union[List[AOTTestModel], AOTTestModel], interface_api: str, @@ -683,6 +592,7 @@ def compile_models( use_runtime_executor: bool = True, target: tvm.target.Target = tvm.target.Target("c"), workspace_memory_pools=None, + schedule_name: str = None, ) -> List[AOTCompiledTestModel]: """ This method generates runtime.Modules for the tests @@ -708,31 +618,62 @@ def compile_models( compiled_mods = list() for model in models: - with tvm.transform.PassContext(opt_level=3, config=config): - # TODO(Mousius) - Remove once executor/runtime are fully removed from Target - if use_runtime_executor: - executor_factory = tvm.relay.build( - model.module, - target, - executor=executor, - runtime=runtime, - workspace_memory_pools=workspace_memory_pools, - params=model.params, - mod_name=model.name, - ) - compiled_mods.append( - AOTCompiledTestModel(model=model, executor_factory=executor_factory) - ) - else: - executor_factory = tvm.relay.build( - model.module, - tvm.target.Target(target, host=target), - params=model.params, - mod_name=model.name, - ) - compiled_mods.append( - AOTCompiledTestModel(model=model, executor_factory=executor_factory) - ) + if schedule_name: + # Testing with deterministic schedule + task_list = autotvm.task.extract_from_program( + model.module, target=target, params=model.params + ) + with tvm.autotvm.apply_fixed_config(task_list, schedule_name): + with tvm.transform.PassContext(opt_level=3, config=config): + if use_runtime_executor: + executor_factory = tvm.relay.build( + model.module, + target, + executor=executor, + runtime=runtime, + workspace_memory_pools=workspace_memory_pools, + params=model.params, + mod_name=model.name, + ) + compiled_mods.append( + AOTCompiledTestModel(model=model, executor_factory=executor_factory) + ) + else: + executor_factory = tvm.relay.build( + model.module, + tvm.target.Target(target, host=target), + 
params=model.params, + mod_name=model.name, + ) + compiled_mods.append( + AOTCompiledTestModel(model=model, executor_factory=executor_factory) + ) + else: + with tvm.transform.PassContext(opt_level=3, config=config): + # TODO(Mousius) - Remove once executor/runtime are fully removed from Target + if use_runtime_executor: + executor_factory = tvm.relay.build( + model.module, + target, + executor=executor, + runtime=runtime, + workspace_memory_pools=workspace_memory_pools, + params=model.params, + mod_name=model.name, + ) + compiled_mods.append( + AOTCompiledTestModel(model=model, executor_factory=executor_factory) + ) + else: + executor_factory = tvm.relay.build( + model.module, + tvm.target.Target(target, host=target), + params=model.params, + mod_name=model.name, + ) + compiled_mods.append( + AOTCompiledTestModel(model=model, executor_factory=executor_factory) + ) return compiled_mods @@ -788,8 +729,8 @@ def run_and_check_body(base_path): workspace_bytes += model.extra_memory_in_bytes for key in model.inputs: sanitized_tensor_name = re.sub(r"\W", "_", key) - create_header_file( - f'{mangle_name(model.name, "input_data")}_{sanitized_tensor_name}', + _create_header_file( + f'{_mangle_name(model.name, "input_data")}_{sanitized_tensor_name}', model.inputs[key], include_path, data_linkage, @@ -797,14 +738,14 @@ def run_and_check_body(base_path): for key in model.outputs: sanitized_tensor_name = re.sub(r"\W", "_", key) - create_header_file( - f'{mangle_name(model.name, "output_data")}_{sanitized_tensor_name}', + _create_header_file( + f'{_mangle_name(model.name, "output_data")}_{sanitized_tensor_name}', np.zeros(model.outputs[key].shape, model.outputs[key].dtype), include_path, data_linkage, ) - create_header_file( - f'{mangle_name(model.name, "expected_output_data")}_{sanitized_tensor_name}', + _create_header_file( + f'{_mangle_name(model.name, "expected_output_data")}_{sanitized_tensor_name}', model.outputs[key], include_path, data_linkage, @@ -814,7 +755,7 @@ def 
run_and_check_body(base_path): # We only need the stack allocator if USMP is not used use_stack_allocator = not use_usmp - create_main( + _create_main( "test.c", models, build_path, @@ -830,8 +771,9 @@ def run_and_check_body(base_path): # Verify that compiles fine file_dir = os.path.dirname(os.path.abspath(__file__)) + makefile_dir = os.path.join(file_dir, "../../../tests/python/relay/aot") codegen_path = os.path.join(base_path, "codegen") - makefile = os.path.join(file_dir, f"{runner.makefile}.mk") + makefile = os.path.join(makefile_dir, f"{runner.makefile}.mk") fvp_dir = "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/" # TODO(@grant-arm): Remove once ci_cpu docker image has been updated to FVP_Corstone_SSE if not os.path.isdir(fvp_dir): @@ -842,8 +784,8 @@ def run_and_check_body(base_path): make_command = ( f"make -f {makefile} build_dir={build_path}" + f" CFLAGS='{cflags}'" - + f" TVM_ROOT={file_dir}/../../../.." - + f" AOT_TEST_ROOT={file_dir}" + + f" TVM_ROOT={file_dir}/../../.." + + f" AOT_TEST_ROOT={makefile_dir}" + f" CODEGEN_ROOT={codegen_path}" + f" STANDALONE_CRT_DIR={tvm.micro.get_standalone_crt_dir()}" + f" FVP_DIR={fvp_dir}" @@ -854,7 +796,7 @@ def run_and_check_body(base_path): compile_command = f"{make_command} aot_test_runner" if verbose: print("Compile command:\n", compile_command) - subprocess_check_log_output(compile_command, ".", compile_log_path) + _subprocess_check_log_output(compile_command, ".", compile_log_path) # Verify that runs fine run_log_path = os.path.join(build_path, "test_run.log") @@ -865,11 +807,11 @@ def run_and_check_body(base_path): # TODO(lhutton1) This is a quick and dirty work around to help temporarily reduce # the flakyness of the tests. Will remove once #10300 and #10314 are resolved. 
try: - subprocess_check_log_output(run_command, build_path, run_log_path) + _subprocess_check_log_output(run_command, build_path, run_log_path) except RuntimeError as err: print("Failed to run the module, having a second attempt...", file=sys.stderr) print(err, file=sys.stderr) - subprocess_check_log_output(run_command, build_path, run_log_path) + _subprocess_check_log_output(run_command, build_path, run_log_path) with open(run_log_path) as run_log: assert AOT_SUCCESS_TOKEN in run_log.read() @@ -895,6 +837,7 @@ def compile_and_run( target_opts: Dict = None, test_dir: str = None, verbose: bool = False, + schedule_name: str = None, ): """This is a wrapper API to compile and run models as test for AoT @@ -919,6 +862,7 @@ def compile_and_run( pass_config=runner.pass_config, use_runtime_executor=use_runtime_executor, target=tvm.target.Target(target), + schedule_name=schedule_name, ) run_and_check( diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/test_utils.py index e0aad7c3c6d5..4fd3e39fd1c0 100644 --- a/tests/micro/zephyr/test_utils.py +++ b/tests/micro/zephyr/test_utils.py @@ -32,7 +32,7 @@ import tvm.micro from tvm.micro import export_model_library_format from tvm.micro.model_library_format import generate_c_interface_header -from tvm.micro.testing import ( +from tvm.micro.testing.utils import ( mlf_extract_workspace_size_bytes, aot_transport_init_wait, aot_transport_find_message, diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index 1582d7e4a5fe..49e5e2757b20 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -30,7 +30,7 @@ from tvm.relay.backend import Executor, Runtime from tvm.relay.testing import byoc from tvm.contrib import utils -from tvm.micro.testing import check_tune_log +from tvm.micro.testing.utils import check_tune_log import test_utils diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py index 87c7dc92fbda..6b355f28de4b 
100644 --- a/tests/micro/zephyr/test_zephyr_aot.py +++ b/tests/micro/zephyr/test_zephyr_aot.py @@ -33,7 +33,6 @@ from tvm.relay.backend import Executor, Runtime from tvm.contrib.download import download_testdata -from tvm.micro.testing import aot_transport_init_wait, aot_transport_find_message import test_utils diff --git a/tests/python/contrib/test_cmsisnn/test_binary_ops.py b/tests/python/contrib/test_cmsisnn/test_binary_ops.py index 028ab406243f..7846bba1e089 100644 --- a/tests/python/contrib/test_cmsisnn/test_binary_ops.py +++ b/tests/python/contrib/test_cmsisnn/test_binary_ops.py @@ -36,12 +36,10 @@ assert_partitioned_function, assert_no_external_function, ) -from tests.python.relay.aot.aot_test_utils import ( - AOTTestModel, +from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run +from tvm.micro.testing.aot_test_utils import ( AOT_CORSTONE300_RUNNER, AOT_USMP_CORSTONE300_RUNNER, - generate_ref_data, - compile_and_run, ) diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index 47245f60e15e..1cdf98510148 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -23,15 +23,9 @@ from tvm import relay from tvm.relay.op.contrib import cmsisnn +from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run -from tests.python.relay.aot.aot_test_utils import ( - AOTTestModel, - AOT_CORSTONE300_RUNNER, - AOT_USMP_CORSTONE300_RUNNER, - AOT_DEFAULT_RUNNER, - generate_ref_data, - compile_and_run, -) +from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER from utils import ( skip_if_no_reference_system, make_module, diff --git a/tests/python/contrib/test_cmsisnn/test_fully_connected.py b/tests/python/contrib/test_cmsisnn/test_fully_connected.py index ec2e9bbdcca7..111d3b2edac1 100644 --- a/tests/python/contrib/test_cmsisnn/test_fully_connected.py +++ 
b/tests/python/contrib/test_cmsisnn/test_fully_connected.py @@ -23,14 +23,9 @@ from tvm import relay from tvm.relay.op.contrib import cmsisnn - -from tests.python.relay.aot.aot_test_utils import ( - AOTTestModel, - AOT_CORSTONE300_RUNNER, +from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run +from tvm.micro.testing.aot_test_utils import ( AOT_USMP_CORSTONE300_RUNNER, - AOT_DEFAULT_RUNNER, - generate_ref_data, - compile_and_run, ) from utils import ( skip_if_no_reference_system, diff --git a/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py b/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py index 7808fbf7752f..d0a8547d32ac 100644 --- a/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py +++ b/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py @@ -22,12 +22,9 @@ import tvm from tvm import relay - -from tests.python.relay.aot.aot_test_utils import ( - AOTTestModel, +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( AOT_USMP_CORSTONE300_RUNNER, - generate_ref_data, - compile_and_run, ) from utils import ( skip_if_no_reference_system, diff --git a/tests/python/contrib/test_cmsisnn/test_networks.py b/tests/python/contrib/test_cmsisnn/test_networks.py index a6e77515859e..fefce9e86c2d 100644 --- a/tests/python/contrib/test_cmsisnn/test_networks.py +++ b/tests/python/contrib/test_cmsisnn/test_networks.py @@ -28,16 +28,14 @@ from tvm.relay.op.contrib import cmsisnn from utils import skip_if_no_reference_system, get_range_for_dtype_str -from tests.python.relay.aot.aot_test_utils import ( - AOTTestModel, +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( AOT_CORSTONE300_RUNNER, AOT_USMP_CORSTONE300_RUNNER, - generate_ref_data, - compile_and_run, ) -def convert_to_relay( +def _convert_to_relay( tflite_model_buf, input_data, input_node, @@ -95,7 +93,7 @@ def 
test_cnn_small(test_runner): rng = np.random.default_rng(12345) input_data = rng.integers(in_min, high=in_max, size=input_shape, dtype=dtype) - orig_mod, params = convert_to_relay(tflite_model_buf, input_data, "input") + orig_mod, params = _convert_to_relay(tflite_model_buf, input_data, "input") cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params) # validate CMSIS-NN output against CPU output diff --git a/tests/python/contrib/test_cmsisnn/test_pooling.py b/tests/python/contrib/test_cmsisnn/test_pooling.py index cca1288ac2a0..a2650bb8d028 100644 --- a/tests/python/contrib/test_cmsisnn/test_pooling.py +++ b/tests/python/contrib/test_cmsisnn/test_pooling.py @@ -23,15 +23,8 @@ from tvm import relay from tvm.relay.op.contrib import cmsisnn - -from tests.python.relay.aot.aot_test_utils import ( - AOTTestModel, - AOT_CORSTONE300_RUNNER, - AOT_USMP_CORSTONE300_RUNNER, - AOT_DEFAULT_RUNNER, - generate_ref_data, - compile_and_run, -) +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER from utils import ( skip_if_no_reference_system, make_module, diff --git a/tests/python/contrib/test_cmsisnn/test_softmax.py b/tests/python/contrib/test_cmsisnn/test_softmax.py index 6eac76d841b4..5a44a7865e66 100644 --- a/tests/python/contrib/test_cmsisnn/test_softmax.py +++ b/tests/python/contrib/test_cmsisnn/test_softmax.py @@ -34,13 +34,8 @@ assert_partitioned_function, assert_no_external_function, ) -from tests.python.relay.aot.aot_test_utils import ( - AOTTestModel, - AOT_CORSTONE300_RUNNER, - AOT_USMP_CORSTONE300_RUNNER, - generate_ref_data, - compile_and_run, -) +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER def make_model( diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index 0c42b024f274..20bd12945f8f 100644 --- 
a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -47,7 +47,7 @@ import tvm.relay.testing.tf as tf_testing from tvm.relay.op.contrib.ethosu import partition_for_ethosu -from tests.python.relay.aot.aot_test_utils import ( +from tvm.testing.aot import ( AOTCompiledTestModel, AOTDataLinkage, AOTTestModel, diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index 4268392f1b78..7ea813762796 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -29,7 +29,7 @@ from tvm.relay.backend.contrib.ethosu import util from tvm.relay.op.contrib.ethosu import partition_for_ethosu -from tests.python.relay.aot.aot_test_utils import generate_ref_data +from tvm.testing.aot import generate_ref_data from . import infra diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index f64263ca0623..b91168b7bbe6 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -24,7 +24,7 @@ from tvm.relay.op.contrib.ethosu import partition_for_ethosu from tvm.micro import model_library_format as mlf -from tests.python.relay.aot.aot_test_utils import convert_to_relay +from tvm.testing.aot import convert_to_relay from . 
import infra diff --git a/tests/python/integration/test_arm_mprofile_dsp.py b/tests/python/integration/test_arm_mprofile_dsp.py index 484c19fa222c..7628755af4ac 100644 --- a/tests/python/integration/test_arm_mprofile_dsp.py +++ b/tests/python/integration/test_arm_mprofile_dsp.py @@ -20,12 +20,8 @@ import tvm import tvm.testing from tvm import relay -from tests.python.relay.aot.aot_test_utils import ( - AOTTestModel, - AOT_CORSTONE300_RUNNER, - generate_ref_data, - compile_and_run, -) +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER @tvm.testing.requires_corstone300 diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py index f9fa0c6eadbb..3c7db62890f5 100644 --- a/tests/python/relay/aot/test_c_device_api.py +++ b/tests/python/relay/aot/test_c_device_api.py @@ -24,12 +24,8 @@ from tvm import relay from tvm.ir.module import IRModule -from aot_test_utils import ( - AOT_DEFAULT_RUNNER, - AOTTestModel, - generate_ref_data, - compile_models, -) +from tvm.testing.aot import AOTTestModel, generate_ref_data, compile_models +from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER @pytest.fixture diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py index cdcc61c33ac7..4a12678a79d9 100644 --- a/tests/python/relay/aot/test_cpp_aot.py +++ b/tests/python/relay/aot/test_cpp_aot.py @@ -27,7 +27,8 @@ from tvm import IRModule from tvm import relay from tvm.relay import backend, testing -from aot_test_utils import AOT_DEFAULT_RUNNER, AOTTestModel, generate_ref_data, compile_and_run +from tvm.testing.aot import generate_ref_data +from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER def test_error_c_interface(): diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 2991cc01fc92..d1d80d434b6a 100644 --- 
a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -37,16 +37,14 @@ from tvm.micro import model_library_format as mlf from tvm.micro import export_model_library_format from tvm.ir.instrument import pass_instrument -from aot_test_utils import ( +from tvm.testing.aot import ( AOTTestModel, - AOT_DEFAULT_RUNNER, generate_ref_data, - convert_to_relay, compile_and_run, compile_models, - parametrize_aot_options, create_relay_module_and_inputs_from_tflite_file, ) +from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER, parametrize_aot_options def test_error_c_interface_with_packed_api(): diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py index 650cb4526f09..60b46d96b555 100644 --- a/tests/python/relay/aot/test_crt_aot_usmp.py +++ b/tests/python/relay/aot/test_crt_aot_usmp.py @@ -32,14 +32,13 @@ from tvm.relay.backend import Executor, Runtime from tvm import WorkspaceMemoryPools, PoolInfo from tvm.micro import model_library_format as mlf -from aot_test_utils import ( +from tvm.micro.testing.aot_test_utils import parametrize_aot_options +from tvm.testing.aot import ( AOTTestModel, AOTTestRunner, generate_ref_data, - convert_to_relay, compile_and_run, compile_models, - parametrize_aot_options, run_and_check, create_relay_module_and_inputs_from_tflite_file, ) diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py new file mode 100644 index 000000000000..e88210a59e77 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import ( + AOT_CORSTONE300_RUNNER, +) + + +class BasicConv2dTests: + @tvm.testing.requires_corstone300 + def test_conv2d( + self, + data_shape, + kernel_size, + kernel_layout, + num_filter, + strides, + padding, + dilation, + dtype, + schedule_name, + ): + """Test a subgraph with a single conv2d_nchw operator.""" + ishape = data_shape + wshape = (num_filter, data_shape[1], *kernel_size) + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv2d( + input0, + weight0, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=(dilation, dilation), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + out_layout="NCHW", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(weight_data) + + out1 = relay.op.nn.conv2d( + input1, + weight1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=(dilation, dilation), + data_layout="NCHW", + kernel_layout=kernel_layout, + 
out_dtype="int32", + out_layout="NCHW", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestConv2d_OIHW_small_kernel(BasicConv2dTests): + """This test is for conv2d_nchw_spatial_pack.arm_cpu schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation, dtype = tvm.testing.parameters( + ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1, "int8"), + ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1, "int16"), + ((1, 32, 16, 16), (3, 3), 12, 1, 0, 1, "int16"), + ) + kernel_layout = tvm.testing.parameter("OIHW") + schedule_name = tvm.testing.parameter("conv2d_nchw_spatial_pack.arm_cpu") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py new file mode 100644 index 000000000000..f56645d43672 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py @@ -0,0 +1,154 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER + + +class BasicConv2dTests: + @tvm.testing.requires_corstone300 + def test_conv2d( + self, + data_shape, + kernel_size, + kernel_layout, + num_filter, + strides, + padding, + dilation, + dtype, + schedule_name, + ): + """Test a subgraph with a single conv2d operator.""" + ishape = data_shape + wshape = (*kernel_size, data_shape[-1], num_filter) + + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv2d( + input0, + weight0, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=(dilation, dilation), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + out_layout="NHWC", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + + if kernel_layout == "HWOI": + weight1 = relay.const(np.moveaxis(weight_data, 2, -1)) + elif kernel_layout == "HWIO": + weight1 = relay.const(weight_data) + + out1 = relay.op.nn.conv2d( + input1, + weight1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=(dilation, dilation), + data_layout="NHWC", + kernel_layout=kernel_layout, + out_dtype="int32", + out_layout="NHWC", + ) + mod = 
tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestConv2d_DSP_HWOI(BasicConv2dTests): + """This test is for conv2d_nhwc_dsp.arm_cpu schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + # TODO(mehrdadh): Fails due to https://github.com/apache/tvm/issues/11216 + # ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1), + # ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1), + # ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2), + ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2), + # from Keyword Spotting model from MLPerfTiny models + # TODO(mehrdad): Fails due to https://github.com/apache/tvm/issues/11216 + # ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1), 1), + # from Visual Wake Word model from MLPerfTiny models + # TODO(mehrdadh): fails due to https://github.com/apache/tvm/issues/11216 + # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1), 1), + # from Image Classification model from MLPerfTiny models + ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0, 1), + ((4, 16, 16, 8), (5, 5), 8, 2, (0, 4, 4, 0), 1), + ((4, 16, 16, 8), (5, 5), 16, 2, (0, 4, 4, 0), 1), + ((4, 16, 16, 8), (5, 5), 8, 2, 0, 1), + ((4, 16, 16, 8), (5, 5), 16, 2, 0, 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1), 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2), 1), + ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2), 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (0, 1, 2, 
3), 1), + ) + dtype = tvm.testing.parameter("int8", "int16") + kernel_layout = tvm.testing.parameter("HWOI") + schedule_name = tvm.testing.parameter("conv2d_nhwc_dsp.arm_cpu") + + +class TestConv2d_HWIO(BasicConv2dTests): + """This test is for conv2d_nhwc_spatial_pack.arm_cpu schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1), + ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1), + ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2), + ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2), + ) + dtype = tvm.testing.parameter("int8", "int16") + kernel_layout = tvm.testing.parameter("HWIO") + schedule_name = tvm.testing.parameter("conv2d_nhwc_spatial_pack.arm_cpu") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py new file mode 100644 index 000000000000..89f1fb1843b4 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py @@ -0,0 +1,153 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER + + +class BasicDepthwiseConv2dTests: + @tvm.testing.requires_corstone300 + def test_conv2d( + self, + data_shape, + data_layout, + kernel_size, + kernel_layout, + num_filter, + strides, + padding, + dilation, + dtype, + schedule_name, + ): + """Test a subgraph with a single conv2d operator.""" + ishape = data_shape + groups = num_filter + + assert groups > 1, f"groups should be more than 1 to create a depthwise conv2d." + + if data_layout == "NCHW" and kernel_layout == "OIHW": + assert ( + num_filter == data_shape[1] + ), f"Output channels({num_filter}) should be equal to input channels({data_shape[1]})." + wshape = (num_filter, data_shape[1] // groups, *kernel_size) + elif data_layout == "NHWC" and kernel_layout == "HWOI": + assert ( + num_filter == data_shape[3] + ), f"Output channels({num_filter}) should be equal to input channels({data_shape[3]})." + wshape = (*kernel_size, num_filter, data_shape[3] // groups) + else: + raise ValueError( + f"Incorrect data layout({data_layout}) and kernel layout({kernel_layout})." 
+ ) + + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv2d( + input0, + weight0, + kernel_size=kernel_size, + strides=strides, + padding=padding, + groups=groups, + dilation=(dilation, dilation), + data_layout=data_layout, + kernel_layout=kernel_layout, + out_dtype="int32", + out_layout=data_layout, + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(weight_data) + out1 = relay.op.nn.conv2d( + input1, + weight1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + groups=groups, + dilation=(dilation, dilation), + data_layout=data_layout, + kernel_layout=kernel_layout, + out_dtype="int32", + out_layout=data_layout, + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestDepthwiseConv2d_NCHW_OIHW(BasicDepthwiseConv2dTests): + """This test is for depthwise_conv2d_nchw.arm_cpu schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1), + ((1, 32, 10, 3), (3, 3), 32, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 32, 1, (0, 2, 2, 0), 1), + ((1, 32, 32, 16), (3, 3), 32, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 32, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 32, 1, (0, 2, 2, 0), 2), + ((1, 16, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2), + ) + data_layout = tvm.testing.parameter("NCHW") + dtype = 
tvm.testing.parameter("int8", "int16") + kernel_layout = tvm.testing.parameter("OIHW") + schedule_name = tvm.testing.parameter("depthwise_conv2d_nchw.arm_cpu") + + +class TestDepthwiseConv2d_NHWC_HWOI(BasicDepthwiseConv2dTests): + """This test is for depthwise_conv2d_nhwc.generic schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 10, 16), (3, 3), 16, 1, 0, 1), + ((1, 49, 10, 64), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2), + ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2), + ) + data_layout = tvm.testing.parameter("NHWC") + dtype = tvm.testing.parameter("int8", "int16") + kernel_layout = tvm.testing.parameter("HWOI") + schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc.generic") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py new file mode 100644 index 000000000000..d3f504d04e35 --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER + + +class BasicGroupConv2dTests: + @tvm.testing.requires_corstone300 + def test_conv2d( + self, + data_shape, + data_layout, + kernel_size, + kernel_layout, + num_filter, + strides, + padding, + dilation, + groups, + dtype, + schedule_name, + ): + """Test a subgraph with a single conv2d operator.""" + ishape = data_shape + + assert groups > 1, f"groups should be more than 1 to create a group conv2d." + + if data_layout == "NCHW" and kernel_layout == "OIHW": + assert data_shape[1] % groups == 0 + wshape = (num_filter, data_shape[1] // groups, *kernel_size) + elif data_layout == "NHWC" and kernel_layout == "HWIO": + assert data_shape[3] % groups == 0 + wshape = (*kernel_size, data_shape[3] // groups, num_filter) + else: + raise ValueError( + f"Incorrect data layout({data_layout}) and kernel layout({kernel_layout})." 
+ ) + + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv2d( + input0, + weight0, + kernel_size=kernel_size, + strides=strides, + padding=padding, + groups=groups, + dilation=(dilation, dilation), + data_layout=data_layout, + kernel_layout=kernel_layout, + out_dtype="int32", + out_layout=data_layout, + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(weight_data) + out1 = relay.op.nn.conv2d( + input1, + weight1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + groups=groups, + dilation=(dilation, dilation), + data_layout=data_layout, + kernel_layout=kernel_layout, + out_dtype="int32", + out_layout=data_layout, + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + schedule_name=schedule_name, + ) + + +class TestGroupConv2d_NCHW_OIHW(BasicGroupConv2dTests): + """This test is for group_conv2d_nchw.arm_cpu schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1), + ((1, 16, 32, 10), (3, 3), 16, 1, 0, 1), + ((1, 16, 32, 32), (3, 3), 16, 1, (0, 2, 2, 0), 1), + ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1), + ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1), + ((1, 16, 32, 32), (3, 3), 16, 1, (0, 2, 2, 0), 2), + ((1, 16, 32, 32), (3, 3), 32, 1, (1, 1, 2, 2), 2), + ) + groups = tvm.testing.parameter(2, 4) + data_layout = 
tvm.testing.parameter("NCHW") + dtype = tvm.testing.parameter("int8", "int16") + kernel_layout = tvm.testing.parameter("OIHW") + schedule_name = tvm.testing.parameter("group_conv2d_nchw.arm_cpu") + + +class TestGroupConv2d_NHWC_HWIO(BasicGroupConv2dTests): + """This test is for group_conv2d_nhwc.generic schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + ((1, 32, 32, 16), (3, 3), 12, 1, 0, 1), + ((1, 32, 10, 16), (3, 3), 16, 1, 0, 1), + ((1, 49, 10, 16), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2), + ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2), + ) + groups = tvm.testing.parameter(2, 4) + data_layout = tvm.testing.parameter("NHWC") + dtype = tvm.testing.parameter("int8", "int16") + kernel_layout = tvm.testing.parameter("HWIO") + schedule_name = tvm.testing.parameter("group_conv2d_nhwc.generic") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/utils/external_codegen.py b/tests/python/relay/utils/external_codegen.py index 4dbc8f274264..6d3d917ff5a2 100644 --- a/tests/python/relay/utils/external_codegen.py +++ b/tests/python/relay/utils/external_codegen.py @@ -104,7 +104,8 @@ def check_aot_executor_result( mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu() ): # Late import to avoid breaking test with USE_MICRO=OFF. 
- from aot.aot_test_utils import AOTTestModel, AOT_DEFAULT_RUNNER, compile_and_run + from tvm.testing.aot import AOTTestModel, compile_and_run + from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER interface_api = "packed" use_unpacked_api = False diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 3a93dbc89b1f..d5611906fc5d 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -292,7 +292,7 @@ def test_platform_timer(): def test_autotune(): """Verify that autotune works with micro.""" import tvm.relay as relay - from tvm.micro.testing import check_tune_log + from tvm.micro.testing.utils import check_tune_log runtime = Runtime("crt", {"system-lib": True}) diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh index d13ee91a0ba8..7301c6f833ab 100755 --- a/tests/scripts/task_python_microtvm.sh +++ b/tests/scripts/task_python_microtvm.sh @@ -51,3 +51,5 @@ export TVM_MICRO_USE_HW=1 export TVM_MICRO_BOARD=qemu_x86 python3 gallery/how_to/work_with_microtvm/micro_tflite.py python3 gallery/how_to/work_with_microtvm/micro_autotune.py + +run_pytest ctypes python-relay-strategy-arm_cpu tests/python/relay/strategy/arm_cpu --enable-corstone300-tests From e02bf824d11019413ed1f8eb78da2b3427b0f026 Mon Sep 17 00:00:00 2001 From: Hua Jiang Date: Thu, 19 May 2022 16:51:13 -0700 Subject: [PATCH 38/59] [Runtime][PipelineExecutor] Add graph manually splitting logic into the unit test. (#11334) * [Runtime][PipelineExecutor] Add graph manually splitting example into the unit test. The current unit test creates 3 separate modules and then re-connects them to run the pipeline executor. This is not a real use case for the pipeline executor. Add manual graph splitting logic which splits a full network into 3 subgraphs, then runs the pipeline executor and verifies the result to simulate the real use case. * address review comments * trigger build.
* address review comments * address review comments * rebase and trigger build. --- tests/python/relay/test_pipeline_executor.py | 224 +++++++++++++++++-- 1 file changed, 201 insertions(+), 23 deletions(-) diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index b97966dde0c8..541f3bba13da 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -22,12 +22,195 @@ import tvm import tvm.testing from tvm import relay -from tvm.relay import transform +from tvm.relay import transform, build_module +from tvm.relay.testing import run_opt_pass from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build from tvm._ffi import get_global_func from tvm.contrib import cc as _cc +def graph_split(expr, split_conf, params=None): + """Splitting the graph into a list of subgraphs""" + + def get_dep_var(sub_var_dep): + return [var for var in sub_var_dep[len(sub_var_dep) - 1]["ref_nodes"]] + + def parse_dependency(value, snode_dep, new_input_idx): + new_args = [] + need_update = False + for var in value.args: + is_free_var = False + for dep in snode_dep[:-1]: + if var in dep["nodes"]: + # Mark the previous subgraph node as a dependency. + dep["nodes"][var] += 1 + dep["ref_nodes"][var] = dep["nodes"][var] + # The var of this call is a free_var + is_free_var = True + # if the var of this call is a free_var, recreate it and give it a fixed input name. + if is_free_var: + need_update = True + new_args.append(relay.var(f"data_n_{new_input_idx}", var.checked_type)) + new_input_idx += 1 + else: + new_args.append(var) + # if the 'tvm.relay.expr.Call' has a free_var, recreate it with new name as 'data_n_*'. 
+ if need_update: + value = tvm.relay.expr.Call( + value.op, new_args, value.attrs, value.type_args, value.span + ) + return value, snode_dep, new_input_idx + + def merge_constant_expr(constant_expr, expr): + # merge a constant expression with an expression + if not isinstance(constant_expr.body, tvm.relay.expr.Let): + return tvm.relay.expr.Let(constant_expr.var, constant_expr.value, expr) + + return tvm.relay.expr.Let( + constant_expr.var, constant_expr.value, merge_constant_expr(constant_expr.body, expr) + ) + + def _recursion(anf, pipeline_mods, split_conf, constant_expr): + # Enumerate all operators of compute graph, then split the compute graph into a group of + # subgraphs. + nonlocal operator_index_map + nonlocal new_input_idx + nonlocal snode_dep + cur_node_dep = snode_dep[len(snode_dep) - 1] + if isinstance(anf, tvm.relay.Function): + return tvm.relay.Function( + anf.params, + _recursion(anf.body, pipeline_mods, split_conf, constant_expr), + anf.ret_type, + anf.type_params, + anf.attrs, + ) + if isinstance(anf, tvm.relay.expr.Let): + value = anf.value + # record the constant expr to make sure all subgraphs can find correct constant. + if isinstance(value, tvm.relay.expr.Constant): + if not constant_expr: + constant_expr = tvm.relay.expr.Let(anf.var, value, anf.var) + else: + constant_expr = tvm.relay.expr.Let(anf.var, value, constant_expr) + if isinstance(value, tvm.relay.expr.Call): + new_args = [] + # build current var list + cur_node_dep["nodes"][anf.var] = 0 + # Get the dependency information of the nodes.
+ value, snode_dep, new_input_idx = parse_dependency(value, snode_dep, new_input_idx) + if isinstance(value.op, tvm.ir.Op): + if value.op.name in operator_index_map: + operator_index_map[value.op.name] += 1 + else: + operator_index_map[value.op.name] = 0 + split_operator_name = split_conf[0]["op_name"] if split_conf else "" + split_operator_index = split_conf[0]["op_index"] if split_conf else "" + # if an operator name and repeating count in the network match with the values + # of the 'split configuration', then this place is where we should do the + # graph splitting. + if ( + split_conf + and split_operator_name in operator_index_map + and operator_index_map[split_operator_name] >= split_operator_index + ): + # Do graph splitting. + split_conf.pop(0) + snode_dep.append({"nodes": {}, "ref_nodes": {}}) + ann = _recursion( + anf.body, + pipeline_mods, + split_conf, + constant_expr, + ) + snode_dep.pop() + dep_vars = get_dep_var(snode_dep) + # When the nodes of the current subgraph are the dependency node of another + # subgraph, we need to set them as the output of current subgraph. + body = relay.Tuple(dep_vars) if len(dep_vars) > 1 else anf.var + # when the operator of current subgraph uses previous subgraph constant + # as the argument of a "relay.expr.call", such constant may become a free + # variable if the constant does not exist in the current subgraph. + # merge the previous constant with current subgraph to avoid such issue. + if constant_expr: + ann = merge_constant_expr(constant_expr, ann) + ann = run_opt_pass(ann, transform.ToGraphNormalForm()) + mod = tvm.IRModule.from_expr(ann) + pipeline_mods.insert(0, mod) + # Return the last node of the current subgraph.
+ return tvm.relay.expr.Let(anf.var, value, body) + return tvm.relay.expr.Let( + anf.var, + value, + _recursion(anf.body, pipeline_mods, split_conf, constant_expr), + ) + else: + return anf + + snode_dep = [{"nodes": {}, "ref_nodes": {}}] + pipeline_mods = [] + operator_index_map = {} + # Used to track the new inputs created by graph splitting. + new_input_idx = 0 + constant_expr = None + subgraph_split_conf = split_conf.copy() + # Binding the parameters. + if params: + expr = build_module.bind_params_by_name(expr, params) + anf = run_opt_pass(expr, transform.ToANormalForm()) + anf = run_opt_pass(anf, transform.InferType()) + ann = _recursion( + anf, + pipeline_mods, + subgraph_split_conf, + constant_expr, + ) + ann = run_opt_pass(ann.body, transform.ToGraphNormalForm()) + mod = tvm.IRModule.from_expr(ann) + pipeline_mods.insert(0, mod) + return pipeline_mods + + +def get_network(): + # Build the full (unsplit) network as a single module. + mods = [] + dshape = (3, 3) + data = relay.var("data_0", relay.TensorType(dshape, "float32")) + data21 = relay.var("data_1", relay.TensorType(dshape, "float32")) + data_net1_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32")) + data_net1_output_2 = relay.var("data_1", relay.TensorType(dshape, "float32")) + data_net2_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32")) + mvalue1 = np.full((1), 1).astype("float32") + mvalue2 = np.full((1), 2).astype("float32") + mvalue3 = np.full((1), 3).astype("float32") + mv1 = relay.Constant(tvm.nd.array(mvalue1)) + mv2 = relay.Constant(tvm.nd.array(mvalue2)) + mv3 = relay.Constant(tvm.nd.array(mvalue3)) + # There are three outputs in the first model.
+ net1_output1 = relay.add(data, mv1) + net1_output2 = relay.subtract(data, mv2) + net1_output3 = relay.concatenate((net1_output1, net1_output2), axis=0) + (net1_output3, _) = relay.split(net1_output3, indices_or_sections=2, axis=0) + net1_output3 = relay.add(net1_output3, mv2) + # The second model uses the output named net1_output3 of the first model as the first input, + # the second input of the second model is data21. + net2 = relay.add(net1_output3, mv2) + net2 = relay.add(net2, data21) + net2_output = relay.add(net2, mv3) + # The third model uses the output named net2_output of the second model as the first input + # and uses the output named net1_output2 of the first model as the second input. + net3 = relay.multiply(net2_output, mv3) + net3 = relay.add(net3, net1_output2) + return tvm.IRModule.from_expr(relay.Function([data, data21], relay.Tuple([net3]))), dshape + + +def get_split_mod(): + mod, dshape = get_network() + split_conf = [{"op_name": "add", "op_index": 1}, {"op_name": "add", "op_index": 4}] + mods = graph_split(mod["main"], split_conf) + return mods, dshape + + def get_mannual_mod(): # Get a list of modules representing subgraphs. 
mods = [] @@ -83,9 +266,8 @@ def get_manual_conf(mods, target): "mod_idx": 0, "cpu_affinity": "0", "output": [ - {"output_idx": 0, "dependencies": [{"mod_idx": 1, "input_name": "data_0"}]}, - {"output_idx": 1, "dependencies": [{"mod_idx": 2, "input_name": "data_0"}]}, - {"output_idx": 2, "dependencies": [{"global_output_index": 0}]}, + {"output_idx": 0, "dependencies": [{"mod_idx": 1, "input_name": "data_n_0"}]}, + {"output_idx": 1, "dependencies": [{"mod_idx": 2, "input_name": "data_n_2"}]}, ], } mod_config[mods[0]] = { @@ -103,7 +285,7 @@ def get_manual_conf(mods, target): "mod_idx": 1, "cpu_affinity": "0", "output": [ - {"output_idx": 0, "dependencies": [{"mod_idx": 2, "input_name": "data_1"}]}, + {"output_idx": 0, "dependencies": [{"mod_idx": 2, "input_name": "data_n_1"}]}, ], } mod_config[mods[1]] = { @@ -120,7 +302,7 @@ def get_manual_conf(mods, target): pipe_config3 = { "mod_idx": 2, "cpu_affinity": "0", - "output": [{"output_idx": 0, "dependencies": [{"global_output_index": 1}]}], + "output": [{"output_idx": 0, "dependencies": [{"global_output_index": 0}]}], } mod_config[mods[2]] = { "pipeline": pipe_config3, @@ -222,7 +404,7 @@ def test_pipe_runtime_error_check(): # This function is used to trigger runtime error by applying wrong logic. if pipeline_executor_build.pipeline_executor_build_enabled(): # Get three pipeline modules here. - (mod1, mod2, mod3), dshape = get_mannual_mod() + (mod1, mod2, mod3), dshape = get_split_mod() # The input or output name is illegal and expects a runtime error. pipe_error = pipeline_executor_build.PipelineConfig() @@ -283,7 +465,7 @@ def test_pipeline(): for target in target_list: affinity = os.sched_getaffinity(0) # Get the three pipeline modules here. - (mod1, mod2, mod3), dshape = get_mannual_mod() + (mod1, mod2, mod3), dshape = get_split_mod() # Prepare batch data for pipeline computation. 
datas = [] @@ -305,33 +487,29 @@ def test_pipeline(): pipe_config["input"]["data_b"].connect(pipe_config[mod2]["input"]["data_1"]) # The mod1 output[0] will be connected to a input named "data_0" of mod2. - pipe_config[mod1]["output"][0].connect(pipe_config[mod2]["input"]["data_0"]) + pipe_config[mod1]["output"][0].connect(pipe_config[mod2]["input"]["data_n_0"]) # The mod1 output[1] will be connected to a input named "data_0" of mod3. - pipe_config[mod1]["output"][1].connect(pipe_config[mod3]["input"]["data_0"]) + pipe_config[mod1]["output"][1].connect(pipe_config[mod3]["input"]["data_n_2"]) # The mod2 output[2] will be connected to a input named "data_1" of mod3. - pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_1"]) - - # The mod1 output[2] will be connected to pipeline output[0]. - pipe_config[mod1]["output"][2].connect(pipe_config["output"]["0"]) + pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_n_1"]) - # The mod3 output[0] will be connected to pipeline output[1]. - pipe_config[mod3]["output"][0].connect(pipe_config["output"]["1"]) - # Print configueration (print(pipe_config)), the result looks like following. + # The mod3 output[0] will be connected to pipeline output[0]. + pipe_config[mod3]["output"][0].connect(pipe_config["output"]["0"]) + # Print configuration (print(pipe_config)), the result looks like following. # # Inputs # |data_a: mod1:data_0 # |data_b: mod2:data_1 # # output - # |output(1) : mod1.output(2) - # |output(2) : mod3.output(0) + # |output(1) : mod3.output(0) # # connections - # |mod1.output(0)-> mod2.data_0 - # |mod1.output(1)-> mod3.data_0 - # |mod2.output(0)-> mod3.data_1 + # |mod1.output(0)-> mod2.data_n_0 + # |mod1.output(1)-> mod3.data_n_2 + # |mod2.output(0)-> mod3.data_n_1 # Set other parameters. pipe_config[mod1].target = target[0] @@ -367,7 +545,7 @@ def test_pipeline(): # Use the import function to create and initialize PipelineModule. 
pipeline_module_test = pipeline_executor.PipelineModule.load_library(config_file_name) - assert pipeline_module_test.num_outputs == 2 + assert pipeline_module_test.num_outputs == 1 input_map = pipeline_module_test.get_input_pipeline_map("data_b") assert input_map[0] == "1" and input_map[1] == "data_1" From a6a34046c432b3766e7c32bbd85c098812a12a68 Mon Sep 17 00:00:00 2001 From: Jiawei Liu Date: Thu, 19 May 2022 23:45:25 -0500 Subject: [PATCH 39/59] fix vec*mat in PyTorch converter (#11347) * fix vec*mat in PyTorch converter * Trigger CI --- python/tvm/relay/frontend/pytorch.py | 2 ++ tests/python/frontend/pytorch/test_forward.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index dc5938931ed0..3887b40141c7 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -1698,6 +1698,8 @@ def matmul(self, inputs, input_types): return output elif len(a_shape) > 2: inputs_0 = _op.reshape(inputs_0, [-1, a_shape[-1]]) + elif len(a_shape) == 1: + return _op.squeeze(_op.nn.matmul(_op.expand_dims(inputs_0, axis=0), inputs_1), axis=[0]) if len(b_shape) > 2: trans_axes = list(range(len(b_shape))) diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 1abd59dce811..642beb015fec 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -3511,6 +3511,11 @@ def forward(self, *args): tensor2 = torch.randn(4) verify_model(MatMul1().float().eval(), input_data=[tensor1, tensor2]) + # vector x matrix + tensor1 = torch.randn(4) + tensor2 = torch.randn(4, 3) + verify_model(MatMul1().float().eval(), input_data=[tensor1, tensor2]) + # matrix x matrix tensor1 = torch.randn(10, 4) tensor2 = torch.randn(4, 10) From 7e99d30d63a0c20eedc247c723e2318686b815cf Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 20 May 2022 17:50:32 +0900 Subject: [PATCH 40/59] 
[PTX] Intrinsics for async copy from global to shared (SM80) (#11368) * registor ptx builtin for async copy * add basic codegen * add test * update codegen * wip * codegen bug fixed, test working * add commit group * add doc --- include/tvm/tir/builtin.h | 19 +++++ src/target/source/codegen_cuda.cc | 12 ++++ src/target/source/ptx.cc | 26 +++++++ src/target/source/ptx.h | 13 ++++ src/tir/op/builtin.cc | 9 +++ .../python/unittest/test_tir_ptx_cp_async.py | 70 +++++++++++++++++++ 6 files changed, 149 insertions(+) create mode 100644 tests/python/unittest/test_tir_ptx_cp_async.py diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index b166b16b7721..f33432645cc3 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -632,6 +632,25 @@ TVM_DLL const Op& ptx_mma_sp(); */ TVM_DLL const Op& ptx_ldmatrix(); +/*! + * \brief tvm intrinsics for ptx async copy from global to shared memory + * + * void ptx_cp_async(Var shared_ptr, Expr shared_offset, Var global_ptr, Expr global_offset, size_t + * bytes); + * + */ +TVM_DLL const Op& ptx_cp_async(); + +/*! + * \brief tvm intrinsics for ptx async copy commit and wait. + * + * void ptx_commit_group(); + * void ptx_wait_group(int num); + * + */ +TVM_DLL const Op& ptx_commit_group(); +TVM_DLL const Op& ptx_wait_group(); + // TODO(tvm-team) replace the usage of the vector operations by Shuffle. /*! 
* \brief Get the high level half of the vector diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index d4ec536fb001..7459d4c250ba 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -821,6 +821,18 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) { std::string smem_elem_offset = this->PrintExpr(op->args[6]); this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr, local_elem_offset, smem_ptr, smem_elem_offset); + } else if (op->op.same_as(builtin::ptx_cp_async())) { + std::string dst = this->PrintExpr(op->args[0]); + std::string dst_offset = this->PrintExpr(op->args[1]); + std::string src = this->PrintExpr(op->args[2]); + std::string src_offset = this->PrintExpr(op->args[3]); + std::string size = this->PrintExpr(op->args[4]); + this->stream << PrintCpAsyncAssembly(dst, dst_offset, src, src_offset, size); + } else if (op->op.same_as(builtin::ptx_commit_group())) { + this->stream << "__asm__ __volatile__(\"cp.async.commit_group;\");\n\n"; + } else if (op->op.same_as(builtin::ptx_wait_group())) { + std::string N = this->PrintExpr(op->args[0]); + this->stream << "__asm__ __volatile__(\"cp.async.wait_group " + N + ";\");\n\n"; } else { CodeGenC::VisitExpr_(op, os); } diff --git a/src/target/source/ptx.cc b/src/target/source/ptx.cc index 02a98ffbbabd..71c68baed6dc 100644 --- a/src/target/source/ptx.cc +++ b/src/target/source/ptx.cc @@ -638,5 +638,31 @@ std::string PrintLoadMatrixAssembly(bool trans, int num, const std::string& type return asm_code; } +std::string PrintCpAsyncAssembly(const std::string& shared_ptr, + const std::string& shared_elem_offset, + const std::string& global_ptr, + const std::string& global_elem_offset, const std::string& bytes) { + std::string asm_code = R"( + { + unsigned int addr; + __asm__ __volatile__( + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" + : "=r"(addr) + : "l"((void *)({smem_addr})) + ); + __asm__ 
__volatile__( + "cp.async.cg.shared.global [%0], [%1], %2;" + :: "r"(addr), "l"((void*)({global_ptr})), "n"({bytes}) + ); + } +)"; + Replacer replacer; + replacer.register_rule("{smem_addr}", shared_ptr + " + " + shared_elem_offset); + replacer.register_rule("{global_ptr}", global_ptr + " + " + global_elem_offset); + replacer.register_rule("{bytes}", bytes); + asm_code = replacer.rewrite(asm_code); + return asm_code; +} + } // namespace codegen } // namespace tvm diff --git a/src/target/source/ptx.h b/src/target/source/ptx.h index c4255d737ad0..c811a1b9c1d6 100644 --- a/src/target/source/ptx.h +++ b/src/target/source/ptx.h @@ -79,6 +79,19 @@ std::string PrintLoadMatrixAssembly(bool trans, int num, const std::string& type const std::string& smem_ptr, const std::string& smem_elem_offset); +/*! + * \brief Print ptx cp.async assembly string given parameters. + * \param shared_ptr: The pointer to the destination shared memory. + * \param shared_elem_offset: The offset into the shared memory. + * \param global_ptr: The pointer to the global memory. + * \param global_elem_offset: The offset into the global memory. + * \param bytes: The number of bytes to copy, valid values are 4, 8, and 16. 
+ */ +std::string PrintCpAsyncAssembly(const std::string& shared_ptr, + const std::string& shared_elem_offset, + const std::string& global_ptr, + const std::string& global_elem_offset, const std::string& bytes); + } // namespace codegen } // namespace tvm diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 4e8d83dd32df..0415d1bbec9e 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -247,6 +247,15 @@ TIR_DEFINE_BUILTIN_FUNC(ptx_mma_sp) TIR_DEFINE_BUILTIN_FUNC(ptx_ldmatrix) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(ptx_cp_async) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_BUILTIN_FUNC(ptx_commit_group) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_BUILTIN_FUNC(ptx_wait_group) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + TIR_DEFINE_BUILTIN_FUNC(vectorhigh) .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)); diff --git a/tests/python/unittest/test_tir_ptx_cp_async.py b/tests/python/unittest/test_tir_ptx_cp_async.py new file mode 100644 index 000000000000..17b60885509f --- /dev/null +++ b/tests/python/unittest/test_tir_ptx_cp_async.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +from tvm.script import tir as T +import numpy as np +import tvm.testing + + +@T.prim_func +def ptx_cp_async(A: T.Buffer[(32, 128), "float16"], B: T.Buffer[(32, 128), "float16"]) -> None: + T.func_attr({"global_symbol": "default_function", "tir.noalias": True}) + bx = T.env_thread("blockIdx.x") + tx = T.env_thread("threadIdx.x") + T.launch_thread(bx, 1) + T.launch_thread(tx, 32) + with T.block(): + A_shared = T.alloc_buffer([32, 128], "float16", scope="shared") + T.reads(A[0:32, 0:128]) + T.writes(B[0:32, 0:128]) + + for i in range(16): + T.evaluate( + T.ptx_cp_async( + A_shared.data, tx * 128 + 8 * i, A.data, tx * 128 + 8 * i, 16, dtype="float16" + ) + ) + + # TODO(masahi): Remove dtype requirement from TVMScript parser + T.evaluate(T.ptx_commit_group(dtype="float16")) + T.evaluate(T.ptx_wait_group(0, dtype="float16")) + + for i in range(128): + B[tx, i] = A_shared[tx, i] + + +@tvm.testing.requires_cuda +def test_ptx_cp_async(): + f = ptx_cp_async + arch = tvm.contrib.nvcc.get_target_compute_version() + major, _ = tvm.contrib.nvcc.parse_compute_version(arch) + if major < 8: + # Require at least SM80 + return + + mod = tvm.build(f, target="cuda") + A_np = np.random.rand(32, 128).astype("float16") + B_np = np.zeros((32, 128)).astype("float16") + dev = tvm.cuda(0) + A_nd = tvm.nd.array(A_np, device=dev) + B_nd = tvm.nd.array(B_np, device=dev) + mod(A_nd, B_nd) + tvm.testing.assert_allclose(B_nd.numpy(), A_np) + + +if __name__ == "__main__": + test_ptx_cp_async() From 13272a19ef30b32c457a48b04dca72ed05aef784 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 20 May 2022 01:55:55 -0700 Subject: [PATCH 41/59] [ci] Disable flaky onnx tests (#11376) Co-authored-by: driazati --- tests/python/frontend/onnx/test_forward.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/frontend/onnx/test_forward.py 
b/tests/python/frontend/onnx/test_forward.py index 6fac7f2f20aa..d6f96f0d0796 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -6038,6 +6038,7 @@ def verify_qlinearmul(a_shape, b_shape, c_shape): verify_qlinearmul([5, 1, 7], [2, 7], [5, 2, 7]) +@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11375") @tvm.testing.parametrize_targets def test_qlinearleakyrelu(target, dev): def verify_qlinearleakyrelu(inshape, kwargs): @@ -6063,6 +6064,7 @@ def verify_qlinearleakyrelu(inshape, kwargs): verify_qlinearleakyrelu([5, 1, 4, 6], {"alpha": 0.65}) +@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11375") @tvm.testing.parametrize_targets def test_qlinearsigmoid(target, dev): def verify_qlinearsigmoid(a_shape): From 909851c2f5d66337a2897b6a9fb2b2f786bfa917 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 20 May 2022 01:56:23 -0700 Subject: [PATCH 42/59] [ci][easy] Fix parameters for macros (#11377) Co-authored-by: driazati --- Jenkinsfile | 4 ++-- jenkins/Jenkinsfile.j2 | 2 +- jenkins/macros.j2 | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 024b920ac676..dbbbb29f7972 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-19T11:41:58.421857 +// Generated at 2022-05-19T14:04:32.815769 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -524,7 +524,7 @@ def cpp_unittest(image) { def add_microtvm_permissions() { sh( - script: 'find build/microtvm_template_projects -type f | xargs chmod +x', + script: 'find build/microtvm_template_projects -type f | grep qemu-hack | xargs chmod +x', label: 'Add execute permissions for microTVM files', ) } diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2 index 8742d0724485..9eac881c549a 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/jenkins/Jenkinsfile.j2 @@ -447,7 +447,7 @@ def cpp_unittest(image) { def add_microtvm_permissions() { {% for folder in microtvm_template_projects %} sh( - script: 'find {{ folder }} -type f | xargs chmod +x', + script: 'find {{ folder }} -type f | grep qemu-hack | xargs chmod +x', label: 'Add execute permissions for microTVM files', ) {% endfor %} diff --git a/jenkins/macros.j2 b/jenkins/macros.j2 index 2ce005a128ef..ce29aa2d580d 100644 --- a/jenkins/macros.j2 +++ b/jenkins/macros.j2 @@ -90,7 +90,7 @@ }, {% endmacro %} -{% macro upload_artifacts(tag, filenames, folders=[]) %} +{% macro upload_artifacts(tag, filenames, folders=None) %} sh( script: """ set -eux From 3248793dd8043e8fd68a4d2d104d61f1f0e71f61 Mon Sep 17 00:00:00 2001 From: Andrew Cheung <43327640+ninehusky@users.noreply.github.com> Date: Fri, 20 May 2022 02:03:24 -0700 Subject: [PATCH 43/59] Add Conv3D bindings (#11381) --- rust/tvm/src/ir/relay/attrs/nn.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index c9d291113303..040939d4f6c1 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -75,6 +75,25 @@ pub struct Conv2DAttrsNode { pub out_dtype: DataType, } +#[repr(C)] +#[derive(Object, Debug)] +#[ref_name = "Conv3DAttrs"] +#[type_key = "relay.attrs.Conv3DAttrs"] +pub struct Conv3DAttrsNode { + pub base: BaseAttrsNode, + pub strides: Array, + pub padding: Array, + pub dilation: Array, + pub groups: i32, + pub channels: IndexExpr, + 
pub kernel_size: Array, + pub data_layout: TString, + pub kernel_layout: TString, + pub out_layout: TString, + pub auto_scheduler_rewritten_layout: TString, + pub out_dtype: DataType, +} + #[repr(C)] #[derive(Object, Debug)] #[ref_name = "BiasAddAttrs"] From 07d91fa04182e77887b379c9644778c2a1a92999 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 20 May 2022 02:12:55 -0700 Subject: [PATCH 44/59] Fix function number datatype from char to uint16_t (#11365) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix function number datatype from char to uint16_t rewrite the modified part to pass lint check Use 2 bytes for func num in fun_registry Fix errors in linter Add the declaration of the helper functions set 2 bytes for func num in func_registry test units pass num_func by value This commit change the datatype of the number of the function from 1 Byte to 2 Bytes. Besides, I use some helper functions to access the number of function and the first function name. * Fix aot_executor_module to unbreak CI. * Fix GraphExecutorModule. * Remove graph_json_to_c_func_registry. * No longer needed and not called anywhere. * Superseded by emitting the FuncRegistry directly in codegen. Co-authored-by: 嚴中璟 --- include/tvm/runtime/crt/func_registry.h | 27 ++++++- python/tvm/micro/func_registry.py | 79 ------------------- .../aot_executor_module/aot_executor_module.c | 2 +- src/runtime/crt/common/func_registry.c | 39 ++++++--- .../graph_executor_module.c | 2 +- src/target/func_registry_generator.cc | 8 +- tests/crt/func_registry_test.cc | 7 +- 7 files changed, 68 insertions(+), 96 deletions(-) delete mode 100644 python/tvm/micro/func_registry.py diff --git a/include/tvm/runtime/crt/func_registry.h b/include/tvm/runtime/crt/func_registry.h index 4f8a19af591e..50737f871798 100644 --- a/include/tvm/runtime/crt/func_registry.h +++ b/include/tvm/runtime/crt/func_registry.h @@ -42,7 +42,7 @@ typedef struct TVMFuncRegistry { /*! 
\brief Names of registered functions, concatenated together and separated by \0. * An additional \0 is present at the end of the concatenated blob to mark the end. * - * Byte 0 is the number of functions in `funcs`. + * Byte 0 and 1 are the number of functions in `funcs`. */ const char* names; @@ -50,6 +50,31 @@ typedef struct TVMFuncRegistry { const TVMBackendPackedCFunc* funcs; } TVMFuncRegistry; +/*! + * \brief Get the number of functions from registry. + * + * \param reg TVMFunctionRegistry instance that contains the function. + * \return The number of functions from registry. + */ +uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg); + +/*! + * \brief Set the number of functions to registry. + * + * \param reg TVMFunctionRegistry instance that contains the function. + * \param num_funcs The number of functions + * \return 0 when successful. + */ +int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs); + +/*! + * \brief Get the address of 0th function name from registry. + * + * \param reg TVMFunctionRegistry instance that contains the function. + * \return the address of 0th function name from registry + */ +const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg); + /*! * \brief Get packed function from registry by name. * diff --git a/python/tvm/micro/func_registry.py b/python/tvm/micro/func_registry.py deleted file mode 100644 index 69c4bb1a29e5..000000000000 --- a/python/tvm/micro/func_registry.py +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Defines functions to work with TVMModule FuncRegistry.""" - -import json - - -def graph_json_to_c_func_registry(graph_path, func_registry_path): - """Convert a graph json file to a CRT-compatible FuncRegistry. - - Parameters - ---------- - graph_path : str - Path to the graph JSON file. - - func_registry_path : str - Path to a .c file which will be written containing the function registry. - """ - with open(graph_path) as json_f: - graph = json.load(json_f) - - funcs = [] - for n in graph["nodes"]: - if n["op"] != "tvm_op": - continue - - funcs.append(n["attrs"]["func_name"]) - - encoded_funcs = f"\\{len(funcs):03o}" + "\\0".join(funcs) - lines = [ - "#include ", - "#include ", - "#include ", - "", - ] - - for f in funcs: - lines.append( - f"extern int {f}(TVMValue* args, int* type_codes, int num_args, " - "TVMValue* out_ret_value, int* out_ret_tcode, void* resource_handle);" - ) - - lines.append("static TVMBackendPackedCFunc funcs[] = {") - - for f in funcs: - lines.append(f" (TVMBackendPackedCFunc) &{f},") - - lines += [ - "};", - "static const TVMFuncRegistry system_lib_registry = {", - f' "{encoded_funcs}\\0",', - " funcs,", - "};", - "static const TVMModule system_lib = {", - " &system_lib_registry,", - "};", - "", - "const TVMModule* TVMSystemLibEntryPoint(void) {", - " return &system_lib;", - "}", - "", # blank line to end the file - ] - with open(func_registry_path, "w") as wrapper_f: - wrapper_f.write("\n".join(lines)) diff --git a/src/runtime/crt/aot_executor_module/aot_executor_module.c 
b/src/runtime/crt/aot_executor_module/aot_executor_module.c index d4b3755c1314..e1dbd533a3ec 100644 --- a/src/runtime/crt/aot_executor_module/aot_executor_module.c +++ b/src/runtime/crt/aot_executor_module/aot_executor_module.c @@ -176,7 +176,7 @@ static const TVMBackendPackedCFunc aot_executor_registry_funcs[] = { }; static const TVMFuncRegistry aot_executor_registry = { - "\x0aget_input\0" + "\x0a\0get_input\0" "get_input_index\0" "get_input_info\0" "get_num_inputs\0" diff --git a/src/runtime/crt/common/func_registry.c b/src/runtime/crt/common/func_registry.c index 116a5c496f1b..49cef8fd70eb 100644 --- a/src/runtime/crt/common/func_registry.c +++ b/src/runtime/crt/common/func_registry.c @@ -60,14 +60,29 @@ int strcmp_cursor(const char** cursor, const char* name) { return return_value; } +uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg) { + uint16_t num_funcs; + memcpy(&num_funcs, reg->names, sizeof(num_funcs)); + return num_funcs; +} + +int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs) { + memcpy((char*)reg->names, &num_funcs, sizeof(num_funcs)); + return 0; +} + +const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg) { + // NOTE: first function name starts at index 2 to skip num_funcs. + return (reg->names + sizeof(uint16_t)); +} + tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* name, tvm_function_index_t* function_index) { tvm_function_index_t idx; - const char* reg_name_ptr; + const char* reg_name_ptr = TVMFuncRegistry_Get0thFunctionName(reg); idx = 0; - // NOTE: reg_name_ptr starts at index 1 to skip num_funcs. 
- for (reg_name_ptr = reg->names + 1; *reg_name_ptr != '\0'; reg_name_ptr++) { + for (; *reg_name_ptr != '\0'; reg_name_ptr++) { if (!strcmp_cursor(®_name_ptr, name)) { *function_index = idx; return kTvmErrorNoError; @@ -82,9 +97,9 @@ tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* n tvm_crt_error_t TVMFuncRegistry_GetByIndex(const TVMFuncRegistry* reg, tvm_function_index_t function_index, TVMBackendPackedCFunc* out_func) { - uint8_t num_funcs; + uint16_t num_funcs; - num_funcs = reg->names[0]; + num_funcs = TVMFuncRegistry_GetNumFuncs(reg); if (function_index >= num_funcs) { return kTvmErrorFunctionIndexInvalid; } @@ -101,7 +116,8 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8 reg->registry.names = (const char*)buffer; buffer[0] = 0; // number of functions present in buffer. - buffer[1] = 0; // end of names list marker. + buffer[1] = 0; // note that we combine the first two elements to form a 16-bit function index. + buffer[2] = 0; // end of names list marker. // compute a guess of the average size of one entry: // - assume average function name is around ~10 bytes @@ -117,13 +133,12 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8 tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const char* name, TVMBackendPackedCFunc func, int override) { size_t idx; - char* reg_name_ptr; + char* reg_name_ptr = (char*)TVMFuncRegistry_Get0thFunctionName(&(reg->registry)); idx = 0; // NOTE: safe to discard const qualifier here, since reg->registry.names was set from // TVMMutableFuncRegistry_Create above. - // NOTE: reg_name_ptr starts at index 1 to skip num_funcs. 
- for (reg_name_ptr = (char*)reg->registry.names + 1; *reg_name_ptr != 0; reg_name_ptr++) { + for (; *reg_name_ptr != 0; reg_name_ptr++) { if (!strcmp_cursor((const char**)®_name_ptr, name)) { if (override == 0) { return kTvmErrorFunctionAlreadyDefined; @@ -149,7 +164,11 @@ tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const ch reg_name_ptr += name_len + 1; *reg_name_ptr = 0; ((TVMBackendPackedCFunc*)reg->registry.funcs)[idx] = func; - ((char*)reg->registry.names)[0]++; // increment num_funcs. + + uint16_t num_funcs; + // increment num_funcs. + num_funcs = TVMFuncRegistry_GetNumFuncs(&(reg->registry)) + 1; + TVMFuncRegistry_SetNumFuncs(&(reg->registry), num_funcs); return kTvmErrorNoError; } diff --git a/src/runtime/crt/graph_executor_module/graph_executor_module.c b/src/runtime/crt/graph_executor_module/graph_executor_module.c index 280130a99414..0ae12f5a9e0a 100644 --- a/src/runtime/crt/graph_executor_module/graph_executor_module.c +++ b/src/runtime/crt/graph_executor_module/graph_executor_module.c @@ -229,7 +229,7 @@ static const TVMBackendPackedCFunc graph_executor_registry_funcs[] = { }; static const TVMFuncRegistry graph_executor_registry = { - "\x08get_input\0" + "\x08\0get_input\0" "get_input_index\0" "get_input_info\0" "get_num_inputs\0" diff --git a/src/target/func_registry_generator.cc b/src/target/func_registry_generator.cc index 7c948d50cbb9..d679bf379b62 100644 --- a/src/target/func_registry_generator.cc +++ b/src/target/func_registry_generator.cc @@ -31,7 +31,13 @@ namespace target { std::string GenerateFuncRegistryNames(const Array& function_names) { std::stringstream ss; - ss << (unsigned char)(function_names.size()); + + unsigned char function_nums[sizeof(uint16_t)]; + *reinterpret_cast(function_nums) = function_names.size(); + for (auto f : function_nums) { + ss << f; + } + for (auto f : function_names) { ss << f << '\0'; } diff --git a/tests/crt/func_registry_test.cc b/tests/crt/func_registry_test.cc index 
9f0e7f8d1a5a..5962a3acee39 100644 --- a/tests/crt/func_registry_test.cc +++ b/tests/crt/func_registry_test.cc @@ -82,7 +82,7 @@ TEST(StrCmpScan, Test) { } TEST(FuncRegistry, Empty) { - TVMFuncRegistry registry{"\000", NULL}; + TVMFuncRegistry registry{"\000\000", NULL}; EXPECT_EQ(kTvmErrorFunctionNameNotFound, TVMFuncRegistry_Lookup(®istry, "foo", NULL)); EXPECT_EQ(kTvmErrorFunctionIndexInvalid, @@ -101,7 +101,7 @@ static int Bar(TVMValue* args, int* type_codes, int num_args, TVMValue* out_ret_ } // Matches the style of registry defined in generated C modules. -const char* kBasicFuncNames = "\002Foo\0Bar\0"; // NOTE: final \0 +const char* kBasicFuncNames = "\002\000Foo\0Bar\0"; // NOTE: final \0 const TVMBackendPackedCFunc funcs[2] = {&Foo, &Bar}; const TVMFuncRegistry kConstRegistry = {kBasicFuncNames, (const TVMBackendPackedCFunc*)funcs}; @@ -111,7 +111,8 @@ TEST(FuncRegistry, ConstGlobalRegistry) { // Foo EXPECT_EQ(kBasicFuncNames[0], 2); - EXPECT_EQ(kBasicFuncNames[1], 'F'); + EXPECT_EQ(kBasicFuncNames[1], 0); + EXPECT_EQ(kBasicFuncNames[2], 'F'); EXPECT_EQ(kTvmErrorNoError, TVMFuncRegistry_Lookup(&kConstRegistry, "Foo", &func_index)); EXPECT_EQ(0, func_index); From c8d22837055d97b2a06b585f0ae2ac5e8269a11d Mon Sep 17 00:00:00 2001 From: xndcn Date: Fri, 20 May 2022 17:13:38 +0800 Subject: [PATCH 45/59] Fix array pointers releasing with `delete` operator (#11328) It may be safe to release POD-types array with `delete` operator, but `delete[]` is always better. 
--- src/contrib/tf_op/tvm_dso_op_kernels.cc | 2 +- src/target/metadata.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc index fb483ee6f2e0..78c10e4822c8 100644 --- a/src/contrib/tf_op/tvm_dso_op_kernels.cc +++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc @@ -207,7 +207,7 @@ class TVMDSOOpTrait { tensorflow::int64* dims = new tensorflow::int64[num_dims]; cudaMemcpy(dims, flat, sizeof(tensorflow::int64) * num_dims, cudaMemcpyDeviceToHost); tensorflow::TensorShapeUtils::MakeShape(dims, num_dims, output_shape); - delete dims; + delete[] dims; } }; #endif diff --git a/src/target/metadata.h b/src/target/metadata.h index 5dc1c9d0eec5..426e8616070a 100644 --- a/src/target/metadata.h +++ b/src/target/metadata.h @@ -134,11 +134,11 @@ class InMemoryMetadataNode : public ::tvm::target::metadata::VisitableMetadataNo } private: - ::std::unique_ptr inputs_; + ::std::unique_ptr inputs_; std::vector<::tvm::runtime::metadata::TensorInfo> inputs_objs_; - ::std::unique_ptr outputs_; + ::std::unique_ptr outputs_; std::vector<::tvm::runtime::metadata::TensorInfo> outputs_objs_; - ::std::unique_ptr pools_; + ::std::unique_ptr pools_; std::vector<::tvm::runtime::metadata::TensorInfo> pools_objs_; ::std::string mod_name_; struct ::TVMMetadata storage_; @@ -186,7 +186,7 @@ class InMemoryTensorInfoNode : public ::tvm::target::metadata::VisitableTensorIn private: ::std::string name_; - ::std::unique_ptr shape_; + ::std::unique_ptr shape_; struct ::TVMTensorInfo storage_; }; From c216cbec5bb795a8b13bdb1e177b523e4f7e4ca8 Mon Sep 17 00:00:00 2001 From: ChunPing Chung Date: Fri, 20 May 2022 17:14:25 +0800 Subject: [PATCH 46/59] [Bugfix] Fix qnn.quantize type func with incomplete type (#11124) --- src/relay/qnn/op/quantize.cc | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index 
06a73ee91cbf..da33aaac8187 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -55,8 +55,23 @@ bool QuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, axis = (axis < 0) ? ((rank > 0) ? data->shape.size() + axis : 0) : axis; // If zero point and scale are scalar then axis doesnt matter. - bool scale_is_scalar = (types[1].as())->shape.size() == 0; - bool zp_is_scalar = (types[2].as())->shape.size() == 0; + bool scale_is_scalar, zp_is_scalar; + + if (auto ttype = types[1].as()) { + scale_is_scalar = ttype->shape.size() == 0; + } else { + ICHECK(types[1].as()) + << "Quantize: expect to be TensorType but get " << types[1]; + return false; + } + + if (auto ttype = types[2].as()) { + zp_is_scalar = ttype->shape.size() == 0; + } else { + ICHECK(types[2].as()) + << "Quantize: expect to be TensorType but get " << types[2]; + return false; + } if (!(scale_is_scalar && zp_is_scalar)) { ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << quantize_attrs->axis << " is out of range"; From 01b472f4d05584a669dfe2d7378fdaeeb76be378 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 21 May 2022 01:25:04 +0900 Subject: [PATCH 47/59] [CI] Update CPU and GPU image (#11369) --- Jenkinsfile | 6 +++--- jenkins/Jenkinsfile.j2 | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index dbbbb29f7972..7b8c8f890db1 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,13 +45,13 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-19T14:04:32.815769 +// Generated at 2022-05-20T18:06:10.772162 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> ci_lint = 'tlcpack/ci-lint:20220513-055910-fa834f67e' -ci_gpu = 'tlcpack/ci-gpu:20220513-055910-fa834f67e' -ci_cpu = 'tlcpack/ci-cpu:20220517-094028-de21c8f2e' +ci_gpu = 'tlcpack/ci-gpu:20220519-055908-ddfa1da69' +ci_cpu = 'tlcpack/ci-cpu:20220519-055908-ddfa1da69' ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e' ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e' ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e' diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2 index 9eac881c549a..b00ee0272626 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/jenkins/Jenkinsfile.j2 @@ -52,8 +52,8 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> ci_lint = 'tlcpack/ci-lint:20220513-055910-fa834f67e' -ci_gpu = 'tlcpack/ci-gpu:20220513-055910-fa834f67e' -ci_cpu = 'tlcpack/ci-cpu:20220517-094028-de21c8f2e' +ci_gpu = 'tlcpack/ci-gpu:20220519-055908-ddfa1da69' +ci_cpu = 'tlcpack/ci-cpu:20220519-055908-ddfa1da69' ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e' ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e' ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e' From 72a5219aad7c9b807169f74f8954580a36c1d85e Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 20 May 2022 12:20:12 -0500 Subject: [PATCH 48/59] [Schedule] Allowed typing.Tuple in tir.schedule._type_checker (#11289) * [Schedule] Allowed typing.Tuple in tir.schedule._type_checker Previously, `typing.Tuple` annotations could not be used with `tir.schedule._type_checker.type_checked` annotations. This allows `Tuple` type annotations to be type-checked. * Revert change, allow tuples input as List arguments * Suppress mypy errors Directly interacting with a type object would otherwise cause some false positives. 
* Corrected unit test for allowing tuples to be used as typing.List * Represent multi-type lists as List[Union[...]] instead of List[Any] This gives a better error message and plays nicely with _type2str, since `typing.Any` doesn't have a `__name__` field. --- python/tvm/tir/schedule/_type_checker.py | 49 ++++++- .../unittest/test_type_annotation_checker.py | 121 ++++++++++++++++++ 2 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 tests/python/unittest/test_type_annotation_checker.py diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py index 1b86c4aa30db..21ca0c5a922b 100644 --- a/python/tvm/tir/schedule/_type_checker.py +++ b/python/tvm/tir/schedule/_type_checker.py @@ -41,6 +41,13 @@ def list_(type_: Any) -> Any: return [subtype] return None + @staticmethod + def tuple_(type_: Any) -> Optional[List[type]]: + if _Subtype._origin(type_) is tuple: + subtypes = type_.__args__ + return subtypes + return None + @staticmethod def optional(type_: Any) -> Optional[List[type]]: if _Subtype._origin(type_) is Union: @@ -68,6 +75,14 @@ def list_(type_: Any) -> Optional[List[type]]: return [subtype] return None + @staticmethod + def tuple_(type_: Any) -> Optional[List[type]]: + if isinstance(type_, typing.GenericMeta): # type: ignore # pylint: disable=no-member + if type_.__name__ == "Tuple": + subtypes = type_.__args__ # type: ignore # pylint: disable=no-member + return subtypes + return None + @staticmethod def optional(type_: Any) -> Optional[List[type]]: if isinstance(type_, typing._Union): # type: ignore # pylint: disable=no-member,protected-access @@ -93,6 +108,10 @@ def _dispatcher(type_: Any) -> Tuple[str, List[type]]: if subtype is not None: return "list", subtype + subtype = _Subtype.tuple_(type_) + if subtype is not None: + return "tuple", subtype + subtype = _Subtype.optional(type_) if subtype is not None: return "optional", subtype @@ -108,6 +127,7 @@ def _dispatcher(type_: Any) -> Tuple[str, 
List[type]]: "none": lambda: "None", "atomic": lambda t: str(t.__name__), "list": lambda t: f"List[{_type2str(t)}]", + "tuple": lambda *t: f"Tuple[{', '.join([_type2str(x) for x in t])}]", "optional": lambda t: f"Optional[{_type2str(t)}]", "union": lambda *t: f"Union[{', '.join([_type2str(x) for x in t])}]", } @@ -118,11 +138,26 @@ def _type2str(type_: Any) -> str: return _TYPE2STR[key](*subtypes) +def _val2type(value: Any): + if isinstance(value, list): + types = set(_val2type(x) for x in value) + if len(types) == 1: + return List[types.pop()] # type: ignore + + return List[Union[tuple(types)]] # type: ignore + + if isinstance(value, tuple): + types = tuple(_val2type(x) for x in value) # type: ignore + return Tuple[types] + + return type(value) + + def _type_check_err(x: Any, name: str, expected: Any) -> str: return ( f'"{name}" has wrong type. ' f'Expected "{_type2str(expected)}", ' - f'but gets: "{_type2str(type(x))}"' + f'but gets: "{_type2str(_val2type(x))}"' ) @@ -142,6 +177,17 @@ def _type_check_list(v: List[Any], name: str, type_: Any) -> Optional[str]: return error_msg return None + def _type_check_tuple(v: Any, name: str, *types: Any) -> Optional[str]: + if not isinstance(v, tuple): + return _type_check_err(v, name, Tuple[types]) + if len(types) != len(v): + return _type_check_err(v, name, Tuple[types]) + for i, (x, type_) in enumerate(zip(v, types)): + error_msg = _type_check(x, f"{name}[{i}]", type_) + if error_msg is not None: + return error_msg + return None + def _type_check_optional(v: Any, name: str, type_: Any) -> Optional[str]: return None if v is None else _type_check(v, name, type_) @@ -156,6 +202,7 @@ def _type_check_union(v: Any, name: str, *types: Any) -> Optional[str]: "none": _type_check_none, "atomic": _type_check_atomic, "list": _type_check_list, + "tuple": _type_check_tuple, "optional": _type_check_optional, "union": _type_check_union, } diff --git a/tests/python/unittest/test_type_annotation_checker.py 
b/tests/python/unittest/test_type_annotation_checker.py new file mode 100644 index 000000000000..7317e05b1a75 --- /dev/null +++ b/tests/python/unittest/test_type_annotation_checker.py @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test type checker based on python's type annotations""" + +from typing import List, Tuple + +import pytest + +from tvm.tir.schedule._type_checker import type_checked + + +test_cases = [ + { + "type_annotation": int, + "positive_cases": [5], + "negative_cases": ["5"], + }, + { + "type_annotation": List[int], + "positive_cases": [ + [5], + [], + # Tuples are allowed to be used as lists, because both are + # represented in FFI as tvm::runtime::Array. 
+ (1, 2, 3), + ], + "negative_cases": [ + None, + 5, + ["5"], + ], + }, + { + "type_annotation": Tuple[int], + "positive_cases": [ + (5,), + ], + "negative_cases": [ + None, + (1, 2, 3), + [1], + 5, + ["5"], + ], + }, + { + "type_annotation": Tuple[str, int], + "positive_cases": [ + ("x", 5), + ], + "negative_cases": [ + 42, + ("x", 5, 6), + ("x", 5, "y"), + ("x", 5.0), + (None, 5), + ], + }, +] + +positive_cases = [ + (config["type_annotation"], case) for config in test_cases for case in config["positive_cases"] +] + +negative_cases = [ + (config["type_annotation"], case) for config in test_cases for case in config["negative_cases"] +] + + +def format_name(type_annotation, case): + try: + name = type_annotation.__name__ + except AttributeError: + name = str(type_annotation).replace("typing.", "") + + return f"{name}_{case}" + + +@pytest.mark.parametrize( + ["type_annotation", "case"], + positive_cases, + ids=[format_name(t, c) for t, c in positive_cases], +) +def test_matches_type(type_annotation, case): + @type_checked + def func(_: type_annotation): + pass + + func(case) + + +@pytest.mark.parametrize( + ["type_annotation", "case"], + negative_cases, + ids=[format_name(t, c) for t, c in negative_cases], +) +def test_not_matches(type_annotation, case): + @type_checked + def func(_: type_annotation): + pass + + with pytest.raises(TypeError): + func(case) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) From febae407edc0dbc0add23474fb36c29b618f3b4e Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 20 May 2022 11:20:35 -0700 Subject: [PATCH 49/59] [docs] Add lightweight docs image (#11045) * [docs] Add lightweight docs image This image includes everything necessary to build the docs without any tutorials and is just about 1.5 GB which is significantly less than the CPU/GPU images. 
* remove ci.py docs --cpu flag, imply it via a lack of --tutorials/--full so it is the default Co-authored-by: driazati --- docker/Dockerfile.docs | 77 ++++++++++++++++++++++++++++++++++++++++++ docker/build.sh | 7 +++- tests/scripts/ci.py | 33 ++++++------------ 3 files changed, 93 insertions(+), 24 deletions(-) create mode 100644 docker/Dockerfile.docs diff --git a/docker/Dockerfile.docs b/docker/Dockerfile.docs new file mode 100644 index 000000000000..840094b4d0cb --- /dev/null +++ b/docker/Dockerfile.docs @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +FROM ubuntu:18.04 + +# Base scripts +RUN apt-get update --fix-missing + +COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh +RUN bash /install/ubuntu_install_core.sh + +COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh +RUN bash /install/ubuntu1804_install_python.sh + +# Globally disable pip cache +RUN pip config set global.no-cache-dir false + +COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh +RUN bash /install/ubuntu_install_python_package.sh + +COPY install/ubuntu_install_sphinx.sh /install/ubuntu_install_sphinx.sh +RUN bash /install/ubuntu_install_sphinx.sh + +# Enable doxygen for c++ doc build +RUN apt-get update && apt-get install -y doxygen libprotobuf-dev protobuf-compiler + +COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh +RUN bash /install/ubuntu_install_java.sh + +COPY install/ubuntu_install_nodejs.sh /install/ubuntu_install_nodejs.sh +RUN bash /install/ubuntu_install_nodejs.sh + +# Rust env (build early; takes a while) +COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh +RUN bash /install/ubuntu_install_rust.sh +ENV RUSTUP_HOME /opt/rust +ENV CARGO_HOME /opt/rust +ENV PATH $PATH:$CARGO_HOME/bin + +# sccache +COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh +RUN bash /install/ubuntu_install_sccache.sh + +RUN rm -rf /opt/rust \ + /usr/lib/x86_64-linux-gnu/libopenblas* \ + /usr/lib/jvm/java-11* \ + /usr/lib/x86_64-linux-gnu/libLLVM-6.0.so.1 + +# Environment variables +ENV PATH=/usr/local/nvidia/bin:${PATH} +ENV PATH=/usr/local/cuda/bin:${PATH} +ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH} +ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH} +ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} + +# Ensure the local libcuda have higher priority than the 
/usr/local/cuda/compact +# since the compact libcuda does not work on non-Tesla gpus +ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:${LD_LIBRARY_PATH} + +ENV LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH} +ENV PATH=/node_modules/.bin:${PATH} +ENV VULKAN_SDK=/usr diff --git a/docker/build.sh b/docker/build.sh index ed67b638c79b..75f0e35c6c7b 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -22,7 +22,7 @@ # # Usage: build.sh [--tag ] # [--dockerfile ] [-it] -# [--net=host] [--cache-from ] +# [--net=host] [--cache-from ] [--cache] # [--name CONTAINER_NAME] [--context-path ] # [--spec DOCKER_IMAGE_SPEC] # [] @@ -99,6 +99,11 @@ if [[ "$1" == "--cache-from" ]]; then shift 1 fi +if [[ "$1" == "--cache" ]]; then + shift 1 + DOCKER_NO_CACHE_ARG= +fi + if [[ "$1" == "--context-path" ]]; then DOCKER_CONTEXT_PATH="$2" echo "Using custom context path: ${DOCKER_CONTEXT_PATH}" diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index d45c3b1ae9cb..b2b903ad01b1 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -141,14 +141,6 @@ def check_gpu(): ) -def check_build(): - if (REPO_ROOT / "build").exists(): - warnings.append( - "Existing build dir found may be interfering with the Docker " - "build (you may need to remove it)" - ) - - def gen_name(s: str) -> str: # random 4 letters suffix = "".join([random.choice(string.ascii_lowercase) for i in range(5)]) @@ -227,38 +219,33 @@ def docker(name: str, image: str, scripts: List[str], env: Dict[str, str], inter def docs( tutorial_pattern: Optional[str] = None, full: bool = False, - cpu: bool = False, interactive: bool = False, skip_build: bool = False, docker_image: Optional[str] = None, ) -> None: """ Build the documentation from gallery/ and docs/. By default this builds only - the Python docs. + the Python docs without any tutorials. 
arguments: - full -- Build all language docs, not just Python - precheck -- Run Sphinx precheck script - tutorial-pattern -- Regex for which tutorials to execute when building docs (can also be set via TVM_TUTORIAL_EXEC_PATTERN) - cpu -- Run with the ci-cpu image and use CMake defaults for building TVM (if no GPUs are available) + full -- Build all language docs, not just Python (this will use the 'ci_gpu' Docker image) + tutorial-pattern -- Regex for which tutorials to execute when building docs (this will use the 'ci_gpu' Docker image) skip_build -- skip build and setup scripts interactive -- start a shell after running build / test scripts docker-image -- manually specify the docker image to use """ - config = "./tests/scripts/task_config_build_gpu.sh" build_dir = get_build_dir("gpu") - if cpu and full: - clean_exit("--full cannot be used with --cpu") extra_setup = [] image = "ci_gpu" if docker_image is None else docker_image - if cpu: + if not full and tutorial_pattern is None: + # TODO: Change this to tlcpack/docs once that is uploaded image = "ci_cpu" if docker_image is None else docker_image build_dir = get_build_dir("cpu") - config = " && ".join( + config_script = " && ".join( [ - "mkdir -p build", - "pushd build", + f"mkdir -p {build_dir}", + f"pushd {build_dir}", "cp ../cmake/config.cmake .", # The docs import tvm.micro, so it has to be enabled in the build "echo set\(USE_MICRO ON\) >> config.cmake", @@ -287,9 +274,10 @@ def docs( ] else: check_gpu() + config_script = f"./tests/scripts/task_config_build_gpu.sh {build_dir}" scripts = extra_setup + [ - config + f" {build_dir}", + config_script, f"./tests/scripts/task_build.py --build-dir {build_dir}", ] @@ -307,7 +295,6 @@ def docs( "IS_LOCAL": "1", "TVM_LIBRARY_PATH": str(REPO_ROOT / build_dir), } - check_build() docker(name=gen_name("docs"), image=image, scripts=scripts, env=env, interactive=interactive) From 0274d8e1f124cecc159abf3234251bf010784581 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: 
Sat, 21 May 2022 03:33:54 +0900 Subject: [PATCH 50/59] [TIR] Support tensorization using ldmatrix + MMA (#11355) * [TIR] Support tensorization using ldmatrix + MMA commit 3218facf100b0dfc55715acfd1cee156764129ba Author: Masahiro Masuda Date: Wed May 18 14:04:56 2022 +0900 some clean up commit 7a235b69dc2023b3098ed44d591edb63b20a8f4e Author: Masahiro Masuda Date: Wed May 18 13:55:11 2022 +0900 parameterize over storage scope in mma store intrin commit 827ea4c434c35607b241f8e0ae2efe3214ac2458 Author: Masahiro Masuda Date: Wed May 18 13:37:38 2022 +0900 properly handle floordiv/mod in codegen commit 42d4c6f42182c9fd79566c0955f99cc82abd5144 Author: Masahiro Masuda Date: Wed May 18 09:53:57 2022 +0900 update tuned factors for fp16 commit 328d0aa36b2ea9ea1b051970d612bff82d2d20e6 Author: Masahiro Masuda Date: Wed May 18 08:43:30 2022 +0900 all tests working commit 5e086cf5fd1404ac38f85c4bfbe692687b45a16c Author: Masahiro Masuda Date: Wed May 18 07:48:43 2022 +0900 add doc for mma_fill and mma_store intrin commit 4f945c4116b6d3bdc965ecb2be2229bb46dc11ab Author: Masahiro Masuda Date: Wed May 18 06:39:01 2022 +0900 remove tests commit df7708f7f67761d9c18f9564bc15abd50c12ac69 Author: Masahiro Masuda Date: Tue May 17 19:52:14 2022 +0900 unified test commit 754c83eeb8510b31fb9652b089177f9b8e642ec0 Author: Masahiro Masuda Date: Tue May 17 19:36:24 2022 +0900 clean up LowerWarpmemory commit 178c3dcee7bfa17d5d93fec02aa858dc62151670 Author: Masahiro Masuda Date: Tue May 17 19:15:04 2022 +0900 Use IndexMap commit 07fb58910338c62847fd902b37801d09b8c673b0 Author: Masahiro Masuda Date: Tue May 17 17:51:44 2022 +0900 remove 16x8x8 test commit 2b05b5a5470ac221d559f31a31a8e2ff753b2414 Author: Masahiro Masuda Date: Tue May 17 17:31:35 2022 +0900 generate mma fill/store commit bf23fc50f0ffa99e875d9247ca66acec0c36677f Author: Masahiro Masuda Date: Tue May 17 12:23:30 2022 +0900 mma intrin generation with meta programming commit 5afb5f00afd642cb1e39872edc7965f476dcdcb7 Author: Masahiro Masuda 
Date: Tue May 17 05:26:14 2022 +0900 ldmatrix intrin generation with meta programming commit fb62abb3424b88ec48c697e306e05889a3ac306f Author: Masahiro Masuda Date: Mon May 16 20:30:49 2022 +0900 minor commit 5a80adce24e84d3ec6bf931b60cb9c730d243394 Author: Masahiro Masuda Date: Mon May 16 19:55:57 2022 +0900 revert some change commit e599a55078ee75f2480a721098341812db58cf6f Author: Masahiro Masuda Date: Mon May 16 19:54:18 2022 +0900 remove obsolete files commit 4b13b85ff91d0d592a7e0c01924e0b49b82f35a8 Author: Masahiro Masuda Date: Mon May 16 19:51:21 2022 +0900 wip commit 848de63455539e25cd0d43e5a65fd048636ef0f7 Author: Masahiro Masuda Date: Mon May 16 19:44:29 2022 +0900 wip commit b35bff97ed10c22559e2164eb7538db0f711ce7e Author: Masahiro Masuda Date: Mon May 16 19:31:18 2022 +0900 update parse error msg commit ad9b053ef865b1f91f03d7b15ed7aae3420ee213 Author: Masahiro Masuda Date: Mon May 16 19:26:51 2022 +0900 fix for avoiding Buffer.vload(...) case commit 54c686443e370edbfae860d0809b1b6182d26414 Author: Masahiro Masuda Date: Mon May 16 18:59:55 2022 +0900 wip commit 078060fe28d22f1db5f07b1c382dee438f02df60 Author: Masahiro Masuda Date: Mon May 16 18:57:34 2022 +0900 wip commit 576f8415e65e0e8a8a7808885e219b3b53867950 Author: Masahiro Masuda Date: Mon May 16 18:52:15 2022 +0900 wip commit 12a376ae2f44aa6660121e64e0358f2866624f7f Author: Masahiro Masuda Date: Mon May 16 17:54:58 2022 +0900 Squashed commit of the following: commit 48eef4981d1a55aaf3b0ac935f2a10347cb1ac2d Author: Masahiro Masuda Date: Mon May 16 17:40:48 2022 +0900 more comment commit 8f67fc87038834e9f7e2c5cd3dfe61fabf442206 Author: Masahiro Masuda Date: Mon May 16 17:11:27 2022 +0900 update test commit ad85036621c005b733763e67ceffae39c356ec99 Author: Masahiro Masuda Date: Mon May 16 16:54:01 2022 +0900 add test commit 4a5dc3ffd5d0bb4a1700e57897c9e0f26e3d2a88 Author: Masahiro Masuda Date: Mon May 16 16:40:47 2022 +0900 [TVMScript] Support function call to help construct AST commit 
76c1bcf0ade45d7433a0066236add8372b1cc547 Author: Masahiro Masuda Date: Mon May 16 16:30:07 2022 +0900 simplify iterator in layout transform commit 936280324ea2c91429a6a85a1b8ee89c7b825928 Author: Masahiro Masuda Date: Sat May 14 11:31:39 2022 +0900 remove obsolet files commit 2e119b422d72d726d5f2bd20fe48a1e62fcb0510 Author: Masahiro Masuda Date: Sat May 14 10:43:59 2022 +0900 calculate mma store dst index using inverse affine map commit 9489434ee52b546e2abb2ab28173eefd51525ba4 Author: Masahiro Masuda Date: Sat May 14 10:01:12 2022 +0900 simplify store commit 1adcb77b8bba8e5d91080fe6cbfc7add7f4365c2 Author: Masahiro Masuda Date: Sat May 14 09:43:40 2022 +0900 simplified fill commit 7b13c736d23e0eac94137aa918101d788e60d4f3 Author: Masahiro Masuda Date: Sat May 14 09:22:17 2022 +0900 simplify intrin desc using index map function commit bcf212dda0f94c51f55c48921f61d92fd3b83777 Author: Masahiro Masuda Date: Sat May 14 07:16:42 2022 +0900 seems to work commit dd8ccf9ec2e48100158152e5d4590d141424e2e2 Author: Masahiro Masuda Date: Sat May 14 07:11:57 2022 +0900 poking with the parser commit 596582cbfbd08ebe23ea71aaf7a447472415ccd1 Author: Masahiro Masuda Date: Fri May 13 20:04:59 2022 +0900 16x8x32 4k trans working commit 273f89a8a6ac34f7c79147563922d34d44bffd08 Author: Masahiro Masuda Date: Fri May 13 19:52:13 2022 +0900 add 16x8x16 fp16 trans commit 8e2066cc4c6e86616bc9751324e63ba81a3b02af Author: Masahiro Masuda Date: Fri May 13 19:32:37 2022 +0900 16x8x16 4k trans working commit c2d0744051733e94f840d4517bcee9ca5d444c75 Author: Masahiro Masuda Date: Fri May 13 19:25:52 2022 +0900 16x8x16 trans working commit c2e314cdda1c3a931781e51a863901ea178dffec Author: Masahiro Masuda Date: Fri May 13 16:19:32 2022 +0900 tuned int8 4k, 91 TOPS commit 94d9d965f19ff1a2ebdd342079ef420fb537b16a Author: Masahiro Masuda Date: Fri May 13 15:59:33 2022 +0900 int8 4k tune working commit 3ca8ca02593aff7540c9655aa831348246171752 Author: Masahiro Masuda Date: Fri May 13 08:43:57 2022 +0900 mma 
16x8x32 int8 working with ldmatrix b workaround commit 54f1cb731d4b42a6cbc08baf144e74646400eef5 Author: Masahiro Masuda Date: Fri May 13 18:23:27 2022 +0900 wip commit 9d2844db602dc65af4dbd06a73fdd815f486b8b9 Author: Masahiro Masuda Date: Fri May 13 16:38:53 2022 +0900 test tensorize without layout transform commit 86ee6dabc801aeb8d6917bec6de97b42025dbdd1 Author: Masahiro Masuda Date: Fri May 13 15:15:34 2022 +0900 int8 4k tensorize works commit 39f9e32c9a64222c91daba2c32969b27207a31d2 Author: Masahiro Masuda Date: Fri May 13 12:44:39 2022 +0900 begin int8 4k tune commit 6fa91e55b5ab2ba0f901d0d35be1b2fb3ab092b0 Author: Masahiro Masuda Date: Thu May 12 18:53:20 2022 +0900 try fix ldmatrix b for int8 commit 7a962cddc4799fa3df0c0fdf3c056146d3f2cbdf Author: Masahiro Masuda Date: Thu May 12 18:28:34 2022 +0900 fixed warp_coeff commit a0afb5698f307382147a38819e004a2db7f554b1 Author: Masahiro Masuda Date: Thu May 12 12:20:01 2022 +0900 wip commit f70ccd09b07d5325454ffdc39a7619ea84aa7e06 Author: Masahiro Masuda Date: Thu May 12 12:09:57 2022 +0900 int8 tensorize working commit 20321fa4674dabc78fe55b5e0e2876c35b245d21 Author: Masahiro Masuda Date: Thu May 12 07:06:22 2022 +0900 starting 16x8x32 int8 commit 441fd193c59cdc436d87ab35896cbb8c779ddf35 Author: Masahiro Masuda Date: Thu May 12 05:50:46 2022 +0900 adding fp16 accum case commit c9d40b69b1b57bfaddffba09ea07624ae90ee465 Author: Masahiro Masuda Date: Wed May 11 17:04:29 2022 +0900 clean up commit 5b2d48635e762c77c824d1c259ac8bcbcc949421 Author: Masahiro Masuda Date: Wed May 11 16:38:19 2022 +0900 16x8x16 4k tune working commit c3cb170d85600d03da5c3f4cda03552208ca0b8c Author: Masahiro Masuda Date: Wed May 11 16:20:27 2022 +0900 tensoriz fixed commit 68039b081efcdd6aea1d132940b3745f50164974 Author: Masahiro Masuda Date: Wed May 11 15:55:25 2022 +0900 begin 16x8x16 4k tune commit ced5d8d980cc267d4735957c25cb60d71ae977d2 Author: Masahiro Masuda Date: Wed May 11 15:50:11 2022 +0900 16x8x16 worked commit 
3d2c90d77c1bb2df2193e9af6cbaa2bd927a26d8 Author: Masahiro Masuda Date: Wed May 11 15:47:26 2022 +0900 fix commit 403050b03ad6b4f0ee8d45088ffb324727bbae48 Author: Masahiro Masuda Date: Wed May 11 15:45:10 2022 +0900 add 16x8x16 test commit 18e8d73661c99cd1c83021063b41a457afcb1638 Author: Masahiro Masuda Date: Wed May 11 06:50:32 2022 +0900 fixed mma store codegen for 16x8x16 commit ec81250561195705122bccb9a2372f71de68121f Author: Masahiro Masuda Date: Wed May 11 04:25:25 2022 +0900 add 16x8x16 mma store codegen commit e08df2a62a4809bcd39782949283c16e7703aa5c Author: Masahiro Masuda Date: Wed May 11 03:47:47 2022 +0900 tensorized C_warp init commit ae0678918929c1ceec73f2039467040c5bb7823b Author: Masahiro Masuda Date: Wed May 11 03:06:06 2022 +0900 mma store codegen working commit deb4d6646cc93d4cdb4f2560ce723bee4d86e144 Author: Masahiro Masuda Date: Tue May 10 19:22:57 2022 +0900 update lower warp memory commit 71fe5fe465300705fa94f9544a2e1a5070de6e0d Author: Masahiro Masuda Date: Tue May 10 09:01:42 2022 +0900 tensorizing mma store commit e80a1f148c47f2a3fac2363a733d8d4e2a2631d0 Author: Masahiro Masuda Date: Thu Apr 28 19:54:08 2022 +0900 clean up commit a9640f4b7c3c9f22b87ca74a61003438dfd8f992 Author: Masahiro Masuda Date: Thu Apr 28 19:40:55 2022 +0900 add tunable 4k test, 36 TFLOPS commit b9f7eae7041d1a9b3e434c331c874e8347e89dc4 Author: Masahiro Masuda Date: Thu Apr 28 18:01:08 2022 +0900 fixed bug in LowerWarpMemory index splitting for ldmatrix commit 00df30823f874910ed1ec1f74718100311764234 Author: Masahiro Masuda Date: Wed Apr 27 07:58:17 2022 +0900 fixed missing reverse_compute_at commit 93f9fe7e5f7ad16c8d0e6240c16c0281a0e97dec Author: Masahiro Masuda Date: Wed Apr 27 06:55:12 2022 +0900 add 4k test commit 3689ef712aa4b282a4818fa2fa2e7e349c3a5eec Author: Masahiro Masuda Date: Wed Apr 27 06:54:09 2022 +0900 temp disable high dim base indices check in tensorize commit 0c859c4f385ba0b6f9477b569b80cee80b5b7282 Author: Masahiro Masuda Date: Tue Apr 26 19:18:23 
2022 +0900 clean up commit f6aadbfcfbd73c1667a6de7aedc5894232b8e750 Author: Masahiro Masuda Date: Tue Apr 26 19:13:09 2022 +0900 Add 16x8x8 MMA + LDMatrix test commit 4cf6b20c6ca415e967ab58d80e4a77c701ad7255 Author: Masahiro Masuda Date: Tue Apr 26 18:04:17 2022 +0900 testing 16x8x8 ldmatrix tensoriation * set measure_perf to False * add requires_gpu decorator in tests, always test build on non-ampere * skip cuda compile on old gpu --- include/tvm/tir/builtin.h | 27 + python/tvm/tir/tensor_intrin/__init__.py | 1 + python/tvm/tir/tensor_intrin/cuda.py | 469 ++++++++++++++++++ src/target/source/codegen_cuda.cc | 76 ++- src/tir/op/builtin.cc | 6 + src/tir/transforms/lower_warp_memory.cc | 45 +- ...est_tir_schedule_tensorize_ldmatrix_mma.py | 422 ++++++++++++++++ 7 files changed, 1042 insertions(+), 4 deletions(-) create mode 100644 python/tvm/tir/tensor_intrin/cuda.py create mode 100644 tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index f33432645cc3..5fc42392c337 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -651,6 +651,33 @@ TVM_DLL const Op& ptx_cp_async(); TVM_DLL const Op& ptx_commit_group(); TVM_DLL const Op& ptx_wait_group(); +/*! + * \brief tvm intrinsic for storing the result of PTX MMA into a destination pointer. + * For example, if each thread in a warp of size 32 has 4 elements from the result of + * m16xn8xk16 MMA in its registers, this intrinsic can be used to store the result in a + * 16x8 region in shared or global memory. + * + * There is no real PTX instruction that does that, but we want to hide details of + * complex index manipulation behind this intrinsic to simplify TIR lowering passes (e.g. + * LowerWarpMemory). + * + * void mma_store(IntImm m, IntImm n, Var dst_ptr, Var src_ptr, Expr src_offset, Var dst_stride); + */ +TVM_DLL const Op& mma_store(); + +/*! + * \brief tvm intrinsic for zero-initializing an MMA accumulation register.
+ * For example, if each thread in a warp of size 32 has 8 elements from the A matrix in + * m16xn8xk16 MMA in its registers, this intrinsic can be used to zero-initialize its + * 4 accumulation registers. + * + * There is no real PTX instruction that does that, but we introduce this intrinsic for the + * same reason as mma_store above. + * + * void mma_fill(IntImm local_size, Var local_ptr, Expr offset); + */ +TVM_DLL const Op& mma_fill(); + // TODO(tvm-team) replace the usage of the vector operations by Shuffle. /*! * \brief Get the high level half of the vector diff --git a/python/tvm/tir/tensor_intrin/__init__.py b/python/tvm/tir/tensor_intrin/__init__.py index 4115c3b90070..a3b47ff6d5d7 100644 --- a/python/tvm/tir/tensor_intrin/__init__.py +++ b/python/tvm/tir/tensor_intrin/__init__.py @@ -20,3 +20,4 @@ from .arm_cpu import * from .dot_product_common import * from .rocm import * +from .cuda import * diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py new file mode 100644 index 000000000000..853a37735486 --- /dev/null +++ b/python/tvm/tir/tensor_intrin/cuda.py @@ -0,0 +1,469 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,missing-function-docstring +"""Intrinsics for tensorization on NVIDIA GPU.""" +from tvm.script import tir as T +from .. import IntImm, Cast +from ..._ffi import register_func +from ...runtime import convert +from .. import TensorIntrin + + +def shared_16x16_to_ldmatrix_32x8_layout(i, j): + thread_id = 4 * (i % 8) + (j % 8) // 2 + return thread_id, 4 * (j // 8) + (i // 8) * 2 + (j % 2) + + +def shared_16x32_to_ldmatrix_32x16_layout(i, j): + thread_id = 4 * (i % 8) + (j % 16) // 4 + return thread_id, 8 * (j // 16) + (i // 8) * 4 + j % 4 + + +def shared_32x16_to_ldmatrix_32x16_layout(i, j): + thread_id = (i % 4) + 4 * (j % 8) + return thread_id, 8 * (j // 8) + (i // 16) * 4 + i % 4 + + +@register_func("tir.index_map.shared_16x16_to_ldmatrix_32x8_layout") +def index_map_shared_16x16_to_ldmatrix_32x8_layout(ind): + i, j = ind[0], ind[1] + thread_id, local_id = shared_16x16_to_ldmatrix_32x8_layout(i, j) + return convert([thread_id, local_id]) + + +lift = convert + +M_DIM = 16 +N_DIM = 16 +WARP_SIZE = 32 +HALF_WARP = WARP_SIZE // 2 +HALF_WARP_expr = lift(HALF_WARP) + + +def get_ldmatrix_intrin(k_dim, dtype, is_b, transposed): + local_size = (M_DIM * k_dim) // WARP_SIZE + shared_offset = None + index_map = None + + if transposed: + assert is_b, "Transposed A matrix not supported" + + ldmatrix_col_major = is_b and not transposed + + if k_dim == 16: + assert dtype == "float16" + + index_map = shared_16x16_to_ldmatrix_32x8_layout + + if transposed: + shared_offset = ( + lambda tx, stride: stride * 8 * (tx // HALF_WARP_expr) + + stride * (tx % 8) + + 8 * ((tx % HALF_WARP_expr) // 8) + ) + else: + shared_offset = lambda tx, stride: stride * (tx % HALF_WARP_expr) + 8 * ( + tx // HALF_WARP_expr + ) + else: + assert ( + k_dim == 32 and dtype == "int8" + ), "Only k_dim == 16 (float16) or k_dim == 32 (int8) supported for now" + + if ldmatrix_col_major: + index_map = shared_32x16_to_ldmatrix_32x16_layout + # A dummy offset, ldmatrix cannot be used for 
int8 + trans case. + # We still use the ldmatrix intrinsic, but lower it to a manual loop in the codegen. + # Only the stride information is required. + shared_offset = lambda _, stride: stride + elif is_b and transposed: + index_map = shared_16x32_to_ldmatrix_32x16_layout + shared_offset = ( + lambda tx, stride: stride * 8 * (tx // HALF_WARP_expr) + + (tx % 8) * stride + + 16 * ((tx % HALF_WARP_expr) // 8) + ) + else: + index_map = shared_16x32_to_ldmatrix_32x16_layout + shared_offset = lambda tx, stride: stride * (tx % 16) + 16 * (tx // 16) + + assert index_map and shared_offset + + if is_b and not transposed: + row_dim = k_dim + col_dim = M_DIM + else: + row_dim = M_DIM + col_dim = k_dim + + shmem_shape = (row_dim, col_dim) + + @T.prim_func + def ldmatrix_desc(warp_handle: T.handle, shared_handle: T.handle) -> None: + shared = T.match_buffer( + shared_handle, shmem_shape, dtype, align=128, offset_factor=16, scope="shared" + ) + warp = T.match_buffer( + warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp" + ) + + with T.block("root"): + T.reads(shared[0:row_dim, 0:col_dim]) + T.writes(warp[0:WARP_SIZE, 0:local_size]) + + for ax0, ax1 in T.grid(row_dim, col_dim): + with T.block("shared_warp"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + T.reads(shared[v0, v1]) + + thread_id, local_id = index_map(v0, v1) + T.writes(warp[thread_id, local_id]) + warp[thread_id, local_id] = shared[v0, v1] + + @T.prim_func + def ldmatrix_impl(warp_handle: T.handle, shared_handle: T.handle) -> None: + s0 = T.var("int32") + s1 = T.var("int32") + shared = T.match_buffer( + shared_handle, + shmem_shape, + dtype, + align=128, + offset_factor=16, + scope="shared", + strides=[s0, s1], + ) + warp = T.match_buffer( + warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp" + ) + + with T.block("root"): + T.reads(shared[0:row_dim, 0:col_dim]) + T.writes(warp[0:WARP_SIZE, 0:local_size]) + tx = T.env_thread("threadIdx.x") + 
T.launch_thread(tx, WARP_SIZE) + + T.evaluate( + T.ptx_ldmatrix( + ldmatrix_col_major, + 4, # Always load 4 matrices + ".b16", + warp.data, + warp.elem_offset + lift(local_size) * tx, + shared.access_ptr("r"), + shared_offset(tx, s0), + dtype=dtype, + ) + ) + + return ldmatrix_desc, ldmatrix_impl + + +def get_mma_intrin(k_dim, out_dtype, b_transposed): + local_size = (M_DIM * k_dim) // WARP_SIZE + local_size_out = (M_DIM * N_DIM) // 32 + + index_map_C = shared_16x16_to_ldmatrix_32x8_layout + + if k_dim == 16: + index_map_A = shared_16x16_to_ldmatrix_32x8_layout + index_map_B = shared_16x16_to_ldmatrix_32x8_layout + mma_prefix = "m16n8k16" + elif k_dim == 32 and b_transposed: + index_map_A = index_map_B = shared_16x32_to_ldmatrix_32x16_layout + mma_prefix = "m16n8k32" + elif k_dim == 32 and not b_transposed: + index_map_A = shared_16x32_to_ldmatrix_32x16_layout + index_map_B = shared_32x16_to_ldmatrix_32x16_layout + mma_prefix = "m16n8k32" + else: + assert False + + out_dtype_abbrv = {"float16": "fp16", "float32": "fp32", "int32": "int32"}[out_dtype] + + if out_dtype in ["float16", "float32"]: + in_dtype = "float16" + in_dtype_abbrv = "fp16" + else: + in_dtype = "int8" + in_dtype_abbrv = "int8" + + def maybe_cast(v): + if out_dtype in ["float32", "int32"]: + return Cast(out_dtype, v) + return v + + def maybe_swap(i, j): + if b_transposed: + return j, i + return i, j + + @T.prim_func + def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer( + a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + ) + B = T.match_buffer( + b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + ) + C = T.match_buffer( + c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp" + ) + + with T.block("root"): + T.reads( + C[0:WARP_SIZE, 0:local_size_out], + A[0:WARP_SIZE, 0:local_size], + B[0:WARP_SIZE, 0:local_size], + ) + T.writes(C[0:WARP_SIZE, 0:local_size_out]) + + for i, 
j, k in T.grid(M_DIM, N_DIM, k_dim): + with T.block("C"): + i, j, k = T.axis.remap("SSR", [i, j, k]) + b_row_ind, b_col_ind = maybe_swap(k, j) + + thread_id_C, local_id_C = index_map_C(i, j) + thread_id_A, local_id_A = index_map_A(i, k) + thread_id_B, local_id_B = index_map_B(b_row_ind, b_col_ind) + + T.reads( + C[thread_id_C, local_id_C], + A[thread_id_A, local_id_A], + B[thread_id_B, local_id_B], + ) + T.writes(C[thread_id_C, local_id_C]) + + C[thread_id_C, local_id_C] += maybe_cast( + A[thread_id_A, local_id_A] + ) * maybe_cast(B[thread_id_B, local_id_B]) + + @T.prim_func + def mma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer( + a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + ) + B = T.match_buffer( + b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + ) + C = T.match_buffer( + c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp" + ) + + with T.block("root"): + T.reads( + C[0:WARP_SIZE, 0:local_size_out], + A[0:WARP_SIZE, 0:local_size], + B[0:WARP_SIZE, 0:local_size], + ) + T.writes(C[0:WARP_SIZE, 0:local_size_out]) + tx = T.env_thread("threadIdx.x") + T.launch_thread(tx, WARP_SIZE) + + T.evaluate( + T.ptx_mma( + mma_prefix, + "row", + "col", + in_dtype_abbrv, + in_dtype_abbrv, + out_dtype_abbrv, + A.data, + A.elem_offset + tx * lift(local_size), + B.data, + B.elem_offset + tx * lift(local_size), + C.data, + C.elem_offset + tx * lift(local_size_out), + False, + dtype=out_dtype, + ) + ) + + T.evaluate( + T.ptx_mma( + mma_prefix, + "row", + "col", + in_dtype_abbrv, + in_dtype_abbrv, + out_dtype_abbrv, + A.data, + A.elem_offset + tx * lift(local_size), + B.data, + B.elem_offset + tx * lift(local_size) + lift(local_size) // 2, + C.data, + C.elem_offset + tx * lift(local_size_out) + lift(local_size_out) // 2, + False, + dtype=out_dtype, + ) + ) + + return mma_sync_desc, mma_sync_impl + + +def get_mma_fill_intrin(dtype, local_size): + zero 
= IntImm("int32", 0).astype(dtype) + + # Assume M = N = 16 + index_map = shared_16x16_to_ldmatrix_32x8_layout + + @T.prim_func + def mma_fill_desc(a: T.handle) -> None: + C_warp = T.match_buffer(a, [WARP_SIZE, local_size], dtype=dtype, scope="warp") + + with T.block("root"): + T.reads() + T.writes(C_warp[0:WARP_SIZE, 0:local_size]) + for i0, i1 in T.grid(M_DIM, N_DIM): + with T.block("C_warp"): + i, j = T.axis.remap("SS", [i0, i1]) + thread_id, local_id = index_map(i, j) + T.reads() + T.writes(C_warp[thread_id, local_id]) + C_warp[thread_id, local_id] = zero + + @T.prim_func + def mma_fill_impl(a: T.handle) -> None: + C_warp = T.match_buffer( + a, [WARP_SIZE, local_size], dtype=dtype, scope="warp", offset_factor=1 + ) + + with T.block("root"): + T.reads() + T.writes(C_warp[0:WARP_SIZE, 0:local_size]) + tx = T.env_thread("threadIdx.x") + T.launch_thread(tx, WARP_SIZE) + + T.evaluate(T.mma_fill(local_size, C_warp.data, C_warp.elem_offset, dtype=dtype)) + + return mma_fill_desc, mma_fill_impl + + +def get_mma_store_intrin(dtype, local_size, scope="global"): + # Assume M = N = 16 + index_map = shared_16x16_to_ldmatrix_32x8_layout + + @T.prim_func + def mma_store_desc(a: T.handle, c: T.handle) -> None: + C_warp = T.match_buffer(a, [WARP_SIZE, local_size], dtype=dtype, scope="warp") + C = T.match_buffer(c, [M_DIM, N_DIM], dtype=dtype, scope=scope) + + with T.block("root"): + T.reads(C_warp[0:WARP_SIZE, 0:local_size]) + T.writes(C[0:M_DIM, 0:N_DIM]) + for i0, i1 in T.grid(M_DIM, N_DIM): + with T.block("C_warp"): + v0, v1 = T.axis.remap("SS", [i0, i1]) + thread_id, local_id = index_map(v0, v1) + T.reads(C_warp[thread_id, local_id]) + T.writes(C[v0, v1]) + C[v0, v1] = C_warp[thread_id, local_id] + + @T.prim_func + def mma_store_impl(a: T.handle, c: T.handle) -> None: + s0 = T.var("int32") + s1 = T.var("int32") + + C_warp = T.match_buffer( + a, [WARP_SIZE, local_size], dtype=dtype, scope="warp", offset_factor=1 + ) + C = T.match_buffer( + c, [M_DIM, N_DIM], dtype=dtype, 
scope="global", offset_factor=1, strides=[s0, s1] + ) + + with T.block("root"): + T.reads(C_warp[0:WARP_SIZE, 0:local_size]) + T.writes(C[0:M_DIM, 0:N_DIM]) + tx = T.env_thread("threadIdx.x") + T.launch_thread(tx, WARP_SIZE) + + T.evaluate( + T.mma_store( + M_DIM, + N_DIM, + C.access_ptr("w"), + C_warp.data, + C_warp.elem_offset, + s0, + dtype=dtype, + ) + ) + + return mma_store_desc, mma_store_impl + + +LDMATRIX_16x16_A_INTRIN = "mma.ldmatrix_16x16_a" +TensorIntrin.register(LDMATRIX_16x16_A_INTRIN, *get_ldmatrix_intrin(16, "float16", False, False)) + +LDMATRIX_16x16_B_INTRIN = "mma.ldmatrix_16x16_b" +TensorIntrin.register(LDMATRIX_16x16_B_INTRIN, *get_ldmatrix_intrin(16, "float16", True, False)) + +LDMATRIX_16x16_B_TRANS_INTRIN = "mma.ldmatrix_16x16_b_trans" +TensorIntrin.register( + LDMATRIX_16x16_B_TRANS_INTRIN, *get_ldmatrix_intrin(16, "float16", True, True) +) + +LDMATRIX_16x32_A_INTRIN = "mma.ldmatrix_16x32_a" +TensorIntrin.register(LDMATRIX_16x32_A_INTRIN, *get_ldmatrix_intrin(32, "int8", False, False)) + +LDMATRIX_32x16_B_INTRIN = "mma.ldmatrix_32x16_b" +TensorIntrin.register(LDMATRIX_32x16_B_INTRIN, *get_ldmatrix_intrin(32, "int8", True, False)) + +LDMATRIX_16x32_B_TRANS_INTRIN = "mma.ldmatrix_16x32_b_trans" +TensorIntrin.register(LDMATRIX_16x32_B_TRANS_INTRIN, *get_ldmatrix_intrin(32, "int8", True, True)) + +MMA_f16f16f32_INTRIN = "mma_f16f16f32" +TensorIntrin.register(MMA_f16f16f32_INTRIN, *get_mma_intrin(16, "float32", False)) + +MMA_f16f16f32_TRANS_INTRIN = "mma_f16f16f32_trans" +TensorIntrin.register(MMA_f16f16f32_TRANS_INTRIN, *get_mma_intrin(16, "float32", True)) + +MMA_f16f16f16_INTRIN = "mma_f16f16f16" +TensorIntrin.register(MMA_f16f16f16_INTRIN, *get_mma_intrin(16, "float16", False)) + +MMA_f16f16f16_TRANS_INTRIN = "mma_f16f16f16_trans" +TensorIntrin.register(MMA_f16f16f16_TRANS_INTRIN, *get_mma_intrin(16, "float16", True)) + +MMA_i8i8i32_INTRIN = "mma_i8i8i32" +TensorIntrin.register(MMA_i8i8i32_INTRIN, *get_mma_intrin(32, "int32", False)) + 
+MMA_i8i8i32_TRANS_INTRIN = "mma_i8i8i32_trans" +TensorIntrin.register(MMA_i8i8i32_TRANS_INTRIN, *get_mma_intrin(32, "int32", True)) + +MMA_fill_16x16_f32_INTRIN = "mma_fill_16x16_f32" +TensorIntrin.register(MMA_fill_16x16_f32_INTRIN, *get_mma_fill_intrin("float32", 8)) + +MMA_fill_16x16_f16_INTRIN = "mma_fill_16x16_f16" +TensorIntrin.register(MMA_fill_16x16_f16_INTRIN, *get_mma_fill_intrin("float16", 8)) + +MMA_fill_16x16_i32_INTRIN = "mma_fill_16x16_i32" +TensorIntrin.register(MMA_fill_16x16_i32_INTRIN, *get_mma_fill_intrin("int32", 8)) + +MMA_store_16x16_f32_global_INTRIN = "mma_store_16x16_f32_global_" +TensorIntrin.register( + MMA_store_16x16_f32_global_INTRIN, *get_mma_store_intrin("float32", 8, "global") +) + +MMA_store_16x16_f16_global_INTRIN = "mma_store_16x16_f16_global_" +TensorIntrin.register( + MMA_store_16x16_f16_global_INTRIN, *get_mma_store_intrin("float16", 8, "global") +) + +MMA_store_16x16_i32_global_INTRIN = "mma_store_16x16_i32_global_" +TensorIntrin.register( + MMA_store_16x16_i32_global_INTRIN, *get_mma_store_intrin("int32", 8, "global") +) diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index 7459d4c250ba..616e75f2e776 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -818,9 +819,78 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) { std::string local_ptr = this->PrintExpr(op->args[3]); std::string local_elem_offset = this->PrintExpr(op->args[4]); std::string smem_ptr = this->PrintExpr(op->args[5]); - std::string smem_elem_offset = this->PrintExpr(op->args[6]); - this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr, local_elem_offset, - smem_ptr, smem_elem_offset); + if (trans && op->dtype.bits() == 8) { + // Since ldmatrix assumes that a matrix element is 16 bit, it cannot properly transpose an + // int8 matrix. 
+ std::string smem_stride = this->PrintExpr(op->args[6]); + ICHECK(num == 4); + os << "for (int i = 0; i < 16; ++i) {\n"; + os << local_ptr << "[" + local_elem_offset + " + i] = " << smem_ptr + << "[(i % 8) / 4 * " + smem_stride + " * 16 + (threadIdx.x % 4) * 4 * " + smem_stride + + "+ (i % 4) * " + smem_stride + " + threadIdx.x / 4 + (i / 8) * 8];\n"; + os << "}\n"; + } else { + std::string smem_elem_offset = this->PrintExpr(op->args[6]); + this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr, local_elem_offset, + smem_ptr, smem_elem_offset); + } + } else if (op->op.same_as(builtin::mma_store())) { + int m = Downcast(op->args[0])->value; + int n = Downcast(op->args[1])->value; + std::string dst = this->PrintExpr(op->args[2]); + std::string src = this->PrintExpr(op->args[3]); + std::string src_offset = this->PrintExpr(op->args[4]); + PrimExpr stride = op->args[5]; + + ICHECK(m == 16 && n == 16) << "Only m == 16 && n == 16 case supported for now"; + + // Each thread in a warp holds a certain number of elements of an MMA output. + // For example, if we compute a 16x16 tile using MMA, each thread holds 8 elements + // in its registers. So conceptually, a warp memory is organized as a 32x8 block. + // A map from a 16x16 tile to a 32x8 block of memory is specified by the index map below. + + // To store the 32x8 output back to a 16x16 tile in shared or global memory, we invert this map + // to determine the output location for each 8 element. + + const auto* index_map_func = + runtime::Registry::Get("tir.index_map.shared_16x16_to_ldmatrix_32x8_layout"); + ICHECK(index_map_func); + + auto inverse_index_map = + IndexMap::FromFunc(2, *index_map_func).Inverse({Range(0, m), Range(0, n)}); + auto indices_16x16 = inverse_index_map->final_indices; + + // "//" and "%" in the index map are translated to FloorDiv/Mod, but the plain Div/Mod are fine. 
+ // FloorDiv/Mod are supposed to be lowered before they reach codegen, so manually replace them + // to the plain ones here. + class LowerFloorDivMod : public ExprMutator { + public: + PrimExpr VisitExpr_(const FloorDivNode* op) { + return tir::Div(this->VisitExpr(op->a), this->VisitExpr(op->b)); + } + PrimExpr VisitExpr_(const FloorModNode* op) { + return tir::Mod(this->VisitExpr(op->a), this->VisitExpr(op->b)); + } + }; + + auto dst_ind = LowerFloorDivMod()(indices_16x16[0] * stride + indices_16x16[1]); + + var_idmap_[inverse_index_map->initial_indices[0].get()] = "threadIdx.x"; + var_idmap_[inverse_index_map->initial_indices[1].get()] = "local_id"; + + os << "for (int local_id = 0; local_id < 8; ++local_id) {\n"; + os << dst << "[" + this->PrintExpr(dst_ind) + "]" + << " = " << src << "[" << src_offset << " + local_id];\n"; + os << "}\n"; + + } else if (op->op.same_as(builtin::mma_fill())) { + std::string num_elem = this->PrintExpr(op->args[0]); + std::string dst = this->PrintExpr(op->args[1]); + std::string dst_offset = this->PrintExpr(op->args[2]); + + os << "for (int i = 0; i < " << num_elem << "; ++i) {\n"; + os << dst << "[" << dst_offset << " + i] = 0.0;"; + os << "}\n"; } else if (op->op.same_as(builtin::ptx_cp_async())) { std::string dst = this->PrintExpr(op->args[0]); std::string dst_offset = this->PrintExpr(op->args[1]); diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 0415d1bbec9e..1871a3d7bf70 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -256,6 +256,12 @@ TIR_DEFINE_BUILTIN_FUNC(ptx_commit_group) TIR_DEFINE_BUILTIN_FUNC(ptx_wait_group) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(mma_store).set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_BUILTIN_FUNC(mma_fill).set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + TIR_DEFINE_BUILTIN_FUNC(vectorhigh) .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)); diff --git 
a/src/tir/transforms/lower_warp_memory.cc b/src/tir/transforms/lower_warp_memory.cc index 40971114d416..d8250cd09888 100644 --- a/src/tir/transforms/lower_warp_memory.cc +++ b/src/tir/transforms/lower_warp_memory.cc @@ -101,7 +101,7 @@ namespace tir { // Visitor to find m in pattern // store warp_mem[m * warp_index + (width * m) * y + x] -class WarpStoreCoeffFinder : private StmtVisitor { +class WarpStoreCoeffFinder : private StmtExprVisitor { public: WarpStoreCoeffFinder(const VarNode* buffer, Var warp_index, arith::Analyzer* analyzer) : buffer_(buffer), warp_index_(warp_index), analyzer_(analyzer) {} @@ -113,6 +113,18 @@ class WarpStoreCoeffFinder : private StmtVisitor { private: /// Visitor implementation + void VisitExpr_(const CallNode* op) final { + if (op->op.same_as(builtin::ptx_ldmatrix()) && op->args[3].as() == buffer_) { + UpdatePattern(op->args[4]); + } else if (op->op.same_as(builtin::mma_fill()) && op->args[1].as() == buffer_) { + auto* local_size = op->args[0].as(); + ICHECK(local_size) << "Integer expected for the first argument of mma_fill"; + warp_coeff_ = local_size->value; + } + + StmtExprVisitor::VisitExpr_(op); + } + void VisitStmt_(const StoreNode* op) final { LOG(FATAL) << "Unexpected use of deprecated StoreNode. 
Please use BufferStoreNode instead."; } @@ -245,6 +257,37 @@ class WarpAccessRewriter : protected StmtExprMutator { } protected: + PrimExpr RewriteIndicesAt(const CallNode* op, const std::vector& indices) { + Array new_args = op->args; + for (int i : indices) { + if (op->args[i].get() == buffer_) { + PrimExpr local_index = SplitIndexByGroup(op->args[i + 1]).first; + new_args.Set(i + 1, local_index); + } + } + return Call(op->dtype, op->op, new_args); + } + + PrimExpr VisitExpr_(const CallNode* op) override { + if (op->op.same_as(builtin::ptx_mma())) { + return RewriteIndicesAt(op, {6, 8, 10}); + } + + if (op->op.same_as(builtin::ptx_ldmatrix())) { + return RewriteIndicesAt(op, {3}); + } + + if (op->op.same_as(builtin::mma_store())) { + return RewriteIndicesAt(op, {3}); + } + + if (op->op.same_as(builtin::mma_fill())) { + return RewriteIndicesAt(op, {1}); + } + + return StmtExprMutator::VisitExpr_(op); + } + PrimExpr VisitExpr_(const VarNode* op) override { ICHECK(op != buffer_) << "Cannot access address of warp memory directly"; return StmtExprMutator::VisitExpr_(op); diff --git a/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py new file mode 100644 index 000000000000..67e8ae0ad836 --- /dev/null +++ b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py @@ -0,0 +1,422 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +import tvm +from tvm import te +from tvm.tir.tensor_intrin.cuda import ( + LDMATRIX_16x16_A_INTRIN, + LDMATRIX_16x16_B_INTRIN, + LDMATRIX_16x16_B_TRANS_INTRIN, + LDMATRIX_16x32_A_INTRIN, + LDMATRIX_32x16_B_INTRIN, + LDMATRIX_16x32_B_TRANS_INTRIN, + MMA_f16f16f32_INTRIN, + MMA_f16f16f32_TRANS_INTRIN, + MMA_f16f16f16_INTRIN, + MMA_f16f16f16_TRANS_INTRIN, + MMA_i8i8i32_INTRIN, + MMA_i8i8i32_TRANS_INTRIN, + MMA_fill_16x16_f32_INTRIN, + MMA_fill_16x16_f16_INTRIN, + MMA_fill_16x16_i32_INTRIN, + MMA_store_16x16_f32_global_INTRIN, + MMA_store_16x16_f16_global_INTRIN, + MMA_store_16x16_i32_global_INTRIN, + shared_16x16_to_ldmatrix_32x8_layout, + shared_32x16_to_ldmatrix_32x16_layout, + shared_16x32_to_ldmatrix_32x16_layout, +) +import tvm.testing +import numpy as np + + +M = 4096 +N = 4096 +K = 4096 +measure_perf = False +gflops = (N * M * K) * 2 / 1e9 + + +def matmul(m, n, k, in_dtype, out_dtype, b_transposed): + b_shape = (n, k) if b_transposed else (k, n) + a = te.placeholder((m, k), name="A", dtype=in_dtype) + b = te.placeholder(b_shape, name="B", dtype=in_dtype) + k = te.reduce_axis((0, k), name="k") + + def maybe_cast(v): + if in_dtype != out_dtype: + return tvm.tir.Cast(out_dtype, v) + return v + + def maybe_swap(i, j): + if b_transposed: + return j, i + return i, j + + c = te.compute( + (m, n), + lambda i, j: te.sum(maybe_cast(a[i, k]) * maybe_cast(b[maybe_swap(k, j)]), axis=[k]), + name="C", + ) + return (a, b, c) + + +def is_ampere_or_newer(): + arch = tvm.contrib.nvcc.get_target_compute_version() + major, _ = 
tvm.contrib.nvcc.parse_compute_version(arch) + return major >= 8 + + +def run_test( + k_inner, + in_dtype, + out_dtype, + b_transposed, + i_factors, + j_factors, + k_factors, + index_map_A, + index_map_B, + index_map_C, + ldmatrix_a_intrin, + ldmatrix_b_intrin, + mma_intrin, + mma_fill_intrin, + mma_store_intrin, +): + workload = te.create_prim_func(matmul(M, N, K, in_dtype, out_dtype, b_transposed)) + ir_module = tvm.IRModule({"main": workload}) + sch = tvm.tir.Schedule(ir_module) + + block = sch.get_block("C") + i, j, k = sch.get_loops(block) + i, i_tc = sch.split(i, factors=[None, 16]) + j, j_tc = sch.split(j, factors=[None, 16]) + k, k_tc = sch.split(k, factors=[None, k_inner]) + + sch.reorder(i, j, k, i_tc, j_tc, k_tc) + + block_inner = sch.blockize(i_tc) + block_outer, block_inner = block_inner, block + + num_ty = i_factors[2] * j_factors[2] + + i0, i1, i2, i3, i4 = sch.split(i, factors=i_factors) + j0, j1, j2, j3, j4 = sch.split(j, factors=j_factors) + k0, k1, k2 = sch.split(k, k_factors) + + sch.reorder(i0, j0, i1, j1, j2, i2, k0, k1, i3, j3, k2, i4, j4) + + block_idx = sch.fuse(i0, j0) + block_idy = sch.fuse(i1, j1) + thread_idy = sch.fuse(j2, i2) + sch.bind(block_idx, "blockIdx.x") + sch.bind(block_idy, "blockIdx.y") + sch.bind(thread_idy, "threadIdx.y") + + def fetch_to_shared(block, idx, ndim): + block_read = sch.cache_read(block, idx, "shared") + sch.compute_at(block_read, k0) + vector_size = 16 if in_dtype == "int8" else 8 + warp_size = 32 + fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) + _, f_1, f_2, f_3 = sch.split(fused, factors=[None, num_ty, warp_size, vector_size]) + sch.bind(f_2, "threadIdx.x") + sch.bind(f_1, "threadIdx.y") + sch.vectorize(f_3) + offset = 8 if in_dtype == "float16" else 16 + sch.storage_align(block_read, 0, axis=-2, factor=32, offset=offset) + + return block_read + + fetch_to_shared(block_outer, 0, 2) + fetch_to_shared(block_outer, 1, 2) + + A_warp = sch.cache_read(block_outer, 0, "warp") + B_warp = 
sch.cache_read(block_outer, 1, "warp") + + sch.compute_at(A_warp, k1) + sch.compute_at(B_warp, k1) + + C_warp = sch.cache_write(block_outer, 0, "warp") + sch.reverse_compute_at(C_warp, thread_idy) + + ii, jj = sch.get_loops(C_warp)[-2:] + io, ii = sch.split(ii, factors=[None, 16]) + jo, ji = sch.split(jj, factors=[None, 16]) + sch.reorder(io, jo, ii, ji) + + sch.decompose_reduction(block_outer, sch.get_loops(block_outer)[3]) + block_init_c = sch.get_block("C_init") + + def tile_wmma_fragment(block_read, height, width): + i, j = sch.get_loops(block_read)[-2:] + i0, i1 = sch.split(i, factors=[None, height]) + j0, j1 = sch.split(j, factors=[None, width]) + sch.reorder(i0, j0, i1, j1) + return i1 + + loop_a = tile_wmma_fragment(A_warp, 16, k_inner) + + if b_transposed: + loop_b = tile_wmma_fragment(B_warp, 16, k_inner) + else: + loop_b = tile_wmma_fragment(B_warp, k_inner, 16) + + sch.transform_layout(A_warp, 0, "write", index_map_A) + sch.transform_layout(B_warp, 0, "write", index_map_B) + sch.transform_layout(C_warp, 0, "read", index_map_C) + + sch.tensorize(loop_a, ldmatrix_a_intrin) + sch.tensorize(loop_b, ldmatrix_b_intrin) + sch.tensorize(sch.get_loops(block_inner)[-3], mma_intrin) + sch.tensorize(sch.get_loops(block_init_c)[-2], mma_fill_intrin) + sch.tensorize(sch.get_loops(C_warp)[-2], mma_store_intrin) + + if not is_ampere_or_newer(): + return None + + f = tvm.build(sch.mod["main"], target="cuda", name="dense") + + dev = tvm.device("cuda", 0) + + if in_dtype == "float16": + a_np = np.random.uniform(size=(M, K)).astype("float16") + + if b_transposed: + b_np = np.random.uniform(size=(N, K)).astype("float16") + c_np = np.dot(a_np.astype("float32"), b_np.astype("float32").transpose()).astype( + out_dtype + ) + else: + b_np = np.random.uniform(size=(K, N)).astype("float16") + c_np = np.dot(a_np.astype("float32"), b_np.astype("float32")).astype(out_dtype) + else: + a_np = np.random.randint(-128, 128, (M, K)).astype("int8") + + if b_transposed: + b_np = 
np.random.randint(-128, 128, (N, K)).astype("int8") + c_np = np.dot(a_np.astype("float32"), b_np.astype("float32").transpose()).astype( + "int32" + ) + else: + b_np = np.random.randint(-128, 128, (K, N)).astype("int8") + c_np = np.dot(a_np.astype("float32"), b_np.astype("float32")).astype("int32") + + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros((M, N), dtype=out_dtype), dev) + + f(a, b, c) + + if out_dtype != "float16": + # The numpy reference is computed with fp32 precision (otherwise too slow). + # So there is non-trivial accuracy difference if TVM result is computed with fp16 accumulation. + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3) + + return lambda: f.time_evaluator(f.entry_name, dev, number=500)(a, b, c) + + +@tvm.testing.requires_cuda +def test_f16f16f32_m16n16k16(): + def index_map(i, j): + return ( + i // 16, + j // 16, + *shared_16x16_to_ldmatrix_32x8_layout(i % 16, j % 16), + ) + + k_inner = 16 + in_dtype = "float16" + out_dtype = "float32" + i_factors, j_factors, k_factors = [4, 8, 2, 4, 1], [1, 64, 2, 1, 2], [128, 2, 1] + + timer = run_test( + k_inner, + in_dtype, + out_dtype, + False, # b_transposed + i_factors, + j_factors, + k_factors, + index_map, + index_map, + index_map, + LDMATRIX_16x16_A_INTRIN, + LDMATRIX_16x16_B_INTRIN, + MMA_f16f16f32_INTRIN, + MMA_fill_16x16_f32_INTRIN, + MMA_store_16x16_f32_global_INTRIN, + ) + + if measure_perf and timer: + print("f16f16f32_m16n16k16: %f GFLOPS" % (gflops / (timer().mean))) + + timer = run_test( + k_inner, + in_dtype, + out_dtype, + True, # b_transposed + i_factors, + j_factors, + k_factors, + index_map, + index_map, + index_map, + LDMATRIX_16x16_A_INTRIN, + LDMATRIX_16x16_B_TRANS_INTRIN, + MMA_f16f16f32_TRANS_INTRIN, + MMA_fill_16x16_f32_INTRIN, + MMA_store_16x16_f32_global_INTRIN, + ) + + if measure_perf and timer: + print("f16f16f32_m16n16k16_trans: %f GFLOPS" % (gflops / (timer().mean))) + + +@tvm.testing.requires_cuda +def 
test_f16f16f16_m16n16k16(): + def index_map(i, j): + return ( + i // 16, + j // 16, + *shared_16x16_to_ldmatrix_32x8_layout(i % 16, j % 16), + ) + + k_inner = 16 + in_dtype = "float16" + out_dtype = "float16" + i_factors, j_factors, k_factors = [16, 2, 1, 4, 2], [16, 2, 2, 1, 4], [128, 2, 1] + + timer = run_test( + k_inner, + in_dtype, + out_dtype, + False, # b_transposed + i_factors, + j_factors, + k_factors, + index_map, + index_map, + index_map, + LDMATRIX_16x16_A_INTRIN, + LDMATRIX_16x16_B_INTRIN, + MMA_f16f16f16_INTRIN, + MMA_fill_16x16_f16_INTRIN, + MMA_store_16x16_f16_global_INTRIN, + ) + + if measure_perf and timer: + print("f16f16f16_m16n16k16: %f GFLOPS" % (gflops / (timer().mean))) + + timer = run_test( + k_inner, + in_dtype, + out_dtype, + True, # b_transposed + i_factors, + j_factors, + k_factors, + index_map, + index_map, + index_map, + LDMATRIX_16x16_A_INTRIN, + LDMATRIX_16x16_B_TRANS_INTRIN, + MMA_f16f16f16_TRANS_INTRIN, + MMA_fill_16x16_f16_INTRIN, + MMA_store_16x16_f16_global_INTRIN, + ) + + if measure_perf and timer: + print("f16f16f16_m16n16k16_trans: %f GFLOPS" % (gflops / (timer().mean))) + + +@tvm.testing.requires_cuda +def test_i8i8i32_m16n16k32(): + def index_map_A(i, j): + return ( + i // 16, + j // 32, + *shared_16x32_to_ldmatrix_32x16_layout(i % 16, j % 32), + ) + + def index_map_B(i, j): + return ( + i // 32, + j // 16, + *shared_32x16_to_ldmatrix_32x16_layout(i % 32, j % 16), + ) + + def index_map_C(i, j): + return ( + i // 16, + j // 16, + *shared_16x16_to_ldmatrix_32x8_layout(i % 16, j % 16), + ) + + k_inner = 32 + in_dtype = "int8" + out_dtype = "int32" + i_factors, j_factors, k_factors = [1, 32, 1, 4, 2], [8, 4, 4, 2, 1], [32, 2, 2] + + timer = run_test( + k_inner, + in_dtype, + out_dtype, + False, # b_transposed + i_factors, + j_factors, + k_factors, + index_map_A, + index_map_B, + index_map_C, + LDMATRIX_16x32_A_INTRIN, + LDMATRIX_32x16_B_INTRIN, + MMA_i8i8i32_INTRIN, + MMA_fill_16x16_i32_INTRIN, + 
MMA_store_16x16_i32_global_INTRIN, + ) + + if measure_perf and timer: + print("i8i8i32_m16n16k32: %f GOPS" % (gflops / (timer().mean))) + + timer = run_test( + k_inner, + in_dtype, + out_dtype, + True, # b_transposed + i_factors, + j_factors, + k_factors, + index_map_A, + index_map_A, + index_map_C, + LDMATRIX_16x32_A_INTRIN, + LDMATRIX_16x32_B_TRANS_INTRIN, + MMA_i8i8i32_TRANS_INTRIN, + MMA_fill_16x16_i32_INTRIN, + MMA_store_16x16_i32_global_INTRIN, + ) + + if measure_perf and timer: + print("i8i8i32_m16n16k32_trans: %f GOPS" % (gflops / (timer().mean))) + + +if __name__ == "__main__": + test_f16f16f32_m16n16k16() + test_f16f16f16_m16n16k16() + test_i8i8i32_m16n16k32() From 85e42b6af38ea3bd0c99c8208d7baed5086a8959 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 20 May 2022 15:09:19 -0500 Subject: [PATCH 51/59] [skip ci] Fix scipy intersphinx link (#11399) Follow-up from https://github.com/apache/tvm/pull/10181, as the URL has changed again in https://github.com/scipy/scipy/pull/16221. From [this comment](https://github.com/scipy/scipy/issues/14267#issuecomment-1034196161), the `html-scipyorg` portion wasn't intended to be part of the URL. This should resolve the HTTP 404 occurring in `Docs: GPU` step (e.g. 
[here](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-11269/13/pipeline/405#step-975-log-73)), by accessing `https://docs.scipy.org/doc/scipy-1.8.0/objects.inv` instead of `https://docs.scipy.org/doc/scipy-1.8.0/html-scipyorg/objects.inv` --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index da31c3a4243c..400d959bade6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -203,7 +203,7 @@ def git_describe_version(original_version): intersphinx_mapping = { "python": ("https://docs.python.org/{.major}".format(sys.version_info), None), # "numpy": ("https://numpy.org/doc/stable", None), - "scipy": ("https://docs.scipy.org/doc/scipy-1.8.0/html-scipyorg/", None), + "scipy": ("https://docs.scipy.org/doc/scipy-1.8.0/", None), # "matplotlib": ("https://matplotlib.org/", None), } From 50997035befc0383dcba21808ab739d9ed8df08c Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 20 May 2022 16:09:01 -0700 Subject: [PATCH 52/59] [ci] Restructure Jenkinsfile (#11380) Co-authored-by: driazati --- .gitattributes | 2 + Jenkinsfile | 278 +++++------- jenkins/Build.groovy.j2 | 186 ++++++++ jenkins/Deploy.groovy.j2 | 71 +++ jenkins/DockerBuild.groovy.j2 | 158 +++++++ jenkins/Jenkinsfile.j2 | 812 +--------------------------------- jenkins/Lint.groovy.j2 | 18 + jenkins/Prepare.groovy.j2 | 133 ++++++ jenkins/README.md | 28 ++ jenkins/Test.groovy.j2 | 236 ++++++++++ tests/lint/rat-excludes | 8 + 11 files changed, 977 insertions(+), 953 deletions(-) create mode 100644 .gitattributes create mode 100644 jenkins/Build.groovy.j2 create mode 100644 jenkins/Deploy.groovy.j2 create mode 100644 jenkins/DockerBuild.groovy.j2 create mode 100644 jenkins/Lint.groovy.j2 create mode 100644 jenkins/Prepare.groovy.j2 create mode 100644 jenkins/README.md create mode 100644 jenkins/Test.groovy.j2 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000000..29e2373f30ff 
--- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +Jenkinsfile linguist-generated=true + diff --git a/Jenkinsfile b/Jenkinsfile index 7b8c8f890db1..0b64f9306844 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-05-20T18:06:10.772162 +// Generated at 2022-05-20T13:24:01.371704 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -86,6 +86,20 @@ docker_build = 'docker/build.sh' max_time = 180 rebuild_docker_images = false +// skips builds from branch indexing; sourced from https://www.jvt.me/posts/2020/02/23/jenkins-multibranch-skip-branch-index/ +// execute this before anything else, including requesting any time on an agent +if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { + print "INFO: Build skipped due to trigger being Branch Indexing" + currentBuild.result = 'ABORTED' // optional, gives a better hint to the user that it's been skipped, rather than the default which shows it's successful + return +} + +// Filenames for stashing between build and test steps +s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}" + +// General note: Jenkins has limits on the size of a method (or top level code) +// that are pretty strict, so most usage of groovy methods in these templates +// are purely to satisfy the JVM def per_exec_ws(folder) { return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder } @@ -183,146 +197,52 @@ def should_skip_ci(pr_number) { return git_skip_ci_code == 0 } -// skips builds from branch indexing; sourced from https://www.jvt.me/posts/2020/02/23/jenkins-multibranch-skip-branch-index/ -// execute this before anything else, including requesting any time on an agent -if 
(currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { - print "INFO: Build skipped due to trigger being Branch Indexing" - currentBuild.result = 'ABORTED' // optional, gives a better hint to the user that it's been skipped, rather than the default which shows it's successful - return -} - -cancel_previous_build() - -def lint() { -stage('Lint') { - parallel( - 'Lint 1 of 2': { +def prepare() { + stage('Prepare') { node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") { init_git() - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'TVM_NUM_SHARDS=2', - 'TVM_SHARD_INDEX=0'], { - ci_arm = params.ci_arm_param ?: ci_arm - ci_cpu = params.ci_cpu_param ?: ci_cpu - ci_gpu = params.ci_gpu_param ?: ci_gpu - ci_hexagon = params.ci_hexagon_param ?: ci_hexagon - ci_i386 = params.ci_i386_param ?: ci_i386 - ci_lint = params.ci_lint_param ?: ci_lint - ci_qemu = params.ci_qemu_param ?: ci_qemu - ci_wasm = params.ci_wasm_param ?: ci_wasm - - sh (script: """ - echo "Docker images being used in this build:" - echo " ci_arm = ${ci_arm}" - echo " ci_cpu = ${ci_cpu}" - echo " ci_gpu = ${ci_gpu}" - echo " ci_hexagon = ${ci_hexagon}" - echo " ci_i386 = ${ci_i386}" - echo " ci_lint = ${ci_lint}" - echo " ci_qemu = ${ci_qemu}" - echo " ci_wasm = ${ci_wasm}" - """, label: 'Docker image names') - - is_docs_only_build = sh ( - returnStatus: true, - script: './tests/scripts/git_change_docs.sh', - label: 'Check for docs only changes', - ) - skip_ci = should_skip_ci(env.CHANGE_ID) - skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID) - rebuild_docker_images = sh ( - returnStatus: true, - script: './tests/scripts/git_change_docker.sh', - label: 'Check for any docker changes', - ) - if (skip_ci) { - // Don't rebuild when skipping CI - rebuild_docker_images = false - } - if (rebuild_docker_images) { - // Exit before linting so we can use the newly created Docker images - // to run the 
lint - return - } - sh ( - script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", - label: 'Run lint', - ) - }) - } - } - } - }, - 'Lint 2 of 2': { - node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") { - init_git() - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'TVM_NUM_SHARDS=2', - 'TVM_SHARD_INDEX=1'], { - ci_arm = params.ci_arm_param ?: ci_arm - ci_cpu = params.ci_cpu_param ?: ci_cpu - ci_gpu = params.ci_gpu_param ?: ci_gpu - ci_hexagon = params.ci_hexagon_param ?: ci_hexagon - ci_i386 = params.ci_i386_param ?: ci_i386 - ci_lint = params.ci_lint_param ?: ci_lint - ci_qemu = params.ci_qemu_param ?: ci_qemu - ci_wasm = params.ci_wasm_param ?: ci_wasm - - sh (script: """ - echo "Docker images being used in this build:" - echo " ci_arm = ${ci_arm}" - echo " ci_cpu = ${ci_cpu}" - echo " ci_gpu = ${ci_gpu}" - echo " ci_hexagon = ${ci_hexagon}" - echo " ci_i386 = ${ci_i386}" - echo " ci_lint = ${ci_lint}" - echo " ci_qemu = ${ci_qemu}" - echo " ci_wasm = ${ci_wasm}" - """, label: 'Docker image names') - - is_docs_only_build = sh ( - returnStatus: true, - script: './tests/scripts/git_change_docs.sh', - label: 'Check for docs only changes', - ) - skip_ci = should_skip_ci(env.CHANGE_ID) - skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID) - rebuild_docker_images = sh ( - returnStatus: true, - script: './tests/scripts/git_change_docker.sh', - label: 'Check for any docker changes', - ) - if (skip_ci) { - // Don't rebuild when skipping CI - rebuild_docker_images = false - } - if (rebuild_docker_images) { - // Exit before linting so we can use the newly created Docker images - // to run the lint - return - } - sh ( - script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", - label: 'Run lint', - ) - }) + ci_arm = params.ci_arm_param ?: ci_arm + ci_cpu = params.ci_cpu_param ?: ci_cpu + ci_gpu = params.ci_gpu_param ?: ci_gpu + ci_hexagon = params.ci_hexagon_param ?: ci_hexagon + ci_i386 = params.ci_i386_param ?: 
ci_i386 + ci_lint = params.ci_lint_param ?: ci_lint + ci_qemu = params.ci_qemu_param ?: ci_qemu + ci_wasm = params.ci_wasm_param ?: ci_wasm + + sh (script: """ + echo "Docker images being used in this build:" + echo " ci_arm = ${ci_arm}" + echo " ci_cpu = ${ci_cpu}" + echo " ci_gpu = ${ci_gpu}" + echo " ci_hexagon = ${ci_hexagon}" + echo " ci_i386 = ${ci_i386}" + echo " ci_lint = ${ci_lint}" + echo " ci_qemu = ${ci_qemu}" + echo " ci_wasm = ${ci_wasm}" + """, label: 'Docker image names') + + is_docs_only_build = sh ( + returnStatus: true, + script: './tests/scripts/git_change_docs.sh', + label: 'Check for docs only changes', + ) + skip_ci = should_skip_ci(env.CHANGE_ID) + skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID) + rebuild_docker_images = sh ( + returnStatus: true, + script: './tests/scripts/git_change_docker.sh', + label: 'Check for any docker changes', + ) + if (skip_ci) { + // Don't rebuild when skipping CI + rebuild_docker_images = false } } } - }, - ) -} + } } - -// [note: method size] -// This has to be extracted into a method due to JVM limitations on the size of -// a method (so the code can't all be inlined) -lint() - def build_image(image_name) { hash = sh( returnStdout: true, @@ -378,7 +298,7 @@ def build_image(image_name) { ) } -if (rebuild_docker_images) { +def build_docker_images() { stage('Docker Image Build') { // TODO in a follow up PR: Find ecr tag and use in subsequent builds parallel 'ci-lint': { @@ -481,11 +401,46 @@ def make(docker_type, path, make_flag) { } } } - -// Filenames for stashing between build and test steps -s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}" - - +def lint() { + stage('Lint') { + parallel( + 'Lint 1 of 2': { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") { + init_git() + timeout(time: max_time, unit: 'MINUTES') { + withEnv([ + 'TVM_NUM_SHARDS=2', + 'TVM_SHARD_INDEX=0'], { + sh ( + script: "${docker_run} ${ci_lint} 
./tests/scripts/task_lint.sh", + label: 'Run lint', + ) + }) + } + } + } + }, + 'Lint 2 of 2': { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") { + init_git() + timeout(time: max_time, unit: 'MINUTES') { + withEnv([ + 'TVM_NUM_SHARDS=2', + 'TVM_SHARD_INDEX=1'], { + sh ( + script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", + label: 'Run lint', + ) + }) + } + } + } + }, + ) + } +} def ci_setup(image) { sh ( script: "${docker_run} ${image} ./tests/scripts/task_ci_setup.sh", @@ -529,7 +484,6 @@ def add_microtvm_permissions() { ) } - def build() { stage('Build') { environment { @@ -771,10 +725,6 @@ stage('Build') { ) } } - -// [note: method size] -build() - def test() { stage('Test') { environment { @@ -1845,10 +1795,6 @@ stage('Test') { ) } } - -// [note: method size] -test() - /* stage('Build packages') { parallel 'conda CPU': { @@ -1907,11 +1853,13 @@ def deploy_docs() { } } -stage('Deploy') { - if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') { - node('CPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") { - sh( + +def deploy() { + stage('Deploy') { + if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') { + node('CPU') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") { + sh( script: """ set -eux aws s3 cp --no-progress s3://${s3_prefix}/docs/docs.tgz docs.tgz @@ -1920,8 +1868,26 @@ stage('Deploy') { label: 'Download artifacts from S3', ) - deploy_docs() + deploy_docs() + } } } } } + + +cancel_previous_build() + +prepare() + +if (rebuild_docker_images) { + build_docker_images() +} + +lint() + +build() + +test() + +deploy() diff --git a/jenkins/Build.groovy.j2 b/jenkins/Build.groovy.j2 new file mode 100644 index 000000000000..c1715949175b --- /dev/null +++ b/jenkins/Build.groovy.j2 @@ -0,0 +1,186 @@ +def ci_setup(image) { + sh ( + script: "${docker_run} ${image} ./tests/scripts/task_ci_setup.sh", + label: 'Set up CI environment', + ) +} + +def 
python_unittest(image) { + sh ( + script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh", + label: 'Run Python unit tests', + ) +} + +def fsim_test(image) { + sh ( + script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", + label: 'Run VTA tests in FSIM', + ) +} + +def cmake_build(image, path, make_flag) { + sh ( + script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod", + label: 'Run cmake build', + ) +} + +def cpp_unittest(image) { + sh ( + script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh", + label: 'Build and run C++ tests', + ) +} + + +def add_microtvm_permissions() { + {% for folder in microtvm_template_projects %} + sh( + script: 'find {{ folder }} -type f | grep qemu-hack | xargs chmod +x', + label: 'Add execute permissions for microTVM files', + ) + {% endfor %} +} + +def build() { +stage('Build') { + environment { + SKIP_SLOW_TESTS = "${skip_slow_tests}" + } + parallel( + 'BUILD: GPU': { + if (!skip_ci) { + node('CPU-SMALL') { + ws({{ m.per_exec_ws('tvm/build-gpu') }}) { + init_git() + sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build" + make("${ci_gpu} --no-gpu", 'build', '-j2') + {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} + + // compiler test + sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2" + make("${ci_gpu} --no-gpu", 'build2', '-j2') + {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }} + } + } + } + }, + 'BUILD: CPU': { + if (!skip_ci && is_docs_only_build != 1) { + node('CPU-SMALL') { + ws({{ m.per_exec_ws('tvm/build-cpu') }}) { + init_git() + sh ( + script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build", + label: 'Create CPU cmake config', + ) + make(ci_cpu, 'build', '-j2') + {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} + 
timeout(time: max_time, unit: 'MINUTES') { + ci_setup(ci_cpu) + // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" + // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch + sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test') + } + } + } + } else { + Utils.markStageSkippedForConditional('BUILD: CPU') + } + }, + 'BUILD: WASM': { + if (!skip_ci && is_docs_only_build != 1) { + node('CPU-SMALL') { + ws({{ m.per_exec_ws('tvm/build-wasm') }}) { + init_git() + sh ( + script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build", + label: 'Create WASM cmake config', + ) + make(ci_wasm, 'build', '-j2') + cpp_unittest(ci_wasm) + timeout(time: max_time, unit: 'MINUTES') { + ci_setup(ci_wasm) + sh ( + script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh", + label: 'Run WASM lint and tests', + ) + } + } + } + } else { + Utils.markStageSkippedForConditional('BUILD: WASM') + } + }, + 'BUILD: i386': { + if (!skip_ci && is_docs_only_build != 1) { + node('CPU-SMALL') { + ws({{ m.per_exec_ws('tvm/build-i386') }}) { + init_git() + sh ( + script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build", + label: 'Create i386 cmake config', + ) + make(ci_i386, 'build', '-j2') + {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }} + } + } + } else { + Utils.markStageSkippedForConditional('BUILD: i386') + } + }, + 'BUILD: arm': { + if (!skip_ci && is_docs_only_build != 1) { + node('ARM') { + ws({{ m.per_exec_ws('tvm/build-arm') }}) { + init_git() + sh ( + script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build", + label: 'Create ARM cmake config', + ) + make(ci_arm, 'build', '-j4') + {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }} + } + } + } else { + Utils.markStageSkippedForConditional('BUILD: arm') + } + }, + 'BUILD: QEMU': { + if (!skip_ci && is_docs_only_build != 1) { + node('CPU-SMALL') { + ws({{ 
m.per_exec_ws('tvm/build-qemu') }}) { + init_git() + sh ( + script: "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh build", + label: 'Create QEMU cmake config', + ) + make(ci_qemu, 'build', '-j2') + {{ m.upload_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }} + } + } + } else { + Utils.markStageSkippedForConditional('BUILD: QEMU') + } + }, + 'BUILD: Hexagon': { + if (!skip_ci && is_docs_only_build != 1) { + node('CPU-SMALL') { + ws({{ m.per_exec_ws('tvm/build-hexagon') }}) { + init_git() + sh ( + script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build", + label: 'Create Hexagon cmake config', + ) + make(ci_hexagon, 'build', '-j2') + {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib) }} + } + } + } else { + Utils.markStageSkippedForConditional('BUILD: Hexagon') + } + }, + ) +} +} diff --git a/jenkins/Deploy.groovy.j2 b/jenkins/Deploy.groovy.j2 new file mode 100644 index 000000000000..917f71ded1ff --- /dev/null +++ b/jenkins/Deploy.groovy.j2 @@ -0,0 +1,71 @@ +/* +stage('Build packages') { + parallel 'conda CPU': { + node('CPU') { + sh "${docker_run} tlcpack/conda-cpu ./conda/build_cpu.sh + } + }, + 'conda cuda': { + node('CPU') { + sh "${docker_run} tlcpack/conda-cuda90 ./conda/build_cuda.sh + sh "${docker_run} tlcpack/conda-cuda100 ./conda/build_cuda.sh + } + } +// Here we could upload the packages to anaconda for releases +// and/or the main branch +} +*/ + +def deploy_docs() { + // Note: This code must stay in the Jenkinsfile to ensure that it runs + // from a trusted context only + sh( + script: ''' + set -eux + rm -rf tvm-site + git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site + cd tvm-site + git status + git checkout -B $DOCS_DEPLOY_BRANCH + + rm -rf docs + mkdir -p docs + tar xf ../docs.tgz -C docs + COMMIT=$(cat docs/commit_hash) + git add . 
+ git config user.name tvm-bot + git config user.email 95660001+tvm-bot@users.noreply.github.com + git commit -m"deploying docs (apache/tvm@$COMMIT)" + git status + ''', + label: 'Unpack docs and update tvm-site' + ) + + withCredentials([string( + credentialsId: 'docs-push-token', + variable: 'GITHUB_TOKEN', + )]) { + sh( + script: ''' + cd tvm-site + git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git + git push deploy $DOCS_DEPLOY_BRANCH + ''', + label: 'Upload docs to apache/tvm-site' + ) + } +} + + +def deploy() { + stage('Deploy') { + if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') { + node('CPU') { + ws({{ m.per_exec_ws('tvm/deploy-docs') }}) { + {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }} + deploy_docs() + } + } + } + } +} diff --git a/jenkins/DockerBuild.groovy.j2 b/jenkins/DockerBuild.groovy.j2 new file mode 100644 index 000000000000..84bb8e3e376d --- /dev/null +++ b/jenkins/DockerBuild.groovy.j2 @@ -0,0 +1,158 @@ +def build_image(image_name) { + hash = sh( + returnStdout: true, + script: 'git log -1 --format=\'%h\'' + ).trim() + def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}" + sh( + script: "${docker_build} ${image_name} --spec ${full_name}", + label: 'Build docker image' + ) + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + // Use a credential so Jenkins knows to scrub the AWS account ID which is nice + // (but so we don't have to rely it being hardcoded in Jenkins) + withCredentials([string( + credentialsId: 'aws-account-id', + variable: '_ACCOUNT_ID_DO_NOT_USE', + )]) { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2']) { + sh( + script: ''' + set -x + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin 
$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -x + docker tag ${full_name} \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name} + docker push \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name} + """, + label: 'Upload image to ECR' + ) + } + } + } finally { + sh( + script: 'rm -f ~/.docker/config.json', + label: 'Clean up login credentials' + ) + } + sh( + script: "docker rmi ${full_name}", + label: 'Remove docker image' + ) +} + +def build_docker_images() { + stage('Docker Image Build') { + // TODO in a follow up PR: Find ecr tag and use in subsequent builds + parallel 'ci-lint': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + build_image('ci_lint') + } + } + }, 'ci-cpu': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + build_image('ci_cpu') + } + } + }, 'ci-gpu': { + node('GPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + build_image('ci_gpu') + } + } + }, 'ci-qemu': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + build_image('ci_qemu') + } + } + }, 'ci-i386': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + build_image('ci_i386') + } + } + }, 'ci-arm': { + node('ARM') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + build_image('ci_arm') + } + } + }, 'ci-wasm': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + build_image('ci_wasm') + } + } + }, 'ci-hexagon': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + build_image('ci_hexagon') + } + } + } + } + // // TODO: Once we are able to use the built images, enable this step + // // If the docker images changed, we need to run the image build before the lint + // // can run since it requires a base docker image. 
Most of the time the images + // // aren't build though so it's faster to use the same node that checks for + // // docker changes to run the lint in the usual case. + // stage('Sanity Check (re-run)') { + // timeout(time: max_time, unit: 'MINUTES') { + // node('CPU') { + // ws({{ m.per_exec_ws('tvm/sanity') }}) { + // init_git() + // sh ( + // script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", + // label: 'Run lint', + // ) + // } + // } + // } + // } +} + +// Run make. First try to do an incremental make from a previous workspace in hope to +// accelerate the compilation. If something is wrong, clean the workspace and then +// build from scratch. +def make(docker_type, path, make_flag) { + timeout(time: max_time, unit: 'MINUTES') { + try { + cmake_build(docker_type, path, make_flag) + // always run cpp test when build + } catch (hudson.AbortException ae) { + // script exited due to user abort, directly throw instead of retry + if (ae.getMessage().contains('script returned exit code 143')) { + throw ae + } + echo 'Incremental compilation failed. 
Fall back to build from scratch' + sh ( + script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}", + label: 'Clear old cmake workspace', + ) + cmake_build(docker_type, path, make_flag) + } + } +} diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2 index b00ee0272626..a1127ec6a8d5 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/jenkins/Jenkinsfile.j2 @@ -83,103 +83,6 @@ docker_build = 'docker/build.sh' max_time = 180 rebuild_docker_images = false -def per_exec_ws(folder) { - return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder -} - -// initialize source codes -def init_git() { - checkout scm - - // Clear out all Docker images that aren't going to be used - sh( - script: "docker image ls --all --format {% raw %}'{{.Repository}}:{{.Tag}} {{.ID}}'{% endraw %} | { grep -vE '{% for image in images %}{% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %}{% if not loop.last %}|{% endif %}{% endfor %}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }", - label: 'Clean old Docker images', - ) - // Add more info about job node - sh ( - script: './tests/scripts/task_show_node_info.sh', - label: 'Show executor node info', - ) - - // Determine merge commit to use for all stages - sh ( - script: 'git fetch origin main', - label: 'Fetch upstream', - ) - if (upstream_revision == null) { - upstream_revision = sh( - script: 'git log -1 FETCH_HEAD --format=\'%H\'', - label: 'Determine upstream revision', - returnStdout: true, - ).trim() - } - sh ( - script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}", - label: 'Merge to origin/main' - ) - - retry(5) { - timeout(time: 2, unit: 'MINUTES') { - sh (script: 'git submodule update --init -f', label: 'Update git submodules') - } - } -} - -def should_skip_slow_tests(pr_number) { - withCredentials([string( - credentialsId: 'tvm-bot-jenkins-reader', - variable: 'GITHUB_TOKEN', - )]) { - // Exit code of 1 means run slow tests, exit code of 0 means 
skip slow tests - result = sh ( - returnStatus: true, - script: "./tests/scripts/should_run_slow_tests.py --pr '${pr_number}'", - label: 'Check if CI should run slow tests', - ) - } - return result == 0 -} - -def cancel_previous_build() { - // cancel previous build if it is not on main. - if (env.BRANCH_NAME != 'main') { - def buildNumber = env.BUILD_NUMBER as int - // Milestone API allows us to cancel previous build - // with the same milestone number - if (buildNumber > 1) milestone(buildNumber - 1) - milestone(buildNumber) - } -} - -def should_skip_ci(pr_number) { - if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { - // never skip CI on build sourced from a branch - return false - } - glob_skip_ci_code = sh ( - returnStatus: true, - script: "./tests/scripts/git_skip_ci_globs.py", - label: 'Check if CI should be skipped due to changed files', - ) - if (glob_skip_ci_code == 0) { - return true - } - withCredentials([string( - credentialsId: 'tvm-bot-jenkins-reader', - variable: 'TOKEN', - )]) { - // Exit code of 1 means run full CI (or the script had an error, so run - // full CI just in case). Exit code of 0 means skip CI. 
- git_skip_ci_code = sh ( - returnStatus: true, - script: "./tests/scripts/git_skip_ci.py --pr '${pr_number}'", - label: 'Check if CI should be skipped', - ) - } - return git_skip_ci_code == 0 -} - // skips builds from branch indexing; sourced from https://www.jvt.me/posts/2020/02/23/jenkins-multibranch-skip-branch-index/ // execute this before anything else, including requesting any time on an agent if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { @@ -188,217 +91,6 @@ if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { return } -cancel_previous_build() - -def lint() { -stage('Lint') { - parallel( - {% call m.sharded_lint_step(name='Lint', num_shards=2, node='CPU-SMALL', ws='tvm/lint') %} - {% for image in images %} - {{ image.name }} = params.{{ image.name }}_param ?: {{ image.name }} - {% endfor %} - - sh (script: """ - echo "Docker images being used in this build:" - {% for image in images %} - echo " {{ image.name }} = ${ {{- image.name -}} }" - {% endfor %} - """, label: 'Docker image names') - - is_docs_only_build = sh ( - returnStatus: true, - script: './tests/scripts/git_change_docs.sh', - label: 'Check for docs only changes', - ) - skip_ci = should_skip_ci(env.CHANGE_ID) - skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID) - rebuild_docker_images = sh ( - returnStatus: true, - script: './tests/scripts/git_change_docker.sh', - label: 'Check for any docker changes', - ) - if (skip_ci) { - // Don't rebuild when skipping CI - rebuild_docker_images = false - } - if (rebuild_docker_images) { - // Exit before linting so we can use the newly created Docker images - // to run the lint - return - } - sh ( - script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", - label: 'Run lint', - ) - {% endcall %} - ) -} -} - -// [note: method size] -// This has to be extracted into a method due to JVM limitations on the size of -// a method (so the code can't all be inlined) -lint() - -def 
build_image(image_name) { - hash = sh( - returnStdout: true, - script: 'git log -1 --format=\'%h\'' - ).trim() - def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}" - sh( - script: "${docker_build} ${image_name} --spec ${full_name}", - label: 'Build docker image' - ) - aws_account_id = sh( - returnStdout: true, - script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', - label: 'Get AWS ID' - ).trim() - - try { - // Use a credential so Jenkins knows to scrub the AWS account ID which is nice - // (but so we don't have to rely it being hardcoded in Jenkins) - withCredentials([string( - credentialsId: 'aws-account-id', - variable: '_ACCOUNT_ID_DO_NOT_USE', - )]) { - withEnv([ - "AWS_ACCOUNT_ID=${aws_account_id}", - 'AWS_DEFAULT_REGION=us-west-2']) { - sh( - script: ''' - set -x - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com - ''', - label: 'Log in to ECR' - ) - sh( - script: """ - set -x - docker tag ${full_name} \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name} - docker push \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name} - """, - label: 'Upload image to ECR' - ) - } - } - } finally { - sh( - script: 'rm -f ~/.docker/config.json', - label: 'Clean up login credentials' - ) - } - sh( - script: "docker rmi ${full_name}", - label: 'Remove docker image' - ) -} - -if (rebuild_docker_images) { - stage('Docker Image Build') { - // TODO in a follow up PR: Find ecr tag and use in subsequent builds - parallel 'ci-lint': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - build_image('ci_lint') - } - } - }, 'ci-cpu': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - build_image('ci_cpu') - } - } - }, 'ci-gpu': { - node('GPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - build_image('ci_gpu') - } - } - }, 
'ci-qemu': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - build_image('ci_qemu') - } - } - }, 'ci-i386': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - build_image('ci_i386') - } - } - }, 'ci-arm': { - node('ARM') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - build_image('ci_arm') - } - } - }, 'ci-wasm': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - build_image('ci_wasm') - } - } - }, 'ci-hexagon': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - build_image('ci_hexagon') - } - } - } - } - // // TODO: Once we are able to use the built images, enable this step - // // If the docker images changed, we need to run the image build before the lint - // // can run since it requires a base docker image. Most of the time the images - // // aren't build though so it's faster to use the same node that checks for - // // docker changes to run the lint in the usual case. - // stage('Sanity Check (re-run)') { - // timeout(time: max_time, unit: 'MINUTES') { - // node('CPU') { - // ws({{ m.per_exec_ws('tvm/sanity') }}) { - // init_git() - // sh ( - // script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", - // label: 'Run lint', - // ) - // } - // } - // } - // } -} - -// Run make. First try to do an incremental make from a previous workspace in hope to -// accelerate the compilation. If something is wrong, clean the workspace and then -// build from scratch. -def make(docker_type, path, make_flag) { - timeout(time: max_time, unit: 'MINUTES') { - try { - cmake_build(docker_type, path, make_flag) - // always run cpp test when build - } catch (hudson.AbortException ae) { - // script exited due to user abort, directly throw instead of retry - if (ae.getMessage().contains('script returned exit code 143')) { - throw ae - } - echo 'Incremental compilation failed. 
Fall back to build from scratch' - sh ( - script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}", - label: 'Clear old cmake workspace', - ) - cmake_build(docker_type, path, make_flag) - } - } -} - // Filenames for stashing between build and test steps {% set tvm_runtime = ['build/libtvm_runtime.so', 'build/config.cmake'] %} {% set tvm_lib = ['build/libtvm.so'] + tvm_runtime %} @@ -407,503 +99,29 @@ def make(docker_type, path, make_flag) { {% set microtvm_template_projects = ['build/microtvm_template_projects',] %} s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}" +// General note: Jenkins has limits on the size of a method (or top level code) +// that are pretty strict, so most usage of groovy methods in these templates +// are purely to satisfy the JVM +{% include "jenkins/Prepare.groovy.j2" %} +{% include "jenkins/DockerBuild.groovy.j2" %} +{% include "jenkins/Lint.groovy.j2" %} +{% include "jenkins/Build.groovy.j2" %} +{% include "jenkins/Test.groovy.j2" %} +{% include "jenkins/Deploy.groovy.j2" %} -def ci_setup(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_ci_setup.sh", - label: 'Set up CI environment', - ) -} - -def python_unittest(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh", - label: 'Run Python unit tests', - ) -} - -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - -def cmake_build(image, path, make_flag) { - sh ( - script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod", - label: 'Run cmake build', - ) -} -def cpp_unittest(image) { - sh ( - script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh", - label: 'Build and run C++ tests', - ) -} +cancel_previous_build() +prepare() -def add_microtvm_permissions() { - {% for folder in 
microtvm_template_projects %} - sh( - script: 'find {{ folder }} -type f | grep qemu-hack | xargs chmod +x', - label: 'Add execute permissions for microTVM files', - ) - {% endfor %} +if (rebuild_docker_images) { + build_docker_images() } +lint() -def build() { -stage('Build') { - environment { - SKIP_SLOW_TESTS = "${skip_slow_tests}" - } - parallel( - 'BUILD: GPU': { - if (!skip_ci) { - node('CPU-SMALL') { - ws({{ m.per_exec_ws('tvm/build-gpu') }}) { - init_git() - sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build" - make("${ci_gpu} --no-gpu", 'build', '-j2') - {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} - - // compiler test - sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2" - make("${ci_gpu} --no-gpu", 'build2', '-j2') - {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }} - } - } - } - }, - 'BUILD: CPU': { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws({{ m.per_exec_ws('tvm/build-cpu') }}) { - init_git() - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build", - label: 'Create CPU cmake config', - ) - make(ci_cpu, 'build', '-j2') - {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} - timeout(time: max_time, unit: 'MINUTES') { - ci_setup(ci_cpu) - // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" - // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch - sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test') - } - } - } - } else { - Utils.markStageSkippedForConditional('BUILD: CPU') - } - }, - 'BUILD: WASM': { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws({{ m.per_exec_ws('tvm/build-wasm') }}) { - init_git() - sh ( - script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build", - label: 'Create WASM cmake config', - ) - 
make(ci_wasm, 'build', '-j2') - cpp_unittest(ci_wasm) - timeout(time: max_time, unit: 'MINUTES') { - ci_setup(ci_wasm) - sh ( - script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh", - label: 'Run WASM lint and tests', - ) - } - } - } - } else { - Utils.markStageSkippedForConditional('BUILD: WASM') - } - }, - 'BUILD: i386': { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws({{ m.per_exec_ws('tvm/build-i386') }}) { - init_git() - sh ( - script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build", - label: 'Create i386 cmake config', - ) - make(ci_i386, 'build', '-j2') - {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }} - } - } - } else { - Utils.markStageSkippedForConditional('BUILD: i386') - } - }, - 'BUILD: arm': { - if (!skip_ci && is_docs_only_build != 1) { - node('ARM') { - ws({{ m.per_exec_ws('tvm/build-arm') }}) { - init_git() - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build", - label: 'Create ARM cmake config', - ) - make(ci_arm, 'build', '-j4') - {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }} - } - } - } else { - Utils.markStageSkippedForConditional('BUILD: arm') - } - }, - 'BUILD: QEMU': { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws({{ m.per_exec_ws('tvm/build-qemu') }}) { - init_git() - sh ( - script: "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh build", - label: 'Create QEMU cmake config', - ) - make(ci_qemu, 'build', '-j2') - {{ m.upload_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }} - } - } - } else { - Utils.markStageSkippedForConditional('BUILD: QEMU') - } - }, - 'BUILD: Hexagon': { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws({{ m.per_exec_ws('tvm/build-hexagon') }}) { - init_git() - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build", - label: 'Create Hexagon cmake 
config', - ) - make(ci_hexagon, 'build', '-j2') - {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib) }} - } - } - } else { - Utils.markStageSkippedForConditional('BUILD: Hexagon') - } - }, - ) -} -} - -// [note: method size] build() -def test() { -stage('Test') { - environment { - SKIP_SLOW_TESTS = "${skip_slow_tests}" - } - parallel( - {% call(shard_index, num_shards) m.sharded_test_step( - name="unittest: GPU", - num_shards=2, - node="GPU", - ws="tvm/ut-python-gpu", - platform="gpu", - ) %} - {% if shard_index == 1 %} - {{ m.download_artifacts(tag='gpu2', filenames=tvm_multilib) }} - cpp_unittest(ci_gpu) - - {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} - ci_setup(ci_gpu) - cpp_unittest(ci_gpu) - {% else %} - {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} - ci_setup(ci_gpu) - {% endif %} - {% if shard_index == 2 or num_shards < 2 %} - sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh", - label: 'Run Java unit tests', - ) - {% endif %} - sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh", - label: 'Run Python GPU unit tests', - ) - sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh", - label: 'Run Python GPU integration tests', - ) - {% endcall %} - {% call(shard_index, num_shards) m.sharded_test_step( - name="integration: CPU", - node="CPU", - num_shards=2, - ws="tvm/integration-python-cpu", - platform="cpu", - ) %} - {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - {% endcall %} - {% call m.test_step( - name="unittest: CPU", - node="CPU-SMALL", - ws="tvm/ut-python-cpu", - platform="cpu", - ) %} - {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} - ci_setup(ci_cpu) - cpp_unittest(ci_cpu) - python_unittest(ci_cpu) - fsim_test(ci_cpu) - sh ( - 
script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh", - label: 'Run VTA tests in TSIM', - ) - {% endcall %} - {% call(shard_index, num_shards) m.sharded_test_step( - name="python: i386", - node="CPU-SMALL", - num_shards=3, - ws="tvm/integration-python-i386", - platform="i386", - ) %} - {{ m.download_artifacts(tag='i386', filenames=tvm_multilib) }} - ci_setup(ci_i386) - {% if shard_index == 1 %} - cpp_unittest(ci_i386) - {% endif %} - python_unittest(ci_i386) - sh ( - script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", - label: 'Run i386 integration tests', - ) - fsim_test(ci_i386) - {% endcall %} - {% call(shard_index, num_shards) m.sharded_test_step( - name="test: Hexagon", - node="CPU-SMALL", - ws="tvm/test-hexagon", - platform="hexagon", - num_shards=4, - ) %} - {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib) }} - ci_setup(ci_hexagon) - {% if shard_index == 1 %} - cpp_unittest(ci_hexagon) - {% endif %} - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh", - label: 'Build Hexagon API', - ) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - {% endcall %} - {% call m.test_step( - name="test: QEMU", - node="CPU-SMALL", - ws="tvm/test-qemu", - platform="qemu", - ) %} - {{ m.download_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }} - add_microtvm_permissions() - ci_setup(ci_qemu) - cpp_unittest(ci_qemu) - sh ( - script: "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh", - label: 'Run microTVM tests', - ) - sh ( - script: "${docker_run} ${ci_qemu} ./tests/scripts/task_demo_microtvm.sh", - label: 'Run microTVM demos', - ) - {% endcall %} - {% call m.test_step( - name="topi: aarch64", - node="ARM", - ws="tvm/ut-python-arm", - platform="arm", -) %} - {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} - ci_setup(ci_arm) - cpp_unittest(ci_arm) 
- sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", - label: 'Run test_arm_compute_lib test', - ) - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh", - label: 'Run TOPI tests', - ) - {% endcall %} - {% call(shard_index, num_shards) m.sharded_test_step( - name="integration: aarch64", - num_shards=2, - node="ARM", ws="tvm/ut-python-arm", - platform="arm", - ) %} - {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} - ci_setup(ci_arm) - python_unittest(ci_arm) - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - {% endcall %} - {% call(shard_index, num_shards) m.sharded_test_step( - name="topi: GPU", - node="GPU", - num_shards=2, - ws="tvm/topi-python-gpu", - platform="gpu", - ) %} - {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} - ci_setup(ci_gpu) - sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", - label: 'Run TOPI tests', - ) - {% endcall %} - {% call(shard_index, num_shards) m.sharded_test_step( - name="frontend: GPU", node="GPU", - num_shards=3, - ws="tvm/frontend-python-gpu", - platform="gpu", - ) %} - {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} - ci_setup(ci_gpu) - sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", - label: 'Run Python frontend tests', - ) - {% endcall %} - {% call m.test_step( - name="frontend: CPU", - node="CPU", - ws="tvm/frontend-python-cpu", - platform="cpu", -) %} - {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib) }} - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh", - label: 'Run Python frontend tests', - ) - {% endcall %} - {% call m.test_step( - name="frontend: aarch64", - node="ARM", - ws="tvm/frontend-python-arm", - platform="arm", -) %} - {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} - ci_setup(ci_arm) - 
sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh", - label: 'Run Python frontend tests', - ) - {% endcall %} - 'docs: GPU': { - if (!skip_ci) { - node('GPU') { - ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) { - init_git() - {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} - add_microtvm_permissions() - timeout(time: 180, unit: 'MINUTES') { - ci_setup(ci_gpu) - sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh", - label: 'Build docs', - ) - } - {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }} - archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true) - } - } - } - }, - ) -} -} - -// [note: method size] test() -/* -stage('Build packages') { - parallel 'conda CPU': { - node('CPU') { - sh "${docker_run} tlcpack/conda-cpu ./conda/build_cpu.sh - } - }, - 'conda cuda': { - node('CPU') { - sh "${docker_run} tlcpack/conda-cuda90 ./conda/build_cuda.sh - sh "${docker_run} tlcpack/conda-cuda100 ./conda/build_cuda.sh - } - } -// Here we could upload the packages to anaconda for releases -// and/or the main branch -} -*/ - -def deploy_docs() { - // Note: This code must stay in the Jenkinsfile to ensure that it runs - // from a trusted context only - sh( - script: ''' - set -eux - rm -rf tvm-site - git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site - cd tvm-site - git status - git checkout -B $DOCS_DEPLOY_BRANCH - - rm -rf docs - mkdir -p docs - tar xf ../docs.tgz -C docs - COMMIT=$(cat docs/commit_hash) - git add . 
- git config user.name tvm-bot - git config user.email 95660001+tvm-bot@users.noreply.github.com - git commit -m"deploying docs (apache/tvm@$COMMIT)" - git status - ''', - label: 'Unpack docs and update tvm-site' - ) - - withCredentials([string( - credentialsId: 'docs-push-token', - variable: 'GITHUB_TOKEN', - )]) { - sh( - script: ''' - cd tvm-site - git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git - git push deploy $DOCS_DEPLOY_BRANCH - ''', - label: 'Upload docs to apache/tvm-site' - ) - } -} - -stage('Deploy') { - if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') { - node('CPU') { - ws({{ m.per_exec_ws('tvm/deploy-docs') }}) { - {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }} - deploy_docs() - } - } - } -} +deploy() diff --git a/jenkins/Lint.groovy.j2 b/jenkins/Lint.groovy.j2 new file mode 100644 index 000000000000..61c13cd407d0 --- /dev/null +++ b/jenkins/Lint.groovy.j2 @@ -0,0 +1,18 @@ +def lint() { + stage('Lint') { + parallel( + {% call m.sharded_lint_step( + name='Lint', + num_shards=2, + node='CPU-SMALL', + ws='tvm/lint', + ) + %} + sh ( + script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh", + label: 'Run lint', + ) + {% endcall %} + ) + } +} diff --git a/jenkins/Prepare.groovy.j2 b/jenkins/Prepare.groovy.j2 new file mode 100644 index 000000000000..d7bf5e706b0b --- /dev/null +++ b/jenkins/Prepare.groovy.j2 @@ -0,0 +1,133 @@ +def per_exec_ws(folder) { + return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder +} + +// initialize source codes +def init_git() { + checkout scm + + // Clear out all Docker images that aren't going to be used + sh( + script: "docker image ls --all --format {% raw %}'{{.Repository}}:{{.Tag}} {{.ID}}'{% endraw %} | { grep -vE '{% for image in images %}{% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %}{% if not loop.last %}|{% endif %}{% endfor %}' || test \$? = 1; } | { xargs docker rmi || test \$? 
= 123; }", + label: 'Clean old Docker images', + ) + // Add more info about job node + sh ( + script: './tests/scripts/task_show_node_info.sh', + label: 'Show executor node info', + ) + + // Determine merge commit to use for all stages + sh ( + script: 'git fetch origin main', + label: 'Fetch upstream', + ) + if (upstream_revision == null) { + upstream_revision = sh( + script: 'git log -1 FETCH_HEAD --format=\'%H\'', + label: 'Determine upstream revision', + returnStdout: true, + ).trim() + } + sh ( + script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}", + label: 'Merge to origin/main' + ) + + retry(5) { + timeout(time: 2, unit: 'MINUTES') { + sh (script: 'git submodule update --init -f', label: 'Update git submodules') + } + } +} + +def should_skip_slow_tests(pr_number) { + withCredentials([string( + credentialsId: 'tvm-bot-jenkins-reader', + variable: 'GITHUB_TOKEN', + )]) { + // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests + result = sh ( + returnStatus: true, + script: "./tests/scripts/should_run_slow_tests.py --pr '${pr_number}'", + label: 'Check if CI should run slow tests', + ) + } + return result == 0 +} + +def cancel_previous_build() { + // cancel previous build if it is not on main. 
+ if (env.BRANCH_NAME != 'main') { + def buildNumber = env.BUILD_NUMBER as int + // Milestone API allows us to cancel previous build + // with the same milestone number + if (buildNumber > 1) milestone(buildNumber - 1) + milestone(buildNumber) + } +} + +def should_skip_ci(pr_number) { + if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { + // never skip CI on build sourced from a branch + return false + } + glob_skip_ci_code = sh ( + returnStatus: true, + script: "./tests/scripts/git_skip_ci_globs.py", + label: 'Check if CI should be skipped due to changed files', + ) + if (glob_skip_ci_code == 0) { + return true + } + withCredentials([string( + credentialsId: 'tvm-bot-jenkins-reader', + variable: 'TOKEN', + )]) { + // Exit code of 1 means run full CI (or the script had an error, so run + // full CI just in case). Exit code of 0 means skip CI. + git_skip_ci_code = sh ( + returnStatus: true, + script: "./tests/scripts/git_skip_ci.py --pr '${pr_number}'", + label: 'Check if CI should be skipped', + ) + } + return git_skip_ci_code == 0 +} + +def prepare() { + stage('Prepare') { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") { + init_git() + {% for image in images %} + {{ image.name }} = params.{{ image.name }}_param ?: {{ image.name }} + {% endfor %} + + sh (script: """ + echo "Docker images being used in this build:" + {% for image in images %} + echo " {{ image.name }} = ${ {{- image.name -}} }" + {% endfor %} + """, label: 'Docker image names') + + is_docs_only_build = sh ( + returnStatus: true, + script: './tests/scripts/git_change_docs.sh', + label: 'Check for docs only changes', + ) + skip_ci = should_skip_ci(env.CHANGE_ID) + skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID) + rebuild_docker_images = sh ( + returnStatus: true, + script: './tests/scripts/git_change_docker.sh', + label: 'Check for any docker changes', + ) + if (skip_ci) { + // Don't rebuild when skipping CI + rebuild_docker_images = false + } 
+ } + } + } +} diff --git a/jenkins/README.md b/jenkins/README.md new file mode 100644 index 000000000000..454664b40c64 --- /dev/null +++ b/jenkins/README.md @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + +# Jenkins CI + +The template files in this directory are used to generate the [`Jenkinsfile`](../Jenkinsfile) used by Jenkins to run CI jobs for each commit to PRs and branches. + +To regenerate the `Jenkinsfile`, run + +```bash +pip install -r jenkins/requirements.txt +python jenkins/generate.py +``` + diff --git a/jenkins/Test.groovy.j2 b/jenkins/Test.groovy.j2 new file mode 100644 index 000000000000..b287c2a3156e --- /dev/null +++ b/jenkins/Test.groovy.j2 @@ -0,0 +1,236 @@ +def test() { +stage('Test') { + environment { + SKIP_SLOW_TESTS = "${skip_slow_tests}" + } + parallel( + {% call(shard_index, num_shards) m.sharded_test_step( + name="unittest: GPU", + num_shards=2, + node="GPU", + ws="tvm/ut-python-gpu", + platform="gpu", + ) %} + {% if shard_index == 1 %} + {{ m.download_artifacts(tag='gpu2', filenames=tvm_multilib) }} + cpp_unittest(ci_gpu) + + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} + ci_setup(ci_gpu) + cpp_unittest(ci_gpu) + {% else %} + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} + ci_setup(ci_gpu) + {% endif %} + {% if shard_index == 2 or num_shards < 2 %} + sh ( + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh", + label: 'Run Java unit tests', + ) + {% endif %} + sh ( + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh", + label: 'Run Python GPU unit tests', + ) + sh ( + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh", + label: 'Run Python GPU integration tests', + ) + {% endcall %} + {% call(shard_index, num_shards) m.sharded_test_step( + name="integration: CPU", + node="CPU", + num_shards=2, + ws="tvm/integration-python-cpu", + platform="cpu", + ) %} + {{ m.download_artifacts(tag='cpu', 
filenames=tvm_multilib_tsim) }} + ci_setup(ci_cpu) + sh ( + script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", + label: 'Run CPU integration tests', + ) + {% endcall %} + {% call m.test_step( + name="unittest: CPU", + node="CPU-SMALL", + ws="tvm/ut-python-cpu", + platform="cpu", + ) %} + {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }} + ci_setup(ci_cpu) + cpp_unittest(ci_cpu) + python_unittest(ci_cpu) + fsim_test(ci_cpu) + sh ( + script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh", + label: 'Run VTA tests in TSIM', + ) + {% endcall %} + {% call(shard_index, num_shards) m.sharded_test_step( + name="python: i386", + node="CPU-SMALL", + num_shards=3, + ws="tvm/integration-python-i386", + platform="i386", + ) %} + {{ m.download_artifacts(tag='i386', filenames=tvm_multilib) }} + ci_setup(ci_i386) + {% if shard_index == 1 %} + cpp_unittest(ci_i386) + {% endif %} + python_unittest(ci_i386) + sh ( + script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", + label: 'Run i386 integration tests', + ) + fsim_test(ci_i386) + {% endcall %} + {% call(shard_index, num_shards) m.sharded_test_step( + name="test: Hexagon", + node="CPU-SMALL", + ws="tvm/test-hexagon", + platform="hexagon", + num_shards=4, + ) %} + {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib) }} + ci_setup(ci_hexagon) + {% if shard_index == 1 %} + cpp_unittest(ci_hexagon) + {% endif %} + sh ( + script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh", + label: 'Build Hexagon API', + ) + sh ( + script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", + label: 'Run Hexagon tests', + ) + {% endcall %} + {% call m.test_step( + name="test: QEMU", + node="CPU-SMALL", + ws="tvm/test-qemu", + platform="qemu", + ) %} + {{ m.download_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }} + add_microtvm_permissions() + ci_setup(ci_qemu) + 
cpp_unittest(ci_qemu) + sh ( + script: "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh", + label: 'Run microTVM tests', + ) + sh ( + script: "${docker_run} ${ci_qemu} ./tests/scripts/task_demo_microtvm.sh", + label: 'Run microTVM demos', + ) + {% endcall %} + {% call m.test_step( + name="topi: aarch64", + node="ARM", + ws="tvm/ut-python-arm", + platform="arm", +) %} + {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} + ci_setup(ci_arm) + cpp_unittest(ci_arm) + sh ( + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", + label: 'Run test_arm_compute_lib test', + ) + sh ( + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh", + label: 'Run TOPI tests', + ) + {% endcall %} + {% call(shard_index, num_shards) m.sharded_test_step( + name="integration: aarch64", + num_shards=2, + node="ARM", ws="tvm/ut-python-arm", + platform="arm", + ) %} + {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} + ci_setup(ci_arm) + python_unittest(ci_arm) + sh ( + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", + label: 'Run CPU integration tests', + ) + {% endcall %} + {% call(shard_index, num_shards) m.sharded_test_step( + name="topi: GPU", + node="GPU", + num_shards=2, + ws="tvm/topi-python-gpu", + platform="gpu", + ) %} + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} + ci_setup(ci_gpu) + sh ( + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", + label: 'Run TOPI tests', + ) + {% endcall %} + {% call(shard_index, num_shards) m.sharded_test_step( + name="frontend: GPU", node="GPU", + num_shards=3, + ws="tvm/frontend-python-gpu", + platform="gpu", + ) %} + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }} + ci_setup(ci_gpu) + sh ( + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", + label: 'Run Python frontend tests', + ) + {% endcall %} + {% call m.test_step( + name="frontend: CPU", + 
node="CPU", + ws="tvm/frontend-python-cpu", + platform="cpu", +) %} + {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib) }} + ci_setup(ci_cpu) + sh ( + script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh", + label: 'Run Python frontend tests', + ) + {% endcall %} + {% call m.test_step( + name="frontend: aarch64", + node="ARM", + ws="tvm/frontend-python-arm", + platform="arm", +) %} + {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }} + ci_setup(ci_arm) + sh ( + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh", + label: 'Run Python frontend tests', + ) + {% endcall %} + 'docs: GPU': { + if (!skip_ci) { + node('GPU') { + ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) { + init_git() + {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} + add_microtvm_permissions() + timeout(time: 180, unit: 'MINUTES') { + ci_setup(ci_gpu) + sh ( + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh", + label: 'Build docs', + ) + } + {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }} + archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true) + } + } + } + }, + ) +} +} diff --git a/tests/lint/rat-excludes b/tests/lint/rat-excludes index 3dff79c565ce..1cdb78e31913 100644 --- a/tests/lint/rat-excludes +++ b/tests/lint/rat-excludes @@ -51,3 +51,11 @@ MANIFEST .bash_history rat-excludes Cargo.lock + +# Included template files +Build.groovy.j2 +Deploy.groovy.j2 +DockerBuild.groovy.j2 +Lint.groovy.j2 +Prepare.groovy.j2 +Test.groovy.j2 From bbc6ba362f4ff223a6954f79cd237de25209ffbd Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Sat, 21 May 2022 09:57:37 +0800 Subject: [PATCH 53/59] [Meta Schedule] Add Auto-Thread Binding Rule (#11177) The current meta-schedule uses a PostProc `RewriteUnboundBlock` to auto-bind blocks to threads. However, it's a post proc, which means there are no search opportunities, and always splits with `factor=1024`. 
This PR adds a new search rule called `AutoBind` to do a similar thing to bind threads with sampled factors. Also with a corresponding mutator. After applying this rule, we get some positive perf results (on RTX-3080): Element-wise: from 2.76 us to 2.48 us Conv2d Winograd: from 29.45 us to 18.96 us (ansor 22.00 us) Resnet18: from 0.591 ms to 0.531 ms (ansor 0.565 ms) --- include/tvm/meta_schedule/mutator.h | 10 +- include/tvm/meta_schedule/postproc.h | 4 +- include/tvm/meta_schedule/schedule_rule.h | 7 + python/tvm/meta_schedule/mutator/__init__.py | 1 + .../mutator/mutate_thread_binding.py | 32 +++ .../postproc/rewrite_unbound_block.py | 5 +- .../meta_schedule/schedule_rule/__init__.py | 1 + .../meta_schedule/schedule_rule/auto_bind.py | 49 +++++ .../testing/conv2d_winograd_cpu.py | 2 +- .../testing/conv2d_winograd_cuda.py | 2 +- .../meta_schedule/testing/schedule_rule.py | 8 + python/tvm/meta_schedule/tune.py | 8 +- python/tvm/topi/cuda/conv2d_nhwc_winograd.py | 2 +- python/tvm/topi/cuda/conv2d_winograd.py | 2 +- python/tvm/topi/nn/conv2d.py | 7 +- .../mutator/mutate_thread_binding.cc | 167 +++++++++++++++ .../postproc/rewrite_unbound_block.cc | 139 ++----------- src/meta_schedule/schedule_rule/auto_bind.cc | 192 ++++++++++++++++++ src/meta_schedule/schedule_rule/auto_bind.h | 52 +++++ src/meta_schedule/schedule_rule/winograd.cc | 23 ++- ...meta_schedule_custom_rule_winograd_cuda.py | 96 ++++++++- ..._schedule_mutator_mutate_thread_binding.py | 86 ++++++++ ...t_meta_schedule_schedule_rule_auto_bind.py | 75 +++++++ 23 files changed, 831 insertions(+), 139 deletions(-) create mode 100644 python/tvm/meta_schedule/mutator/mutate_thread_binding.py create mode 100644 python/tvm/meta_schedule/schedule_rule/auto_bind.py create mode 100644 src/meta_schedule/mutator/mutate_thread_binding.cc create mode 100644 src/meta_schedule/schedule_rule/auto_bind.cc create mode 100644 src/meta_schedule/schedule_rule/auto_bind.h create mode 100644 
tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h index 002fa51ee5e3..d80fa70eee8a 100644 --- a/include/tvm/meta_schedule/mutator.h +++ b/include/tvm/meta_schedule/mutator.h @@ -119,13 +119,21 @@ class Mutator : public runtime::ObjectRef { * \return The created mutator. */ TVM_DLL static Mutator MutateParallel(int64_t max_jobs_per_core); - /*! \brief Create a Mutator that mutates auto unroll step */ + /*! + * \brief Create a Mutator that mutates auto unroll step + * \return The mutator created + */ TVM_DLL static Mutator MutateUnroll(); /*! * \brief Create a Mutator that mutates the outcome of SampleComputeLocation * \return The mutator created */ TVM_DLL static Mutator MutateComputeLocation(); + /*! + * \brief Create a Mutator that mutates auto thread binding. + * \return The mutator created + */ + TVM_DLL static Mutator MutateThreadBinding(); /*! * \brief Create a mutator with customized methods on the python-side. * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`. diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h index 8b32ce460933..195d55855017 100644 --- a/include/tvm/meta_schedule/postproc.h +++ b/include/tvm/meta_schedule/postproc.h @@ -144,10 +144,10 @@ class Postproc : public runtime::ObjectRef { TVM_DLL static Postproc RewriteReductionBlock(); /*! * \brief Create a postprocessor that adds thread binding to unbound blocks - * \param max_threadblock The max number of threadblocks in the cuda device. + * \param max_threadblocks The max number of threadblocks in the cuda device. * \return The postprocessor created. */ - TVM_DLL static Postproc RewriteUnboundBlock(int max_threadblock); + TVM_DLL static Postproc RewriteUnboundBlock(int max_threadblocks); /*! 
* \brief Create a postprocessor that applies tensorization to annotated blocks * \param vectorize_init_loop Whether or not vectorize the initialization loop produced by diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index 2b2eefeb7574..b39c72e24db8 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -212,6 +212,13 @@ class ScheduleRule : public runtime::ObjectRef { int max_vectorize_extent, // Array unroll_max_steps, // bool unroll_explicit); + /*! + * \brief Auto bind loops around the block to BlockIdx and ThreadIdx + * \param max_threadblocks The maximum number of threadblock on GPU + * \param thread_extents Candidates of thread axis extent. + * \return The schedule rule created + */ + TVM_DLL static ScheduleRule AutoBind(int max_threadblocks, Array thread_extents); /*! * \brief Create a schedule rule with customized methods on the python-side. * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`. diff --git a/python/tvm/meta_schedule/mutator/__init__.py b/python/tvm/meta_schedule/mutator/__init__.py index e534ba14346e..a0f7bac35768 100644 --- a/python/tvm/meta_schedule/mutator/__init__.py +++ b/python/tvm/meta_schedule/mutator/__init__.py @@ -22,5 +22,6 @@ from .mutator import Mutator, PyMutator from .mutate_compute_location import MutateComputeLocation from .mutate_tile_size import MutateTileSize +from .mutate_thread_binding import MutateThreadBinding from .mutate_parallel import MutateParallel from .mutate_unroll import MutateUnroll diff --git a/python/tvm/meta_schedule/mutator/mutate_thread_binding.py b/python/tvm/meta_schedule/mutator/mutate_thread_binding.py new file mode 100644 index 000000000000..6a2553f94346 --- /dev/null +++ b/python/tvm/meta_schedule/mutator/mutate_thread_binding.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Mutator that mutates the thread binding extent""" +from tvm._ffi.registry import register_object + +from .. import _ffi_api +from .mutator import Mutator + + +@register_object("meta_schedule.MutateThreadBinding") +class MutateThreadBinding(Mutator): + """Mutator that mutates the binding extent""" + + def __init__(self) -> None: + """Mutator that mutates the binding extent""" + self.__init_handle_by_constructor__( + _ffi_api.MutateThreadBinding, # type: ignore # pylint: disable=no-member + ) diff --git a/python/tvm/meta_schedule/postproc/rewrite_unbound_block.py b/python/tvm/meta_schedule/postproc/rewrite_unbound_block.py index c89bc4b0369a..aef5bca690e4 100644 --- a/python/tvm/meta_schedule/postproc/rewrite_unbound_block.py +++ b/python/tvm/meta_schedule/postproc/rewrite_unbound_block.py @@ -17,6 +17,7 @@ """A postprocessor that adds thread binding to unbound blocks""" from tvm._ffi.registry import register_object + from .. 
import _ffi_api from .postproc import Postproc @@ -25,8 +26,8 @@ class RewriteUnboundBlock(Postproc): """A postprocessor that adds thread binding to unbound blocks""" - def __init__(self, max_threadblock: int = 256) -> None: + def __init__(self, max_threadblocks: int = 256) -> None: self.__init_handle_by_constructor__( _ffi_api.PostprocRewriteUnboundBlock, # type: ignore # pylint: disable=no-member - max_threadblock, + max_threadblocks, ) diff --git a/python/tvm/meta_schedule/schedule_rule/__init__.py b/python/tvm/meta_schedule/schedule_rule/__init__.py index a958fdc39db1..18fc1de78c7b 100644 --- a/python/tvm/meta_schedule/schedule_rule/__init__.py +++ b/python/tvm/meta_schedule/schedule_rule/__init__.py @@ -20,6 +20,7 @@ blocks in a schedule. See also PostOrderApply. """ from .add_rfactor import AddRFactor +from .auto_bind import AutoBind from .auto_inline import AutoInline from .cross_thread_reduction import CrossThreadReduction from .multi_level_tiling import MultiLevelTiling, MultiLevelTilingWithIntrin, ReuseType diff --git a/python/tvm/meta_schedule/schedule_rule/auto_bind.py b/python/tvm/meta_schedule/schedule_rule/auto_bind.py new file mode 100644 index 000000000000..c211093e9275 --- /dev/null +++ b/python/tvm/meta_schedule/schedule_rule/auto_bind.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Auto-bind Rule that binds blocks to threads if needed""" +from typing import List, Optional + +from tvm._ffi import register_object + +from .. import _ffi_api +from .schedule_rule import ScheduleRule + + +@register_object("meta_schedule.AutoBind") +class AutoBind(ScheduleRule): + """Auto bind loops around the block to BlockIdx and ThreadIdx + + Parameters + ---------- + max_threadblocks: int + The maximum number of threadblock on GPU. + thread_extents: Optional[List[int]] + Candidates of thread axis extent. + """ + + def __init__( + self, + max_threadblocks: int = 256, + thread_extents: Optional[List[int]] = None, + ) -> None: + if thread_extents is None: + thread_extents = [32, 64, 128, 256, 512, 1024] + self.__init_handle_by_constructor__( + _ffi_api.ScheduleRuleAutoBind, # type: ignore # pylint: disable=no-member + max_threadblocks, + thread_extents, + ) diff --git a/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py b/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py index 261768c4897b..d6242020726b 100644 --- a/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py +++ b/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py @@ -131,7 +131,7 @@ def conv2d_winograd_cpu( vh, vw, p_3, co_1, r_a_1, r_b_1 = T.axis.remap( "SSSSRR", [i0_7, i1_7, i2_5, i3_5, i4_2, i5_1] ) - T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse"}) + T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse.llvm"}) T.reads( [ inverse[vh, vw, p_3, co_1], diff --git a/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py b/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py index 530eadafc0f3..e737f9b04e62 100644 --- a/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py +++ b/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py @@ -132,7 +132,7 @@ def conv2d_winograd_cuda( # type: ignore vh, vw, p_3, co_1, r_a_1, r_b_1 = T.axis.remap( 
"SSSSRR", [i0_7, i1_7, i2_5, i3_5, i4_2, i5_1] ) - T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse"}) + T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse.cuda"}) T.reads( [ inverse[vh, vw, p_3, co_1], diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py index b149f20c52e3..e159bfaaaa5a 100644 --- a/python/tvm/meta_schedule/testing/schedule_rule.py +++ b/python/tvm/meta_schedule/testing/schedule_rule.py @@ -17,6 +17,7 @@ """Default schedule rules""" from tvm.meta_schedule.schedule_rule import ( AddRFactor, + AutoBind, AutoInline, CrossThreadReduction, MultiLevelTiling, @@ -28,6 +29,13 @@ from tvm.target import Target +def auto_bind(target: Target) -> ScheduleRule: + """Default schedule rules for auto bind""" + if target.kind.name == "cuda": + return AutoBind(max_threadblocks=256, thread_extents=[32, 64, 128, 256, 512, 1024]) + raise NotImplementedError(f"{target.kind.name} is not supported") + + def auto_inline(target: Target) -> ScheduleRule: """Default schedule rules for auto inline""" if target.kind.name == "llvm": diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index 270c0dab8db4..9af237b3b7b8 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -156,6 +156,10 @@ def _sch_rules() -> List[ScheduleRule]: unroll_max_steps=[0, 16, 64, 512, 1024], unroll_explicit=True, ), + M.AutoBind( + max_threadblocks=256, + thread_extents=[32, 64, 128, 256, 512, 1024], + ), ] @staticmethod @@ -177,7 +181,8 @@ def _mutator_probs() -> Dict[Mutator, float]: return { M.MutateTileSize(): 0.9, - M.MutateUnroll(): 0.1, + M.MutateUnroll(): 0.08, + M.MutateThreadBinding(): 0.02, } @@ -842,6 +847,7 @@ def tune_relay( """ # pylint: disable=import-outside-toplevel from tvm.relay import build as relay_build + from .relay_integration import extract_task_from_relay # pylint: disable=protected-access, enable=import-outside-toplevel diff 
--git a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py index 80745a90d9ff..8accbbe53273 100644 --- a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py +++ b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py @@ -440,7 +440,7 @@ def nhwc_winograd_cuda( bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b] ), name="inverse", - attrs={"schedule_rule": "meta_schedule.winograd_inverse"}, + attrs={"schedule_rule": "meta_schedule.winograd_inverse.cuda"}, ) # Output diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py index 4ff3f52b998f..d2b373ba87a7 100644 --- a/python/tvm/topi/cuda/conv2d_winograd.py +++ b/python/tvm/topi/cuda/conv2d_winograd.py @@ -152,7 +152,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b] ), name="inverse", - attrs={"schedule_rule": "meta_schedule.winograd_inverse"}, + attrs={"schedule_rule": "meta_schedule.winograd_inverse.cuda"}, ) # output diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index c27ea81144ac..b7ae9b3e1cd7 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -1096,6 +1096,11 @@ def _conv2d_winograd_nhwc_impl( bgemm = auto_scheduler.rewrite_compute_body(bgemm, auto_scheduler_rewritten_layout) # inverse transform + if target is not None: + target_kind = "meta_schedule.winograd_inverse." 
+ target.kind.name + else: + target_kind = "None" + r_a = te.reduce_axis((0, alpha), "r_a") r_b = te.reduce_axis((0, alpha), "r_b") inverse = te.compute( @@ -1106,7 +1111,7 @@ def _conv2d_winograd_nhwc_impl( name="inverse", attrs={ "auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"], - "schedule_rule": "meta_schedule.winograd_inverse", + "schedule_rule": target_kind, }, # the attrs are necessary hints for the auto-scheduler ) diff --git a/src/meta_schedule/mutator/mutate_thread_binding.cc b/src/meta_schedule/mutator/mutate_thread_binding.cc new file mode 100644 index 000000000000..41207162ee1d --- /dev/null +++ b/src/meta_schedule/mutator/mutate_thread_binding.cc @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" + +namespace tvm { +namespace meta_schedule { + +using tir::Instruction; +using tir::InstructionKind; +using tir::Trace; + +/*! \brief A mutator that mutates the thread binding factor decision of SampleCategorical */ +class MutateThreadBindingNode : public MutatorNode { + public: + /*! 
\brief JSON representation of the workload */ + std::string json_mod_; + + void VisitAttrs(tvm::AttrVisitor* v) {} + static constexpr const char* _type_key = "meta_schedule.MutateThreadBinding"; + TVM_DECLARE_FINAL_OBJECT_INFO(MutateThreadBindingNode, MutatorNode); + + public: + // Inherit from `MutatorNode` + void InitializeWithTuneContext(const TuneContext& context) final { + this->json_mod_ = SaveJSON(context->mod.value()); + } + // Inherit from `MutatorNode` + Optional Apply(const Trace& trace, TRandState* rand_state) final; + + private: + struct Candidate { + /*! \brief The sampling instruction to be mutated */ + Instruction inst; + /*! \brief The probability */ + std::vector probs; + /*! \brief The decision made */ + int decision; + + explicit Candidate(Instruction inst, std::vector probs, int decision) + : inst(std::move(inst)), probs(std::move(probs)), decision(std::move(decision)) {} + }; + + std::vector FindCandidates(const Trace& trace, TRandState* rand_state); +}; + +/*! + * \brief Find Candidate with the following pattern: + * \code + * v = sch.sample_categorical(...) 
+ * l1, l2 = sch.split(loop=l0, factors=[None, v]) + * sch.bind(loop=l2, thread_axis="threadIdx.x") + * \endcode + * + * \param trace The trace from which to find the instructions + * \return All the candidate instructions + */ +std::vector MutateThreadBindingNode::FindCandidates( + const Trace& trace, TRandState* rand_state) { + using tir::InstructionNode; + + static InstructionKind inst_sample_categorical = InstructionKind::Get("SampleCategorical"); + static InstructionKind inst_split = InstructionKind::Get("Split"); + static InstructionKind inst_bind = InstructionKind::Get("Bind"); + + std::vector candidates; + std::unordered_map sample_insts; + std::unordered_map sampled_split_insts; + std::vector bind_insts; + + auto is_split_by_sample = [&sample_insts](const Instruction& inst) -> bool { + if (!inst->kind.same_as(inst_split)) { + return false; + } + // Only consider cases with 2 factors and the first one is None + if (inst->inputs.size() != 3 || inst->inputs[1].defined()) return false; + ICHECK(inst->inputs[2].defined()); + + return sample_insts.find(Downcast(inst->inputs[2]).get()) != sample_insts.end(); + }; + + auto is_thread_binding_by_sample = [&sampled_split_insts](const Instruction& inst) -> bool { + if (!inst->kind.same_as(inst_bind)) { + return false; + } + ICHECK_EQ(inst->inputs.size(), 1); + ICHECK_EQ(inst->attrs.size(), 1); + if (Downcast(inst->attrs[0]) != "threadIdx.x") return false; + + return sampled_split_insts.find(Downcast(inst->inputs[0]).get()) != + sampled_split_insts.end(); + }; + + for (const Instruction& inst : trace->insts) { + if (inst->kind.same_as(inst_sample_categorical)) { + ICHECK_EQ(inst->outputs.size(), 1); + const PrimExprNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[0], PrimExprNode); + sample_insts[var_rv] = inst.get(); + } else if (is_split_by_sample(inst)) { + CHECK_EQ(inst->outputs.size(), 2); + // Only consider the inner loop, which can be bound to threadIdx.x + const tir::LoopRVNode* var_rv = TVM_TYPE_AS(var_rv, 
inst->outputs[1], tir::LoopRVNode); + sampled_split_insts[var_rv] = inst.get(); + } else if (is_thread_binding_by_sample(inst)) { + bind_insts.push_back(inst.get()); + } + } + + for (const InstructionNode* bind_inst : bind_insts) { + const auto* loop_rv = TVM_TYPE_AS(loop_rv, bind_inst->inputs[0], tir::LoopRVNode); + auto split_it = sampled_split_insts.find(loop_rv); + ICHECK(split_it != sampled_split_insts.end()); + const InstructionNode* split_inst = split_it->second; + + const auto* expr_rv = TVM_TYPE_AS(expr_rv, split_inst->inputs[2], PrimExprNode); + auto sample_it = sample_insts.find(expr_rv); + ICHECK(sample_it != sample_insts.end()); + const InstructionNode* sample_inst = sample_it->second; + + int decision = Downcast(trace->decisions[GetRef(sample_inst)])->value; + + std::vector probs = + support::AsVector(Downcast>(sample_inst->attrs[1])); + + candidates.emplace_back(GetRef(sample_inst), probs, decision); + } + return candidates; +} + +Optional MutateThreadBindingNode::Apply(const Trace& trace, TRandState* rand_state) { + std::vector candidates = FindCandidates(trace, rand_state); + if (candidates.empty()) { + return NullOpt; + } + Candidate candidate = candidates[tir::SampleInt(rand_state, 0, candidates.size())]; + // Remove the current decision + candidate.probs.erase(candidate.probs.begin() + candidate.decision); + int result = tir::MakeMultinomialSampler(rand_state, candidate.probs)(); + if (result >= candidate.decision) { + result += 1; + } + return trace->WithDecision(candidate.inst, Integer(result), /*remove_postproc=*/true); +} + +Mutator Mutator::MutateThreadBinding() { return Mutator(make_object()); } + +TVM_REGISTER_NODE_TYPE(MutateThreadBindingNode); +TVM_REGISTER_GLOBAL("meta_schedule.MutateThreadBinding") + .set_body_typed(Mutator::MutateThreadBinding); + +} // namespace meta_schedule +} // namespace tvm diff --git a/src/meta_schedule/postproc/rewrite_unbound_block.cc b/src/meta_schedule/postproc/rewrite_unbound_block.cc index 
73dc89d30e1f..183f04e7ba23 100644 --- a/src/meta_schedule/postproc/rewrite_unbound_block.cc +++ b/src/meta_schedule/postproc/rewrite_unbound_block.cc @@ -16,84 +16,12 @@ * specific language governing permissions and limitations * under the License. */ +#include "../schedule_rule/auto_bind.h" #include "../utils.h" namespace tvm { namespace tir { -/*! \brief The rewrite type for an unbound block */ -enum class BindType : int32_t { - /*! \brief No additional thread binding is needed */ - kNoBind = 0, - /*! \brief Need to bind to blockIdx */ - kBindBlock = 1, - /*! \brief Need to bind to both blockIdx and threadIdx */ - kBindBlockThread = 2, -}; - -/*! - * \brief Check the combination of bindings to be added to the block - * \param block_sref The block to be checked - * \param fuse_first_num The number of loops to be fused - * \return The type of binding to be added to the block - */ -BindType GetBindType(const StmtSRef& block_sref, int* fuse_first_num) { - Array loops = tir::GetLoops(block_sref); - int n = loops.size(); - if (n == 0) { - return BindType::kNoBind; - } - int i_block_idx = -1; - int i_thread_idx = -1; - int i_multi_child = -1; - int i_spatial_loop = -1; - for (int i = 0; i < n; ++i) { - const StmtSRef& loop_sref = loops[i]; - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); - runtime::ThreadScope thread_scope = GetThreadScope(loop); - if (IsBlockIdx(thread_scope)) { - if (i_block_idx == -1) { - i_block_idx = i; - } - } - if (IsThreadIdx(thread_scope)) { - if (i_thread_idx == -1) { - i_thread_idx = i; - } - } - if (loop->kind != tir::ForKind::kSerial) { - if (i_multi_child == -1) { - i_multi_child = i; - } - } - if (!IsSingleStmt(loop->body)) { - if (i_multi_child == -1) { - i_multi_child = i + 1; - } - } - if (tir::GetLoopIterType(loop_sref) == IterVarType::kDataPar) { - if (i_spatial_loop == i - 1) { - ++i_spatial_loop; - } - } - } - if (i_multi_child == -1) { - i_multi_child = n; - } - if ((i_block_idx != -1 && i_thread_idx != -1) || 
i_spatial_loop == -1) { - return BindType::kNoBind; - } else if (i_block_idx != -1 && i_thread_idx == -1) { - ICHECK(false) << "Unsupported case, where blockIdx is bound but threadIdx is not"; - throw; - } else if (i_block_idx == -1 && i_thread_idx != -1) { - *fuse_first_num = std::min(std::min(i_multi_child, i_thread_idx), i_spatial_loop + 1); - return BindType::kBindBlock; - } else { // i_block_idx == -1 && i_thread_idx == -1 - *fuse_first_num = std::min(i_multi_child, i_spatial_loop + 1); - return BindType::kBindBlockThread; - } -} - /*! \brief Find all the blocks that are not bound */ class UnboundBlockFinder : private StmtVisitor { public: @@ -159,11 +87,11 @@ class RewriteUnboundBlockNode : public PostprocNode { // Inherited from PostprocNode void InitializeWithTuneContext(const TuneContext& context) final { CHECK(context->target.defined()) << "ValueError: target is not defined"; - Optional max_num_threads = + Optional max_threads_per_block = context->target.value()->GetAttr("max_threads_per_block"); - CHECK(max_num_threads.defined()) + CHECK(max_threads_per_block.defined()) << "ValueError: missing attribute `max_threads_per_block` in the target"; - this->max_num_threads_ = max_num_threads.value(); + this->max_threads_per_block_ = max_threads_per_block.value(); } // Inherited from PostprocNode @@ -171,13 +99,13 @@ class RewriteUnboundBlockNode : public PostprocNode { public: /*! \brief The max number of threads per block from Target */ - int max_num_threads_ = -1; + int max_threads_per_block_ = -1; /*! 
\brief The max number of threadblocks in the cuda device */ - int max_threadblock_ = -1; + int max_threadblocks_ = -1; void VisitAttrs(tvm::AttrVisitor* v) { - // `max_num_threads_` is not visited - // `max_threadblock_` is not visited + // `max_threads_per_block_` is not visited + // `max_threadblocks_` is not visited } static constexpr const char* _type_key = "meta_schedule.RewriteUnboundBlock"; @@ -186,61 +114,28 @@ class RewriteUnboundBlockNode : public PostprocNode { bool RewriteUnboundBlockNode::Apply(const tir::Schedule& sch) { using tir::BlockRV; + using tir::ExprRV; using tir::LoopRV; using tir::Schedule; - ICHECK_NE(this->max_num_threads_, -1); + ICHECK_NE(this->max_threads_per_block_, -1); + auto get_factor = [t = this->max_threads_per_block_](int max_extent) -> ExprRV { + return Integer(std::min(t, max_extent)); + }; std::vector> unbound_blocks = tir::UnboundBlockFinder::Find(sch->state()); for (const auto& kv : unbound_blocks) { tir::StmtSRef block_sref = kv.first; String global_var_name = kv.second; - int fuse_first_num = 0; - tir::BindType bind_type = tir::GetBindType(block_sref, &fuse_first_num); - if (bind_type == tir::BindType::kNoBind) { - continue; - } BlockRV block_rv = GetRVFromSRef(sch, block_sref, global_var_name); - Array loop_rvs = sch->GetLoops(block_rv); - LoopRV fused = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + fuse_first_num}); - if (bind_type == tir::BindType::kBindBlock) { - sch->Bind(fused, "blockIdx.x"); - } else if (bind_type == tir::BindType::kBindBlockThread) { - int64_t extent_size = 0; - Array splits; - if (const int64_t* extent_ptr = tir::GetLoopIntExtent(sch->Get(fused).get())) { - extent_size = *extent_ptr; - if (extent_size > max_threadblock_ * max_num_threads_) { - splits = - sch->Split(fused, {NullOpt, Integer(max_threadblock_), Integer(max_num_threads_)}); - ICHECK_EQ(splits.size(), 3); - sch->Reorder({splits[1], splits[2], splits[0]}); - sch->Bind(splits[1], "blockIdx.x"); - sch->Bind(splits[2], "threadIdx.x"); - 
} else { - ICHECK_NE(extent_size, 0); - splits = sch->Split( - fused, - {NullOpt, Integer(std::min(static_cast(max_num_threads_), extent_size))}); - ICHECK_EQ(splits.size(), 2); - sch->Bind(splits[0], "blockIdx.x"); - sch->Bind(splits[1], "threadIdx.x"); - } - } else { - // loop is dynamic, returns nullptr - splits = sch->Split(fused, {NullOpt, Integer(max_num_threads_)}); - ICHECK_EQ(splits.size(), 2); - sch->Bind(splits[0], "blockIdx.x"); - sch->Bind(splits[1], "threadIdx.x"); - } - } + BindBlockThreadIdx(sch, block_rv, max_threadblocks_, max_threads_per_block_, get_factor); } return true; } -Postproc Postproc::RewriteUnboundBlock(int max_threadblock) { +Postproc Postproc::RewriteUnboundBlock(int max_threadblocks) { ObjectPtr n = make_object(); - n->max_threadblock_ = max_threadblock; - n->max_num_threads_ = -1; + n->max_threadblocks_ = max_threadblocks; + n->max_threads_per_block_ = -1; return Postproc(n); } diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc new file mode 100644 index 000000000000..9c16856557e0 --- /dev/null +++ b/src/meta_schedule/schedule_rule/auto_bind.cc @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include "./auto_bind.h" + +#include +#include + +#include "../utils.h" + +namespace tvm { +namespace meta_schedule { + +void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block_rv, + int64_t max_threadblocks, int64_t max_threads_per_block, + std::function get_factor) { + using namespace tvm::tir; + Array loops = tir::GetLoops(sch->GetSRef(block_rv)); + int n = loops.size(); + if (n == 0) { + return; + } + int i_block_idx = -1; + int i_thread_idx = -1; + int i_multi_child = -1; + int i_spatial_loop = -1; + for (int i = 0; i < n; ++i) { + const StmtSRef& loop_sref = loops[i]; + const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + runtime::ThreadScope thread_scope = GetThreadScope(loop); + if (IsBlockIdx(thread_scope)) { + if (i_block_idx == -1) { + i_block_idx = i; + } + } + if (IsThreadIdx(thread_scope)) { + if (i_thread_idx == -1) { + i_thread_idx = i; + } + } + if (loop->kind != ForKind::kSerial) { + if (i_multi_child == -1) { + i_multi_child = i; + } + } + if (!IsSingleStmt(loop->body)) { + if (i_multi_child == -1) { + i_multi_child = i + 1; + } + } + if (GetLoopIterType(loop_sref) == IterVarType::kDataPar) { + if (i_spatial_loop == i - 1) { + ++i_spatial_loop; + } + } + } + if (i_multi_child == -1) { + i_multi_child = n; + } + if ((i_block_idx != -1 && i_thread_idx != -1) || i_spatial_loop == -1) { + return; + } + if (i_block_idx != -1 && i_thread_idx == -1) { + ICHECK(false) << "Unsupported case, where blockIdx is bound but threadIdx is not"; + throw; + } + LoopRV loop_rv{nullptr}; + if (i_block_idx == -1 && i_thread_idx != -1) { + int num_fuse = std::min(std::min(i_multi_child, i_thread_idx), i_spatial_loop + 1); + Array loop_rvs = sch->GetLoops(block_rv); + loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse}); + sch->Bind(loop_rv, "blockIdx.x"); + return; + } else { // i_block_idx == -1 && i_thread_idx == -1 + Array loop_rvs = sch->GetLoops(block_rv); + int num_fuse = std::min(i_multi_child, i_spatial_loop + 1); 
+ loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse}); + } + int64_t extent = -1; + if (const int64_t* e = GetLoopIntExtent(sch->Get(loop_rv).get())) { + extent = *e; + } else { + extent = std::numeric_limits::max(); + } + if (extent <= max_threadblocks * max_threads_per_block) { + ExprRV factor = get_factor(std::min(extent, max_threads_per_block)); + Array splits = sch->Split(loop_rv, {NullOpt, factor}); + ICHECK_EQ(splits.size(), 2); + sch->Bind(splits[0], "blockIdx.x"); + sch->Bind(splits[1], "threadIdx.x"); + } else { + Array splits = sch->Split(loop_rv, {NullOpt, + Integer(max_threadblocks), // + Integer(max_threads_per_block)}); + ICHECK_EQ(splits.size(), 3); + sch->Reorder({splits[1], splits[2], splits[0]}); + sch->Bind(splits[1], "blockIdx.x"); + sch->Bind(splits[2], "threadIdx.x"); + } +} + +std::function MakeFactorSampler(tir::Schedule sch, + Array thread_extents) { + return [sch = std::move(sch), + thread_extents = std::move(thread_extents)](int64_t max_extent) -> tir::ExprRV { + Array extents; + extents.reserve(thread_extents.size()); + for (const Integer extent : thread_extents) { + if (extent->value <= max_extent) { + extents.push_back(extent); + } + } + int n = extents.size(); + if (n == 0) { + return Integer(max_extent); + } + if (n == 1) { + return Integer(extents[0]); + } + Array probs(n, FloatImm(DataType::Float(64), 1.0 / n)); + return sch->SampleCategorical(extents, probs); + }; +} + +class AutoBindNode : public ScheduleRuleNode { + public: + // Inherited from ScheduleRuleNode + void InitializeWithTuneContext(const TuneContext& context) final { + CHECK(context->target.defined()) << "ValueError: target is not defined"; + Optional max_threads_per_block = + context->target.value()->GetAttr("max_threads_per_block"); + CHECK(max_threads_per_block.defined()) + << "ValueError: missing attribute `max_threads_per_block` in the target"; + this->max_threads_per_block_ = max_threads_per_block.value(); + } + + // Inherited from 
ScheduleRuleNode + Array Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final; + + public: + /*! \brief The max number of threads per block from Target */ + int64_t max_threads_per_block_ = -1; + /*! \brief The max number of threadblocks in the cuda device */ + int64_t max_threadblocks_ = -1; + /*! \brief thread_extents Candidates of thread axis extent. */ + Array thread_extents_; + + void VisitAttrs(tvm::AttrVisitor* v) { + // `max_threads_per_block_` is not visited + // `max_threadblocks_` is not visited + // `thread_extents_` is not visited + } + + static constexpr const char* _type_key = "meta_schedule.AutoBind"; + TVM_DECLARE_FINAL_OBJECT_INFO(AutoBindNode, ScheduleRuleNode); +}; + +Array AutoBindNode::Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) { + ICHECK_NE(this->max_threads_per_block_, -1); + auto get_factor = MakeFactorSampler(sch, this->thread_extents_); + BindBlockThreadIdx(sch, block_rv, max_threadblocks_, max_threads_per_block_, get_factor); + return {sch}; +} + +ScheduleRule ScheduleRule::AutoBind(int max_threadblocks, Array thread_extents) { + ObjectPtr n = make_object(); + n->max_threadblocks_ = max_threadblocks; + n->max_threads_per_block_ = -1; + n->thread_extents_ = std::move(thread_extents); + return ScheduleRule(n); +} + +TVM_REGISTER_NODE_TYPE(AutoBindNode); +TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleAutoBind").set_body_typed(ScheduleRule::AutoBind); + +} // namespace meta_schedule +} // namespace tvm diff --git a/src/meta_schedule/schedule_rule/auto_bind.h b/src/meta_schedule/schedule_rule/auto_bind.h new file mode 100644 index 000000000000..b397d2015c19 --- /dev/null +++ b/src/meta_schedule/schedule_rule/auto_bind.h @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_ +#define TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_ + +#include "../utils.h" + +namespace tvm { +namespace meta_schedule { + +/*! + * \brief Bind the given block if it is not bound to blockIdx or threadIdx. + * \param sch The schedule. + * \param block The block to be bound. + * \param max_threadblocks The maximum number of threadblocks allowed. + * \param max_threads The maximum number of threads allowed. + * \param get_factor A function that returns the tiling factor. + */ +void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block, + int64_t max_threadblocks, int64_t max_threads_per_block, + std::function get_factor); + +/*! + * \brief Given candidates of thread_extents, make a sampler that use `sch->SampleCategorical` + * to return a random thread extent. + * \param sch The schedule + * \param thread_extents The candidate thread extents. + * \return A sampler that returns a random thread extent. 
+ */ +std::function MakeFactorSampler(tir::Schedule sch, + Array thread_extents); + +} // namespace meta_schedule +} // namespace tvm + +#endif // TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_ diff --git a/src/meta_schedule/schedule_rule/winograd.cc b/src/meta_schedule/schedule_rule/winograd.cc index d8aab3a3f757..ceec080b00a9 100644 --- a/src/meta_schedule/schedule_rule/winograd.cc +++ b/src/meta_schedule/schedule_rule/winograd.cc @@ -17,9 +17,12 @@ * under the License. */ #include "../utils.h" +#include "./auto_bind.h" namespace tvm { -namespace tir { +namespace meta_schedule { + +using namespace tvm::tir; TVM_REGISTER_GLOBAL("meta_schedule.compute_inline") .set_body_typed([](Schedule sch, BlockRV block) -> Array { @@ -63,7 +66,7 @@ inline LoopRV ScheduleDataPack(Schedule sch, BlockRV block) { return t1[1]; } -TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse") +TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.llvm") .set_body_typed([](Schedule sch, BlockRV block) -> Array { ScheduleDataPack(sch, block); return {sch}; @@ -81,6 +84,16 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.llvm") return {sch}; }); +TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.cuda") + .set_body_typed([](Schedule sch, BlockRV block) -> Array { + ScheduleDataPack(sch, block); + int64_t max_threadblocks = 256; + int64_t max_threads_per_block = 1024; + auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024}); + BindBlockThreadIdx(sch, block, max_threadblocks, max_threads_per_block, get_factor); + return {sch}; + }); + TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.cuda") .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array { BlockRV input_tile = GetOnlyProducer(sch, data_pack); @@ -89,8 +102,12 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.cuda") sch->ComputeAt(input_tile, /*loop_rv=*/loop, /*preserve_unit_loops=*/true); sch->SetScope(input_tile, /*buffer_index=*/0, /*storage_scope=*/"local"); sch->ComputeInline(data_pad); + int64_t 
max_threadblocks = 256; + int64_t max_threads_per_block = 1024; + auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024}); + BindBlockThreadIdx(sch, data_pack, max_threadblocks, max_threads_per_block, get_factor); return {sch}; }); -} // namespace tir +} // namespace meta_schedule } // namespace tvm diff --git a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py index afe6548d6fe3..328f98e7f0cb 100644 --- a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py +++ b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py @@ -44,6 +44,25 @@ def input_tile_data_pad(sch: Schedule): b127 = sch.get_block(name="data_pad") sch.compute_inline(block=b127) + b3 = sch.get_block(name="data_pack") + l25, l26, l27, l28, _, _, _, _ = sch.get_loops(block=b3) + l33 = sch.fuse(l25, l26, l27, l28) + v34 = sch.sample_categorical( + candidates=[32, 64, 128, 256, 512, 1024], + probs=[ + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + ], + decision=2, + ) + l35, l36 = sch.split(loop=l33, factors=[None, v34]) + sch.bind(loop=l35, thread_axis="blockIdx.x") + sch.bind(loop=l36, thread_axis="threadIdx.x") + def data_pack(sch: Schedule): b16 = sch.get_block(name="data_pack") l17, l18, l19, l20, l21, l22 = sch.get_loops(block=b16) @@ -74,6 +93,16 @@ def bgemm(sch: Schedule): ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS", ) + sch.annotate( + block_or_loop=b31, + ann_key="meta_schedule.thread_extent_low_inclusive", + ann_val=32, + ) + sch.annotate( + block_or_loop=b31, + ann_key="meta_schedule.thread_extent_high_inclusive", + ann_val=1024, + ) b32 = sch.cache_write(block=b31, write_buffer_index=0, storage_scope="local") b31, b32 = b32, b31 l33, l34, l35, l36, l37 = sch.get_loops(block=b32) @@ -185,6 +214,57 @@ def inverse(sch: Schedule): sch.unroll(loop=l6) 
sch.unroll(loop=l7) sch.reorder(l10, l14, l11, l15, l2, l3, l6, l7) + l59 = sch.fuse(l10, l14, l11, l15) + v60 = sch.sample_categorical( + candidates=[32, 64, 128, 256, 512, 1024], + probs=[ + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + ], + decision=2, + ) + l61, l62 = sch.split(loop=l59, factors=[None, v60]) + sch.bind(loop=l61, thread_axis="blockIdx.x") + sch.bind(loop=l62, thread_axis="threadIdx.x") + + def conv2d(sch: Schedule): + b7 = sch.get_block(name="conv2d_winograd") + l141, l142, l143, l144 = sch.get_loops(block=b7) + l145 = sch.fuse(l141, l142, l143, l144) + v146 = sch.sample_categorical( + candidates=[32, 64, 128, 256, 512, 1024], + probs=[ + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + ], + decision=2, + ) + l147, l148 = sch.split(loop=l145, factors=[None, v146]) + sch.bind(loop=l147, thread_axis="blockIdx.x") + sch.bind(loop=l148, thread_axis="threadIdx.x") + + def root_anno(sch: Schedule): + b8 = sch.get_block(name="root", func_name="main") + v140 = sch.sample_categorical( + candidates=[0, 16, 64, 512, 1024], + probs=[ + 0.20000000000000001, + 0.20000000000000001, + 0.20000000000000001, + 0.20000000000000001, + 0.20000000000000001, + ], + decision=2, + ) + sch.annotate(block_or_loop=b8, ann_key="meta_schedule.unroll_explicit", ann_val=v140) # pylint: enable=invalid-name @@ -194,6 +274,8 @@ def inverse(sch: Schedule): input_tile_data_pad(sch) bgemm(sch) inverse(sch) + conv2d(sch) + root_anno(sch) return sch.mod @@ -203,23 +285,27 @@ def test_conv2d_winograd_cuda(): mod = IRModule({"main": mod}) context = TuneContext( mod=mod, - target=Target("cuda"), + target=Target("nvidia/geforce-rtx-3090", host="llvm"), task_name="Custom Search Space Task", sch_rules=DefaultCUDA._sch_rules(), # pylint: disable=protected-access ) + for sch_rule in context.sch_rules: + 
sch_rule.initialize_with_tune_context(context) post_order_apply = PostOrderApply() post_order_apply.initialize_with_tune_context(context) (sch,) = post_order_apply.generate_design_space(mod) decisions = dict( zip( - [i for i in sch.trace.insts[:-2] if i.kind.name.startswith("Sample")], + [i for i in sch.trace.insts if i.kind.name.startswith("Sample")], [ # data_pack [3, 3], [64, 2], + 2, # inverse [3, 3], [2, 64], + 2, # bgemm [1, 1, 1, 1, 6], [1, 1, 1, 3, 2], @@ -228,10 +314,14 @@ def test_conv2d_winograd_cuda(): [32, 1, 4], 1, 1, + # root anno + 2, + # conv2d + 2, ], ) ) - trace = Trace(sch.trace.insts[:-2], decisions=decisions) + trace = Trace(sch.trace.insts, decisions=decisions) sch = Schedule(mod=mod) trace.apply_to_schedule(sch, remove_postproc=False) answer = sch.mod diff --git a/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py b/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py new file mode 100644 index 000000000000..a2e5dcbd1f0a --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring +from tvm.meta_schedule import TuneContext +from tvm.meta_schedule.mutator import MutateThreadBinding, Mutator +from tvm.script import tir as T +from tvm.target import Target +from tvm.tir import Schedule + +# pylint: disable=invalid-name, no-member + + +@T.prim_func +def element_wise(var_A: T.handle, var_B: T.handle) -> None: + A = T.match_buffer(var_A, [512, 512], dtype="float32") + B = T.match_buffer(var_B, [512, 512], dtype="float32") + for i, j in T.grid(512, 512): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] + 1.0 + + +# pylint: enable=invalid-name, no-member + + +def _sch() -> Schedule: + sch = Schedule(element_wise, debug_mask="all") + # pylint: disable=invalid-name + b0 = sch.get_block(name="C", func_name="main") + l1, l2 = sch.get_loops(block=b0) + l3 = sch.fuse(l1, l2) + v4 = sch.sample_categorical( + candidates=[32, 64, 128, 256, 512, 1024], + probs=[ + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + 0.16666666666666666, + ], + decision=3, + ) + l5, l6 = sch.split(loop=l3, factors=[None, v4]) + sch.bind(loop=l5, thread_axis="blockIdx.x") + sch.bind(loop=l6, thread_axis="threadIdx.x") + # pylint: enable=invalid-name + return sch + + +def _make_mutator(target: Target) -> Mutator: + mutator = MutateThreadBinding() + mutator.initialize_with_tune_context(TuneContext(mod=element_wise, target=target)) + return mutator + + +def test_mutate_thread_binding(): + mutator = _make_mutator(target=Target("cuda")) + sch = _sch() + results = set() + for _ in range(100): + trace = mutator.apply(sch.trace) + decision = trace.decisions[trace.insts[-4]] + results.add(decision) + if len(results) == 5: + break + assert len(results) == 5 + assert results == {0, 1, 2, 4, 5} + + +if __name__ == "__main__": + test_mutate_thread_binding() diff --git 
a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py new file mode 100644 index 000000000000..bd0a24e8b642 --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring + +from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply +from tvm.meta_schedule.testing.schedule_rule import auto_bind +from tvm.meta_schedule.testing.space_generation import check_trace +from tvm.meta_schedule.tune_context import TuneContext +from tvm.target import Target +from tvm.script import tir as T + + +@T.prim_func +def element_wise(var_A: T.handle, var_B: T.handle) -> None: + A = T.match_buffer(var_A, [512, 512], dtype="float32") + B = T.match_buffer(var_B, [512, 512], dtype="float32") + for i, j in T.grid(512, 512): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] + 1.0 + + +def _create_context(mod, target, rule) -> TuneContext: + ctx = TuneContext( + mod=mod, + target=target, + space_generator=PostOrderApply(), + sch_rules=[rule], + task_name="test", + ) + ctx.space_generator.initialize_with_tune_context(ctx) + for sch_rule in ctx.sch_rules: + sch_rule.initialize_with_tune_context(ctx) + return ctx + + +def test_cuda_element_wise(): + expected = [ + [ + 'b0 = sch.get_block(name="C", func_name="main")', + "l1, l2 = sch.get_loops(block=b0)", + "l3 = sch.fuse(l1, l2)", + "v4 = sch.sample_categorical(candidates=[32, 64, 128, 256, 512, 1024], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666])", + "l5, l6 = sch.split(loop=l3, factors=[None, v4])", + 'sch.bind(loop=l5, thread_axis="blockIdx.x")', + 'sch.bind(loop=l6, thread_axis="threadIdx.x")', + ] + ] + target = Target("nvidia/geforce-rtx-3080", host="llvm") + ctx = _create_context( + element_wise, + target=target, + rule=auto_bind(target=target), + ) + spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) + assert len(spaces) == 1 + check_trace(spaces, expected) + + +if __name__ == "__main__": + test_cuda_element_wise() From d0999bbd3b40b9466cc3b5c01f2b4b7fb09b478d 
Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 20 May 2022 21:33:55 -0500 Subject: [PATCH 54/59] [FFI] Renamed __VisitAttrs__ and __fvisit__ to non-reserved names (#11392) All names beginning with two underscores are reserved for the compiler, even if they occur inside a class or namespace. --- include/tvm/ir/attrs.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h index 9a2468714962..d2eda659a5d1 100644 --- a/include/tvm/ir/attrs.h +++ b/include/tvm/ir/attrs.h @@ -67,13 +67,13 @@ namespace tvm { static constexpr const char* _type_key = TypeKey; \ TVM_DECLARE_FINAL_OBJECT_INFO(ClassName, ::tvm::BaseAttrsNode) \ template \ - void __VisitAttrs__(FVisit& __fvisit__) // NOLINT(*) + void _tvm_VisitAttrs(FVisit& _tvm_fvisit) // NOLINT(*) /*! * \brief Declare an attribute field. * \param FieldName The field name. */ -#define TVM_ATTR_FIELD(FieldName) __fvisit__(#FieldName, &FieldName) +#define TVM_ATTR_FIELD(FieldName) _tvm_fvisit(#FieldName, &FieldName) /*! * \brief Create a NodeRef type that represents null. @@ -835,12 +835,12 @@ class AttrsNode : public BaseAttrsNode { public: void VisitAttrs(AttrVisitor* v) { ::tvm::detail::AttrNormalVisitor vis(v); - self()->__VisitAttrs__(vis); + self()->_tvm_VisitAttrs(vis); } void VisitNonDefaultAttrs(AttrVisitor* v) { ::tvm::detail::AttrNonDefaultVisitor vis(v); - self()->__VisitAttrs__(vis); + self()->_tvm_VisitAttrs(vis); } void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final { @@ -861,7 +861,7 @@ class AttrsNode : public BaseAttrsNode { return false; }; auto vis = ::tvm::detail::CreateInitVisitor(DerivedType::_type_key, ffind); - self()->__VisitAttrs__(vis); + self()->_tvm_VisitAttrs(vis); hit_count = vis.hit_count_; } else { // construct a map then do lookup. 
@@ -879,7 +879,7 @@ class AttrsNode : public BaseAttrsNode { return false; }; auto vis = ::tvm::detail::CreateInitVisitor(DerivedType::_type_key, ffind); - self()->__VisitAttrs__(vis); + self()->_tvm_VisitAttrs(vis); hit_count = vis.hit_count_; } // error handling, slow path @@ -887,7 +887,7 @@ class AttrsNode : public BaseAttrsNode { for (int i = 0; i < args.size(); i += 2) { ::tvm::detail::AttrExistVisitor visitor; visitor.key_ = args[i].operator std::string(); - self()->__VisitAttrs__(visitor); + self()->_tvm_VisitAttrs(visitor); if (!visitor.exist_) { std::ostringstream os; os << DerivedType::_type_key << ": does not have field \'" << visitor.key_ @@ -903,18 +903,18 @@ class AttrsNode : public BaseAttrsNode { bool SEqualReduce(const DerivedType* other, SEqualReducer equal) const { DerivedType* pself = self(); ::tvm::detail::AttrsSEqualVisitor visitor(pself, other, equal); - self()->__VisitAttrs__(visitor); + self()->_tvm_VisitAttrs(visitor); return visitor.result_; } void SHashReduce(SHashReducer hash_reducer) const { ::tvm::detail::AttrsSHashVisitor visitor(hash_reducer); - self()->__VisitAttrs__(visitor); + self()->_tvm_VisitAttrs(visitor); } Array ListFieldInfo() const final { ::tvm::detail::AttrDocVisitor visitor; - self()->__VisitAttrs__(visitor); + self()->_tvm_VisitAttrs(visitor); return visitor.fields_; } From fa5460242e31cea3df7db8efe42da57196eba25e Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sat, 21 May 2022 07:21:15 -0700 Subject: [PATCH 55/59] [MetaSchedule] Enhance CPU auto vectorization (#11404) --- .../rewrite_parallel_vectorize_unroll.cc | 2 +- ...tproc_rewrite_parallel_vectorize_unroll.py | 91 ++++++++++++++++++- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc index 69e8dfb858bc..001c97645b6e 100644 --- a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc +++ 
b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc @@ -207,7 +207,7 @@ void AdjustParallelVectorize(const Schedule& sch, const BlockRV& block_rv, continue; } else if (prev_used_iter == -1) { // the stride of last axis is not 1 means the memory access is not contiguous - if (strides[i] != 1) { + if (strides[i] != 1 && fusible != 0) { break; } fusible++; diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_parallel_vectorize_unroll.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_parallel_vectorize_unroll.py index 9988e874b81d..f9b71bfdb654 100644 --- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_parallel_vectorize_unroll.py +++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_parallel_vectorize_unroll.py @@ -16,9 +16,8 @@ # under the License. # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import tvm -from tvm.script import tir as T - from tvm.meta_schedule.postproc import RewriteParallelVectorizeUnroll +from tvm.script import tir as T from tvm.tir.schedule import Schedule # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,not-callable,misplaced-comparison-constant @@ -70,6 +69,85 @@ def Move_PUV0(a: T.handle, b: T.handle) -> None: T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] + +@tvm.script.ir_module +class Fused_NN_Dense: + @T.prim_func + def main(placeholder: T.Buffer[(64, 768), "float32"], placeholder_1: T.Buffer[(768, 768), "float32"], T_matmul_NT: T.Buffer[(64, 768), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_placeholders": [1]}) + # body + # with T.block("root") + for i0, i1, i2 in T.grid(64, 768, 768): + with T.block("T_matmul_NT"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(placeholder[i, k], placeholder_1[j, k]) + T.writes(T_matmul_NT[i, j]) + with T.init(): + T_matmul_NT[i, j] = T.float32(0) + 
T_matmul_NT[i, j] = T_matmul_NT[i, j] + placeholder[i, k] * placeholder_1[j, k] + +@T.prim_func +def before_matmul_vectorize( + placeholder: T.Buffer[(64, 768), "float32"], + placeholder_1: T.Buffer[(768, 768), "float32"], + T_matmul_NT: T.Buffer[(64, 768), "float32"], +) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_placeholders": [1]}) + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.vectorize":64}) + T_matmul_NT_global = T.alloc_buffer([64, 768], dtype="float32") + for i0_0, i1_0, i0_1, i1_1 in T.grid(1, 16, 1, 3): + for i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(48, 8, 1, 16, 8, 16): + with T.block("T_matmul_NT"): + i = T.axis.spatial(64, i0_2 * 8 + i0_3) + j = T.axis.spatial(768, i1_0 * 48 + i1_1 * 16 + i1_3) + k = T.axis.reduce(768, i2_0 * 16 + i2_1) + T.reads(placeholder[i, k], placeholder_1[j, k]) + T.writes(T_matmul_NT_global[i, j]) + with T.init(): + T_matmul_NT_global[i, j] = T.float32(0) + T_matmul_NT_global[i, j] = T_matmul_NT_global[i, j] + placeholder[i, k] * placeholder_1[j, k] + for ax0, ax1 in T.grid(64, 16): + with T.block("T_matmul_NT_global"): + v0 = T.axis.spatial(64, ax0) + v1 = T.axis.spatial(768, i1_0 * 48 + i1_1 * 16 + ax1) + T.reads(T_matmul_NT_global[v0, v1]) + T.writes(T_matmul_NT[v0, v1]) + T_matmul_NT[v0, v1] = T_matmul_NT_global[v0, v1] + +@T.prim_func +def after_matmul_vectorize( + placeholder: T.Buffer[(64, 768), "float32"], + placeholder_1: T.Buffer[(768, 768), "float32"], + T_matmul_NT: T.Buffer[(64, 768), "float32"], +) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_placeholders": [1]}) + T_matmul_NT_global = T.alloc_buffer([64, 768], dtype="float32") + for i0_0, i1_0, i0_1, i1_1 in T.grid(1, 16, 1, 3): + for i2_0, i0_2, i1_2, i2_1, i0_3 in T.grid(48, 8, 1, 16, 8): + for i1_3_fused in T.vectorized(16): + with T.block("T_matmul_NT"): + i = T.axis.spatial(64, i0_2 * 8 + i0_3) + j = T.axis.spatial(768, i1_0 * 48 + i1_1 * 16 + 
i1_3_fused) + k = T.axis.reduce(768, i2_0 * 16 + i2_1) + T.reads(placeholder[i, k], placeholder_1[j, k]) + T.writes(T_matmul_NT_global[i, j]) + with T.init(): + T_matmul_NT_global[i, j] = T.float32(0) + T_matmul_NT_global[i, j] = T_matmul_NT_global[i, j] + placeholder[i, k] * placeholder_1[j, k] + for ax0 in T.serial(64): + for ax1_fused in T.vectorized(16): + with T.block("T_matmul_NT_global"): + v0 = T.axis.spatial(64, ax0) + v1 = T.axis.spatial(768, i1_0 * 48 + i1_1 * 16 + ax1_fused) + T.reads(T_matmul_NT_global[v0, v1]) + T.writes(T_matmul_NT[v0, v1]) + T_matmul_NT[v0, v1] = T_matmul_NT_global[v0, v1] + + # fmt: on # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,not-callable @@ -78,10 +156,17 @@ def test_meta_schedule_postproc_rewrite_parallel_unroll_vectorize(): postproc = RewriteParallelVectorizeUnroll() sch = Schedule(Move_PUV) assert postproc.apply(sch) - print(sch.mod["main"].script()) mod = tvm.tir.transform.Simplify()(sch.mod) tvm.ir.assert_structural_equal(mod["main"], Move_PUV0) +def test_vectorize_inner_loop(): + sch = Schedule(before_matmul_vectorize) + rule = RewriteParallelVectorizeUnroll() + assert rule.apply(sch) + tvm.ir.assert_structural_equal(sch.mod["main"], after_matmul_vectorize) + + if __name__ == "__main__": test_meta_schedule_postproc_rewrite_parallel_unroll_vectorize() + test_vectorize_inner_loop() From 83c9ee1a26ff66b9300615a50b4b400ff83cb06d Mon Sep 17 00:00:00 2001 From: Christoph Gerum Date: Mon, 23 May 2022 12:12:46 +0200 Subject: [PATCH 56/59] Fix int8 cuda kernels on older SM versions (#11389) * Fix int8 cuda kernels on older SM versions * Update target.py * Simplify initialiasation of do_tensorize * Simplify initialization of do_tensorize dense * Simplify initialization of do_tensorize in group_conv_nchw * Fix tensorize for conv2d_int8 as well. 
* Try to make linter happy * make linter happy * Fix wrong commit to auto_scheduler --- python/tvm/target/target.py | 4 ++++ python/tvm/topi/cuda/batch_matmul.py | 7 ++----- python/tvm/topi/cuda/conv2d_int8.py | 7 +++---- python/tvm/topi/cuda/dense.py | 6 ++---- python/tvm/topi/cuda/group_conv2d_nchw.py | 4 +--- 5 files changed, 12 insertions(+), 16 deletions(-) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 101980941fb0..a37727e926c0 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -191,6 +191,10 @@ def mattr(self): def supports_integer_dot_product(self): if self.attrs.get("supports_integer_dot_product", []): return bool(self.attrs["supports_integer_dot_product"]) + if self.kind == "cuda": + sm_version = int(self.arch.split("_")[1]) + if sm_version >= 61: + return True return False @property diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index ff625d6d714c..4e476094f2d9 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -22,7 +22,7 @@ from tvm.contrib import cublas from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from .. 
import nn, generic -from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor, is_target +from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor from .tensor_intrin import dp4a @@ -367,10 +367,7 @@ def _schedule_batch_matmul_int8(cfg, s, output): # dp4a tensorize target = tvm.target.Target.current(allow_none=False) - do_tensorize = True - - if is_target(["vulkan", "rocm"]): - do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product if do_tensorize: dtypes = (input_x.dtype, input_y.dtype) diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py index a8b21a1deca0..0edd64e0e379 100644 --- a/python/tvm/topi/cuda/conv2d_int8.py +++ b/python/tvm/topi/cuda/conv2d_int8.py @@ -26,7 +26,7 @@ from ..nn.pad import pad from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.utils import get_pad_tuple -from ..utils import get_const_tuple, traverse_inline, is_target +from ..utils import get_const_tuple, traverse_inline def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype="int32"): @@ -311,9 +311,8 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output): _, rc_block = s[conv].split(rc_block, factor=4) target = tvm.target.Target.current(allow_none=False) - do_tensorize = True - if is_target(["vulkan", "rocm"]): - do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + if do_tensorize: dtypes = (pad_data.dtype, packed_kernel.dtype) s[conv].tensorize(rc_block, dp4a("shared", "shared", "local", dtypes)) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 859f6c1097c6..32b80db6d584 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -24,7 +24,7 @@ from .tensor_intrin import dp4a from .. import tag from .. 
import generic -from ..utils import traverse_inline, get_const_tuple, is_target +from ..utils import traverse_inline, get_const_tuple logger = logging.getLogger("topi") @@ -172,9 +172,7 @@ def _schedule_dense_int8(cfg, s, output): ko, ki = s[CC].split(ko, factor=4) ko, kt = cfg["tile_k"].apply(s, CC, ko) target = tvm.target.Target.current(allow_none=False) - do_tensorize = True - if is_target(["vulkan", "rocm"]): - do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product if do_tensorize: dtypes = (data.dtype, weight.dtype) diff --git a/python/tvm/topi/cuda/group_conv2d_nchw.py b/python/tvm/topi/cuda/group_conv2d_nchw.py index f786b0d8d647..b48ea3a5f8be 100644 --- a/python/tvm/topi/cuda/group_conv2d_nchw.py +++ b/python/tvm/topi/cuda/group_conv2d_nchw.py @@ -507,9 +507,7 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block) _, rc_block = s[conv].split(rc_block, factor=4) target = tvm.target.Target.current(allow_none=False) - do_tensorize = True - if "vulkan" in target.keys: - do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product + do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product if do_tensorize: dtypes = (pad_data.dtype, packed_kernel.dtype) s[conv].tensorize(rc_block, dp4a("shared", "shared", "local", dtypes)) From df632baa78a4f550759d62fbc252039bfd9a64c3 Mon Sep 17 00:00:00 2001 From: Florin Blanaru Date: Mon, 23 May 2022 11:14:00 +0100 Subject: [PATCH 57/59] [Tests] Replace the Relay interpreter with the VM in the op tests (#11386) --- python/tvm/relay/testing/__init__.py | 9 +- .../relay/dyn/test_dynamic_op_level10.py | 54 ++-- .../relay/dyn/test_dynamic_op_level2.py | 46 ++-- .../relay/dyn/test_dynamic_op_level3.py | 71 +++--- .../relay/dyn/test_dynamic_op_level5.py | 15 +- .../relay/dyn/test_dynamic_op_level6.py | 27 +- 
tests/python/relay/test_op_grad_level1.py | 38 +-- tests/python/relay/test_op_grad_level10.py | 37 ++- tests/python/relay/test_op_grad_level2.py | 151 +++++++---- tests/python/relay/test_op_grad_level3.py | 74 +++--- tests/python/relay/test_op_grad_level4.py | 47 ++-- tests/python/relay/test_op_level1.py | 39 ++- tests/python/relay/test_op_level10.py | 236 ++++++++++-------- tests/python/relay/test_op_level2.py | 32 +-- tests/python/relay/test_op_level3.py | 69 +++-- tests/python/relay/test_op_level4.py | 48 ++-- tests/python/relay/test_op_level5.py | 166 +++++------- tests/python/relay/test_op_level6.py | 55 ++-- 18 files changed, 645 insertions(+), 569 deletions(-) diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index 909712511061..2399a474de88 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -82,6 +82,7 @@ def check_grad( mean=0, mode="higher_order", target_devices=None, + executor_kind="debug", ): """Perform numerical gradient checking given a relay function. @@ -146,8 +147,12 @@ def check_grad( for target, dev in target_devices: # Eval the backward and forward functions # TODO(mbs): Evaluate a pair of functions so can share preparation between them. - bwd_func_compiled = relay.create_executor(device=dev, target=target).evaluate(bwd_func) - fwd_func_compiled = relay.create_executor(device=dev, target=target).evaluate(fwd_func) + bwd_func_compiled = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(bwd_func) + fwd_func_compiled = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(fwd_func) # Get analytic gradients. 
_, grads = bwd_func_compiled(*inputs) diff --git a/tests/python/relay/dyn/test_dynamic_op_level10.py b/tests/python/relay/dyn/test_dynamic_op_level10.py index d34b80303b29..5a31977b4506 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level10.py +++ b/tests/python/relay/dyn/test_dynamic_op_level10.py @@ -27,9 +27,11 @@ import random import tvm.testing +executor_kind = tvm.testing.parameter("debug", "vm") + @tvm.testing.uses_gpu -def test_broadcast_to(): +def test_broadcast_to(executor_kind): def verify_more_dynamic_broadcast_to(x_shape, out_shape): rank = len(out_shape) dtype = "float32" @@ -45,12 +47,13 @@ def verify_more_dynamic_broadcast_to(x_shape, out_shape): x = np.random.uniform(size=np.prod(x_shape)).astype(dtype) ref_res = np.broadcast_to(np.reshape(x, x_shape), out_shape) for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate( - func - )(x, np.array(x_shape).astype(shape_type), np.array(out_shape).astype(shape_type)) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate(func)( + x, np.array(x_shape).astype(shape_type), np.array(out_shape).astype(shape_type) + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) verify_more_dynamic_broadcast_to((4, 3), (3, 4, 3)) @@ -70,12 +73,11 @@ def verify_broadcast_to(x_shape, out_shape): x = np.random.uniform(size=x_shape).astype(dtype) ref_res = np.broadcast_to(x, out_shape) for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate( - func - )(x, np.array(out_shape).astype(shape_type)) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + mod = 
tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate(func)(x, np.array(out_shape).astype(shape_type)) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) verify_broadcast_to((1,), (1, 1, 1)) verify_broadcast_to((1, 1), (4, 1, 1)) @@ -83,7 +85,7 @@ def verify_broadcast_to(x_shape, out_shape): @tvm.testing.uses_gpu -def test_dyn_broadcast_to(): +def test_dyn_broadcast_to(executor_kind): dtype = "uint8" rank = 3 shape_type = "int64" @@ -101,16 +103,15 @@ def test_dyn_broadcast_to(): dyn_shape = (1,) * rank ref_res = np.broadcast_to(x, dyn_shape) for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate(func)( - x, np.array(dyn_shape).astype(shape_type) - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor(executor_kind, mod=mod, device=dev, target=target).evaluate( + func + )(x, np.array(dyn_shape).astype(shape_type)) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu -def test_dyn_one_hot(): +def test_dyn_one_hot(executor_kind): def _get_oshape(indices_shape, depth, axis): oshape = [] true_axis = len(indices_shape) if axis == -1 else axis @@ -135,12 +136,11 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32") out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype) for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - out_relay = relay.create_executor( - kind, mod=mod, device=dev, target=target - ).evaluate()(indices_np, np.array(depth).astype("int32")) - tvm.testing.assert_allclose(out_relay.numpy(), out_np) + mod = 
tvm.ir.IRModule.from_expr(func) + out_relay = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate()(indices_np, np.array(depth).astype("int32")) + tvm.testing.assert_allclose(out_relay.numpy(), out_np) _verify((3,), 3, 1, 0, -1, "int32") _verify((3,), 3, 1.0, 0.0, -1, "float32") diff --git a/tests/python/relay/dyn/test_dynamic_op_level2.py b/tests/python/relay/dyn/test_dynamic_op_level2.py index fd7ab7002806..a017762ce35d 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level2.py +++ b/tests/python/relay/dyn/test_dynamic_op_level2.py @@ -27,9 +27,11 @@ import tvm.topi.testing from tvm.relay.testing import run_infer_type +executor_kind = tvm.testing.parameter("debug", "vm") + @tvm.testing.uses_gpu -def test_dyn_upsampling_run(): +def test_dyn_upsampling_run(executor_kind): def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=False): if layout == "NCHW": @@ -58,12 +60,13 @@ def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=Fa func = relay.Function([x, scale_h_var, scale_w_var], z) for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - x_data, np.array(scale_h).astype("float32"), np.array(scale_w).astype("float32") - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate()( + x_data, np.array(scale_h).astype("float32"), np.array(scale_w).astype("float32") + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6) verify_upsampling((1, 16, 32, 32), 3, 2.0, "NCHW", "nearest_neighbor") verify_upsampling((1, 16, 32, 32), 5, 2.0, "NCHW", "bilinear", True) @@ -85,7 +88,7 @@ def test_dyn_upsampling_infer_type_const(): @tvm.testing.uses_gpu -def 
test_dyn_upsampling3d_run(): +def test_dyn_upsampling3d_run(executor_kind): def verify_upsampling3d( dshape, scale_d, scale_h, scale_w, layout, method, coord_trans="asymmetric" ): @@ -124,15 +127,16 @@ def verify_upsampling3d( func = relay.Function([x, scale_d_var, scale_h_var, scale_w_var], z) for target, dev in enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - x_data, - np.array(scale_d).astype("float32"), - np.array(scale_h).astype("float32"), - np.array(scale_w).astype("float32"), - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate()( + x_data, + np.array(scale_d).astype("float32"), + np.array(scale_h).astype("float32"), + np.array(scale_w).astype("float32"), + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6) verify_upsampling3d((1, 1, 1, 1, 1), 2, 3, 4, "NCDHW", "nearest_neighbor") verify_upsampling3d((1, 8, 16, 16, 16), 2.0, 3.0, 4.0, "NCDHW", "nearest_neighbor") @@ -163,7 +167,7 @@ def test_dyn_upsampling3d_infer_type_const(): @tvm.testing.uses_gpu -def test_dyn_pad(): +def test_dyn_pad(executor_kind): def verify_pad(dshape, pad_width, pad_val, dtype): x = relay.var("x", relay.TensorType(dshape, dtype)) ndim = len(dshape) @@ -178,7 +182,9 @@ def verify_pad(dshape, pad_width, pad_val, dtype): ref_res = np.pad(data, pad_width, "constant", constant_values=(((pad_val,) * 2),) * ndim) pad_width = np.array(pad_width).astype("int64") - verify_func(func, [data, pad_width, np.array(pad_val).astype(dtype)], ref_res) + verify_func( + executor_kind, func, [data, pad_width, np.array(pad_val).astype(dtype)], ref_res + ) def verify_pad_default_fill(dshape, pad_width, dtype): x = relay.var("x", relay.TensorType(dshape, dtype)) @@ -193,7 +199,7 @@ def 
verify_pad_default_fill(dshape, pad_width, dtype): ref_res = np.pad(data, pad_width) pad_width = np.array(pad_width).astype("int64") - verify_func(func, [data, pad_width], ref_res) + verify_func(executor_kind, func, [data, pad_width], ref_res) verify_pad((4, 10, 7, 7), ((1, 1), (2, 2), (3, 3), (4, 4)), 2.0, "int32") verify_pad((2, 7), ((1, 4), (2, 2)), 4.0, "float64") diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index 0456401e8ad2..0e68cd7246ac 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -23,24 +23,25 @@ from tvm import relay, te from tvm.relay.testing import check_grad, run_infer_type +executor_kind = tvm.testing.parameter("debug", "vm") -def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets()): + +def verify_func(executor_kind, func, data, ref_res, target_device=tvm.testing.enabled_targets()): assert isinstance(data, list) for target, dev in target_device: - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - *data - ) - if isinstance(op_res, tvm.runtime.container.ADT): - assert len(op_res) == len( - ref_res - ), "Outputs from TVM and Python implementation must be equal " - for op_result, ref_result in zip(op_res, ref_res): - tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5) - else: - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - relay.backend.te_compiler.get().clear() + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate()(*data) + if isinstance(op_res, tvm.runtime.container.ADT): + assert len(op_res) == len( + ref_res + ), "Outputs from TVM and Python implementation must be equal " + for op_result, ref_result in zip(op_res, ref_res): + 
tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5) + else: + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + relay.backend.te_compiler.get().clear() def check_on_vm(target, dev, args, expected_result, mod): @@ -53,7 +54,7 @@ def check_on_vm(target, dev, args, expected_result, mod): @tvm.testing.uses_gpu -def test_dyn_reshape(): +def test_dyn_reshape(executor_kind): def verify_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) y = relay.var("y", relay.TensorType((len(newshape),), "int64")) @@ -69,7 +70,7 @@ def verify_reshape(shape, newshape, oshape): test_inputs=[x_data], eps=1e-3, ) - verify_func(func, [x_data, np.array(newshape).astype("int64")], ref_res) + verify_func(executor_kind, func, [x_data, np.array(newshape).astype("int64")], ref_res) verify_reshape((2, 3, 4), (8, 3), (8, 3)) verify_reshape((4, 7), (2, 7, 2), (2, 7, 2)) @@ -83,7 +84,7 @@ def verify_reshape(shape, newshape, oshape): @tvm.testing.uses_gpu -def test_dyn_shape_reshape(): +def test_dyn_shape_reshape(executor_kind): def verify_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) y = relay.var("y", relay.TensorType(newshape, "float32")) @@ -94,13 +95,13 @@ def verify_reshape(shape, newshape, oshape): y_data = np.random.uniform(low=-1, high=1, size=newshape).astype("float32") ref_res = np.reshape(x_data, oshape) check_grad(run_infer_type(func), inputs=[x_data, y_data], eps=1e-3) - verify_func(func, [x_data, y_data], ref_res) + verify_func(executor_kind, func, [x_data, y_data], ref_res) verify_reshape((2, 3, 4), (8, 3), (8, 3)) verify_reshape((4, 7), (2, 7, 2), (2, 7, 2)) -def test_squeeze(): +def test_squeeze(executor_kind): def verify_squeeze(shape, dtype, axis): x = relay.var("x", relay.TensorType(shape, dtype)) assert axis is not None @@ -110,14 +111,14 @@ def verify_squeeze(shape, dtype, axis): func = relay.Function([x, axis], squeeze) x_data = 
np.random.random_sample(shape).astype(dtype) ref_res = np.squeeze(x_data, axis=np_axis) - verify_func(func, [x_data, np.array(np_axis).astype("int64")], ref_res) + verify_func(executor_kind, func, [x_data, np.array(np_axis).astype("int64")], ref_res) verify_squeeze((1, 3, 1), "float32", [0]) verify_squeeze((1, 2, 1, 2, 1), "float32", [0, 2]) @tvm.testing.uses_gpu -def test_dyn_expand_dims(): +def test_dyn_expand_dims(executor_kind): def verify_expand_dims( dshape, dtype, oshape, axis, num_newaxis, target_device=tvm.testing.enabled_targets() ): @@ -130,7 +131,7 @@ def verify_expand_dims( data_np = np.random.uniform(size=dshape).astype(dtype) axis_np = np.array(axis).astype("int64") ref_res = data_np.reshape(oshape) - verify_func(func, [data_np, axis_np], ref_res, target_device=target_device) + verify_func(executor_kind, func, [data_np, axis_np], ref_res, target_device=target_device) for dtype in ["float16", "float32"]: verify_expand_dims((2, 2), dtype, (2, 2, 1), 2, 1) @@ -146,7 +147,7 @@ def verify_expand_dims( @tvm.testing.uses_gpu -def test_dyn_tile(): +def test_dyn_tile(executor_kind): def verify_tile(dshape, reps): x = relay.var("x", relay.TensorType(dshape, "float32")) r = relay.var("reps", relay.TensorType((len(reps),), "float32")) @@ -156,7 +157,7 @@ def verify_tile(dshape, reps): x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32") ref_res = np.tile(x_data, reps=reps) reps_data = np.array(reps).astype("float32") - verify_func(func, [x_data, np.array(reps).astype("float32")], ref_res) + verify_func(executor_kind, func, [x_data, np.array(reps).astype("float32")], ref_res) verify_tile((2, 3, 4), (3, 2, 1)) verify_tile((2, 3, 4), (1, 2)) @@ -164,7 +165,7 @@ def verify_tile(dshape, reps): @tvm.testing.uses_gpu -def test_dyn_zeros_ones(): +def test_dyn_zeros_ones(executor_kind): def verify_zeros_ones(shape, dtype): for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]: rank = len(shape) @@ -175,14 +176,16 @@ def 
verify_zeros_ones(shape, dtype): func = relay.Function([dyn_shape], y) ref_res = ref(shape, dtype) - verify_func(func, [np.array(shape).astype("int64")], ref_res.astype("int64")) + verify_func( + executor_kind, func, [np.array(shape).astype("int64")], ref_res.astype("int64") + ) verify_zeros_ones((1, 3), "int64") verify_zeros_ones((8, 9, 1, 2), "float32") @tvm.testing.uses_gpu -def test_dyn_full(): +def test_dyn_full(executor_kind): def verify_full(fill_value, src_shape, dtype): x = relay.var("x", relay.scalar_type(dtype)) rank = len(src_shape) @@ -192,7 +195,10 @@ def verify_full(fill_value, src_shape, dtype): ref_res = np.full(src_shape, fill_value).astype(dtype) verify_func( - func, [np.array(fill_value).astype(dtype), np.array(src_shape).astype("int64")], ref_res + executor_kind, + func, + [np.array(fill_value).astype(dtype), np.array(src_shape).astype("int64")], + ref_res, ) verify_full(4, (1, 3, 4, 4), "int32") @@ -201,7 +207,7 @@ def verify_full(fill_value, src_shape, dtype): @tvm.testing.uses_gpu -def test_dyn_sparse_to_dense(): +def test_dyn_sparse_to_dense(executor_kind): def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected): sparse_indices_data = np.array(sparse_indices) sparse_values_data = np.array(sparse_values) @@ -242,7 +248,7 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ output_shape_data, ] - verify_func(func, arguments, xpected) + verify_func(executor_kind, func, arguments, xpected) verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0]) # scalar verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3]) # vector @@ -301,7 +307,7 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ @pytest.mark.parametrize("dtype", [np.int64, np.int32]) @pytest.mark.parametrize("use_dyn", [True, False]) def test_sparse_fill_empty_rows( - sparse_indices, sparse_values, dense_shape, default_value, dtype, use_dyn + sparse_indices, sparse_values, 
dense_shape, default_value, dtype, use_dyn, executor_kind ): def ref_sparse_fill_empty_rows( sparse_indices: np.ndarray, @@ -404,6 +410,7 @@ def verify_sparse_fill_empty_rows( assert empty_row_indicator_infer_type.checked_type.dtype == "bool" verify_func( + executor_kind, func, [sparse_indices_np, sparse_values_np, dense_shape_np, default_value_np], ref_res, diff --git a/tests/python/relay/dyn/test_dynamic_op_level5.py b/tests/python/relay/dyn/test_dynamic_op_level5.py index 2eeeb1d828c9..58234929c7bb 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level5.py +++ b/tests/python/relay/dyn/test_dynamic_op_level5.py @@ -26,6 +26,8 @@ import tvm.topi.testing import tvm.testing +executor_kind = tvm.testing.parameter("debug", "vm") + def test_resize2d_infer_type(): n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") @@ -37,7 +39,7 @@ def test_resize2d_infer_type(): @tvm.testing.uses_gpu -def test_resize2d(): +def test_resize2d(executor_kind): def verify_resize2d(dshape, scale, method, layout): if layout == "NHWC": size = (dshape[1] * scale, dshape[2] * scale) @@ -62,12 +64,11 @@ def verify_resize2d(dshape, scale, method, layout): ) for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - x_data, size - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate()(x_data, size) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6) for method in ["linear", "nearest_neighbor"]: for layout in ["NCHW", "NHWC"]: diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py index 530c402b2947..ebf9c36263be 100644 --- 
a/tests/python/relay/dyn/test_dynamic_op_level6.py +++ b/tests/python/relay/dyn/test_dynamic_op_level6.py @@ -22,9 +22,11 @@ from tvm import relay import tvm.testing +executor_kind = tvm.testing.parameter("debug", "vm") + @tvm.testing.uses_gpu -def test_dynamic_topk(): +def test_dynamic_topk(executor_kind): def verify_topk(k, axis, ret_type, is_ascend, dtype): shape = (20, 100) x = relay.var("x", relay.TensorType(shape, "float32")) @@ -53,18 +55,17 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype): np_indices = np_indices.astype(dtype) for target, dev in tvm.testing.enabled_targets(): - for kind in ["vm", "debug"]: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - np_data, np.array([k]).astype("float32") - ) - if ret_type == "both": - tvm.testing.assert_allclose(op_res[0].numpy(), np_values) - tvm.testing.assert_allclose(op_res[1].numpy(), np_indices) - elif ret_type == "values": - tvm.testing.assert_allclose(op_res.numpy(), np_values) - else: - tvm.testing.assert_allclose(op_res.numpy(), np_indices) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor( + executor_kind, mod=mod, device=dev, target=target + ).evaluate()(np_data, np.array([k]).astype("float32")) + if ret_type == "both": + tvm.testing.assert_allclose(op_res[0].numpy(), np_values) + tvm.testing.assert_allclose(op_res[1].numpy(), np_indices) + elif ret_type == "values": + tvm.testing.assert_allclose(op_res.numpy(), np_values) + else: + tvm.testing.assert_allclose(op_res.numpy(), np_indices) np.random.seed(0) for k in [0, 1, 5]: diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py index a31191a42c48..cb94f297cfa3 100644 --- a/tests/python/relay/test_op_grad_level1.py +++ b/tests/python/relay/test_op_grad_level1.py @@ -26,6 +26,8 @@ from tvm.relay.testing import check_grad, run_infer_type from tvm.relay.transform import gradient +executor_kind = 
tvm.testing.parameter("debug") + def sigmoid(x): one = np.ones_like(x) @@ -67,7 +69,7 @@ class TestUnaryOp: dtype = tvm.testing.parameter("float32", "float64") shape = tvm.testing.parameter((10, 4)) - def test_op(self, target, dev, relay_op, ref_func, shape, dtype): + def test_op(self, target, dev, executor_kind, relay_op, ref_func, shape, dtype): target = tvm.target.Target(target) if target.kind.name == "vulkan": @@ -125,9 +127,9 @@ def test_op(self, target, dev, relay_op, ref_func, shape, dtype): grad_in = np.random.rand(*shape).astype(dtype) ref_grad_out = ref_func(data_in, grad_in) - op_res, (op_grad, _) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)( - data_in, grad_in - ) + op_res, (op_grad, _) = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(bwd_func)(data_in, grad_in) np.testing.assert_allclose(op_grad.numpy(), ref_grad_out, rtol=0.01) @@ -143,7 +145,7 @@ class TestBinaryOp: dtype = tvm.testing.parameter("float32", "float64") shape = tvm.testing.parameter((5, 10, 5)) - def test_binary_op(self, target, dev, relay_op, ref_func, shape, dtype): + def test_binary_op(self, target, dev, executor_kind, relay_op, ref_func, shape, dtype): t = relay.TensorType(shape, dtype=dtype) x = relay.var("x", t) y = relay.var("y", t) @@ -156,31 +158,31 @@ def test_binary_op(self, target, dev, relay_op, ref_func, shape, dtype): fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) - op_res, (op_grad0, op_grad1) = relay.create_executor(device=dev, target=target).evaluate( - bwd_func - )(x_data, y_data) + op_res, (op_grad0, op_grad1) = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(bwd_func)(x_data, y_data) np.testing.assert_allclose(op_grad0.numpy(), ref_grad0, rtol=0.01) np.testing.assert_allclose(op_grad1.numpy(), ref_grad1, rtol=0.01) -def test_softmax_grad(target, dev): +def test_softmax_grad(executor_kind, target, dev): target = tvm.target.Target(target) if 
target.kind.name == "vulkan": pytest.xfail("Known failure on vulkan") data = relay.var("data", relay.TensorType((1, 16), "float64")) fwd_func = relay.Function([data], relay.nn.softmax(data)) - check_grad(fwd_func, scale=1, target_devices=[(target, dev)]) + check_grad(fwd_func, scale=1, target_devices=[(target, dev)], executor_kind=executor_kind) -def test_log_softmax_grad(target, dev): +def test_log_softmax_grad(executor_kind, target, dev): target = tvm.target.Target(target) if target.kind.name == "vulkan": pytest.xfail("Known failure on vulkan") data = relay.var("data", relay.TensorType((2, 16), "float64")) fwd_func = relay.Function([data], relay.nn.log_softmax(data)) - check_grad(fwd_func, scale=1, target_devices=[(target, dev)]) + check_grad(fwd_func, scale=1, target_devices=[(target, dev)], executor_kind=executor_kind) class TestBiasAddGrad: @@ -191,25 +193,25 @@ class TestBiasAddGrad: ((4, 8), (8,), 1), ) - def test_bias_add(self, target, dev, d_shape, b_shape, axis): + def test_bias_add(self, executor_kind, target, dev, d_shape, b_shape, axis): data = relay.var("data", relay.TensorType(d_shape, "float32")) bias = relay.var("bias", relay.TensorType(b_shape, "float32")) fwd_func = relay.Function([data, bias], relay.nn.bias_add(data, bias, axis=axis)) - check_grad(fwd_func, target_devices=[(target, dev)]) + check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind) -def test_expand_dims_grad(target, dev): +def test_expand_dims_grad(executor_kind, target, dev): data = relay.var("data", shape=(2, 3), dtype="float64") fwd_func = relay.Function([data], relay.expand_dims(data, axis=1, num_newaxis=2)) - check_grad(fwd_func, target_devices=[(target, dev)]) + check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind) -def test_concatenate_grad(target, dev): +def test_concatenate_grad(executor_kind, target, dev): x = relay.var("x", shape=(2, 2, 5)) y = relay.var("y", shape=(2, 1, 5)) z = relay.var("z", shape=(2, 4, 5)) 
fwd_func = relay.Function([x, y, z], relay.concatenate([x, y, z], axis=1)) - check_grad(fwd_func, target_devices=[(target, dev)]) + check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind) if __name__ == "__main__": diff --git a/tests/python/relay/test_op_grad_level10.py b/tests/python/relay/test_op_grad_level10.py index 4c2c9082e044..6b2531a4a1f6 100644 --- a/tests/python/relay/test_op_grad_level10.py +++ b/tests/python/relay/test_op_grad_level10.py @@ -28,9 +28,10 @@ index_dtype = tvm.testing.parameter("int32", "int64") val_dtype = tvm.testing.parameter("float32", "float64") +executor_kind = tvm.testing.parameter("debug") -def test_cross_entropy_grad(target, dev, val_dtype): +def test_cross_entropy_grad(executor_kind, target, dev, val_dtype): target = tvm.target.Target(target) if target.kind.name == "vulkan" and val_dtype == "float64": # GLSL.std.450's Log implementation only takes 16/32-bit floats. @@ -44,10 +45,11 @@ def test_cross_entropy_grad(target, dev, val_dtype): scale=0.1, mean=1, target_devices=[(target, dev)], + executor_kind=executor_kind, ) -def test_cross_entropy_with_logits_grad(target, dev, val_dtype): +def test_cross_entropy_with_logits_grad(executor_kind, target, dev, val_dtype): x = relay.var("x", shape=(2, 5), dtype=val_dtype) y = relay.var("y", shape=(2, 5), dtype=val_dtype) check_grad( @@ -56,13 +58,16 @@ def test_cross_entropy_with_logits_grad(target, dev, val_dtype): scale=0.1, mean=1, target_devices=[(target, dev)], + executor_kind=executor_kind, ) -def test_checkpoint(target, dev): +def test_checkpoint(executor_kind, target, dev): inputs = [relay.var("x{}".format(i), shape=(1,)) for i in range(4)] output = relay.multiply(relay.add(inputs[0], inputs[1]), relay.add(inputs[2], inputs[3])) - check_grad(relay.Function(inputs, relay.annotation.checkpoint(output))) + check_grad( + relay.Function(inputs, relay.annotation.checkpoint(output)), executor_kind=executor_kind + ) scope = relay.ScopeBuilder() out_tuple = 
scope.let( @@ -76,7 +81,11 @@ def test_checkpoint(target, dev): ) ) out_single = scope.get() - check_grad(relay.Function(inputs, out_single), target_devices=[(target, dev)]) + check_grad( + relay.Function(inputs, out_single), + target_devices=[(target, dev)], + executor_kind=executor_kind, + ) class TestBatchMatmulGrad: @@ -87,7 +96,9 @@ class TestBatchMatmulGrad: ((2, 5, 3), (2, 4, 5), True, True), ) - def test_batch_matmul_grad(self, target, dev, a_shape, b_shape, transpose_a, transpose_b): + def test_batch_matmul_grad( + self, executor_kind, target, dev, a_shape, b_shape, transpose_a, transpose_b + ): tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32")) tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32")) check_grad( @@ -98,18 +109,20 @@ def test_batch_matmul_grad(self, target, dev, a_shape, b_shape, transpose_a, tra ), ), target_devices=[(target, dev)], + executor_kind=executor_kind, ) -def test_reverse_reshape_grad(target, dev): +def test_reverse_reshape_grad(executor_kind, target, dev): x = relay.var("x", shape=(3, 4, 5), dtype="float64") check_grad( relay.Function([x], relay.op.reverse_reshape(x, (-1, 0))), target_devices=[(target, dev)], + executor_kind=executor_kind, ) -def test_one_hot_grad(target, dev, index_dtype, val_dtype): +def test_one_hot_grad(executor_kind, target, dev, index_dtype, val_dtype): indices_shape = (3, 4) depth = 5 axis = -1 @@ -127,7 +140,13 @@ def test_one_hot_grad(target, dev, index_dtype, val_dtype): y = relay.one_hot(indices, on_val, off_val, depth, axis, val_dtype) f = relay.Function([indices, on_val, off_val], y) - check_grad(f, inputs=inputs, test_inputs=test_inputs, target_devices=[(target, dev)]) + check_grad( + f, + inputs=inputs, + test_inputs=test_inputs, + target_devices=[(target, dev)], + executor_kind=executor_kind, + ) if __name__ == "__main__": diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index fcdcfe6accd8..820f724bfc43 100644 
--- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -25,8 +25,10 @@ from tvm.relay.transform import gradient import tvm.testing +executor_kind = tvm.testing.parameter("debug") -def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode): + +def verify_max_pool2d_grad(executor_kind, x_shape, pool_size, strides, padding, ceil_mode): x = relay.var("x", relay.TensorType(x_shape, "float32")) y = tvm.relay.nn.max_pool2d( x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode @@ -51,24 +53,41 @@ def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode): ) for target, dev in tvm.testing.enabled_targets(): - op_res, (op_grad,) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)( - data - ) + op_res, (op_grad,) = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01) @tvm.testing.uses_gpu -def test_max_pool2d_grad(): +def test_max_pool2d_grad(executor_kind): verify_max_pool2d_grad( - (1, 4, 16, 16), pool_size=(2, 2), strides=(2, 2), padding=(0, 0), ceil_mode=False + executor_kind, + (1, 4, 16, 16), + pool_size=(2, 2), + strides=(2, 2), + padding=(0, 0), + ceil_mode=False, ) verify_max_pool2d_grad( - (1, 4, 16, 16), pool_size=(1, 1), strides=(1, 1), padding=(1, 1), ceil_mode=False + executor_kind, + (1, 4, 16, 16), + pool_size=(1, 1), + strides=(1, 1), + padding=(1, 1), + ceil_mode=False, ) def verify_avg_pool2d_grad( - x_shape, pool_size, strides, padding, ceil_mode, count_include_pad, dtype="float32" + x_shape, + pool_size, + strides, + padding, + ceil_mode, + count_include_pad, + executor_kind, + dtype="float32", ): for shape_dtype in ["int32", "int64"]: @@ -101,14 +120,14 @@ def verify_avg_pool2d_grad( ) for target, dev in tvm.testing.enabled_targets(): - op_res, (op_grad,) = relay.create_executor(device=dev, target=target).evaluate( - bwd_func - 
)(data) + op_res, (op_grad,) = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01) @tvm.testing.uses_gpu -def test_avg_pool2d_grad(): +def test_avg_pool2d_grad(executor_kind): verify_avg_pool2d_grad( (1, 4, 16, 16), pool_size=(2, 2), @@ -116,6 +135,7 @@ def test_avg_pool2d_grad(): padding=(0, 0), ceil_mode=False, count_include_pad=True, + executor_kind=executor_kind, ) verify_avg_pool2d_grad( (1, 4, 16, 16), @@ -124,6 +144,7 @@ def test_avg_pool2d_grad(): padding=(1, 1), ceil_mode=False, count_include_pad=False, + executor_kind=executor_kind, ) verify_avg_pool2d_grad( (1, 4, 16, 16), @@ -132,11 +153,12 @@ def test_avg_pool2d_grad(): padding=(1, 1), ceil_mode=False, count_include_pad=False, + executor_kind=executor_kind, dtype="int32", ) -def verify_global_avg_pool2d_grad(x_shape): +def verify_global_avg_pool2d_grad(executor_kind, x_shape): x = relay.var("x", relay.TensorType(x_shape, "float32")) y = tvm.relay.nn.global_avg_pool2d(x) @@ -158,19 +180,21 @@ def verify_global_avg_pool2d_grad(x_shape): ) for target, dev in tvm.testing.enabled_targets(): - op_res, (op_grad,) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)( - data - ) + op_res, (op_grad,) = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01) @tvm.testing.uses_gpu -def test_global_avg_pool2d_grad(): - verify_global_avg_pool2d_grad((1, 4, 16, 16)) - verify_global_avg_pool2d_grad((1, 8, 8, 24)) +def test_global_avg_pool2d_grad(executor_kind): + verify_global_avg_pool2d_grad(executor_kind, (1, 4, 16, 16)) + verify_global_avg_pool2d_grad(executor_kind, (1, 8, 8, 24)) -def verify_conv2d_grad(dshape, wshape, strides, padding, dilation, groups=1, mode="higher_order"): +def verify_conv2d_grad( + dshape, wshape, strides, padding, dilation, groups=1, mode="higher_order", 
executor_kind="vm" +): dtype = "float32" data = relay.var("data", shape=dshape, dtype=dtype) weight = relay.var("weight", shape=wshape, dtype=dtype) @@ -184,59 +208,73 @@ def verify_conv2d_grad(dshape, wshape, strides, padding, dilation, groups=1, mod out_dtype=dtype, ) fwd_func = relay.Function([data, weight], conv) - check_grad(fwd_func, mode=mode) + check_grad(fwd_func, mode=mode, executor_kind=executor_kind) @tvm.testing.uses_gpu -def test_conv2d_grad(): - verify_conv2d_grad((1, 4, 16, 16), (16, 4, 3, 3), [1, 1], [1, 1], [1, 1]) - verify_conv2d_grad((1, 4, 16, 16), (16, 4, 1, 1), [1, 1], [0, 0], [1, 1]) - verify_conv2d_grad((1, 4, 16, 16), (16, 4, 1, 1), [2, 2], [0, 0], [1, 1]) - verify_conv2d_grad((1, 4, 16, 16), (16, 4, 3, 3), [1, 1], [1, 1], [1, 1], mode="first_order") +def test_conv2d_grad(executor_kind): + verify_conv2d_grad( + (1, 4, 16, 16), (16, 4, 3, 3), [1, 1], [1, 1], [1, 1], executor_kind=executor_kind + ) + verify_conv2d_grad( + (1, 4, 16, 16), (16, 4, 1, 1), [1, 1], [0, 0], [1, 1], executor_kind=executor_kind + ) + verify_conv2d_grad( + (1, 4, 16, 16), (16, 4, 1, 1), [2, 2], [0, 0], [1, 1], executor_kind=executor_kind + ) + verify_conv2d_grad( + (1, 4, 16, 16), + (16, 4, 3, 3), + [1, 1], + [1, 1], + [1, 1], + mode="first_order", + executor_kind=executor_kind, + ) -def verify_dense_grad(d_shape, w_shape): +def verify_dense_grad(d_shape, w_shape, executor_kind): data = relay.var("data", relay.TensorType(d_shape, "float32")) weight = relay.var("weight", relay.TensorType(w_shape, "float32")) fwd_func = relay.Function([data, weight], relay.nn.dense(data, weight)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_dense_grad(): - verify_dense_grad((1, 8), (16, 8)) - verify_dense_grad((1, 4), (3, 4)) - verify_dense_grad((5, 4), (3, 4)) +def test_dense_grad(executor_kind): + verify_dense_grad((1, 8), (16, 8), executor_kind) + verify_dense_grad((1, 4), (3, 4), executor_kind) + verify_dense_grad((5, 4), (3, 4), 
executor_kind) -def verify_matmul_grad(a_shape, b_shape, transpose_a, transpose_b): +def verify_matmul_grad(a_shape, b_shape, transpose_a, transpose_b, executor_kind): tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32")) tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32")) fwd_func = relay.Function( [tensor_a, tensor_b], relay.nn.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b), ) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_matmul_grad(): - verify_matmul_grad((1, 8), (8, 16), False, False) - verify_matmul_grad((4, 1), (4, 3), True, False) - verify_matmul_grad((4, 5), (3, 4), True, True) +def test_matmul_grad(executor_kind): + verify_matmul_grad((1, 8), (8, 16), False, False, executor_kind) + verify_matmul_grad((4, 1), (4, 3), True, False, executor_kind) + verify_matmul_grad((4, 5), (3, 4), True, True, executor_kind) -def verify_batch_flatten_grad(d_shape): +def verify_batch_flatten_grad(d_shape, executor_kind): data = relay.var("data", relay.TensorType(d_shape, "float32")) fwd_func = relay.Function([data], relay.nn.batch_flatten(data)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_batch_flatten_grad(): - verify_batch_flatten_grad((1, 2, 3, 4)) - verify_batch_flatten_grad((1, 8)) +def test_batch_flatten_grad(executor_kind): + verify_batch_flatten_grad((1, 2, 3, 4), executor_kind) + verify_batch_flatten_grad((1, 8), executor_kind) def verify_conv2d_backward_weight( - dy_shape, x_shape, kernel_size, stride, padding, groups=1, out_channels=None + executor_kind, dy_shape, x_shape, kernel_size, stride, padding, groups=1, out_channels=None ): dtype = "float32" dy = relay.var("dy", shape=dy_shape, dtype=dtype) @@ -265,7 +303,11 @@ def verify_conv2d_backward_weight( dy_np = np.random.randn(*dy_shape).astype(dtype) x_np = np.random.randn(*x_shape).astype(dtype) - dw_np = relay.create_executor(device=dev, 
target=target).evaluate(dw)(dy_np, x_np).numpy() + dw_np = ( + relay.create_executor(executor_kind, device=dev, target=target) + .evaluate(dw)(dy_np, x_np) + .numpy() + ) ref_dw_np = tvm.topi.testing.conv2d_backward_weight_python( dy_np, x_np, kernel_size, stride, padding, groups=groups, channels=out_channels ) @@ -273,11 +315,22 @@ def verify_conv2d_backward_weight( np.testing.assert_allclose(dw_np, ref_dw_np, rtol=1e-4, atol=1e-4) -def test_conv2d_backward_weight(): - verify_conv2d_backward_weight((2, 8, 32, 32), (2, 4, 32, 32), (3, 3), (1, 1), (1, 1)) - verify_conv2d_backward_weight((2, 16, 15, 15), (2, 3, 32, 32), (3, 3), (2, 2), (0, 0)) +def test_conv2d_backward_weight(executor_kind): + verify_conv2d_backward_weight( + executor_kind, (2, 8, 32, 32), (2, 4, 32, 32), (3, 3), (1, 1), (1, 1) + ) + verify_conv2d_backward_weight( + executor_kind, (2, 16, 15, 15), (2, 3, 32, 32), (3, 3), (2, 2), (0, 0) + ) verify_conv2d_backward_weight( - (1, 16, 32, 32), (1, 16, 32, 32), (3, 3), (1, 1), (1, 1), groups=16, out_channels=16 + executor_kind, + (1, 16, 32, 32), + (1, 16, 32, 32), + (3, 3), + (1, 1), + (1, 1), + groups=16, + out_channels=16, ) diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py index 30d849853d87..89b8199b9e22 100644 --- a/tests/python/relay/test_op_grad_level3.py +++ b/tests/python/relay/test_op_grad_level3.py @@ -24,9 +24,11 @@ from tvm.relay.transform import gradient import tvm.testing +executor_kind = tvm.testing.parameter("debug") + @tvm.testing.uses_gpu -def test_clip(): +def test_clip(executor_kind): for dtype in ("float32", "float64"): ref = lambda x: np.where( x > 10.0, np.zeros_like(x), np.where(x < 1.0, np.zeros_like(x), np.ones_like(x)) @@ -41,49 +43,49 @@ def test_clip(): bwd_func = run_infer_type(gradient(fwd_func)) for target, dev in tvm.testing.enabled_targets(): - op_res, (op_grad,) = relay.create_executor(device=dev, target=target).evaluate( - bwd_func - )(data) + op_res, (op_grad,) = 
relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(bwd_func)(data) np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01) -def verify_transpose_grad(d_shape, axes=None): +def verify_transpose_grad(d_shape, axes=None, executor_kind="vm"): data = relay.var("data", relay.TensorType(d_shape, "float32")) fwd_func = relay.Function([data], relay.transpose(data, axes=axes)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_transpose_grad(): - verify_transpose_grad((1, 2, 3, 4)) - verify_transpose_grad((1, 2, 3, 4), axes=(0, 2, 3, 1)) +def test_transpose_grad(executor_kind): + verify_transpose_grad((1, 2, 3, 4), executor_kind=executor_kind) + verify_transpose_grad((1, 2, 3, 4), axes=(0, 2, 3, 1), executor_kind=executor_kind) -def test_negative_grad(): +def test_negative_grad(executor_kind): data = relay.var("data", relay.TensorType((10, 4), "float32")) fwd_func = relay.Function([data], relay.negative(data)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_cast_grad(): +def test_cast_grad(executor_kind): data = relay.var("data", relay.TensorType((10, 4), "float32")) fwd_func = relay.Function([data], relay.cast(data, "float64")) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_cast_like_grad(): +def test_cast_like_grad(executor_kind): data = relay.var("data", shape=(10, 4), dtype="float32") like = relay.var("like", shape=(1,), dtype="float64") fwd_func = relay.Function([data, like], relay.cast_like(data, like)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_copy_grad(): +def test_copy_grad(executor_kind): data = relay.var("data", relay.TensorType((10, 4), "float64")) fwd_func = relay.Function([data], relay.copy(data)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_take_grad(): +def test_take_grad(executor_kind): data_dtype = relay.TensorType((3, 4, 5), 
"float64") data = relay.var("data", data_dtype) indices = relay.var("indices", relay.TensorType((relay.Any(),), "int32")) @@ -92,28 +94,28 @@ def test_take_grad(): # take on axis fwd_func = relay.Function([data, indices], relay.take(data, indices, axis=1)) - check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs) + check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs, executor_kind=executor_kind) # take on flattened fwd_func = relay.Function([data, indices], relay.take(data, indices, axis=None)) - check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs) + check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs, executor_kind=executor_kind) -def test_stack_grad(): +def test_stack_grad(executor_kind): args = [relay.var(c, shape=(2, 3, 4), dtype="float64") for c in "xyz"] fwd_func = relay.Function(args, relay.stack(args, axis=0)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_squeeze_grad(): +def test_squeeze_grad(executor_kind): data = relay.var("data", shape=(2, 1, 1, 3, 4, 1), dtype="float64") fwd_func = relay.Function([data], relay.squeeze(data)) fwd_func_subset = relay.Function([data], relay.squeeze(data, axis=[1, -1])) - check_grad(fwd_func) - check_grad(fwd_func_subset) + check_grad(fwd_func, executor_kind=executor_kind) + check_grad(fwd_func_subset, executor_kind=executor_kind) -def test_arange_grad(): +def test_arange_grad(executor_kind): # TODO: testing arange numerically is strange because two-sided approx can # produce different output shapes dtype = "float64" @@ -122,23 +124,25 @@ def test_arange_grad(): step = relay.var("step", relay.TensorType((), dtype)) values = [np.array(v, dtype=dtype) for v in [2.5, 9.5, 1.8]] fwd_func = relay.Function([start, stop, step], relay.arange(start, stop, step, dtype)) - check_grad(fwd_func, inputs=values) + check_grad(fwd_func, inputs=values, executor_kind=executor_kind) -def test_gather_nd_grad(): +def test_gather_nd_grad(executor_kind): data = relay.var("data", 
relay.TensorType((2, 3), "float64")) indices = relay.var("indices", relay.TensorType((2, 4), "int64")) fwd = relay.Function([data, indices], relay.gather_nd(data, indices)) data_np = np.random.rand(2, 3).astype("float64") indices_np = np.array([[0, 1, 1, 0], [0, 1, 0, 0]], dtype="int64") - check_grad(fwd, inputs=[data_np, indices_np], test_inputs=[data_np]) + check_grad( + fwd, inputs=[data_np, indices_np], test_inputs=[data_np], executor_kind=executor_kind + ) -def test_reshape_like_grad(): +def test_reshape_like_grad(executor_kind): data = relay.var("data", shape=(2, 3, 4), dtype="float32") shape_like = relay.var("shape_like", shape=(6, 2, 2), dtype="float32") fwd_func = relay.Function([data, shape_like], relay.reshape_like(data, shape_like)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) def test_zeros_ones_grad_const_ints(): @@ -172,7 +176,7 @@ def test_zeros_ones_grad_const_expr(): tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty_dyn) -def test_zeros_ones_grad_dynamic(): +def test_zeros_ones_grad_dynamic(executor_kind): rank = np.random.randint(low=1, high=5, dtype="int32") dyn_shape = np.random.randint(low=1, high=4, size=(rank,), dtype="int32") shape_data = relay.var("shape_data", shape=(rank,), dtype="int32") @@ -182,9 +186,9 @@ def test_zeros_ones_grad_dynamic(): bwd_func = run_infer_type(gradient(run_infer_type(fwd_func))) for target, dev in tvm.testing.enabled_targets(): - res, (grad,) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)( - dyn_shape - ) + res, (grad,) = relay.create_executor(executor_kind, device=dev, target=target).evaluate( + bwd_func + )(dyn_shape) tvm.testing.assert_allclose(res.numpy(), op_ref(dyn_shape, dtype="float32")) tvm.testing.assert_allclose(grad.numpy(), np.zeros((rank,), dtype="int32")) diff --git a/tests/python/relay/test_op_grad_level4.py b/tests/python/relay/test_op_grad_level4.py index 17d30cacac41..9ed2ef262777 100644 --- 
a/tests/python/relay/test_op_grad_level4.py +++ b/tests/python/relay/test_op_grad_level4.py @@ -16,43 +16,46 @@ # under the License. import pytest import numpy as np +import tvm.testing from tvm import relay from tvm.relay.testing import check_grad, _np_randn_from_type +executor_kind = tvm.testing.parameter("debug") -def verify_reduction_grad(red_fn, d_shape, axis=None, keepdims=False, exclude=False): + +def verify_reduction_grad(executor_kind, red_fn, d_shape, axis=None, keepdims=False, exclude=False): data = relay.var("data", relay.TensorType(d_shape, "float32")) fwd_func = relay.Function([data], red_fn(data, axis=axis, keepdims=keepdims, exclude=exclude)) - check_grad(fwd_func) + check_grad(fwd_func, executor_kind=executor_kind) -def test_reduction_grad(): +def test_reduction_grad(executor_kind): def _unbiased_variance(x, axis=None, keepdims=False, exclude=False): return relay.variance(x, axis=axis, keepdims=keepdims, exclude=exclude, unbiased=True) for op in (relay.sum, relay.variance, _unbiased_variance, relay.mean): - verify_reduction_grad(op, (4, 2)) - verify_reduction_grad(op, (4, 2), axis=-1, keepdims=True) - verify_reduction_grad(op, (4, 2, 1), axis=(1, 2), exclude=True) - verify_reduction_grad(op, (4, 2, 1), axis=1) + verify_reduction_grad(executor_kind, op, (4, 2)) + verify_reduction_grad(executor_kind, op, (4, 2), axis=-1, keepdims=True) + verify_reduction_grad(executor_kind, op, (4, 2, 1), axis=(1, 2), exclude=True) + verify_reduction_grad(executor_kind, op, (4, 2, 1), axis=1) -def verify_max_grad(d_shape, axis=None, keepdims=False, exclude=False): +def verify_max_grad(executor_kind, d_shape, axis=None, keepdims=False, exclude=False): data = relay.var("data", relay.TensorType(d_shape, "float32")) fwd_func = relay.Function( [data], relay.max(data, axis=axis, keepdims=keepdims, exclude=exclude) ) - check_grad(fwd_func, scale=1e-3) + check_grad(fwd_func, scale=1e-3, executor_kind=executor_kind) -def test_max_grad(): - verify_max_grad((10, 10), axis=None) 
- verify_max_grad((10, 10), axis=-1) - verify_max_grad((6, 3, 2), axis=(1, 2), keepdims=True) - verify_max_grad((5, 4, 3), axis=(0, 2), exclude=True) +def test_max_grad(executor_kind): + verify_max_grad(executor_kind, (10, 10), axis=None) + verify_max_grad(executor_kind, (10, 10), axis=-1) + verify_max_grad(executor_kind, (6, 3, 2), axis=(1, 2), keepdims=True) + verify_max_grad(executor_kind, (5, 4, 3), axis=(0, 2), exclude=True) -def test_where_grad(): +def test_where_grad(executor_kind): cond_type = relay.TensorType((2, 3, 4), "int32") lhs_type = relay.TensorType((1, 3, 4), "float32") rhs_type = relay.TensorType((2, 1, 4), "float32") @@ -66,10 +69,10 @@ def test_where_grad(): lhs = relay.var("lhs", type_annotation=lhs_type) rhs = relay.var("rhs", type_annotation=rhs_type) fwd_func = relay.Function([cond, lhs, rhs], relay.where(cond, lhs, rhs)) - check_grad(fwd_func, inputs=inputs, test_inputs=inputs[1:]) + check_grad(fwd_func, inputs=inputs, test_inputs=inputs[1:], executor_kind=executor_kind) -def test_less_equal_grad(): +def test_less_equal_grad(executor_kind): x_type = relay.TensorType((2, 3, 4), "float32") y_type = relay.TensorType((3, 1), "float32") # We need to generate inputs far apart to get correct numerical gradients @@ -83,10 +86,10 @@ def test_less_equal_grad(): x = relay.var("x", type_annotation=x_type) y = relay.var("y", type_annotation=y_type) fwd_func = relay.Function([x, y], relay.less_equal(x, y)) - check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6) + check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6, executor_kind=executor_kind) -def test_not_equal_grad(): +def test_not_equal_grad(executor_kind): x_type = relay.TensorType((2, 3, 4), "float32") y_type = relay.TensorType((3, 1), "float32") # We need to generate inputs far apart to get correct numerical gradients @@ -100,17 +103,17 @@ def test_not_equal_grad(): x = relay.var("x", type_annotation=x_type) y = relay.var("y", type_annotation=y_type) fwd_func = 
relay.Function([x, y], relay.not_equal(x, y)) - check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6) + check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6, executor_kind=executor_kind) -def test_strided_slice_grad(): +def test_strided_slice_grad(executor_kind): def check(sh, dtype, begin, end, strides, slice_mode): x = relay.var("x", shape=sh, dtype=dtype) f = relay.Function( [x], relay.strided_slice(x, begin=begin, end=end, strides=strides, slice_mode=slice_mode), ) - check_grad(f) + check_grad(f, executor_kind=executor_kind) check((2, 3, 4), "float32", (0, 1, 0), (-1, -1, 1), (1, 1, 1), "size") check((2, 3, 4), "float32", (0, 1, 0), (2, 3, 1), (1, 1, 1), "end") diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index d4238f81e01b..1b72e5ce5137 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -26,6 +26,8 @@ from tvm.contrib.nvcc import have_fp16 import tvm.testing +executor_kind = tvm.testing.parameter("graph", "vm") + def sigmoid(x): one = np.ones_like(x) @@ -286,7 +288,7 @@ def test_log_softmax(): @tvm.testing.uses_gpu -def test_concatenate(): +def test_concatenate(executor_kind): for dtype in ["float16", "float32"]: n, t, d = te.size_var("n"), te.size_var("t"), 100 x = relay.var("x", shape=(n, t, d)) @@ -336,17 +338,13 @@ def test_concatenate(): and not have_fp16(tvm.cuda(0).compute_version) ): continue - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - x_data, y_data, t_data - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=0.01) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( x_data, y_data, t_data ) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=0.01) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01) -def test_dropout(): +def 
test_dropout(executor_kind): for dtype in ["float16", "float32"]: n, t, d = te.size_var("n"), te.size_var("t"), te.size_var("d") input_ty = relay.TensorType((n, t, d), dtype) @@ -361,9 +359,8 @@ def test_dropout(): y = relay.nn.dropout(x, rate=0.5) func = relay.Function([], y) for target, dev in tvm.testing.enabled_targets(): - for backend in ["debug", "graph"]: - op_res = relay.create_executor("debug", device=dev, target=target).evaluate(func)() - tvm.testing.assert_allclose(op_res.numpy(), in_np, rtol=0.01) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)() + tvm.testing.assert_allclose(op_res.numpy(), in_np, rtol=0.01) def test_batch_norm(): @@ -490,7 +487,7 @@ def test_matmul_type_check(): @tvm.testing.uses_gpu -def test_matmul(): +def test_matmul(executor_kind): for dtype in ["float16", "float32"]: # Matmul accuracy for float16 is poor if dtype == "float16": @@ -529,14 +526,10 @@ def test_matmul(): ref_res = np.dot(x_data.transpose(), w_data) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( x_data, w_data ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( - x_data, w_data - ) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @pytest.mark.xfail @@ -552,7 +545,7 @@ def test_dense_type_check(): @tvm.testing.uses_gpu -def test_dense(): +def test_dense(executor_kind): for dtype in ["float16", "float32"]: # Dense accuracy for float16 is poor if dtype == "float16": @@ -591,14 +584,10 @@ def test_dense(): ref_res = np.dot(x_data, w_data.T) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, 
target=target).evaluate(func)( - x_data, w_data - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( x_data, w_data ) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 8ee5adbb318d..7e0b8ad89f64 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -27,9 +27,11 @@ from tvm.relay import transform from tvm.relay.testing import run_infer_type +executor_kind = tvm.testing.parameter("graph", "vm") + @tvm.testing.uses_gpu -def test_checkpoint(): +def test_checkpoint(executor_kind): dtype = "float32" xs = [relay.var("x{}".format(i), dtype) for i in range(4)] f = relay.multiply(relay.add(xs[0], xs[1]), relay.add(xs[2], xs[3])) @@ -41,12 +43,11 @@ def test_checkpoint(): inputs = [np.random.uniform() for _ in range(len(xs))] for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - f_res = relay.create_executor(kind, device=dev, target=target).evaluate(f)(*inputs) - f_checkpoint_res = relay.create_executor(kind, device=dev, target=target).evaluate( - f_checkpoint - )(*inputs) - tvm.testing.assert_allclose(f_res.numpy(), f_checkpoint_res.numpy(), 0, 0) + f_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(*inputs) + f_checkpoint_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate( + f_checkpoint + )(*inputs) + tvm.testing.assert_allclose(f_res.numpy(), f_checkpoint_res.numpy(), 0, 0) def test_checkpoint_alpha_equal(): @@ -171,7 +172,7 @@ def test_checkpoint_alpha_equal_tuple(): @tvm.testing.uses_gpu -def test_collapse_sum_like(): +def 
test_collapse_sum_like(executor_kind): shape = (3, 4, 5, 6) shape_like = (4, 5, 6) dtype = "float32" @@ -186,13 +187,14 @@ def test_collapse_sum_like(): y = np.random.uniform(size=shape_like).astype(dtype) ref_res = np.sum(x, 0) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x, y) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x, y + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu -def test_collapse_sum_to(): +def test_collapse_sum_to(executor_kind): shape = (3, 4, 5, 6) shape_to = (4, 5, 6) dtype = "float32" @@ -205,13 +207,12 @@ def test_collapse_sum_to(): x = np.random.uniform(size=shape).astype(dtype) ref_res = np.sum(x, 0) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu -def test_broadcast_to(): +def test_broadcast_to(executor_kind): shape = (4, 1, 6) shape_like = (3, 4, 5, 6) dtype = "float32" @@ -224,13 +225,12 @@ def test_broadcast_to(): x = np.random.uniform(size=shape).astype(dtype) ref_res = np.broadcast_to(x, shape_like) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) 
@tvm.testing.uses_gpu -def test_broadcast_to_const_shape_int64(): +def test_broadcast_to_const_shape_int64(executor_kind): shape_like = relay.const(np.array([1, 5]), dtype="int64") x = relay.var("x", shape=(1,), dtype="int64") z = relay.broadcast_to(x, shape=shape_like) @@ -241,13 +241,12 @@ def test_broadcast_to_const_shape_int64(): x = np.random.randint(10, size=(1,), dtype="int64") ref_res = np.broadcast_to(x, (5,)) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(f)(x) - tvm.testing.assert_allclose(op_res.numpy(), ref_res) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(x) + tvm.testing.assert_allclose(op_res.numpy(), ref_res) @tvm.testing.uses_gpu -def test_broadcast_to_like(): +def test_broadcast_to_like(executor_kind): shape = (4, 1, 6) shape_like = (3, 4, 5, 6) dtype = "float32" @@ -264,9 +263,10 @@ def test_broadcast_to_like(): ref_res = np.broadcast_to(x, shape_like) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x, y) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x, y + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) def np_slice_like(np_data, np_shape_like, axis=None): @@ -288,7 +288,7 @@ def np_slice_like(np_data, np_shape_like, axis=None): return np_result -def verify_slice_like(data, slice_like, axes, output, dtype="float32"): +def verify_slice_like(executor_kind, data, slice_like, axes, output, dtype="float32"): x = relay.var("data", relay.TensorType(data, dtype)) y = relay.var("slice_like", relay.TensorType(slice_like, dtype)) z = relay.slice_like(x, y, axes) @@ -308,31 +308,46 @@ def verify_slice_like(data, slice_like, axes, output, dtype="float32"): 
ref_res = np_slice_like(x_data, y_data, axes) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data, y_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data, y_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu -def test_slice_like(): +def test_slice_like(executor_kind): d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") - verify_slice_like(data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)) - verify_slice_like(data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3)) - verify_slice_like(data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1, 2), output=(d2, d2, d3)) - verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)) - verify_slice_like(data=(3, 4, 5), slice_like=(1, 2), axes=None, output=(1, 2, 5)) - verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(1, 2), output=(3, 2, 3)) - verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(-1, -3), output=(1, 4, 3)) verify_slice_like( - data=(1, 3, 224, 224), slice_like=(1, 3, 112, 112), axes=(2, 3), output=(1, 3, 112, 112) + executor_kind, data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3) + ) + verify_slice_like( + executor_kind, data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3) + ) + verify_slice_like( + executor_kind, data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1, 2), output=(d2, d2, d3) + ) + verify_slice_like( + executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3) + ) + verify_slice_like(executor_kind, data=(3, 4, 5), slice_like=(1, 2), axes=None, output=(1, 2, 5)) + verify_slice_like( + executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=(1, 2), 
output=(3, 2, 3) + ) + verify_slice_like( + executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=(-1, -3), output=(1, 4, 3) + ) + verify_slice_like( + executor_kind, + data=(1, 3, 224, 224), + slice_like=(1, 3, 112, 112), + axes=(2, 3), + output=(1, 3, 112, 112), ) @tvm.testing.uses_gpu -def test_reverse_reshape(): - def verify_reverse_reshape(shape, newshape, oshape): +def test_reverse_reshape(executor_kind): + def verify_reverse_reshape(executor_kind, shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) z = relay.reverse_reshape(x, newshape=newshape) zz = run_infer_type(z) @@ -343,21 +358,20 @@ def verify_reverse_reshape(shape, newshape, oshape): x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") ref_res = np.reshape(x_data, oshape) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) - verify_reverse_reshape((2, 3, 4), (4, 0, 2), (4, 3, 2)) - verify_reverse_reshape((2, 3, 4), (2, 0, 0), (2, 3, 4)) - verify_reverse_reshape((2, 3, 4), (0, -1), (3, 8)) - verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) - verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) + verify_reverse_reshape(executor_kind, (2, 3, 4), (4, 0, 2), (4, 3, 2)) + verify_reverse_reshape(executor_kind, (2, 3, 4), (2, 0, 0), (2, 3, 4)) + verify_reverse_reshape(executor_kind, (2, 3, 4), (0, -1), (3, 8)) + verify_reverse_reshape(executor_kind, (2, 3, 4), (-1, 0), (6, 4)) + verify_reverse_reshape(executor_kind, (2, 3, 4), (0, -3), (2, 12)) def verify_batch_matmul_with_inputs( - x, y, x_np, y_np, out_shape, dtype="float32", trans_x=False, trans_y=True + executor_kind, x, y, x_np, y_np, 
out_shape, dtype="float32", trans_x=False, trans_y=True ): z = relay.nn.batch_matmul(x, y, transpose_a=trans_x, transpose_b=trans_y) zz = run_infer_type(z) @@ -368,26 +382,29 @@ def verify_batch_matmul_with_inputs( z_np = tvm.topi.testing.batch_matmul(x_np, y_np, trans_x=trans_x, trans_y=trans_y) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - if len(input_vars) == 2: - z = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_np, y_np - ) - else: - z = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x_np) - tvm.testing.assert_allclose(z.numpy(), z_np, rtol=1e-5, atol=1e-5) + if len(input_vars) == 2: + z = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_np, y_np + ) + else: + z = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x_np) + tvm.testing.assert_allclose(z.numpy(), z_np, rtol=1e-5, atol=1e-5) -def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32", trans_x=False, trans_y=True): +def verify_batch_matmul( + executor_kind, x_shape, y_shape, out_shape, dtype="float32", trans_x=False, trans_y=True +): x = relay.var("x", relay.TensorType(x_shape, dtype)) y = relay.var("y", relay.TensorType(y_shape, dtype)) x_np = np.random.uniform(size=x_shape).astype(dtype) y_np = np.random.uniform(size=y_shape).astype(dtype) - verify_batch_matmul_with_inputs(x, y, x_np, y_np, out_shape, dtype, trans_x, trans_y) + verify_batch_matmul_with_inputs( + executor_kind, x, y, x_np, y_np, out_shape, dtype, trans_x, trans_y + ) @tvm.testing.uses_gpu -def test_batch_matmul(): +def test_batch_matmul(executor_kind): b, m, n, k = te.size_var("b"), te.size_var("m"), te.size_var("n"), te.size_var("k") x = relay.var("x", relay.TensorType((b, m, k), "float32")) y = relay.var("y", relay.TensorType((b, n, k), "float32")) @@ -395,17 +412,31 @@ def test_batch_matmul(): zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((b, m, 
n), "float32") - verify_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16), trans_x=False, trans_y=True) - verify_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16), trans_x=False, trans_y=True) - verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20), trans_x=False, trans_y=True) - verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20), trans_x=False, trans_y=True) - verify_batch_matmul((1, 32, 16), (1, 16, 32), (1, 16, 16), trans_x=True, trans_y=True) - verify_batch_matmul((5, 16, 32), (5, 32, 16), (5, 16, 16), trans_x=False, trans_y=False) - verify_batch_matmul((5, 32, 16), (5, 32, 20), (5, 16, 20), trans_x=True, trans_y=False) + verify_batch_matmul( + executor_kind, (1, 16, 32), (1, 16, 32), (1, 16, 16), trans_x=False, trans_y=True + ) + verify_batch_matmul( + executor_kind, (5, 16, 32), (5, 16, 32), (5, 16, 16), trans_x=False, trans_y=True + ) + verify_batch_matmul( + executor_kind, (5, 16, 32), (5, 20, 32), (5, 16, 20), trans_x=False, trans_y=True + ) + verify_batch_matmul( + executor_kind, (30, 16, 32), (30, 20, 32), (30, 16, 20), trans_x=False, trans_y=True + ) + verify_batch_matmul( + executor_kind, (1, 32, 16), (1, 16, 32), (1, 16, 16), trans_x=True, trans_y=True + ) + verify_batch_matmul( + executor_kind, (5, 16, 32), (5, 32, 16), (5, 16, 16), trans_x=False, trans_y=False + ) + verify_batch_matmul( + executor_kind, (5, 32, 16), (5, 32, 20), (5, 16, 20), trans_x=True, trans_y=False + ) x_np = np.random.randn(10, 27, 64).astype("float32") x = relay.var("x", shape=x_np.shape) - verify_batch_matmul_with_inputs(x, x, x_np, x_np, (10, 27, 27)) + verify_batch_matmul_with_inputs(executor_kind, x, x, x_np, x_np, (10, 27, 27)) @pytest.mark.skip("Requires cascadelake") @@ -492,13 +523,13 @@ def test_shape_of(): for target, dev in tvm.testing.enabled_targets(): # Because using graph executor, this op will be optimized after # constant folding pass, here we only test with interpreter - for kind in ["debug"]: + for kind in ["vm"]: op_res = 
relay.create_executor(kind, device=dev, target=target).evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.numpy(), np.array(shape).astype("int32")) @tvm.testing.uses_gpu -def test_ndarray_size(): +def test_ndarray_size(executor_kind): def verify_ndarray_size(shape): x = relay.var("x", shape=shape) func = relay.Function([x], relay.op.ndarray_size(x)) @@ -507,11 +538,10 @@ def verify_ndarray_size(shape): x_data = np.random.uniform(size=shape).astype("float32") ref_res = np.size(x_data) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res) verify_ndarray_size((2, 3, 5)) verify_ndarray_size((2, 3, 5, 7)) @@ -573,7 +603,7 @@ def test_adaptive_pool(): @tvm.testing.uses_gpu -def test_sequence_mask(): +def test_sequence_mask(executor_kind): def _verify(data_shape, mask_value, axis, dtype, itype): max_length = data_shape[axis] nbatch = data_shape[1 - axis] @@ -588,11 +618,10 @@ def _verify(data_shape, mask_value, axis, dtype, itype): gt_out_np = tvm.topi.testing.sequence_mask(data_np, valid_length_np, mask_value, axis) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - out_relay = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - data_np, valid_length_np - ) - tvm.testing.assert_allclose(out_relay.numpy(), gt_out_np) + out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate( + func + )(data_np, valid_length_np) + tvm.testing.assert_allclose(out_relay.numpy(), gt_out_np) _verify((5, 10), 0.0, 1, "float32", "int32") _verify((2, 3, 5, 3), 0.0, 0, "float32", "int64") @@ -600,7 +629,7 @@ def _verify(data_shape, mask_value, axis, dtype, itype): 
@tvm.testing.uses_gpu -def test_one_hot(): +def test_one_hot(executor_kind): def _get_oshape(indices_shape, depth, axis): oshape = [] true_axis = len(indices_shape) if axis == -1 else axis @@ -629,11 +658,10 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - out_relay = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - indices_np - ) - tvm.testing.assert_allclose(out_relay.numpy(), out_np) + out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate( + func + )(indices_np) + tvm.testing.assert_allclose(out_relay.numpy(), out_np) _verify((3,), 3, 1, 0, -1, "int32") _verify((3,), 3, 1.0, 0.0, -1, "float32") @@ -644,7 +672,7 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): @tvm.testing.uses_gpu -def test_matrix_set_diag(): +def test_matrix_set_diag(executor_kind): def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"): input = relay.var("input", relay.TensorType(input_shape, dtype)) diagonal = relay.var("diagonal", relay.TensorType(diagonal_shape, dtype)) @@ -660,11 +688,10 @@ def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"): out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np, k, align) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - out_relay = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - input_np, diagonal_np - ) - tvm.testing.assert_allclose(out_relay.numpy(), out_np) + out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate( + func + )(input_np, diagonal_np) + tvm.testing.assert_allclose(out_relay.numpy(), out_np) _verify((2, 2), (2,), "float32") _verify((4, 3, 3), (4, 3), "int32") @@ -675,7 +702,7 @@ def _verify(input_shape, diagonal_shape, dtype, k=0, 
align="RIGHT_LEFT"): @tvm.testing.parametrize_targets -def test_nll_loss(dev, target): +def test_nll_loss(executor_kind, dev, target): def _get_oshape(target_shape, reduction): if reduction == "none": return target_shape @@ -702,11 +729,10 @@ def _verify(prediction_shape, reduction="mean", ignore_index=-100, dtype="float3 predictions_np, targets_np, weights_np, reduction, ignore_index ) - for kind in ["graph", "debug"]: - out_relay = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - predictions_np, targets_np, weights_np - ) - tvm.testing.assert_allclose(out_relay.numpy(), out_np, rtol=1e-6, atol=1e-6) + out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + predictions_np, targets_np, weights_np + ) + tvm.testing.assert_allclose(out_relay.numpy(), out_np, rtol=1e-6, atol=1e-6) _verify((10, 5)) _verify((10, 5, 2, 2)) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index c644890bbcbe..726ee578da85 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -30,6 +30,8 @@ from tvm.relay.testing import run_infer_type from tvm.topi.cuda.conv3d_winograd import _infer_tile_size +executor_kind = tvm.testing.parameter("graph", "vm") + @tvm.testing.uses_gpu def test_conv1d_infer_type(): @@ -1301,7 +1303,7 @@ def test_avg_pool2d_no_count_pad(): @tvm.testing.uses_gpu -def test_flatten_infer_type(): +def test_flatten_infer_type(executor_kind): d1, d2, d3, d4 = te.size_var("d1"), te.size_var("d2"), te.size_var("d3"), te.size_var("d4") x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32")) y = relay.nn.batch_flatten(x) @@ -1330,10 +1332,10 @@ def test_flatten_infer_type(): ref_res = x_data.flatten().reshape(o_shape) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 
= relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu @@ -1438,7 +1440,7 @@ def _test_run(dtype): @tvm.testing.uses_gpu @pytest.mark.parametrize("dtype", ["float32", "float16"]) -def test_lrn(dtype): +def test_lrn(executor_kind, dtype): n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", shape=(n, c, h, w), dtype=dtype) y = relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=0.00001, beta=0.75) @@ -1461,14 +1463,14 @@ def test_lrn(dtype): ref_res = tvm.topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu -def test_l2_normalize(): +def test_l2_normalize(executor_kind): n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", shape=(n, c, h, w)) y = relay.nn.l2_normalize(x, eps=0.001, axis=[1]) @@ -1489,10 +1491,10 @@ def test_l2_normalize(): ref_res = tvm.topi.testing.l2_normalize_python(x_data, eps, axis) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = 
relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) def batch_flatten(data): diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index ef4b45ade9aa..9d27839c4703 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -30,7 +30,7 @@ from utils import ref_funcs -executor_kind = tvm.testing.parameter("graph", "debug") +executor_kind = tvm.testing.parameter("graph", "vm") class TestZerosOnes: @@ -644,7 +644,7 @@ def test_full_like_infer_type(): assert yy.checked_type == relay.TensorType((n, c, h, w), "float32") -def test_infer_type_leaky_relu(target, dev): +def test_infer_type_leaky_relu(target, dev, executor_kind): n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) y = relay.nn.leaky_relu(x, alpha=0.1) @@ -663,10 +663,8 @@ def test_infer_type_leaky_relu(target, dev): x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype) ref_res = np.where(x_data > 0, x_data, x_data * 0.1) - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) class TestInferTypePrelu: @@ -684,7 +682,7 @@ class TestInferTypePrelu: ((1, 2, 2, 3), None, 3, (1, 2, 2, 3)), ) - def test_infer_type_prelu(self, target, dev, data, alpha, axis, 
output, dtype): + def test_infer_type_prelu(self, target, dev, executor_kind, data, alpha, axis, output, dtype): x = relay.var("data", relay.TensorType(data, dtype)) if alpha: y = relay.var("alpha", relay.TensorType(alpha, dtype)) @@ -712,14 +710,10 @@ def test_infer_type_prelu(self, target, dev, data, alpha, axis, output, dtype): else: ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data >= 0) * x_data - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - x_data, a_data - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( x_data, a_data ) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) class TestArange: @@ -1051,7 +1045,7 @@ class TestDynamicScatter: ((16, 16, 4, 5), (16, 16, 4, 5), 3), ) - @pytest.mark.parametrize("executor_kind", ["vm", "debug"]) + @pytest.mark.parametrize("executor_kind", ["vm"]) def test_dynamic_scatter(self, target, dev, executor_kind, dshape, ishape, axis): d = relay.var("d", relay.TensorType([relay.Any() for i in range(len(dshape))], "float32")) i = relay.var("i", relay.TensorType([relay.Any() for i in range(len(ishape))], "int64")) @@ -2033,31 +2027,30 @@ def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False): x_data = np.random.randint(50, size=n).astype(dtype) if is_dyn: - backends = ["vm", "debug"] + backend = "vm" else: - backends = ["graph", "debug"] - - for kind in backends: - mod = tvm.ir.IRModule.from_expr(func) - tvm_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - x_data - ) # unique, indices, inverse_indices, num_unique, (counts) - np_res = calc_numpy_unique( - x_data, is_sorted - ) # unique, indices, inverse_indices, num_unique, counts - 
num_unique = np_res[3][0] - - # num_unique - assert num_unique == tvm_res[3].numpy()[0] - # unique - tvm.testing.assert_allclose(tvm_res[0].numpy()[:num_unique], np_res[0], rtol=1e-5) - # indices - tvm.testing.assert_allclose(tvm_res[1].numpy()[:num_unique], np_res[1], rtol=1e-5) - # inverse_indices - tvm.testing.assert_allclose(tvm_res[2].numpy(), np_res[2], rtol=1e-5) - # counts - if return_counts: - tvm.testing.assert_allclose(tvm_res[4].numpy()[:num_unique], np_res[4], rtol=1e-5) + backend = "graph" + + mod = tvm.ir.IRModule.from_expr(func) + tvm_res = relay.create_executor(backend, mod=mod, device=dev, target=target).evaluate()( + x_data + ) # unique, indices, inverse_indices, num_unique, (counts) + np_res = calc_numpy_unique( + x_data, is_sorted + ) # unique, indices, inverse_indices, num_unique, counts + num_unique = np_res[3][0] + + # num_unique + assert num_unique == tvm_res[3].numpy()[0] + # unique + tvm.testing.assert_allclose(tvm_res[0].numpy()[:num_unique], np_res[0], rtol=1e-5) + # indices + tvm.testing.assert_allclose(tvm_res[1].numpy()[:num_unique], np_res[1], rtol=1e-5) + # inverse_indices + tvm.testing.assert_allclose(tvm_res[2].numpy(), np_res[2], rtol=1e-5) + # counts + if return_counts: + tvm.testing.assert_allclose(tvm_res[4].numpy()[:num_unique], np_res[4], rtol=1e-5) for dtype in ["int32", "int64"]: for i in range(8): diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index b9bbef951555..e46832d570e9 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -26,7 +26,7 @@ from tvm.relay import transform from tvm.relay.testing import run_infer_type -executor_kind = tvm.testing.parameter("graph", "debug") +executor_kind = tvm.testing.parameter("graph", "vm") @tvm.testing.uses_gpu @@ -153,14 +153,13 @@ def test_binary_int_broadcast_2(): @tvm.testing.uses_gpu -def test_where(): +def test_where(executor_kind): def run(func, inputs, ref_res): for target, dev in 
tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - *inputs - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + *inputs + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) def verify(x_np, y_np, cond_np): ref_res = np.where(cond_np, x_np, y_np) @@ -398,7 +397,7 @@ def get_test_case(shape, gt_func, test_argmin=False): assert op_res.numpy().item() == ans -def verify_mean_var_std(funcs, shape, axis, keepdims): +def verify_mean_var_std(executor_kind, funcs, shape, axis, keepdims): test_func = funcs[0] ref_func = funcs[1] dtype = "float32" @@ -411,27 +410,26 @@ def verify_mean_var_std(funcs, shape, axis, keepdims): ref_res = ref_func(x_data, axis=axis, dtype=dtype, keepdims=keepdims) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res1[0].numpy(), ref_mean, rtol=1e-5) - tvm.testing.assert_allclose(op_res1[1].numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2[0].numpy(), ref_mean, rtol=1e-5) - tvm.testing.assert_allclose(op_res2[1].numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res[0].numpy(), ref_mean, rtol=1e-5) + tvm.testing.assert_allclose(op_res[1].numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu -def test_mean_var_std(): +def test_mean_var_std(executor_kind): for func in [[relay.mean_variance, np.var], [relay.mean_std, np.std]]: - verify_mean_var_std(func, (2, 3, 4), 1, True) - verify_mean_var_std(func, (2, 3, 4), (1,), True) - verify_mean_var_std(func, (2, 3, 4), -1, True) - 
verify_mean_var_std(func, (2, 3, 4), (0, 1, 2), False) - verify_mean_var_std(func, (4, 4, 3), None, False) - verify_mean_var_std(func, (4, 4, 3), (0, 2), False) - verify_mean_var_std(func, (128, 24, 128), (0, 1), False) - verify_mean_var_std(func, (128, 24, 128), (0, 2), False) - verify_mean_var_std(func, (128, 24, 128), (0, 1), True) - verify_mean_var_std(func, (128, 24, 128), (0, 2), True) + verify_mean_var_std(executor_kind, func, (2, 3, 4), 1, True) + verify_mean_var_std(executor_kind, func, (2, 3, 4), (1,), True) + verify_mean_var_std(executor_kind, func, (2, 3, 4), -1, True) + verify_mean_var_std(executor_kind, func, (2, 3, 4), (0, 1, 2), False) + verify_mean_var_std(executor_kind, func, (4, 4, 3), None, False) + verify_mean_var_std(executor_kind, func, (4, 4, 3), (0, 2), False) + verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 1), False) + verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), False) + verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 1), True) + verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), True) @tvm.testing.uses_gpu diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 10cd91415724..af9c08409c01 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -29,7 +29,7 @@ from tvm import relay, te from tvm.relay.testing import run_infer_type -executor_kind = tvm.testing.parameter("graph", "debug") +executor_kind = tvm.testing.parameter("graph", "vm") def test_resize1d_infer_type(): @@ -279,7 +279,7 @@ def test_crop_and_resize(self, target, dev, executor_kind, layout, interpolate_m @tvm.testing.uses_gpu -def test_multibox_prior(): +def test_multibox_prior(executor_kind): def get_ref_result( dshape, sizes=(1.0,), ratios=(1.0,), steps=(-1.0, -1.0), offsets=(0.5, 0.5), clip=True ): @@ -358,10 +358,10 @@ def verify_multibox_prior( func = relay.Function([x], z) func = run_infer_type(func) for target, dev in 
tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(data) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) sizes = (0.3, 1.5, 0.7) ratios = (1.3, 2.4) @@ -415,7 +415,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): func = relay.Function([x], z.astuple()) func = run_infer_type(func) for target, dev in tvm.testing.enabled_targets(): - out = relay.create_executor("debug", device=dev, target=target).evaluate(func)(np_data) + out = relay.create_executor("vm", device=dev, target=target).evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].numpy(), np_out1, rtol=1e-3, atol=1e-04) tvm.testing.assert_allclose(out[1].numpy(), np_out2, rtol=1e-3, atol=1e-04) @@ -428,7 +428,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): @tvm.testing.uses_gpu -def test_non_max_suppression(): +def test_non_max_suppression(executor_kind): def verify_nms( x0_data, x1_data, @@ -486,22 +486,14 @@ def verify_nms( func_indices = relay.Function([x0, x1, x2, x3], z_indices) func_indices = run_infer_type(func_indices) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - x0_data, x1_data, x2_data, x3_data - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( x0_data, x1_data, x2_data, x3_data ) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, 
rtol=1e-5) - op_indices_res1 = relay.create_executor("graph", device=dev, target=target).evaluate( - func_indices - )(x0_data, x1_data, x2_data, x3_data) - tvm.testing.assert_allclose(op_indices_res1[0].numpy(), ref_indices_res, rtol=1e-5) - op_indices_res2 = relay.create_executor("debug", device=dev, target=target).evaluate( - func_indices - )(x0_data, x1_data, x2_data, x3_data) - tvm.testing.assert_allclose(op_indices_res2[0].numpy(), ref_indices_res, rtol=1e-5) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_indices_res = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) + tvm.testing.assert_allclose(op_indices_res[0].numpy(), ref_indices_res, rtol=1e-5) np_data = np.array( [ @@ -633,7 +625,7 @@ def verify_nms( @tvm.testing.uses_gpu -def test_multibox_transform_loc(): +def test_multibox_transform_loc(executor_kind): def test_default_value(): num_anchors = 3 num_classes = 3 @@ -683,14 +675,10 @@ def test_default_value(): func = relay.Function([cls_prob, loc_pred, anchors], nms) func = run_infer_type(func) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - np_cls_prob, np_loc_preds, np_anchors - ) - tvm.testing.assert_allclose(op_res1.numpy(), expected_np_out, rtol=1e-5) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( np_cls_prob, np_loc_preds, np_anchors ) - tvm.testing.assert_allclose(op_res2.numpy(), expected_np_out, rtol=1e-5) + tvm.testing.assert_allclose(op_res.numpy(), expected_np_out, rtol=1e-5) def test_threshold(): num_anchors = 5 @@ -727,7 +715,7 @@ def test_threshold(): @tvm.testing.uses_gpu -def test_roi_align(): +def test_roi_align(executor_kind): def verify_roi_align( data_shape, rois_shape, @@ -778,14 +766,10 @@ def verify_roi_align( 
mode=mode, ) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( np_data, np_rois ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, atol=1e-6, rtol=1e-3) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( - np_data, np_rois - ) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, atol=1e-6, rtol=1e-3) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, atol=1e-6, rtol=1e-3) def verify_roi_align_nchw( data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode @@ -848,7 +832,7 @@ def verify_roi_align_nhwc( @tvm.testing.uses_gpu -def test_roi_pool(): +def test_roi_pool(executor_kind): def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale): data = relay.var("data", relay.ty.TensorType(data_shape, "float32")) rois = relay.var("rois", relay.ty.TensorType(rois_shape, "float32")) @@ -875,21 +859,17 @@ def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale): np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale ) for target, dev in tvm.testing.enabled_targets(): - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - np_data, np_rois - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-4) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( np_data, np_rois ) - tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-4) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) verify_roi_pool((1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0) verify_roi_pool((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5) @tvm.testing.uses_gpu -def test_proposal(): +def test_proposal(executor_kind): def 
verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): cls_prob = relay.var("cls_prob", relay.ty.TensorType(np_cls_prob.shape, "float32")) bbox_pred = relay.var("bbox_pred", relay.ty.TensorType(np_bbox_pred.shape, "float32")) @@ -905,14 +885,10 @@ def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): print("Skip test because %s is not enabled." % target) continue dev = tvm.device(target, 0) - op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( np_cls_prob, np_bbox_pred, np_im_info ) - tvm.testing.assert_allclose(op_res1.numpy(), np_out, rtol=1e-4) - op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)( - np_cls_prob, np_bbox_pred, np_im_info - ) - tvm.testing.assert_allclose(op_res2.numpy(), np_out, rtol=1e-4) + tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=1e-4) attrs = { "scales": (0.5,), @@ -986,7 +962,7 @@ def verify_yolo_reorg(shape, stride, out_shape): @tvm.testing.uses_gpu -def test_yolo_reorg(): +def test_yolo_reorg(executor_kind): def verify_yolo_reorg(shape, stride): x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") ref_res = tvm.topi.testing.reorg_python(x_data, stride) @@ -1000,11 +976,10 @@ def verify_yolo_reorg(shape, stride): func = relay.Function([x], z) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) verify_yolo_reorg((1, 100, 20, 20), 10) verify_yolo_reorg((1, 4, 6, 6), 2) @@ -1155,7 +1130,7 @@ def test_run( @tvm.testing.uses_gpu -def test_depth_to_space(): +def 
test_depth_to_space(executor_kind): def verify_depth_to_space(dshape, block_size, layout, mode): if layout == "NHWC": out_shape = [ @@ -1188,11 +1163,10 @@ def verify_depth_to_space(dshape, block_size, layout, mode): func = relay.Function([x], z) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) for layout in ["NHWC", "NCHW"]: for mode in ["DCR", "CDR"]: @@ -1200,7 +1174,7 @@ def verify_depth_to_space(dshape, block_size, layout, mode): @tvm.testing.uses_gpu -def test_space_to_depth(): +def test_space_to_depth(executor_kind): def verify_space_to_depth(dshape, block_size, layout): if layout == "NHWC": out_shape = [ @@ -1233,11 +1207,10 @@ def verify_space_to_depth(dshape, block_size, layout): func = relay.Function([x], z) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) for layout in ["NHWC", "NCHW"]: verify_space_to_depth((1, 4, 4, 4), 2, layout) @@ -1369,7 +1342,7 @@ def test_dilation2d( @tvm.testing.uses_gpu -def test_affine_grid(): +def test_affine_grid(executor_kind): def verify_affine_grid(num_batch, target_shape): dtype = "float32" data_shape = (num_batch, 2, 3) @@ -1385,18 +1358,17 @@ def verify_affine_grid(num_batch, target_shape): ref_res = tvm.topi.testing.affine_grid_python(data_np, target_shape) for target, dev in 
tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res1 = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - data_np - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data_np + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5, atol=1e-5) verify_affine_grid(1, (16, 32)) verify_affine_grid(4, (16, 32)) @tvm.testing.uses_gpu -def test_grid_sample(): +def test_grid_sample(executor_kind): def verify_grid_sample( data_shape, grid_shape, method="bilinear", padding_mode="zeros", align_corners=True ): @@ -1436,11 +1408,10 @@ def verify_grid_sample( ) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res1 = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - data_np, grid_np - ) - tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + data_np, grid_np + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5, atol=1e-5) methods = ["nearest", "bilinear", "bicubic"] padding_modes = ["zeros", "border", "reflection"] @@ -1462,7 +1433,7 @@ def verify_grid_sample( @tvm.testing.uses_gpu -def test_space_to_batch_nd(): +def test_space_to_batch_nd(executor_kind): def verify_space_to_batch_nd(dshape, block_shape, paddings): x_data = np.random.uniform(size=dshape).astype("float32") pad_before, pad_after = map(list, zip(*paddings)) @@ -1479,18 +1450,17 @@ def verify_space_to_batch_nd(dshape, block_shape, paddings): func = relay.Function([x], z) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) + op_res = 
relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) verify_space_to_batch_nd([3, 3, 2, 1], [3], [[0, 0]]) verify_space_to_batch_nd([2, 2, 4, 1], [2, 2], [[0, 0], [2, 0]]) @tvm.testing.uses_gpu -def test_batch_to_space_nd(): +def test_batch_to_space_nd(executor_kind): def verify_batch_to_space_nd(dshape, block_shape, crops): x_data = np.random.uniform(size=dshape).astype("float32") crop_begin_list, crop_end_list = map(list, zip(*crops)) @@ -1507,18 +1477,17 @@ def verify_batch_to_space_nd(dshape, block_shape, crops): func = relay.Function([x], z) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4) verify_batch_to_space_nd([4, 1, 1, 3], [2, 2], [[0, 0], [0, 0]]) verify_batch_to_space_nd([8, 1, 3, 1], [2, 2], [[0, 0], [2, 0]]) @tvm.testing.uses_gpu -def test_all_class_non_max_suppression(): +def test_all_class_non_max_suppression(executor_kind): def verify_all_class_non_max_suppression( boxes_np, scores_np, @@ -1542,12 +1511,11 @@ def verify_all_class_non_max_suppression( func = run_infer_type(func) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - selected_indices, num_detections = relay.create_executor( - kind, device=dev, target=target - ).evaluate(func)(boxes_np, scores_np) - tvm_res = selected_indices.numpy()[: num_detections.numpy()[0]] - np.testing.assert_equal(tvm_res, expected_indices) + selected_indices, num_detections = relay.create_executor( + executor_kind, device=dev, target=target + ).evaluate(func)(boxes_np, scores_np) + tvm_res = 
selected_indices.numpy()[: num_detections.numpy()[0]] + np.testing.assert_equal(tvm_res, expected_indices) boxes = np.array( [ diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py index 48c58dc2dc33..78db5b87385d 100644 --- a/tests/python/relay/test_op_level6.py +++ b/tests/python/relay/test_op_level6.py @@ -23,6 +23,8 @@ from tvm.topi.testing import searchsorted_ref import tvm.testing +executor_kind = tvm.testing.parameter("graph", "vm") + @tvm.testing.uses_gpu def test_sort(): @@ -40,16 +42,15 @@ def verify_sort(shape, axis, is_ascend, is_dyn=False, in_dtype="float32"): ref_res = -np.sort(-x_data, axis=axis) if is_dyn: - backends = ["vm", "debug"] + backend = "vm" else: - backends = ["graph", "debug"] + backend = "graph" for target, dev in tvm.testing.enabled_targets(): - for kind in backends: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor(backend, mod=mod, device=dev, target=target).evaluate()( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) for is_dyn in [False, True]: verify_sort((2, 3, 4), axis=0, is_ascend=False, is_dyn=is_dyn) @@ -76,16 +77,15 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False, in_dtype="float3 ref_res = np.argsort(-x_data, axis=axis, kind="stable") if is_dyn: - backends = ["vm", "debug"] + backend = "vm" else: - backends = ["graph", "debug"] + backend = "graph" for target, dev in tvm.testing.enabled_targets(): - for kind in backends: - mod = tvm.ir.IRModule.from_expr(func) - op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()( - x_data - ) - tvm.testing.assert_allclose(op_res.numpy(), ref_res.astype(dtype), rtol=1e-5) + mod = tvm.ir.IRModule.from_expr(func) + op_res = relay.create_executor(backend, 
mod=mod, device=dev, target=target).evaluate()( + x_data + ) + tvm.testing.assert_allclose(op_res.numpy(), ref_res.astype(dtype), rtol=1e-5) for is_dyn in [False, True]: for dtype in ["int32", "int64", "float32", "float64"]: @@ -102,7 +102,7 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False, in_dtype="float3 @tvm.testing.uses_gpu -def test_topk(): +def test_topk(executor_kind): def verify_topk(k, axis, ret_type, is_ascend, dtype, in_dtype="float32"): shape = (20, 100) x = relay.var("x", relay.TensorType(shape, in_dtype)) @@ -129,17 +129,16 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype, in_dtype="float32"): np_indices = np_indices.astype(dtype) for target, dev in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)( - np_data - ) - if ret_type == "both": - tvm.testing.assert_allclose(op_res[0].numpy(), np_values) - tvm.testing.assert_allclose(op_res[1].numpy(), np_indices) - elif ret_type == "values": - tvm.testing.assert_allclose(op_res.numpy(), np_values) - else: - tvm.testing.assert_allclose(op_res.numpy(), np_indices) + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)( + np_data + ) + if ret_type == "both": + tvm.testing.assert_allclose(op_res[0].numpy(), np_values) + tvm.testing.assert_allclose(op_res[1].numpy(), np_indices) + elif ret_type == "values": + tvm.testing.assert_allclose(op_res.numpy(), np_values) + else: + tvm.testing.assert_allclose(op_res.numpy(), np_indices) np.random.seed(0) for k in [0, 1, 5]: From 6247bf48aaa59be9549dd8c342702c6005f16c5f Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Mon, 23 May 2022 11:59:02 +0100 Subject: [PATCH 58/59] [CMSIS-NN] Aligned buffer sizes for Conv2D post CMSIS-NN SHA update (#11359) --- .../backend/contrib/cmsisnn/buffer_size.cc | 18 +++++--- .../backend/contrib/cmsisnn/buffer_size.h | 3 +- 
.../backend/contrib/cmsisnn/relay_to_tir.cc | 6 +-- .../contrib/cmsisnn/buffer_size_test.cc | 41 ++++++++++--------- tests/python/relay/aot/test_crt_aot.py | 2 +- 5 files changed, 40 insertions(+), 30 deletions(-) diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.cc b/src/relay/backend/contrib/cmsisnn/buffer_size.cc index 2502a09e75d6..b6b98c0fc34f 100644 --- a/src/relay/backend/contrib/cmsisnn/buffer_size.cc +++ b/src/relay/backend/contrib/cmsisnn/buffer_size.cc @@ -29,24 +29,30 @@ namespace cmsisnn { int Conv2dBufferSize(CMSISNNFlags flags, int32_t padding_w, int32_t padding_h, int32_t input_n, int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w, - int32_t stride_w, int32_t stride_h, int32_t filter_w, int32_t filter_h) { + int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h, + int32_t filter_w, int32_t filter_h) { bool is1x1 = (padding_w == 0) && (padding_h == 0) && (input_c % 4 == 0) && (stride_w == 1) && - (stride_h == 1) && (filter_w == 1) && (filter_h == 1); - bool is1xN = - (output_h == 1) && (input_h == 1) && (filter_h == 1) && (output_w % 4 == 0) && (input_n == 1); + (stride_h == 1) && (filter_w == 1) && (filter_h == 1) && (dilation_w == 1) && + (dilation_h == 1); + bool is1xN = (output_h == 1) && (input_h == 1) && (filter_h == 1) && (output_w % 4 == 0) && + (input_n == 1) && (dilation_w == 1) && (dilation_h == 1); if (is1x1) { return 0; } if (is1xN) { - if (flags.dsp && !flags.mve) { + if (!flags.mve) { return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); } return 0; } - if (flags.dsp) { + if (flags.mve) { + int32_t col_length = input_c * filter_w * filter_h; + col_length = (col_length + 7) / 8; + return 4 * col_length * 8 * (int32_t)sizeof(int8_t); + } else { return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); } return 0; diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.h b/src/relay/backend/contrib/cmsisnn/buffer_size.h index dec3c3eafc48..e89763fd5a2d 100644 
--- a/src/relay/backend/contrib/cmsisnn/buffer_size.h +++ b/src/relay/backend/contrib/cmsisnn/buffer_size.h @@ -56,7 +56,8 @@ namespace cmsisnn { */ int Conv2dBufferSize(CMSISNNFlags flags, int32_t padding_w, int32_t padding_h, int32_t input_n, int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w, - int32_t stride_w, int32_t stride_h, int32_t filter_w, int32_t filter_h); + int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h, + int32_t filter_w, int32_t filter_h); /*! * \brief Calculates the appropriate buffer size for CMSIS-NN Depthwise Convolutions diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index 210175817f9c..dc5537ee905d 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -238,9 +238,9 @@ class RelayToTIRVisitor : public MixedModeMutator { context_buffer_size = DepthwiseConv2dBufferSize(flags, input_n, input_c, output_c, filter_w, filter_h); } else { - context_buffer_size = - Conv2dBufferSize(flags, padding_w, padding_h, input_n, input_h, input_c, output_h, - output_w, stride_w, stride_h, filter_w, filter_h); + context_buffer_size = Conv2dBufferSize(flags, padding_w, padding_h, input_n, input_h, input_c, + output_h, output_w, stride_w, stride_h, dilation_w, + dilation_h, filter_w, filter_h); } if (context_buffer_size) { diff --git a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc index 7b8047a3b294..b7458858d4ab 100644 --- a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc +++ b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc @@ -44,7 +44,7 @@ class CMSISNNCalculatedBufferSize : public testing::TestWithParam Date: Mon, 23 May 2022 23:00:52 +0800 Subject: [PATCH 59/59] [TVMScript] fix typo for block syntax (#11407) --- python/tvm/script/parser.py | 4 +- python/tvm/tir/schedule/schedule.py | 
2 +- ...est_tir_transform_compact_buffer_region.py | 62 +++++++++---------- .../test_tir_transform_flatten_buffer.py | 22 +++---- .../unittest/test_tvmscript_complete.py | 12 ++-- .../unittest/test_tvmscript_roundtrip.py | 2 +- 6 files changed, 52 insertions(+), 52 deletions(-) diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index daeb018ea989..a376cb7eb08d 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -786,9 +786,9 @@ def transform_With(self, node): withitem = (expr context_expr, expr? optional_vars) By now 2 patterns of With is supported: 1. with scope handler with symbol def - with T.block(*axes)/T.allocate() as targets: + with T.allocate() as targets: 2. with scope handler without symbol def - with T.let()/T.Assert()/T.attr()/T.realize() + with T.block(*axes)/T.let()/T.Assert()/T.attr()/T.realize() """ if not isinstance(node.rhs, ast.Call): diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index 8bfd9063158c..6474ba0baa3d 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -592,7 +592,7 @@ def before_split(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block("B") as [vi, vj]: + with T.block("B"): vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 diff --git a/tests/python/unittest/test_tir_transform_compact_buffer_region.py b/tests/python/unittest/test_tir_transform_compact_buffer_region.py index 8ad95bd4bc0c..3e538e27a494 100644 --- a/tests/python/unittest/test_tir_transform_compact_buffer_region.py +++ b/tests/python/unittest/test_tir_transform_compact_buffer_region.py @@ -40,12 +40,12 @@ def elementwise_func(a: T.handle, c: T.handle) -> None: T.writes(C[i, 0:16]) B = T.alloc_buffer((16, 16), "float32") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i, j]) T.writes(B[i, j]) B[i, j] = A[i, j] + 
1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[i, j]) T.writes(C[i, j]) C[i, j] = B[i, j] * 2.0 @@ -61,12 +61,12 @@ def compacted_elementwise_func(a: T.handle, c: T.handle) -> None: T.writes(C[i, 0:16]) B = T.alloc_buffer((1, 16), "float32") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i, j]) T.writes(B[0, j]) B[0, j] = A[i, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[0, j]) T.writes(C[i, j]) C[i, j] = B[0, j] * 2.0 @@ -97,7 +97,7 @@ def param_buffer_access_func(a: T.handle, c: T.handle) -> None: T.reads(A[i, 0:16]) T.writes(B[i, 0:16]) for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i, j]) T.writes(B[i, j]) B[i, j] = A[i, j] + 1.0 @@ -115,12 +115,12 @@ def shared_mem_func(a: T.handle, c: T.handle) -> None: T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16]) B = T.alloc_buffer((16, 16), "float32", scope="shared") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i0 * 8 + i1 * 4 + i2, j]) T.writes(B[i0 * 8 + i1 * 4 + i2, j]) B[i0 * 8 + i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[i0 * 8 + i1 * 4 + i2, j]) T.writes(C[i0 * 8 + i1 * 4 + i2, j]) C[i0 * 8 + i1 * 4 + i2, j] = B[i0 * 8 + i1 * 4 + i2, j] * 2.0 @@ -138,12 +138,12 @@ def compacted_shared_mem_func(a: T.handle, c: T.handle) -> None: T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16]) B = T.alloc_buffer((8, 16), "float32", scope="shared") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i0 * 8 + i1 * 4 + i2, j]) T.writes(B[i1 * 4 + i2, j]) B[i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[i1 * 4 + i2, j]) T.writes(C[i0 * 8 + i1 * 4 + i2, j]) C[i0 * 8 + i1 * 4 + i2, j] = B[i1 * 4 + i2, j] * 2.0 @@ -161,12 +161,12 @@ def warp_mem_func(a: T.handle, c: T.handle) -> None: T.writes(C[i0 * 8 + i1 * 4 + i2, 
0:16]) B = T.alloc_buffer((16, 16), "float32", scope="warp") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i0 * 8 + i1 * 4 + i2, j]) T.writes(B[i0 * 8 + i1 * 4 + i2, j]) B[i0 * 8 + i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[i0 * 8 + i1 * 4 + i2, j]) T.writes(C[i0 * 8 + i1 * 4 + i2, j]) C[i0 * 8 + i1 * 4 + i2, j] = B[i0 * 8 + i1 * 4 + i2, j] * 2.0 @@ -184,12 +184,12 @@ def compacted_warp_mem_func(a: T.handle, c: T.handle) -> None: T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16]) B = T.alloc_buffer((4, 16), "float32", scope="warp") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i0 * 8 + i1 * 4 + i2, j]) T.writes(B[i2, j]) B[i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[i2, j]) T.writes(C[i0 * 8 + i1 * 4 + i2, j]) C[i0 * 8 + i1 * 4 + i2, j] = B[i2, j] * 2.0 @@ -205,12 +205,12 @@ def symbolic_func(a: T.handle, c: T.handle, n: T.int32) -> None: T.writes(C[i * 8 : i * 8 + 8]) B = T.alloc_buffer((n * 8,), "float32") for j in range(0, 8): - with T.block() as []: + with T.block(): T.reads(A[i * 8 + j]) T.writes(B[i * 8 + j]) B[i * 8 + j] = A[i * 8 + j] + 1.0 for j in range(0, 8): - with T.block() as []: + with T.block(): T.reads(B[i * 8 + j]) T.writes(C[i * 8 + j]) C[i * 8 + j] = B[i * 8 + j] * 2.0 @@ -226,12 +226,12 @@ def compacted_symbolic_func(a: T.handle, c: T.handle, n: T.int32) -> None: T.writes(C[i * 8 : i * 8 + 8]) B = T.alloc_buffer((T.min(n, 1) * 8,), "float32") for j in range(0, 8): - with T.block() as []: + with T.block(): T.reads(A[i * 8 + j]) T.writes(B[j]) B[j] = A[i * 8 + j] + 1.0 for j in range(0, 8): - with T.block() as []: + with T.block(): T.reads(B[j]) T.writes(C[i * 8 + j]) C[i * 8 + j] = B[j] * 2.0 @@ -247,7 +247,7 @@ def complex_func(a: T.handle, c: T.handle, n: T.int32) -> None: T.writes(C[0, 8]) B = T.alloc_buffer((8, 8), "float32") for j in 
range(0, 4): - with T.block() as []: + with T.block(): D = T.alloc_buffer((8, 8), "float32") T.reads(A[i, j]) T.writes(B[i, j]) @@ -256,12 +256,12 @@ def complex_func(a: T.handle, c: T.handle, n: T.int32) -> None: for k in range(2, 4): B[i, j] = A[i, j] + D[k, j] for j in range(3, 5): - with T.block() as []: + with T.block(): T.reads(B[i, j]) T.writes(C[i, j]) C[i, j] = B[i, j] for j in range(6, 8): - with T.block() as []: + with T.block(): T.reads(B[i, j]) T.writes(C[i, j]) C[i, j] = B[i, j] @@ -277,7 +277,7 @@ def compacted_complex_func(a: T.handle, c: T.handle, n: T.int32) -> None: T.writes(C[0, 8]) B = T.alloc_buffer((1, 8), "float32") for j in range(0, 4): - with T.block() as []: + with T.block(): D = T.alloc_buffer((6, 1), "float32") T.reads(A[i, j]) T.writes(B[0, j]) @@ -286,12 +286,12 @@ def compacted_complex_func(a: T.handle, c: T.handle, n: T.int32) -> None: for k in range(2, 4): B[0, j] = A[i, j] + D[k - 2, 0] for j in range(3, 5): - with T.block() as []: + with T.block(): T.reads(B[0, j]) T.writes(C[i, j]) C[i, j] = B[0, j] for j in range(6, 8): - with T.block() as []: + with T.block(): T.reads(B[0, j]) T.writes(C[i, j]) C[i, j] = B[0, j] @@ -309,12 +309,12 @@ def match_buffer_func(a: T.handle, c: T.handle) -> None: with T.block(): B0 = T.match_buffer(B[i, 0:16], (16)) for j in range(0, 16): - with T.block() as []: + with T.block(): A1 = T.match_buffer(A0[j], ()) B1 = T.match_buffer(B0[j], ()) B1[()] = A1[()] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): C1 = T.match_buffer(C0[j], ()) B2 = T.match_buffer(B[i, j], ()) C1[()] = B2[()] * 2.0 @@ -332,12 +332,12 @@ def compacted_match_buffer_func(a: T.handle, c: T.handle) -> None: with T.block(): B0 = T.match_buffer(B[0, 0:16], (16)) for j in range(0, 16): - with T.block() as []: + with T.block(): A1 = T.match_buffer(A0[j], ()) B1 = T.match_buffer(B0[j], ()) B1[()] = A1[()] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): C1 = T.match_buffer(C0[j], ()) B2 = 
T.match_buffer(B[0, j], ()) C1[()] = B2[()] * 2.0 @@ -353,13 +353,13 @@ def storage_align_func(a: T.handle, c: T.handle) -> None: T.writes(C[i, 0:16]) B = T.alloc_buffer((16, 16), "float32") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i, j]) T.writes(B[i, j]) T.block_attr({"buffer_dim_align": [[0, 0, 16, 15]]}) B[i, j] = A[i, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[i, j]) T.writes(C[i, j]) C[i, j] = B[i, j] * 2.0 @@ -375,13 +375,13 @@ def compacted_storage_align_func(a: T.handle, c: T.handle) -> None: T.writes(C[i, 0:16]) B = T.alloc_buffer((1, 16), strides=(31, 1), dtypes="float32") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i, j]) T.writes(B[0, j]) T.block_attr({"buffer_dim_align": [[0, 0, 16, 15]]}) B[0, j] = A[i, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[0, j]) T.writes(C[i, j]) C[i, j] = B[0, j] * 2.0 diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py index 68b1ad338964..65be43aba321 100644 --- a/tests/python/unittest/test_tir_transform_flatten_buffer.py +++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py @@ -37,12 +37,12 @@ def compacted_elementwise_func(a: T.handle, c: T.handle) -> None: T.writes(C[i, 0:16]) B = T.alloc_buffer([1, 16], "float32", scope="global") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i, j]) T.writes(B[0, j]) B[0, j] = A[i, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[0, j]) T.writes(C[i, j]) C[i, j] = B[0, j] * 2.0 @@ -74,12 +74,12 @@ def compacted_gpu_func(a: T.handle, c: T.handle) -> None: T.writes(C[i0 * 4 + i1 * 2 + i2, 0:16]) B = T.alloc_buffer([1, 16], "float32", scope="local") for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i0 * 4 + i1 * 2 + i2, j]) T.writes(B[0, j]) B[0, j] = A[i0 * 4 + i1 * 2 
+ i2, j] + 1.0 for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[0, j]) T.writes(C[i0 * 4 + i1 * 2 + i2, j]) C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0 @@ -117,12 +117,12 @@ def compacted_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> T.writes(C[i, m]) B = T.alloc_buffer((m,), "float32", scope="global") for j in range(0, m): - with T.block() as []: + with T.block(): T.reads(A[i, j]) T.writes(B[j]) B[j] = A[i, j] + 1.0 for j in range(0, m): - with T.block() as []: + with T.block(): T.reads(B[j]) T.writes(C[i, j]) C[i, j] = B[j] * 2.0 @@ -149,7 +149,7 @@ def compacted_predicate_func(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (32), "float32") for i, j in T.grid(5, 7): - with T.block() as []: + with T.block(): T.reads(A[i * 7 + j]) T.writes(C[i * 7 + j]) T.where(i * 7 + j < 32) @@ -174,7 +174,7 @@ def compacted_unit_loop_func(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (32), "float32") for x, y, z in T.grid(4, 1, 8): - with T.block() as []: + with T.block(): T.reads(A[x * 8 + y * 8 + z]) T.writes(C[x * 8 + y * 8 + z]) C[x * 8 + y * 8 + z] = A[x * 8 + y * 8 + z] + 1.0 @@ -197,7 +197,7 @@ def compacted_multi_alloc_func(a: T.handle, d: T.handle) -> None: D = T.match_buffer(d, (32), "float32") for i in range(0, 32): - with T.block() as []: + with T.block(): T.reads(A[i]) T.writes(D[i]) B = T.alloc_buffer((32,), scope="global") @@ -233,13 +233,13 @@ def compacted_strided_buffer_func(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer([4, 16], "float32", strides=[17, 1], scope="global") for i1 in range(0, 4): for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(A[i0 * 4 + i1, j]) T.writes(B[i1, j]) B[i1, j] = A[i0 * 4 + i1, j] + 1.0 for i1 in range(0, 4): for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads(B[i1, j]) T.writes(C[i0 * 4 + i1, j]) C[i0 * 4 + i1, j] = B[i1, j] * 2.0 diff --git a/tests/python/unittest/test_tvmscript_complete.py 
b/tests/python/unittest/test_tvmscript_complete.py index 17e6d94e6744..c4b4afb24f82 100644 --- a/tests/python/unittest/test_tvmscript_complete.py +++ b/tests/python/unittest/test_tvmscript_complete.py @@ -62,7 +62,7 @@ def elementwise_with_root(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block() as []: + with T.block(): for i, j in T.grid(128, 128): with T.block(): vi, vj = T.axis.remap("SS", [i, j]) @@ -78,8 +78,8 @@ def func_with_opaque_block(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block() as []: - with T.block() as []: + with T.block(): + with T.block(): B[0, 0] = A[0, 0] + T.float32(1) for i, j in T.grid(128, 128): with T.block(): @@ -93,7 +93,7 @@ def func_with_part_access_region(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block() as []: + with T.block(): for i, j in T.grid(128, 128): with T.block(): vi, vj = T.axis.remap("SS", [i, j]) @@ -263,7 +263,7 @@ def match_buffer_func(a: T.handle) -> None: A0 = T.match_buffer(A[i, 0:16], (16)) with T.block(): for j in range(0, 16): - with T.block() as []: + with T.block(): A1 = T.match_buffer(A0[j], ()) A1[()] = 1.0 @@ -280,7 +280,7 @@ def expected_match_buffer_func(a: T.handle) -> None: T.reads([]) T.writes(A0[0:16]) for j in range(0, 16): - with T.block() as []: + with T.block(): T.reads([]) T.writes(A0[j]) A1 = T.match_buffer(A0[j], ()) diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 948a76216831..f6db826dfda6 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -2840,7 +2840,7 @@ def rank0_block(a: T.handle) -> None: B = T.alloc_buffer((), "float32") B[()] = A[()] - with T.block("update") as []: + with T.block("update"): 
T.reads([A[()]]) T.writes([B[()]]) for i in range(1):