[cache flush] port cache flush to ansor (apache#32)

jcf94 · Jun 20, 2020 · 145e61c · 145e61c
1 parent 36cd9ef
commit 145e61c
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 3 deletions.
diff --git a/scripts/tune_test.py b/scripts/tune_test.py
@@ -22,7 +22,8 @@ def create_tune_option(target, log_file, n_trials, num_measure_per_iter, verbose
             measure_ctx = ansor.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400)
             runner = measure_ctx.runner
         else:
-            runner = ansor.LocalRunner(repeat=1, min_repeat_ms=400)
+            os.environ['TVM_AUTO_CACHE_FLUSH'] = "1"
+            runner = ansor.LocalRunner(repeat=10, number=1, min_repeat_ms=0, timeout=run_timeout)
     else:
         os.environ['TVM_NDK_CC'] = ndk_cc
         builder = ansor.LocalBuilder(timeout=build_timeout, build_func='ndk')

diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
@@ -24,9 +24,14 @@
 #include <tvm/runtime/container.h>
 #include <tvm/runtime/registry.h>
 
+#include <cstdlib>
 #include <cstring>
 #include <memory>
 
+#if defined(_M_X64) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
 #include "rpc_endpoint.h"
 #include "rpc_session.h"
 
@@ -300,6 +305,23 @@ std::shared_ptr<RPCSession> RPCModuleGetSession(Module mod) {
   return rmod->sess();
 }
 
+// Flush the cache lines covering [p, p + allocation_size) via clflush (64-byte lines assumed).
+// Compiles to a no-op on targets other than x86-64. TODO(FrozenGene): support ARM.
+inline void CacheFlush(const char* p, unsigned int allocation_size) {
+#if (defined(_M_X64) || defined(__x86_64__))
+  const size_t cache_line = 64;  // flush stride in bytes
+  if (p == nullptr || allocation_size == 0) {
+    return;  // nothing to flush
+  }
+  for (size_t i = 0; i < allocation_size; i += cache_line) {
+    _mm_clflush(static_cast<const void*>(&p[i]));
+  }
+  // p is not required to be line-aligned, so the strided loop can stop one line short of
+  // the line holding the last byte of the range; flush that line explicitly.
+  _mm_clflush(static_cast<const void*>(&p[allocation_size - 1]));
+#endif
+}
+
 PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat,
                              int min_repeat_ms) {
   CHECK(pf != nullptr);
@@ -313,12 +335,21 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe
   auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) mutable {
     TVMRetValue temp;
     std::ostringstream os;
+    const char* cache_flush = std::getenv("TVM_AUTO_CACHE_FLUSH");
     // skip first time call, to activate lazy compilation components.
     pf.CallPacked(args, &temp);
 
     DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
 
     for (int i = 0; i < repeat; ++i) {
+      if (cache_flush && std::atoi(cache_flush) != 0) {
+        CHECK_EQ(number, 1);
+        // we want to keep input data
+        for (int j = 1; j < args.size(); j++) {
+          CacheFlush((char*)(args[j].operator DLTensor*()->data),
+                     GetDataSize(*(args[j].operator DLTensor*())));
+        }
+      }
       std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin,
           tend;
       double duration_ms = 0.0;

diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
@@ -166,8 +166,13 @@ class ThreadGroup::Impl {
 #if defined(_M_X64) || defined(__x86_64__)
       big_count /= 2;  // ignore hyper-threading
 #endif
-      for (int i = 0; i < big_count; ++i) {
-        CPU_SET(sorted_order_[i], &cpuset);
+      const char* bind_master_core_0 = getenv("TVM_BIND_MASTER_CORE_0");
+      if (bind_master_core_0 && atoi(bind_master_core_0) != 0) {
+        CPU_SET(sorted_order_[0], &cpuset);
+      } else {
+        for (int i = 0; i < big_count; ++i) {
+          CPU_SET(sorted_order_[i], &cpuset);
+        }
       }
     }
 #if defined(__ANDROID__)