Skip to content

Commit

Permalink
[cache flush] port cache flush to ansor (apache#32)
Browse files Browse the repository at this point in the history
  • Loading branch information
FrozenGene authored and merrymercy committed Jun 20, 2020
1 parent 36cd9ef commit 145e61c
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 3 deletions.
3 changes: 2 additions & 1 deletion scripts/tune_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def create_tune_option(target, log_file, n_trials, num_measure_per_iter, verbose
measure_ctx = ansor.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400)
runner = measure_ctx.runner
else:
runner = ansor.LocalRunner(repeat=1, min_repeat_ms=400)
os.environ['TVM_AUTO_CACHE_FLUSH'] = "1"
runner = ansor.LocalRunner(repeat=10, number=1, min_repeat_ms=0, timeout=run_timeout)
else:
os.environ['TVM_NDK_CC'] = ndk_cc
builder = ansor.LocalBuilder(timeout=build_timeout, build_func='ndk')
Expand Down
31 changes: 31 additions & 0 deletions src/runtime/rpc/rpc_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,14 @@
#include <tvm/runtime/container.h>
#include <tvm/runtime/registry.h>

#include <cstdlib>
#include <cstring>
#include <memory>

#if defined(_M_X64) || defined(__x86_64__)
#include <x86intrin.h>
#endif

#include "rpc_endpoint.h"
#include "rpc_session.h"

Expand Down Expand Up @@ -300,6 +305,23 @@ std::shared_ptr<RPCSession> RPCModuleGetSession(Module mod) {
return rmod->sess();
}

/*!
 * \brief Evict every cache line that overlaps a buffer from the CPU caches.
 *
 * Only implemented for x86-64, where one clflush is issued per cache line
 * (a 64-byte line size is assumed). On any other target this is a no-op.
 *
 * \param p Start of the buffer; a null pointer is ignored.
 * \param allocation_size Size of the buffer in bytes; zero is ignored.
 */
inline void CacheFlush(const char* p, size_t allocation_size) {
  // TODO(FrozenGene): Support ARM.
#if (defined(_M_X64) || defined(__x86_64__))
  const size_t cache_line = 64;

  if (p == nullptr || allocation_size == 0) {
    return;
  }

  // Walk addresses rather than offsets from p: the buffer is not guaranteed to
  // start on a cache-line boundary, so begin at the line that contains p and
  // continue until the line that contains the last byte has been flushed.
  // Stepping by 64 from an unaligned p could skip that trailing line.
  const size_t begin = reinterpret_cast<size_t>(p);
  const size_t end = begin + allocation_size;
  for (size_t addr = begin & ~(cache_line - 1); addr < end; addr += cache_line) {
    _mm_clflush(reinterpret_cast<const void*>(addr));
  }
#else
  // Nothing to flush on unsupported targets; silence unused-parameter warnings.
  (void)p;
  (void)allocation_size;
#endif
}

PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat,
int min_repeat_ms) {
CHECK(pf != nullptr);
Expand All @@ -313,12 +335,21 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe
auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) mutable {
TVMRetValue temp;
std::ostringstream os;
const char* cache_flush = std::getenv("TVM_AUTO_CACHE_FLUSH");
// skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp);

DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);

for (int i = 0; i < repeat; ++i) {
if (cache_flush && std::atoi(cache_flush) != 0) {
CHECK_EQ(number, 1);
// we want to keep input data
for (int j = 1; j < args.size(); j++) {
CacheFlush((char*)(args[j].operator DLTensor*()->data),
GetDataSize(*(args[j].operator DLTensor*())));
}
}
std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin,
tend;
double duration_ms = 0.0;
Expand Down
9 changes: 7 additions & 2 deletions src/runtime/threading_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,13 @@ class ThreadGroup::Impl {
#if defined(_M_X64) || defined(__x86_64__)
big_count /= 2; // ignore hyper-threading
#endif
for (int i = 0; i < big_count; ++i) {
CPU_SET(sorted_order_[i], &cpuset);
const char* bind_master_core_0 = getenv("TVM_BIND_MASTER_CORE_0");
if (bind_master_core_0 && atoi(bind_master_core_0) != 0) {
CPU_SET(sorted_order_[0], &cpuset);
} else {
for (int i = 0; i < big_count; ++i) {
CPU_SET(sorted_order_[i], &cpuset);
}
}
}
#if defined(__ANDROID__)
Expand Down

0 comments on commit 145e61c

Please sign in to comment.