diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index e70a96401cad..30b4ccbc5618 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+include(ExternalProject)
+
 set(PICK_SIM  "sim")
 set(PICK_HW   "target")
 set(PICK_NONE "OFF")
@@ -77,6 +79,13 @@ if(USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}")
   include_directories("${HEXAGON_TOOLCHAIN}/include/iss")
   link_directories("${HEXAGON_TOOLCHAIN}/lib/iss")
   list(APPEND TVM_RUNTIME_LINKER_LIBS "-lwrapper")
+  # Build the Hexagon-side simulator driver (sim_dev) as an external project
+  # configured with the clang compilers from the Hexagon toolchain.
+  ExternalProject_Add(sim_dev
+    SOURCE_DIR "${CMAKE_SOURCE_DIR}/src/runtime/hexagon/sim/driver"
+    CMAKE_ARGS
+      "-DCMAKE_C_COMPILER=${HEXAGON_TOOLCHAIN}/bin/hexagon-clang"
+      "-DCMAKE_CXX_COMPILER=${HEXAGON_TOOLCHAIN}/bin/hexagon-clang++"
+    # An empty command disables the install step without depending on a
+    # "true" executable being present on the build host.
+    INSTALL_COMMAND ""
+  )
 elseif(USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}")
   find_hexagon_sdk_root()
   find_hexagon_toolchain()
diff --git a/src/runtime/hexagon/sim/driver/CMakeLists.txt b/src/runtime/hexagon/sim/driver/CMakeLists.txt
new file mode 100644
index 000000000000..8632b491f259
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/CMakeLists.txt
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cmake_minimum_required() has to come before project() so that the policy
+# defaults are in place when the project and its languages are set up.
+cmake_minimum_required(VERSION 3.0.2)
+project(SIM_DEV C CXX)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+# Pick up optional settings from a config.cmake placed in the build directory.
+if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/config.cmake")
+  include("${CMAKE_CURRENT_BINARY_DIR}/config.cmake")
+endif()
+
+# Compiler options needed by the driver (HVX enabled, 128-byte vectors).
+set(EXTRA_CXX_FLAGS
+  "-O2"
+  "-Wno-format"
+  "-mhvx -mhvx-length=128b"
+  "-mv60"
+  "-stdlib=libc++"
+)
+
+# Linker options needed by the driver.
+set(EXTRA_LINK_FLAGS
+  "-stdlib=libc++"
+  "-G0"
+  "-Wl,--force-dynamic"
+  "-Wl,--export-dynamic"
+  "-Wl,--whole-archive"   # This should link entire libc, libc++ and libc++abi.
+  "-Wl,--defsym=HEAP_SIZE=0x40000000"
+)
+
+# CMAKE_<LANG>_FLAGS are plain strings, so turn the lists into
+# space-separated strings before prepending them.
+string(REPLACE ";" " " EXTRA_CXX_FLAGS_STR "${EXTRA_CXX_FLAGS}")
+string(REPLACE ";" " " EXTRA_LINK_FLAGS_STR "${EXTRA_LINK_FLAGS}")
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_FLAGS "${EXTRA_CXX_FLAGS_STR} ${CMAKE_CXX_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${EXTRA_LINK_FLAGS_STR} ${CMAKE_EXE_LINKER_FLAGS}")
+
+# Set project properties.
+
+file(GLOB SOURCE_FILES "*.cc")
+add_executable(sim_dev ${SOURCE_FILES})
+target_include_directories(sim_dev
+  PUBLIC "."
+  PUBLIC ".."
+  PUBLIC "../../../../../include"
+  PUBLIC "../../../../../3rdparty/dlpack/include"
+)
+
+# Use the keyword signature; nothing links against an executable, so the
+# dependency is private.
+target_link_libraries(sim_dev PRIVATE "-ldl")
diff --git a/src/runtime/hexagon/sim/driver/README.md b/src/runtime/hexagon/sim/driver/README.md
new file mode 100644
index 000000000000..3aee1a14b796
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/README.md
@@ -0,0 +1,38 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Hexagon simulator driver
+
+The driver (`sim_dev` executable) is the process running on the Hexagon simulator that handles the Hexagon-side communication with the TVM runtime running on x86. The location of `sim_dev` should be added to `PATH` before running any Python code that uses Hexagon. The `sim_dev` executable is not intended to be run by users; it is automatically loaded by the simulator control code (in `hexagon_device_sim.cc`).
+
+### Prerequisites
+
+1. Hexagon C/C++ toolchain (such as the one in Hexagon SDK version 3.5.0 or later).
+
+Hexagon SDK is available at https://developer.qualcomm.com/software/hexagon-dsp-sdk.
+
+### Configuring
+
+Set
+```
+CMAKE_C_COMPILER=hexagon-clang
+CMAKE_CXX_COMPILER=hexagon-clang++
+```
+
+### Building
+
+There are no special options required for `make` (or the tool selected with `cmake`). The location of the resulting binary `sim_dev` should be added to `PATH`.
diff --git a/src/runtime/hexagon/sim/driver/fake_pthread.cc b/src/runtime/hexagon/sim/driver/fake_pthread.cc
new file mode 100644
index 000000000000..74090d0bf796
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/fake_pthread.cc
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <cassert>
+#include <cerrno>
+#include <csetjmp>
+#include <cstddef>
+#include <cstdlib>
+#include <map>
+#include <vector>
+
+#include "pthread.h"
+#include "sched.h"
+
+/*!
+ * Implementation of a subset of pthread API for single-threaded execution.
+ *
+ * The main idea is that the thread function ("start_routine" in the call
+ * to pthread_create) is executed immediately. When pthread_create returns,
+ * the thread function has already finished.
+ *
+ * Since the thread routine can itself call pthread_create, it is possible
+ * to have multiple threads existing at the same time, although only the
+ * last one is running.
+ *
+ * There are two main things that need to be taken care of:
+ * - thread-specific data, i.e. pthread_setspecific, pthread_getspecific,
+ *   and the handling of thread keys,
+ * - handling of thread return values.
+ *
+ * Threads are identified by thread ids (of type pthread_t). The main process
+ * thread has the id of 0, the remaining threads have ids starting at 1 and
+ * incrementing by 1. For each thread there is some data (thread_info_t)
+ * associated with it, and stored in "thread_data" map. When a thread
+ * terminates, the corresponding entry from "thread_data" cannot be removed
+ * until the return value is claimed (pthread_join), unless it is explicitly
+ * discarded (pthread_detach). When a new thread is created, it gets the
+ * first available id for which there is no entry in "thread_data". This
+ * could be an id that was never allocated, or an id that was used, but
+ * has since been removed from the map.
+ * A thread can terminate through pthread_exit. This means that when the
+ * thread function calls pthread_exit, the execution should return to the
+ * pthread_create call that ran it. This is implemented via setjmp/longjmp
+ * (neither longjmp nor pthread_exit unwind the stack).
+ *
+ * Any mutexes or condition variables cannot block, or else it would cause
+ * a deadlock. Since there is only one thread running at a time, locking
+ * a mutex or waiting for a condition always succeeds (returns immediately).
+ */
+
+// One thread-specific-data slot: the value stored under a key together with
+// the destructor that was registered for that key.
+struct key_entry_t {
+  void* value = nullptr;          // Stored value; null when nothing is set.
+  void (*dtor)(void*) = nullptr;  // Destructor for the value; may be null.
+
+  key_entry_t(void* initial_value, void (*destructor)(void*))
+      : value(initial_value), dtor(destructor) {}
+};
+
+// Bookkeeping kept for every emulated thread (one entry per thread id).
+struct thread_info_t {
+  thread_info_t() = default;
+  // Thread-specific data slots, indexed by key.
+  std::map<pthread_key_t, key_entry_t> keys;
+  // Saved execution context; pthread_exit hands it to longjmp.
+  std::jmp_buf env;
+  // Value returned by the thread function or passed to pthread_exit.
+  void* ret_value = nullptr;
+  // Set once the thread function returned or pthread_exit was called.
+  bool finished = false;
+  // Set by pthread_detach.
+  bool detached = false;
+};
+
+// Id of the main (process) thread.
+static pthread_t main_thread_id = 0;
+
+// Per-thread bookkeeping, keyed by thread id.
+static std::map<pthread_t, thread_info_t> thread_data = {
+    // Reserve the 0th entry.
+    {main_thread_id, {}}};
+
+// Stack of thread ids; the last element is the thread that is currently
+// executing (this is what pthread_self returns).
+static std::vector<pthread_t> running_threads = {main_thread_id};
+
+/*!
+ * \brief Find the smallest key that is >= 1 and not present in the map.
+ *
+ * Keys below 1 (such as the reserved key 0) are ignored: they neither count
+ * as candidates nor stop the search.
+ *
+ * \param m Map whose keys are examined.
+ * \return The first unused key, starting the search at 1.
+ */
+template <typename K, typename V>
+K first_available_key(const std::map<K, V>& m) {
+  K key = 1;
+  // Begin at the first entry that is not below the candidate. Starting at
+  // m.begin() would stop immediately on an entry with key 0 and report 1
+  // as free even when it is in use.
+  for (auto i = m.lower_bound(key), e = m.end(); i != e && i->first == key;
+       ++i) {
+    ++key;
+  }
+  return key;
+}
+
+// Condition-variable operations. Every function in this group ignores its
+// arguments and returns 0 immediately; in particular, the wait functions
+// never block.
+int pthread_cond_destroy(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_init(pthread_cond_t* __restrict cond,
+                      const pthread_condattr_t* __restrict attr) {
+  return 0;
+}
+
+int pthread_cond_signal(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_broadcast(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_timedwait(pthread_cond_t* __restrict cond,
+                           pthread_mutex_t* __restrict mutex,
+                           const struct timespec* __restrict abstime) {
+  return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t* __restrict cond,
+                      pthread_mutex_t* __restrict mutex) {
+  return 0;
+}
+
+// Mutex-attribute operations. No attribute state is stored: init, destroy
+// and settype do nothing and return 0.
+int pthread_mutexattr_init(pthread_mutexattr_t* attr) { return 0; }
+
+int pthread_mutexattr_destroy(pthread_mutexattr_t* attr) { return 0; }
+
+int pthread_mutexattr_settype(pthread_mutexattr_t* attr, int type) {
+  return 0;
+}
+
+// Always reports PTHREAD_MUTEX_NORMAL, whatever was passed to settype.
+int pthread_mutexattr_gettype(const pthread_mutexattr_t* __restrict attr,
+                              int* __restrict type) {
+  *type = PTHREAD_MUTEX_NORMAL;
+  return 0;
+}
+
+// Mutex operations. Every function in this group ignores its arguments and
+// returns 0, so lock and trylock always report success without blocking.
+int pthread_mutex_init(pthread_mutex_t* __restrict mutex,
+                       const pthread_mutexattr_t* __restrict attr) {
+  return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_lock(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_trylock(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_unlock(pthread_mutex_t* mutex) { return 0; }
+
+// Run init_routine the first time this is called for a given once_control.
+// The control word is switched to PTHREAD_ONCE_DONE only after init_routine
+// has returned. Always returns 0.
+int pthread_once(pthread_once_t* once_control, void (*init_routine)(void)) {
+  static_assert(PTHREAD_ONCE_INIT != PTHREAD_ONCE_DONE,
+                "PTHREAD_ONCE_INIT must be different from PTHREAD_ONCE_DONE");
+  const bool already_ran = (*once_control != PTHREAD_ONCE_INIT);
+  if (already_ran) {
+    return 0;
+  }
+  init_routine();
+  *once_control = PTHREAD_ONCE_DONE;
+  return 0;
+}
+
+// Returns 1 when both ids denote the same thread, 0 otherwise.
+int pthread_equal(pthread_t t1, pthread_t t2) {
+  const bool same = (t1 == t2);
+  return same ? 1 : 0;
+}
+
+// Create a "thread" by running start_routine(arg) to completion before
+// returning.
+//
+// thread        Receives the id assigned to the new thread.
+// attr          Ignored.
+// start_routine Thread function; its return value (or the value given to
+//               pthread_exit) is kept until pthread_join collects it.
+// arg           Argument passed to start_routine.
+//
+// Always returns 0.
+int pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+                   void* (*start_routine)(void*), void* arg) {
+  // Set up the id and the bookkeeping entry before calling setjmp, so no
+  // local variable is modified between setjmp and a later longjmp.
+  const pthread_t tid = first_available_key(thread_data);
+  *thread = tid;
+  thread_info_t& thr = thread_data[tid];
+  running_threads.push_back(tid);
+
+  // The jump buffer must be the one stored in the new thread's own entry:
+  // pthread_exit looks up the entry of the exiting thread and calls longjmp
+  // on that entry's env.
+  if (setjmp(thr.env) == 0) {
+    thr.ret_value = start_routine(arg);
+  }
+  // Reached both on normal return and via longjmp from pthread_exit.
+  thr.finished = true;
+  running_threads.pop_back();
+
+  // Destroy all keys.
+  bool repeat = true;
+  size_t iter = 0;
+  while (repeat && iter++ < PTHREAD_DESTRUCTOR_ITERATIONS) {
+    repeat = false;
+    // Assume that destructors can create new keys (i.e. modify the map).
+    for (size_t k = 0; k != PTHREAD_KEYS_MAX; ++k) {
+      auto f = thr.keys.find(k);
+      if (f == thr.keys.end()) {
+        continue;
+      }
+      key_entry_t& key = f->second;
+      if (key.dtor == nullptr || key.value == nullptr) {
+        continue;
+      }
+      // Clear the slot before invoking the destructor, so the same value is
+      // not destroyed again on the next pass; another pass is only needed
+      // if a destructor stored a new non-null value.
+      void* value = key.value;
+      key.value = nullptr;
+      key.dtor(value);
+      repeat = true;
+    }
+  }
+
+  // A detached thread has nobody to collect its result; drop the entry now.
+  if (thr.detached) {
+    thread_data.erase(tid);
+  }
+
+  return 0;
+}
+
+// Collect the result of a finished thread and release its bookkeeping entry.
+// Returns ESRCH for an unknown id and EDEADLK when the thread has not
+// finished yet; otherwise stores the result in *retval (when retval is not
+// null) and returns 0.
+int pthread_join(pthread_t thread, void** retval) {
+  const auto entry = thread_data.find(thread);
+  if (entry == thread_data.end()) {
+    return ESRCH;
+  }
+  if (!entry->second.finished) {
+    return EDEADLK;
+  }
+  if (retval != nullptr) {
+    *retval = entry->second.ret_value;
+  }
+  thread_data.erase(entry);
+  return 0;
+}
+
+// Mark a thread as detached, i.e. declare that nobody will join it.
+// Returns ESRCH for an unknown id, 0 otherwise.
+int pthread_detach(pthread_t thread) {
+  auto f = thread_data.find(thread);
+  if (f == thread_data.end()) {
+    return ESRCH;
+  }
+  thread_info_t& thr = f->second;
+  if (thr.finished) {
+    // The thread has already run to completion, so nothing will look at the
+    // "detached" flag again. Release the entry here; otherwise it (and the
+    // thread id) would never be reclaimed.
+    thread_data.erase(f);
+    return 0;
+  }
+  // Still running: the return value can be discarded when it finishes.
+  thr.detached = true;
+  return 0;
+}
+
+// Terminate the calling thread with the given result. For a non-main thread
+// the result is recorded and control is transferred with longjmp through the
+// thread's saved context; for the main thread the process exits.
+void pthread_exit(void* retval) {
+  const pthread_t current = pthread_self();
+  if (current != main_thread_id) {
+    thread_info_t& info = thread_data[current];
+    info.ret_value = retval;
+    info.finished = true;
+    longjmp(info.env, 1);
+  }
+  // Only the main thread gets here. The call also silences the
+  // "should not return" warning.
+  exit(0);
+}
+
+// Allocate a new key with an optional destructor and store it in *key.
+// Returns EINVAL when key is null and EAGAIN when no key below
+// PTHREAD_KEYS_MAX is free; otherwise returns 0.
+// NOTE(review): the key is registered in the calling thread's key map only.
+int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) {
+  if (key == nullptr) {
+    return EINVAL;
+  }
+  std::map<pthread_key_t, key_entry_t>& slots =
+      thread_data[pthread_self()].keys;
+  const pthread_key_t candidate = first_available_key(slots);
+  if (candidate >= PTHREAD_KEYS_MAX) {
+    return EAGAIN;
+  }
+  slots.emplace(candidate, key_entry_t{nullptr, destructor});
+  *key = candidate;
+  return 0;
+}
+
+// Remove a key from the calling thread's key map. Returns EINVAL when the
+// key is not present, 0 otherwise.
+int pthread_key_delete(pthread_key_t key) {
+  std::map<pthread_key_t, key_entry_t>& slots =
+      thread_data[pthread_self()].keys;
+  // pthread_key_delete does not call key destructors.
+  const size_t removed = slots.erase(key);
+  return removed == 0 ? EINVAL : 0;
+}
+
+// Store a value under an existing key of the calling thread. Returns EINVAL
+// when the key is not present, 0 otherwise.
+int pthread_setspecific(pthread_key_t key, const void* value) {
+  std::map<pthread_key_t, key_entry_t>& slots =
+      thread_data[pthread_self()].keys;
+  const auto slot = slots.find(key);
+  if (slot != slots.end()) {
+    slot->second.value = const_cast<void*>(value);
+    return 0;
+  }
+  return EINVAL;
+}
+
+// Return the value stored under a key of the calling thread, or null when
+// the key is not present.
+void* pthread_getspecific(pthread_key_t key) {
+  std::map<pthread_key_t, key_entry_t>& slots =
+      thread_data[pthread_self()].keys;
+  const auto slot = slots.find(key);
+  return slot == slots.end() ? nullptr : slot->second.value;
+}
+
+// The currently executing thread is the last one on the running_threads
+// stack.
+pthread_t pthread_self(void) { return running_threads.back(); }
+
+// Does nothing and returns 0.
+int sched_yield(void) { return 0; }
+
+// Declare nanosleep with C linkage so that the definition below is emitted
+// under its unmangled name. The guard has to test "__cplusplus" (without a
+// trailing underscore); a misspelled macro is never defined and would
+// silently drop the declaration.
+#ifdef __cplusplus
+extern "C" int nanosleep(const struct timespec* req, struct timespec* rem);
+#endif
+
+// Does not sleep: ignores both arguments and returns 0 immediately.
+int nanosleep(const struct timespec* req, struct timespec* rem) { return 0; }
diff --git a/src/runtime/hexagon/sim/driver/pthread.h b/src/runtime/hexagon/sim/driver/pthread.h
new file mode 100644
index 000000000000..1748d614cbbf
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/pthread.h
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_
+#define TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_
+
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include <time.h>
+#undef _PROVIDE_POSIX_TIME_DECLS
+
+// All pthread object types are plain ints in this implementation.
+typedef int pthread_t;
+typedef int pthread_attr_t;
+typedef int pthread_cond_t;
+typedef int pthread_condattr_t;
+typedef int pthread_key_t;
+typedef int pthread_mutex_t;
+typedef int pthread_mutexattr_t;
+typedef int pthread_once_t;
+
+// Initializer and mutex-type constants. Enumerators without an explicit
+// value continue counting from the previous one, so PTHREAD_ONCE_INIT
+// restarts the numbering at 0 and PTHREAD_ONCE_DONE is 1.
+enum {
+  PTHREAD_COND_INITIALIZER,
+  PTHREAD_MUTEX_DEFAULT,
+  PTHREAD_MUTEX_ERRORCHECK,
+  PTHREAD_MUTEX_INITIALIZER,
+  PTHREAD_MUTEX_NORMAL,
+  PTHREAD_MUTEX_RECURSIVE,
+  PTHREAD_ONCE_INIT = 0,  // Must be same as in QuRT
+  PTHREAD_ONCE_DONE,      // Non-standard
+};
+
+// Exclusive upper bound for thread-specific-data key values.
+const size_t PTHREAD_KEYS_MAX = 128;
+// Maximum number of passes made over a thread's keys when running key
+// destructors at thread termination.
+const size_t PTHREAD_DESTRUCTOR_ITERATIONS = 4;
+
+// All functions below have C linkage.
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Condition variables.
+int pthread_cond_destroy(pthread_cond_t* cond);
+int pthread_cond_init(pthread_cond_t* __restrict cond,
+                      const pthread_condattr_t* __restrict attr);
+int pthread_cond_signal(pthread_cond_t* cond);
+int pthread_cond_broadcast(pthread_cond_t* cond);
+int pthread_cond_timedwait(pthread_cond_t* __restrict cond,
+                           pthread_mutex_t* __restrict mutex,
+                           const struct timespec* __restrict abstime);
+int pthread_cond_wait(pthread_cond_t* __restrict cond,
+                      pthread_mutex_t* __restrict mutex);
+
+// Mutex attributes.
+int pthread_mutexattr_init(pthread_mutexattr_t* attr);
+int pthread_mutexattr_destroy(pthread_mutexattr_t* attr);
+int pthread_mutexattr_gettype(const pthread_mutexattr_t* __restrict attr,
+                              int* __restrict type);
+int pthread_mutexattr_settype(pthread_mutexattr_t* attr, int type);
+
+// Mutexes.
+int pthread_mutex_init(pthread_mutex_t* __restrict mutex,
+                       const pthread_mutexattr_t* __restrict attr);
+int pthread_mutex_destroy(pthread_mutex_t* mutex);
+int pthread_mutex_lock(pthread_mutex_t* mutex);
+int pthread_mutex_trylock(pthread_mutex_t* mutex);
+int pthread_mutex_unlock(pthread_mutex_t* mutex);
+
+// One-time initialization and thread id comparison.
+int pthread_once(pthread_once_t* once_control, void (*init_routine)(void));
+int pthread_equal(pthread_t t1, pthread_t t2);
+
+// Thread lifetime.
+int pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+                   void* (*start_routine)(void*), void* arg);
+int pthread_join(pthread_t thread, void** retval);
+int pthread_detach(pthread_t thread);
+void pthread_exit(void* retval) __attribute__((__noreturn__));
+
+// Thread-specific data.
+int pthread_key_create(pthread_key_t* key, void (*destructor)(void*));
+int pthread_key_delete(pthread_key_t key);
+int pthread_setspecific(pthread_key_t key, const void* value);
+void* pthread_getspecific(pthread_key_t key);
+
+// Id of the calling thread.
+pthread_t pthread_self(void);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_
diff --git a/src/runtime/hexagon/sim/driver/sched.h b/src/runtime/hexagon/sim/driver/sched.h
new file mode 100644
index 000000000000..cc63630f2072
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/sched.h
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_
+#define TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_
+
+// sched_yield is declared with C linkage.
+#ifdef __cplusplus
+extern "C" {
+#endif
+int sched_yield(void);
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_
diff --git a/src/runtime/hexagon/sim/driver/sim_device.cc b/src/runtime/hexagon/sim/driver/sim_device.cc
new file mode 100644
index 000000000000..23dc05307038
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/sim_device.cc
@@ -0,0 +1,573 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+  Required options:
+    -ldl -G0                  For dlinit/dlopen/dlclose.
+    -Wl,--force-dynamic       Make this a dynamic executable (with dynamic
+                              symbol table).
+    -Wl,-E                    Export all defined symbols as dynamic.
+    -Wl,--whole-archive       Link the entire contents of libc.
+    -mhvx -mhvx-length=128b   Enable HVX.
+    -Wno-format               Silence format warning (unsigned vs uint32_t).
+*/
+
+#include <assert.h>
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <vector>
+
+#include "hexagon_sim_proto.h"
+#include "pthread.h"
+#include "tvm/runtime/c_runtime_api.h"
+
+// Format the current local time as "[hh:mm:ss]".
+static std::string timeNow() {
+  const time_t now = time(NULL);
+  const tm* local = localtime(&now);  // NOLINT(runtime/threadsafe_fn)
+
+  char stamp[11];  // "[hh:mm:ss]" plus the terminating NUL.
+  snprintf(stamp, sizeof(stamp), "[%02d:%02d:%02d]", local->tm_hour,
+           local->tm_min, local->tm_sec);
+  return std::string(stamp);
+}
+
+// printf-style logging to stderr. Each message is prefixed with the current
+// time, the source file and the line number, and ends with a newline.
+// FMT must be a string literal, since it is concatenated with the prefix.
+#define LOG(FMT, ...)                                                 \
+  fprintf(stderr, "%s %s:%d: " FMT "\n", timeNow().c_str(), __FILE__, \
+          __LINE__, ##__VA_ARGS__)
+
+// Vector of ints occupying 128 bytes, aligned to 128 bytes.
+using HVX_Vector =
+    int __attribute__((__vector_size__(128))) __attribute__((aligned(128)));
+
+// Determine the vector length in bytes (128 or 64) by splatting the byte
+// value 1 across a vector and checking how far the pattern reaches.
+static unsigned getVectorLength() {
+  HVX_Vector splat = __builtin_HEXAGON_V6_lvsplatw_128B(0x01010101);
+  const unsigned char* bytes = reinterpret_cast<unsigned char*>(&splat);
+  if (bytes[127] != 1) {
+    assert(bytes[63] == 1);
+    return 64;
+  }
+  return 128;
+}
+
+extern "C" {
+// Print vector functions. They can be used to help debug tensorized
+// code, via
+// ib.emit(tvm.call_extern('int32', 'V6_pv8', 'vector:', v))
+// ib.emit(tvm.call_extern('int32', 'V6_pv16', 'info:', v))
+// ib.emit(tvm.call_extern('int32', 'V6_pv32', 'value:', v))
+
+// The first argument is a string printed before the vector contents.
+// The numeric suffix is the element width in bits used for printing.
+// All three functions return 0.
+int V6_pv8(const char* s, HVX_Vector v);
+int V6_pv16(const char* s, HVX_Vector v);
+int V6_pv32(const char* s, HVX_Vector v);
+}
+
+// Print the label s followed by the vector contents as 8-bit hex values
+// to stderr. Returns 0.
+int V6_pv8(const char* s, HVX_Vector v) {
+  const uint8_t* first = reinterpret_cast<uint8_t*>(&v);
+  const uint8_t* last = first + getVectorLength();
+  fprintf(stderr, "%s:", s);
+  for (const uint8_t* elem = first; elem != last; ++elem) {
+    fprintf(stderr, " %02x", *elem);
+  }
+  fprintf(stderr, "\n");
+  return 0;
+}
+
+// Print the label s followed by the vector contents as 16-bit hex values
+// to stderr. Returns 0.
+int V6_pv16(const char* s, HVX_Vector v) {
+  const uint16_t* first = reinterpret_cast<uint16_t*>(&v);
+  const uint16_t* last = first + getVectorLength() / sizeof(uint16_t);
+  fprintf(stderr, "%s:", s);
+  for (const uint16_t* elem = first; elem != last; ++elem) {
+    fprintf(stderr, " %04x", *elem);
+  }
+  fprintf(stderr, "\n");
+  return 0;
+}
+
+// Print the label s followed by the vector contents as 32-bit hex values
+// to stderr. Returns 0.
+int V6_pv32(const char* s, HVX_Vector v) {
+  const uint32_t* first = reinterpret_cast<uint32_t*>(&v);
+  const uint32_t* last = first + getVectorLength() / sizeof(uint32_t);
+  fprintf(stderr, "%s:", s);
+  for (const uint32_t* elem = first; elem != last; ++elem) {
+    fprintf(stderr, " %08x", *elem);
+  }
+  fprintf(stderr, "\n");
+  return 0;
+}
+
+// C-linkage declarations for the two functions defined right below.
+extern "C" {
+// Function referenced from libc++.a, but not defined in libc.a.
+int clock_gettime(clockid_t clock_id, struct timespec* tp);
+// pthread_create is wrapped so that we can set a bigger stack size
+// for QuRT. Here this isn't needed, but we still need to implement
+// the wrapper.
+int __wrap_pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+                          void* (*start_routine)(void*), void* arg);
+}
+
+// Stub implementation: there is no real clock behind it. Since the call
+// reports success, the output is filled in with a fixed time of zero so
+// that callers never read an uninitialized timespec.
+int clock_gettime(clockid_t clock_id, struct timespec* tp) {
+  if (tp != nullptr) {
+    tp->tv_sec = 0;
+    tp->tv_nsec = 0;
+  }
+  return 0;
+}
+
+// Wrapper around pthread_create: logs its own name and forwards all
+// arguments unchanged, returning whatever pthread_create returns.
+int __wrap_pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+                          void* (*start_routine)(void*), void* arg) {
+  LOG("%s", __func__);
+  return pthread_create(thread, attr, start_routine, arg);
+}
+
+// FIXME(kparzysz-quic): query the cfg register to compute the VTCM base.
+// The hard-coded addresses below work for now.
+const unsigned int TCM_BASE = 0xD8000000;
+const unsigned int VTCM_BASE = TCM_BASE + 0x400000;
+
+// Keeps track of the memory blocks handed out by alloc and vtcm_alloc. The
+// blocks are kept in a vector sorted by address.
+class Allocator {
+ public:
+  void* alloc(unsigned size, size_t align);
+  void* vtcm_alloc(unsigned size, size_t align);
+  void free(void* p);
+
+ private:
+  // A single allocation: start address, size in bytes, and whether it was
+  // obtained from vtcm_alloc.
+  struct Block {
+    Block(void* p, size_t s, bool v = false) : ptr_(p), size_(s), vtcm_(v) {}
+    // Blocks are ordered by start address.
+    bool operator<(const Block& other) const {
+      return uintptr_t(ptr_) < uintptr_t(other.ptr_);
+    }
+    void* ptr_;
+    size_t size_;
+    bool vtcm_;
+  };
+
+  using vector_type = std::vector<Block>;
+  using iterator = vector_type::iterator;
+  vector_type allocations_;
+
+  // Address at which the next VTCM allocation starts (before alignment).
+  uintptr_t cur_vtcm = VTCM_BASE;
+};
+
+// Allocate "size" bytes of heap memory aligned to "align" and record the
+// block. Returns the new pointer, or null when the allocation failed (the
+// error is reported with perror).
+void* Allocator::alloc(unsigned size, size_t align) {
+  void* ptr = aligned_alloc(align, size);
+  if (ptr == nullptr) {
+    perror("device: error allocating memory:");
+    return ptr;
+  }
+
+  // Insert the block at its sorted position and check that it does not
+  // overlap its neighbors. A block may start exactly where the previous one
+  // ends, hence "<=" on both sides (the same checks as in vtcm_alloc).
+  Block b(ptr, size);
+  iterator i = std::lower_bound(allocations_.begin(), allocations_.end(), b);
+  iterator w = allocations_.insert(i, b);
+  if (w != allocations_.begin()) {
+    iterator pw = w - 1;
+    assert(uintptr_t(pw->ptr_) + pw->size_ <= uintptr_t(w->ptr_));
+  }
+  if (w + 1 != allocations_.end()) {
+    iterator nw = w + 1;
+    assert(uintptr_t(w->ptr_) + w->size_ <= uintptr_t(nw->ptr_));
+  }
+
+  // Both values are unsigned; print them as such.
+  LOG("device: allocated %u bytes aligned at %u: %p", size,
+      static_cast<unsigned>(align), ptr);
+  return ptr;
+}
+
+// Allocate "size" bytes of VTCM aligned to "align" and record the block.
+// For now, allocation is purely sequential. This needs to be improved to
+// use a free list.
+// Returns the new pointer, or null when the request cannot be satisfied; in
+// that case the allocator state is left unchanged.
+void* Allocator::vtcm_alloc(unsigned size, size_t align) {
+  // Round the current position up to the requested alignment.
+  uintptr_t a = cur_vtcm;
+  a = (a + (align - 1)) & -align;
+  void* ptr = reinterpret_cast<void*>(a);
+  // Reject the request before touching cur_vtcm: a result below the current
+  // position (this includes 0, e.g. for align == 0) or an end address that
+  // wraps around means the block cannot be placed.
+  if (ptr == nullptr || a < cur_vtcm || a + size < a) {
+    perror("device: error allocating vtcm memory:");
+    return nullptr;
+  }
+  cur_vtcm = a + size;
+
+  // Insert the block at its sorted position and check that it does not
+  // overlap its neighbors.
+  Block b(ptr, size, true);
+  iterator i = std::lower_bound(allocations_.begin(), allocations_.end(), b);
+  iterator w = allocations_.insert(i, b);
+  if (w != allocations_.begin()) {
+    iterator pw = w - 1;
+    assert(uintptr_t(pw->ptr_) + pw->size_ <= uintptr_t(w->ptr_));
+  }
+  if (w + 1 != allocations_.end()) {
+    iterator nw = w + 1;
+    assert(uintptr_t(w->ptr_) + w->size_ <= uintptr_t(nw->ptr_));
+  }
+
+  // Both values are unsigned; print them as such.
+  LOG("device: allocated vtcm %u bytes aligned at %u: %p", size,
+      static_cast<unsigned>(align), ptr);
+  return ptr;
+}
+
+// Release a block previously returned by alloc or vtcm_alloc. The pointer
+// must be the start of a recorded block (checked with assert). Heap blocks
+// are returned to the C library; VTCM blocks are only removed from the
+// record.
+void Allocator::free(void* ptr) {
+  LOG("device: freeing %p", ptr);
+  const Block probe(ptr, 0);
+  iterator found =
+      std::lower_bound(allocations_.begin(), allocations_.end(), probe);
+  assert(found != allocations_.end());
+  assert(found->ptr_ == ptr);
+  const bool from_heap = !found->vtcm_;
+  if (from_heap) {
+    ::free(found->ptr_);
+  }
+  allocations_.erase(found);
+}
+
+// Log a one-line description of a call request: the function address, then
+// the scalar arguments and the stack arguments, all values in hex.
+static void printMsgCall(const MsgCall& mc) {
+  auto to_dec_string = [](int v) {
+    // Sign, up to 10 digits, and the terminating NUL.
+    char tmp[12];
+    snprintf(tmp, sizeof(tmp), "%d", v);
+    return std::string(tmp);
+  };
+  auto to_hex_string = [](uint32_t v) {
+    // Up to 8 hex digits and the terminating NUL.
+    char tmp[9];
+    // "%lx" expects an unsigned long; convert explicitly so the argument
+    // matches the format regardless of how uint32_t is defined.
+    snprintf(tmp, sizeof(tmp), "%lx", static_cast<unsigned long>(v));
+    return std::string(tmp);
+  };
+  std::string str = "device: launching " + to_hex_string(mc.func_va) +
+                    " sc:" + to_dec_string(mc.scalar_num) + " {";
+  for (unsigned i = 0; i != mc.scalar_num; ++i) {
+    str += ' ' + to_hex_string(mc.data[i]);
+    if (i + 1 != mc.scalar_num) str += ',';
+  }
+  str += " }, st:" + to_dec_string(mc.stack_num) + " {";
+  // The stack values follow the scalar values in mc.data.
+  for (unsigned i = 0; i != mc.stack_num; ++i) {
+    str += ' ' + to_hex_string(mc.data[i + mc.scalar_num]);
+    if (i + 1 != mc.stack_num) str += ',';
+  }
+  str += " }";
+  LOG("%s", str.c_str());
+}
+
+// Collection of MsgCall pointers.
+// NOTE(review): its users are outside the visible part of this file.
+static std::vector<MsgCall*> task_queue;
+
+// State handed to dispatch(): the memory allocator and a dynamic-library
+// handle (initially null).
+struct Environment {
+  Allocator alloc;
+  void* dl_handle = nullptr;
+};
+
+extern "C" {
+// Message exchanged through dispatch(); read there and written via setMsg.
+// NOTE(review): presumably also accessed from outside this program, hence
+// volatile and C linkage -- confirm against the simulator control code.
+volatile Message message_buffer;
+int dispatch(Environment* env) __attribute__((noinline));
+}
+
+// Buffer whose address and size are advertised in replies built by dispatch().
+static volatile unsigned char payload_buffer[4096];
+
+// Fill in all three fields of message_buffer. The fields are volatile, so
+// the stores happen in exactly this order: code, len, va.
+static void setMsg(uint32_t code, uint32_t len, uint32_t va) {
+  message_buffer.code = code;
+  message_buffer.len = len;
+  message_buffer.va = va;
+}
+
+// Convert a 32-bit address value into a pointer.
+inline void* pointer(uint32_t v) {
+  const uintptr_t address = static_cast<uintptr_t>(v);
+  return reinterpret_cast<void*>(address);
+}
+
+// Convert a pointer into a 32-bit address value (the address is narrowed to
+// 32 bits).
+inline uint32_t va(const volatile void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>(address);
+}
+
+// Call the function described by *mc and measure how many cycles it took.
+//
+// The assembly reads the following words from *mc (offsets as used below):
+//   +0  address of the function to call,
+//   +4  scalar_num,
+//   +8  stack_num,
+//   +12 data[]: scalar values first, then stack_num stack values.
+// It copies the stack values onto the stack, loads the first six words of
+// data[] into r0-r5, calls the function, and stores the difference of the
+// "upcycle" counter read before and after the call to *pcc.
+// The function is "naked": the compiler generates no prologue or epilogue,
+// so the assembly saves and restores r16-r21 and the stack pointer itself.
+// NOTE(review): the uint32_t result appears to be whatever the called
+// function leaves in r0 -- confirm.
+__attribute__((naked)) uint32_t launcher(volatile MsgCall* mc, uint64_t* pcc) {
+  __asm__(
+      "// This function is intentionally written to be readable,      \n"
+      "// rather than fast.                                           \n"
+      "// r0 = value of 'volatile MsgCall *mc'                        \n"
+      "// r1 = address where to store the program cycle count         \n"
+      "{ memd(r29+#-16) = r21:20                                      \n"
+      "  allocframe(#24)          }                                   \n"
+      "{ memd(r29+#0) = r17:16                                        \n"
+      "  memd(r29+#8) = r19:18    }                                   \n"
+      "{ r17:16 = combine(r1,r0)                                      \n"
+      "  r18 = r29                                                    \n"
+      "  r1 = memw(r0+#4)            // scalar_num                    \n"
+      "  r2 = memw(r0+#8)         }  // stack_num                     \n"
+      "// If there are no stack values, skip the stack setup.         \n"
+      "{ p0 = cmp.eq(r2,#0)                                           \n"
+      "  if (p0.new) jump:t .Llauncher1 }                             \n"
+
+      "// Allocate space on the stack. Let r2 = needed space          \n"
+      "// rounded up to a multiple of 8.                              \n"
+      "{ loop0(.Llauncher0,r2)                                        \n"
+      "  r2 = asl(r2,#2)          }                                   \n"
+      "{ r2 = add(r2,#4)          }                                   \n"
+      "{ r2 = clrbit(r2,#2)       }                                   \n"
+      "{ r29 = sub(r29,r2)        }                                   \n"
+
+      "// Copy stack contents onto the stack. Stack contents start    \n"
+      "// at r3 = r0 + offsetof(data) + scalar_num*4                  \n"
+      "{ r3 = addasl(r0,r1,#2)                                        \n"
+      "  r4 = r29                 }                                   \n"
+      "{ r3 = add(r3,#12)         } // offsetof(data)                 \n"
+      ".Llauncher0:                                                   \n"
+      "{ r5 = memw(r3++#4)                                            \n"
+      "  memw(r4++#4) = r5.new    } :endloop0                         \n"
+
+      "// Load registers. Some of the loaded data may actually be     \n"
+      "// values from the stack part of 'data', but it's not an issue.\n"
+      ".Llauncher1:                                                   \n"
+      "{ r0 = memw(r16+#12)         // mc + offsetof(data)            \n"
+      "  r1 = memw(r16+#16)       }                                   \n"
+      "{ r2 = memw(r16+#20)                                           \n"
+      "  r3 = memw(r16+#24)       }                                   \n"
+      "{ r4 = memw(r16+#28)                                           \n"
+      "  r5 = memw(r16+#32)       }                                   \n"
+
+      "// Call.                                                       \n"
+      "{ r6 = memw(r16+#0)                                            \n"
+      "  r21:20 = upcycle         }                                   \n"
+      "{ callr r6                 }                                   \n"
+
+      "// Restore stack pointer (free up r18), calculate cycle count. \n"
+      "{ r29 = r18                                                    \n"
+      "  r19:18 = upcycle         }                                   \n"
+      "{ r19:18 = sub(r19:18, r21:20) }                               \n"
+
+      "// Store pcount, restore non-volatile registers, and return.   \n"
+      "{ memd(r17+#0) = r19:18                                        \n"
+      "  r21:20 = memd(r29+#16)   }                                   \n"
+      "{ r19:18 = memd(r29+#8)                                        \n"
+      "  r17:16 = memd(r29+#0)    }                                   \n"
+      "{ dealloc_return           } // implicit-use r1:0              \n");
+}
+
+// Handle the request currently in message_buffer and post the reply with
+// setMsg(). Returns 0 for every recognized code; unknown codes abort().
+int dispatch(Environment* env) {
+  uint32_t code = message_buffer.code;
+  // Special handling of MsgReq.
+  if (code == kMsgReq) {
+    assert(message_buffer.len <= sizeof(payload_buffer));
+    setMsg(kMsgAck, sizeof(payload_buffer), va(payload_buffer));
+    return 0;
+  }
+  switch (code) {
+    case kAlloc: {
+      LOG("device: {kAlloc, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgAlloc));
+      auto* ma = reinterpret_cast<volatile MsgAlloc*>(message_buffer.va);
+      void* p = env->alloc.alloc(ma->size, ma->align);
+      reinterpret_cast<volatile MsgPointer*>(payload_buffer)->va = va(p);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kFree: {
+      LOG("device: {kFree, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgPointer));
+      auto* mp = reinterpret_cast<volatile MsgPointer*>(message_buffer.va);
+      env->alloc.free(pointer(mp->va));
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kAllocVtcm: {
+      LOG("device: {kAllocVtcm, %lu, %lx}", message_buffer.len,
+          message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgAlloc));
+      auto* ma = reinterpret_cast<volatile MsgAlloc*>(message_buffer.va);
+      void* p = env->alloc.vtcm_alloc(ma->size, ma->align);
+      reinterpret_cast<volatile MsgPointer*>(payload_buffer)->va = va(p);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kCopy: {
+      LOG("device: {kCopy, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgCopy));
+      auto* mc = reinterpret_cast<volatile MsgCopy*>(message_buffer.va);
+      memcpy(pointer(mc->dst), pointer(mc->src), mc->len);
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kLoad: {
+      // At most one library is kept open: close the previous one first.
+      if (env->dl_handle != nullptr) dlclose(env->dl_handle);
+      const char* name = static_cast<const char*>(pointer(message_buffer.va));
+      env->dl_handle = dlopen(name, RTLD_LAZY);
+      if (env->dl_handle == nullptr) LOG("dlopen: %s\n", dlerror());
+      assert(env->dl_handle != nullptr);
+      reinterpret_cast<volatile MsgPointer*>(payload_buffer)->va =
+          va(env->dl_handle);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kUnload: {
+      assert(env->dl_handle != nullptr);
+      assert(message_buffer.len == sizeof(MsgPointer));
+      auto* mp = reinterpret_cast<volatile MsgPointer*>(message_buffer.va);
+      assert(pointer(mp->va) == env->dl_handle);
+      dlclose(env->dl_handle);
+      env->dl_handle = nullptr;
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kResolve: {
+      LOG("device: {kResolve, %lu, %lx}", message_buffer.len,
+          message_buffer.va);
+      assert(env->dl_handle != nullptr);
+      dlerror();  // Result discarded; resets any pending dl error.
+      const char* name = static_cast<const char*>(pointer(message_buffer.va));
+      void* s = dlsym(env->dl_handle, name);
+      reinterpret_cast<volatile MsgPointer*>(payload_buffer)->va = va(s);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kCall: {
+      LOG("device: {kCall, %lu, %lx}", message_buffer.len, message_buffer.va);
+      // Add the task to the queue.
+      auto* mc = reinterpret_cast<MsgCall*>(message_buffer.va);
+      uint32_t size = 4 * (3 + mc->scalar_num + mc->stack_num);
+      MsgCall* t = static_cast<MsgCall*>(malloc(size));
+      assert(t != nullptr);  // memcpy below must not receive a null dst.
+      memcpy(t, mc, size);
+      task_queue.push_back(t);
+      // Return 0.
+      *reinterpret_cast<volatile uint32_t*>(payload_buffer) = 0;
+      setMsg(kNone, sizeof(uint32_t), va(payload_buffer));
+      break;
+    }
+    case kFlush: {
+      LOG("device: {kFlush}");
+      LOG("device: %zu tasks in the queue", task_queue.size());
+      // Run queued tasks in order until one returns non-zero; after that
+      // the remaining tasks are skipped. Every task buffer is freed and
+      // the queue is cleared regardless of the outcome.
+      uint32_t rv = 0;
+      uint64_t pcc = 0;  // Pcycle counter, will be 0 under simulator (upcycle).
+      for (MsgCall* t : task_queue) {
+        if (rv == 0) {
+          printMsgCall(*t);
+          rv = launcher(t, &pcc);
+          LOG("device: execution took %llu pcycles", pcc);
+        }
+        free(t);
+      }
+      task_queue.clear();
+      *reinterpret_cast<volatile uint32_t*>(payload_buffer) = rv;
+      setMsg(kNone, sizeof(uint32_t), va(payload_buffer));
+      break;
+    }
+    default:
+      LOG("device: unknown code: %lu", message_buffer.code);
+      abort();  // Does not return, so no break is needed.
+  }
+  return 0;
+}
+
+extern "C" {
+int acquire_vector_unit(int);  // Called with 0 by main() before dispatching.
+void release_vector_unit();    // Called by main() after the dispatch loop.
+}
+
+// Split the colon-separated `arg` into `list`, skipping empty entries.
+// A backslash makes the next character literal (so "\:" yields ':').
+static void makePathList(const std::string& arg,
+                         std::vector<std::string>* list) {
+  const size_t end = arg.size();
+  size_t pos = 0;
+  while (pos < end) {
+    std::string entry;
+    bool escaped = false;
+    size_t idx = pos;
+    while (idx != end) {
+      const char ch = arg[idx];
+      if (!escaped && ch == ':') break;
+      if (!escaped && ch == '\\') {
+        escaped = true;
+      } else {
+        escaped = false;
+        entry.push_back(ch);
+      }
+      ++idx;
+    }
+    if (!entry.empty()) list->emplace_back(entry);
+    // Resume just past the separator (or past the end, ending the loop).
+    pos = idx + 1;
+  }
+}
+
+// Return "<dir>/<filename>" for the first directory in the colon-separated
+// list `paths` where access(X_OK) succeeds, or the bare `filename` otherwise.
+static std::string findInPaths(const std::string& filename,
+                               const std::string& paths) {
+  std::vector<std::string> path_list;
+  makePathList(paths, &path_list);
+  for (const auto& p : path_list) {
+    std::string pf = p + '/' + filename;
+    if (access(pf.c_str(), X_OK) == 0) return pf;
+  }
+  // No match: fall back to the bare name and let the loader report errors.
+  return filename;
+}
+
+// Marker: the TVM runtime looks this symbol up via dlsym to detect sim_dev.
+extern "C" int running_in_sim_dev_17bc90206f6cf5a7();
+int running_in_sim_dev_17bc90206f6cf5a7() { return 0; }
+
+// Entry point: find libtvm_runtime.so via the -L paths and load it, alias
+// device_api.hexagon to device_api.cpu, then serve requests via dispatch().
+int main(int argc, char* argv[]) {
+  int opt;
+  std::string ld_path;
+  while ((opt = getopt(argc, argv, "L:")) != -1) {
+    switch (opt) {
+      case 'L':
+        ld_path += ':' + std::string(optarg);
+        break;
+      case '?':
+        LOG("Usage %s: [-L path1[:path2...]]", argv[0]);
+        return 1;
+    }
+  }
+
+  std::string rt_path = findInPaths("libtvm_runtime.so", ld_path);
+  LOG("TVM runtime path: %s", rt_path.c_str());
+  Environment env;
+  acquire_vector_unit(0);
+
+  const char* builtin[] = {
+      "libgcc.so",    "libc.so",     "libc++.so",
+      "libc++abi.so", "libc++.so.1", "libc++abi.so.1"  // Alternative names.
+  };
+  dlinit(sizeof(builtin) / sizeof(builtin[0]), const_cast<char**>(builtin));
+  void* rt_handle = dlopen(rt_path.c_str(), RTLD_GLOBAL);
+  if (rt_handle == nullptr) {
+    LOG("error loading TVM runtime: %s", dlerror());
+    return 1;
+  }
+
+  // When running TVM runtime on Hexagon there is no longer a device
+  // for Hexagon, but standalone ops can still refer to it. All of
+  // required DeviceAPI's functionality is adequately implemented
+  // via the CPU device, so remap device_api.hexagon to device_api.cpu.
+  auto* get_global = reinterpret_cast<decltype(&TVMFuncGetGlobal)>(
+      dlsym(rt_handle, "TVMFuncGetGlobal"));
+  auto* register_global = reinterpret_cast<decltype(&TVMFuncRegisterGlobal)>(
+      dlsym(rt_handle, "TVMFuncRegisterGlobal"));
+  if (get_global == nullptr || register_global == nullptr) {
+    LOG("error resolving TVMFuncGetGlobal/TVMFuncRegisterGlobal");
+    return 1;
+  }
+  TVMFunctionHandle cpu_api = nullptr;
+  if (get_global("device_api.cpu", &cpu_api) != 0 || cpu_api == nullptr ||
+      register_global("device_api.hexagon", cpu_api, true) != 0) {
+    LOG("error setting device_api.hexagon");
+    return 1;
+  }
+
+  while (!dispatch(&env)) continue;
+  dlclose(rt_handle);
+  release_vector_unit();
+  return 0;
+}
diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index 9d14d3a14d03..0a2a60c2d059 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -255,6 +255,17 @@ int MaxConcurrency() {
     max_concurrency = std::thread::hardware_concurrency();
 #if defined(_M_X64) || defined(__x86_64__)
     max_concurrency /= 2;  // ignore hyper-threading
+#elif defined(__hexagon__)
+    // With unsigned PDs, getting the number of available hardware threads
+    // is not supported in earlier versions of QuRT. In such cases assume 4.
+    // If running on simulator, set max_concurrency to 1.
+    if (max_concurrency == 0) {
+      if (dlsym(RTLD_DEFAULT, "running_in_sim_dev_17bc90206f6cf5a7")) {
+        max_concurrency = 1;
+      } else {
+        max_concurrency = 4;
+      }
+    }
 #endif
   }
   return std::max(max_concurrency, 1);