diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py
index 278822e2ca04..e2bfca392c1e 100644
--- a/python/tvm/auto_scheduler/search_policy.py
+++ b/python/tvm/auto_scheduler/search_policy.py
@@ -113,6 +113,8 @@ class SketchPolicy(SearchPolicy):
         "retry_search_one_round_on_empty": 10,
         'evolutionary_search_population': 2048,
+        'evolutionary_search_num_iters': 10,
+        'evolutionary_search_mutation_prob': 0.85,
         "evolutionary_search_use_measured_ratio": 0.2,
         'cpu_multi_level_tiling_structure': 'SSRSRS',
@@ -178,3 +180,21 @@ def sample_initial_population(self, pop_size):
         """
         states = _ffi_api.SketchPolicySampleInitialPopulation(self, pop_size)
         return states
+
+    def evolutionary_search(self, init_populations, out_size):
+        """Evolutionary search.
+        This python interface is mainly used for debugging and testing.
+        The actual search is all done in c++.
+        Parameters
+        ----------
+        init_populations: List[State]
+            The initial population states
+        out_size : int
+            The size of generated states
+        Returns
+        -------
+        states: List[State]
+            The generated states
+        """
+        states = _ffi_api.SketchPolicyEvolutionarySearch(self, init_populations, out_size)
+        return states
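Not part of the patch: a minimal sketch of how the two debugging hooks above might be driven together. The task construction and the RandomModel cost model are assumptions for illustration only.

```python
# Illustrative only; task setup and cost model are assumptions.
from tvm import auto_scheduler
from tvm.auto_scheduler.cost_model import RandomModel

task = auto_scheduler.create_task(...)  # hypothetical: any registered workload
policy = auto_scheduler.SketchPolicy(task, RandomModel())

# Sample an initial population, then refine it with evolutionary search.
init_states = policy.sample_initial_population(50)
best_states = policy.evolutionary_search(init_states, 10)
```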
diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index de70297a7889..a93b0e5fc58c 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -18,7 +18,9 @@
 """Arm(R) Ethos(TM) -N NPU supported operators."""
 from enum import Enum
 import tvm.ir
+from ...dataflow_pattern import wildcard, is_op, is_constant
 from ... import qnn as _qnn
+from .register import register_pattern_table
 from . import _ethosn as support
 
 
@@ -40,6 +42,30 @@ def ethosn_available():
     return Available.SW_AND_HW if hw else Available.SW_ONLY
 
 
+@register_pattern_table("ethos-n")
+def pattern_table():
+    """Get the Ethos-N compiler pattern table."""
+    def qnn_conv_pattern():
+        pattern = is_op('nn.pad')(wildcard()) | wildcard()
+        pattern = is_op('qnn.conv2d')(
+            pattern, is_constant(), is_constant(), is_constant(), is_constant(), is_constant())
+        pattern = is_op('nn.bias_add')(pattern, is_constant())
+        pattern = is_op('qnn.requantize')(
+            pattern, is_constant(), is_constant(), is_constant(), is_constant())
+        return pattern
+
+    def check_conv2d(extract):
+        """Check if a conv2d is supported by Ethos-N."""
+        if not ethosn_available():
+            return False
+
+        return support.conv2d(extract)
+
+    return [
+        ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d),
+    ]
+
+
 @tvm.ir.register_op_attr("qnn.concatenate", "target.ethos-n")
 def qnn_concatenate(attrs, args):
     """Check if a concatenate is supported by Ethos-N."""
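Not part of the patch: a toy check that the composite pattern above matches the qnn.conv2d -> bias_add -> requantize chain it is meant to capture. Shapes and quantization constants are made up.

```python
import numpy as np
from tvm import relay
from tvm.relay.op.contrib.ethosn import pattern_table

conv_pattern = pattern_table()[0][1]  # the "ethos-n.qnn_conv2d" pattern

a = relay.var("a", shape=(1, 8, 8, 4), dtype="uint8")
w = relay.const(np.zeros((1, 1, 4, 4), "uint8"))
conv = relay.qnn.op.conv2d(
    a, w,
    input_zero_point=relay.const(0, "int32"),
    kernel_zero_point=relay.const(0, "int32"),
    input_scale=relay.const(1.0, "float32"),
    kernel_scale=relay.const(1.0, "float32"),
    kernel_size=(1, 1), channels=4,
    data_layout="NHWC", kernel_layout="HWIO")
bias = relay.nn.bias_add(conv, relay.const(np.zeros((4,), "int32")), axis=3)
req = relay.qnn.op.requantize(
    bias,
    relay.const(1.0, "float32"), relay.const(0, "int32"),
    relay.const(1.0, "float32"), relay.const(0, "int32"),
    out_dtype="uint8")
assert conv_pattern.match(req)
```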
diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc
index 83b7e0ef8c8e..6232b44fbd59 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy.cc
@@ -32,6 +32,7 @@
 #include <iomanip>
 #include <limits>
 #include <memory>
+#include <queue>
 #include <set>
 #include <string>
 #include <unordered_map>
@@ -66,6 +67,13 @@ static InitUnroll init_unroll;
 static InitVectorization init_vectorization;
 static InitThreadBind init_thread_bind;
 
+/********** Mutation rules **********/
+
+static MutateTileSize mutate_tile_size;
+static MutateMaxUnrollFactor mutate_max_unroll_factor;
+static MutateComputeLocation mutate_compute_location;
+static MutateParallel mutate_parallel;
+
 /********** Sketch policy **********/
 
 TVM_REGISTER_NODE_TYPE(SketchPolicyNode);
@@ -130,6 +138,12 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel schedule_cost_model,
     LOG(FATAL) << "No default init rules for target: " << task->target;
   }
 
+  // The default mutation rules.
+  node->mutation_rules.push_back(&mutate_tile_size);
+  node->mutation_rules.push_back(&mutate_max_unroll_factor);
+  node->mutation_rules.push_back(&mutate_compute_location);
+  node->mutation_rules.push_back(&mutate_parallel);
+
   data_ = std::move(node);
 }
 
@@ -344,7 +358,7 @@ Array<State> SketchPolicyNode::SampleInitPopulation(const Array<State>& sketches
         // Some rules use the random generator of SketchPolicyNode, so this part has to be
         // protected
         std::unique_lock<std::mutex> l(m);
-        if (rule->Apply(this, &tmp_s) == InitPopulationRule::ResultKind::kInvalid) {
+        if (rule->Apply(this, &tmp_s) == PopulationGenerationRule::ResultKind::kInvalid) {
          valid = false;
          break;
        }
@@ -372,8 +386,148 @@ Array<State> SketchPolicyNode::EvolutionarySearch(const Array<State>& init_popul
   Array<State> best_states;
   auto tic_begin = std::chrono::high_resolution_clock::now();
 
-  // TODO(comaniac, merrymercy, jcf94): Since we haven't finished porting the cost model part
-  // yet, currently delete the implementation of EvolutionarySearch. To be added later.
+  size_t population = init_population.size();
+  int num_iters = GetIntParam(params, SketchParamKey::EvolutionarySearch::num_iters);
+  double mutation_prob = GetDoubleParam(params, SketchParamKey::EvolutionarySearch::mutation_prob);
+
+  // Two ping pong buffers to avoid copy.
+  Array<State> states_buf1{init_population}, states_buf2;
+  states_buf1.reserve(population);
+  states_buf2.reserve(population);
+  Array<State>* pnow = &states_buf1;
+  Array<State>* pnext = &states_buf2;
+
+  // The set of explored states to avoid redundancy.
+  std::unordered_set<std::string> explored_set;
+
+  // The heap to maintain the best states found so far.
+  using StateHeapItem = std::pair<State, float>;
+  auto cmp = [](const StateHeapItem& left, const StateHeapItem& right) {
+    return left.second > right.second;
+  };
+  using StateHeap = std::priority_queue<StateHeapItem, std::vector<StateHeapItem>, decltype(cmp)>;
+  StateHeap heap(cmp);
+  auto update_heap = [&heap, &explored_set](const Array<State>& states,
+                                            const std::vector<float>& scores, const int out_size) {
+    float max_score = 0.0;
+    for (size_t i = 0; i < states.size(); ++i) {
+      const State& state = states[i];
+      std::string state_str = state.ToStr();
+
+      // Skip redundant states.
+      if (explored_set.count(state_str) > 0) {
+        continue;
+      }
+      explored_set.insert(state_str);
+
+      if (static_cast<int>(heap.size()) < out_size) {
+        // Directly push item if the heap is not full yet.
+        heap.push({state, scores[i]});
+      } else if (scores[i] > heap.top().second) {
+        // Replace the worst state in the heap with the new state.
+        heap.pop();
+        heap.push({state, scores[i]});
+      }
+      max_score = (scores[i] > max_score) ? scores[i] : max_score;
+    }
+    return max_score;
+  };
+
+  // Cost model predicted scores.
+  std::vector<float> scores;
+  scores.reserve(population);
+
+  // The function to generate prefix sum probabilities based on the given scores.
+  auto assign_prob = [](const std::vector<float>& scores, std::vector<double>* prefix_sum_probs) {
+    // Compute selection probabilities.
+    double sum = 0.0;
+    prefix_sum_probs->resize(scores.size());
+    for (size_t i = 0; i < scores.size(); ++i) {
+      sum += std::max(scores[i], 0.0f);
+      (*prefix_sum_probs)[i] = sum;
+    }
+    for (size_t i = 0; i < scores.size(); ++i) {
+      (*prefix_sum_probs)[i] /= sum;
+    }
+  };
+
+  // State selection probabilities.
+  std::uniform_real_distribution<> uniform_dist(0.0, 1.0);
+  std::vector<double> state_select_probs;
+  state_select_probs.reserve(population);
+
+  // Mutation rule selection probabilities.
+  std::vector<double> rule_select_probs;
+  rule_select_probs.reserve(mutation_rules.size());
+  std::vector<float> rule_levels;
+  for (const auto& rule : mutation_rules) {
+    rule_levels.push_back(rule->GetLevel(search_task));
+  }
+  assign_prob(rule_levels, &rule_select_probs);
+
+  // Evaluate the initial population.
+  *pnow = search_task->compute_dag.InferBound(*pnow);
+  PruneInvalidState(search_task, pnow);
+  CHECK_GT(pnow->size(), 0) << "All initial populations are invalid";
+  schedule_cost_model->Predict(search_task, *pnow, &scores);
+
+  // Maintain the best states in the heap.
+  float max_score = update_heap(*pnow, scores, out_size);
+
+  // Genetic algorithm.
+  for (auto iter_idx = 1; iter_idx <= num_iters; ++iter_idx) {
+    // Assign the selection probability to each state based on the cost model scores.
+    assign_prob(scores, &state_select_probs);
+
+    // TODO(@comaniac): Perform cross over.
+
+    // Perform mutations.
+    size_t fail_ct = 0;
+    while (pnext->size() < population && fail_ct < population * 2) {
+      // Select a state to be mutated.
+      State tmp_s = (*pnow)[RandomChoose(state_select_probs, &rand_gen)];
+      if (uniform_dist(rand_gen) < mutation_prob) {
+        // Select a rule and mutate the state.
+        const auto& rule = mutation_rules[RandomChoose(rule_select_probs, &rand_gen)];
+        if (rule->Apply(this, &tmp_s) == PopulationGenerationRule::ResultKind::kValid) {
+          pnext->push_back(std::move(tmp_s));
+        } else {
+          fail_ct++;
+        }
+      } else {
+        // Do not mutate this state in this round.
+        pnext->push_back(std::move(tmp_s));
+      }
+    }
+
+    // Evaluate the new population.
+    *pnext = search_task->compute_dag.InferBound(*pnext);
+    PruneInvalidState(search_task, pnext);
+
+    // Throw away all states generated in this iteration if all new states are invalid.
+    if (pnext->size() > 0) {
+      std::swap(pnext, pnow);
+      schedule_cost_model->Predict(search_task, *pnow, &scores);
+
+      // Maintain the best states in the heap.
+      float iter_max_score = update_heap(*pnow, scores, out_size);
+      max_score = (iter_max_score > max_score) ? iter_max_score : max_score;
+    }
+    pnext->clear();
+
+    if (iter_idx % 5 == 0 || iter_idx == num_iters) {
+      StdCout(verbose) << "GA Iter: " << iter_idx << std::fixed << std::setprecision(4)
+                       << "\tMax Score: " << max_score << "\tPop Size: " << pnow->size()
+                       << std::endl;
+    }
+  }
+
+  // Copy the best states in the heap to the output.
+  while (!heap.empty()) {
+    auto item = heap.top();
+    heap.pop();
+    best_states.push_back(std::move(item.first));
+  }
 
   double duration = std::chrono::duration_cast<std::chrono::duration<double>>(
                         std::chrono::high_resolution_clock::now() - tic_begin)
@@ -450,5 +604,11 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicySampleInitialPopulation")
       return init_population;
     });
 
+TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicyEvolutionarySearch")
+    .set_body_typed([](SketchPolicy policy, Array<State> init_population, int out_size) {
+      Array<State> states = policy->EvolutionarySearch(init_population, out_size);
+      return states;
+    });
+
 }  // namespace auto_scheduler
 }  // namespace tvm
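Not part of the patch: for readers skimming the C++ above, a compact Python restatement of the same loop structure (score the population, keep the best states in a bounded heap, select parents by prefix-sum sampling, mutate with some probability). The helper names are illustrative, not TVM API.

```python
import bisect
import heapq
import random
from itertools import count

def evolutionary_search(init_population, predict, mutate,
                        num_iters=10, mutation_prob=0.85, out_size=10):
    """Illustrative restatement of the C++ loop above; not a TVM API."""
    tie = count()  # tie-breaker so the heap never compares states directly
    pop = list(init_population)
    best = []  # min-heap of (score, tie, state); the worst kept state sits on top
    for it in range(num_iters + 1):
        scores = predict(pop)
        for state, score in zip(pop, scores):
            item = (score, next(tie), state)
            if len(best) < out_size:
                heapq.heappush(best, item)
            elif score > best[0][0]:
                heapq.heapreplace(best, item)
        if it == num_iters:
            break
        # Prefix-sum sampling: pick states with probability proportional to score.
        prefix, total = [], 0.0
        for s in scores:
            total += max(s, 0.0)
            prefix.append(total)
        nxt = []
        while len(nxt) < len(pop):
            state = pop[bisect.bisect_left(prefix, random.random() * total)]
            nxt.append(mutate(state) if random.random() < mutation_prob else state)
        pop = nxt
    return [state for _, _, state in sorted(best, key=lambda t: -t[0])]
```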
diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h
index 0c1e6df170f4..2d93d8775c86 100644
--- a/src/auto_scheduler/search_policy/sketch_policy.h
+++ b/src/auto_scheduler/search_policy/sketch_policy.h
@@ -56,6 +56,10 @@ struct SketchParamKey {
   struct EvolutionarySearch {
     /*! \brief The population size for evolutionary search. */
     static constexpr const char* population = "evolutionary_search_population";
+    /*! \brief The number of iterations performed by the genetic algorithm. */
+    static constexpr const char* num_iters = "evolutionary_search_num_iters";
+    /*! \brief The mutation probability. */
+    static constexpr const char* mutation_prob = "evolutionary_search_mutation_prob";
     /*! \brief The maximum percentage of measured states in the initial population for evolutionary
      * search. */
     static constexpr const char* use_measured_ratio = "evolutionary_search_use_measured_ratio";
@@ -90,7 +94,9 @@ class SketchPolicyNode : public SearchPolicyNode {
   /*! \brief The rules to generate sketches. */
   std::vector<SketchGenerationRule*> sketch_rules;
   /*! \brief The rules to generate initial states. */
-  std::vector<InitPopulationRule*> init_rules;
+  std::vector<PopulationGenerationRule*> init_rules;
+  /*! \brief The rules to mutate states. */
+  std::vector<PopulationMutationRule*> mutation_rules;
   /*! \brief Random generator. */
   std::mt19937 rand_gen;
   /*! \brief Memorize split space for Split. */
@@ -113,6 +119,14 @@ class SketchPolicyNode : public SearchPolicyNode {
    */
   Array<State> SampleInitPopulation(const Array<State>& sketches, int out_size);
 
+  /*!
+   * \brief Perform evolutionary search.
+   * \param init_populations The states generated from init population.
+   * \param out_size The number of expected output states.
+   * \return The generated states after evolutionary search.
+   */
+  Array<State> EvolutionarySearch(const Array<State>& init_populations, int out_size);
+
   static constexpr const char* _type_key = "auto_scheduler.SketchPolicy";
 
   TVM_DECLARE_FINAL_OBJECT_INFO(SketchPolicyNode, SearchPolicyNode);
@@ -127,14 +141,6 @@ class SketchPolicyNode : public SearchPolicyNode {
    */
   Array<State> SearchOneRound(int num_random_states, Array<State>* random_states = nullptr);
 
-  /*!
-   * \brief Perform evolutionary search.
-   * \param init_populations The states generated from init population.
-   * \param out_size The number of expected output states.
-   * \return The generated states after evolutionary search.
-   */
-  Array<State> EvolutionarySearch(const Array<State>& init_populations, int out_size);
-
   /*!
    * \brief Pick states from best states and random states with eps-greedy policy.
    * \param best_states States picked by cost model.
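Not part of the patch: how the two new parameter keys surface in Python. The task and cost model variables, and the keyword name `schedule_cost_model`, are assumptions for illustration.

```python
from tvm import auto_scheduler

policy = auto_scheduler.SketchPolicy(
    task,  # an existing auto_scheduler.SearchTask (assumption)
    schedule_cost_model=cost_model,  # e.g. RandomModel() (assumption)
    params={
        "evolutionary_search_num_iters": 5,        # fewer GA rounds for a quick run
        "evolutionary_search_mutation_prob": 0.9,  # mutate 90% of selected states
    },
)
```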
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index 92073b68b73a..843301c2bb8f 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -436,8 +436,8 @@ std::vector<std::pair<State, int>> RuleSpecialComputeLocationGPU::Apply(
 
 /********** Init Population **********/
 
-InitPopulationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy,
-                                                       State* state) const {
+PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy,
+                                                             State* state) const {
   StateNode* pstate = state->CopyOnWrite();
   // Scan the transformation history and randomly fill tiles size for all SplitStep
   for (size_t step_id = 0; step_id < (*state)->transform_steps.size(); ++step_id) {
@@ -472,10 +472,11 @@ InitPopulationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy,
   return ResultKind::kValid;
 }
 
-InitPopulationRule::ResultKind InitChangeComputeLocation::Apply(SketchPolicyNode* policy,
-                                                                State* state) const {
+PopulationGenerationRule::ResultKind MutateComputeLocationCommon(SketchPolicyNode* policy,
+                                                                 State* state,
+                                                                 bool infer_bound = true) {
   if (GetIntParam(policy->params, SketchParamKey::disable_change_compute_location)) {
-    return ResultKind::kValid;
+    return PopulationGenerationRule::ResultKind::kValid;
   }
 
   for (int stage_id = static_cast<int>((*state)->stages.size()) - 1; stage_id >= 0; stage_id--) {
@@ -584,11 +585,19 @@ InitPopulationRule::ResultKind InitChangeComputeLocation::Apply(SketchPolicyNode
     }
   }
 
-  *state = policy->search_task->compute_dag.InferBound(*state);
-  return ResultKind::kValid;
+  if (infer_bound) {
+    *state = policy->search_task->compute_dag.InferBound(*state);
+  }
+  return PopulationGenerationRule::ResultKind::kValid;
 }
 
-InitPopulationRule::ResultKind InitParallel::Apply(SketchPolicyNode* policy, State* state) const {
+PopulationGenerationRule::ResultKind InitChangeComputeLocation::Apply(SketchPolicyNode* policy,
+                                                                      State* state) const {
+  return MutateComputeLocationCommon(policy, state, false);
+}
+
+PopulationGenerationRule::ResultKind InitParallel::Apply(SketchPolicyNode* policy,
+                                                         State* state) const {
   std::function<void(const SketchPolicyNode&, State*, int stage_id, int iter_offset)>
       annotate_parallel;
   annotate_parallel = [&annotate_parallel](const SketchPolicyNode& policy, State* state,
@@ -652,7 +661,8 @@ InitPopulationRule::ResultKind InitParallel::Apply(SketchPolicyNode* policy, Sta
   return ResultKind::kValid;
 }
 
-InitPopulationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, State* state) const {
+PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy,
+                                                       State* state) const {
   std::vector<int> auto_unroll_configs = IsGPUTask(policy->search_task)
                                             ? std::vector<int>({0, 16, 64, 512, 1024})
                                             : std::vector<int>({0, 16, 64, 512});
@@ -703,8 +713,8 @@ InitPopulationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, State
   return ResultKind::kValid;
 }
 
-InitPopulationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* policy,
-                                                        State* state) const {
+PopulationGenerationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* policy,
+                                                              State* state) const {
   for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) {
     const Stage& stage = (*state)->stages[stage_id];
     // Skip the inlined stage and placeholder stage
@@ -762,7 +772,8 @@ InitPopulationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* policy
   return ResultKind::kValid;
 }
 
-InitPopulationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* policy, State* state) const {
+PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* policy,
+                                                           State* state) const {
   std::set<int> multi_level_tiling_root_set;
   for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) {
     if (NeedsMultilevelTiling(policy->search_task, *state, stage_id)) {
@@ -908,7 +919,251 @@ InitPopulationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* policy, S
       state->bind(stage_id, iters1[1], IteratorAnnotation::kThreadX);
     }
   }
+  return ResultKind::kValid;
+}
+
+PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* policy,
+                                                           State* state) const {
+  int max_innermost_split_factor =
+      GetIntParam(policy->params, SketchParamKey::max_innermost_split_factor);
+
+  // Extract all SplitStep.
+  std::vector<size_t> split_step_ids;
+  for (size_t i = 0; i < (*state)->transform_steps.size(); ++i) {
+    if (auto ps = (*state)->transform_steps[i].as<SplitStepNode>()) {
+      if (!ps->extent.defined() || !ps->extent.value()->IsInstance<IntImmNode>()) {
+        continue;
+      }
+      auto innermost_factor = ps->lengths.back().value_or(max_innermost_split_factor + 1);
+      if (GetIntImm(innermost_factor) <= max_innermost_split_factor) {
+        split_step_ids.push_back(i);
+      }
+    }
+  }
+  if (split_step_ids.empty()) {
+    // No tile size could be mutated.
+    return ResultKind::kInvalid;
+  }
+
+  // Select a SplitStep with extent larger than one to mutate.
+  int retry_ct = 0;
+  int64_t extent = 1;
+  int step_id;
+  const SplitStepNode* ps;
+
+  do {
+    step_id = split_step_ids[(policy->rand_gen)() % split_step_ids.size()];
+    ps = (*state)->transform_steps[step_id].as<SplitStepNode>();
+    CHECK(ps != nullptr);
+    extent = GetIntImm(ps->extent.value());
+    retry_ct += 1;
+  } while (retry_ct < static_cast<int>(split_step_ids.size()) << 2 &&
+           (extent == 1 || extent == 0));
+
+  if (extent <= 1) {
+    // Cannot find a step with extent larger than one.
+    return ResultKind::kInvalid;
+  }
+
+  // Fetch the current tile sizes.
+  std::vector<int> lengths(ps->lengths.size() + 1, 1);
+  for (int i = 0; i < static_cast<int>(ps->lengths.size()); ++i) {
+    lengths[i + 1] = GetIntImm(ps->lengths[i].value());
+  }
+  lengths[0] = extent / ElementProduct(lengths);
+
+  // Randomly permute the tile size order.
+  std::vector<int> random_perm;
+  RandomPermutation(lengths.size(), &random_perm, &(policy->rand_gen));
+
+  // Try to divide a factor from one tile size and multiply it into another.
+  for (size_t i = 0; i < random_perm.size(); ++i) {
+    size_t src_idx = random_perm[i];
+    int length = lengths[src_idx];
+    if (length <= 1) {
+      continue;
+    }
+
+    size_t dst_idx = random_perm[(i + 1) % random_perm.size()];
+    const std::vector<int>& factors = policy->split_memo.GetFactors(length);
+    CHECK_GE(factors.size(), 1);
+
+    int divide_factor;
+    if (dst_idx == lengths.size() - 1) {
+      // Maintain the restriction of hardware_params.max_innermost_split_factor.
+      int max_factor_index = static_cast<int>(factors.size()) - 1;
+      for (; max_factor_index >= 1; max_factor_index--) {
+        if (factors[max_factor_index] * lengths[dst_idx] <= max_innermost_split_factor) {
+          break;
+        }
+      }
+      if (max_factor_index == 0) {
+        // Failed on this dst_idx, try the next one.
+        continue;
+      }
+      divide_factor = factors[1 + (policy->rand_gen)() % max_factor_index];
+    } else {
+      divide_factor = factors[1 + (policy->rand_gen)() % (factors.size() - 1)];
+    }
+
+    // Divide one factor from lengths[src_idx] and multiply it to lengths[dst_idx].
+    Array<Integer> new_lengths;
+    for (size_t j = 1; j < lengths.size(); ++j) {
+      if (j == src_idx) {
+        new_lengths.push_back(Integer(lengths[j] / divide_factor));
+      } else if (j == dst_idx) {
+        new_lengths.push_back(Integer(lengths[j] * divide_factor));
+      } else {
+        new_lengths.push_back(Integer(lengths[j]));
+      }
+    }
+
+    StateNode* pstate = state->CopyOnWrite();
+    pstate->transform_steps.Set(
+        step_id, SplitStep(ps->stage_id, ps->iter_id, ps->extent,
+                           Array<Optional<Integer>>(new_lengths.begin(), new_lengths.end()),
+                           ps->inner_to_outer));
+    return ResultKind::kValid;
+  }
+  return ResultKind::kInvalid;
+}
+
+PopulationGenerationRule::ResultKind MutateMaxUnrollFactor::Apply(SketchPolicyNode* policy,
+                                                                  State* state) const {
+  // Extract all auto_unroll_max_step pragma steps.
+  std::vector<size_t> annotate_steps;
+  for (size_t i = 0; i < (*state)->transform_steps.size(); ++i) {
+    if (auto ps = (*state)->transform_steps[i].as<PragmaStepNode>()) {
+      if (StrStartsWith(ps->pragma_type, "auto_unroll_max_step")) {
+        annotate_steps.push_back(i);
+      }
+    }
+  }
+  if (annotate_steps.empty()) {
+    return ResultKind::kInvalid;
+  }
+
+  // Randomly pick one unroll factor candidate.
+  auto cands = (IsGPUTask(policy->search_task)) ? &gpu_unroll_cands_ : &cpu_unroll_cands_;
+  auto new_factor = std::to_string((*cands)[(policy->rand_gen)() % cands->size()]);
+
+  // Randomly pick and mutate an unroll step.
+  auto step_id = annotate_steps[(policy->rand_gen)() % annotate_steps.size()];
+  auto ps = (*state)->transform_steps[step_id].as<PragmaStepNode>();
+  CHECK(ps);
+  StateNode* pstate = state->CopyOnWrite();
+  pstate->transform_steps.Set(step_id,
+                              PragmaStep(ps->stage_id, ps->iter_id,
+                                         std::string("auto_unroll_max_step") + "$" + new_factor));
+  return ResultKind::kValid;
+}
+
+PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNode* policy,
+                                                                  State* state) const {
+  return MutateComputeLocationCommon(policy, state, true);
+}
+
+PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* policy,
+                                                           State* state) const {
+  // This mutation rule only focuses on the case where parallel was added to the outermost loop,
+  // and that loop was generated by fusing other loops.
+  // In short, we mutate the fusion step before the parallel step.
+
+  // Extract all parallel steps.
+  std::vector<size_t> parallel_steps;
+  for (size_t s = 0; s < (*state)->transform_steps.size(); ++s) {
+    auto ps = (*state)->transform_steps[s].as<AnnotationStepNode>();
+    if (!ps || ps->annotation != IteratorAnnotation::kParallel) {
+      continue;
+    }
+
+    // Skip non-outermost loops and parallel steps without a fusion step beforehand.
+    if (ps->iter_id > 0 || s == 0 || !(*state)->transform_steps[s - 1].as<FuseStepNode>()) {
+      continue;
+    }
+    parallel_steps.push_back(s);
+  }
+  if (parallel_steps.empty()) {
+    return ResultKind::kInvalid;
+  }
+
+  // Randomly pick one parallel step.
+  size_t step_id = parallel_steps[(policy->rand_gen)() % parallel_steps.size()];
+  auto ps = (*state)->transform_steps[step_id].as<AnnotationStepNode>();
+  CHECK(ps);
+  size_t stage_id = ps->stage_id;
+  size_t iter_id = ps->iter_id;
+  const Stage& stage = (*state)->stages[stage_id];
+  const Iterator& it = stage->iters[iter_id];
+
+  // Replay a new state until the picked fuse step.
+  State tmp_s = policy->search_task->compute_dag->init_state;
+  for (size_t s = 0; s < step_id - 1; ++s) {
+    auto step = (*state)->transform_steps[s];
+    tmp_s.CopyOnWrite()->transform_steps.push_back(step);
+    StepApplyToState(step, &tmp_s, policy->search_task->compute_dag);
+  }
+
+  // Determine the fusion mutation direction.
+  // 0: fuse less; 1: fuse more.
+  auto fuse_step = (*state)->transform_steps[step_id - 1].as<FuseStepNode>();
+  auto fused_ids = fuse_step->fused_ids;
+  std::vector<double> fuse_dir = {0.5, 1.0};
+
+  // The case that we can only fuse more. This may happen after multiple mutations.
+  if (fused_ids.size() == 1) {
+    fuse_dir[0] = 0.0;
+  }
+
+  // The cases that we cannot fuse the next iters.
+  if ((*state)->attach_map->iter_to_attached_stages.count(std::make_pair(stage_id, iter_id)) ||
+      it->iter_kind == IteratorKind::kReduction || it->annotation != IteratorAnnotation::kNone) {
+    if (fuse_dir[0] == 0.0) {
+      // No room to mutate this fusion.
+      return ResultKind::kInvalid;
+    }
+    fuse_dir[0] = 1.0;
+  }
+
+  // Mutate the fusion iters and replay the mutated fused/annotation steps.
+  int iter_offset = 0;
+  if (RandomChoose(fuse_dir, &(policy->rand_gen)) == 0) {
+    fused_ids.pop_back();
+    iter_offset = 1;
+  } else {
+    auto last_id = fused_ids.back().get()->value;
+    fused_ids.push_back(last_id + 1);
+    iter_offset = -1;
+  }
+  auto new_fuse_step = FuseStep(stage_id, fused_ids);
+  tmp_s.CopyOnWrite()->transform_steps.push_back(new_fuse_step);
+  StepApplyToState(new_fuse_step, &tmp_s, policy->search_task->compute_dag);
+  tmp_s.CopyOnWrite()->transform_steps.push_back((*state)->transform_steps[step_id]);
+  StepApplyToState((*state)->transform_steps[step_id], &tmp_s, policy->search_task->compute_dag);
+
+  // Replay the rest of the steps.
+  for (size_t s = step_id + 1; s < (*state)->transform_steps.size(); ++s) {
+    auto step = (*state)->transform_steps[s];
+    if (step->stage_id == static_cast<int>(stage_id)) {
+      // Since we changed the loop structure, iter IDs in later steps that touch the
+      // same stage have to be adjusted.
+      auto ps = step.as<AnnotationStepNode>();
+      if (ps) {
+        if (ps->iter_id == 0) {
+          step = AnnotationStep(ps->stage_id, 0, ps->annotation);
+        } else {
+          CHECK_LE(ps->iter_id + iter_offset, tmp_s->stages[stage_id]->iters.size());
+          step = AnnotationStep(ps->stage_id, ps->iter_id + iter_offset, ps->annotation);
+        }
+      } else {
+        // An unexpected step type that we do not handle for now.
+        return ResultKind::kInvalid;
+      }
+    }
+    tmp_s.CopyOnWrite()->transform_steps.push_back(step);
+    StepApplyToState(step, &tmp_s, policy->search_task->compute_dag);
+  }
+
+  *state = tmp_s;
   return ResultKind::kValid;
 }
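Not part of the patch: MutateTileSize above moves one factor between two tiles of the same split, keeping the product of tile sizes fixed. A toy Python restatement with made-up numbers:

```python
import random

def mutate_tile_size(lengths, factors_of):
    """Toy restatement of MutateTileSize: move one factor between two tiles.

    lengths: tile sizes of one split, e.g. [4, 2, 8] (their product stays fixed).
    factors_of: callable returning the factors of an integer.
    """
    order = random.sample(range(len(lengths)), len(lengths))
    for i, src in enumerate(order):
        if lengths[src] <= 1:
            continue
        dst = order[(i + 1) % len(order)]
        factors = [f for f in factors_of(lengths[src]) if f > 1]
        if not factors:
            continue
        f = random.choice(factors)
        lengths[src] //= f
        lengths[dst] *= f
        return lengths  # e.g. [4, 2, 8] -> [4, 4, 4] if f == 2 moves from the last tile
    return None  # nothing mutable

factors = lambda n: [d for d in range(1, n + 1) if n % d == 0]
print(mutate_tile_size([4, 2, 8], factors))
```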
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h
index 5ddfd181cc5b..418fbda6a030 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.h
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h
@@ -26,10 +26,13 @@
 #define TVM_AUTO_SCHEDULER_SEARCH_POLICY_SKETCH_POLICY_RULES_H_
 
 #include <tvm/auto_scheduler/loop_state.h>
+#include <tvm/auto_scheduler/search_task.h>
 #include <string>
 #include <vector>
+
+#include "utils.h"
 
 namespace tvm {
 namespace auto_scheduler {
@@ -122,7 +125,7 @@ DEFINE_SKETCH_GENERATION_RULE(RuleSpecialComputeLocationGPU);
 /********** Init Population **********/
 
 /*! \brief The base class for derivation rules used in the initial population. */
-class InitPopulationRule {
+class PopulationGenerationRule {
  public:
   /*! \brief Result enumeration of the apply function. */
   enum class ResultKind : int { kValid = 0, kInvalid = 1 };
@@ -138,7 +141,7 @@ class InitPopulationRule {
 };
 
 #define DEFINE_INIT_POPULATION_RULE(rule_name)                            \
-  class rule_name : public InitPopulationRule {                           \
+  class rule_name : public PopulationGenerationRule {                     \
   public:                                                                 \
     ResultKind Apply(SketchPolicyNode* policy, State* state) const final; \
   };
@@ -162,6 +165,56 @@ DEFINE_INIT_POPULATION_RULE(InitVectorization);
 /*! \brief The rule that annotates thread binding for GPU. */
 DEFINE_INIT_POPULATION_RULE(InitThreadBind);
 
+/********** Mutation **********/
+
+/*! \brief The base class for mutation rules used in the evolutionary search. */
+class PopulationMutationRule : public PopulationGenerationRule {
+ public:
+  /*!
+   * \brief Get the priority level of this mutation rule.
+   * \return The priority level of this mutation rule. Higher is better.
+   */
+  virtual int GetLevel(const SearchTask& task) const = 0;
+};
+
+// A helper to define mutation rules with a constant rule level.
+#define DEFINE_MUTATE_POPULATION_RULE(rule_name, rule_level)                \
+  class rule_name : public PopulationMutationRule {                         \
+   public:                                                                  \
+    ResultKind Apply(SketchPolicyNode* policy, State* state) const final;   \
+    int GetLevel(const SearchTask& task) const final { return rule_level; } \
+  };
+
+/*! \brief The rule that mutates tile size by randomly dividing a tile size by a factor
+    and multiplying it into another tile size. */
+DEFINE_MUTATE_POPULATION_RULE(MutateTileSize, 100);
+
+/*! \brief The rule that mutates the fusion iterators annotated by parallel. */
+DEFINE_MUTATE_POPULATION_RULE(MutateParallel, 50);
+
+/*! \brief The rule that mutates the factor of a randomly selected auto max unroll step. */
+class MutateMaxUnrollFactor : public PopulationMutationRule {
+ public:
+  ResultKind Apply(SketchPolicyNode* policy, State* state) const final;
+  int GetLevel(const SearchTask& task) const final { return 10; }
+
+  const std::vector<int> cpu_unroll_cands_ = {0, 16, 64, 512};
+  const std::vector<int> gpu_unroll_cands_ = {0, 16, 64, 512, 1024};
+};
+
+/*! \brief The rule that randomly changes the computation location for some stages, which do not
+ *  need tiling and are not strictly inlinable (e.g. data padding). */
+class MutateComputeLocation : public PopulationMutationRule {
+ public:
+  ResultKind Apply(SketchPolicyNode* policy, State* state) const final;
+  int GetLevel(const SearchTask& task) const final {
+    if (IsGPUTask(task)) {
+      return 0;
+    }
+    return 5;
+  }
+};
+
 }  // namespace auto_scheduler
 }  // namespace tvm
diff --git a/src/auto_scheduler/search_policy/utils.cc b/src/auto_scheduler/search_policy/utils.cc
index b3f07b1c160f..a09ea596984a 100644
--- a/src/auto_scheduler/search_policy/utils.cc
+++ b/src/auto_scheduler/search_policy/utils.cc
@@ -18,7 +18,7 @@
  */
 
 /*!
- * \file auto_scheduler/utils.cc
+ * \file auto_scheduler/search_policy/utils.cc
  * \brief Common utilities
  */
 
@@ -270,6 +270,69 @@ State FollowTiling(const State& state, int stage_id, const std::vector<int>& spl
   return tmp_s;
 }
 
+// Return whether a state has nested parallel, which is invalid on CPUs.
+bool HasNestedParallel(const State& state) {
+  std::function<void(int stage_id, size_t* parallel_ct)> count_parallel_ct;
+
+  count_parallel_ct = [&state, &count_parallel_ct](int stage_id, size_t* parallel_ct) {
+    const Stage& stage = state->stages[stage_id];
+
+    if (stage->compute_at == ComputeAtKind::kInlined) {
+      return;
+    }
+
+    for (size_t i = 0; i < stage->iters.size(); ++i) {
+      if (stage->iters[i]->annotation == IteratorAnnotation::kParallel) {
+        (*parallel_ct)++;
+      }
+
+      IterKey iter_key(stage_id, i);
+      auto pair = state->attach_map->iter_to_attached_stages.find(iter_key);
+      if (pair != state->attach_map->iter_to_attached_stages.end()) {
+        for (const auto& attach_stage_id : pair->second) {
+          count_parallel_ct(attach_stage_id, parallel_ct);
+        }
+      }
+    }
+  };
+
+  for (size_t stage_id = 0; stage_id < state->stages.size(); ++stage_id) {
+    size_t parallel_ct = 0;
+
+    if (state->stages[stage_id]->compute_at == ComputeAtKind::kRoot) {
+      count_parallel_ct(stage_id, &parallel_ct);
+      if (parallel_ct >= 2) {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+void PruneInvalidState(const SearchTask& task, Array<State>* states) {
+  size_t pt = 0;
+  for (size_t i = 0; i < states->size(); ++i) {
+    if (!(*states)[i].defined()) {
+      continue;
+    }
+    if (!IsGPUTask(task) && HasNestedParallel((*states)[i])) {
+      continue;
+    }
+
+    if (i != pt) {
+      states->Set(pt, (*states)[i]);
+    }
+    pt++;
+  }
+
+  if (pt == 0) {
+    LOG(INFO) << "All states are invalid.";
+  } else {
+    states->resize(pt);
+  }
+}
+
 const Array<Array<Integer>>& SplitFactorizationMemo::GetFactorizationSchemes(
     int extent, int n_lengths, int max_innermost_factor) {
   QueryKey key = std::make_tuple(extent, n_lengths, max_innermost_factor);
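Not part of the patch: the RandomChoose helper added to utils.h below does a binary search over a prefix-sum probability table. A plain-Python equivalent:

```python
import bisect
import random

def random_choose(prefix_sum_probs):
    """Return index i with probability prefix[i] - prefix[i-1]."""
    return bisect.bisect_left(prefix_sum_probs, random.random())

# E.g. probabilities [0.2, 0.3, 0.5] become the prefix table [0.2, 0.5, 1.0];
# a draw of 0.6 lands at index 2.
print(random_choose([0.2, 0.5, 1.0]))
```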
diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h
index 2d49ab007c78..792102a2a1ce 100644
--- a/src/auto_scheduler/search_policy/utils.h
+++ b/src/auto_scheduler/search_policy/utils.h
@@ -18,7 +18,7 @@
  */
 
 /*!
- * \file auto_scheduler/search_policy/utils.cc
+ * \file auto_scheduler/search_policy/utils.h
  * \brief Common utilities for search policies.
 */
 
@@ -662,6 +662,20 @@ State DoMultiLevelTiling(const State& state, int stage_id, const std::string& fo
 State FollowTiling(const State& state, int stage_id, const std::vector<int>& split_step_ids,
                    int n_split);
 
+// Randomly choose an index according to prefix sum probabilities.
+inline int RandomChoose(const std::vector<double>& prefix_sum_probs, std::mt19937* random_gen) {
+  std::uniform_real_distribution<> dis(0.0, 1.0);
+  double x = dis(*random_gen);
+
+  CHECK(!prefix_sum_probs.empty());
+
+  return std::lower_bound(prefix_sum_probs.begin(), prefix_sum_probs.end(), x) -
+         prefix_sum_probs.begin();
+}
+
+// Prune invalid states and return the results in-place.
+void PruneInvalidState(const SearchTask& task, Array<State>* states);
+
 }  // namespace auto_scheduler
 }  // namespace tvm
diff --git a/src/printer/doc.cc b/src/printer/doc.cc
index d487e3e7aa3e..ab1eddbe7d1e 100644
--- a/src/printer/doc.cc
+++ b/src/printer/doc.cc
@@ -129,7 +129,7 @@ Doc Doc::Indent(int indent, Doc doc) {
 }
 
 Doc Doc::StrLiteral(const std::string& value, std::string quote) {
-  // TODO(M.K.): add escape.
+  // TODO(@M.K.): add escape.
   Doc doc;
   return doc << quote << value << quote;
 }
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index f66eb94cb20f..58cd5bf1dd44 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -50,6 +50,16 @@ bool IsEthosnOp(const Call& call, const std::string& op_name) {
   }
 }
 
+bool IsEthosnFunc(const Call& call, const std::string& op_name) {
+  if (call->op->IsInstance<FunctionNode>()) {
+    Function func = Downcast<Function>(call->op);
+    CHECK(func.defined());
+    auto name_node = func->GetAttr<String>(attr::kComposite);
+    return name_node.value() == op_name;
+  }
+  return false;
+}
+
 std::map<Expr, std::vector<sl::TensorInfo>> InferTensorsVisitor::Infer(const Expr& expr) {
   tensor_table_.clear();
   CHECK(expr->checked_type().defined());
@@ -69,7 +79,11 @@ void InferTensorsVisitor::InferCall(const CallNode* cn) {
   EthosnError err;
   Call call = GetRef<Call>(cn);
   // Determine call -> NPU mapping
-  if (IsEthosnOp(call, "qnn.concatenate")) {
+  if (IsEthosnFunc(call, "ethos-n.qnn_conv2d")) {
+    ConvolutionParams params;
+    err += EthosnAPI::QnnConv2d(cn->op.as<FunctionNode>()->body, &params);
+    tensor_table_[cn->args[0]] = {params.activation_info};
+  } else if (IsEthosnOp(call, "qnn.concatenate")) {
     ConcatenateParams params;
     err = EthosnAPI::Concatenate(call, &params);
     tensor_table_[cn->args[0]] = params.input_infos;
@@ -181,7 +195,10 @@ sl::TensorsAndId ConstructNetworkVisitor::HandleCall(const CallNode* cn) {
   sl::TensorAndId<sl::Operand> tensor;
   sl::TensorsAndId tensors;
   // Determine call -> NPU mapping
-  if (IsEthosnOp(call, "qnn.concatenate")) {
+  if (IsEthosnFunc(call, "ethos-n.qnn_conv2d")) {
+    if ((err = MakeConvolutionLayer(call, &tensor))) ReportFatalError(call, err);
+    return MakeOps(tensor);
+  } else if (IsEthosnOp(call, "qnn.concatenate")) {
     if ((err = MakeConcatenateLayer(call, &tensor))) ReportFatalError(call, err);
     return MakeOps(tensor);
   } else if (IsEthosnOp(call, "split")) {
@@ -227,6 +244,28 @@ void ConstructNetworkVisitor::VisitLeaf(const Expr& expr) {
   if (!expr->IsInstance<FunctionNode>()) MixedModeVisitor::VisitLeaf(expr);
 }
 
+EthosnError ConstructNetworkVisitor::MakeConvolutionLayer(const Call& call,
+                                                          sl::TensorAndId<sl::Operand>* out) {
+  ConvolutionParams params;
+  if (auto err = EthosnAPI::QnnConv2d(call->op.as<FunctionNode>()->body, &params)) {
+    return err;
+  }
+
+  auto activation = operand_table_[call->args[0]][0];
+  auto weights = AddConstant(network_, params.weights_info, params.raw_weights).tensor;
+  auto bias = AddConstant(network_, params.bias_info, params.raw_bias).tensor;
+  try {
+    if (params.is_depthwise) {
+      *out = AddDepthwiseConvolution(network_, *activation, *bias, *weights, params.conv_info);
+    } else {
+      *out = AddConvolution(network_, *activation, *bias, *weights, params.conv_info);
+    }
+  } catch (const sl::NotSupportedException& e) {
+    return EthosnError(e.what());
+  }
+  return EthosnError();
+}
+
 EthosnError ConstructNetworkVisitor::MakeConcatenateLayer(const Call& call,
                                                           sl::TensorAndId<sl::Operand>* out) {
   ConcatenateParams params;
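Not part of the patch: what IsEthosnFunc above keys on. After MergeComposite, the callee is a local relay.Function whose "Composite" attribute carries the pattern name; a rough Python equivalent of the check:

```python
from tvm import relay

def is_ethosn_func(call, name):
    """Rough Python mirror of IsEthosnFunc (illustrative, not TVM API)."""
    func = call.op
    return (isinstance(func, relay.Function)
            and func.attrs is not None
            and func.attrs["Composite"] == name)
```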
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index 714a22d22027..7d1fe9cd5ea9 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -197,6 +197,7 @@ class ConstructNetworkVisitor : public MixedModeVisitor, private ErrorReportingP
   void VisitLeaf(const Expr& expr) final;
 
   // Make a support library operand from a Call
+  EthosnError MakeConvolutionLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeConcatenateLayer(const Call& call, sl::TensorAndId<sl::Operand>* out);
   EthosnError MakeSplitLayer(const Call& call, sl::TensorsAndId* outs);
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc
index d92e35afeea0..b7cac6504ac6 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.cc
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc
@@ -40,6 +40,105 @@ namespace relay {
 namespace contrib {
 namespace ethosn {
 
+EthosnError EthosnAPI::QnnConv2d(const Expr& expr, ConvolutionParams* params) {
+  Call requantize = Downcast<Call>(expr);
+  Call bias_add = Downcast<Call>(requantize->args[0]);
+  Call conv = Downcast<Call>(bias_add->args[0]);
+  Call pad;
+  if (conv->args[0]->IsInstance<CallNode>() &&
+      Downcast<Call>(conv->args[0])->op == Op::Get("nn.pad"))
+    pad = Downcast<Call>(conv->args[0]);
+  const auto& conv_attr = conv->attrs.as<Conv2DAttrs>();
+
+  // Extract the quantization params from the arguments
+  int input_zero_point;
+  int kernel_zero_point;
+  int output_zero_point;
+  float input_scale;
+  float kernel_scale;
+  float output_scale;
+  EthosnError err = AsConstant(conv->args[2], &input_zero_point);
+  err += AsConstant(conv->args[3], &kernel_zero_point);
+  err += AsConstant(requantize->args[4], &output_zero_point);
+  err += AsConstant(conv->args[4], &input_scale);
+  err += AsConstant(conv->args[5], &kernel_scale);
+  err += AsConstant(requantize->args[3], &output_scale);
+
+  // Convert quantization params
+  sl::QuantizationInfo data_q_info;
+  sl::QuantizationInfo weights_q_info;
+  sl::QuantizationInfo bias_q_info;
+  sl::QuantizationInfo output_q_info;
+  err += Tvm2Npu(input_zero_point, input_scale, &data_q_info);
+  err += Tvm2Npu(kernel_zero_point, kernel_scale, &weights_q_info);
+  err += Tvm2Npu(0, data_q_info.m_Scale * weights_q_info.m_Scale, &bias_q_info);
+  err += Tvm2Npu(output_zero_point, output_scale, &output_q_info);
+
+  // Convert convolution attributes
+  sl::Padding padding;
+  if (pad.defined()) {
+    Tvm2Npu(conv_attr->padding, &padding);
+    // Don't support both standalone operator padding and attribute defined padding
+    if (padding != sl::Padding({0, 0, 0, 0})) {
+      err += EthosnError(
+          ErrStrm() << "both op and attr padding exist, must be either op/attr only or no padding");
+    }
+    err += Tvm2Npu(pad->attrs.as<PadAttrs>()->pad_width, &padding);
+  } else {
+    err += Tvm2Npu(conv_attr->padding, &padding);
+  }
+  sl::Stride stride;
+  err += Tvm2Npu(conv_attr->strides, &stride);
+  // Dilation is not supported
+  std::array<uint32_t, 4> dilation = {1, 1, 1, 1};
+  AsArray(conv_attr->dilation, &dilation);
+  if (conv_attr->dilation.size() != 2 || dilation[0] != 1 || dilation[1] != 1) {
+    err +=
+        EthosnError(ErrStrm() << "dilation=" << conv_attr->dilation << ", dilation must = [1, 1]");
+  }
+  // Create convolution info
+  params->conv_info = sl::ConvolutionInfo(padding, stride, output_q_info);
+
+  // Create data info
+  const TensorTypeNode* data_dtype;
+  if (pad.defined()) {
+    data_dtype = pad->args[0]->checked_type().as<TensorTypeNode>();
+  } else {
+    data_dtype = conv->args[0]->checked_type().as<TensorTypeNode>();
+  }
+  sl::TensorShape activation_tensor_shape;
+  sl::DataType activation_data_type;
+  err += Tvm2Npu(data_dtype->shape, &activation_tensor_shape);
+  err += Tvm2Npu(data_dtype->dtype, &activation_data_type);
+  params->activation_info = sl::TensorInfo(activation_tensor_shape, activation_data_type,
+                                           sl::DataFormat::NHWC, data_q_info);
+
+  // Create weights info
+  params->is_depthwise = conv_attr->channels.defined() &&
+                         tvm::tir::ExprDeepEqual()(conv_attr->channels, conv_attr->groups) &&
+                         conv_attr->groups != 1;
+
+  const auto* weights_dtype = conv->args[1]->checked_type().as<TensorTypeNode>();
+  sl::TensorShape weights_tensor_shape;
+  sl::DataType weights_data_type;
+  sl::DataFormat weights_data_format;
+  // Ignore the error here because weights don't have a batch axis
+  Tvm2Npu(weights_dtype->shape, &weights_tensor_shape);
+  err += Tvm2Npu(weights_dtype->dtype, &weights_data_type);
+  err += Tvm2Npu(params->is_depthwise ? "HWIM" : "HWIO", &weights_data_format);
+  params->weights_info =
+      sl::TensorInfo(weights_tensor_shape, weights_data_type, weights_data_format, weights_q_info);
+  params->raw_weights = conv->args[1].as<ConstantNode>()->data->data;
+
+  // Create bias info
+  params->bias_info = sl::TensorInfo(
+      {1, 1, 1, params->is_depthwise ? weights_tensor_shape[2] : weights_tensor_shape[3]},
+      sl::DataType::INT32_QUANTIZED, sl::DataFormat::NHWC, bias_q_info);
+  params->raw_bias = bias_add->args[1].as<ConstantNode>()->data->data;
+
+  return err;
+}
+
 EthosnError EthosnAPI::Concatenate(const Expr& expr, ConcatenateParams* params) {
   Call call = Downcast<Call>(expr);
   const auto& attrs = call->attrs.as<ConcatenateAttrs>();
@@ -107,6 +206,60 @@ EthosnError EthosnAPI::Split(const Expr& expr, SplitParams* params) {
   return err;
 }
 
+EthosnError EthosnAPI::Tvm2Npu(const Array<IndexExpr>& padding, sl::Padding* npu_padding) {
+  std::array<uint32_t, 4> dim;
+  if (EthosnError err = AsArray<IndexExpr, uint32_t>(padding, &dim)) {
+    return err;
+  }
+  switch (padding.size()) {
+    case 1:
+      *npu_padding = sl::Padding(dim[0], dim[0], dim[0], dim[0]);
+      break;
+    case 2:
+      // Height, width -> top, bottom, left, right
+      *npu_padding = sl::Padding(dim[0], dim[0], dim[1], dim[1]);
+      break;
+    case 4:
+      // Top, left, bottom, right -> top, bottom, left, right
+      *npu_padding = sl::Padding(dim[0], dim[2], dim[1], dim[3]);
+      break;
+    default:
+      return EthosnError(ErrStrm() << "padding tuple size=" << padding.size()
+                                   << ", padding tuple size must be {1, 2, 4}");
+  }
+  return EthosnError();
+}
+
+EthosnError EthosnAPI::Tvm2Npu(const Array<IndexExpr>& strides, sl::Stride* npu_stride) {
+  if (strides.size() != 2) {
+    return EthosnError(ErrStrm() << "stride size=" << strides.size() << ", stride size must = 2");
+  }
+  std::array<uint32_t, 4> dim;
+  if (EthosnError err = AsArray<IndexExpr, uint32_t>(strides, &dim)) {
+    return err;
+  }
+  *npu_stride = sl::Stride(dim[1], dim[0]);
+  return EthosnError();
+}
+
+EthosnError EthosnAPI::Tvm2Npu(const std::string& dformat, sl::DataFormat* data_format) {
+  if (dformat == "NCHW") {
+    *data_format = sl::DataFormat::NCHW;
+    return EthosnError();
+  } else if (dformat == "NHWC") {
+    *data_format = sl::DataFormat::NHWC;
+    return EthosnError();
+  } else if (dformat == "HWIO") {
+    *data_format = sl::DataFormat::HWIO;
+    return EthosnError();
+  } else if (dformat == "HWIM") {
+    *data_format = sl::DataFormat::HWIM;
+    return EthosnError();
+  }
+  return EthosnError(ErrStrm() << "format=" << dformat
format must be {NCHW, NHWC, HWIO, HWIM}"); +} + EthosnError EthosnAPI::Tvm2Npu(const Array& shape, sl::TensorShape* npu_shape) { EthosnError err = AsArray(shape, npu_shape); if (npu_shape->front() != 1) { @@ -128,6 +281,29 @@ EthosnError EthosnAPI::Tvm2Npu(const tvm::DataType& dtype, sl::DataType* data_ty return EthosnError(ErrStrm() << "dtype=\'" << dtype << "\', dtype must be either uint8 or int32"); } +EthosnError EthosnAPI::Tvm2Npu(int32_t zero_point, float scale, sl::QuantizationInfo* npu_qinfo) { + *npu_qinfo = sl::QuantizationInfo(zero_point, scale); + return EthosnError(); +} + +EthosnError EthosnAPI::Tvm2Npu(const Array>& padding, sl::Padding* npu_padding) { + if (padding.size() != 4) { + return EthosnError(ErrStrm() << "padding tuple size=" << padding.size() + << ", padding tuple size must = 4"); + } + Array reduced_padding; + reduced_padding.push_back(padding[1][0]); + reduced_padding.push_back(padding[1][1]); + reduced_padding.push_back(padding[2][0]); + reduced_padding.push_back(padding[2][1]); + std::array dim; + if (EthosnError err = AsArray(reduced_padding, &dim)) { + return err; + } + *npu_padding = sl::Padding(dim[0], dim[1], dim[2], dim[3]); + return EthosnError(); +} + // Convert an array of IntImmNodes into ValueT // IndexT type of Array indexing variable // ValueT type of resulting value @@ -158,6 +334,20 @@ EthosnError EthosnAPI::AsConstant(const Expr& expr, T* out) { return EthosnError(); } +TVM_REGISTER_GLOBAL("relay.ethos-n.support.conv2d") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + ConvolutionParams params; + auto err = EthosnAPI::QnnConv2d(call, ¶ms); + if (params.is_depthwise) { + *rv = !err && sl::IsDepthwiseConvolutionSupported(params.bias_info, params.weights_info, + params.conv_info, params.activation_info); + } else { + *rv = !err && sl::IsConvolutionSupported(params.bias_info, params.weights_info, + params.conv_info, params.activation_info); + } + }); + TVM_REGISTER_GLOBAL("relay.ethos-n.support.concatenate") .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { Call call = args[0]; diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.h b/src/relay/backend/contrib/ethosn/ethosn_api.h index 34af7ce0b1d8..20fe8bec03c6 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.h +++ b/src/relay/backend/contrib/ethosn/ethosn_api.h @@ -44,6 +44,16 @@ namespace ethosn { namespace sl = ::ethosn::support_library; +struct ConvolutionParams { + sl::ConvolutionInfo conv_info; + sl::TensorInfo activation_info; + sl::TensorInfo weights_info; + sl::TensorInfo bias_info; + void* raw_weights = nullptr; + void* raw_bias = nullptr; + bool is_depthwise = false; +}; + struct ConcatenateParams { sl::QuantizationInfo qInfo; sl::ConcatenationInfo concat_info = sl::ConcatenationInfo(1, qInfo); @@ -115,6 +125,8 @@ class EthosnError { */ class EthosnAPI { public: + /*! \brief Extract the Support Library convolution params from an ethos-n.qnn_conv2d func */ + static EthosnError QnnConv2d(const Expr& expr, ConvolutionParams* params); /*! \brief Extract the Support Library concatenate params from a Relay qnn.concatenate call */ static EthosnError Concatenate(const Expr& expr, ConcatenateParams* params); /*! \brief Extract the Support Library split params from a Relay split call */ @@ -125,6 +137,16 @@ class EthosnAPI { static EthosnError Tvm2Npu(const Array& shape, sl::TensorShape* npu_shape); /*! \brief Convert a TVM data type to a SL data type */ static EthosnError Tvm2Npu(const tvm::DataType& dtype, sl::DataType* data_type); + /*! 
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.h b/src/relay/backend/contrib/ethosn/ethosn_api.h
index 34af7ce0b1d8..20fe8bec03c6 100644
--- a/src/relay/backend/contrib/ethosn/ethosn_api.h
+++ b/src/relay/backend/contrib/ethosn/ethosn_api.h
@@ -44,6 +44,16 @@ namespace ethosn {
 
 namespace sl = ::ethosn::support_library;
 
+struct ConvolutionParams {
+  sl::ConvolutionInfo conv_info;
+  sl::TensorInfo activation_info;
+  sl::TensorInfo weights_info;
+  sl::TensorInfo bias_info;
+  void* raw_weights = nullptr;
+  void* raw_bias = nullptr;
+  bool is_depthwise = false;
+};
+
 struct ConcatenateParams {
   sl::QuantizationInfo qInfo;
   sl::ConcatenationInfo concat_info = sl::ConcatenationInfo(1, qInfo);
@@ -115,6 +125,8 @@ class EthosnError {
 */
 class EthosnAPI {
  public:
+  /*! \brief Extract the Support Library convolution params from an ethos-n.qnn_conv2d func */
+  static EthosnError QnnConv2d(const Expr& expr, ConvolutionParams* params);
   /*! \brief Extract the Support Library concatenate params from a Relay qnn.concatenate call */
   static EthosnError Concatenate(const Expr& expr, ConcatenateParams* params);
   /*! \brief Extract the Support Library split params from a Relay split call */
@@ -125,6 +137,16 @@ class EthosnAPI {
   static EthosnError Tvm2Npu(const Array<IndexExpr>& shape, sl::TensorShape* npu_shape);
   /*! \brief Convert a TVM data type to a SL data type */
   static EthosnError Tvm2Npu(const tvm::DataType& dtype, sl::DataType* data_type);
+  /*! \brief Convert TVM 1D padding to SL padding */
+  static EthosnError Tvm2Npu(const Array<IndexExpr>& padding, sl::Padding* npu_padding);
+  /*! \brief Convert TVM 1D striding to SL striding */
+  static EthosnError Tvm2Npu(const Array<IndexExpr>& strides, sl::Stride* npu_stride);
+  /*! \brief Convert TVM data format to SL data format */
+  static EthosnError Tvm2Npu(const std::string& dformat, sl::DataFormat* data_format);
+  /*! \brief Convert TVM quantization info to SL quantization info */
+  static EthosnError Tvm2Npu(int32_t zero_point, float scale, sl::QuantizationInfo* npu_qinfo);
+  /*! \brief Convert TVM 2D padding to SL padding */
+  static EthosnError Tvm2Npu(const Array<Array<Integer>>& padding, sl::Padding* npu_padding);
 
   // Convert an array of IntImmNodes into ValueT
   // IndexT type of Array indexing variable
diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/gradient.cc
index 7894c34de55d..9c472542cc91 100644
--- a/src/relay/transforms/gradient.cc
+++ b/src/relay/transforms/gradient.cc
@@ -72,7 +72,7 @@ Type WithGradientType(const Type&);
 Expr FirstOrderGradient(const Expr& e, const Optional<IRModule>& mod);
 
 Type WithGradientType(const Type& t) {
-  // TODO(M.K.): stricter checking
+  // TODO(@M.K.): stricter checking
   auto ty = t.as<FuncTypeNode>();
   CHECK(ty) << "input should be a function";
   return FuncType(ty->arg_types, TupleType({ty->ret_type, TupleType(ty->arg_types)}), {}, {});
@@ -85,7 +85,7 @@ Expr DeGlobal(const Optional<IRModule>& mod, const Expr& e) {
   if (mod.defined() && x) {
     BaseFunc base_func = mod.value()->Lookup(GetRef<GlobalVar>(x));
     if (auto* n = base_func.as<FunctionNode>()) {
-      return n->body;
+      return GetRef<Function>(n);
     } else {
       return e;
     }
@@ -338,11 +338,22 @@ Expr FirstOrderGradient(const Expr& re, const Optional<IRModule>& mod) {
 
 TVM_REGISTER_GLOBAL("relay._transform.first_order_gradient").set_body_typed(FirstOrderGradient);
 
+Type bpt = RelayRefType(FuncType({}, TupleType(Array<Type>()), {}, {}));
+
 struct ReverseADType : TypeMutator {
   Type VisitType_(const TensorTypeNode* ttn) final {
     Type t = GetRef<Type>(ttn);
     return TupleType({t, RelayRefType(t)});
   }
+
+  Type VisitType_(const FuncTypeNode* ftn) final {
+    std::vector<Type> arg_types;
+    for (const auto& t : ftn->arg_types) {
+      arg_types.push_back(VisitType(t));
+    }
+    arg_types.push_back(bpt);
+    return FuncType(arg_types, ftn->ret_type, ftn->type_params, ftn->type_constraints);
+  }
 };
 
 Type ReverseType(const Type& t) { return ReverseADType()(t); }
@@ -438,12 +449,18 @@ Expr BPEmpty() {
 
 struct ReverseAD : ExprMutator {
   using ADVarMap = std::unordered_map<Var, Var, ObjectPtrHash, ObjectPtrEqual>;
-
+  using ADGlobalVarMap = std::unordered_map<GlobalVar, GlobalVar, ObjectPtrHash, ObjectPtrEqual>;
+  Optional<IRModule> mod;
+  // TODO(@M.K.) refactor AD to always use mod.
   Var bp;
   std::shared_ptr<ADVarMap> ad_vars;
+  std::shared_ptr<ADGlobalVarMap> ad_gvars;
   const OpAttrMap<FPrimalGradient> rev_map = Op::GetAttrMap<FPrimalGradient>("FPrimalGradient");
 
-  explicit ReverseAD(const Var& bp, std::shared_ptr<ADVarMap> ad_vars) : bp(bp), ad_vars(ad_vars) {}
+  explicit ReverseAD(const Optional<IRModule>& mod, const Var& bp,
+                     const std::shared_ptr<ADVarMap>& ad_vars,
+                     const std::shared_ptr<ADGlobalVarMap>& ad_gvars)
+      : mod(mod), bp(bp), ad_vars(ad_vars), ad_gvars(ad_gvars) {}
 
   Expr VisitExpr_(const OpNode* op) final {
     LOG(FATAL) << "op should only be inside call";
@@ -481,9 +498,8 @@ struct ReverseAD : ExprMutator {
         Expr nbp = Function({}, LetList::With([&](LetList* ll) {
                      // we need a new ReverseAD visitor to avoid clobbering the bp local var
                      auto dup_bp = ll->Push(BPEmpty());
-                     ReverseAD dup_diff(dup_bp, ad_vars);
-                     auto dup_ad = ll->Push(dup_diff.VisitExpr(DeDup(x)));
-
+                     auto dup_ad =
+                         ll->Push(ReverseAD(mod, dup_bp, ad_vars, ad_gvars)(DeDup(x)));
                      TransferGrads(call->checked_type(), ret, dup_ad, ll);
                      ll->Push(Call(RefRead(dup_bp), {}));
                      return Call(bpv, {});
@@ -518,22 +534,29 @@ struct ReverseAD : ExprMutator {
       orig_var->checked_type_ = call->checked_type();
       auto ret = ll->Push(GetRev(call->checked_type(), orig_var, ll));
       auto bpv = ll->Push(RefRead(bp));
-      Expr nbp = Function({}, LetList::With([&](LetList* ll) {
-                   tvm::Array<Expr> rev =
-                       rev_map[op_ref](orig, GetGrad(call->checked_type(), ret, ll));
-                   CHECK(args.size() == rev.size());
-                   for (size_t i = 0; i < args.size(); ++i) {
-                     UpdateGrad(call->args[i]->checked_type(), args[i], rev[i], ll);
-                   }
-                   return Call(bpv, {});
-                 }),
-                 TupleType::Empty(), {});
+      Expr nbp_body = LetList::With([&](LetList* ll) {
+        tvm::Array<Expr> rev = rev_map[op_ref](orig, GetGrad(call->checked_type(), ret, ll));
+        CHECK(args.size() == rev.size());
+        for (size_t i = 0; i < args.size(); ++i) {
+          UpdateGrad(call->args[i]->checked_type(), args[i], rev[i], ll);
+        }
+        return Call(bpv, {});
+      });
+      Expr nbp = Function({}, nbp_body, TupleType::Empty(), {});
       ll->Push(RefWrite(bp, transform::ToANormalForm(nbp)));
       // TODO(@M.K.): ToANF should be called on rev. Enhance ToANF for that.
       return ret;
     });
+  } else if (call->op.as<ConstructorNode>()) {
+    return ExprMutator::VisitExpr_(call);
+  } else {
+    std::vector<Expr> args;
+    for (const auto& arg : call->args) {
+      args.push_back(VisitExpr(arg));
+    }
+    args.push_back(bp);
+    return Call(VisitExpr(call->op), args);
   }
-  return ExprMutator::VisitExpr_(call);
 }
 
   Expr VisitExpr_(const ConstantNode* op) final {
@@ -559,6 +582,39 @@ struct ReverseAD : ExprMutator {
     return ad_vars->at(var_ref);
   }
 
+  Expr VisitExpr_(const GlobalVarNode* op) final {
+    // todo: concatenating a string to add an attribute seems like a brittle hack.
+    // maybe index the module by a rose tree of strings?
+    CHECK(mod.defined());
+    auto orig_gv = GetRef<GlobalVar>(op);
+    if (ad_gvars->count(orig_gv) == 0) {
+      GlobalVar gv(op->name_hint + "_grad");
+      (*ad_gvars)[orig_gv] = gv;
+      Function orig_f = Downcast<Function>(DeDup(mod.value()->Lookup(orig_gv)));
+      std::vector<Var> params;
+      for (const auto& p : orig_f->params) {
+        params.push_back(Downcast<Var>(VisitExpr(p)));
+      }
+      params.push_back(bp);
+      Expr body = VisitExpr(orig_f->body);
+      Function f(params, body, VisitType(orig_f->ret_type), orig_f->type_params, orig_f->attrs);
+      mod.value()->Add(gv, f);
+    }
+    return ad_gvars->at(orig_gv);
+  }
+
+  Expr VisitExpr_(const FunctionNode* op) final {
+    std::vector<Var> params;
+    for (const auto& var : op->params) {
+      params.push_back(Downcast<Var>(VisitExpr(var)));
+    }
+    auto new_bp = Var("bp", bpt);
+    params.push_back(new_bp);
+    return Function(params, ReverseAD(mod, new_bp, ad_vars, ad_gvars)(op->body),
+                    VisitType(op->ret_type), op->type_params, op->attrs);
+  }
+
   Type VisitType(const Type& t) final { return t.defined() ? ReverseType(t) : t; }
 };
 
@@ -604,12 +660,16 @@ Expr Gradient(const Expr& re, const Optional<IRModule>& mod) {
   }
   CHECK(!MissingGrad(e)) << "input has operators with missing gradients";
   Expr body = LetList::With([&](LetList* ll) {
-    Var bp = ll->Push(BPEmpty());
-    Expr rev = ReverseAD(bp, std::make_shared<ReverseAD::ADVarMap>())(e);
-    std::vector<Expr> args;
+    Var bp = ll->Push(BPEmpty(), bpt);
+    Expr rev = ReverseAD(mod, bp, std::make_shared<ReverseAD::ADVarMap>(),
+                         std::make_shared<ReverseAD::ADGlobalVarMap>())(e);
+    std::vector<Expr> normal_args, args;
     for (const auto& p : f->params) {
-      args.push_back(ll->Push(Pair(p, RefCreate(ZerosLike(p)))));
+      auto x = ll->Push(Pair(p, RefCreate(ZerosLike(p))));
+      normal_args.push_back(x);
+      args.push_back(x);
     }
+    args.push_back(bp);
     auto c = ll->Push(Call(rev, args));
     std::function<void(const Expr&, const Type&)> init_grad;
     init_grad = [&](const Expr& e, const Type& t) {
@@ -626,7 +686,7 @@ Expr Gradient(const Expr& re, const Optional<IRModule>& mod) {
     init_grad(c, f->body->checked_type());
     ll->Push(Call(RefRead(bp), {}));
     std::vector<Expr> ret;
-    for (const auto& a : args) {
+    for (const auto& a : normal_args) {
       ret.push_back(RefRead(GetField(a, 1)));
     }
     std::function<Expr(const Expr&, const Type&)> get_final_result;
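Not part of the patch: a hedged sketch of what the gradient.cc changes enable, namely differentiating a main function that calls another global function (the visitor above generates a `@double_grad` counterpart). The shapes are illustrative.

```python
import tvm
from tvm import relay

mod = tvm.IRModule()
t = relay.TensorType((4,), "float32")

x = relay.var("x", t)
double = relay.GlobalVar("double")
mod[double] = relay.Function([x], x + x)

y = relay.var("y", t)
mod["main"] = relay.Function([y], double(y))
mod = relay.transform.InferType()(mod)

# The GlobalVar visitor above is exercised by the call to @double.
back = relay.transform.gradient(mod["main"], mod=mod)
```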
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index c6278334cfec..b43d273553e5 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -94,6 +94,8 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
         f = relay.build_module.bind_params_by_name(mod["main"], params)
         mod = tvm.IRModule()
         mod["main"] = f
+        pattern = get_pattern_table("ethos-n")
+        mod = relay.transform.MergeComposite(pattern)(mod)
         mod = relay.transform.AnnotateTarget("ethos-n")(mod)
         mod = relay.transform.MergeCompilerRegions()(mod)
         mod = relay.transform.PartitionGraph()(mod)
diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py
new file mode 100644
index 000000000000..52e3de94eb4d
--- /dev/null
+++ b/tests/python/contrib/test_ethosn/test_conv2d.py
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Ethos-N integration conv2d tests"""
+
+import numpy as np
+import math
+import tvm
+from tvm import relay
+from tvm.relay.op.contrib.ethosn import ethosn_available
+from . import infrastructure as tei
+
+
+def _get_same_padding(data, kernel, dilation, stride):
+    dilated_kernel_h = dilation[0] * (kernel[0] - 1) + 1
+    dilated_kernel_w = dilation[1] * (kernel[1] - 1) + 1
+    out = int(math.ceil(float(data[0]) / float(stride[0])))
+    pad = max(0, (out - 1) * stride[0] + dilated_kernel_h - data[0])
+    pad_top = pad // 2
+    pad_bottom = pad - pad_top
+
+    out = int(math.ceil(float(data[1]) / float(stride[1])))
+    pad = max(0, (out - 1) * stride[1] + dilated_kernel_w - data[1])
+    pad_left = pad // 2
+    pad_right = pad - pad_left
+    return [pad_top, pad_left, pad_bottom, pad_right]
+
+
+def _get_model(shape, kernel_h, kernel_w,
+               input_zp, input_sc,
+               kernel_zp, kernel_sc,
+               output_zp, output_sc,
+               pad, strides, dilation,
+               groups, dtype,
+               out_channels, weight_format):
+    """Return a model and any parameters it may have"""
+    a = relay.var("a", shape=shape, dtype=dtype)
+    if pad == "op" or pad == "both":
+        p = _get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+        a = relay.nn.pad(a,
+                         pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
+                         pad_value=input_zp, pad_mode="constant")
+        shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
+
+    p = _get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+    if weight_format == "HWIO":
+        weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
+    else:
+        weight_shape = (kernel_h, kernel_w, out_channels, 1)
+    w = tvm.nd.array(np.random.randint(np.iinfo(dtype).min, high=np.iinfo(dtype).max,
+                                       size=weight_shape, dtype=dtype))
+    weights = relay.const(w, dtype)
+    conv = relay.qnn.op.conv2d(
+        a,
+        weights,
+        input_zero_point=relay.const(input_zp, "int32"),
+        kernel_zero_point=relay.const(kernel_zp, "int32"),
+        input_scale=relay.const(input_sc, "float32"),
+        kernel_scale=relay.const(kernel_sc, "float32"),
+        kernel_size=(kernel_h, kernel_w),
+        data_layout="NHWC",
+        kernel_layout=weight_format,
+        dilation=dilation,
+        strides=strides,
+        groups=groups,
+        channels=out_channels,
+        padding=p if pad == "attr" or pad == "both" else (0, 0, 0, 0),
+        out_dtype="int32",
+    )
+    b = tvm.nd.array(np.random.randint(0, high=10, size=(out_channels,), dtype="int32"))
+    biasc = relay.const(b, "int32")
+    bias = relay.nn.bias_add(conv, biasc, axis=3)
+    req = relay.qnn.op.requantize(
+        bias,
+        relay.const(input_sc * kernel_sc, 'float32'),  # input scale
+        relay.const(0, 'int32'),                       # input zero point
+        relay.const(output_sc, 'float32'),             # output scale
+        relay.const(output_zp, 'int32'),               # output zero point
+        out_dtype="uint8"
+    )
+    params = {"w": w,
+              "b": b}
+    return req, params
+
+
+def _get_conv2d_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, channels):
+    input_max = input_sc * (255 - input_zp)
+    input_min = -input_sc * input_zp
+    kernel_max = kernel_sc * (255 - kernel_zp)
+    kernel_min = -kernel_sc * kernel_zp
+    output_limits = [kernel_max * kernel_h * kernel_w * channels * input_max,
+                     kernel_min * kernel_h * kernel_w * channels * input_max,
+                     kernel_min * kernel_h * kernel_w * channels * input_min,
+                     kernel_max * kernel_h * kernel_w * channels * input_min]
+    output_max = max(output_limits)
+    output_min = min(output_limits)
+    output_sc = (output_max - output_min) / 255
+    output_zp = -int(output_min / output_sc)
+    return output_zp, output_sc
+
+
+def test_conv2d():
+    if not ethosn_available():
+        return
+
+    trials = [
+        [(1, 17, 20, 26), 4, 3, 1, 'attr', (2, 2), (1, 1)],
+        [(1, 30, 27, 30), 5, 5, 3, 'none', (1, 1), (1, 1)],
+        [(1, 14, 28, 11), 6, 2, 2, 'op', (2, 2), (1, 1)],
+        [(1, 9, 20, 30), 7, 1, 5, 'none', (1, 1), (1, 1)],
+        [(1, 21, 21, 22), 8, 5, 1, 'attr', (2, 2), (1, 1)],
+        [(1, 21, 25, 29), 9, 2, 5, 'op', (1, 1), (1, 1)],
+        [(1, 31, 28, 15), 10, 1, 2, 'attr', (2, 2), (1, 1)],
+        [(1, 21, 21, 8), 11, 3, 3, 'none', (1, 1), (1, 1)],
+        [(1, 5, 11, 6), 12, 5, 2, 'op', (2, 2), (1, 1)],
+        [(1, 12, 7, 18), 13, 1, 3, 'op', (1, 1), (1, 1)],
+        [(1, 24, 6, 26), 14, 3, 5, 'none', (2, 2), (1, 1)],
+        [(1, 19, 24, 16), 15, 2, 1, 'attr', (1, 1), (1, 1)],
+    ]
+
+    np.random.seed(0)
+    for depthwise in [False, True]:
+        for shape, out_channels, kernel_h, kernel_w, pad, stride, dilation in trials:
+            if depthwise:
+                out_channels = shape[3]
+                groups = out_channels
+                kernel_w = kernel_h
+                weight_format = "HWOI"
+                stride = (1, 1) if kernel_w == 1 else (2, 2)
+            else:
+                groups = 1
+                weight_format = "HWIO"
+
+            outputs = []
+            inputs = {
+                "a": tvm.nd.array(np.random.randint(0, high=255, size=shape, dtype="uint8")),
+            }
+            input_zp = np.random.randint(0, 255)
+            input_sc = np.random.random() * 2
+            kernel_zp = np.random.randint(0, 255)
+            kernel_sc = np.random.random() * 2
+            output_zp, output_sc = _get_conv2d_qnn_params(input_zp, input_sc,
+                                                          kernel_zp, kernel_sc,
+                                                          kernel_h, kernel_w, shape[3])
+            model, params = _get_model(shape, kernel_h, kernel_w,
+                                       input_zp, input_sc,
+                                       kernel_zp, kernel_sc,
+                                       output_zp, output_sc,
+                                       pad, stride, dilation,
+                                       groups, "uint8",
+                                       out_channels, weight_format)
+            for npu in [False, True]:
+                mod = tei.make_module(model, params)
+                outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+
+            tei.verify(outputs, 1)
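+
+
+def test_conv2d_qnn_params_example():
+    # Hand-worked sanity check of _get_conv2d_qnn_params, added for
+    # illustration (the numbers below are chosen by hand). The four
+    # output_limits cover every sign combination of the input/kernel extremes,
+    # so the derived scale is wide enough that no accumulated value saturates.
+    # With input_zp=0, input_sc=1, kernel_zp=0, kernel_sc=1 and a 2x2 kernel
+    # over 1 channel, both ranges are [0, 255]; the only non-zero limit is
+    # 255 * 2 * 2 * 1 * 255 = 260100, giving output_sc = 260100 / 255 = 1020.0
+    # and output_zp = 0.
+    assert _get_conv2d_qnn_params(0, 1, 0, 1, 2, 2, 1) == (0, 1020.0)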
+
+
+def test_conv2d_failure():
+    if not ethosn_available():
+        return
+
+    trials = [
+        ((1, 4, 4, 4), 1, 1, 0, 1, 0, 1, 0, 1, "none", (1, 1), (1, 1), 1, "uint8", 8, "HWIO",
+         "Overall scale (of the input * weights / output) should be in the range [0, 1)"),
+        ((1, 4, 4, 4), 1, 1, 0, 1, 0, 1, 0, 1, "none", (1, 1), (1, 1), 1, "int8", 8, "HWIO",
+         "dtype='int8', dtype must be either uint8 or int32"),
+        ((1, 4, 4, 4), 2, 2, 0, 1, 0, 1, 0, 2, "both", (1, 1), (1, 1), 1, "uint8", 8, "HWIO",
+         "both op and attr padding exist, must be either op/attr only or no padding"),
+        ((1, 4, 4, 4), 1, 1, 0, 1, 0, 1, 0, 2, "none", (1, 1, 1), (1, 1), 1, "uint8", 8, "HWIO",
+         "stride size=3, stride size must = 2"),
+        ((1, 4, 4, 4), 1, 1, 0, 1, 0, 1, 0, 2, "none", (1, 1), (2, 1), 1, "uint8", 8, "HWIO",
+         "dilation=[2, 1], dilation must = [1, 1]"),
+        ((2, 4, 4, 4), 1, 1, 0, 1, 0, 1, 0, 2, "none", (1, 1), (1, 1), 1, "uint8", 8, "HWIO",
+         "batch size=2, batch size must = 1"),
+    ]
+
+    np.random.seed(0)
+    for shape, kernel_h, kernel_w, input_zp, input_sc, kernel_zp,\
+        kernel_sc, output_zp, output_sc, pad, stride, dilation,\
+        groups, dtype, out_channels, weight_format, err_msg in trials:
+        model, params = _get_model(shape, kernel_h, kernel_w,
+                                   input_zp, input_sc,
+                                   kernel_zp, kernel_sc,
+                                   output_zp, output_sc,
+                                   pad, stride, dilation,
+                                   groups, dtype,
+                                   out_channels, weight_format)
+        model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d")
+        mod = tei.make_ethosn_partition(model)
+        tei.test_error(mod, {}, err_msg)
diff --git a/tests/python/relay/test_pass_gradient.py b/tests/python/relay/test_pass_gradient.py
index 296d3e5e9354..b239ef4fc4a6 100644
--- a/tests/python/relay/test_pass_gradient.py
+++ b/tests/python/relay/test_pass_gradient.py
@@ -21,6 +21,7 @@
 import tvm
 from tvm import te
 from tvm import relay
+from tvm.relay import GlobalVar
 from tvm.relay.analysis import free_vars, free_type_vars
 from tvm.relay import create_executor, transform
 from tvm.relay.transform import gradient
@@ -29,7 +30,7 @@
 import tvm.relay.op as op
 
 
-def test_id():
+def test_fo_id():
     shape = (10, 10)
     dtype = 'float32'
     t = relay.TensorType(shape, dtype)
@@ -44,6 +45,21 @@
     tvm.testing.assert_allclose(forward.asnumpy(), x.asnumpy())
     tvm.testing.assert_allclose(grad.asnumpy(), np.ones_like(x.asnumpy()))
 
+def test_id():
+    shape = (10, 10)
+    dtype = 'float32'
+    t = relay.TensorType(shape, dtype)
+    x = relay.var("x", t)
+    func = relay.Function([x], x)
+    func = run_infer_type(func)
+    back_func = run_infer_type(gradient(func))
+    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
+    ex = create_executor()
+    x = rand(dtype, *shape)
+    forward, (grad,) = ex.evaluate(back_func)(x)
+    tvm.testing.assert_allclose(forward.asnumpy(), x.asnumpy())
+    tvm.testing.assert_allclose(grad.asnumpy(), np.ones_like(x.asnumpy()))
+
 
 def test_relu():
     shape = (10, 10)
@@ -341,5 +357,28 @@ def test_no_duplication():
     counts = count_ops(gr)
     assert counts['nn.dense'] == 3, "We expect 3 dense (1 forward, two backward)"
 
+
+def test_global_function():
+    m = tvm.IRModule()
+    shape = (10, 10)
+    dtype = 'float32'
+    t = relay.TensorType(shape, dtype)
+    x = relay.Var('x', t)
+    d = GlobalVar('double')
+    m[d] = relay.Function([x], x + x)
+    y = relay.Var('y', t)
+    q = GlobalVar('q')
+    m[q] = relay.Function([y], d(d(y)))
+    g = GlobalVar('grad')
+    m[g] = tvm.relay.transform.gradient(q, m)
+    back_func = m[g]
+    assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])]))
+    ex = create_executor(mod=m)
+    x = rand(dtype, *shape)
+    forward, (grad,) = ex.evaluate(back_func)(x)
+    tvm.testing.assert_allclose(forward.asnumpy(), 4 * x.asnumpy())
+    tvm.testing.assert_allclose(grad.asnumpy(), 4 * np.ones_like(x.asnumpy()))
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
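The expected values in test_global_function follow directly from the chain rule through the two calls to the global function: double computes d(x) = x + x = 2x, so q(y) = d(d(y)) = 4y. The forward result is therefore 4x, and since each application of double contributes a factor of 2 to the Jacobian, dq/dy = 2 * 2 = 4 elementwise, which is exactly what the two assert_allclose checks verify.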
diff --git a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py
new file mode 100644
index 000000000000..f06f06ac73c0
--- /dev/null
+++ b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test evolutionary search."""
+
+import tvm
+from test_auto_scheduler_common import matmul_auto_scheduler_test
+from tvm import auto_scheduler
+from tvm.auto_scheduler.cost_model.cost_model import PythonBasedModel
+
+
+class MockCostModel(PythonBasedModel):
+    """A mock cost model that rates 1 only for the states with tile_k=2."""
+    def predict(self, task, states):
+        scores = []
+        for state in states:
+            # Reset the flag per state so that only matching states score 1.
+            found = False
+            for line in str(state).split('\n'):
+                if line.find('k.1') != -1 and line.find('(0,2)') != -1:
+                    found = True
+                    break
+            scores.append(1 if found else 0)
+        return scores
+
+
+def test_evo_search():
+    """Test evolutionary search. Since we cannot mock the random number generator,
+    we mock the cost model to guide the search instead. If evolutionary search works
+    as expected, it should find the target state after a sufficient number of
+    iterations. This test passed 1,000 consecutive runs, suggesting a flakiness
+    rate below 0.1%.
+    """
+    workload_key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (10, 10, 4))
+    dag = auto_scheduler.ComputeDAG(workload_key)
+    task = auto_scheduler.SearchTask(dag, workload_key, tvm.target.create('llvm'))
+    policy = auto_scheduler.SketchPolicy(task, schedule_cost_model=MockCostModel(), verbose=0)
+    states = policy.sample_initial_population(50)
+
+    # Remove all tile_k=2 states and expect evolutionary search to find them again.
+    pruned_states = []
+    for state in states:
+        found = False
+        for line in str(state).split('\n'):
+            if line.find('k.1') != -1 and line.find('(0,2)') != -1:
+                found = True
+                break
+        if not found:
+            pruned_states.append(state)
+
+    new_states = policy.evolutionary_search(pruned_states, 50)
+
+    # Check that evolutionary search found at least one state with tile_k=2.
+    found = False
+    for state in new_states:
+        for line in str(state).split('\n'):
+            if line.find('k.1') != -1 and line.find('(0,2)') != -1:
+                found = True
+                break
+        if found:
+            break
+    assert found
+
+
+if __name__ == "__main__":
+    test_evo_search()
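The tile_k=2 probe in this test is written out three times; a small helper along these lines (hypothetical, not part of the patch) would keep the three checks in sync:

    # Hypothetical helper: true when a state splits the reduction axis k by 2,
    # i.e. its string form contains both the 'k.1' axis and the '(0,2)' extent.
    def _has_tile_k2(state):
        return any('k.1' in line and '(0,2)' in line
                   for line in str(state).split('\n'))

    # e.g.: pruned_states = [s for s in states if not _has_tile_k2(s)]
    #       assert any(_has_tile_k2(s) for s in new_states)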