diff --git a/.github/workflows/buildAndTest.yml b/.github/workflows/buildAndTest.yml
new file mode 100644
index 00000000000..4840d92f97e
--- /dev/null
+++ b/.github/workflows/buildAndTest.yml
@@ -0,0 +1,77 @@
+# Continuous integration workflow: restore (or build) LLVM, then build and
+# test Phism.
+name: Build and Test
+# Controls when the workflow runs: on every push and on every pull request
+# event. No branch filter is configured, so all branches are covered.
+on: [push, pull_request]
+# A workflow run is made up of one or more jobs that can run sequentially or
+# in parallel.
+jobs:
+  # Build Phism and run its tests.
+  build-phism:
+    name: Build and Test Phism
+    runs-on: self-hosted
+    steps:
+      # - name: Configure Environment
+      #   run: echo "${GITHUB_WORKSPACE}/llvm/install/bin" >> $GITHUB_PATH
+      # Disabled for self-hosted
+      # - name: Get dependences
+      #   run: |
+      #     sudo apt-get update -y
+      #     sudo apt-get install -y build-essential libtool autoconf pkg-config flex bison libgmp-dev clang-9 libclang-9-dev texinfo python3
+      # - name: Update the LLVM/Clang version to 9
+      #   run: |
+      #     sudo update-alternatives --install /usr/bin/llvm-config llvm-config /usr/bin/llvm-config-9 100
+      #     sudo update-alternatives --install /usr/bin/FileCheck FileCheck /usr/bin/FileCheck-9 100
+
+      # Clone the Phism repo and its submodules. Do shallow clone to save clone
+      # time.
+      - name: Get Phism
+        uses: actions/checkout@v4
+        with:
+          submodules: "true"
+
+      # --------
+      # Restore LLVM from cache and build if it's not in there.
+      # --------
+      # Extract the LLVM submodule hash for use in the cache key. The value is
+      # published as the step output `hash` through the GITHUB_OUTPUT file
+      # (the `::set-output` workflow command is deprecated).
+      - name: Get LLVM Hash
+        id: get-llvm-hash
+        run: |
+          echo "hash=$(git rev-parse @:./llvm)" >> "$GITHUB_OUTPUT"
+        shell: bash
+      # Try to fetch LLVM from the cache.
+      - name: Cache LLVM
+        id: cache-llvm
+        uses: actions/cache@v4
+        with:
+          path: llvm/build
+          key: ${{ runner.os }}-llvm-${{ steps.get-llvm-hash.outputs.hash }}
+      # Build LLVM only when the cache lookup above missed.
+      # The test directory is deleted afterwards so that it is not cached.
+      - name: Rebuild and Install LLVM
+        if: steps.cache-llvm.outputs.cache-hit != 'true'
+        run: |
+          ./scripts/build-llvm.sh ci
+          rm -rf ./llvm/build/test
+      # --------
+      # Build and test Phism (build-phism.sh in its `ci` configuration).
+      # --------
+      - name: Build and Test Phism (Assert)
+        run: |
+          ./scripts/build-phism.sh ci
+
+      # Build and test Phism with pb-flow. Each invocation enables one more
+      # option on top of the previous one; Vitis is skipped in all of them.
+      - name: Build and Test Phism (pb-flow)
+        run: |
+          python3 -m venv env
+          source env/bin/activate
+          which python3
+          python3 -m pip install -r requirements.txt
+          python3 ./scripts/pb-flow.py ./example/polybench --dataset SMALL --skip-vitis
+          python3 ./scripts/pb-flow.py ./example/polybench --dataset SMALL --polymer --skip-vitis
+          python3 ./scripts/pb-flow.py ./example/polybench --dataset SMALL --polymer --loop-transforms --skip-vitis
+          python3 ./scripts/pb-flow.py ./example/polybench --dataset SMALL --polymer --loop-transforms --array-partition --skip-vitis
diff --git a/include/phism/mlir/Transforms/Utils.h b/include/phism/mlir/Transforms/Utils.h
new file mode 100644
index 00000000000..f1d09726276
--- /dev/null
+++ b/include/phism/mlir/Transforms/Utils.h
@@ -0,0 +1,10 @@
+//===- Utils.h - Utility functions ------------------ C++-===//
+//
+// Declarations of helper functions shared between the Phism MLIR transforms.
+//
+//===------------------------------------------------------===//
+
+#ifndef PHISM_MLIR_TRANSFORMS_UTILS_H
+#define PHISM_MLIR_TRANSFORMS_UTILS_H
+
+#include "mlir/IR/BuiltinOps.h"
+
+namespace phism {
+
+/// Get the top function for the hardware design.
+///
+/// \param m the module to search in.
+/// \returns the top function found in `m`.
+// NOTE(review): what is returned when no top function exists is decided by the
+// definition, which is not visible from this header -- confirm before relying
+// on a non-null result.
+mlir::FuncOp getTopFunction(mlir::ModuleOp m);
+
+} // namespace phism
+
+#endif // PHISM_MLIR_TRANSFORMS_UTILS_H
diff --git a/lib/llvm/Transforms/VhlsLLVMRewriter.cc b/lib/llvm/Transforms/VhlsLLVMRewriter.cc
index da51844e5cd..3ceea0036a2 100644
--- a/lib/llvm/Transforms/VhlsLLVMRewriter.cc
+++ b/lib/llvm/Transforms/VhlsLLVMRewriter.cc
@@ -16,17 +16,57 @@
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
+#include <queue>
+
 using namespace llvm;
 
+#define DEBUG_TYPE "vhls_llvm"
+
 static cl::opt<std::string>
     XlnTop("xlntop", cl::desc("Specify the top function for Xilinx HLS."),
            cl::value_desc("topname"));
 static cl::opt<std::string>
     XlnNames("xlnnames", cl::desc("Specify the top function param names."),
              cl::value_desc("paramname"));
+/// -xlntbtclnames: file name of the tcl script written for test bench
+/// generation.
+static cl::opt<std::string> XlnTBTclNames(
+    "xlntbtclnames",
+    cl::desc(
+        "Specify the file name of the tcl script for test bench generation."),
+    cl::value_desc("tbname"));
+/// -xlntbfilesettings: file settings for the test bench (e.g. `add_files`
+/// commands). Note that the flag name differs from the variable name.
+static cl::opt<std::string> XlnTBSources(
+    "xlntbfilesettings",
+    cl::desc(
+        "Specify the file settings for the test bench, e.g. \"add_files ...\""),
+    cl::value_desc("tbfiles"));
+/// -xln-ap-enabled: tells the passes in this file whether array partition has
+/// been enabled. No explicit initial value is given here.
+static cl::opt<bool> XlnArrayPartitionEnabled(
+    "xln-ap-enabled", cl::desc("Whether array partition has been enabled"));
+
+/// Return the sign-extended value held by a 64-bit integer constant.
+///
+/// \param value must be a ConstantInt whose bit width is 64. Both conditions
+///        are enforced through assertions, so a violation aborts in builds
+///        that have assertions enabled.
+/// \returns the constant's value as a signed 64-bit integer.
+static int64_t getI64Value(Value *value) {
+  // cast<> performs the isa<> assertion itself; a dyn_cast<> here would only
+  // hide the type requirement behind a null pointer.
+  auto *CI = cast<ConstantInt>(value);
+  assert(CI->getBitWidth() == 64 && "expected a 64-bit integer constant");
+
+  return CI->getSExtValue();
+}
+
+/// Get the dimensions from the provided array type.
+///
+/// The element counts are collected from the outermost array level inwards,
+/// e.g. `[4 x [8 x i32]]` yields {4, 8}.
+///
+/// \param type the (possibly nested) array type; a null pointer is tolerated.
+/// \returns one entry per array nesting level, or an empty vector if `type`
+///          is null.
+static SmallVector<int64_t> getDimsFromArrayType(ArrayType *type) {
+  SmallVector<int64_t> dims;
+
+  // Peel off one array level per iteration. dyn_cast<> yields null as soon as
+  // the element type is no longer an array, which terminates the walk.
+  for (ArrayType *curr = type; curr;
+       curr = dyn_cast<ArrayType>(curr->getElementType()))
+    dims.push_back(curr->getNumElements());
+
+  return dims;
+}
 
 namespace {
 
@@ -109,17 +149,6 @@ class InsExtSequence {
 
     return true;
   }
-
-  /// Will abort if the Value is not a ConstantInt.
-  int64_t getI64Value(Value *value) {
-    assert(isa<ConstantInt>(value));
-
-    ConstantInt *CI = dyn_cast<ConstantInt>(value);
-    assert(CI->getBitWidth() == 64);
-
-    return CI->getSExtValue();
-  }
-
   /// From the scalar offset to a set of offset for each dim.
   /// There should be in total dimSize * 2 expressions.
   /// Dim size will be dims.size() * 2 since the dims in this Seq only has the
@@ -129,6 +158,12 @@ class InsExtSequence {
     if (isa<ConstantInt>(offset))
       return;
 
+    LLVM_DEBUG({
+      dbgs() << "\n--------------------------------------\n";
+      dbgs() << "Processing offset for target pointer: \n";
+      ptr->dump();
+    });
+
     SmallVector<Value *> offsets;
     SmallVector<int64_t> strides;
 
@@ -137,7 +172,6 @@ class InsExtSequence {
     unsigned dimSize = dims.size() * 2;
     for (unsigned i = 0; i < dimSize; ++i) {
       binOp = cast<BinaryOperator>(curr);
-      // binOp->dump();
       assert(binOp->getOpcode() == BinaryOperator::Add);
 
       Value *lhs = binOp->getOperand(0), *rhs = binOp->getOperand(1);
@@ -167,11 +201,30 @@ class InsExtSequence {
     assert(offsets.size() == strides.size());
     assert(strides[0] == 1);
 
+    LLVM_DEBUG({
+      dbgs() << "Offsets: \n";
+      for (Value *offset : offsets)
+        offset->dump();
+      dbgs() << "Strides: ";
+      interleave(
+          strides, [&](const int64_t &stride) { dbgs() << stride; },
+          [&]() { dbgs() << ", "; });
+      dbgs() << "\n\n";
+    });
+
     SmallVector<int64_t> partialDims;
     for (unsigned i = 1; i < strides.size(); ++i)
       partialDims.push_back(strides[i] / strides[i - 1]);
     assert(partialDims.size() == dimSize - 1);
 
+    LLVM_DEBUG({
+      dbgs() << "Partial dims:\n";
+      interleave(
+          partialDims, [&](const int64_t &v) { dbgs() << v; },
+          [&]() { dbgs() << ", "; });
+      dbgs() << "\n";
+    });
+
     std::reverse(partialDims.begin(), partialDims.end());
     std::reverse(offsets.begin(), offsets.end());
 
@@ -191,11 +244,26 @@ class InsExtSequence {
     LoadInst *load = new LoadInst(
         cast<PointerType>(restoredType)->getElementType(), bitCastInst,
         Twine(""), cast<Instruction>(bitCastInst->getNextNode()));
-    GetElementPtrInst *gep = GetElementPtrInst::Create(
-        rankedArrType, load, {offsets[0], offsets[1]}, Twine(""),
-        cast<Instruction>(load->getNextNode()));
+
+    SmallVector<Value *> gepInds;
+    for (unsigned i = 0; i < offsets.size() / 2; ++i)
+      gepInds.push_back(offsets[i]);
+    GetElementPtrInst *gep =
+        GetElementPtrInst::Create(rankedArrType, load, gepInds, Twine(""),
+                                  cast<Instruction>(load->getNextNode()));
     ptr = new BitCastInst(gep, ptr->getType(), Twine(""),
                           cast<Instruction>(gep)->getNextNode());
+
+    LLVM_DEBUG({
+      dbgs() << "Created the following instructions:\n";
+      bitCastInst->dump();
+      load->dump();
+      gep->dump();
+      ptr->dump();
+
+      dbgs() << "\nExpected result type:\n";
+      gep->getType()->dump();
+    });
   }
 
   /// Append insInst to the insertInsts list, and gather the value to be
@@ -629,6 +697,215 @@ static SmallVector<Function *> TopologicalSort(ArrayRef<Function *> funcs) {
   return sorted;
 }
 
+/// Try to collapse a "non-negative modulo" select rooted at `value` into the
+/// plain srem it is built from. See the doc from rewriteModuloGepIndices for
+/// the IR shape.
+///
+/// The pattern matches when `value` is a select whose condition is an icmp,
+/// whose true value is an add, whose false value is an srem, and both the icmp
+/// and the add take that srem as one of their operands.
+///
+/// On a match, every use of the select is redirected to the srem and the
+/// select is erased. The add and the icmp are erased as well, but only if
+/// nothing else uses them.
+///
+/// \returns the srem on success; nullptr (with the IR left untouched) if the
+///          pattern does not match.
+static Value *rewriteModulo(Value *value) {
+  auto *selectInst = dyn_cast<SelectInst>(value);
+  if (!selectInst)
+    return nullptr;
+
+  auto *icmpInst = dyn_cast<ICmpInst>(selectInst->getCondition());
+  if (!icmpInst)
+    return nullptr;
+
+  auto *addInst = dyn_cast<BinaryOperator>(selectInst->getTrueValue());
+  if (!addInst || addInst->getOpcode() != BinaryOperator::Add)
+    return nullptr;
+
+  auto *sremInst = dyn_cast<BinaryOperator>(selectInst->getFalseValue());
+  if (!sremInst || sremInst->getOpcode() != BinaryOperator::SRem)
+    return nullptr;
+
+  // The add and the icmp must both be derived from this very srem. Otherwise
+  // the select is not the modulo idiom and replacing it with the srem would
+  // change the computed value.
+  auto consumesSRem = [sremInst](Instruction *inst) {
+    return inst->getOperand(0) == sremInst || inst->getOperand(1) == sremInst;
+  };
+  if (!consumesSRem(addInst) || !consumesSRem(icmpInst))
+    return nullptr;
+
+  // Now the pattern has been matched, do the rewrite.
+  selectInst->replaceAllUsesWith(sremInst);
+
+  // Clean up. The add and the icmp may feed other instructions besides the
+  // select; an instruction that still has users must not be erased.
+  selectInst->eraseFromParent();
+  if (addInst->use_empty())
+    addInst->eraseFromParent();
+  if (icmpInst->use_empty())
+    icmpInst->eraseFromParent();
+
+  return sremInst;
+}
+
+/// Tell whether `value` is of a kind accepted as a recovered GEP index: a
+/// constant integer, a select, or a PHI node.
+static bool isValidGepIndex(Value *value) {
+  if (isa<ConstantInt>(value))
+    return true;
+  return isa<SelectInst>(value) || isa<PHINode>(value);
+}
+
+/// We trace the address calculation (mul and add) chain for the GEP index.
+///
+/// It would look like (from heat-3d) -
+///
+///     %val_9 = mul i64 %val_3, 400
+///     %val_10 = add i64 %val_9, 400 <-----  Add the offset value of 400
+///     %val_11 = mul i64 %val_5, 20
+///     %val_12 = add i64 %val_10, %val_11
+///     %val_13 = add i64 %val_12, %val_7
+///
+/// Without offset
+///
+///     %val_20 = mul i64 %val_3, 400
+///     %val_21 = mul i64 %val_5, 20
+///     %val_22 = add i64 %val_20, %val_21
+///     %val_23 = add i64 %val_22, %val_7
+///
+/// We cannot recover the indices when there is an offset at present.
+/// It will return all the found indices.
+/// The provided type argument is to verify the extracted information.
+///
+/// \param inst the GEP whose single index is analysed.
+/// \param type pointer-to-array type used for verification; its pointee is
+///        cast<> to ArrayType, so any other pointee trips that cast.
+/// \returns the recovered indices (in reversed order of recognition), or an
+///          empty vector whenever any of the checks below fails.
+static SmallVector<Value *> getGepIndices(GetElementPtrInst *inst, Type *type) {
+  LLVM_DEBUG({
+    dbgs() << "Recognizing GEP indices from ";
+    inst->dump();
+    dbgs() << "\n";
+    dbgs() << "Using type: ";
+    type->dump();
+    dbgs() << "\n\n";
+  });
+
+  // Only GEPs with exactly one (linearised) index are handled.
+  if (inst->getNumIndices() != 1) {
+    LLVM_DEBUG(dbgs() << "Given GEP has 0 or more than 1 indices.");
+    return {};
+  }
+
+  SmallVector<Value *> operands;
+  // Will use this to check with the ranked array type.
+  SmallVector<int64_t> mulDims;
+
+  // First of all, all the adders will be connected by their LHS operator.
+  // If the input is already an index.
+  if (isValidGepIndex(*inst->idx_begin())) {
+    operands.push_back(*inst->idx_begin());
+  } else {
+    // Follow operand 0 from the GEP index downwards for as long as it is an
+    // add; addInsts therefore starts with the outermost add.
+    SmallVector<BinaryOperator *> addInsts;
+    BinaryOperator *addInst = dyn_cast<BinaryOperator>(*inst->idx_begin());
+    while (addInst && addInst->getOpcode() == BinaryOperator::Add) {
+      addInsts.push_back(addInst);
+      addInst = dyn_cast<BinaryOperator>(addInst->getOperand(0));
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "Recognized adders:\n";
+      for (BinaryOperator *op : addInsts)
+        op->dump();
+      dbgs() << "\n\n";
+    });
+
+    // Every add contributes its RHS; the last (innermost) add additionally
+    // contributes its LHS, which is pushed right before its RHS.
+    for (unsigned i = 0; i < addInsts.size(); ++i) {
+      if (i == addInsts.size() - 1)
+        operands.push_back(addInsts[i]->getOperand(0));
+      operands.push_back(addInsts[i]->getOperand(1));
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "Adder operands:\n";
+      for (Value *operand : operands)
+        operand->dump();
+      dbgs() << "\n\n";
+    });
+
+    // Replace operand with multipliers.
+    // An operand of the form `mul x, C` is replaced by x and C is recorded in
+    // mulDims; operands that are not a mul are left as they are.
+    for (unsigned i = 0; i < operands.size(); ++i) {
+      BinaryOperator *mulInst = dyn_cast<BinaryOperator>(operands[i]);
+      if (!mulInst || mulInst->getOpcode() != BinaryOperator::Mul)
+        continue;
+      if (!isa<ConstantInt>(mulInst->getOperand(1))) {
+        LLVM_DEBUG({
+          dbgs() << "The RHS of a multiplied index is not a constant integer.";
+          mulInst->dump();
+        });
+        return {};
+      }
+
+      mulDims.push_back(getI64Value(mulInst->getOperand(1)));
+      operands[i] = mulInst->getOperand(0);
+    }
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "Updated operands by mul:\n";
+    for (Value *operand : operands)
+      operand->dump();
+    dbgs() << "\n\n";
+  });
+
+  // Check if every operand can be a valid GEP index.
+  for (Value *operand : operands) {
+    if (!isValidGepIndex(operand)) {
+      LLVM_DEBUG({
+        dbgs() << "Found an invalid operand:";
+        operand->dump();
+      });
+      return {};
+    }
+  }
+
+  // Finally, check whether the type matches with the parsed results.
+  ArrayType *arrayType = cast<ArrayType>(type->getPointerElementType());
+  SmallVector<int64_t> dims = getDimsFromArrayType(arrayType);
+  if (dims.size() != operands.size()) {
+    LLVM_DEBUG({
+      dbgs() << "Number of dims from the type: " << dims.size()
+             << " doesn't match the number of operands: " << operands.size()
+             << "\n";
+    });
+    return {};
+  }
+
+  // Running products of the dimensions starting from dims[1]; dims[0] is not
+  // used.
+  // NOTE(review): for the 3-d "without offset" example in the header comment
+  // the multipliers are collected as {400, 20}, while a [N x [20 x [20 x T]]]
+  // type yields {20, 400} here, so the comparison below looks like it rejects
+  // that case and only 1-d/2-d accesses pass -- confirm whether the ordering
+  // is intended.
+  SmallVector<int64_t> parDims;
+  for (unsigned i = 1; i < dims.size(); ++i)
+    parDims.push_back(dims[i] * (parDims.empty() ? 1 : parDims.back()));
+
+  LLVM_DEBUG({
+    dbgs() << "Partial dims resolved from type: ";
+    interleaveComma(parDims, dbgs());
+    dbgs() << "\nPartial dims resolved from multipliers: ";
+    interleaveComma(mulDims, dbgs());
+    dbgs() << "\n";
+  });
+
+  if (parDims != mulDims) {
+    LLVM_DEBUG(dbgs() << "Partial dims don't match.\n");
+    return {};
+  }
+
+  // The recovered operands are handed back in reversed order.
+  std::reverse(operands.begin(), operands.end());
+
+  return operands;
+}
+
+/// Simplify the modulo expressions among the given GEP indices, relying on the
+/// address of a GEP being positive.
+///
+/// Each index that is a select is handed to rewriteModulo; on success the
+/// entry in `indices` is replaced in place by the returned value, otherwise it
+/// is kept as it is.
+///
+/// For example, transform:
+///      %0 = srem i64 %arg, 32
+///      %1 = icmp slt i64 %0, 0
+///      %2 = add i64 %0, 32
+///      %3 = select i1 %1, i64 %2, i64 %0
+///
+/// to:
+///      %0 = srem i64 %arg, 32
+///
+static void rewriteModuloGepIndices(SmallVectorImpl<Value *> &indices) {
+  for (unsigned pos = 0, e = indices.size(); pos != e; ++pos) {
+    Value *&slot = indices[pos];
+
+    // Only select instructions are candidates for the rewrite.
+    if (!isa<SelectInst>(slot))
+      continue;
+
+    Value *simplified = rewriteModulo(slot);
+    if (simplified) {
+      LLVM_DEBUG({
+        dbgs() << "Rewritten index at " << pos << " to ";
+        simplified->dump();
+      });
+      slot = simplified;
+    } else {
+      LLVM_DEBUG({
+        dbgs() << "Failed to rewrite index at " << pos << " : ";
+        slot->dump();
+      });
+    }
+  }
+}
+
 /// This helper function convert the MemRef value represented by an
 /// aggregated type to a ranked N-d array. The function interface, as well
 /// as the internal usage of GEP will be updated.
@@ -680,12 +957,24 @@ static void convertMemRefToArray(Module &M, bool ranked = false) {
   // same as the original one, just have additional arguments that are
   // ranked arrays.
   for (Function *F : Funcs) {
+    LLVM_DEBUG({
+      dbgs() << "\nTransforming function:  \n\n";
+      F->dump();
+    });
     ValueToValueMapTy RankedArrVMap;
     auto &Seqs = FuncToSeqs[F];
 
+    // -----------------------------------------------------------------
+    // Step 1: create a rank-duplicated interface.
     Function *NewFunc =
         duplicateFunctionsWithRankedArrays(F, Seqs, RankedArrVMap);
+    LLVM_DEBUG({
+      dbgs() << "\nDuplicated function:  \n\n";
+      NewFunc->dump();
+    });
 
+    // -----------------------------------------------------------------
+    // Step 2: update the GEP expressions.
     SmallVector<Instruction *, 4> GEPList;
     for (BasicBlock &BB : *NewFunc)
       for (Instruction &I : BB)
@@ -695,12 +984,53 @@ static void convertMemRefToArray(Module &M, bool ranked = false) {
     // Create new GEPs that use the ranked arrays and remove the old ones.
     unsigned NumNewGEP = 0;
     for (Instruction *I : GEPList) {
-      Instruction *NewGEP =
-          duplicateGEPWithRankedArray(I, RankedArrVMap, NumNewGEP);
+      // Simplify the address calculation expressions to make Vitis happy.
+      // It is easier to work on the original GEP.
+      SmallVector<Value *> indices =
+          getGepIndices(cast<GetElementPtrInst>(I),
+                        RankedArrVMap[I->getOperand(0)]->getType());
+
+      Instruction *NewGEP;
+      if (indices.empty()) {
+        NewGEP = duplicateGEPWithRankedArray(I, RankedArrVMap, NumNewGEP);
+      } else {
+        // We will directly use the resolved indices.
+        // Try to rewrite the modulo expressions.
+        rewriteModuloGepIndices(indices);
+
+        LLVM_DEBUG({
+          dbgs() << "Indices to use: \n";
+          for (Value *index : indices)
+            index->dump();
+        });
+
+        // We can directly use the indices from the rewrite to get the new GEP.
+        /// TODO: should be more careful.
+        Value *ptr = RankedArrVMap[I->getOperand(0)];
+        assert(ptr);
+
+        indices.push_back(ConstantInt::get(indices.front()->getType(), 0));
+        std::reverse(indices.begin(), indices.end());
+
+        NewGEP = GetElementPtrInst::CreateInBounds(ptr, indices, Twine(""),
+                                                   I->getNextNode());
+        LLVM_DEBUG({
+          dbgs() << "Newly generated GEP: ";
+          NewGEP->dump();
+        });
+      }
+
       I->replaceAllUsesWith(NewGEP);
       I->eraseFromParent();
     }
 
+    LLVM_DEBUG({
+      dbgs() << "\nGEP updated function: \n\n";
+      NewFunc->dump();
+    });
+
+    // -----------------------------------------------------------------
+    // Step 3: update callers within the new function.
     // If there is any caller.
     SmallVector<CallInst *> Callers;
     for (BasicBlock &BB : *NewFunc)
@@ -728,6 +1058,12 @@ static void convertMemRefToArray(Module &M, bool ranked = false) {
         if (RankedArrVMap.count(Arg))
           Args.push_back(RankedArrVMap[Arg]);
         else if (isa<BitCastInst>(Arg)) {
+          LLVM_DEBUG({
+            dbgs() << "Found ";
+            Arg->dump();
+            dbgs() << " as a result from bitcast. Need to transform it into "
+                      "the multi-dimensional type.\n";
+          });
           // Or it is a result from a bitcast expression chain.
           // This chain is based on the instructions generated by the
           // processOffset function.
@@ -765,6 +1101,25 @@ static void convertMemRefToArray(Module &M, bool ranked = false) {
         }
       }
 
+      LLVM_DEBUG({
+        dbgs() << "Creating caller for " << FuncToNew[Callee]->getName()
+               << ", signature: ";
+        FuncToNew[Callee]->getFunctionType()->dump();
+        dbgs() << "-----------------------\n\n";
+        dbgs() << "Argument list:\n";
+        for (auto arg : enumerate(Args)) {
+          dbgs() << arg.index() << "\t-> ";
+          arg.value()->dump();
+        }
+        dbgs() << "\nArgument types:\n";
+        for (auto arg : enumerate(Args)) {
+          dbgs() << arg.index() << "\t-> ";
+          arg.value()->getType()->dump();
+          dbgs() << "\t-> ";
+          FuncToNew[Callee]->getArg(arg.index())->getType()->dump();
+        }
+      });
+
       // New caller.
       CallInst::Create(FuncToNew[Callee], Args, Twine(), Caller);
       // Erase the original caller.
@@ -997,8 +1352,9 @@ struct XilinxUnrollPass : public ModulePass {
         auto DT = llvm::DominatorTree(F);
         LoopInfo LI(DT);
 
-        for (auto &loop : LI)
-          unrollLoop(loop);
+        if (!LI.empty())
+          for (auto &loop : LI)
+            unrollLoop(loop);
       }
 
     return false;
@@ -1022,7 +1378,8 @@ getPartitionInfo(ArrayType *arrayTy) {
   } while (arrayTy);
 
   // The dimension number of arrays after Polymer should be a even number
-  assert(d % 2 == 0);
+  if (d % 2 != 0)
+    return {};
 
   partitions.resize(d / 2);
   return partitions;
@@ -1039,6 +1396,8 @@ struct XilinxArrayPartitionPass : public ModulePass {
   XilinxArrayPartitionPass() : ModulePass(ID) {}
 
   bool runOnModule(Module &M) override {
+    if (!XlnArrayPartitionEnabled)
+      return true;
 
     // Declare array partition APIs in Vitis HLS LLVM frontend
     auto mod = &M;
@@ -1080,6 +1439,149 @@ struct XilinxArrayPartitionPass : public ModulePass {
 
 } // namespace
 
+namespace {
+
+/// Generate test bench tcl script for Xilinx Vitis. This pass parses the LLVM
+/// IR and generates compatible test bench for the design in LLVM IR.
+///
+/// The script is written to the file named by -xlntbtclnames. When
+/// -xln-ap-enabled is set, one `set_directive_array_partition` line is added
+/// per partition reported by getPartitionInfo for every pointer-to-array
+/// argument of the top function (-xlntop).
+struct XilinxTBTclGenPass : public ModulePass {
+  static char ID;
+  XilinxTBTclGenPass() : ModulePass(ID) {}
+
+  /// Emit the tcl script. The module itself is never changed, hence the
+  /// return value is always false.
+  bool runOnModule(Module &M) override {
+    std::error_code ec;
+    llvm::raw_fd_ostream XlnTBTcl(XlnTBTclNames, ec);
+    // Do not write into a stream that failed to open; report the reason
+    // instead of silently producing no script.
+    if (ec) {
+      errs() << "Failed to open the test bench tcl script '" << XlnTBTclNames
+             << "': " << ec.message() << "\n";
+      return false;
+    }
+
+    // NOTE(review): XlnTBSources is emitted verbatim and directly followed by
+    // "set_top"; presumably the option value carries its own trailing
+    // newline -- confirm against the caller.
+    XlnTBTcl << "open_project -reset tb\n"
+             << XlnTBSources << "set_top " << XlnTop << "\n"
+             << "open_solution -reset solution1\n"
+             << "set_part \"zynq\"\n"
+             << "create_clock -period \"100MHz\"\n"
+             << "config_bind -effort high\n";
+
+    for (auto &F : M) {
+      if (F.getName() != XlnTop)
+        continue;
+
+      for (unsigned i = 0; i < F.arg_size(); i++) {
+        auto arg = F.getArg(i);
+        // Only arguments that are pointers to arrays can be partitioned.
+        if (!arg->getType()->isPointerTy() ||
+            !arg->getType()->getPointerElementType()->isArrayTy())
+          continue;
+        if (!XlnArrayPartitionEnabled)
+          continue;
+
+        // The type has just been checked, so cast<> is sufficient.
+        auto arrayTy = cast<ArrayType>(arg->getType()->getPointerElementType());
+        auto partitions = getPartitionInfo(arrayTy);
+        for (auto partition : partitions)
+          XlnTBTcl << "set_directive_array_partition -dim " << partition.first
+                   << " -factor " << partition.second << " -type block \""
+                   << XlnTop << "\" " << arg->getName() << "\n";
+      }
+    }
+
+    XlnTBTcl << "csim_design\n"
+             << "csynth_design\n"
+             << "cosim_design\n"
+             << "exit\n";
+    return false;
+  }
+};
+
+} // namespace
+
+/// Give `loop` a loop ID that carries the name "VITIS_LOOP_<loopCounter>" under
+/// the "llvm.loop.name" key, then do the same for all of its sub-loops.
+///
+/// \param loop the loop to name; its sub-loops are visited recursively.
+/// \param loopCounter running counter used as the name suffix; it is
+///        incremented once per named loop.
+static void nameLoop(Loop *loop, int &loopCounter) {
+  LLVMContext &ctx = loop->getHeader()->getContext();
+
+  // Operand 0 of a loop ID refers to the loop ID itself. A temporary node
+  // holds that slot until the real node exists.
+  auto selfPlaceholder = MDNode::getTemporary(ctx, None);
+  SmallVector<Metadata *, 4> loopIDOperands;
+  loopIDOperands.push_back(selfPlaceholder.get());
+
+  // The (key, value) pair that carries the loop name.
+  Metadata *nameEntry[] = {
+      MDString::get(ctx, "llvm.loop.name"),
+      MDString::get(ctx, "VITIS_LOOP_" + std::to_string(loopCounter))};
+  loopIDOperands.push_back(MDNode::get(ctx, nameEntry));
+
+  // Build the node and patch the self reference into operand 0.
+  MDNode *loopID = MDNode::get(ctx, loopIDOperands);
+  loopID->replaceOperandWith(0, loopID);
+  loop->setLoopID(loopID);
+  ++loopCounter;
+
+  // An innermost loop has no sub-loops, so this simply does nothing for it.
+  for (auto &inner : loop->getSubLoops())
+    nameLoop(inner, loopCounter);
+}
+
+namespace {
+
+/// Assign a name to each loop for Xilinx Vitis.
+///
+/// Every loop of every function with a body, except the top function
+/// (-xlntop), is named through nameLoop. The counter behind the names is
+/// shared by the whole module.
+struct XilinxNameLoopPass : public ModulePass {
+  static char ID;
+  XilinxNameLoopPass() : ModulePass(ID) {}
+
+  /// \returns true if at least one loop received a name, i.e. if metadata was
+  ///          attached to the module; false otherwise.
+  bool runOnModule(Module &M) override {
+    int loopCounter = 0;
+
+    for (auto &F : M) {
+      // Skip the top function and functions without a body.
+      if (F.getName() == XlnTop || F.empty())
+        continue;
+
+      auto DT = llvm::DominatorTree(F);
+      LoopInfo LI(DT);
+
+      // Iterating an empty LoopInfo is a no-op, no separate check is needed.
+      for (auto &loop : LI)
+        nameLoop(loop, loopCounter);
+    }
+
+    // nameLoop bumps the counter once per loop it annotates, so a non-zero
+    // counter means the IR has been changed.
+    return loopCounter != 0;
+  }
+};
+
+} // namespace
+
+// -----------------------------------------------------------------------------------
+// Mark functions as noinline; scop.stmt functions are always inlined instead.
+
+/// Check if the input function is a scop.stmt based on the pattern S[0-9]+,
+/// i.e. the name is an 'S' followed by at least one decimal digit and nothing
+/// else.
+///
+/// \param F the function whose name is inspected.
+/// \returns true if the name matches the pattern, false otherwise.
+static bool isScopStmt(Function &F) {
+  StringRef name = F.getName();
+  if (!name.startswith("S"))
+    return false;
+
+  // The pattern requires at least one digit: a bare "S" is not a scop.stmt.
+  StringRef suffix = name.drop_front();
+  if (suffix.empty())
+    return false;
+
+  // isdigit must not be handed a negative value, so go through unsigned char.
+  return !any_of(suffix, [](const char &c) {
+    return !isdigit(static_cast<unsigned char>(c));
+  });
+}
+
+namespace {
+
+/// Annotate the inlining behaviour of every function in the module: scop.stmt
+/// functions (see isScopStmt) get `alwaysinline`, all other functions get
+/// `noinline`.
+struct AnnotateNoInlinePass : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  AnnotateNoInlinePass() : ModulePass(ID) {}
+
+  /// \returns true if any attribute was added to or removed from a function.
+  bool runOnModule(Module &M) override {
+    bool modified = false;
+    for (auto &F : M) {
+      // Should always inline scop.stmt; everything else is kept out of line.
+      const bool isStmt = isScopStmt(F);
+      const Attribute::AttrKind wanted =
+          isStmt ? Attribute::AlwaysInline : Attribute::NoInline;
+      const Attribute::AttrKind conflicting =
+          isStmt ? Attribute::NoInline : Attribute::AlwaysInline;
+
+      // `noinline` and `alwaysinline` are mutually exclusive on a function,
+      // so drop the opposite attribute before adding the wanted one.
+      if (F.hasFnAttribute(conflicting)) {
+        F.removeFnAttr(conflicting);
+        modified = true;
+      }
+
+      // Only report a change when the attribute was not there already.
+      if (!F.hasFnAttribute(wanted)) {
+        F.addFnAttr(wanted);
+        modified = true;
+      }
+    }
+
+    return modified;
+  }
+};
+
+} // namespace
+
 char ConvertMemRefToArray::ID = 0;
 static RegisterPass<ConvertMemRefToArray>
     X1("mem2ptr",
@@ -1116,3 +1618,15 @@ char XilinxArrayPartitionPass::ID = 7;
 static RegisterPass<XilinxArrayPartitionPass> X8(
     "xlnarraypartition",
     "Partition arrays in the top-level function arguments for Xilinx Vitis.");
+
+char XilinxTBTclGenPass::ID = 8;
+static RegisterPass<XilinxTBTclGenPass>
+    X9("xlntbgen", "Generate test bench tcl script for Xilinx Vitis.");
+
+char XilinxNameLoopPass::ID = 9;
+static RegisterPass<XilinxNameLoopPass> X10("xlnloopname",
+                                            "Name loops for Xilinx Vitis.");
+
+char AnnotateNoInlinePass::ID = 10;
+static RegisterPass<AnnotateNoInlinePass>
+    X11("anno-noinline", "Annotate noinline to the functions.");
diff --git a/lib/mlir/Transforms/ArrayPartition.cc b/lib/mlir/Transforms/ArrayPartition.cc
index 1596c28516d..c6dbeb625fc 100644
--- a/lib/mlir/Transforms/ArrayPartition.cc
+++ b/lib/mlir/Transforms/ArrayPartition.cc
@@ -1,6 +1,7 @@
 //===- ArrayPartitions.cc - Partitioning arrays ------------------ C++-===//
 
 #include "phism/mlir/Transforms/PhismTransforms.h"
+#include "phism/mlir/Transforms/Utils.h"
 
 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
@@ -26,6 +27,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
 
+#include <fstream>
 #include <queue>
 #include <set>
 
@@ -35,25 +37,15 @@ using namespace mlir;
 using namespace llvm;
 using namespace phism;
 
-static bool hasPeCaller(FuncOp f) {
-  bool ret = false;
-  f.walk([&](CallOp caller) {
-    if (caller->hasAttr("scop.pe"))
-      ret = true;
-  });
-  return ret;
-}
-
-static FuncOp getTopFunction(ModuleOp m) {
-  FuncOp top = nullptr;
-  m.walk([&](FuncOp f) {
-    if (hasPeCaller(f)) {
-      assert(!top && "There should be only one top function.");
-      top = f;
-    }
-  });
-  return top;
-}
+namespace {
+/// Options of the array partition pass pipeline, declared through MLIR's
+/// PassPipelineOptions mechanism.
+struct ArrayPartitionPipelineOptions
+    : public mlir::PassPipelineOptions<ArrayPartitionPipelineOptions> {
+  /// "dumpFile": per its description, enables dumping the tile info into a
+  /// file. Initialised to false.
+  Option<bool> dumpFile{
+      *this, "dumpFile",
+      llvm::cl::desc("Enable dumping the tile info into a file."),
+      llvm::cl::init(false)};
+};
+} // namespace
 
 /// -------------------------- Dependence analysis ---------------------------
 
@@ -361,7 +353,6 @@ static void arrayPartition(FuncOp f, ModuleOp m, OpBuilder &b) {
     if (checkAccessOverlap(info, m, partitions))
       continue;
 
-    memref.dump();
     llvm::errs() << "Partitions: \n";
     for (auto it : partitions) {
       for (auto bound : enumerate(it)) {
@@ -484,13 +475,17 @@ static MapVector<Value, TileInfo> getTilingInfo(ArrayRef<Value> memrefs,
   MapVector<Value, TileInfo> tiling;
   // See if they have simple access patterns that can be directly extracted.
   for (Value memref : memrefs) {
+    LLVM_DEBUG({
+      dbgs() << "Trying to tile: ";
+      memref.dump();
+    });
     // Check if all the users of memref are scop.pe callers.
     if (any_of(memref.getUsers(), [&](Operation *op) {
           return !isa<CallOp>(op) || !op->hasAttr("scop.pe");
         })) {
       LLVM_DEBUG({
         memref.dump();
-        llvm::errs() << " has been skipped since it has non PE caller users.\n";
+        dbgs() << " has been skipped since it has non PE caller users.\n";
       });
       continue;
     }
@@ -520,10 +515,10 @@ static MapVector<Value, TileInfo> getTilingInfo(ArrayRef<Value> memrefs,
 
     // Debug the accesses.
     LLVM_DEBUG({
-      memref.dump();
-      for (MemRefAccess &access : accesses) {
+      dbgs() << "Found the following accesses:\n";
+      for (MemRefAccess &access : accesses)
         access.opInst->dump();
-      }
+      dbgs() << "---------------------------\n";
     });
 
     // Check if all accesses are idenity maps.
@@ -559,12 +554,23 @@ static MapVector<Value, TileInfo> getTilingInfo(ArrayRef<Value> memrefs,
       for (AffineForOp forOp : forOps) {
         // Filter out the result that are constants. We don't care about them.
         // ()[s0] -> (70, s0 * 32 + 32) will be ()[s0] -> (s0 * 32 + 32)
-        tmpLbMaps.push_back(
-            filterExtraConstantResults(forOp.getLowerBoundMap()));
-        tmpUbMaps.push_back(
-            filterExtraConstantResults(forOp.getUpperBoundMap()));
+        AffineMap lbMap = filterExtraConstantResults(forOp.getLowerBoundMap());
+        AffineMap ubMap = filterExtraConstantResults(forOp.getUpperBoundMap());
+
+        if (lbMap.isSingleConstant() && ubMap.isSingleConstant()) {
+          llvm::errs() << "There appears a pair of constant loop bounds. We "
+                          "cannot deal with this yet.\n";
+          isIdentical = false;
+          break;
+        }
+
+        tmpLbMaps.push_back(lbMap);
+        tmpUbMaps.push_back(ubMap);
       }
 
+      if (!isIdentical)
+        break;
+
       // Simply ignore those with constant lower upper bounds.
       // They won't cause much trouble (heuristically) if we don't partition
       // for them.
@@ -584,8 +590,10 @@ static MapVector<Value, TileInfo> getTilingInfo(ArrayRef<Value> memrefs,
         std::swap(tmpUbMaps, ubMaps);
       } else {
         isIdentical = tmpLbMaps == lbMaps && tmpUbMaps == ubMaps;
-        if (!isIdentical)
+        if (!isIdentical) {
+          LLVM_DEBUG(dbgs() << "Found not identical loop bound maps.\n");
           break;
+        }
       }
     }
 
@@ -625,11 +633,19 @@ static MapVector<Value, TileInfo> getTilingInfo(ArrayRef<Value> memrefs,
     // Abandon further processing if the tile size cannot match memref's type.
     if ((int64_t)tileSizes.size() !=
         memref.getType().cast<MemRefType>().getRank()) {
-      llvm::errs() << "Tile sizes are not equal to the rank of the memref.\n";
+      LLVM_DEBUG(
+          dbgs() << "Tile sizes are not equal to the rank of the memref.\n");
       continue;
     }
 
     // The resolved memref tiling.
+    LLVM_DEBUG({
+      dbgs() << "Memref ";
+      memref.dump();
+      dbgs() << " has been tiled into: ";
+      interleaveComma(tileSizes, dbgs());
+      dbgs() << "\n\n";
+    });
     tiling[memref] = TileInfo{tileSizes, memref};
   }
 
@@ -863,7 +879,8 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef<Value> memrefs,
               Value operand = op->getOperand(i);
 
               // The index for a tiled memref will be from an affine.apply op.
-              AffineApplyOp applyOp = operand.getDefiningOp<AffineApplyOp>();
+              mlir::AffineApplyOp applyOp =
+                  operand.getDefiningOp<mlir::AffineApplyOp>();
               if (!applyOp)
                 continue;
               assert(applyOp.getNumOperands() == 1);
@@ -871,9 +888,12 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef<Value> memrefs,
               Value indvar = applyOp.getOperand(0);
 
               mlir::AffineForOp forOp = getForInductionVarOwner(indvar);
-              // forOp.dump();
-              assert(forOp.getLowerBoundOperands().size() == 1 ||
-                     forOp.getUpperBoundOperands().size() == 1);
+
+              // At least one bound should have a single operand (for the loop
+              // indvar).
+              if (!(forOp.getLowerBoundOperands().size() == 1 ||
+                    forOp.getUpperBoundOperands().size() == 1))
+                continue;
 
               Value source = forOp.getUpperBoundOperands().size() == 1
                                  ? forOp.getUpperBoundOperands()[0]
@@ -887,6 +907,13 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef<Value> memrefs,
             if (indices.empty())
               std::swap(tmpIndices, indices);
             else {
+              LLVM_DEBUG({
+                op->dump();
+                if (tmpIndices != indices) {
+                  llvm::interleaveComma(tmpIndices, llvm::errs());
+                  llvm::interleaveComma(indices, llvm::errs());
+                }
+              });
               assert(tmpIndices == indices);
               std::swap(tmpIndices, indices);
             }
@@ -927,7 +954,6 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef<Value> memrefs,
           memref::SubViewOp subView =
               b.create<memref::SubViewOp>(caller.getLoc(), newTiledMemRefType,
                                           newMemRef, offsets, sizes, strides);
-          subView.dump();
 
           // Strip the affine map
           MemRefType castMemRefType =
@@ -956,8 +982,6 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef<Value> memrefs,
       worklist[j] = vmap.lookup(worklist[j]);
     }
 
-    newFunc.dump();
-
     prevFunc = newFunc;
   }
 
@@ -1023,11 +1047,23 @@ static void renameTiledFunctions(ModuleOp m, OpBuilder &b) {
 
 struct SimpleArrayPartitionPass
     : public PassWrapper<SimpleArrayPartitionPass, OperationPass<ModuleOp>> {
+  bool dumpFile = false; // Write tile sizes to array_partition.txt if set.
+  SimpleArrayPartitionPass() = default;
+  // Carry the option over so that a copy of the pass still dumps the file.
+  SimpleArrayPartitionPass(const SimpleArrayPartitionPass &pass)
+      : dumpFile(pass.dumpFile) {}
+  SimpleArrayPartitionPass(const ArrayPartitionPipelineOptions &options)
+      : dumpFile(options.dumpFile) {}
   void runOnOperation() override {
     ModuleOp m = getOperation();
     OpBuilder b(m.getContext());
 
     FuncOp top = getTopFunction(m);
+    if (!top) {
+      m.emitRemark() << "No top function found for array partition. Have you "
+                        "forgot to annotate {scop.pe} to callers?\n";
+      return;
+    }
 
     SmallVector<CallOp> callers;
     top.walk([&](CallOp caller) {
@@ -1035,6 +1071,9 @@ struct SimpleArrayPartitionPass
         callers.push_back(caller);
     });
 
+    if (callers.empty())
+      return;
+
     // Get all the memrefs that can be partitioned.
     // TODO: consider scratchpad as well?
     SmallVector<Value> memrefs;
@@ -1044,6 +1083,18 @@ struct SimpleArrayPartitionPass
 
     // Get the tiling info.
     auto tiling = getTilingInfo(memrefs, m);
+    for (Value memref : memrefs)
+      if (!tiling.count(memref)) {
+        LLVM_DEBUG({
+          dbgs() << "There is at least one memref: ";
+          memref.dump();
+          dbgs() << " has not partitioned. We discard the whole case since the "
+                    "performance gain would be minor.\n";
+        });
+        return;
+      }
+
+    auto tilingCopy = tiling;
 
     // Tile the top function.
     FuncOp newTop = tileTopFunction(top, memrefs, tiling, m, b);
@@ -1053,15 +1104,33 @@ struct SimpleArrayPartitionPass
 
     // Reset names.
     renameTiledFunctions(m, b);
+
+    // If array partition has been successful, dump a file that stores the
+    // corresponding information.
+    if (dumpFile) {
+      std::ofstream infoFile;
+      infoFile.open("array_partition.txt", std::ios::out);
+      if (infoFile.is_open()) {
+        for (auto &it : tilingCopy) {
+          interleave(
+              it.second.sizes,
+              [&](const int64_t &size) { infoFile << std::to_string(size); },
+              [&]() { infoFile << ", "; });
+          infoFile << '\n';
+        }
+      }
+    }
   }
 };
 } // namespace
 
 void phism::registerArrayPartitionPasses() {
   PassRegistration<ArrayPartitionPass>("array-partition", "Partition arrays");
-  PassPipelineRegistration<>(
-      "simple-array-partition", "Partition arrays", [&](OpPassManager &pm) {
-        pm.addPass(std::make_unique<SimpleArrayPartitionPass>());
+
+  PassPipelineRegistration<ArrayPartitionPipelineOptions>(
+      "simple-array-partition", "Partition arrays",
+      [&](OpPassManager &pm, const ArrayPartitionPipelineOptions &options) {
+        pm.addPass(std::make_unique<SimpleArrayPartitionPass>(options));
         pm.addPass(createCanonicalizerPass());
       });
 }
diff --git a/lib/mlir/Transforms/CMakeLists.txt b/lib/mlir/Transforms/CMakeLists.txt
index 29b1d23d736..b4d22142df2 100644
--- a/lib/mlir/Transforms/CMakeLists.txt
+++ b/lib/mlir/Transforms/CMakeLists.txt
@@ -4,6 +4,7 @@ add_mlir_library(PhismTransforms
   PhismTransforms.cc
   ArrayPartition.cc
   DependenceAnalysis.cc
+  Utils.cc
 
   ADDITIONAL_HEADER_DIRS
   "${PHISM_MAIN_INCLUDE_DIR}/phism/mlir/Transforms"  
diff --git a/lib/mlir/Transforms/LoopTransforms.cc b/lib/mlir/Transforms/LoopTransforms.cc
index e3afff3b2cb..8c50d76cea0 100644
--- a/lib/mlir/Transforms/LoopTransforms.cc
+++ b/lib/mlir/Transforms/LoopTransforms.cc
@@ -1,6 +1,7 @@
 //===- LoopTransforms.cc - Loop transforms ----------------------------C++-===//
 
 #include "phism/mlir/Transforms/PhismTransforms.h"
+#include "phism/mlir/Transforms/Utils.h"
 
 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
@@ -9,6 +10,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Dominance.h"
@@ -28,7 +30,7 @@
 
 #include <queue>
 
-#define DEBUG_TYPE "loop-extract"
+#define DEBUG_TYPE "loop-transforms"
 
 using namespace mlir;
 using namespace llvm;
@@ -46,7 +48,7 @@ struct LoopTransformsPipelineOptions
 
 /// -------------------------- Insert Scratchpad ---------------------------
 
-static FuncOp getTopFunction(Operation *op) {
+static FuncOp getRootFunction(Operation *op) {
   while (!op->getParentOfType<FuncOp>())
     op = op->getParentOp();
   return op->getParentOfType<FuncOp>();
@@ -62,7 +64,7 @@ struct InsertScratchpadPass
     DominanceInfo dom(storeOp->getParentOp());
     Value mem = storeOp.getMemRef();
     // TODO: we should further check the address being accessed.
-    FuncOp f = getTopFunction(storeOp);
+    FuncOp f = getRootFunction(storeOp);
     b.setInsertionPointToStart(&f.getBlocks().front());
 
     // New scratchpad memory
@@ -257,7 +259,6 @@ createPointLoopsCallee(mlir::AffineForOp forOp, int id, FuncOp f,
     mapping.map(arg, entry->addArgument(arg.getType()));
 
   callee.setType(b.getFunctionType(entry->getArgumentTypes(), llvm::None));
-  callee.setVisibility(SymbolTable::Visibility::Public);
 
   b.clone(*forOp.getOperation(), mapping);
 
@@ -478,11 +479,628 @@ struct AnnotatePointLoopsPass
 };
 } // namespace
 
+/// --------------------- Redistribute statements ---------------------------
+
+/// Collect into `stmts` the callees of `func` in `m` marked as "scop.stmt".
+static void getAllScopStmts(FuncOp func, SetVector<FuncOp> &stmts, ModuleOp m) {
+  func.walk([&](mlir::CallOp caller) {
+    // dyn_cast asserts on a null operand, so the null check below could never
+    // take effect; the templated lookup tolerates an unresolved callee.
+    FuncOp callee = m.lookupSymbol<FuncOp>(caller.getCallee());
+    if (!callee || !callee->hasAttr("scop.stmt"))
+      return;
+    stmts.insert(callee);
+  });
+}
+
+/// Insert into `pes` each callee of a "scop.pe" call in the top function that
+/// itself calls at least two distinct scop.stmt functions.
+static void detectScopPeWithMultipleStmts(ModuleOp m,
+                                          SetVector<mlir::FuncOp> &pes) {
+  FuncOp top = getTopFunction(m);
+  if (!top)
+    return;
+  top.walk([&](mlir::CallOp caller) {
+    if (!caller->hasAttr("scop.pe"))
+      return;
+    // The templated lookup tolerates an unresolved (null) callee, on which
+    // dyn_cast would assert.
+    FuncOp callee = m.lookupSymbol<FuncOp>(caller.getCallee());
+    if (!callee)
+      return;
+    SetVector<FuncOp> stmts;
+    getAllScopStmts(callee, stmts, m);
+    if (stmts.size() >= 2)
+      pes.insert(callee);
+  });
+}
+
+/// Return true if no scop.stmt callee invoked from `f` with `memref` as an
+/// argument contains an affine.store to the corresponding block argument.
+static bool hasOnlyReadByScopStmts(FuncOp f, ModuleOp m, Value memref) {
+  SmallVector<std::pair<FuncOp, unsigned>> funcAndArgIdx;
+  f.walk([&](mlir::CallOp caller) {
+    // The templated lookup tolerates an unresolved (null) callee, on which
+    // dyn_cast would assert.
+    FuncOp callee = m.lookupSymbol<FuncOp>(caller.getCallee());
+    if (!callee || !callee->hasAttr("scop.stmt"))
+      return;
+    auto it = find(caller.getArgOperands(), memref);
+    if (it == caller.arg_operand_end())
+      return;
+    funcAndArgIdx.push_back({callee, it - caller.arg_operand_begin()});
+  });
+
+  // Examine the accesses.
+  for (auto &it : funcAndArgIdx) {
+    FuncOp callee;
+    unsigned argIdx;
+    std::tie(callee, argIdx) = it;
+    assert(callee.getArgument(argIdx).getType().isa<MemRefType>());
+
+    bool hasWriteAccess = false;
+    callee.walk([&](mlir::AffineStoreOp storeOp) {
+      if (storeOp.getMemRef() == callee.getArgument(argIdx))
+        hasWriteAccess = true;
+    });
+    if (hasWriteAccess)
+      return false;
+  }
+  return true;
+}
+
+/// Return true if every memref seen more than once among the scop.stmt calls
+/// in `f` passes hasOnlyReadByScopStmts. Assumes top-level memrefs do not alias
+/// and each scop.stmt takes an accessed memref once in its interface.
+static bool areScopStmtsSeparable(FuncOp f, ModuleOp m) {
+  SetVector<Value> visited; // memrefs visited.
+  SetVector<Value> conflicted;
+  SetVector<FuncOp> visitedStmts;
+  f.walk([&](mlir::CallOp caller) {
+    // The templated lookup tolerates an unresolved (null) callee, on which
+    // dyn_cast would assert.
+    FuncOp callee = m.lookupSymbol<FuncOp>(caller.getCallee());
+    if (!callee || !callee->hasAttr("scop.stmt"))
+      return;
+    if (visitedStmts.count(callee))
+      return;
+    visitedStmts.insert(callee);
+    for (Value arg : caller.getArgOperands())
+      if (arg.getType().isa<MemRefType>()) {
+        if (visited.count(arg))
+          conflicted.insert(arg);
+        visited.insert(arg);
+      }
+  });
+  unsigned bad = 0;
+  for (auto &memref : conflicted)
+    if (!hasOnlyReadByScopStmts(f, m, memref))
+      ++bad;
+
+  if (!bad)
+    return true;
+
+  LLVM_DEBUG({
+    llvm::errs()
+        << "\nConflicted memrefs that have not only read accesses:\n\n";
+    for (Value memref : conflicted)
+      if (!hasOnlyReadByScopStmts(f, m, memref))
+        memref.dump();
+  });
+
+  return false;
+}
+
+/// Repeatedly remove every affine.for in `f` whose body holds a single
+/// operation, until a sweep finds none left. Removing an inner loop can leave
+/// its parent in that state, hence the fixed-point iteration.
+static void eraseEmptyAffineFor(FuncOp f) {
+  bool changed;
+  do {
+    SmallVector<Operation *> deadLoops;
+    f.walk([&](mlir::AffineForOp loop) {
+      if (llvm::hasSingleElement(*loop.getBody())) // just the yield
+        deadLoops.push_back(loop.getOperation());
+    });
+    for (Operation *dead : deadLoops)
+      dead->erase();
+    changed = !deadLoops.empty();
+  } while (changed);
+}
+
+/// Clone `f` right after itself, keep only the calls to `stmt` in the clone,
+/// prune the loops this empties and drop the arguments left without uses.
+/// Returns the clone together with the indices of the erased arguments.
+static std::pair<FuncOp, SmallVector<unsigned>>
+distributeScopStmt(FuncOp stmt, FuncOp f, ModuleOp m, OpBuilder &b) {
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointAfter(f);
+  FuncOp clone = cast<FuncOp>(b.clone(*f.getOperation()));
+  std::string cloneName = std::string(f.getName()) + "__cloned_for__";
+  clone.setName(cloneName + std::string(stmt.getName()));
+
+  // Calls to anything but `stmt` do not belong to this clone.
+  SmallVector<Operation *> foreignCalls;
+  clone.walk([&](mlir::CallOp call) {
+    if (call.getCallee() != stmt.getName())
+      foreignCalls.push_back(call.getOperation());
+  });
+  for (Operation *call : foreignCalls)
+    call->erase();
+  eraseEmptyAffineFor(clone);
+
+  SmallVector<unsigned> unusedArgs;
+  for (auto arg : llvm::enumerate(clone.getArguments()))
+    if (arg.value().use_empty())
+      unusedArgs.push_back(arg.index());
+  clone.eraseArguments(unusedArgs);
+  return {clone, unusedArgs};
+}
+
+/// Build one clone of `f` per scop.stmt it calls, appending each clone and
+/// its erased argument indices to `dist`. Fails on the first null clone.
+static LogicalResult distributeScopStmts(
+    FuncOp f, SmallVectorImpl<std::pair<FuncOp, SmallVector<unsigned>>> &dist,
+    ModuleOp m, OpBuilder &b) {
+  SetVector<FuncOp> stmts;
+  getAllScopStmts(f, stmts, m);
+
+  // Every statement gets its own copy of the whole function, stripped of the
+  // calls that belong to the other statements.
+  for (FuncOp stmt : stmts) {
+    std::pair<FuncOp, SmallVector<unsigned>> cloned =
+        distributeScopStmt(stmt, f, m, b);
+    if (!cloned.first) {
+      LLVM_DEBUG(dbgs() << "Cannot distribute for: " << stmt.getName() << '\n');
+      return failure();
+    }
+    dist.push_back(cloned);
+  }
+  return success();
+}
+
+namespace {
+/// For each scop.pe callee with several separable scop.stmt callees, create
+/// one clone per statement and replace calls to the callee by calls to them.
+struct RedistributeScopStatementsPass
+    : public mlir::PassWrapper<RedistributeScopStatementsPass,
+                               OperationPass<ModuleOp>> {
+  void runOnOperation() override {
+    ModuleOp m = getOperation();
+    OpBuilder b(m.getContext());
+
+    // -------------------------------------------------------------------
+    // Step 1: detect the scop.pe callee that has more than one scop.stmt.
+    SetVector<FuncOp> pes;
+    detectScopPeWithMultipleStmts(m, pes);
+    if (pes.empty())
+      return;
+
+    LLVM_DEBUG({
+      llvm::errs() << "-------------------------------------------\n";
+      llvm::errs() << "Detected PEs with multiple SCoP statements:\n\n";
+      for (FuncOp pe : pes) {
+        pe.dump();
+        llvm::errs() << "\n------------------------\n\n";
+      }
+    });
+
+    // -------------------------------------------------------------------
+    // Step 2: check if the multiple scop.stmt can be fully separated.
+    // The condition is basically each caller refers to different memref.
+    /// TODO: carry out alias analysis (not an issue for polybench)
+    /// TODO: detailed dependence analysis to cover more cases.
+    SetVector<FuncOp> pesToProc;
+    for (FuncOp pe : pes) {
+      if (!areScopStmtsSeparable(pe, m)) {
+        LLVM_DEBUG({
+          llvm::errs() << "Discarded " << pe.getName()
+                       << " since its scop.stmts are not separable.\n";
+        });
+        continue;
+      }
+
+      pesToProc.insert(pe);
+    }
+
+    // -------------------------------------------------------------------
+    // Step 3: Process each PE.
+    for (FuncOp pe : pesToProc) {
+      SmallVector<std::pair<FuncOp, SmallVector<unsigned>>> dists;
+      if (failed(distributeScopStmts(pe, dists, m, b))) {
+        LLVM_DEBUG({
+          llvm::errs() << "Failed to distribute scop.stmt: " << pe.getName()
+                       << "\n";
+        });
+        continue;
+      }
+
+      SmallVector<mlir::CallOp> callers;
+      m.walk([&](mlir::CallOp caller) {
+        if (caller.getCallee() == pe.getName())
+          callers.push_back(caller);
+      });
+
+      for (mlir::CallOp caller : callers) {
+        b.setInsertionPointAfter(caller);
+        for (const auto &dist : dists) {
+          FuncOp callee;
+          SmallVector<unsigned> erased;
+          std::tie(callee, erased) = dist;
+
+          SmallVector<Value> operands;
+          for (auto arg : enumerate(caller.getOperands()))
+            if (find(erased, arg.index()) == erased.end())
+              operands.push_back(arg.value());
+
+          mlir::CallOp newCaller =
+              b.create<CallOp>(caller.getLoc(), callee, operands);
+          newCaller->setAttr("scop.pe", b.getUnitAttr());
+        }
+      }
+
+      for (mlir::CallOp caller : callers)
+        caller.erase();
+      pe.erase();
+    }
+  }
+};
+} // namespace
+
+/// --------------------- Loop merge pass ---------------------------
+
+/// Merge the innermost affine.for loops in `f` that enclose the calls to its
+/// only scop.stmt, when their domains are disjoint and one loop's constant
+/// lower bound equals a constant upper bound of another. Mutates `f` in place.
+static LogicalResult loopMergeOnScopStmt(FuncOp f, ModuleOp m, OpBuilder &b) {
+  SetVector<FuncOp> stmts;
+  getAllScopStmts(f, stmts, m);
+  if (!llvm::hasSingleElement(stmts)) {
+    LLVM_DEBUG(
+        dbgs()
+        << "Being conservative not to merge loops with multiple scop.stmts.\n");
+    return failure();
+  }
+
+  FuncOp targetStmt = *stmts.begin();
+
+  // Get all the callers for the target scop.stmt
+  SmallVector<mlir::CallOp> callers;
+  f.walk([&](mlir::CallOp caller) {
+    if (caller.getCallee() == targetStmt.getName())
+      callers.push_back(caller);
+  });
+
+  if (hasSingleElement(callers)) {
+    LLVM_DEBUG(dbgs() << "There is only one caller instance for PE: "
+                      << f.getName() << ".\n");
+    return failure();
+  }
+
+  // ----------------------------------------------------------------------
+  // Step 1: make sure there are no empty sets in loop domains.
+  SetVector<Operation *> erased;
+  for (mlir::CallOp caller : callers) {
+    SmallVector<Operation *> ops;
+    getEnclosingAffineForAndIfOps(*caller.getOperation(), &ops);
+    FlatAffineConstraints cst;
+    getIndexSet(ops, &cst);
+    if (!cst.findIntegerSample().hasValue()) {
+      LLVM_DEBUG({
+        dbgs() << "Found a caller in an empty loop nest.\n";
+        caller.dump();
+      });
+      erased.insert(caller.getOperation());
+    }
+  }
+
+  callers.erase(remove_if(callers,
+                          [&](mlir::CallOp caller) {
+                            return erased.count(caller.getOperation());
+                          }),
+                callers.end());
+  for (Operation *op : erased)
+    op->erase();
+
+  eraseEmptyAffineFor(f);
+
+  if (hasSingleElement(callers)) {
+    LLVM_DEBUG(dbgs() << "There is only one caller instance for PE: "
+                      << f.getName() << " after empty loop removal.\n");
+    return failure();
+  }
+
+  // ----------------------------------------------------------------------
+  // Step 2: gather loop structure
+  // Make sure the callers have the same prefix, only the last forOp different.
+  SmallVector<mlir::AffineForOp> outerLoops;
+  SmallVector<mlir::AffineForOp> innermosts; // each corresponds to a caller.
+  for (mlir::CallOp caller : callers) {
+    SmallVector<Operation *> ops;
+    getEnclosingAffineForAndIfOps(*caller.getOperation(), &ops);
+
+    if (ops.empty()) {
+      LLVM_DEBUG(dbgs() << "Callers should be wrapped within loops.\n");
+      return failure();
+    }
+
+    if (any_of(ops, [&](Operation *op) { return isa<mlir::AffineIfOp>(op); })) {
+      LLVM_DEBUG(dbgs() << "Cannot deal with affine.if yet.\n");
+      return failure();
+    }
+
+    // Initialise on the first caller only. Testing outerLoops here instead
+    // would re-initialise for every caller nested in just one loop.
+    if (innermosts.empty()) {
+      innermosts.push_back(cast<mlir::AffineForOp>(ops.back()));
+      ops.pop_back();
+      for (Operation *op : ops)
+        outerLoops.push_back(cast<mlir::AffineForOp>(op));
+    } else {
+      SmallVector<mlir::AffineForOp> tmpOuters;
+      mlir::AffineForOp innermost;
+
+      innermost = cast<mlir::AffineForOp>(ops.back());
+      ops.pop_back();
+
+      for (Operation *op : ops)
+        tmpOuters.push_back(cast<mlir::AffineForOp>(op));
+
+      if (tmpOuters != outerLoops) {
+        LLVM_DEBUG(dbgs() << "Outer loops are not the same among statements "
+                             "(given the last being different).\n");
+        return failure();
+      }
+
+      if (find(innermosts, innermost) != innermosts.end()) {
+        LLVM_DEBUG(dbgs() << "Weird to find the same loop structures between "
+                             "two caller instances.\n");
+        return failure();
+      }
+
+      innermosts.push_back(innermost);
+    }
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "\n-----------------------------------\n";
+    dbgs() << "Merging PE: \n";
+    f.dump();
+  });
+
+  // ----------------------------------------------------------------------
+  // Step 3: Affine analysis
+  // Check if the innermost loops have no intersection.
+  SmallVector<FlatAffineConstraints, 4> csts;
+  transform(innermosts, std::back_inserter(csts), [&](mlir::AffineForOp forOp) {
+    FlatAffineConstraints cst;
+    cst.addInductionVarOrTerminalSymbol(forOp.getInductionVar());
+
+    LLVM_DEBUG(cst.dump());
+
+    return cst;
+  });
+
+  // Make every constraint has the same induction variable.
+  for (unsigned i = 1; i < csts.size(); ++i)
+    csts[i].setIdValue(0, csts[0].getIdValue(0));
+
+  // Check if all the constraints share the same number of columns.
+  for (unsigned i = 1; i < csts.size(); ++i) {
+    if (csts[i].getNumCols() != csts[0].getNumCols()) {
+      LLVM_DEBUG(dbgs() << "Number of columns don't match between two "
+                           "candidate constraints.\n");
+      return failure();
+    }
+  }
+
+  // Check if two loops have intersection.
+  for (unsigned i = 0; i < csts.size(); ++i)
+    for (unsigned j = i + 1; j < csts.size(); ++j) {
+      FlatAffineConstraints tmp{csts[i]};
+      tmp.append(csts[j]);
+
+      if (tmp.findIntegerSample().hasValue()) {
+        LLVM_DEBUG(dbgs() << "There is intersection between two innermost "
+                             "loops. Cannot merge them safely.\n");
+        return failure();
+      }
+    }
+
+  // Merge: check if one can be merged into another iteratively, until there is
+  // no chance of merging.
+  while (true) {
+    bool merged = false;
+
+    mlir::AffineForOp loopToErase;
+
+    for (unsigned i = 0; i < innermosts.size() && !merged; ++i)
+      for (unsigned j = 0; j < innermosts.size() && !merged; ++j) {
+        if (i == j)
+          continue;
+
+        mlir::AffineForOp loop1 = innermosts[i];
+        mlir::AffineForOp loop2 = innermosts[j];
+
+        AffineMap ubMap = loop1.getUpperBoundMap();
+
+        // Condition BEGIN -
+        if (loop2.getLowerBoundMap().isSingleConstant()) {
+          int64_t constLb = loop2.getLowerBoundMap().getSingleConstantResult();
+          for (AffineExpr ub : ubMap.getResults()) {
+            if (AffineConstantExpr constUbExpr =
+                    ub.dyn_cast<AffineConstantExpr>()) {
+              int64_t constUb = constUbExpr.getValue();
+              if (constLb == constUb) {
+                // Condition END -
+                LLVM_DEBUG(dbgs()
+                           << "Found loop2's single constant lower bound "
+                           << constLb
+                           << " equals to one of the upper bounds of loop1 "
+                           << constUb
+                           << ". We can merge them together since loop1 and "
+                              "loop2 don't intersect.\n");
+
+                merged = true;
+
+                // Set to erase;
+                loopToErase = loop2;
+
+                // Set the new upper bound;
+                SetVector<AffineExpr> results;
+                for (AffineExpr expr : ubMap.getResults())
+                  if (expr != ub)
+                    results.insert(expr);
+                for (AffineExpr expr : loop2.getUpperBoundMap().getResults())
+                  results.insert(expr);
+
+                AffineMap newUbMap =
+                    AffineMap::get(ubMap.getNumDims(), ubMap.getNumSymbols(),
+                                   results.takeVector(), ubMap.getContext());
+                LLVM_DEBUG({
+                  dbgs() << "New upper bound: \n";
+                  newUbMap.dump();
+                });
+                loop1.setUpperBoundMap(newUbMap);
+
+                break;
+              }
+            }
+          }
+        }
+      }
+
+    if (loopToErase) {
+      innermosts.erase(find(innermosts, loopToErase));
+      loopToErase.erase();
+    }
+
+    if (!merged)
+      break;
+  }
+
+  return success();
+}
+
+namespace {
+
+/// Run loopMergeOnScopStmt on each callee of a "scop.pe" call in the top
+/// function. Will only work within scop.pe on scop.stmt to avoid side effects.
+struct LoopMergePass
+    : public mlir::PassWrapper<LoopMergePass, OperationPass<ModuleOp>> {
+  void runOnOperation() override {
+    ModuleOp m = getOperation();
+    OpBuilder b(m.getContext());
+    SmallVector<FuncOp> pes;
+    FuncOp f = getTopFunction(m);
+    if (!f)
+      return;
+    f.walk([&](mlir::CallOp caller) {
+      if (!caller->hasAttr("scop.pe"))
+        return;
+      // The templated lookup tolerates an unresolved (null) callee, on which
+      // dyn_cast would assert.
+      FuncOp pe = m.lookupSymbol<FuncOp>(caller.getCallee());
+      if (!pe)
+        return;
+      pes.push_back(pe);
+    });
+
+    for (FuncOp pe : pes) {
+      if (failed(loopMergeOnScopStmt(pe, m, b))) {
+        LLVM_DEBUG(dbgs() << "Failed to merge loops in: " << pe.getName()
+                          << ".\n");
+      }
+    }
+  }
+};
+
+} // namespace
+
+/// -------------------------- Scop stmt inline -------------------------------
+
+/// Replace every call to `stmt` inside `f` by a clone of the operations in
+/// `stmt`'s first block (its return excluded). Functions that are themselves
+/// scop.stmt are left untouched. Always succeeds.
+static LogicalResult inlineScopStmtWithinFunction(FuncOp f, FuncOp stmt,
+                                                  OpBuilder &b) {
+  if (f->hasAttr("scop.stmt")) // skipped.
+    return success();
+
+  SmallVector<mlir::CallOp> callSites;
+  f.walk([&](mlir::CallOp call) {
+    if (call.getCallee() == stmt.getName())
+      callSites.push_back(call);
+  });
+
+  // Clone the statement body (known to be a flat list of operations without
+  // region) right after each call site.
+  for (mlir::CallOp call : callSites) {
+    BlockAndValueMapping argMap;
+    argMap.map(stmt.getArguments(), call.getArgOperands());
+    b.setInsertionPointAfter(call);
+    for (Operation &bodyOp : stmt.getBlocks().front()) {
+      if (isa<mlir::ReturnOp>(bodyOp))
+        continue;
+      b.clone(bodyOp, argMap);
+    }
+  }
+  // The call sites are now redundant.
+  for (mlir::CallOp call : callSites)
+    call.erase();
+  return success();
+}
+
+namespace {
+
+/// Inline each function carrying {scop.stmt} into every non-statement
+/// function of the module, then erase the statement function. A failed
+/// inlining aborts the pass without signalling failure.
+struct ScopStmtInlinePass
+    : public mlir::PassWrapper<ScopStmtInlinePass, OperationPass<ModuleOp>> {
+  void runOnOperation() override {
+    ModuleOp m = getOperation();
+    OpBuilder b(m.getContext());
+
+    // Partition the module's functions into statements and everything else.
+    SmallVector<FuncOp> stmts;
+    SmallVector<FuncOp> hosts;
+    m.walk([&](FuncOp f) {
+      SmallVector<FuncOp> &bucket = f->hasAttr("scop.stmt") ? stmts : hosts;
+      bucket.push_back(f);
+    });
+
+    // We know that a scop.stmt won't call another scop.stmt.
+    for (FuncOp stmt : stmts) {
+      bool hasCaller = false;
+      stmt.walk([&](mlir::CallOp caller) { hasCaller = true; });
+
+      assert(!hasCaller && "A scop.stmt cannot call another function.");
+    }
+
+    // Iterate every scop.stmt that should be inlined.
+    for (FuncOp stmt : stmts) {
+      for (FuncOp host : hosts) {
+        if (failed(inlineScopStmtWithinFunction(host, stmt, b)))
+          return;
+      }
+      stmt.erase();
+    }
+  }
+};
+
+} // namespace
+
 void phism::registerLoopTransformPasses() {
   PassRegistration<AnnotatePointLoopsPass>(
       "annotate-point-loops", "Annotate loops with point/tile info.");
   PassRegistration<ExtractPointLoopsPass>(
       "extract-point-loops", "Extract point loop bands into functions");
+  PassRegistration<RedistributeScopStatementsPass>(
+      "redis-scop-stmts",
+      "Redistribute scop statements across extracted point loops.");
+  PassRegistration<LoopMergePass>("loop-merge",
+                                  "Merge loops by affine analysis.");
 
   PassPipelineRegistration<>(
       "improve-pipelining", "Improve the pipelining performance",
@@ -497,7 +1115,16 @@ void phism::registerLoopTransformPasses() {
         pm.addPass(std::make_unique<AnnotatePointLoopsPass>());
         pm.addPass(std::make_unique<ExtractPointLoopsPass>(pipelineOptions));
         pm.addPass(createCanonicalizerPass());
-        // only those private functions will be inlined.
-        pm.addPass(createInlinerPass());
+      });
+
+  PassPipelineRegistration<>(
+      "loop-redis-and-merge", "Redistribute stmts and merge loops.",
+      [](OpPassManager &pm) {
+        pm.addPass(std::make_unique<RedistributeScopStatementsPass>());
+        pm.addPass(createCanonicalizerPass());
+        pm.addPass(std::make_unique<LoopMergePass>());
+        pm.addPass(createCanonicalizerPass());
+        pm.addPass(std::make_unique<ScopStmtInlinePass>());
+        pm.addPass(createCanonicalizerPass());
       });
 }
diff --git a/lib/mlir/Transforms/Utils.cc b/lib/mlir/Transforms/Utils.cc
new file mode 100644
index 00000000000..2d00c90cb05
--- /dev/null
+++ b/lib/mlir/Transforms/Utils.cc
@@ -0,0 +1,33 @@
+//===- Utils.cc - Utility functions ------------------ C++-===//
+
+#include "phism/mlir/Transforms/Utils.h"
+
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+
+using namespace mlir;
+using namespace llvm;
+using namespace phism;
+
+/// Whether `f` contains at least one call annotated with "scop.pe".
+static bool hasPeCaller(FuncOp f) {
+  bool found = false;
+  f.walk([&found](CallOp call) {
+    found = found || call->hasAttr("scop.pe");
+  });
+  return found;
+}
+
+namespace phism {
+/// Find the function in `m` for which hasPeCaller holds, or a null FuncOp if
+/// there is none. More than one such function trips an assertion.
+FuncOp getTopFunction(ModuleOp m) {
+  FuncOp result = nullptr;
+  m.walk([&result](FuncOp candidate) {
+    if (!hasPeCaller(candidate))
+      return;
+    assert(!result && "There should be only one top function.");
+    result = candidate;
+  });
+  return result;
+}
+} // namespace phism
diff --git a/python/utils/polybench.py b/python/utils/polybench.py
index ed559c6ffb2..38f587ba22f 100644
--- a/python/utils/polybench.py
+++ b/python/utils/polybench.py
@@ -18,8 +18,6 @@
 
 import pandas as pd
 
-logger = logging.getLogger(__name__)
-
 POLYBENCH_DATASETS = ("MINI", "SMALL", "MEDIUM", "LARGE", "EXTRALARGE")
 POLYBENCH_EXAMPLES = (
     "2mm",
@@ -92,6 +90,8 @@ class PbFlowOptions:
     max_span: int = -1
     tile_sizes: Optional[List[int]] = None
     array_partition: bool = False
+    skip_vitis: bool = False
+    skip_csim: bool = False  # Given cosim = True, you can still turn down csim.
 
 
 # ----------------------- Utility functions ------------------------------------
@@ -375,19 +375,55 @@ def get_module_parameters(file: str, module_name: str) -> List[str]:
     end_line = next(i for i, l in enumerate(lines) if ");" in l)
 
     params = (" ".join(line for line in lines[start_line + 1 : end_line])).split(",")
-    return [param.strip() for param in params]
+    return [param.strip() for param in params if param.strip()]
+
+
+def get_autotb_parameters(file: str) -> List[str]:
+    """Read the port names connected in an autotb file's DUT instantiation.
+
+    A ");" on a line of its own and empty entries are tolerated.
+    """
+    assert os.path.isfile(file)
+    assert file.endswith(".autotb.v")
+
+    with open(file, "r") as f:
+        lines = [line.strip() for line in f]
+
+    start_line = next(
+        i for i, l in enumerate(lines) if "`AUTOTB_DUT `AUTOTB_DUT_INST(" in l
+    )
+    end_line = next(i for i, l in enumerate(lines) if ");" in l and i > start_line)
+
+    # Deal with things like -
+    # .ap_clk(ap_clk),
+    # .ap_rst(ap_rst),
+    conns = " ".join(lines[start_line + 1 : end_line + 1]).split(",")
+    params = []
+    for conn in conns:
+        conn = conn.strip()
+        if conn.endswith(");"):
+            # Strip again: the terminator may sit on a line of its own.
+            conn = conn[:-2].strip()
+        if not conn:
+            continue
+        assert conn[0] == "." and "(" in conn and conn[-1] == ")"
+        param = conn.split("(")[0][1:]
+        assert param == conn.split("(")[1][:-1]
+        params.append(param)
+
+    return params
 
 
 def get_memory_interfaces(params: List[str]):
     """Parse memory interfaces from the module params."""
     interfaces = OrderedDict()
     for param in params:
-        prefix = param.split("_")[0]
+        prefix = "_".join(param.split("_")[:-1])
         if prefix not in interfaces:
             interfaces[prefix] = []
         if param.startswith("ap") or "_" not in param:
             continue
-        interfaces[prefix].append(param.split("_")[1])
+        interfaces[prefix].append(param.split("_")[-1])
 
     return [
         ApMemoryInterface(name, ports)
@@ -415,32 +451,26 @@ def is_read_write_conflict(
     )
 
 
-def fix_cosim_kernels(dir: str) -> CosimFixStrategy:
-    """Fix issues with co-simulation.
-    Returns directives for (source, destination).
-    """
-
-    dir = os.path.abspath(dir)  # canonicalize path
-    kernel_name = f"kernel_{os.path.basename(dir)}"
-
-    src_proj_dir = os.path.join(dir, "proj", "solution1")
-    assert os.path.isdir(src_proj_dir)
-
-    dst_proj_dir = os.path.join(dir, "tb.backup", "solution1")
-    assert os.path.isdir(dst_proj_dir)
-
-    src_kernel = os.path.join(src_proj_dir, "syn", "verilog", f"{kernel_name}.v")
-    assert os.path.isfile(src_kernel)
+def is_cosim_interface_matched(
+    src_mems: List[ApMemoryInterface], dst_mems: List[ApMemoryInterface]
+) -> bool:
+    if len(src_mems) != len(dst_mems):
+        return False
 
-    dst_kernel = os.path.join(dst_proj_dir, "syn", "verilog", f"{kernel_name}.v")
-    assert os.path.isfile(dst_kernel)
+    for src, dst in zip(src_mems, dst_mems):
+        if src.get_num_ports() != dst.get_num_ports():
+            return False
+        if set(src.ports) != set(dst.ports):
+            return False
 
-    src_params = get_module_parameters(src_kernel, kernel_name)
-    dst_params = get_module_parameters(dst_kernel, kernel_name)
+    return True
 
-    src_mems = get_memory_interfaces(src_params)
-    dst_mems = get_memory_interfaces(dst_params)
 
+def get_cosim_fix_strategy(
+    kernel_name: str,
+    src_mems: List[ApMemoryInterface],
+    dst_mems: List[ApMemoryInterface],
+) -> CosimFixStrategy:
     if len(src_mems) != len(dst_mems):
         raise RuntimeError("The number of ap_memory interfaces should be the same.")
     if [mem.name for mem in src_mems] != [mem.name for mem in dst_mems]:
@@ -496,6 +526,36 @@ def fix_cosim_kernels(dir: str) -> CosimFixStrategy:
     return strategy
 
 
+def fix_cosim_kernels(dir: str) -> CosimFixStrategy:
+    """Fix issues with co-simulation.
+    Returns directives for (source, destination).
+    """
+
+    dir = os.path.abspath(dir)  # canonicalize path
+    kernel_name = f"kernel_{os.path.basename(dir)}"
+
+    src_proj_dir = os.path.join(dir, "proj", "solution1")
+    assert os.path.isdir(src_proj_dir)
+
+    dst_proj_dir = os.path.join(dir, "tb.backup", "solution1")
+    assert os.path.isdir(dst_proj_dir)
+
+    src_kernel = os.path.join(src_proj_dir, "syn", "verilog", f"{kernel_name}.v")
+    assert os.path.isfile(src_kernel)
+
+    dst_kernel = os.path.join(dst_proj_dir, "syn", "verilog", f"{kernel_name}.v")
+    assert os.path.isfile(dst_kernel)
+
+    src_params = get_module_parameters(src_kernel, kernel_name)
+    dst_params = get_module_parameters(dst_kernel, kernel_name)
+
+    return get_cosim_fix_strategy(
+        kernel_name,
+        get_memory_interfaces(src_params),
+        get_memory_interfaces(dst_params),
+    )
+
+
 # ----------------------- Benchmark runners ---------------------------
 
 
@@ -609,6 +669,8 @@ def is_func_decl(item, name):
 exit
 """
 
+TBGEN_VITIS_TCL_FILES = 'add_files {{{src_dir}/{src_base}.c}} -cflags "-I {src_dir} -I {work_dir}/utilities -D {pb_dataset}_DATASET" -csimflags "-I {src_dir} -I {work_dir}/utilities -D{pb_dataset}_DATASET"\\nadd_files -tb {{{src_dir}/{src_base}.c {work_dir}/utilities/polybench.c}} -cflags "-I {src_dir} -I {work_dir}/utilities -D{pb_dataset}_DATASET" -csimflags "-I {src_dir} -I {work_dir}/utilities -D{pb_dataset}_DATASET"\\n'
+
 TBGEN_VITIS_TCL = """
 open_project -reset tb
 add_files {{{src_dir}/{src_base}.c}} -cflags "-I {src_dir} -I {work_dir}/utilities -D {pb_dataset}_DATASET" -csimflags "-I {src_dir} -I {work_dir}/utilities -D{pb_dataset}_DATASET"
@@ -655,11 +717,30 @@ def __init__(self, work_dir: str, options: PbFlowOptions):
         self.status = 0
         self.errmsg = "No Error"
 
+        # Logger
+        self.logger = logging.getLogger("pb-flow")
+        self.logger.setLevel(logging.DEBUG)
+
     def run(self, src_file):
         """Run the whole pb-flow on the src_file (*.c)."""
         self.cur_file = src_file
         self.c_source = src_file  # Will be useful in some later stages
 
+        base_dir = os.path.dirname(src_file)
+
+        # Setup logging
+        log_file = os.path.join(base_dir, f"pb-flow.log")
+        if os.path.isfile(log_file):
+            os.remove(log_file)
+
+        formatter = logging.Formatter(
+            "[%(asctime)s][%(name)s][%(levelname)s] %(message)s"
+        )
+        fh = logging.FileHandler(log_file)
+        fh.setFormatter(formatter)
+        fh.setLevel(logging.DEBUG)
+        self.logger.addHandler(fh)
+
         # The whole flow
         try:
             (
@@ -669,12 +750,17 @@ def run(self, src_file):
                 .split_statements()
                 .extract_top_func()
                 .polymer_opt()
-                .loop_transforms()
                 .constant_args()
+                .loop_transforms()
                 .array_partition()
                 .lower_llvm()
                 .vitis_opt()
-                .run_vitis()
+                .write_tb_tcl_by_llvm()
+                .run_vitis_on_phism()
+                .run_tbgen_csim()
+                .backup_csim_results()
+                .copy_design_from_phism_to_tb()
+                .run_cosim()
             )
         except Exception as e:
             self.status = 1
@@ -685,16 +771,26 @@ def run_command(
     ):
         """Single entry for running a command."""
         kwargs.update({"cwd": os.path.dirname(self.cur_file)})
+
         if cmd_list:
+            cmd_ = " \\\n\t".join(cmd_list)
+            self.logger.debug(f"{cmd_}")
             if self.options.dry_run:
                 print(" ".join(cmd_list))
                 return
-            return subprocess.run(cmd_list, **kwargs)
+            proc = subprocess.run(cmd_list, **kwargs)
         else:
+            self.logger.debug(f"{cmd}")
             if self.options.dry_run:
                 print(cmd)
                 return
-            return subprocess.run(cmd, **kwargs)
+            proc = subprocess.run(cmd, **kwargs)
+
+        cmd_str = cmd if cmd else " ".join(cmd_list)
+        if proc.returncode != 0:
+            raise RuntimeError(f"{cmd_str} failed.")
+
+        return proc
 
     def get_program_abspath(self, program: str) -> str:
         """Get the absolute path of a program."""
@@ -708,7 +804,8 @@ def generate_tile_sizes(self):
         tile_file = os.path.join(base_dir, "tile.sizes")
 
         if not self.options.tile_sizes:
-            shutil.rmtree(tile_file, ignore_errors=True)
+            if os.path.isfile(tile_file):
+                os.remove(tile_file)
             return self
 
         with open(tile_file, "w") as f:
@@ -865,6 +962,8 @@ def loop_transforms(self):
             self.get_program_abspath("phism-opt"),
             src_file,
             f'-loop-transforms="max-span={self.options.max_span}"',
+            "-loop-redis-and-merge",
+            "-debug-only=loop-transforms",
         ]
 
         self.run_command(
@@ -887,10 +986,16 @@ def array_partition(self):
         )
         log_file = self.cur_file.replace(".mlir", ".log")
 
+        array_partition_file = os.path.join(
+            os.path.dirname(self.cur_file), "array_partition.txt"
+        )
+        if os.path.isfile(array_partition_file):
+            os.remove(array_partition_file)
+
         args = [
             self.get_program_abspath("phism-opt"),
             src_file,
-            "-simple-array-partition",
+            "-simple-array-partition=dumpFile",
             "-debug-only=array-partition",
         ]
 
@@ -959,11 +1064,17 @@ def vitis_opt(self):
         src_file, self.cur_file = self.cur_file, self.cur_file.replace(
             ".llvm", ".vitis.llvm"
         )
+        log_file = self.cur_file.replace(".llvm", ".log")
 
         xln_names = get_top_func_param_names(
             self.c_source, self.work_dir, llvm_dir=os.path.join(self.root_dir, "llvm")
         )
 
+        # Whether array partition has been successful.
+        xln_ap_enabled = os.path.isfile(
+            os.path.join(os.path.dirname(self.cur_file), "array_partition.txt")
+        )
+
         args = [
             os.path.join(self.root_dir, "llvm", "build", "bin", "opt"),
             src_file,
@@ -980,15 +1091,260 @@ def vitis_opt(self):
             "-xlnanno",
             '-xlntop="{}"'.format(get_top_func(src_file)),
             '-xlnnames="{}"'.format(",".join(xln_names)),
+            "-xlnunroll" if self.options.loop_transforms else "",
+            "-xlnarraypartition" if self.options.array_partition else "",
+            "-xln-ap-enabled" if xln_ap_enabled else "",
             "-strip-attr",
-            "-xlnunroll",
-            "-xlnarraypartition",
+            "-debug",
         ]
 
         self.run_command(
             cmd=" ".join(args),
             shell=True,
             stdout=open(self.cur_file, "w"),
+            stderr=open(log_file, "w"),
+            env=self.env,
+        )
+
+        return self
+
+    def write_tb_tcl_by_llvm(self):
+        """Generate tbgen.tcl by running the -xlntbgen LLVM pass via opt."""
+        if self.options.skip_vitis:
+            return self
+
+        src_file = self.cur_file
+        base_dir = os.path.dirname(src_file)
+        top_func = get_top_func(src_file)
+
+        # Whether array partition has been successful.
+        xln_ap_enabled = os.path.isfile(os.path.join(base_dir, "array_partition.txt"))
+
+        tbgen_vitis_tcl = os.path.join(base_dir, "tbgen.tcl")
+
+        tb_tcl_log = os.path.join(base_dir, "write_tb_tcl_by_llvm.log")
+
+        # Write the TCL for TBGEN; opt's stdout goes to a log next to the source.
+        args = [
+            os.path.join(self.root_dir, "llvm", "build", "bin", "opt"),
+            src_file,
+            "-S",
+            "-enable-new-pm=0",
+            '-load "{}"'.format(
+                os.path.join(self.root_dir, "build", "lib", "VhlsLLVMRewriter.so")
+            ),
+            f'-xlntop="{top_func}"',
+            "-xlntbgen",
+            "-xln-ap-enabled" if xln_ap_enabled else "",
+            "-xlntbfilesettings=$'{}'".format(
+                TBGEN_VITIS_TCL_FILES.format(
+                    src_dir=base_dir,
+                    src_base=os.path.basename(src_file).split(".")[0],
+                    work_dir=self.work_dir,
+                    pb_dataset=self.options.dataset,
+                )
+            ),
+            f'-xlntbtclnames="{tbgen_vitis_tcl}"',
+        ]
+
+        self.run_command(
+            cmd=" ".join(args),
+            shell=True,
+            stdout=open(tb_tcl_log, "w"),
+            env=self.env,
+        )
+
+        return self
+
+    def run_vitis_on_phism(self):
+        """Run vitis_hls on the LLVM generated from Phism (no-op with --skip-vitis)."""
+        if self.options.skip_vitis:
+            self.logger.warning("Vitis won't run since --skip-vitis has been set.")
+            return self
+
+        src_file = self.cur_file
+        base_dir = os.path.dirname(src_file)
+        top_func = get_top_func(src_file)
+
+        phism_vitis_tcl = os.path.join(base_dir, "phism.tcl")
+        run_config = "config_bind -effort high"
+        if self.options.debug:
+            run_config = ""
+
+        # Generate dummy C code as the interface for the top function.
+        dummy_src = src_file.replace(".llvm", ".dummy.c")
+        with open(dummy_src, "w") as f:
+            f.write("void {}() {{}}".format(top_func))
+
+        # Write the TCL for Phism.
+        with open(phism_vitis_tcl, "w") as f:
+            phism_run_config = [str(run_config)]
+            f.write(
+                PHISM_VITIS_TCL.format(
+                    src_file=src_file,
+                    dummy_src=dummy_src,
+                    top_func=top_func,
+                    config="\n".join(phism_run_config),
+                )
+            )
+
+        log_file = os.path.join(base_dir, "phism.vitis_hls.stdout.log")
+
+        # Clean up old results
+        shutil.rmtree(os.path.join(base_dir, "proj"), ignore_errors=True)
+        if os.path.isfile(log_file):
+            os.remove(log_file)
+
+        if self.options.dry_run:
+            return self
+
+        self.run_command(
+            cmd_list=["vitis_hls", phism_vitis_tcl],
+            stdout=open(log_file, "w"),
+            stderr=open(os.path.join(base_dir, "phism.vitis_hls.stderr.log"), "w"),
+            env=self.env,
+        )
+
+        return self
+
+    def run_tbgen_csim(self):
+        """Run vitis_hls on tbgen.tcl, which must already have been written."""
+        if not self.options.cosim:
+            self.logger.warning("Cosim won't run due to the input setting.")
+            return self
+        if self.options.skip_csim:
+            self.logger.warning("CSim is set to be skipped.")
+            return self
+
+        src_file = self.cur_file
+        base_dir = os.path.dirname(src_file)
+
+        tbgen_vitis_tcl = os.path.join(base_dir, "tbgen.tcl")
+        assert os.path.isfile(tbgen_vitis_tcl), f"{tbgen_vitis_tcl} should exist."
+
+        if self.options.dry_run:
+            return self
+
+        shutil.rmtree(os.path.join(base_dir, "tb"), ignore_errors=True)
+        log_file = os.path.join(base_dir, "tbgen.vitis_hls.stdout.log")
+        if os.path.isfile(log_file):
+            os.remove(log_file)
+
+        self.run_command(
+            cmd_list=["vitis_hls", tbgen_vitis_tcl],
+            stdout=open(log_file, "w"),
+            stderr=open(os.path.join(base_dir, "tbgen.vitis_hls.stderr.log"), "w"),
+            env=self.env,
+        )
+
+        return self
+
+    def backup_csim_results(self):
+        """Create a backup for the csim results."""
+        # TODO: make this --dry-run compatible
+        base_dir = os.path.dirname(self.cur_file)
+        tbgen_dir = os.path.join(base_dir, "tb")
+        assert os.path.isdir(
+            tbgen_dir
+        ), f"tbgen_dir={tbgen_dir} isn't there, please don't skip csim in this case."
+
+        csim_dir = os.path.join(base_dir, "tb.csim")
+        if os.path.isdir(csim_dir):
+            self.logger.debug(f"csim_dir={csim_dir} exists, deleting it ...")
+            shutil.rmtree(csim_dir)
+
+        # Backup the tbgen (csim) results.
+        shutil.copytree(tbgen_dir, csim_dir)
+
+        return self
+
+    def copy_design_from_phism_to_tb(self):
+        """Copy design files from the Phism output to the testbench directory."""
+        # TODO: make this --dry-run compatible
+        src_file = self.cur_file
+        base_dir = os.path.dirname(src_file)
+        top_func = get_top_func(src_file)
+
+        # Check results
+        phism_syn_verilog_dir = os.path.join(
+            base_dir, "proj", "solution1", "syn", "verilog"
+        )
+        assert os.path.isdir(
+            phism_syn_verilog_dir
+        ), f"{phism_syn_verilog_dir} doesn't exist."
+
+        tbgen_syn_verilog_dir = os.path.join(
+            base_dir, "tb", "solution1", "syn", "verilog"
+        )
+        assert os.path.isdir(
+            tbgen_syn_verilog_dir
+        ), f"{tbgen_syn_verilog_dir} doesn't exist."
+
+        tbgen_sim_verilog_dir = os.path.join(
+            base_dir, "tb", "solution1", "sim", "verilog"
+        )
+        assert os.path.isdir(
+            tbgen_sim_verilog_dir
+        ), f"{tbgen_sim_verilog_dir} doesn't exist."
+
+        # Copy and paste the design files.
+        design_files = glob.glob(os.path.join(phism_syn_verilog_dir, "*.*"))
+        assert design_files, "There should exist design files."
+        for f in design_files:
+            shutil.copy(f, tbgen_syn_verilog_dir)
+
+        self.logger.debug("Design files found: \n" + "\n".join(design_files))
+
+        # Fix the inconsistency between the testbench and the design top.
+        phism_top = os.path.join(tbgen_syn_verilog_dir, f"{top_func}.v")
+        assert os.path.isfile(phism_top), f"The top module {phism_top} should exist."
+        autotb = os.path.join(tbgen_sim_verilog_dir, f"{top_func}.autotb.v")
+        assert os.path.isfile(autotb), f"The autotb file {autotb} should exist."
+
+        phism_params = get_module_parameters(phism_top, top_func)
+        self.logger.debug(
+            f"Parameters parsed from {phism_top}:\n" + "\n".join(phism_params)
+        )
+        autotb_params = get_autotb_parameters(autotb)
+        self.logger.debug(
+            f"Parameters parsed from {autotb}:\n" + "\n".join(autotb_params)
+        )
+
+        phism_mems = get_memory_interfaces(phism_params)
+        self.logger.debug(
+            f"Parsed memory interfaces from {phism_top}:\n"
+            + "\n".join([str(m) for m in phism_mems])
+        )
+        autotb_mems = get_memory_interfaces(autotb_params)
+        self.logger.debug(
+            f"Parsed memory interfaces from {autotb}:\n"
+            + "\n".join([str(m) for m in autotb_mems])
+        )
+
+        if not is_cosim_interface_matched(phism_mems, autotb_mems):
+            print(get_cosim_fix_strategy(top_func, phism_mems, autotb_mems))
+
+        return self
+
+    def run_cosim(self):
+        """Run cosim.tcl"""
+        if not self.options.cosim:
+            self.logger.debug("cosim is skipped since --cosim has not been set.")
+            return self
+
+        src_file = self.cur_file
+        base_dir = os.path.dirname(src_file)
+
+        cosim_vitis_tcl = os.path.join(base_dir, "cosim.tcl")
+        with open(cosim_vitis_tcl, "w") as f:
+            f.write(COSIM_VITIS_TCL)
+
+        log_file = os.path.join(base_dir, "cosim.vitis_hls.stdout.log")
+
+        self.run_command(
+            cmd_list=["vitis_hls", cosim_vitis_tcl],
+            stdout=open(log_file, "w"),
+            stderr=open(os.path.join(base_dir, "cosim.vitis_hls.stderr.log"), "w"),
             env=self.env,
         )
 
@@ -996,6 +1352,9 @@ def vitis_opt(self):
 
     def run_vitis(self, strategy: Optional[CosimFixStrategy] = None):
         """Run synthesize/testbench generation/co-simulation."""
+        if self.options.skip_vitis:
+            return self
+
         src_file = self.cur_file
         base_dir = os.path.dirname(src_file)
         top_func = get_top_func(src_file)
@@ -1028,22 +1387,23 @@ def run_vitis(self, strategy: Optional[CosimFixStrategy] = None):
                 )
             )
 
-        # Write the TCL for TBGEN.
-        with open(tbgen_vitis_tcl, "w") as f:
-            tbgen_run_config = [str(run_config)]
-            if strategy:
-                tbgen_run_config.extend(strategy.tbgen_directives)
-
-            f.write(
-                TBGEN_VITIS_TCL.format(
-                    src_dir=base_dir,
-                    src_base=os.path.basename(src_file).split(".")[0],
-                    top_func=top_func,
-                    work_dir=self.work_dir,
-                    config="\n".join(tbgen_run_config),
-                    pb_dataset=self.options.dataset,
-                )
-            )
+        # Keep it for now in case we need C baseline simulation?
+        # with open(tbgen_vitis_tcl, "w") as f:
+        #     tbgen_run_config = [str(run_config)]
+        #     if strategy:
+        #         tbgen_run_config.extend(strategy.tbgen_directives)
+        #     f.write(
+        #         TBGEN_VITIS_TCL.format(
+        #             src_dir=base_dir,
+        #             src_base=os.path.basename(src_file).split(".")[0],
+        #             top_func=top_func,
+        #             work_dir=self.work_dir,
+        #             config="\n".join(tbgen_run_config),
+        #             pb_dataset=self.options.dataset,
+        #         )
+        #     )
+
+        # Write the TCL for COSIM.
         with open(cosim_vitis_tcl, "w") as f:
             f.write(COSIM_VITIS_TCL)
 
@@ -1180,6 +1540,9 @@ def pb_flow_runner(options: PbFlowOptions):
     """Run pb-flow with the provided arguments."""
     assert os.path.isdir(options.pb_dir)
 
+    if not options.examples:
+        options.examples = POLYBENCH_EXAMPLES
+
     # Copy all the files from the source pb_dir to a target temporary directory.
     if not options.work_dir:
         options.work_dir = os.path.join(
@@ -1204,5 +1567,7 @@ def pb_flow_runner(options: PbFlowOptions):
     end = timer()
     print("Elapsed time: {:.6f} sec".format(end - start))
 
-    print(">>> Dumping report ... ")
-    pb_flow_dump_report(options)
+    # Will only dump report if Vitis has been run.
+    if not options.skip_vitis:
+        print(">>> Dumping report ... ")
+        pb_flow_dump_report(options)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000000..42596e21844
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,66 @@
+argon2-cffi==21.1.0
+attrs==21.2.0
+backcall==0.2.0
+black==21.8b0
+bleach==4.1.0
+cffi==1.14.6
+click==8.0.1
+debugpy==1.4.3
+decorator==5.1.0
+defusedxml==0.7.1
+entrypoints==0.3
+ipykernel==6.4.1
+ipython==7.27.0
+ipython-genutils==0.2.0
+ipywidgets==7.6.4
+jedi==0.18.0
+Jinja2==3.0.1
+jsonschema==3.2.0
+jupyter==1.0.0
+jupyter-client==7.0.2
+jupyter-console==6.4.0
+jupyter-core==4.7.1
+jupyterlab-pygments==0.1.2
+jupyterlab-widgets==1.0.1
+MarkupSafe==2.0.1
+matplotlib-inline==0.1.3
+mistune==0.8.4
+mypy-extensions==0.4.3
+nbclient==0.5.4
+nbconvert==6.1.0
+nbformat==5.1.3
+nest-asyncio==1.5.1
+notebook==6.4.3
+numpy==1.21.2
+packaging==21.0
+pandas==1.3.3
+pandocfilters==1.4.3
+parso==0.8.2
+pathspec==0.9.0
+pexpect==4.8.0
+pickleshare==0.7.5
+platformdirs==2.3.0
+prometheus-client==0.11.0
+prompt-toolkit==3.0.20
+ptyprocess==0.7.0
+pycparser==2.20
+Pygments==2.10.0
+pyparsing==2.4.7
+pyrsistent==0.18.0
+python-dateutil==2.8.2
+pytz==2021.1
+pyzmq==22.2.1
+qtconsole==5.1.1
+QtPy==1.11.0
+regex==2021.8.28
+Send2Trash==1.8.0
+six==1.16.0
+terminado==0.12.1
+testpath==0.5.0
+tomli==1.2.1
+tornado==6.1
+traitlets==5.1.0
+typing-extensions==3.10.0.2
+wcwidth==0.2.5
+webencodings==0.5.1
+widgetsnbextension==3.5.1
diff --git a/scripts/build-llvm.sh b/scripts/build-llvm.sh
index d5b714ecfc2..66a1760b534 100755
--- a/scripts/build-llvm.sh
+++ b/scripts/build-llvm.sh
@@ -36,26 +36,27 @@ mkdir -p build
 cd build
 
 # Configure CMake
-export CC=gcc
-export CXX=g++ 
-cmake ../llvm \
-  -DLLVM_ENABLE_PROJECTS="mlir;llvm;clang" \
-  -DCMAKE_BUILD_TYPE=RELEASE \
-  -DLLVM_BUILD_EXAMPLES=OFF \
-  -DLLVM_TARGETS_TO_BUILD="host" \
-  -DLLVM_OPTIMIZED_TABLEGEN=ON \
-  -DLLVM_ENABLE_OCAMLDOC=OFF \
-  -DLLVM_ENABLE_BINDINGS=OFF \
-  -DLLVM_INSTALL_UTILS=ON \
-  -DLLVM_ENABLE_ASSERTIONS=ON \
-  -DBUILD_POLYMER=ON \
-  -DPLUTO_LIBCLANG_PREFIX="$(llvm-config --prefix)" \
-  -G "${CMAKE_GENERATOR}"
+if [ ! -f "CMakeCache.txt" ]; then
+  export CC=gcc
+  export CXX=g++ 
+  cmake ../llvm \
+    -DLLVM_ENABLE_PROJECTS="mlir;llvm;clang" \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DLLVM_BUILD_EXAMPLES=OFF \
+    -DLLVM_TARGETS_TO_BUILD="host" \
+    -DLLVM_OPTIMIZED_TABLEGEN=ON \
+    -DLLVM_ENABLE_OCAMLDOC=OFF \
+    -DLLVM_ENABLE_BINDINGS=OFF \
+    -DLLVM_INSTALL_UTILS=ON \
+    -DLLVM_ENABLE_ASSERTIONS=ON \
+    -DBUILD_POLYMER=ON \
+    -DPLUTO_LIBCLANG_PREFIX="$(llvm-config --prefix)" \
+    -G "${CMAKE_GENERATOR}"
+fi 
  
 # Run building
-cmake --build . --target all -- -j "$(nproc)"
-
-if [ "${TARGET}" == "ci" ]; then
-  # Run test
-  cmake --build . --target check-llvm -- -j "$(nproc)"
+if [ "${CMAKE_GENERATOR}" == "Ninja" ]; then
+  ninja
+else 
+  make -j "$(nproc)"
 fi
diff --git a/scripts/pb-flow.py b/scripts/pb-flow.py
index 4d2970de534..19a848aee50 100755
--- a/scripts/pb-flow.py
+++ b/scripts/pb-flow.py
@@ -66,6 +66,10 @@ def main():
     parser.add_argument(
         "--array-partition", action="store_true", help="Use array partition."
     )
+    parser.add_argument("--skip-vitis", action="store_true", help="Don't run Vitis.")
+    parser.add_argument(
+        "--skip-csim", action="store_true", help="Don't run tbgen (csim)."
+    )
     args = parser.parse_args()
 
     options = pb_utils.PbFlowOptions(**vars(args))
diff --git a/test/llvm/Transforms/VhlsLLVMRewriter/matmul.mlir b/test/llvm/Transforms/VhlsLLVMRewriter/matmul.mlir
index bcf0db62aad..9adbffe6db4 100644
--- a/test/llvm/Transforms/VhlsLLVMRewriter/matmul.mlir
+++ b/test/llvm/Transforms/VhlsLLVMRewriter/matmul.mlir
@@ -1,6 +1,7 @@
 // RUN: mlir-opt -lower-affine -convert-scf-to-std -convert-std-to-llvm='use-bare-ptr-memref-call-conv=1' %s | mlir-translate -mlir-to-llvmir | opt -enable-new-pm=0 -load ${PHISM_LIBS_DIR}/VhlsLLVMRewriter.so -mem2arr -instcombine -strip-debug -S | FileCheck %s 
 
-// CHECK: define void @matmul([200 x [300 x float]]* %[[A:.*]], [300 x [400 x float]]* %[[B:.*]], [200 x [400 x float]]* %[[C:.*]]) {
+// CHECK: noinline
+// CHECK: define void @matmul([200 x [300 x float]]* %[[A:.*]], [300 x [400 x float]]* %[[B:.*]], [200 x [400 x float]]* %[[C:.*]]) #[[ATTR:.*]]
 func @matmul(%A: memref<200x300xf32>, %B: memref<300x400xf32>, %C: memref<200x400xf32>) {
   affine.for %i = 0 to 200 {
     affine.for %j = 0 to 400 {
@@ -16,22 +17,9 @@ func @matmul(%A: memref<200x300xf32>, %B: memref<300x400xf32>, %C: memref<200x40
   return
 }
 
-// CHECK: %[[VAL14:.*]] = mul i64 %[[I:.*]], 300
-// CHECK: %[[VAL15:.*]] = add i64 %[[VAL14]], %[[K:.*]]
-// CHECK: %[[GEP0IDX0:.*]] = udiv i64 %[[VAL15]], 300
-// CHECK: %[[GEP0ADDR1:.*]] = urem i64 %[[VAL15]], 300
-// CHECK: %[[GEP0:.*]] = getelementptr inbounds [200 x [300 x float]], [200 x [300 x float]]* %[[A]], i64 0, i64 %[[GEP0IDX0]], i64 %[[GEP0ADDR1]]
-// CHECK: %[[VAL16:.*]] = load float, float* %[[GEP0]], align 4
-// CHECK: %[[VAL17:.*]] = mul i64 %[[K]], 400
-// CHECK: %[[VAL18:.*]] = add i64 %[[VAL17]], %[[J:.*]]
-// CHECK: %[[GEP1IDX0:.*]] = udiv i64 %[[VAL18]], 400
-// CHECK: %[[GEP1ADDR1:.*]] = urem i64 %[[VAL18]], 400
-// CHECK: %[[GEP1:.*]] = getelementptr inbounds [300 x [400 x float]], [300 x [400 x float]]* %[[B]], i64 0, i64 %[[GEP1IDX0]], i64 %[[GEP1ADDR1]]
-// CHECK: %[[VAL19:.*]] = load float, float* %[[GEP1]], align 4
-// CHECK: %[[VAL20:.*]] = fmul float %[[VAL16]], %[[VAL19]]
-// CHECK: %[[VAL21:.*]] = mul i64 %[[I]], 400
-// CHECK: %[[VAL22:.*]] = add i64 %[[VAL21]], %[[J]]
-// CHECK: %[[GEP2IDX0:.*]] = udiv i64 %[[VAL22]], 400
-// CHECK: %[[GEP2ADDR1:.*]] = urem i64 %[[VAL22]], 400
-// CHECK: %[[GEP2:.*]] = getelementptr inbounds [200 x [400 x float]], [200 x [400 x float]]* %[[C]], i64 0, i64 %[[GEP2IDX0]], i64 %[[GEP2ADDR1]]
-// CHECK: store float %[[VAL20]], float* %[[GEP2]], align 4
+// CHECK: %[[i:.*]] = phi i64
+// CHECK: %[[j:.*]] = phi i64
+// CHECK: %[[k:.*]] = phi i64
+// CHECK: getelementptr inbounds [200 x [300 x float]], [200 x [300 x float]]* %[[A]], i64 0, i64 %[[i]], i64 %[[k]]
+// CHECK: getelementptr inbounds [300 x [400 x float]], [300 x [400 x float]]* %[[B]], i64 0, i64 %[[k]], i64 %[[j]]
+// CHECK: getelementptr inbounds [200 x [400 x float]], [200 x [400 x float]]* %[[C]], i64 0, i64 %[[i]], i64 %[[j]]
diff --git a/test/mlir/Transforms/ArrayPartition/no-scop-pe-caller.mlir b/test/mlir/Transforms/ArrayPartition/no-scop-pe-caller.mlir
new file mode 100644
index 00000000000..2237f1332f2
--- /dev/null
+++ b/test/mlir/Transforms/ArrayPartition/no-scop-pe-caller.mlir
@@ -0,0 +1,27 @@
+// RUN: phism-opt -simple-array-partition %s  -verify-diagnostics | FileCheck %s
+
+#map0 = affine_map<()[s0] -> (s0 * 32)>
+#map1 = affine_map<()[s0] -> (s0 * 32 + 32)>
+
+// expected-remark@below {{No top function found}}
+module {
+
+// CHECK-LABEL: @bar
+func @bar(%A: memref<64xf32>, %i: index) {
+  affine.for %j = #map0()[%i] to #map1()[%i] {
+    %0 = affine.load %A[%j] : memref<64xf32>
+    %1 = addf %0, %0 : f32
+    affine.store %1, %A[%j] : memref<64xf32>
+  }
+  return
+}
+
+// CHECK-LABEL: @foo
+func @foo(%A: memref<64xf32>) {
+  affine.for %i = 0 to 2 {
+    call @bar(%A, %i) : (memref<64xf32>, index) -> ()
+  }
+  return
+}
+
+}
diff --git a/test/mlir/Transforms/ArrayPartition/simple-partition.mlir b/test/mlir/Transforms/ArrayPartition/simple-partition.mlir
new file mode 100644
index 00000000000..80a7d6903a8
--- /dev/null
+++ b/test/mlir/Transforms/ArrayPartition/simple-partition.mlir
@@ -0,0 +1,30 @@
+// RUN: phism-opt -simple-array-partition %s | FileCheck %s
+
+#map0 = affine_map<()[s0] -> (s0 * 32)>
+#map1 = affine_map<()[s0] -> (s0 * 32 + 32)>
+// CHECK: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
+
+// CHECK: func @bar(%[[ARG0:.*]]: memref<32xf32>, %[[ARG1:.*]]: index)
+func @bar(%A: memref<64xf32>, %i: index) {
+  // CHECK: affine.for %[[ARG2:.*]] =  
+  affine.for %j = #map0()[%i] to #map1()[%i] {
+    // CHECK: affine.load %[[ARG0]][%[[ARG2]] mod 32]
+    %0 = affine.load %A[%j] : memref<64xf32>
+    %1 = addf %0, %0 : f32
+    // CHECK: affine.store %{{.*}}, %[[ARG0]][%[[ARG2]] mod 32]
+    affine.store %1, %A[%j] : memref<64xf32>
+  }
+  return
+}
+
+// CHECK: func @foo(%[[ARG0:.*]]: memref<2x32xf32>)
+func @foo(%A: memref<64xf32>) {
+  // CHECK: affine.for %[[ARG1:.*]] = 0 to 2 
+  affine.for %i = 0 to 2 {
+    // CHECK: %[[VAL0:.*]] = memref.subview %[[ARG0]][%[[ARG1]], 0] [1, 32] [1, 1] : memref<2x32xf32> to memref<32xf32, #[[MAP2]]>
+    // CHECK-NEXT: %[[VAL1:.*]] = memref.cast %[[VAL0]] : memref<32xf32, #[[MAP2]]> to memref<32xf32>
+    // CHECK-NEXT: call @bar(%[[VAL1]], %[[ARG1]]) {scop.pe} : (memref<32xf32>, index) -> ()
+    call @bar(%A, %i) {scop.pe} : (memref<64xf32>, index) -> ()
+  }
+  return
+}
diff --git a/test/mlir/Transforms/LoopTransforms/loop-merge.mlir b/test/mlir/Transforms/LoopTransforms/loop-merge.mlir
new file mode 100644
index 00000000000..1a099e15723
--- /dev/null
+++ b/test/mlir/Transforms/LoopTransforms/loop-merge.mlir
@@ -0,0 +1,27 @@
+// RUN: phism-opt -loop-merge %s | FileCheck %s
+
+func @S0(%A: memref<32xf32>, %i: index) attributes {scop.stmt} {
+  %0 = affine.load %A[%i] : memref<32xf32>
+  %1 = addf %0, %0 : f32
+  affine.store %1, %A[%i] : memref<32xf32>
+  return
+}
+
+func @two_loops(%A: memref<32xf32>) {
+  affine.for %i = 0 to 16 {
+    call @S0(%A, %i) : (memref<32xf32>, index) -> ()
+  }
+  affine.for %i = 16 to 32 {
+    call @S0(%A, %i) : (memref<32xf32>, index) -> ()
+  }
+  return
+}
+
+// CHECK: func @two_loops
+// CHECK: affine.for %[[ARG0:.*]] = 0 to 32
+// CHECK: call @S0(%{{.*}}, %[[ARG0]])
+
+func @top(%A : memref<32xf32>) {
+  call @two_loops(%A) {scop.pe} : (memref<32xf32>) -> ()
+  return 
+}
diff --git a/test/mlir/Transforms/LoopTransforms/redis-scop-stmts.mlir b/test/mlir/Transforms/LoopTransforms/redis-scop-stmts.mlir
new file mode 100644
index 00000000000..3ed19d9546a
--- /dev/null
+++ b/test/mlir/Transforms/LoopTransforms/redis-scop-stmts.mlir
@@ -0,0 +1,40 @@
+// RUN: phism-opt %s -redis-scop-stmts | FileCheck %s
+
+func @S0(%A: memref<32xf32>, %i: index) attributes {scop.stmt} {
+  %0 = affine.load %A[%i] : memref<32xf32>
+  %1 = addf %0, %0 : f32
+  affine.store %1, %A[%i] : memref<32xf32>
+  return
+}
+
+func @S1(%A: memref<32xf32>, %i: index) attributes {scop.stmt} {
+  %0 = affine.load %A[%i] : memref<32xf32>
+  %1 = mulf %0, %0 : f32
+  affine.store %1, %A[%i] : memref<32xf32>
+  return
+}
+
+func @two_stmts(%A: memref<32xf32>, %B: memref<32xf32>) {
+  affine.for %i = 0 to 32 {
+    call @S0(%A, %i) : (memref<32xf32>, index) -> ()
+    call @S1(%B, %i) : (memref<32xf32>, index) -> ()
+  }
+  return
+}
+
+// CHECK: func @two_stmts__cloned_for__S1(%[[ARG0:.*]]: memref<32xf32>)
+// CHECK: affine.for %[[ARG1:.*]] = 0 to 32 
+// CHECK: call @S1(%[[ARG0]], %[[ARG1]])
+
+// CHECK: func @two_stmts__cloned_for__S0(%[[ARG0:.*]]: memref<32xf32>)
+// CHECK: affine.for %[[ARG1:.*]] = 0 to 32 
+// CHECK: call @S0(%[[ARG0]], %[[ARG1]])
+
+// CHECK: func @top(%[[ARG0:.*]]: memref<32xf32>, %[[ARG1:.*]]: memref<32xf32>)
+// CHECK: call @two_stmts__cloned_for__S0(%[[ARG0]])
+// CHECK: call @two_stmts__cloned_for__S1(%[[ARG1]])
+
+func @top(%A : memref<32xf32>, %B : memref<32xf32>) {
+  call @two_stmts(%A, %B) {scop.pe} : (memref<32xf32>, memref<32xf32>) -> ()
+  return 
+}