diff --git a/.gitmodules b/.gitmodules
index 8472d78404e..335e1dbd9c8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -82,3 +82,6 @@
 [submodule "contrib/cpu_features"]
 	path = contrib/cpu_features
 	url = https://github.com/google/cpu_features
+[submodule "contrib/arm-optimized-routines"]
+	path = contrib/arm-optimized-routines
+	url = https://github.com/ARM-software/optimized-routines
diff --git a/README.md b/README.md
index aa64e39d5ba..ab996b6f3d6 100644
--- a/README.md
+++ b/README.md
@@ -253,7 +253,7 @@ ninja tiflash
 tiup playground nightly --tiflash.binpath $BUILD/dbms/src/Server/tiflash
 ```
 3. Check $WORKSPACE/tests/_env.sh to make the port and build dir right.
-4. Run your integration tests using commands like "./run-test.sh fullstack-test2/ddl" under $WORKSPACE dir
+4. Run your integration tests using commands like "./run-test.sh fullstack-test2/ddl" under $WORKSPACE/tests dir
 
 ## Run MicroBenchmark Tests
 
@@ -261,7 +261,7 @@ To run micro benchmark tests, you need to build with -DCMAKE_BUILD_TYPE=RELEASE
 
 ```shell
 cd $BUILD
-cmake $WORKSPACE/tiflash -GNinja -DCMAKE_BUILD_TYPE=DEBUG -DENABLE_TESTS=ON
+cmake $WORKSPACE/tiflash -GNinja -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_TESTS=ON
 ninja bench_dbms       
 ```
 
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 71f81ae3ee5..4520d1cb176 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -165,3 +165,7 @@ add_subdirectory(benchmark)
 
 set (BUILD_TESTING OFF CACHE BOOL "Disable cpu-features testing" FORCE)
 add_subdirectory(cpu_features)
+
+if (ARCH_AARCH64 AND ARCH_LINUX)
+    add_subdirectory(arm-optimized-routines-cmake)
+endif ()
diff --git a/contrib/arm-optimized-routines b/contrib/arm-optimized-routines
new file mode 160000
index 00000000000..e373f659523
--- /dev/null
+++ b/contrib/arm-optimized-routines
@@ -0,0 +1 @@
+Subproject commit e373f6595230087a8ddea449bfb14b47150b4059
diff --git a/contrib/arm-optimized-routines-cmake/CMakeLists.txt b/contrib/arm-optimized-routines-cmake/CMakeLists.txt
new file mode 100644
index 00000000000..89baa7222f3
--- /dev/null
+++ b/contrib/arm-optimized-routines-cmake/CMakeLists.txt
@@ -0,0 +1,45 @@
+# Copyright 2022 PingCAP, Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This library overrides performance-critical routines for aarch64 targets.
+# The implementations are imported from the official ARM repo.
+# To reduce dispatching cost, the indirect function (ifunc) technique is used. Therefore,
+# this library should only be enabled for ELF targets.
+
+# Considerations:
+# - As of June 2022, most enterprise OSs (CentOS 7, CentOS Stream 8 and RHEL 8) still
+#   ship a relatively old glibc on ARM64, in which ASIMD, MTE, DC ZVA and SVE are not
+#   fully utilized. Meanwhile, ARM64 instances are becoming increasingly common in
+#   cloud-native deployments.
+# - The `optimized-routines` repo is actively maintained by ARM. Therefore, its quality
+#   can be relied upon, and using it also lets us keep in sync with the latest
+#   acceleration techniques.
+
+set(CMAKE_C_FLAGS "") # clear CMAKE_C_FLAGS for this directory scope before the languages are enabled
+enable_language(C)
+enable_language(ASM)
+set(TIFLASH_AOR_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../arm-optimized-routines") # submodule checkout, anchored to this file's directory
+
+file(GLOB TIFLASH_AARCH64_STRING_FILES "${TIFLASH_AOR_DIR}/string/aarch64/*.S")
+add_library(tiflash-aarch64-string STATIC ${TIFLASH_AARCH64_STRING_FILES} src/aor.c)
+target_compile_options(tiflash-aarch64-string PRIVATE -march=armv8-a+sve) # NOTE(review): presumably required to build the *_sve variants used by src/aor.c - confirm
+target_include_directories(tiflash-aarch64-string PRIVATE "${TIFLASH_AOR_DIR}/string/include")
+
+file(GLOB TIFLASH_AARCH64_MATH_FILES "${TIFLASH_AOR_DIR}/math/*.c")
+add_library(tiflash-aarch64-math STATIC ${TIFLASH_AARCH64_MATH_FILES})
+target_include_directories(tiflash-aarch64-math PRIVATE "${TIFLASH_AOR_DIR}/math/include")
+
+# it is reasonable to keep these libraries optimized
+target_compile_options(tiflash-aarch64-string PRIVATE -O3 -g3 -fno-omit-frame-pointer -ffunction-sections -fdata-sections)
+target_compile_options(tiflash-aarch64-math PRIVATE -O3 -g3 -fno-omit-frame-pointer -ffunction-sections -fdata-sections)
diff --git a/contrib/arm-optimized-routines-cmake/src/aor.c b/contrib/arm-optimized-routines-cmake/src/aor.c
new file mode 100644
index 00000000000..daff1df3c4b
--- /dev/null
+++ b/contrib/arm-optimized-routines-cmake/src/aor.c
@@ -0,0 +1,115 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <stringlib.h>
+#include <sys/auxv.h>
+
+// Provide default macro definitions in case that they are not defined on current linux distro.
+// For example, TiFlash compiled on older linux kernels may also be used in newer ones.
+// These values should be stable for Linux: only false negative is expected when running on
+// older kernels, but it is acceptable as `google/cpu_features` is also doing so.
+#ifndef HWCAP2_MTE
+#define HWCAP2_MTE (1 << 18)
+#endif
+
+#ifndef HWCAP_SVE
+#define HWCAP_SVE (1 << 22)
+#endif
+
+#ifndef AT_HWCAP2
+#define AT_HWCAP2 26
+#endif
+
+#ifndef AT_HWCAP
+#define AT_HWCAP 16
+#endif
+
+/// Report whether MTE is available: true when the HWCAP2_MTE bit is set in the AT_HWCAP2 auxiliary-vector entry.
+static inline bool mte_supported(void)
+{
+    return 0 != (HWCAP2_MTE & getauxval(AT_HWCAP2));
+}
+
+/// Report whether SVE is available: true when the HWCAP_SVE bit is set in the AT_HWCAP auxiliary-vector entry.
+static inline bool sve_supported(void)
+{
+    return 0 != (HWCAP_SVE & getauxval(AT_HWCAP));
+}
+
+#define STRINGIFY_IMPL(X) #X
+#define STRINGIFY(X) STRINGIFY_IMPL(X)
+/**
+ *  \brief
+ *  Symbols are defined with hidden visibility. Therefore, the implementations here only override routines within the
+ *  TiFlash binary itself. This is because dependencies like `ld.so`, `libgcc_s.so`, etc. need essential routines like
+ *  `memcpy` to finish the early loading procedure, so declaring such symbols as visible indirect functions would
+ *  create a cyclic dependency. It should be good enough to override symbols within TiFlash, as most heavy computation
+ *  happens in the main binary.
+ *  \param NAME: exported symbol name
+ *  \param SVE: preferred implementation when SVE is available
+ *  \param MTE: preferred implementation when MTE is available
+ *  \param ASIMD: preferred implementation for generic aarch64 targets (ASIMD is required by default for Armv8 and above)
+ */
+#define DISPATCH(NAME, SVE, MTE, ASIMD)                                                                                  \
+    extern typeof(ASIMD) __tiflash_##NAME __attribute__((ifunc(STRINGIFY(__tiflash_##NAME##_resolver))));                \
+    extern typeof(ASIMD) NAME __attribute__((visibility("hidden"), alias(STRINGIFY(__tiflash_##NAME))));                 \
+    _Pragma("GCC diagnostic push")                                                                                       \
+        _Pragma("GCC diagnostic ignored \"-Wunused-function\"") static typeof(ASIMD) * __tiflash_##NAME##_resolver(void) \
+    {                                                                                                                    \
+        if (sve_supported())                                                                                             \
+        {                                                                                                                \
+            return SVE;                                                                                                  \
+        }                                                                                                                \
+        if (mte_supported())                                                                                             \
+        {                                                                                                                \
+            return MTE;                                                                                                  \
+        }                                                                                                                \
+        return ASIMD;                                                                                                    \
+    }                                                                                                                    \
+    _Pragma("GCC diagnostic pop")
+#undef memcpy
+#undef memmove
+#undef memset
+#undef memchr
+#undef memrchr
+#undef memcmp
+#undef strcpy
+#undef stpcpy
+#undef strcmp
+#undef strchr
+#undef strrchr
+#undef strchrnul
+#undef strlen
+#undef strnlen
+#undef strncmp
+
+DISPATCH(memcpy, __memcpy_aarch64_sve, __memcpy_aarch64_simd, __memcpy_aarch64_simd)
+DISPATCH(memmove, __memmove_aarch64_sve, __memmove_aarch64_simd, __memmove_aarch64_simd)
+DISPATCH(memset, __memset_aarch64, __memset_aarch64, __memset_aarch64)
+DISPATCH(memchr, __memchr_aarch64_sve, __memchr_aarch64_mte, __memchr_aarch64)
+DISPATCH(memrchr, __memrchr_aarch64, __memrchr_aarch64, __memrchr_aarch64)
+DISPATCH(memcmp, __memcmp_aarch64_sve, __memcmp_aarch64, __memcmp_aarch64)
+DISPATCH(strcpy, __strcpy_aarch64_sve, __strcpy_aarch64, __strcpy_aarch64)
+DISPATCH(stpcpy, __stpcpy_aarch64_sve, __stpcpy_aarch64, __stpcpy_aarch64)
+DISPATCH(strcmp, __strcmp_aarch64_sve, __strcmp_aarch64, __strcmp_aarch64)
+DISPATCH(strchr, __strchr_aarch64_sve, __strchr_aarch64_mte, __strchr_aarch64)
+DISPATCH(strrchr, __strrchr_aarch64_sve, __strrchr_aarch64_mte, __strrchr_aarch64)
+DISPATCH(strchrnul, __strchrnul_aarch64_sve, __strchrnul_aarch64_mte, __strchrnul_aarch64)
+DISPATCH(strlen, __strlen_aarch64_sve, __strlen_aarch64_mte, __strlen_aarch64)
+DISPATCH(strnlen, __strnlen_aarch64_sve, __strnlen_aarch64, __strnlen_aarch64)
+DISPATCH(strncmp, __strncmp_aarch64_sve, __strncmp_aarch64, __strncmp_aarch64)
diff --git a/contrib/client-c b/contrib/client-c
index 36e05cb0f24..034d1e782cb 160000
--- a/contrib/client-c
+++ b/contrib/client-c
@@ -1 +1 @@
-Subproject commit 36e05cb0f24c085785abf367176dac2a45bfd67b
+Subproject commit 034d1e782cb4697f99b09b679c00dade00f19dd5
diff --git a/contrib/prometheus-cpp b/contrib/prometheus-cpp
index ca1f3463e74..76470b3ec02 160000
--- a/contrib/prometheus-cpp
+++ b/contrib/prometheus-cpp
@@ -1 +1 @@
-Subproject commit ca1f3463e74d957d1cccddd4a1a29e3e5d34bd83
+Subproject commit 76470b3ec024c8214e1f4253fb1f4c0b28d3df94
diff --git a/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt b/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt
index daebd1b7c5a..993618e16ac 100644
--- a/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt
+++ b/contrib/prometheus-cpp-cmake/pull/CMakeLists.txt
@@ -12,9 +12,18 @@ if(ENABLE_COMPRESSION)
 endif()
 
 add_library(pull
+  ${PROMETHEUS_SRC_DIR}/pull/src/basic_auth.cc
+  ${PROMETHEUS_SRC_DIR}/pull/src/basic_auth.h
+  ${PROMETHEUS_SRC_DIR}/pull/src/endpoint.cc
+  ${PROMETHEUS_SRC_DIR}/pull/src/endpoint.h
   ${PROMETHEUS_SRC_DIR}/pull/src/exposer.cc
   ${PROMETHEUS_SRC_DIR}/pull/src/handler.cc
   ${PROMETHEUS_SRC_DIR}/pull/src/handler.h
+  ${PROMETHEUS_SRC_DIR}/pull/src/metrics_collector.cc
+  ${PROMETHEUS_SRC_DIR}/pull/src/metrics_collector.h
+
+  ${PROMETHEUS_SRC_DIR}/pull/src/detail/base64.h
+
   $<$<BOOL:${USE_THIRDPARTY_LIBRARIES}>:$<TARGET_OBJECTS:civetweb>>
 )
 
diff --git a/contrib/prometheus-cpp-cmake/push/CMakeLists.txt b/contrib/prometheus-cpp-cmake/push/CMakeLists.txt
index 71dad9fb812..b776d17bdaf 100644
--- a/contrib/prometheus-cpp-cmake/push/CMakeLists.txt
+++ b/contrib/prometheus-cpp-cmake/push/CMakeLists.txt
@@ -3,6 +3,8 @@ if(NOT CURL_FOUND)
 endif()
 
 add_library(push
+  ${PROMETHEUS_SRC_DIR}/push/src/curl_wrapper.cc
+  ${PROMETHEUS_SRC_DIR}/push/src/curl_wrapper.h
   ${PROMETHEUS_SRC_DIR}/push/src/gateway.cc
 )
 
diff --git a/contrib/tiflash-proxy b/contrib/tiflash-proxy
index ca2f51f94e5..6ea4d608b1c 160000
--- a/contrib/tiflash-proxy
+++ b/contrib/tiflash-proxy
@@ -1 +1 @@
-Subproject commit ca2f51f94e55bdd23749dcc02ab4afb94eeb5ae5
+Subproject commit 6ea4d608b1c03fab89d17f54a2e399602231e27c
diff --git a/contrib/tiflash-proxy-cmake/CMakeLists.txt b/contrib/tiflash-proxy-cmake/CMakeLists.txt
index e243ecba37c..e3e2df379a1 100644
--- a/contrib/tiflash-proxy-cmake/CMakeLists.txt
+++ b/contrib/tiflash-proxy-cmake/CMakeLists.txt
@@ -4,7 +4,11 @@ file(GLOB_RECURSE _TIFLASH_PROXY_SRCS "${_TIFLASH_PROXY_SOURCE_DIR}/*.rs")
 list(FILTER _TIFLASH_PROXY_SRCS EXCLUDE REGEX ${_TIFLASH_PROXY_SOURCE_DIR}/target/.*)
 
 # use `CFLAGS=-w CXXFLAGS=-w` to inhibit warning messages.
-set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} CFLAGS=-w CXXFLAGS=-w)
+if (TIFLASH_LLVM_TOOLCHAIN)
+    set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} "CFLAGS=-w -fuse-ld=lld" "CXXFLAGS=-w -fuse-ld=lld -stdlib=libc++")
+else()
+    set(TIFLASH_RUST_ENV CMAKE=${CMAKE_COMMAND} CFLAGS=-w CXXFLAGS=-w)
+endif()
 
 if(TIFLASH_LLVM_TOOLCHAIN AND USE_LIBCXX)
     set(TIFLASH_RUST_LINKER ${CMAKE_CURRENT_BINARY_DIR}/tiflash-linker)
diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp
index 10d0a558a50..ad5010d7826 100644
--- a/dbms/src/Common/FailPoint.cpp
+++ b/dbms/src/Common/FailPoint.cpp
@@ -12,7 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Common/Exception.h>
 #include <Common/FailPoint.h>
+#include <Poco/String.h>
+#include <Poco/StringTokenizer.h>
+#include <Poco/Util/LayeredConfiguration.h>
+#include <common/defines.h>
+#include <common/logger_useful.h>
 
 #include <boost/core/noncopyable.hpp>
 #include <condition_variable>
@@ -21,7 +27,6 @@
 namespace DB
 {
 std::unordered_map<String, std::shared_ptr<FailPointChannel>> FailPointHelper::fail_point_wait_channels;
-
 #define APPLY_FOR_FAILPOINTS_ONCE(M)                              \
     M(exception_between_drop_meta_and_data)                       \
     M(exception_between_alter_data_and_meta)                      \
@@ -109,6 +114,22 @@ std::unordered_map<String, std::shared_ptr<FailPointChannel>> FailPointHelper::f
     M(pause_query_init)
 
 
+#define APPLY_FOR_RANDOM_FAILPOINTS(M)                  \
+    M(random_tunnel_wait_timeout_failpoint)             \
+    M(random_tunnel_init_rpc_failure_failpoint)         \
+    M(random_receiver_sync_msg_push_failure_failpoint)  \
+    M(random_receiver_async_msg_push_failure_failpoint) \
+    M(random_limit_check_failpoint)                     \
+    M(random_join_build_failpoint)                      \
+    M(random_join_prob_failpoint)                       \
+    M(random_aggregate_create_state_failpoint)          \
+    M(random_aggregate_merge_failpoint)                 \
+    M(random_sharedquery_failpoint)                     \
+    M(random_interpreter_failpoint)                     \
+    M(random_task_lifecycle_failpoint)                  \
+    M(random_task_manager_find_task_failure_failpoint)  \
+    M(random_min_tso_scheduler_failpoint)
+
 namespace FailPoints
 {
 #define M(NAME) extern const char(NAME)[] = #NAME "";
@@ -116,6 +137,7 @@ APPLY_FOR_FAILPOINTS_ONCE(M)
 APPLY_FOR_FAILPOINTS(M)
 APPLY_FOR_PAUSEABLE_FAILPOINTS_ONCE(M)
 APPLY_FOR_PAUSEABLE_FAILPOINTS(M)
+APPLY_FOR_RANDOM_FAILPOINTS(M)
 #undef M
 } // namespace FailPoints
 
@@ -179,7 +201,7 @@ void FailPointHelper::enableFailPoint(const String & fail_point_name)
 #undef M
 #undef SUB_M
 
-    throw Exception("Cannot find fail point " + fail_point_name, ErrorCodes::FAIL_POINT_ERROR);
+    throw Exception(fmt::format("Cannot find fail point {}", fail_point_name), ErrorCodes::FAIL_POINT_ERROR);
 }
 
 void FailPointHelper::disableFailPoint(const String & fail_point_name)
@@ -204,6 +226,41 @@ void FailPointHelper::wait(const String & fail_point_name)
         ptr->wait();
     }
 }
+
+void FailPointHelper::initRandomFailPoints(Poco::Util::LayeredConfiguration & config, Poco::Logger * log)
+{
+    String random_fail_point_cfg = config.getString("flash.random_fail_points", ""); // "FailPointA-RatioA,FailPointB-RatioB,..."; empty means nothing to enable
+    if (random_fail_point_cfg.empty())
+        return;
+    Poco::StringTokenizer string_tokens(random_fail_point_cfg, ",", Poco::StringTokenizer::TOK_TRIM); // trim so "A-0.1, B-0.2" is accepted
+    for (const auto & string_token : string_tokens)
+    {
+        Poco::StringTokenizer pair_tokens(string_token, "-", Poco::StringTokenizer::TOK_TRIM);
+        RUNTIME_ASSERT((pair_tokens.count() == 2), log, "RandomFailPoints config should be FailPointA-RatioA,FailPointB-RatioB,... format");
+        char * rate_end = nullptr; // `atof` silently yields 0 for garbage, so parse with `strtod` and require the whole token to be numeric
+        double rate = strtod(pair_tokens[1].c_str(), &rate_end);
+        RUNTIME_ASSERT((rate_end != pair_tokens[1].c_str() && *rate_end == '\0' && 0 <= rate && rate <= 1.0), log, "RandomFailPoint trigger rate should in [0,1], while {}", pair_tokens[1]);
+        enableRandomFailPoint(pair_tokens[0], rate);
+    }
+    LOG_FMT_INFO(log, "Enable RandomFailPoints: {}", random_fail_point_cfg);
+}
+
+void FailPointHelper::enableRandomFailPoint(const String & fail_point_name, double rate)
+{
+    // Expand one name check per entry of APPLY_FOR_RANDOM_FAILPOINTS; on the first match,
+    // hand the fail point and `rate` to fiu_enable_random and return.
+#define M(NAME)                                                   \
+    if (fail_point_name == FailPoints::NAME)                      \
+    {                                                             \
+        fiu_enable_random(FailPoints::NAME, 1, nullptr, 0, rate); \
+        return;                                                   \
+    }
+    APPLY_FOR_RANDOM_FAILPOINTS(M)
+#undef M
+    // No entry matched: the requested name is not a known random fail point.
+    const String message = fmt::format("Cannot find fail point {}", fail_point_name);
+    throw Exception(message, ErrorCodes::FAIL_POINT_ERROR);
+}
 #else
 class FailPointChannel
 {
@@ -214,6 +271,10 @@ void FailPointHelper::enableFailPoint(const String &) {}
 void FailPointHelper::disableFailPoint(const String &) {}
 
 void FailPointHelper::wait(const String &) {}
+
+void FailPointHelper::initRandomFailPoints(Poco::Util::LayeredConfiguration &, Poco::Logger *) {}
+
+void FailPointHelper::enableRandomFailPoint(const String &, double) {}
 #endif
 
 } // namespace DB
diff --git a/dbms/src/Common/FailPoint.h b/dbms/src/Common/FailPoint.h
index 2cf40ad55e4..31df2dbdcd2 100644
--- a/dbms/src/Common/FailPoint.h
+++ b/dbms/src/Common/FailPoint.h
@@ -21,6 +21,15 @@
 
 #include <unordered_map>
 
+namespace Poco
+{
+class Logger;
+namespace Util
+{
+class LayeredConfiguration;
+}
+} // namespace Poco
+
 namespace DB
 {
 namespace ErrorCodes
@@ -35,7 +44,6 @@ extern const int FAIL_POINT_ERROR;
 // When `fail_point` is enabled, wait till it is disabled
 #define FAIL_POINT_PAUSE(fail_point) fiu_do_on(fail_point, FailPointHelper::wait(fail_point);)
 
-
 class FailPointChannel;
 class FailPointHelper
 {
@@ -46,6 +54,16 @@ class FailPointHelper
 
     static void wait(const String & fail_point_name);
 
+    /*
+     * For server-side RandomFailPoint testing. When FIU_ENABLE is defined, this function does the following:
+     * 1. Returns immediately if flash.random_fail_points is empty in the TiFlash config
+     * 2. Parses flash.random_fail_points, which is expected to be in "FailPointA-RatioA,FailPointB-RatioB,..." format
+     * 3. Calls enableRandomFailPoint with each parsed fail point name and rate
+     */
+    static void initRandomFailPoints(Poco::Util::LayeredConfiguration & config, Poco::Logger * log);
+
+    static void enableRandomFailPoint(const String & fail_point_name, double rate);
+
 private:
     static std::unordered_map<String, std::shared_ptr<FailPointChannel>> fail_point_wait_channels;
 };
diff --git a/dbms/src/Common/wrapInvocable.h b/dbms/src/Common/wrapInvocable.h
index d6cee519835..1c93bb3e782 100644
--- a/dbms/src/Common/wrapInvocable.h
+++ b/dbms/src/Common/wrapInvocable.h
@@ -35,7 +35,6 @@ inline auto wrapInvocable(bool propagate_memory_tracker, Func && func, Args &&..
         // run the task with the parameters provided
         return std::apply(std::move(func), std::move(args));
     };
-
     return capture;
 }
 } // namespace DB
diff --git a/dbms/src/Core/Block.cpp b/dbms/src/Core/Block.cpp
index 28db7af82e1..971e8f36e2a 100644
--- a/dbms/src/Core/Block.cpp
+++ b/dbms/src/Core/Block.cpp
@@ -238,10 +238,18 @@ void Block::checkNumberOfRows() const
         if (rows == -1)
             rows = size;
         else if (rows != size)
-            throw Exception("Sizes of columns doesn't match: "
-                                + data.front().name + ": " + toString(rows)
-                                + ", " + elem.name + ": " + toString(size),
+        {
+            auto first_col = data.front();
+            throw Exception(fmt::format(
+                                "Sizes of columns doesn't match: {}(id={}): {}, {}(id={}): {}",
+                                first_col.name,
+                                first_col.column_id,
+                                rows,
+                                elem.name,
+                                elem.column_id,
+                                size),
                             ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
+        }
     }
 }
 
diff --git a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp
index f4f8dfc1338..f983de91b37 100644
--- a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp
+++ b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.cpp
@@ -198,8 +198,8 @@ void ParallelAggregatingBlockInputStream::Handler::onException(std::exception_pt
 
     /// can not cancel parent inputStream or the exception might be lost
     if (!parent.executed)
-        /// kill the processor so ExchangeReceiver will be closed
-        parent.processor.cancel(true);
+        /// use cancel instead of kill to avoid too many useless error message
+        parent.processor.cancel(false);
 }
 
 
diff --git a/dbms/src/DataStreams/SharedQueryBlockInputStream.h b/dbms/src/DataStreams/SharedQueryBlockInputStream.h
index e7cece67f0b..d7c0707b5aa 100644
--- a/dbms/src/DataStreams/SharedQueryBlockInputStream.h
+++ b/dbms/src/DataStreams/SharedQueryBlockInputStream.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <Common/FailPoint.h>
 #include <Common/MPMCQueue.h>
 #include <Common/ThreadFactory.h>
 #include <Common/ThreadManager.h>
@@ -24,6 +25,11 @@
 
 namespace DB
 {
+namespace FailPoints
+{
+extern const char random_sharedquery_failpoint[];
+} // namespace FailPoints
+
 /** This block input stream is used by SharedQuery.
   * It enable multiple threads read from one stream.
  */
@@ -136,6 +142,7 @@ class SharedQueryBlockInputStream : public IProfilingBlockInputStream
             in->readPrefix();
             while (true)
             {
+                FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_sharedquery_failpoint);
                 Block block = in->read();
                 // in is finished or queue is canceled
                 if (!block || !queue.push(block))
diff --git a/dbms/src/DataStreams/SizeLimits.cpp b/dbms/src/DataStreams/SizeLimits.cpp
index 7dd5e1524ba..4d1bfaae997 100644
--- a/dbms/src/DataStreams/SizeLimits.cpp
+++ b/dbms/src/DataStreams/SizeLimits.cpp
@@ -12,22 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <DataStreams/SizeLimits.h>
-#include <Common/formatReadable.h>
 #include <Common/Exception.h>
-#include <string>
+#include <Common/FailPoint.h>
+#include <Common/formatReadable.h>
+#include <DataStreams/SizeLimits.h>
 
+#include <string>
 
 namespace DB
 {
+namespace FailPoints
+{
+extern const char random_limit_check_failpoint[];
+} // namespace FailPoints
 
 bool SizeLimits::check(UInt64 rows, UInt64 bytes, const char * what, int exception_code) const
 {
-    if (max_rows && rows > max_rows)
+    bool rows_exceed_limit = max_rows && rows > max_rows;
+    fiu_do_on(FailPoints::random_limit_check_failpoint, rows_exceed_limit = true;);
+    if (rows_exceed_limit)
     {
         if (overflow_mode == OverflowMode::THROW)
             throw Exception("Limit for " + std::string(what) + " exceeded, max rows: " + formatReadableQuantity(max_rows)
-                + ", current rows: " + formatReadableQuantity(rows), exception_code);
+                                + ", current rows: " + formatReadableQuantity(rows),
+                            exception_code);
         else
             return false;
     }
@@ -36,7 +44,8 @@ bool SizeLimits::check(UInt64 rows, UInt64 bytes, const char * what, int excepti
     {
         if (overflow_mode == OverflowMode::THROW)
             throw Exception("Limit for " + std::string(what) + " exceeded, max bytes: " + formatReadableSizeWithBinarySuffix(max_bytes)
-                + ", current bytes: " + formatReadableSizeWithBinarySuffix(bytes), exception_code);
+                                + ", current bytes: " + formatReadableSizeWithBinarySuffix(bytes),
+                            exception_code);
         else
             return false;
     }
@@ -44,4 +53,4 @@ bool SizeLimits::check(UInt64 rows, UInt64 bytes, const char * what, int excepti
     return true;
 }
 
-}
+} // namespace DB
diff --git a/dbms/src/DataStreams/UnionBlockInputStream.h b/dbms/src/DataStreams/UnionBlockInputStream.h
index 251d0663e14..a782c3dd087 100644
--- a/dbms/src/DataStreams/UnionBlockInputStream.h
+++ b/dbms/src/DataStreams/UnionBlockInputStream.h
@@ -293,8 +293,8 @@ class UnionBlockInputStream final : public IProfilingBlockInputStream
         /// and the exception is lost.
         output_queue.emplace(exception);
         /// can not cancel itself or the exception might be lost
-        /// kill the processor so ExchangeReceiver will be closed
-        processor.cancel(true);
+        /// use cancel instead of kill to avoid too many useless error message
+        processor.cancel(false);
     }
 
     struct Handler
diff --git a/dbms/src/Debug/MockSchemaGetter.h b/dbms/src/Debug/MockSchemaGetter.h
index f02699866ce..11c5d97f036 100644
--- a/dbms/src/Debug/MockSchemaGetter.h
+++ b/dbms/src/Debug/MockSchemaGetter.h
@@ -17,16 +17,25 @@
 #include <Debug/MockTiDB.h>
 #include <TiDB/Schema/SchemaGetter.h>
 
+#include <optional>
+
 namespace DB
 {
-
 struct MockSchemaGetter
 {
     TiDB::DBInfoPtr getDatabase(DatabaseID db_id) { return MockTiDB::instance().getDBInfoByID(db_id); }
 
     Int64 getVersion() { return MockTiDB::instance().getVersion(); }
 
-    SchemaDiff getSchemaDiff(Int64 version) { return MockTiDB::instance().getSchemaDiff(version); }
+    std::optional<SchemaDiff> getSchemaDiff(Int64 version)
+    {
+        return MockTiDB::instance().getSchemaDiff(version);
+    }
+
+    bool checkSchemaDiffExists(Int64 version)
+    {
+        return MockTiDB::instance().checkSchemaDiffExists(version);
+    }
 
     TiDB::TableInfoPtr getTableInfo(DatabaseID, TableID table_id) { return MockTiDB::instance().getTableInfoByID(table_id); }
 
diff --git a/dbms/src/Debug/MockTiDB.cpp b/dbms/src/Debug/MockTiDB.cpp
index 42ab56a97c1..99d9625461b 100644
--- a/dbms/src/Debug/MockTiDB.cpp
+++ b/dbms/src/Debug/MockTiDB.cpp
@@ -221,7 +221,6 @@ TiDB::TableInfoPtr MockTiDB::parseColumns(
             {
                 String & name = string_tokens[index];
                 index_info.idx_cols[index].name = name;
-                index_info.idx_cols[index].offset = pk_column_pos_map[name];
                 index_info.idx_cols[index].length = -1;
             }
         }
@@ -302,7 +301,7 @@ int MockTiDB::newTables(
         tables_by_id.emplace(table->table_info.id, table);
         tables_by_name.emplace(qualified_name, table);
 
-        AffectedOption opt;
+        AffectedOption opt{};
         opt.schema_id = table->database_id;
         opt.table_id = table->id();
         opt.old_schema_id = table->database_id;
@@ -571,7 +570,7 @@ void MockTiDB::renameTables(const std::vector<std::tuple<std::string, std::strin
         tables_by_name.erase(qualified_name);
         tables_by_name.emplace(new_qualified_name, new_table);
 
-        AffectedOption opt;
+        AffectedOption opt{};
         opt.schema_id = table->database_id;
         opt.table_id = new_table->id();
         opt.old_schema_id = table->database_id;
@@ -669,9 +668,14 @@ std::pair<bool, DatabaseID> MockTiDB::getDBIDByName(const String & database_name
     return std::make_pair(false, -1);
 }
 
-SchemaDiff MockTiDB::getSchemaDiff(Int64 version_)
+std::optional<SchemaDiff> MockTiDB::getSchemaDiff(Int64 version_)
 {
     return version_diff[version_];
 }
 
+bool MockTiDB::checkSchemaDiffExists(Int64 version)
+{
+    return version_diff.count(version) != 0; // true iff `version_diff` holds an entry keyed by `version`
+}
+
 } // namespace DB
diff --git a/dbms/src/Debug/MockTiDB.h b/dbms/src/Debug/MockTiDB.h
index 36d2af90859..261e547b13a 100644
--- a/dbms/src/Debug/MockTiDB.h
+++ b/dbms/src/Debug/MockTiDB.h
@@ -127,7 +127,9 @@ class MockTiDB : public ext::Singleton<MockTiDB>
 
     std::pair<bool, DatabaseID> getDBIDByName(const String & database_name);
 
-    SchemaDiff getSchemaDiff(Int64 version);
+    bool checkSchemaDiffExists(Int64 version);
+
+    std::optional<SchemaDiff> getSchemaDiff(Int64 version);
 
     std::unordered_map<String, DatabaseID> getDatabases() { return databases; }
 
diff --git a/dbms/src/Debug/astToExecutor.cpp b/dbms/src/Debug/astToExecutor.cpp
index fec76d7a085..7d1f3bc7209 100644
--- a/dbms/src/Debug/astToExecutor.cpp
+++ b/dbms/src/Debug/astToExecutor.cpp
@@ -1629,7 +1629,6 @@ ExecutorPtr compileProject(ExecutorPtr input, size_t & executor_index, ASTPtr se
             }
         }
     }
-
     auto project = std::make_shared<mock::Project>(executor_index, output_schema, std::move(exprs));
     project->children.push_back(input);
     return project;
diff --git a/dbms/src/Debug/dbgFuncMockRaftCommand.cpp b/dbms/src/Debug/dbgFuncMockRaftCommand.cpp
index df93ee1c78d..3626041f428 100644
--- a/dbms/src/Debug/dbgFuncMockRaftCommand.cpp
+++ b/dbms/src/Debug/dbgFuncMockRaftCommand.cpp
@@ -40,7 +40,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar
     auto & tmt = context.getTMTContext();
     auto & kvstore = tmt.getKVStore();
 
-    RegionID region_id = (RegionID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[0]).value);
+    auto region_id = static_cast<RegionID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[0]).value));
     const String & database_name = typeid_cast<const ASTIdentifier &>(*args[1]).name;
     const String & table_name = typeid_cast<const ASTIdentifier &>(*args[2]).name;
     auto table = MockTiDB::instance().getTableByName(database_name, table_name);
@@ -49,7 +49,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar
     if (4 + handle_column_size * 4 != args.size())
         throw Exception("Args not matched, should be: region-id1, database-name, table-name, start1, end1, start2, end2, region-id2",
                         ErrorCodes::BAD_ARGUMENTS);
-    RegionID region_id2 = (RegionID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[args.size() - 1]).value);
+    auto region_id2 = static_cast<RegionID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[args.size() - 1]).value));
 
     auto table_id = table->id();
     TiKVKey start_key1, start_key2, end_key1, end_key2;
@@ -59,9 +59,17 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar
         std::vector<Field> start_keys2;
         std::vector<Field> end_keys1;
         std::vector<Field> end_keys2;
+
+        std::unordered_map<String, size_t> column_name_columns_index_map;
+        for (size_t i = 0; i < table_info.columns.size(); i++)
+        {
+            column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+        }
+
         for (size_t i = 0; i < handle_column_size; i++)
         {
-            auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset];
+            auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+            auto & column_info = table_info.columns[idx];
 
             auto start_field1 = RegionBench::convertField(column_info, typeid_cast<const ASTLiteral &>(*args[3 + i]).value);
             TiDB::DatumBumpy start_datum1 = TiDB::DatumBumpy(start_field1, column_info.tp);
@@ -88,10 +96,10 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar
     }
     else
     {
-        HandleID start1 = (HandleID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[3]).value);
-        HandleID end1 = (HandleID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[4]).value);
-        HandleID start2 = (HandleID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[5]).value);
-        HandleID end2 = (HandleID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[6]).value);
+        auto start1 = static_cast<HandleID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[3]).value));
+        auto end1 = static_cast<HandleID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[4]).value));
+        auto start2 = static_cast<HandleID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[5]).value));
+        auto end2 = static_cast<HandleID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[6]).value));
         start_key1 = RecordKVFormat::genKey(table_id, start1);
         start_key2 = RecordKVFormat::genKey(table_id, start2);
         end_key1 = RecordKVFormat::genKey(table_id, end1);
@@ -110,7 +118,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar
         request.set_cmd_type(raft_cmdpb::AdminCmdType::BatchSplit);
         raft_cmdpb::BatchSplitResponse * splits = response.mutable_splits();
         {
-            auto region = splits->add_regions();
+            auto * region = splits->add_regions();
             region->set_id(region_id);
             region->set_start_key(start_key1);
             region->set_end_key(end_key1);
@@ -118,7 +126,7 @@ void MockRaftCommand::dbgFuncRegionBatchSplit(Context & context, const ASTs & ar
             *region->mutable_region_epoch() = new_epoch;
         }
         {
-            auto region = splits->add_regions();
+            auto * region = splits->add_regions();
             region->set_id(region_id2);
             region->set_start_key(start_key2);
             region->set_end_key(end_key2);
@@ -144,8 +152,8 @@ void MockRaftCommand::dbgFuncPrepareMerge(Context & context, const ASTs & args,
         throw Exception("Args not matched, should be: source-id1, target-id2", ErrorCodes::BAD_ARGUMENTS);
     }
 
-    RegionID region_id = (RegionID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[0]).value);
-    RegionID target_id = (RegionID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[1]).value);
+    auto region_id = static_cast<RegionID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[0]).value));
+    auto target_id = static_cast<RegionID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[1]).value));
 
     auto & tmt = context.getTMTContext();
     auto & kvstore = tmt.getKVStore();
@@ -157,7 +165,7 @@ void MockRaftCommand::dbgFuncPrepareMerge(Context & context, const ASTs & args,
     {
         request.set_cmd_type(raft_cmdpb::AdminCmdType::PrepareMerge);
 
-        auto prepare_merge = request.mutable_prepare_merge();
+        auto * prepare_merge = request.mutable_prepare_merge();
         {
             auto min_index = region->appliedIndex();
             prepare_merge->set_min_index(min_index);
@@ -184,8 +192,8 @@ void MockRaftCommand::dbgFuncCommitMerge(Context & context, const ASTs & args, D
         throw Exception("Args not matched, should be: source-id1, current-id2", ErrorCodes::BAD_ARGUMENTS);
     }
 
-    RegionID source_id = (RegionID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[0]).value);
-    RegionID current_id = (RegionID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[1]).value);
+    auto source_id = static_cast<RegionID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[0]).value));
+    auto current_id = static_cast<RegionID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[1]).value));
 
     auto & tmt = context.getTMTContext();
     auto & kvstore = tmt.getKVStore();
@@ -196,7 +204,7 @@ void MockRaftCommand::dbgFuncCommitMerge(Context & context, const ASTs & args, D
 
     {
         request.set_cmd_type(raft_cmdpb::AdminCmdType::CommitMerge);
-        auto commit_merge = request.mutable_commit_merge();
+        auto * commit_merge = request.mutable_commit_merge();
         {
             commit_merge->set_commit(source_region->appliedIndex());
             *commit_merge->mutable_source() = source_region->getMetaRegion();
@@ -220,7 +228,7 @@ void MockRaftCommand::dbgFuncRollbackMerge(Context & context, const ASTs & args,
         throw Exception("Args not matched, should be: region-id", ErrorCodes::BAD_ARGUMENTS);
     }
 
-    RegionID region_id = (RegionID)safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[0]).value);
+    auto region_id = static_cast<RegionID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[0]).value));
 
     auto & tmt = context.getTMTContext();
     auto & kvstore = tmt.getKVStore();
@@ -231,7 +239,7 @@ void MockRaftCommand::dbgFuncRollbackMerge(Context & context, const ASTs & args,
     {
         request.set_cmd_type(raft_cmdpb::AdminCmdType::RollbackMerge);
 
-        auto rollback_merge = request.mutable_rollback_merge();
+        auto * rollback_merge = request.mutable_rollback_merge();
         {
             auto merge_state = region->getMergeState();
             rollback_merge->set_commit(merge_state.commit());
diff --git a/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp b/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp
index 9d5b848ddea..b5d3f252d0a 100644
--- a/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp
+++ b/dbms/src/Debug/dbgFuncMockRaftSnapshot.cpp
@@ -68,6 +68,12 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args)
     size_t handle_column_size = is_common_handle ? table_info.getPrimaryIndexInfo().idx_cols.size() : 1;
     RegionPtr region;
 
+    std::unordered_map<String, size_t> column_name_columns_index_map;
+    for (size_t i = 0; i < table_info.columns.size(); i++)
+    {
+        column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+    }
+
     if (!is_common_handle)
     {
         auto start = static_cast<HandleID>(safeGet<UInt64>(typeid_cast<const ASTLiteral &>(*args[3]).value));
@@ -81,7 +87,8 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args)
         std::vector<Field> end_keys;
         for (size_t i = 0; i < handle_column_size; i++)
         {
-            auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset];
+            auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+            auto & column_info = table_info.columns[idx];
             auto start_field = RegionBench::convertField(column_info, typeid_cast<const ASTLiteral &>(*args[3 + i]).value);
             TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp);
             start_keys.emplace_back(start_datum.field());
@@ -122,9 +129,9 @@ RegionPtr GenDbgRegionSnapshotWithData(Context & context, const ASTs & args)
                 std::vector<Field> keys; // handle key
                 for (size_t i = 0; i < table_info.getPrimaryIndexInfo().idx_cols.size(); i++)
                 {
-                    auto & idx_col = table_info.getPrimaryIndexInfo().idx_cols[i];
-                    auto & column_info = table_info.columns[idx_col.offset];
-                    auto start_field = RegionBench::convertField(column_info, fields[idx_col.offset]);
+                    auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+                    auto & column_info = table_info.columns[idx];
+                    auto start_field = RegionBench::convertField(column_info, fields[idx]);
                     TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp);
                     keys.emplace_back(start_datum.field());
                 }
@@ -198,9 +205,16 @@ void MockRaftCommand::dbgFuncRegionSnapshot(Context & context, const ASTs & args
         // Get start key and end key form multiple column if it is clustered_index.
         std::vector<Field> start_keys;
         std::vector<Field> end_keys;
+
+        std::unordered_map<String, size_t> column_name_columns_index_map;
+        for (size_t i = 0; i < table_info.columns.size(); i++)
+        {
+            column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+        }
         for (size_t i = 0; i < handle_column_size; i++)
         {
-            const auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset];
+            auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+            const auto & column_info = table_info.columns[idx];
             auto start_field = RegionBench::convertField(column_info, typeid_cast<const ASTLiteral &>(*args[1 + i]).value);
             TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp);
             start_keys.emplace_back(start_datum.field());
diff --git a/dbms/src/Debug/dbgFuncRegion.cpp b/dbms/src/Debug/dbgFuncRegion.cpp
index b2024eac1d8..f65a18b8fd0 100644
--- a/dbms/src/Debug/dbgFuncRegion.cpp
+++ b/dbms/src/Debug/dbgFuncRegion.cpp
@@ -61,9 +61,15 @@ void dbgFuncPutRegion(Context & context, const ASTs & args, DBGInvoker::Printer
     {
         std::vector<Field> start_keys;
         std::vector<Field> end_keys;
+        std::unordered_map<String, size_t> column_name_columns_index_map;
+        for (size_t i = 0; i < table_info.columns.size(); i++)
+        {
+            column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+        }
         for (size_t i = 0; i < handle_column_size; i++)
         {
-            const auto & column_info = table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset];
+            auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+            const auto & column_info = table_info.columns[idx];
             auto start_field = RegionBench::convertField(column_info, typeid_cast<const ASTLiteral &>(*args[1 + i]).value);
             TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp);
             start_keys.emplace_back(start_datum.field());
diff --git a/dbms/src/Debug/dbgTools.cpp b/dbms/src/Debug/dbgTools.cpp
index 685b2563a3b..854d8a18bd5 100644
--- a/dbms/src/Debug/dbgTools.cpp
+++ b/dbms/src/Debug/dbgTools.cpp
@@ -310,7 +310,7 @@ void insert( //
     // Parse the fields in the inserted row
     std::vector<Field> fields;
     {
-        for (ASTs::const_iterator it = values_begin; it != values_end; ++it)
+        for (auto it = values_begin; it != values_end; ++it)
         {
             auto field = typeid_cast<const ASTLiteral *>((*it).get())->value;
             fields.emplace_back(field);
@@ -330,11 +330,18 @@ void insert( //
     if (table_info.is_common_handle)
     {
         std::vector<Field> keys;
+
+        std::unordered_map<String, size_t> column_name_columns_index_map;
+        for (size_t i = 0; i < table_info.columns.size(); i++)
+        {
+            column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+        }
+
         for (size_t i = 0; i < table_info.getPrimaryIndexInfo().idx_cols.size(); i++)
         {
-            const auto & idx_col = table_info.getPrimaryIndexInfo().idx_cols[i];
-            const auto & column_info = table_info.columns[idx_col.offset];
-            auto start_field = RegionBench::convertField(column_info, fields[idx_col.offset]);
+            const auto & col_idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+            const auto & column_info = table_info.columns[col_idx];
+            auto start_field = RegionBench::convertField(column_info, fields[col_idx]);
             TiDB::DatumBumpy start_datum = TiDB::DatumBumpy(start_field, column_info.tp);
             keys.emplace_back(start_datum.field());
         }
diff --git a/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp b/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp
index a1c6061948a..1609c83b029 100644
--- a/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp
+++ b/dbms/src/Flash/Coprocessor/ArrowColCodec.cpp
@@ -20,7 +20,6 @@
 #include <DataTypes/DataTypeDecimal.h>
 #include <DataTypes/DataTypeEnum.h>
 #include <DataTypes/DataTypeMyDate.h>
-#include <DataTypes/DataTypeMyDateTime.h>
 #include <DataTypes/DataTypeNullable.h>
 #include <DataTypes/DataTypeString.h>
 #include <DataTypes/DataTypesNumber.h>
@@ -41,7 +40,7 @@ extern const int NOT_IMPLEMENTED;
 const IColumn * getNestedCol(const IColumn * flash_col)
 {
     if (flash_col->isColumnNullable())
-        return dynamic_cast<const ColumnNullable *>(flash_col)->getNestedColumnPtr().get();
+        return static_cast<const ColumnNullable *>(flash_col)->getNestedColumnPtr().get();
     else
         return flash_col;
 }
@@ -75,8 +74,8 @@ bool flashDecimalColToArrowColInternal(
     const IColumn * nested_col = getNestedCol(flash_col_untyped);
     if (checkColumn<ColumnDecimal<T>>(nested_col) && checkDataType<DataTypeDecimal<T>>(data_type))
     {
-        const ColumnDecimal<T> * flash_col = checkAndGetColumn<ColumnDecimal<T>>(nested_col);
-        const DataTypeDecimal<T> * type = checkAndGetDataType<DataTypeDecimal<T>>(data_type);
+        const auto * flash_col = checkAndGetColumn<ColumnDecimal<T>>(nested_col);
+        const auto * type = checkAndGetDataType<DataTypeDecimal<T>>(data_type);
         UInt32 scale = type->getScale();
         for (size_t i = start_index; i < end_index; i++)
         {
@@ -92,8 +91,8 @@ bool flashDecimalColToArrowColInternal(
             std::vector<Int32> digits;
             digits.reserve(type->getPrec());
             decimalToVector<typename T::NativeType>(dec.value, digits, scale);
-            TiDBDecimal tiDecimal(scale, digits, dec.value < 0);
-            dag_column.append(tiDecimal);
+            TiDBDecimal ti_decimal(scale, digits, dec.value < 0);
+            dag_column.append(ti_decimal);
         }
         return true;
     }
@@ -121,7 +120,7 @@ template <typename T, bool is_nullable>
 bool flashIntegerColToArrowColInternal(TiDBColumn & dag_column, const IColumn * flash_col_untyped, size_t start_index, size_t end_index)
 {
     const IColumn * nested_col = getNestedCol(flash_col_untyped);
-    if (const ColumnVector<T> * flash_col = checkAndGetColumn<ColumnVector<T>>(nested_col))
+    if (const auto * flash_col = checkAndGetColumn<ColumnVector<T>>(nested_col))
     {
         constexpr bool is_unsigned = std::is_unsigned_v<T>;
         for (size_t i = start_index; i < end_index; i++)
@@ -135,9 +134,9 @@ bool flashIntegerColToArrowColInternal(TiDBColumn & dag_column, const IColumn *
                 }
             }
             if constexpr (is_unsigned)
-                dag_column.append((UInt64)flash_col->getElement(i));
+                dag_column.append(static_cast<UInt64>(flash_col->getElement(i)));
             else
-                dag_column.append((Int64)flash_col->getElement(i));
+                dag_column.append(static_cast<Int64>(flash_col->getElement(i)));
         }
         return true;
     }
@@ -148,7 +147,7 @@ template <typename T, bool is_nullable>
 void flashDoubleColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col_untyped, size_t start_index, size_t end_index)
 {
     const IColumn * nested_col = getNestedCol(flash_col_untyped);
-    if (const ColumnVector<T> * flash_col = checkAndGetColumn<ColumnVector<T>>(nested_col))
+    if (const auto * flash_col = checkAndGetColumn<ColumnVector<T>>(nested_col))
     {
         for (size_t i = start_index; i < end_index; i++)
         {
@@ -160,7 +159,7 @@ void flashDoubleColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col
                     continue;
                 }
             }
-            dag_column.append((T)flash_col->getElement(i));
+            dag_column.append(static_cast<T>(flash_col->getElement(i)));
         }
         return;
     }
@@ -196,7 +195,7 @@ void flashDateOrDateTimeColToArrowCol(
 {
     const IColumn * nested_col = getNestedCol(flash_col_untyped);
     using DateFieldType = DataTypeMyTimeBase::FieldType;
-    auto * flash_col = checkAndGetColumn<ColumnVector<DateFieldType>>(nested_col);
+    const auto * flash_col = checkAndGetColumn<ColumnVector<DateFieldType>>(nested_col);
     for (size_t i = start_index; i < end_index; i++)
     {
         if constexpr (is_nullable)
@@ -217,7 +216,7 @@ void flashStringColToArrowCol(TiDBColumn & dag_column, const IColumn * flash_col
 {
     const IColumn * nested_col = getNestedCol(flash_col_untyped);
     // columnFixedString is not used so do not check it
-    auto * flash_col = checkAndGetColumn<ColumnString>(nested_col);
+    const auto * flash_col = checkAndGetColumn<ColumnString>(nested_col);
     for (size_t i = start_index; i < end_index; i++)
     {
         // todo check if we can convert flash_col to DAG col directly since the internal representation is almost the same
@@ -242,7 +241,7 @@ void flashBitColToArrowCol(
     const tipb::FieldType & field_type)
 {
     const IColumn * nested_col = getNestedCol(flash_col_untyped);
-    auto * flash_col = checkAndGetColumn<ColumnVector<UInt64>>(nested_col);
+    const auto * flash_col = checkAndGetColumn<ColumnVector<UInt64>>(nested_col);
     for (size_t i = start_index; i < end_index; i++)
     {
         if constexpr (is_nullable)
@@ -267,7 +266,7 @@ void flashEnumColToArrowCol(
     const IDataType * data_type)
 {
     const IColumn * nested_col = getNestedCol(flash_col_untyped);
-    auto * flash_col = checkAndGetColumn<ColumnVector<DataTypeEnum16::FieldType>>(nested_col);
+    const auto * flash_col = checkAndGetColumn<ColumnVector<DataTypeEnum16::FieldType>>(nested_col);
     const auto * enum_type = checkAndGetDataType<DataTypeEnum16>(data_type);
     size_t enum_value_size = enum_type->getValues().size();
     for (size_t i = start_index; i < end_index; i++)
@@ -280,10 +279,10 @@ void flashEnumColToArrowCol(
                 continue;
             }
         }
-        auto enum_value = (UInt64)flash_col->getElement(i);
+        auto enum_value = static_cast<UInt64>(flash_col->getElement(i));
         if (enum_value == 0 || enum_value > enum_value_size)
             throw TiFlashException("number of enum overflow enum boundary", Errors::Coprocessor::Internal);
-        TiDBEnum ti_enum(enum_value, enum_type->getNameForValue((const DataTypeEnum16::FieldType)enum_value));
+        TiDBEnum ti_enum(enum_value, enum_type->getNameForValue(static_cast<const DataTypeEnum16::FieldType>(enum_value)));
         dag_column.append(ti_enum);
     }
 }
@@ -300,7 +299,7 @@ void flashColToArrowCol(TiDBColumn & dag_column, const ColumnWithTypeAndName & f
         throw TiFlashException("Flash column and TiDB column has different not null flag", Errors::Coprocessor::Internal);
     }
     if (type->isNullable())
-        type = dynamic_cast<const DataTypeNullable *>(type)->getNestedType().get();
+        type = static_cast<const DataTypeNullable *>(type)->getNestedType().get();
 
     switch (tidb_column_info.tp)
     {
@@ -457,7 +456,7 @@ const char * arrowEnumColToFlashCol(
     {
         if (checkNull(i, null_count, null_bitmap, col))
             continue;
-        const auto enum_value = (Int64)toLittleEndian(*(reinterpret_cast<const UInt32 *>(pos + offsets[i])));
+        const auto enum_value = static_cast<Int64>(toLittleEndian(*(reinterpret_cast<const UInt32 *>(pos + offsets[i]))));
         col.column->assumeMutable()->insert(Field(enum_value));
     }
     return pos + offsets[length];
@@ -479,11 +478,11 @@ const char * arrowBitColToFlashCol(
             continue;
         const String value = String(pos + offsets[i], pos + offsets[i + 1]);
         if (value.length() == 0)
-            col.column->assumeMutable()->insert(Field(UInt64(0)));
+            col.column->assumeMutable()->insert(Field(static_cast<UInt64>(0)));
         UInt64 result = 0;
-        for (auto & c : value)
+        for (const auto & c : value)
         {
-            result = (result << 8u) | (UInt8)c;
+            result = (result << 8u) | static_cast<UInt8>(c);
         }
         col.column->assumeMutable()->insert(Field(result));
     }
@@ -500,7 +499,7 @@ T toCHDecimal(UInt8 digits_int, UInt8 digits_frac, bool negative, const Int32 *
     UInt8 tailing_digit = digits_frac % DIGITS_PER_WORD;
 
     typename T::NativeType value = 0;
-    const int word_max = int(1e9);
+    const int word_max = static_cast<int>(1e9);
     for (int i = 0; i < word_int; i++)
     {
         value = value * word_max + word_buf[i];
@@ -552,28 +551,28 @@ const char * arrowDecimalColToFlashCol(
         pos += 1;
         Int32 word_buf[MAX_WORD_BUF_LEN];
         const DataTypePtr decimal_type
-            = col.type->isNullable() ? dynamic_cast<const DataTypeNullable *>(col.type.get())->getNestedType() : col.type;
-        for (int j = 0; j < MAX_WORD_BUF_LEN; j++)
+            = col.type->isNullable() ? static_cast<const DataTypeNullable *>(col.type.get())->getNestedType() : col.type;
+        for (int & j : word_buf)
         {
-            word_buf[j] = toLittleEndian(*(reinterpret_cast<const Int32 *>(pos)));
+            j = toLittleEndian(*(reinterpret_cast<const Int32 *>(pos)));
             pos += 4;
         }
-        if (auto * type32 = checkDecimal<Decimal32>(*decimal_type))
+        if (const auto * type32 = checkDecimal<Decimal32>(*decimal_type))
         {
             auto res = toCHDecimal<Decimal32>(digits_int, digits_frac, negative, word_buf);
             col.column->assumeMutable()->insert(DecimalField<Decimal32>(res, type32->getScale()));
         }
-        else if (auto * type64 = checkDecimal<Decimal64>(*decimal_type))
+        else if (const auto * type64 = checkDecimal<Decimal64>(*decimal_type))
         {
             auto res = toCHDecimal<Decimal64>(digits_int, digits_frac, negative, word_buf);
             col.column->assumeMutable()->insert(DecimalField<Decimal64>(res, type64->getScale()));
         }
-        else if (auto * type128 = checkDecimal<Decimal128>(*decimal_type))
+        else if (const auto * type128 = checkDecimal<Decimal128>(*decimal_type))
         {
             auto res = toCHDecimal<Decimal128>(digits_int, digits_frac, negative, word_buf);
             col.column->assumeMutable()->insert(DecimalField<Decimal128>(res, type128->getScale()));
         }
-        else if (auto * type256 = checkDecimal<Decimal256>(*decimal_type))
+        else if (const auto * type256 = checkDecimal<Decimal256>(*decimal_type))
         {
             auto res = toCHDecimal<Decimal256>(digits_int, digits_frac, negative, word_buf);
             col.column->assumeMutable()->insert(DecimalField<Decimal256>(res, type256->getScale()));
@@ -600,13 +599,13 @@ const char * arrowDateColToFlashCol(
             continue;
         }
         UInt64 chunk_time = toLittleEndian(*(reinterpret_cast<const UInt64 *>(pos)));
-        UInt16 year = (UInt16)((chunk_time & MyTimeBase::YEAR_BIT_FIELD_MASK) >> MyTimeBase::YEAR_BIT_FIELD_OFFSET);
-        UInt8 month = (UInt8)((chunk_time & MyTimeBase::MONTH_BIT_FIELD_MASK) >> MyTimeBase::MONTH_BIT_FIELD_OFFSET);
-        UInt8 day = (UInt8)((chunk_time & MyTimeBase::DAY_BIT_FIELD_MASK) >> MyTimeBase::DAY_BIT_FIELD_OFFSET);
-        UInt16 hour = (UInt16)((chunk_time & MyTimeBase::HOUR_BIT_FIELD_MASK) >> MyTimeBase::HOUR_BIT_FIELD_OFFSET);
-        UInt8 minute = (UInt8)((chunk_time & MyTimeBase::MINUTE_BIT_FIELD_MASK) >> MyTimeBase::MINUTE_BIT_FIELD_OFFSET);
-        UInt8 second = (UInt8)((chunk_time & MyTimeBase::SECOND_BIT_FIELD_MASK) >> MyTimeBase::SECOND_BIT_FIELD_OFFSET);
-        UInt32 micro_second = (UInt32)((chunk_time & MyTimeBase::MICROSECOND_BIT_FIELD_MASK) >> MyTimeBase::MICROSECOND_BIT_FIELD_OFFSET);
+        auto year = static_cast<UInt16>((chunk_time & MyTimeBase::YEAR_BIT_FIELD_MASK) >> MyTimeBase::YEAR_BIT_FIELD_OFFSET);
+        auto month = static_cast<UInt8>((chunk_time & MyTimeBase::MONTH_BIT_FIELD_MASK) >> MyTimeBase::MONTH_BIT_FIELD_OFFSET);
+        auto day = static_cast<UInt8>((chunk_time & MyTimeBase::DAY_BIT_FIELD_MASK) >> MyTimeBase::DAY_BIT_FIELD_OFFSET);
+        auto hour = static_cast<UInt16>((chunk_time & MyTimeBase::HOUR_BIT_FIELD_MASK) >> MyTimeBase::HOUR_BIT_FIELD_OFFSET);
+        auto minute = static_cast<UInt8>((chunk_time & MyTimeBase::MINUTE_BIT_FIELD_MASK) >> MyTimeBase::MINUTE_BIT_FIELD_OFFSET);
+        auto second = static_cast<UInt8>((chunk_time & MyTimeBase::SECOND_BIT_FIELD_MASK) >> MyTimeBase::SECOND_BIT_FIELD_OFFSET);
+        auto micro_second = static_cast<UInt32>((chunk_time & MyTimeBase::MICROSECOND_BIT_FIELD_MASK) >> MyTimeBase::MICROSECOND_BIT_FIELD_OFFSET);
         MyDateTime mt(year, month, day, hour, minute, second, micro_second);
         pos += field_length;
         col.column->assumeMutable()->insert(Field(mt.toPackedUInt()));
@@ -659,7 +658,7 @@ const char * arrowNumColToFlashCol(
         case TiDB::TypeFloat:
             u32 = toLittleEndian(*(reinterpret_cast<const UInt32 *>(pos)));
             std::memcpy(&f32, &u32, sizeof(Float32));
-            col.column->assumeMutable()->insert(Field((Float64)f32));
+            col.column->assumeMutable()->insert(Field(static_cast<Float64>(f32)));
             break;
         case TiDB::TypeDouble:
             u64 = toLittleEndian(*(reinterpret_cast<const UInt64 *>(pos)));
diff --git a/dbms/src/Flash/Coprocessor/DAGContext.cpp b/dbms/src/Flash/Coprocessor/DAGContext.cpp
index 1ef7338a589..ec0544c6ee4 100644
--- a/dbms/src/Flash/Coprocessor/DAGContext.cpp
+++ b/dbms/src/Flash/Coprocessor/DAGContext.cpp
@@ -206,12 +206,20 @@ void DAGContext::attachBlockIO(const BlockIO & io_)
     io = io_;
 }
 
-const std::unordered_map<String, std::shared_ptr<ExchangeReceiver>> & DAGContext::getMPPExchangeReceiverMap() const
+ExchangeReceiverPtr DAGContext::getMPPExchangeReceiver(const String & executor_id) const
 {
     if (!isMPPTask())
         throw TiFlashException("mpp_exchange_receiver_map is used in mpp only", Errors::Coprocessor::Internal);
-    RUNTIME_ASSERT(mpp_exchange_receiver_map != nullptr, log, "MPPTask without exchange receiver map");
-    return *mpp_exchange_receiver_map;
+    RUNTIME_ASSERT(mpp_receiver_set != nullptr, log, "MPPTask without receiver set");
+    return mpp_receiver_set->getExchangeReceiver(executor_id);
+}
+
+void DAGContext::addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader)
+{
+    if (!isMPPTask())
+        return;
+    RUNTIME_ASSERT(mpp_receiver_set != nullptr, log, "MPPTask without receiver set");
+    return mpp_receiver_set->addCoprocessorReader(coprocessor_reader);
 }
 
 bool DAGContext::containsRegionsInfoForTable(Int64 table_id) const
diff --git a/dbms/src/Flash/Coprocessor/DAGContext.h b/dbms/src/Flash/Coprocessor/DAGContext.h
index 07b65b2d8fe..a50a4d4007b 100644
--- a/dbms/src/Flash/Coprocessor/DAGContext.h
+++ b/dbms/src/Flash/Coprocessor/DAGContext.h
@@ -37,8 +37,13 @@ namespace DB
 class Context;
 class MPPTunnelSet;
 class ExchangeReceiver;
-using ExchangeReceiverMap = std::unordered_map<String, std::shared_ptr<ExchangeReceiver>>;
-using ExchangeReceiverMapPtr = std::shared_ptr<std::unordered_map<String, std::shared_ptr<ExchangeReceiver>>>;
+using ExchangeReceiverPtr = std::shared_ptr<ExchangeReceiver>;
+/// key: executor_id of ExchangeReceiver nodes in dag.
+using ExchangeReceiverMap = std::unordered_map<String, ExchangeReceiverPtr>;
+class MPPReceiverSet;
+using MPPReceiverSetPtr = std::shared_ptr<MPPReceiverSet>;
+class CoprocessorReader;
+using CoprocessorReaderPtr = std::shared_ptr<CoprocessorReader>;
 
 class Join;
 using JoinPtr = std::shared_ptr<Join>;
@@ -304,11 +309,12 @@ class DAGContext
 
     bool columnsForTestEmpty() { return columns_for_test_map.empty(); }
 
-    const std::unordered_map<String, std::shared_ptr<ExchangeReceiver>> & getMPPExchangeReceiverMap() const;
-    void setMPPExchangeReceiverMap(ExchangeReceiverMapPtr & exchange_receiver_map)
+    ExchangeReceiverPtr getMPPExchangeReceiver(const String & executor_id) const;
+    void setMPPReceiverSet(const MPPReceiverSetPtr & receiver_set)
     {
-        mpp_exchange_receiver_map = exchange_receiver_map;
+        mpp_receiver_set = receiver_set;
     }
+    void addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader);
 
     void addSubquery(const String & subquery_id, SubqueryForSet && subquery);
     bool hasSubquery() const { return !subqueries.empty(); }
@@ -343,6 +349,10 @@ class DAGContext
     std::vector<tipb::FieldType> output_field_types;
     std::vector<Int32> output_offsets;
 
+    /// Hold the order of list based executors.
+    /// It is used to ensure that the order of Execution summary of list based executors is the same as the order of list based executors.
+    std::vector<String> list_based_executors_order;
+
 private:
     void initExecutorIdToJoinIdMap();
     void initOutputInfo();
@@ -350,7 +360,7 @@ class DAGContext
 private:
     /// Hold io for correcting the destruction order.
     BlockIO io;
-    /// profile_streams_map is a map that maps from executor_id to profile BlockInputStreams
+    /// profile_streams_map is a map that maps from executor_id to profile BlockInputStreams.
     std::unordered_map<String, BlockInputStreams> profile_streams_map;
     /// executor_id_to_join_id_map is a map that maps executor id to all the join executor id of itself and all its children.
     std::unordered_map<String, std::vector<String>> executor_id_to_join_id_map;
@@ -369,8 +379,8 @@ class DAGContext
     ConcurrentBoundedQueue<tipb::Error> warnings;
     /// warning_count is the actual warning count during the entire execution
     std::atomic<UInt64> warning_count;
-    /// key: executor_id of ExchangeReceiver nodes in dag.
-    ExchangeReceiverMapPtr mpp_exchange_receiver_map;
+
+    MPPReceiverSetPtr mpp_receiver_set;
     /// vector of SubqueriesForSets(such as join build subquery).
     /// The order of the vector is also the order of the subquery.
     std::vector<SubqueriesForSets> subqueries;
diff --git a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp
index 86d6428c92a..e322a830744 100644
--- a/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp
+++ b/dbms/src/Flash/Coprocessor/DAGQueryBlockInterpreter.cpp
@@ -481,14 +481,14 @@ void DAGQueryBlockInterpreter::recordProfileStreams(DAGPipeline & pipeline, cons
 
 void DAGQueryBlockInterpreter::handleExchangeReceiver(DAGPipeline & pipeline)
 {
-    auto it = dagContext().getMPPExchangeReceiverMap().find(query_block.source_name);
-    if (unlikely(it == dagContext().getMPPExchangeReceiverMap().end()))
+    auto exchange_receiver = dagContext().getMPPExchangeReceiver(query_block.source_name);
+    if (unlikely(exchange_receiver == nullptr))
         throw Exception("Can not find exchange receiver for " + query_block.source_name, ErrorCodes::LOGICAL_ERROR);
     // todo choose a more reasonable stream number
     auto & exchange_receiver_io_input_streams = dagContext().getInBoundIOInputStreamsMap()[query_block.source_name];
     for (size_t i = 0; i < max_streams; ++i)
     {
-        BlockInputStreamPtr stream = std::make_shared<ExchangeReceiverInputStream>(it->second, log->identifier(), query_block.source_name);
+        BlockInputStreamPtr stream = std::make_shared<ExchangeReceiverInputStream>(exchange_receiver, log->identifier(), query_block.source_name);
         exchange_receiver_io_input_streams.push_back(stream);
         stream = std::make_shared<SquashingBlockInputStream>(stream, 8192, 0, log->identifier());
         stream->setExtraInfo("squashing after exchange receiver");
diff --git a/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp b/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp
index 882699e1599..d68a7b17aaa 100644
--- a/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp
+++ b/dbms/src/Flash/Coprocessor/DAGQuerySource.cpp
@@ -20,6 +20,26 @@
 
 namespace DB
 {
+namespace
+{
+void fillOrderForListBasedExecutors(DAGContext & dag_context, const DAGQueryBlock & query_block)
+{
+    assert(query_block.source); // the source name is pushed unconditionally below
+    auto & order = dag_context.list_based_executors_order; // ids are appended source first, then each optional executor that is present
+    order.push_back(query_block.source_name);
+    if (query_block.selection)
+        order.push_back(query_block.selection_name);
+    if (query_block.aggregation)
+        order.push_back(query_block.aggregation_name);
+    if (query_block.having)
+        order.push_back(query_block.having_name);
+    if (query_block.limit_or_topn)
+        order.push_back(query_block.limit_or_topn_name);
+    if (query_block.exchange_sender)
+        order.push_back(query_block.exchange_sender_name);
+}
+} // namespace
+
 DAGQuerySource::DAGQuerySource(Context & context_)
     : context(context_)
 {
@@ -32,6 +52,9 @@ DAGQuerySource::DAGQuerySource(Context & context_)
     else
     {
         root_query_block = std::make_shared<DAGQueryBlock>(1, dag_request.executors());
+        auto & dag_context = getDAGContext();
+        if (!dag_context.return_executor_id)
+            fillOrderForListBasedExecutors(dag_context, *root_query_block);
     }
 }
 
diff --git a/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp b/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp
index 53bebc91da8..33f6d99f9d8 100644
--- a/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp
+++ b/dbms/src/Flash/Coprocessor/DAGResponseWriter.cpp
@@ -89,12 +89,10 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo
         }
     }
 
-    /// add execution_summary for local executor
-    for (auto & p : dag_context.getProfileStreamsMap())
-    {
+    auto fill_execution_summary = [&](const String & executor_id, const BlockInputStreams & streams) {
         ExecutionSummary current;
         /// part 1: local execution info
-        for (auto & stream_ptr : p.second)
+        for (const auto & stream_ptr : streams)
         {
             if (auto * p_stream = dynamic_cast<IProfilingBlockInputStream *>(stream_ptr.get()))
             {
@@ -105,16 +103,16 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo
             current.concurrency++;
         }
         /// part 2: remote execution info
-        if (merged_remote_execution_summaries.find(p.first) != merged_remote_execution_summaries.end())
+        if (merged_remote_execution_summaries.find(executor_id) != merged_remote_execution_summaries.end())
         {
-            for (auto & remote : merged_remote_execution_summaries[p.first])
+            for (auto & remote : merged_remote_execution_summaries[executor_id])
                 current.merge(remote, false);
         }
         /// part 3: for join need to add the build time
         /// In TiFlash, a hash join's build side is finished before probe side starts,
         /// so the join probe side's running time does not include hash table's build time,
         /// when construct ExecSummaries, we need add the build cost to probe executor
-        auto all_join_id_it = dag_context.getExecutorIdToJoinIdMap().find(p.first);
+        auto all_join_id_it = dag_context.getExecutorIdToJoinIdMap().find(executor_id);
         if (all_join_id_it != dag_context.getExecutorIdToJoinIdMap().end())
         {
             for (const auto & join_executor_id : all_join_id_it->second)
@@ -138,8 +136,27 @@ void DAGResponseWriter::addExecuteSummaries(tipb::SelectResponse & response, boo
         }
 
         current.time_processed_ns += dag_context.compile_time_ns;
-        fillTiExecutionSummary(response.add_execution_summaries(), current, p.first, delta_mode);
+        fillTiExecutionSummary(response.add_execution_summaries(), current, executor_id, delta_mode);
+    };
+
+    /// add execution_summary for local executor
+    if (dag_context.return_executor_id)
+    {
+        for (auto & p : dag_context.getProfileStreamsMap())
+            fill_execution_summary(p.first, p.second);
+    }
+    else
+    {
+        const auto & profile_streams_map = dag_context.getProfileStreamsMap();
+        assert(profile_streams_map.size() == dag_context.list_based_executors_order.size());
+        for (const auto & executor_id : dag_context.list_based_executors_order)
+        {
+            auto it = profile_streams_map.find(executor_id);
+            assert(it != profile_streams_map.end());
+            fill_execution_summary(executor_id, it->second);
+        }
     }
+
     for (auto & p : merged_remote_execution_summaries)
     {
         if (local_executors.find(p.first) == local_executors.end())
diff --git a/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp b/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp
index 14cddd94730..ad2de7217e0 100644
--- a/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp
+++ b/dbms/src/Flash/Coprocessor/DAGStorageInterpreter.cpp
@@ -486,6 +486,7 @@ void DAGStorageInterpreter::buildRemoteStreams(std::vector<RemoteRequest> && rem
         std::vector<pingcap::coprocessor::copTask> tasks(all_tasks.begin() + task_start, all_tasks.begin() + task_end);
 
         auto coprocessor_reader = std::make_shared<CoprocessorReader>(schema, cluster, tasks, has_enforce_encode_type, 1);
+        context.getDAGContext()->addCoprocessorReader(coprocessor_reader);
         BlockInputStreamPtr input = std::make_shared<CoprocessorBlockInputStream>(coprocessor_reader, log->identifier(), table_scan.getTableScanExecutorID());
         pipeline.streams.push_back(input);
         task_start = task_end;
diff --git a/dbms/src/Flash/Coprocessor/TiDBColumn.cpp b/dbms/src/Flash/Coprocessor/TiDBColumn.cpp
index 7183374a5c1..eef89696d3a 100644
--- a/dbms/src/Flash/Coprocessor/TiDBColumn.cpp
+++ b/dbms/src/Flash/Coprocessor/TiDBColumn.cpp
@@ -28,7 +28,7 @@ template <typename T>
 void encodeLittleEndian(const T & value, WriteBuffer & ss)
 {
     auto v = toLittleEndian(value);
-    ss.write(reinterpret_cast<const char *>(&v), sizeof(v));
+    ss.template writeFixed<T>(&v);
 }
 
 TiDBColumn::TiDBColumn(Int8 element_len_)
@@ -141,10 +141,10 @@ void TiDBColumn::append(const TiDBDecimal & decimal)
     encodeLittleEndian<UInt8>(decimal.digits_int, *data);
     encodeLittleEndian<UInt8>(decimal.digits_frac, *data);
     encodeLittleEndian<UInt8>(decimal.result_frac, *data);
-    encodeLittleEndian<UInt8>((UInt8)decimal.negative, *data);
-    for (int i = 0; i < MAX_WORD_BUF_LEN; i++)
+    encodeLittleEndian<UInt8>(static_cast<UInt8>(decimal.negative), *data);
+    for (int i : decimal.word_buf)
     {
-        encodeLittleEndian<Int32>(decimal.word_buf[i], *data);
+        encodeLittleEndian<Int32>(i, *data);
     }
     finishAppendFixed();
 }
diff --git a/dbms/src/Flash/EstablishCall.cpp b/dbms/src/Flash/EstablishCall.cpp
index 8af81e30962..89857a2407e 100644
--- a/dbms/src/Flash/EstablishCall.cpp
+++ b/dbms/src/Flash/EstablishCall.cpp
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Common/FailPoint.h>
 #include <Common/TiFlashMetrics.h>
 #include <Flash/EstablishCall.h>
 #include <Flash/FlashService.h>
@@ -19,6 +20,11 @@
 
 namespace DB
 {
+namespace FailPoints
+{
+extern const char random_tunnel_init_rpc_failure_failpoint[];
+} // namespace FailPoints
+
 EstablishCallData::EstablishCallData(AsyncFlashService * service, grpc::ServerCompletionQueue * cq, grpc::ServerCompletionQueue * notify_cq, const std::shared_ptr<std::atomic<bool>> & is_shutdown)
     : service(service)
     , cq(cq)
@@ -71,6 +77,7 @@ void EstablishCallData::initRpc()
     std::exception_ptr eptr = nullptr;
     try
     {
+        FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_tunnel_init_rpc_failure_failpoint);
         service->establishMPPConnectionSyncOrAsync(&ctx, &request, nullptr, this);
     }
     catch (...)
diff --git a/dbms/src/Flash/Mpp/ExchangeReceiver.cpp b/dbms/src/Flash/Mpp/ExchangeReceiver.cpp
index f194afee31f..3b36adf2c40 100644
--- a/dbms/src/Flash/Mpp/ExchangeReceiver.cpp
+++ b/dbms/src/Flash/Mpp/ExchangeReceiver.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <Common/CPUAffinityManager.h>
+#include <Common/FailPoint.h>
 #include <Common/ThreadFactory.h>
 #include <Common/TiFlashMetrics.h>
 #include <Flash/Coprocessor/CoprocessorReader.h>
@@ -22,6 +23,12 @@
 
 namespace DB
 {
+namespace FailPoints
+{
+extern const char random_receiver_sync_msg_push_failure_failpoint[];
+extern const char random_receiver_async_msg_push_failure_failpoint[];
+} // namespace FailPoints
+
 namespace
 {
 String getReceiverStateStr(const ExchangeReceiverState & s)
@@ -257,7 +264,9 @@ class AsyncRequestHandler : public UnaryCallback<bool>
             recv_msg->packet = std::move(packet);
             recv_msg->source_index = request->source_index;
             recv_msg->req_info = req_info;
-            if (!msg_channel->push(std::move(recv_msg)))
+            bool push_success = msg_channel->push(std::move(recv_msg));
+            fiu_do_on(FailPoints::random_receiver_async_msg_push_failure_failpoint, push_success = false;);
+            if (!push_success)
                 return false;
             // can't reuse packet since it is sent to readers.
             packet = std::make_shared<MPPDataPacket>();
@@ -349,7 +358,7 @@ template <typename RPCContext>
 void ExchangeReceiverBase<RPCContext>::cancel()
 {
     setEndState(ExchangeReceiverState::CANCELED);
-    msg_channel.finish();
+    msg_channel.cancel();
 }
 
 template <typename RPCContext>
@@ -483,7 +492,9 @@ void ExchangeReceiverBase<RPCContext>::readLoop(const Request & req)
                 if (recv_msg->packet->has_error())
                     throw Exception("Exchange receiver meet error : " + recv_msg->packet->error().msg());
 
-                if (!msg_channel.push(std::move(recv_msg)))
+                bool push_success = msg_channel.push(std::move(recv_msg));
+                fiu_do_on(FailPoints::random_receiver_sync_msg_push_failure_failpoint, push_success = false;);
+                if (!push_success)
                 {
                     meet_error = true;
                     auto local_state = getState();
diff --git a/dbms/src/Flash/Mpp/MPPHandler.cpp b/dbms/src/Flash/Mpp/MPPHandler.cpp
index a3096aaa644..7f97a1dd698 100644
--- a/dbms/src/Flash/Mpp/MPPHandler.cpp
+++ b/dbms/src/Flash/Mpp/MPPHandler.cpp
@@ -31,7 +31,7 @@ void MPPHandler::handleError(const MPPTaskPtr & task, String error)
     try
     {
         if (task)
-            task->cancel(error);
+            task->handleError(error);
     }
     catch (...)
     {
diff --git a/dbms/src/Flash/Mpp/MPPReceiverSet.cpp b/dbms/src/Flash/Mpp/MPPReceiverSet.cpp
new file mode 100644
index 00000000000..60cca308c18
--- /dev/null
+++ b/dbms/src/Flash/Mpp/MPPReceiverSet.cpp
@@ -0,0 +1,48 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Flash/Mpp/ExchangeReceiver.h>
+#include <Flash/Mpp/MPPReceiverSet.h>
+
+namespace DB
+{
+void MPPReceiverSet::addExchangeReceiver(const String & executor_id, const ExchangeReceiverPtr & exchange_receiver)
+{
+    RUNTIME_ASSERT(exchange_receiver_map.find(executor_id) == exchange_receiver_map.end(), log, "Duplicate executor_id: {} in DAGRequest", executor_id);
+    exchange_receiver_map.emplace(executor_id, exchange_receiver); // the key was just checked to be absent, so this always inserts
+}
+
+void MPPReceiverSet::addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader)
+{
+    coprocessor_readers.emplace_back(coprocessor_reader); // kept so that cancel() can reach this reader later
+}
+
+ExchangeReceiverPtr MPPReceiverSet::getExchangeReceiver(const String & executor_id) const
+{
+    const auto pos = exchange_receiver_map.find(executor_id);
+    const bool missing = unlikely(pos == exchange_receiver_map.end());
+    /// An unknown executor_id is reported with a null pointer; the caller decides how to react.
+    return missing ? nullptr : pos->second;
+}
+
+void MPPReceiverSet::cancel()
+{
+    /// Exchange receivers are cancelled first, then the coprocessor readers.
+    for (auto & entry : exchange_receiver_map)
+        entry.second->cancel();
+
+    for (auto & reader : coprocessor_readers)
+        reader->cancel();
+}
+} // namespace DB
diff --git a/dbms/src/Flash/Mpp/MPPReceiverSet.h b/dbms/src/Flash/Mpp/MPPReceiverSet.h
new file mode 100644
index 00000000000..44274cb3ce8
--- /dev/null
+++ b/dbms/src/Flash/Mpp/MPPReceiverSet.h
@@ -0,0 +1,44 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <Flash/Coprocessor/CoprocessorReader.h>
+#include <Flash/Coprocessor/DAGContext.h>
+
+namespace DB
+{
+class MPPReceiverSet // Groups exchange receivers and coprocessor readers so that all of them can be cancelled through one call.
+{
+public:
+    explicit MPPReceiverSet(const String & req_id)
+        : log(Logger::get("MPPReceiverSet", req_id))
+    {}
+    void addExchangeReceiver(const String & executor_id, const ExchangeReceiverPtr & exchange_receiver); // asserts that executor_id is not registered yet
+    void addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader);
+    ExchangeReceiverPtr getExchangeReceiver(const String & executor_id) const; // returns nullptr when executor_id is unknown
+    void cancel(); // cancels every exchange receiver, then every coprocessor reader
+
+private:
+    /// Two kinds of receivers exist in MPP:
+    /// ExchangeReceiver: receives data from other MPPTasks
+    /// CoprocessorReader: used in remote read
+    ExchangeReceiverMap exchange_receiver_map; // key: executor_id
+    std::vector<CoprocessorReaderPtr> coprocessor_readers;
+    const LoggerPtr log;
+};
+
+using MPPReceiverSetPtr = std::shared_ptr<MPPReceiverSet>;
+
+} // namespace DB
diff --git a/dbms/src/Flash/Mpp/MPPTask.cpp b/dbms/src/Flash/Mpp/MPPTask.cpp
index 40f03ff79ba..c2d5e6f49f8 100644
--- a/dbms/src/Flash/Mpp/MPPTask.cpp
+++ b/dbms/src/Flash/Mpp/MPPTask.cpp
@@ -51,6 +51,7 @@ extern const char exception_before_mpp_register_tunnel_for_root_mpp_task[];
 extern const char exception_during_mpp_register_tunnel_for_non_root_mpp_task[];
 extern const char exception_during_mpp_write_err_to_tunnel[];
 extern const char force_no_local_region_for_mpp_task[];
+extern const char random_task_lifecycle_failpoint[];
 } // namespace FailPoints
 
 MPPTask::MPPTask(const mpp::TaskMeta & meta_, const ContextPtr & context_)
@@ -80,6 +81,34 @@ MPPTask::~MPPTask()
     LOG_FMT_DEBUG(log, "finish MPPTask: {}", id.toString());
 }
 
+void MPPTask::abortTunnels(const String & message, AbortType abort_type)
+{
+    /// On cancellation the tunnels are simply closed with the given message.
+    if (abort_type == AbortType::ONCANCELLATION)
+    {
+        closeAllTunnels(message);
+        return;
+    }
+    /// Otherwise (ONERROR) the message is written through the tunnel set, which must exist.
+    RUNTIME_ASSERT(tunnel_set != nullptr, log, "mpp task without tunnel set");
+    tunnel_set->writeError(message);
+}
+
+void MPPTask::abortReceivers()
+{
+    /// receiver_set is created by initExchangeReceivers(); there is nothing to cancel before that.
+    const bool has_receivers = likely(receiver_set != nullptr);
+    if (has_receivers)
+        receiver_set->cancel();
+}
+
+void MPPTask::abortDataStreams(AbortType abort_type)
+{
+    /// With ONERROR the task already knows it has met an error, so the remaining work is stopped silently (is_kill == false) to avoid many useless error messages.
+    const bool is_kill = (abort_type == AbortType::ONCANCELLATION);
+    context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, is_kill);
+}
+
 void MPPTask::closeAllTunnels(const String & reason)
 {
     if (likely(tunnel_set))
@@ -125,7 +154,7 @@ void MPPTask::registerTunnels(const mpp::DispatchTaskRequest & task_request)
 
 void MPPTask::initExchangeReceivers()
 {
-    mpp_exchange_receiver_map = std::make_shared<ExchangeReceiverMap>();
+    receiver_set = std::make_shared<MPPReceiverSet>(log->identifier());
     traverseExecutors(&dag_req, [&](const tipb::Executor & executor) {
         if (executor.tp() == tipb::ExecType::TypeExchangeReceiver)
         {
@@ -147,23 +176,12 @@ void MPPTask::initExchangeReceivers()
             if (status != RUNNING)
                 throw Exception("exchange receiver map can not be initialized, because the task is not in running state");
 
-            (*mpp_exchange_receiver_map)[executor_id] = exchange_receiver;
+            receiver_set->addExchangeReceiver(executor_id, exchange_receiver);
             new_thread_count_of_exchange_receiver += exchange_receiver->computeNewThreadCount();
         }
         return true;
     });
-    dag_context->setMPPExchangeReceiverMap(mpp_exchange_receiver_map);
-}
-
-void MPPTask::cancelAllExchangeReceivers()
-{
-    if (likely(mpp_exchange_receiver_map != nullptr))
-    {
-        for (auto & it : *mpp_exchange_receiver_map)
-        {
-            it.second->cancel();
-        }
-    }
+    dag_context->setMPPReceiverSet(receiver_set);
 }
 
 std::pair<MPPTunnelPtr, String> MPPTask::getTunnel(const ::mpp::EstablishMPPConnectionRequest * request)
@@ -359,92 +377,122 @@ void MPPTask::runImpl()
             return_statistics.blocks,
             return_statistics.bytes);
     }
-    catch (Exception & e)
-    {
-        err_msg = e.displayText();
-        LOG_FMT_ERROR(log, "task running meets error: {} Stack Trace : {}", err_msg, e.getStackTrace().toString());
-    }
-    catch (pingcap::Exception & e)
-    {
-        err_msg = e.message();
-        LOG_FMT_ERROR(log, "task running meets error: {}", err_msg);
-    }
-    catch (std::exception & e)
-    {
-        err_msg = e.what();
-        LOG_FMT_ERROR(log, "task running meets error: {}", err_msg);
-    }
     catch (...)
     {
-        err_msg = "unrecovered error";
-        LOG_FMT_ERROR(log, "task running meets error: {}", err_msg);
+        err_msg = getCurrentExceptionMessage(true);
     }
+
     if (err_msg.empty())
     {
-        // todo when error happens, should try to update the metrics if it is available
-        auto throughput = dag_context->getTableScanThroughput();
-        if (throughput.first)
-            GET_METRIC(tiflash_storage_logical_throughput_bytes).Observe(throughput.second);
-        auto process_info = context->getProcessListElement()->getInfo();
-        auto peak_memory = process_info.peak_memory_usage > 0 ? process_info.peak_memory_usage : 0;
-        GET_METRIC(tiflash_coprocessor_request_memory_usage, type_run_mpp_task).Observe(peak_memory);
-        mpp_task_statistics.setMemoryPeak(peak_memory);
+        if (switchStatus(RUNNING, FINISHED))
+            LOG_INFO(log, "finish task");
+        else
+            LOG_FMT_WARNING(log, "finish task which is in {} state", taskStatusToString(status));
+        if (status == FINISHED)
+        {
+            // todo when error happens, should try to update the metrics if it is available
+            auto throughput = dag_context->getTableScanThroughput();
+            if (throughput.first)
+                GET_METRIC(tiflash_storage_logical_throughput_bytes).Observe(throughput.second);
+            auto process_info = context->getProcessListElement()->getInfo();
+            auto peak_memory = process_info.peak_memory_usage > 0 ? process_info.peak_memory_usage : 0;
+            GET_METRIC(tiflash_coprocessor_request_memory_usage, type_run_mpp_task).Observe(peak_memory);
+            mpp_task_statistics.setMemoryPeak(peak_memory);
+        }
     }
     else
     {
-        context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, true);
-        cancelAllExchangeReceivers();
-        writeErrToAllTunnels(err_msg);
+        if (status == RUNNING)
+        {
+            LOG_FMT_ERROR(log, "task running meets error: {}", err_msg);
+            try
+            {
+                handleError(err_msg);
+            }
+            catch (...)
+            {
+                tryLogCurrentException(log, "Meet error while try to handle error in MPPTask");
+            }
+        }
     }
     LOG_FMT_INFO(log, "task ends, time cost is {} ms.", stopwatch.elapsedMilliseconds());
-    unregisterTask();
-
-    if (switchStatus(RUNNING, FINISHED))
-        LOG_INFO(log, "finish task");
-    else
-        LOG_WARNING(log, "finish task which was cancelled before");
+    // The unregister flag exists only for fail point usage: it reproduces the situation where the MPPTask is destructed
+    // by the grpc CancelMPPTask thread.
+    bool unregister = true;
+    fiu_do_on(FailPoints::random_task_lifecycle_failpoint, {
+        if (!err_msg.empty())
+            unregister = false;
+    });
+    if (unregister)
+        unregisterTask();
 
-    mpp_task_statistics.end(status.load(), err_msg);
+    mpp_task_statistics.end(status.load(), err_string);
     mpp_task_statistics.logTracingJson();
 }
 
-void MPPTask::writeErrToAllTunnels(const String & e)
+void MPPTask::handleError(const String & error_msg) // Aborts the task with AbortType::ONERROR unless its manager reports that the task is about to be cancelled.
 {
-    RUNTIME_ASSERT(tunnel_set != nullptr, log, "mpp task without tunnel set");
-    tunnel_set->writeError(e);
+    if (manager == nullptr || !manager->isTaskToBeCancelled(id)) // without a manager there is nobody to ask, so the error is always handled
+        abort(error_msg, AbortType::ONERROR);
 }
 
-void MPPTask::cancel(const String & reason)
+void MPPTask::abort(const String & message, AbortType abort_type) // Moves an INITIALIZING or RUNNING task to CANCELLED (ONCANCELLATION) or FAILED (ONERROR) and tears it down.
 {
-    CPUAffinityManager::getInstance().bindSelfQueryThread();
-    LOG_FMT_WARNING(log, "Begin cancel task: {}", id.toString());
+    String abort_type_string;
+    TaskStatus next_task_status;
+    switch (abort_type) // pick the log label and the target status for this abort type
+    {
+    case AbortType::ONCANCELLATION:
+        abort_type_string = "ONCANCELLATION";
+        next_task_status = CANCELLED;
+        break;
+    case AbortType::ONERROR:
+        abort_type_string = "ONERROR";
+        next_task_status = FAILED;
+        break;
+    }
+    LOG_FMT_WARNING(log, "Begin abort task: {}, abort type: {}", id.toString(), abort_type_string);
     while (true)
     {
         auto previous_status = status.load();
-        if (previous_status == FINISHED || previous_status == CANCELLED)
+        if (previous_status == FINISHED || previous_status == CANCELLED || previous_status == FAILED) // already in a terminal state: nothing left to abort
         {
-            LOG_FMT_WARNING(log, "task already {}", (previous_status == FINISHED ? "finished" : "cancelled"));
+            LOG_FMT_WARNING(log, "task already in {} state", taskStatusToString(previous_status));
             return;
         }
-        else if (previous_status == INITIALIZING && switchStatus(INITIALIZING, CANCELLED))
+        else if (previous_status == INITIALIZING && switchStatus(INITIALIZING, next_task_status)) // if the compare-exchange in switchStatus fails, the loop re-reads the status
         {
-            closeAllTunnels(reason);
+            err_string = message;
+            /// if the task is in initializing state, mpp task can return error to TiDB directly,
+            /// so just close all tunnels here
+            closeAllTunnels(message);
             unregisterTask();
-            LOG_WARNING(log, "Finish cancel task from uninitialized");
+            LOG_WARNING(log, "Finish abort task from uninitialized");
             return;
         }
-        else if (previous_status == RUNNING && switchStatus(RUNNING, CANCELLED))
+        else if (previous_status == RUNNING && switchStatus(RUNNING, next_task_status))
         {
+            /// abort the components from top to bottom because if bottom components are aborted
+            /// first, the top components may see an error caused by the abort, which is not
+            /// the original error
+            err_string = message;
+            abortTunnels(message, abort_type);
+            abortDataStreams(abort_type);
+            abortReceivers();
             scheduleThisTask(ScheduleState::FAILED);
-            context->getProcessList().sendCancelToQuery(context->getCurrentQueryId(), context->getClientInfo().current_user, true);
-            closeAllTunnels(reason);
             /// runImpl is running, leave remaining work to runImpl
-            LOG_WARNING(log, "Finish cancel task from running");
+            LOG_WARNING(log, "Finish abort task from running");
             return;
         }
     }
 }
 
+void MPPTask::cancel(const String & reason) // Cancels the task by aborting it with AbortType::ONCANCELLATION, using reason as the abort message.
+{
+    CPUAffinityManager::getInstance().bindSelfQueryThread(); // NOTE(review): presumably binds the calling thread to the query CPU set before the abort work; confirm
+    abort(reason, AbortType::ONCANCELLATION);
+}
+
 bool MPPTask::switchStatus(TaskStatus from, TaskStatus to)
 {
     return status.compare_exchange_strong(from, to);
diff --git a/dbms/src/Flash/Mpp/MPPTask.h b/dbms/src/Flash/Mpp/MPPTask.h
index c8423ac484c..a30150b26e8 100644
--- a/dbms/src/Flash/Mpp/MPPTask.h
+++ b/dbms/src/Flash/Mpp/MPPTask.h
@@ -19,6 +19,7 @@
 #include <Common/MemoryTracker.h>
 #include <DataStreams/BlockIO.h>
 #include <Flash/Coprocessor/DAGContext.h>
+#include <Flash/Mpp/MPPReceiverSet.h>
 #include <Flash/Mpp/MPPTaskId.h>
 #include <Flash/Mpp/MPPTaskStatistics.h>
 #include <Flash/Mpp/MPPTunnel.h>
@@ -58,6 +59,8 @@ class MPPTask : public std::enable_shared_from_this<MPPTask>
 
     void cancel(const String & reason);
 
+    void handleError(const String & error_msg);
+
     void prepare(const mpp::DispatchTaskRequest & task_request);
 
     void run();
@@ -89,12 +92,22 @@ class MPPTask : public std::enable_shared_from_this<MPPTask>
 
     void unregisterTask();
 
-    void writeErrToAllTunnels(const String & e);
-
     /// Similar to `writeErrToAllTunnels`, but it just try to write the error message to tunnel
     /// without waiting the tunnel to be connected
     void closeAllTunnels(const String & reason);
 
+    enum class AbortType // selects how abort() tears the task down
+    {
+        /// todo add ONKILL to distinguish between silent cancellation and kill
+        ONCANCELLATION, // task moves to CANCELLED; its tunnels are closed with the given reason
+        ONERROR, // task moves to FAILED; a running task writes the error to its tunnel set
+    };
+    void abort(const String & message, AbortType abort_type);
+
+    void abortTunnels(const String & message, AbortType abort_type);
+    void abortReceivers();
+    void abortDataStreams(AbortType abort_type);
+
     void finishWrite();
 
     bool switchStatus(TaskStatus from, TaskStatus to);
@@ -109,8 +122,6 @@ class MPPTask : public std::enable_shared_from_this<MPPTask>
 
     void initExchangeReceivers();
 
-    void cancelAllExchangeReceivers();
-
     tipb::DAGRequest dag_req;
 
     ContextPtr context;
@@ -120,14 +131,15 @@ class MPPTask : public std::enable_shared_from_this<MPPTask>
     MemoryTracker * memory_tracker = nullptr;
 
     std::atomic<TaskStatus> status{INITIALIZING};
+    String err_string;
 
     mpp::TaskMeta meta;
 
     MPPTaskId id;
 
     MPPTunnelSetPtr tunnel_set;
-    /// key: executor_id of ExchangeReceiver nodes in dag.
-    ExchangeReceiverMapPtr mpp_exchange_receiver_map;
+
+    MPPReceiverSetPtr receiver_set;
 
     int new_thread_count_of_exchange_receiver = 0;
 
@@ -137,8 +149,6 @@ class MPPTask : public std::enable_shared_from_this<MPPTask>
 
     MPPTaskStatistics mpp_task_statistics;
 
-    Exception err;
-
     friend class MPPTaskManager;
 
     int needed_threads;
diff --git a/dbms/src/Flash/Mpp/MPPTaskManager.cpp b/dbms/src/Flash/Mpp/MPPTaskManager.cpp
index 531f8f7a10d..c5499eda89d 100644
--- a/dbms/src/Flash/Mpp/MPPTaskManager.cpp
+++ b/dbms/src/Flash/Mpp/MPPTaskManager.cpp
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Common/FailPoint.h>
 #include <Common/FmtUtils.h>
 #include <Flash/Mpp/MPPTaskManager.h>
 #include <fmt/core.h>
@@ -22,6 +23,11 @@
 
 namespace DB
 {
+namespace FailPoints
+{
+extern const char random_task_manager_find_task_failure_failpoint[];
+} // namespace FailPoints
+
 MPPTaskManager::MPPTaskManager(MPPTaskSchedulerPtr scheduler_)
     : scheduler(std::move(scheduler_))
     , log(&Poco::Logger::get("TaskManager"))
@@ -50,6 +56,7 @@ MPPTaskPtr MPPTaskManager::findTaskWithTimeout(const mpp::TaskMeta & meta, std::
         it = query_it->second->task_map.find(id);
         return it != query_it->second->task_map.end();
     });
+    fiu_do_on(FailPoints::random_task_manager_find_task_failure_failpoint, ret = false;);
     if (cancelled)
     {
         errMsg = fmt::format("Task [{},{}] has been cancelled.", meta.start_ts(), meta.task_id());
@@ -140,6 +147,17 @@ bool MPPTaskManager::registerTask(MPPTaskPtr task)
     return true;
 }
 
+bool MPPTaskManager::isTaskToBeCancelled(const MPPTaskId & task_id)
+{
+    /// True only if the task's query is flagged to_be_cancelled and the task is still present in that query's task_map.
+    std::unique_lock lock(mu);
+    const auto query_it = mpp_query_map.find(task_id.start_ts);
+    if (query_it == mpp_query_map.end() || !query_it->second->to_be_cancelled)
+        return false;
+    const auto & tasks = query_it->second->task_map;
+    return tasks.find(task_id) != tasks.end();
+}
+
 void MPPTaskManager::unregisterTask(MPPTask * task)
 {
     std::unique_lock lock(mu);
diff --git a/dbms/src/Flash/Mpp/MPPTaskManager.h b/dbms/src/Flash/Mpp/MPPTaskManager.h
index d7047804aca..770acea3853 100644
--- a/dbms/src/Flash/Mpp/MPPTaskManager.h
+++ b/dbms/src/Flash/Mpp/MPPTaskManager.h
@@ -73,6 +73,8 @@ class MPPTaskManager : private boost::noncopyable
 
     void unregisterTask(MPPTask * task);
 
+    bool isTaskToBeCancelled(const MPPTaskId & task_id);
+
     bool tryToScheduleTask(const MPPTaskPtr & task);
 
     void releaseThreadsFromScheduler(const int needed_threads);
diff --git a/dbms/src/Flash/Mpp/MPPTunnel.cpp b/dbms/src/Flash/Mpp/MPPTunnel.cpp
index 826e7fea88a..13a7eaad95e 100644
--- a/dbms/src/Flash/Mpp/MPPTunnel.cpp
+++ b/dbms/src/Flash/Mpp/MPPTunnel.cpp
@@ -25,6 +25,7 @@ namespace DB
 namespace FailPoints
 {
 extern const char exception_during_mpp_close_tunnel[];
+extern const char random_tunnel_wait_timeout_failpoint[];
 } // namespace FailPoints
 
 template <typename Writer>
@@ -322,6 +323,7 @@ void MPPTunnelBase<Writer>::waitUntilConnectedOrFinished(std::unique_lock<std::m
         auto res = cv_for_connected_or_finished.wait_for(lk, timeout, connected_or_finished);
         LOG_FMT_TRACE(log, "end waitUntilConnectedOrFinished");
 
+        fiu_do_on(FailPoints::random_tunnel_wait_timeout_failpoint, res = false;);
         if (!res)
             throw Exception(tunnel_id + " is timeout");
     }
diff --git a/dbms/src/Flash/Mpp/MinTSOScheduler.cpp b/dbms/src/Flash/Mpp/MinTSOScheduler.cpp
index af525bd1a55..967bfcecfa3 100644
--- a/dbms/src/Flash/Mpp/MinTSOScheduler.cpp
+++ b/dbms/src/Flash/Mpp/MinTSOScheduler.cpp
@@ -12,12 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Common/FailPoint.h>
 #include <Common/TiFlashMetrics.h>
 #include <Flash/Mpp/MPPTaskManager.h>
 #include <Flash/Mpp/MinTSOScheduler.h>
 
 namespace DB
 {
+namespace FailPoints
+{
+extern const char random_min_tso_scheduler_failpoint[];
+} // namespace FailPoints
+
 constexpr UInt64 MAX_UINT64 = std::numeric_limits<UInt64>::max();
 constexpr UInt64 OS_THREAD_SOFT_LIMIT = 100000;
 
@@ -193,7 +199,9 @@ bool MinTSOScheduler::scheduleImp(const UInt64 tso, const MPPQueryTaskSetPtr & q
     }
     else
     {
-        if (tso <= min_tso) /// the min_tso query should fully run, otherwise throw errors here.
+        bool is_tso_min = tso <= min_tso;
+        fiu_do_on(FailPoints::random_min_tso_scheduler_failpoint, is_tso_min = true;);
+        if (is_tso_min) /// the min_tso query should fully run, otherwise throw errors here.
         {
             has_error = true;
             auto msg = fmt::format("threads are unavailable for the query {} ({} min_tso {}) {}, need {}, but used {} of the thread hard limit {}, {} active and {} waiting queries.", tso, tso == min_tso ? "is" : "is newer than", min_tso, isWaiting ? "from the waiting set" : "when directly schedule it", needed_threads, estimated_thread_usage, thread_hard_limit, active_set.size(), waiting_set.size());
diff --git a/dbms/src/Flash/Mpp/TaskStatus.cpp b/dbms/src/Flash/Mpp/TaskStatus.cpp
index 423b768faea..c87ae2b8eb4 100644
--- a/dbms/src/Flash/Mpp/TaskStatus.cpp
+++ b/dbms/src/Flash/Mpp/TaskStatus.cpp
@@ -29,6 +29,8 @@ StringRef taskStatusToString(const TaskStatus & status)
         return "FINISHED";
     case CANCELLED:
         return "CANCELLED";
+    case FAILED:
+        return "FAILED";
     default:
         throw Exception("Unknown TaskStatus");
     }
diff --git a/dbms/src/Flash/Mpp/TaskStatus.h b/dbms/src/Flash/Mpp/TaskStatus.h
index 999e30790bf..0997c8adc52 100644
--- a/dbms/src/Flash/Mpp/TaskStatus.h
+++ b/dbms/src/Flash/Mpp/TaskStatus.h
@@ -24,6 +24,7 @@ enum TaskStatus
     RUNNING,
     FINISHED,
     CANCELLED,
+    FAILED,
 };
 
 StringRef taskStatusToString(const TaskStatus & status);
diff --git a/dbms/src/Flash/tests/gtest_executor.cpp b/dbms/src/Flash/tests/gtest_executor.cpp
index 64c60f14bb6..b4ba1a75563 100644
--- a/dbms/src/Flash/tests/gtest_executor.cpp
+++ b/dbms/src/Flash/tests/gtest_executor.cpp
@@ -227,4 +227,4 @@ try
 CATCH
 
 } // namespace tests
-} // namespace DB
\ No newline at end of file
+} // namespace DB
diff --git a/dbms/src/Flash/tests/gtest_limit_executor.cpp b/dbms/src/Flash/tests/gtest_limit_executor.cpp
new file mode 100644
index 00000000000..e4a3aa5db5e
--- /dev/null
+++ b/dbms/src/Flash/tests/gtest_limit_executor.cpp
@@ -0,0 +1,77 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <TestUtils/ExecutorTestUtils.h>
+#include <TestUtils/mockExecutor.h>
+
+namespace DB
+{
+namespace tests
+{
+
+/// Fixture for Limit executor tests: registers a mock table with a single nullable String column.
+class ExecutorLimitTestRunner : public DB::tests::ExecutorTest
+{
+public:
+    using ColDataType = std::optional<typename TypeTraits<String>::FieldType>;
+    using ColumnWithData = std::vector<ColDataType>;
+
+    void initializeContext() override
+    {
+        ExecutorTest::initializeContext();
+        context.addMockTable({db_name, table_name},
+                             {{col_name, TiDB::TP::TypeString}},
+                             {toNullableVec<String>(col_name, col0)});
+    }
+
+    std::shared_ptr<tipb::DAGRequest> buildDAGRequest(size_t limit_num)
+    {
+        return context.scan(db_name, table_name).limit(limit_num).build(context);
+    }
+
+    /// Names and data of the mock table (`{}` entries of col0 are empty optionals).
+    const String db_name{"test_db"};
+    const String table_name{"limit_test_table"};
+    const String col_name{"limit_col"};
+    const ColumnWithData col0{"col0-0", {}, "col0-2", "col0-3", {}, "col0-5", "col0-6", "col0-7"};
+};
+
+TEST_F(ExecutorLimitTestRunner, Limit)
+try
+{
+    /// Limits to try: every value in [0, row_count + 2], then one limit far beyond the table size.
+    const size_t row_count = col0.size();
+    std::vector<size_t> limits;
+    for (size_t n = 0; n < row_count + 3; ++n)
+        limits.push_back(n);
+    limits.push_back(INT_MAX);
+
+    for (const size_t limit_num : limits)
+    {
+        /// Limit 0 expects no columns at all; otherwise the leading rows of col0, capped at row_count.
+        ColumnsWithTypeAndName expect_cols;
+        if (limit_num != 0)
+        {
+            const size_t kept_rows = limit_num > row_count ? row_count : limit_num;
+            expect_cols = {toNullableVec<String>(col_name, ColumnWithData(col0.begin(), col0.begin() + kept_rows))};
+        }
+
+        auto request = buildDAGRequest(limit_num);
+        executeStreams(request, expect_cols);
+    }
+}
+CATCH
+
+} // namespace tests
+} // namespace DB
diff --git a/dbms/src/Flash/tests/gtest_projection_executor.cpp b/dbms/src/Flash/tests/gtest_projection_executor.cpp
new file mode 100644
index 00000000000..4f6401eb483
--- /dev/null
+++ b/dbms/src/Flash/tests/gtest_projection_executor.cpp
@@ -0,0 +1,225 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <TestUtils/ExecutorTestUtils.h>
+#include <TestUtils/mockExecutor.h>
+
+namespace DB
+{
+namespace tests
+{
+
+/// Fixture for Projection executor tests: registers a mock table with three nullable String and two nullable Int32 columns.
+class ExecutorProjectionTestRunner : public DB::tests::ExecutorTest
+{
+public:
+    using ColDataString = std::vector<std::optional<typename TypeTraits<String>::FieldType>>;
+    using ColDataInt32 = std::vector<std::optional<typename TypeTraits<Int32>::FieldType>>;
+
+    void initializeContext() override
+    {
+        ExecutorTest::initializeContext();
+        context.addMockTable({db_name, table_name},
+                             {{col_names[0], TiDB::TP::TypeString},
+                              {col_names[1], TiDB::TP::TypeString},
+                              {col_names[2], TiDB::TP::TypeString},
+                              {col_names[3], TiDB::TP::TypeLong},
+                              {col_names[4], TiDB::TP::TypeLong}},
+                             {toNullableVec<String>(col_names[0], col0),
+                              toNullableVec<String>(col_names[1], col1),
+                              toNullableVec<String>(col_names[2], col2),
+                              toNullableVec<Int32>(col_names[3], col3),
+                              toNullableVec<Int32>(col_names[4], col4)});
+    }
+
+    template <typename T>
+    std::shared_ptr<tipb::DAGRequest> buildDAGRequest(T param, const String & sort_col)
+    {
+        /// A topN on sort_col (is_desc = false, limit 100) is appended so that results are stable in a concurrent environment.
+        return context.scan(db_name, table_name).project(param).topN(sort_col, false, 100).build(context);
+    }
+
+    void executeWithConcurrency(const std::shared_ptr<tipb::DAGRequest> & request, const ColumnsWithTypeAndName & expect_columns)
+    {
+        for (size_t i = 1; i < 10; i += 2)
+        {
+            executeStreams(request, expect_columns, i);
+        }
+    }
+
+    /// Prepare column data
+    const ColDataString col0{"col0-0", "col0-1", "", "col0-2", {}, "col0-3", ""};
+    const ColDataString col1{"col1-0", {}, "", "col1-1", "", "col1-2", "col1-3"};
+    const ColDataString col2{"", "col2-0", "col2-1", {}, "col2-3", {}, "col2-4"};
+    const ColDataInt32 col3{1, {}, 0, -111111, {}, 0, 9999};
+
+    /** All values in col4 must be distinct so that the topN on col4 yields a
+     *  single, deterministic row order; with duplicates several different
+     *  orders would be equally valid and the expected results ambiguous.
+     */
+    const ColDataInt32 col4{0, 5, -123, -234, {}, 24353, 9999};
+
+    /// Results after sorted by col4
+    const ColDataString col0_sorted_asc{{}, "col0-2", "", "col0-0", "col0-1", "", "col0-3"};
+    const ColDataString col1_sorted_asc{"", "col1-1", "", "col1-0", {}, "col1-3", "col1-2"};
+    const ColDataString col2_sorted_asc{"col2-3", {}, "col2-1", "", "col2-0", "col2-4", {}};
+    const ColDataInt32 col3_sorted_asc{{}, -111111, 0, 1, {}, 9999, 0};
+    const ColDataInt32 col4_sorted_asc{{}, -234, -123, 0, 5, 9999, 24353};
+
+    /// Prepare some names
+    const std::vector<String> col_names{"col0", "col1", "col2", "col3", "col4"};
+    const String db_name{"test_db"};
+    const String table_name{"projection_test_table"};
+};
+
+TEST_F(ExecutorProjectionTestRunner, Projection)
+try
+{
+    /// Every request below is sorted by col4 (see buildDAGRequest), so the
+    /// expected columns are built from the `*_sorted_asc` fixture data.
+    const String & sort_col = col_names[4];
+    const auto sorted_col0 = toNullableVec<String>(col_names[0], col0_sorted_asc);
+    const auto sorted_col1 = toNullableVec<String>(col_names[1], col1_sorted_asc);
+    const auto sorted_col4 = toNullableVec<Int32>(sort_col, col4_sorted_asc);
+    std::shared_ptr<tipb::DAGRequest> request;
+
+    /// Check single column
+    request = buildDAGRequest<MockColumnNames>({sort_col}, sort_col);
+    executeWithConcurrency(request, {sorted_col4});
+
+    /// Check two columns
+    request = buildDAGRequest<MockColumnNames>({col_names[0], sort_col}, sort_col);
+    executeWithConcurrency(request, {sorted_col0, sorted_col4});
+
+    /// Check three columns
+    request = buildDAGRequest<MockColumnNames>({col_names[0], col_names[1], sort_col}, sort_col);
+    executeWithConcurrency(request,
+                           {sorted_col0, sorted_col1, sorted_col4});
+
+    /// Check duplicate columns
+    request = buildDAGRequest<MockColumnNames>({sort_col, sort_col, sort_col}, sort_col);
+    executeWithConcurrency(request,
+                           {sorted_col4, sorted_col4, sorted_col4});
+
+    {
+        /// Check large number of columns: the same column projected 100 times
+        constexpr size_t col_num = 100;
+        MockColumnNamesVec projection_input;
+        ColumnsWithTypeAndName expected_columns;
+
+        size_t remaining = col_num;
+        while (remaining-- > 0)
+        {
+            projection_input.push_back(sort_col);
+            expected_columns.push_back(sorted_col4);
+        }
+
+        request = buildDAGRequest<MockColumnNamesVec>(projection_input, sort_col);
+        executeWithConcurrency(request, expected_columns);
+    }
+}
+CATCH
+
+TEST_F(ExecutorProjectionTestRunner, ProjectionFunction)
+try
+{
+    std::shared_ptr<tipb::DAGRequest> request;
+    /// Every request also projects col4 and is sorted by it (see buildDAGRequest), so expected values are listed in col4-ascending row order; `{}` is an empty optional.
+    /// Test "equal" function
+
+    /// Data type: TypeString
+    request = buildDAGRequest<MockAsts>({eq(col(col_names[0]), col(col_names[0])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 1, 1, 1, 1, 1, 1}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    request = buildDAGRequest<MockAsts>({eq(col(col_names[0]), col(col_names[1])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 0, 1, 0, {}, 0, 0}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    /// Data type: TypeLong
+    request = buildDAGRequest<MockAsts>({eq(col(col_names[3]), col(col_names[4])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 0, 0, 0, {}, 1, 0}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+
+    /// Test "greater" function
+
+    /// Data type: TypeString
+    request = buildDAGRequest<MockAsts>({gt(col(col_names[0]), col(col_names[1])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 0, 0, 0, {}, 0, 0}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    request = buildDAGRequest<MockAsts>({gt(col(col_names[1]), col(col_names[0])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 1, 0, 1, {}, 1, 1}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    /// Data type: TypeLong
+    request = buildDAGRequest<MockAsts>({gt(col(col_names[3]), col(col_names[4])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 0, 1, 1, {}, 0, 0}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    request = buildDAGRequest<MockAsts>({gt(col(col_names[4]), col(col_names[3])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 1, 0, 0, {}, 0, 1}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+
+    /// Test "and" function
+
+    /// Data type: TypeString
+    request = buildDAGRequest<MockAsts>({And(col(col_names[0]), col(col_names[0])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 0, 0, 0, 0, 0, 0}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    request = buildDAGRequest<MockAsts>({And(col(col_names[0]), col(col_names[1])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({0, 0, 0, 0, 0, 0, 0}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    /// Data type: TypeLong
+    request = buildDAGRequest<MockAsts>({And(col(col_names[3]), col(col_names[4])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 1, 0, 0, {}, 1, 0}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    /// Test "not" function
+
+    /// Data type: TypeString
+    request = buildDAGRequest<MockAsts>({NOT(col(col_names[0])), NOT(col(col_names[1])), NOT(col(col_names[2])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 1, 1, 1, 1, 1, 1}),
+                            toNullableVec<UInt64>({1, 1, 1, 1, {}, 1, 1}),
+                            toNullableVec<UInt64>({1, {}, 1, 1, 1, 1, {}}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    /// Data type: TypeLong
+    request = buildDAGRequest<MockAsts>({NOT(col(col_names[3])), NOT(col(col_names[4])), col(col_names[4])}, col_names[4]);
+    executeWithConcurrency(request,
+                           {toNullableVec<UInt64>({{}, 0, 1, 0, {}, 0, 1}),
+                            toNullableVec<UInt64>({{}, 0, 0, 1, 0, 0, 0}),
+                            toNullableVec<Int32>(col_names[4], col4_sorted_asc)});
+
+    /// TODO more functions...
+}
+CATCH
+
+} // namespace tests
+} // namespace DB
diff --git a/dbms/src/Flash/tests/gtest_topn_executor.cpp b/dbms/src/Flash/tests/gtest_topn_executor.cpp
new file mode 100644
index 00000000000..0e55702795d
--- /dev/null
+++ b/dbms/src/Flash/tests/gtest_topn_executor.cpp
@@ -0,0 +1,221 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <TestUtils/ExecutorTestUtils.h>
+#include <TestUtils/mockExecutor.h>
+
+namespace DB
+{
+namespace tests
+{
+
+/// Fixture for TopN executor tests: registers a single-column String table and the four-column `clerk` table.
+class ExecutorTopNTestRunner : public DB::tests::ExecutorTest
+{
+public:
+    using ColStringType = std::optional<typename TypeTraits<String>::FieldType>;
+    using ColInt32Type = std::optional<typename TypeTraits<Int32>::FieldType>;
+    using ColumnWithString = std::vector<ColStringType>;
+    using ColumnWithInt32 = std::vector<ColInt32Type>;
+
+    void initializeContext() override
+    {
+        ExecutorTest::initializeContext();
+        context.addMockTable({db_name, table_single_name},
+                             {{single_col_name, TiDB::TP::TypeString}},
+                             {toNullableVec<String>(single_col_name, col0)});
+
+        context.addMockTable({db_name, table_name},
+                             {{col_name[0], TiDB::TP::TypeLong},
+                              {col_name[1], TiDB::TP::TypeString},
+                              {col_name[2], TiDB::TP::TypeString},
+                              {col_name[3], TiDB::TP::TypeLong}},
+                             {toNullableVec<Int32>(col_name[0], col_age),
+                              toNullableVec<String>(col_name[1], col_gender),
+                              toNullableVec<String>(col_name[2], col_country),
+                              toNullableVec<Int32>(col_name[3], col_salary)});
+    }
+
+    std::shared_ptr<tipb::DAGRequest> buildDAGRequest(const String & table_name, const String & col_name, bool is_desc, int limit_num)
+    {
+        return context.scan(db_name, table_name).topN(col_name, is_desc, limit_num).build(context);
+    }
+
+    std::shared_ptr<tipb::DAGRequest> buildDAGRequest(const String & table_name, MockOrderByItems order_by_items, int limit, MockAsts func_proj_ast = {}, MockColumnNames out_proj_ast = {})
+    {
+        if (func_proj_ast.size() == 0)
+            return context.scan(db_name, table_name).topN(order_by_items, limit).build(context);
+        /// With a function projection the plan is: project(func_proj_ast) -> topN -> project(out_proj_ast).
+        return context.scan(db_name, table_name).project(func_proj_ast).topN(order_by_items, limit).project(out_proj_ast).build(context);
+    }
+
+    /// Names and data of the mock tables; col0 is non-const because the TopN test sorts it in place.
+    const String db_name{"test_db"};
+
+    const String table_single_name{"topn_single_table"}; /// For single column test
+    const String single_col_name{"single_col"};
+    ColumnWithString col0{"col0-0", "col0-1", "col0-2", {}, "col0-4", {}, "col0-6", "col0-7"};
+
+    const String table_name{"clerk"};
+    const std::vector<String> col_name{"age", "gender", "country", "salary"};
+    ColumnWithInt32 col_age{{}, 27, 32, 36, {}, 34};
+    ColumnWithString col_gender{"female", "female", "male", "female", "male", "male"};
+    ColumnWithString col_country{"korea", "usa", "usa", "china", "china", "china"};
+    ColumnWithInt32 col_salary{1300, 0, {}, 900, {}, -300};
+};
+
+TEST_F(ExecutorTopNTestRunner, TopN)
+try
+{
+    std::shared_ptr<tipb::DAGRequest> request;
+    std::vector<ColumnsWithTypeAndName> expect_cols;
+
+    {
+        /// Test single column: topN over the one-column table for every limit in [0, col_data_num + 5]
+        size_t col_data_num = col0.size();
+        for (size_t i = 1; i <= 1; ++i) /// NOTE(review): the loop body runs only for i == 1, so only descending order is exercised and the ascending branch below is never taken - confirm whether this is intended.
+        {
+            bool is_desc;
+            is_desc = static_cast<bool>(i); /// i == 1 selects descending order, i == 0 would select ascending order
+            if (is_desc)
+                sort(col0.begin(), col0.end(), std::greater<ColStringType>()); /// Sort the fixture data in place so that it can serve as the expected result
+            else
+                sort(col0.begin(), col0.end());
+
+            for (size_t limit_num = 0; limit_num <= col_data_num + 5; ++limit_num)
+            {
+                request = buildDAGRequest(table_single_name, single_col_name, is_desc, limit_num);
+
+                expect_cols.clear();
+                if (limit_num == 0 || limit_num > col_data_num) /// a limit of 0 is expected to return every row, just like a limit larger than the table
+                    expect_cols.push_back({toNullableVec<String>(single_col_name, ColumnWithString(col0.begin(), col0.end()))});
+                else
+                    expect_cols.push_back({toNullableVec<String>(single_col_name, ColumnWithString(col0.begin(), col0.begin() + limit_num))});
+
+                executeStreams(request, expect_cols[0]);
+                executeStreams(request, expect_cols[0], 2);
+                executeStreams(request, expect_cols[0], 4);
+                executeStreams(request, expect_cols[0], 8);
+            }
+        }
+    }
+
+    {
+        /// Test multi-columns: expect_cols[i] is the expected result for order_by_items[i]
+        expect_cols = {{toNullableVec<Int32>(col_name[0], ColumnWithInt32{36, 34, 32, 27, {}, {}}),
+                        toNullableVec<String>(col_name[1], ColumnWithString{"female", "male", "male", "female", "male", "female"}),
+                        toNullableVec<String>(col_name[2], ColumnWithString{"china", "china", "usa", "usa", "china", "korea"}),
+                        toNullableVec<Int32>(col_name[3], ColumnWithInt32{900, -300, {}, 0, {}, 1300})},
+                       {toNullableVec<Int32>(col_name[0], ColumnWithInt32{32, {}, 34, 27, 36, {}}),
+                        toNullableVec<String>(col_name[1], ColumnWithString{"male", "male", "male", "female", "female", "female"}),
+                        toNullableVec<String>(col_name[2], ColumnWithString{"usa", "china", "china", "usa", "china", "korea"}),
+                        toNullableVec<Int32>(col_name[3], ColumnWithInt32{{}, {}, -300, 0, 900, 1300})},
+                       {toNullableVec<Int32>(col_name[0], ColumnWithInt32{34, {}, 32, 36, {}, 27}),
+                        toNullableVec<String>(col_name[1], ColumnWithString{"male", "male", "male", "female", "female", "female"}),
+                        toNullableVec<String>(col_name[2], ColumnWithString{"china", "china", "usa", "china", "korea", "usa"}),
+                        toNullableVec<Int32>(col_name[3], ColumnWithInt32{-300, {}, {}, 900, 1300, 0})}};
+
+        std::vector<MockOrderByItems> order_by_items{
+            /// select * from clerk order by age DESC, gender DESC;
+            {MockOrderByItem(col_name[0], true), MockOrderByItem(col_name[1], true)},
+            /// select * from clerk order by gender DESC, salary ASC;
+            {MockOrderByItem(col_name[1], true), MockOrderByItem(col_name[3], false)},
+            /// select * from clerk order by gender DESC, country ASC, salary DESC;
+            {MockOrderByItem(col_name[1], true), MockOrderByItem(col_name[2], false), MockOrderByItem(col_name[3], true)}};
+
+        size_t test_num = expect_cols.size();
+
+        for (size_t i = 0; i < test_num; ++i)
+        {
+            request = buildDAGRequest(table_name, order_by_items[i], 100);
+            executeStreams(request, expect_cols[i]);
+        }
+    }
+}
+CATCH
+
+TEST_F(ExecutorTopNTestRunner, TopNFunction)
+try
+{
+    std::shared_ptr<tipb::DAGRequest> request;
+    std::vector<ColumnsWithTypeAndName> expect_cols;
+    MockColumnNames output_projection{col_name[0], col_name[1], col_name[2], col_name[3]};
+    MockAsts func_projection; /// First projection: the four table columns plus the function column that topN sorts by
+    MockOrderByItems order_by_items;
+    ASTPtr col0_ast = col(col_name[0]);
+    ASTPtr col1_ast = col(col_name[1]);
+    ASTPtr col2_ast = col(col_name[2]);
+    ASTPtr col3_ast = col(col_name[3]);
+    ASTPtr func_ast;
+
+    {
+        /// "and" function
+        expect_cols = {{toNullableVec<Int32>(col_name[0], ColumnWithInt32{{}, {}, 32, 27, 36, 34}),
+                        toNullableVec<String>(col_name[1], ColumnWithString{"female", "male", "male", "female", "female", "male"}),
+                        toNullableVec<String>(col_name[2], ColumnWithString{"korea", "china", "usa", "usa", "china", "china"}),
+                        toNullableVec<Int32>(col_name[3], ColumnWithInt32{1300, {}, {}, 0, 900, -300})}};
+
+        {
+            /// select * from clerk order by age and salary ASC limit 100;
+            order_by_items = {MockOrderByItem("and(age, salary)", false)};
+            func_ast = And(col(col_name[0]), col(col_name[3]));
+            func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast};
+
+            request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection);
+            executeStreams(request, expect_cols[0]);
+        }
+    }
+
+    {
+        /// "equal" function
+        expect_cols = {{toNullableVec<Int32>(col_name[0], ColumnWithInt32{27, 36, 34, 32, {}, {}}),
+                        toNullableVec<String>(col_name[1], ColumnWithString{"female", "female", "male", "male", "female", "male"}),
+                        toNullableVec<String>(col_name[2], ColumnWithString{"usa", "china", "china", "usa", "korea", "china"}),
+                        toNullableVec<Int32>(col_name[3], ColumnWithInt32{0, 900, -300, {}, 1300, {}})}};
+
+        {
+            /// select * from clerk order by age = salary DESC limit 100;
+            order_by_items = {MockOrderByItem("equals(age, salary)", true)};
+            func_ast = eq(col(col_name[0]), col(col_name[3]));
+            func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast};
+
+            request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection);
+            executeStreams(request, expect_cols[0]);
+        }
+    }
+
+    {
+        /// "greater" function
+        expect_cols = {{toNullableVec<Int32>(col_name[0], ColumnWithInt32{{}, 32, {}, 36, 27, 34}),
+                        toNullableVec<String>(col_name[1], ColumnWithString{"female", "male", "male", "female", "female", "male"}),
+                        toNullableVec<String>(col_name[2], ColumnWithString{"korea", "usa", "china", "china", "usa", "china"}),
+                        toNullableVec<Int32>(col_name[3], ColumnWithInt32{1300, {}, {}, 900, 0, -300})}};
+
+        {
+            /// select age, gender, country, salary from clerk order by age > salary ASC limit 100;
+            order_by_items = {MockOrderByItem("greater(age, salary)", false)};
+            func_ast = gt(col(col_name[0]), col(col_name[3]));
+            func_projection = {col0_ast, col1_ast, col2_ast, col3_ast, func_ast};
+
+            request = buildDAGRequest(table_name, order_by_items, 100, func_projection, output_projection);
+            executeStreams(request, expect_cols[0]);
+        }
+    }
+
+    /// TODO more functions...
+}
+CATCH
+
+} // namespace tests
+} // namespace DB
diff --git a/dbms/src/IO/WriteBuffer.h b/dbms/src/IO/WriteBuffer.h
index 361081d1176..0c0fa2cb545 100644
--- a/dbms/src/IO/WriteBuffer.h
+++ b/dbms/src/IO/WriteBuffer.h
@@ -96,6 +96,24 @@ class WriteBuffer : public BufferBase
         }
     }
 
+    template <class T> /// Appends the sizeof(T) raw bytes located at `from` to this buffer and advances `pos`.
+    __attribute__((always_inline)) void writeFixed(const T * __restrict from)
+    {
+        if (likely(working_buffer.end() - pos >= static_cast<ptrdiff_t>(sizeof(T)))) /// fast path: the whole value fits into the rest of the working buffer
+        {
+            tiflash_compiler_builtin_memcpy(pos, from, sizeof(T)); /// copy size is the compile-time constant sizeof(T)
+            pos += sizeof(T);
+        }
+        else
+        {
+            [&]() __attribute__((noinline)) /// slow path lives in a noinline lambda, so it is not expanded into this always_inline function
+            {
+                write(reinterpret_cast<const char *>(from), sizeof(T)); /// delegate to write(const char *, size_t)
+            }
+            ();
+        }
+    }
+
 
     inline void write(char x)
     {
diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp
index 6a39bc333a8..6cb947a1bfa 100644
--- a/dbms/src/Interpreters/Aggregator.cpp
+++ b/dbms/src/Interpreters/Aggregator.cpp
@@ -17,6 +17,7 @@
 #include <AggregateFunctions/AggregateFunctionState.h>
 #include <Columns/ColumnTuple.h>
 #include <Common/ClickHouseRevision.h>
+#include <Common/FailPoint.h>
 #include <Common/MemoryTracker.h>
 #include <Common/Stopwatch.h>
 #include <Common/ThreadManager.h>
@@ -48,6 +49,11 @@ extern const int CANNOT_MERGE_DIFFERENT_AGGREGATED_DATA_VARIANTS;
 extern const int LOGICAL_ERROR;
 } // namespace ErrorCodes
 
+namespace FailPoints
+{
+extern const char random_aggregate_create_state_failpoint[];
+extern const char random_aggregate_merge_failpoint[];
+} // namespace FailPoints
 
 AggregatedDataVariants::~AggregatedDataVariants()
 {
@@ -317,6 +323,7 @@ void Aggregator::createAggregateStates(AggregateDataPtr & aggregate_data) const
               * In order that then everything is properly destroyed, we "roll back" some of the created states.
               * The code is not very convenient.
               */
+            FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_aggregate_create_state_failpoint);
             aggregate_functions[j]->create(aggregate_data + offsets_of_aggregate_states[j]);
         }
         catch (...)
@@ -1504,6 +1511,8 @@ class MergingAndConvertingBlockInputStream : public IProfilingBlockInputStream
         if (current_bucket_num >= NUM_BUCKETS)
             return {};
 
+        FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_aggregate_merge_failpoint);
+
         AggregatedDataVariantsPtr & first = data[0];
 
         if (current_bucket_num == -1)
diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp
index 820618a6e8b..181ebcaaa64 100644
--- a/dbms/src/Interpreters/Join.cpp
+++ b/dbms/src/Interpreters/Join.cpp
@@ -17,6 +17,7 @@
 #include <Columns/ColumnNullable.h>
 #include <Columns/ColumnString.h>
 #include <Common/ColumnsHashing.h>
+#include <Common/FailPoint.h>
 #include <Common/typeid_cast.h>
 #include <Core/ColumnNumbers.h>
 #include <DataStreams/IProfilingBlockInputStream.h>
@@ -26,9 +27,17 @@
 #include <Functions/FunctionHelpers.h>
 #include <Interpreters/Join.h>
 #include <Interpreters/NullableUtils.h>
+#include <common/logger_useful.h>
+
 
 namespace DB
 {
+namespace FailPoints
+{
+extern const char random_join_build_failpoint[];
+extern const char random_join_prob_failpoint[];
+} // namespace FailPoints
+
 namespace ErrorCodes
 {
 extern const int UNKNOWN_SET_DATA_VARIANT;
@@ -621,6 +630,7 @@ void NO_INLINE insertFromBlockImplTypeCaseWithLock(
     }
     for (size_t insert_index = 0; insert_index < segment_index_info.size(); insert_index++)
     {
+        FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_join_build_failpoint);
         size_t segment_index = (insert_index + stream_index) % segment_index_info.size();
         if (segment_index == segment_size)
         {
@@ -1513,7 +1523,7 @@ void Join::joinBlockImpl(Block & block, const Maps & maps) const
     default:
         throw Exception("Unknown JOIN keys variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
     }
-
+    FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_join_prob_failpoint);
     for (size_t i = 0; i < num_columns_to_add; ++i)
     {
         const ColumnWithTypeAndName & sample_col = sample_block_with_columns_to_add.getByPosition(i);
diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp
index 96cfc0a58ae..78ad4b41ce6 100644
--- a/dbms/src/Interpreters/executeQuery.cpp
+++ b/dbms/src/Interpreters/executeQuery.cpp
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Common/FailPoint.h>
 #include <Common/ProfileEvents.h>
 #include <Common/formatReadable.h>
 #include <Common/typeid_cast.h>
@@ -53,7 +54,10 @@ extern const int LOGICAL_ERROR;
 extern const int QUERY_IS_TOO_LARGE;
 extern const int INTO_OUTFILE_NOT_ALLOWED;
 } // namespace ErrorCodes
-
+namespace FailPoints
+{
+extern const char random_interpreter_failpoint[];
+} // namespace FailPoints
 namespace
 {
 void checkASTSizeLimits(const IAST & ast, const Settings & settings)
@@ -226,6 +230,7 @@ std::tuple<ASTPtr, BlockIO> executeQueryImpl(
             context.setProcessListElement(&process_list_entry->get());
         }
 
+        FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_interpreter_failpoint);
         auto interpreter = query_src.interpreter(context, stage);
         res = interpreter->execute();
 
diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp
index 901248c7f6d..b422b59535c 100644
--- a/dbms/src/Server/Server.cpp
+++ b/dbms/src/Server/Server.cpp
@@ -18,6 +18,7 @@
 #include <Common/Config/ConfigReloader.h>
 #include <Common/CurrentMetrics.h>
 #include <Common/DynamicThreadPool.h>
+#include <Common/FailPoint.h>
 #include <Common/Macros.h>
 #include <Common/RedactHelpers.h>
 #include <Common/StringUtils/StringUtils.h>
@@ -150,6 +151,7 @@ void loadMiConfig(Logger * log)
 }
 #undef TRY_LOAD_CONF
 #endif
+
 namespace
 {
 [[maybe_unused]] void tryLoadBoolConfigFromEnv(Poco::Logger * log, bool & target, const char * name)
@@ -183,6 +185,7 @@ extern const int NO_ELEMENTS_IN_CONFIG;
 extern const int SUPPORT_IS_DISABLED;
 extern const int ARGUMENT_OUT_OF_BOUND;
 extern const int INVALID_CONFIG_PARAMETER;
+extern const int IP_ADDRESS_NOT_ALLOWED;
 } // namespace ErrorCodes
 
 namespace Debug
@@ -620,6 +623,10 @@ class Server::FlashGrpcServerHolder
             }
         }
         flash_grpc_server = builder.BuildAndStart();
+        if (!flash_grpc_server)
+        {
+            throw Exception("Exception happens when start grpc server, the flash.service_addr may be invalid, flash.service_addr is " + raft_config.flash_server_addr, ErrorCodes::IP_ADDRESS_NOT_ALLOWED);
+        }
         LOG_FMT_INFO(log, "Flash grpc server listening on [{}]", raft_config.flash_server_addr);
         Debug::setServiceAddr(raft_config.flash_server_addr);
         if (enable_async_server)
@@ -960,7 +967,10 @@ class Server::TcpHttpServersHolder
             LOG_DEBUG(log, debug_msg);
     }
 
-    const std::vector<std::unique_ptr<Poco::Net::TCPServer>> & getServers() const { return servers; }
+    const std::vector<std::unique_ptr<Poco::Net::TCPServer>> & getServers() const
+    {
+        return servers;
+    }
 
 private:
     Server & server;
@@ -976,6 +986,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
     Poco::Logger * log = &logger();
 #ifdef FIU_ENABLE
     fiu_init(0); // init failpoint
+    FailPointHelper::initRandomFailPoints(config(), log);
 #endif
 
     UpdateMallocConfig(log);
@@ -995,7 +1006,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
 #ifdef TIFLASH_ENABLE_SVE_SUPPORT
     tryLoadBoolConfigFromEnv(log, simd_option::ENABLE_SVE, "TIFLASH_ENABLE_SVE");
 #endif
-
     registerFunctions();
     registerAggregateFunctions();
     registerWindowFunctions();
diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp
index 132732d6989..8a69b7573e2 100644
--- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp
+++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp
@@ -141,6 +141,19 @@ bool DeltaValueSpace::ingestColumnFiles(DMContext & /*context*/, const RowKeyRan
 
 bool DeltaValueSpace::flush(DMContext & context)
 {
+    bool v = false;
+    if (!is_flushing.compare_exchange_strong(v, true))
+    {
+        // other thread is flushing, just return.
+        LOG_FMT_DEBUG(log, "{}, Flush stop because other thread is flushing", simpleInfo());
+        return false;
+    }
+    SCOPE_EXIT({
+        // Must not throw: presumably runs in a scope-guard destructor (TODO confirm), where an exception would terminate.
+        if (!is_flushing.exchange(false))
+            LOG_FMT_ERROR(log, "{} is expected to be flushing", simpleInfo());
+    });
+
     LOG_FMT_DEBUG(log, "{}, Flush start", info());
 
     /// We have two types of data needed to flush to disk:
diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h
index 8f14682caa8..04fb97b3004 100644
--- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h
+++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h
@@ -77,6 +77,11 @@ class DeltaValueSpace
     /// Note that those things can not be done at the same time.
     std::atomic_bool is_updating = false;
 
+    /// Note that it's safe to run multiple flushes concurrently, but only one of them can succeed,
+    /// and the other threads' work is just a waste of resources.
+    /// So we only allow one flush task to run at any time to avoid wasting resources.
+    std::atomic_bool is_flushing = false;
+
     std::atomic<size_t> last_try_flush_rows = 0;
     std::atomic<size_t> last_try_flush_bytes = 0;
     std::atomic<size_t> last_try_compact_column_files = 0;
@@ -159,6 +164,8 @@ class DeltaValueSpace
     size_t getTotalCacheBytes() const;
     size_t getValidCacheRows() const;
 
+    bool isFlushing() const { return is_flushing; }
+
     bool isUpdating() const { return is_updating; }
 
     bool tryLockUpdating()
diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp
index 195ed5c53c2..09f290e311c 100644
--- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp
+++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp
@@ -980,14 +980,14 @@ void DeltaMergeStore::deleteRange(const Context & db_context, const DB::Settings
         checkSegmentUpdate(dm_context, segment, ThreadType::Write);
 }
 
-void DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRange & range)
+bool DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRange & range, bool try_until_succeed)
 {
     RowKeyRange cur_range = range;
     while (!cur_range.none())
     {
         RowKeyRange segment_range;
 
-        // Keep trying until succeeded.
+        // Keep retrying until the flush succeeds, unless `try_until_succeed` is false.
         while (true)
         {
             SegmentPtr segment;
@@ -1010,10 +1010,15 @@ void DeltaMergeStore::flushCache(const DMContextPtr & dm_context, const RowKeyRa
             {
                 break;
             }
+            else if (!try_until_succeed)
+            {
+                return false;
+            }
         }
 
         cur_range.setStart(segment_range.end);
     }
+    return true;
 }
 
 void DeltaMergeStore::mergeDeltaAll(const Context & context)
@@ -1347,6 +1352,12 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const
         && (delta_rows - delta_last_try_flush_rows >= delta_cache_limit_rows
             || delta_bytes - delta_last_try_flush_bytes >= delta_cache_limit_bytes);
     bool should_foreground_flush = unsaved_rows >= delta_cache_limit_rows * 3 || unsaved_bytes >= delta_cache_limit_bytes * 3;
+    /// For the write thread, we want to avoid a foreground flush blocking the application of raft commands.
+    /// So we raise the foreground flush threshold for the write thread.
+    if (thread_type == ThreadType::Write)
+    {
+        should_foreground_flush = unsaved_rows >= delta_cache_limit_rows * 10 || unsaved_bytes >= delta_cache_limit_bytes * 10;
+    }
 
     bool should_background_merge_delta = ((delta_check_rows >= delta_limit_rows || delta_check_bytes >= delta_limit_bytes) //
                                           && (delta_rows - delta_last_try_merge_delta_rows >= delta_cache_limit_rows
@@ -1404,9 +1415,16 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const
         }
         else if (should_background_flush)
         {
-            delta_last_try_flush_rows = delta_rows;
-            delta_last_try_flush_bytes = delta_bytes;
-            try_add_background_task(BackgroundTask{TaskType::Flush, dm_context, segment, {}});
+            /// It's pointless to add more flush tasks while the segment is flushing,
+            /// because only one flush task can proceed at any time.
+            /// After the current flush task finishes,
+            /// it will call `checkSegmentUpdate` again to check whether there are more flush tasks to do.
+            if (!segment->isFlushing())
+            {
+                delta_last_try_flush_rows = delta_rows;
+                delta_last_try_flush_bytes = delta_bytes;
+                try_add_background_task(BackgroundTask{TaskType::Flush, dm_context, segment, {}});
+            }
         }
     }
 
@@ -1502,7 +1520,12 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const
         return false;
     };
     auto try_bg_compact = [&]() {
-        if (should_compact)
+        /// A compact task should be a really low-priority task.
+        /// If the segment is flushing,
+        /// we should avoid adding a background compact task, to reduce lock contention on the segment and save disk throughput.
+        /// After the current flush task completes,
+        /// it will call `checkSegmentUpdate` again to check whether there are other kinds of tasks to do.
+        if (should_compact && !segment->isFlushing())
         {
             delta_last_try_compact_column_files = column_file_count;
             try_add_background_task(BackgroundTask{TaskType::Compact, dm_context, segment, {}});
diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h
index 705481ca107..57c2a42b807 100644
--- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h
+++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h
@@ -367,14 +367,14 @@ class DeltaMergeStore : private boost::noncopyable
                            const SegmentIdSet & read_segments = {},
                            size_t extra_table_id_index = InvalidColumnID);
 
-    /// Force flush all data to disk.
-    void flushCache(const Context & context, const RowKeyRange & range)
+    /// Try to flush all data in `range` to disk and return whether the flush succeeded.
+    bool flushCache(const Context & context, const RowKeyRange & range, bool try_until_succeed = true)
     {
         auto dm_context = newDMContext(context, context.getSettingsRef());
-        flushCache(dm_context, range);
+        return flushCache(dm_context, range, try_until_succeed);
     }
 
-    void flushCache(const DMContextPtr & dm_context, const RowKeyRange & range);
+    bool flushCache(const DMContextPtr & dm_context, const RowKeyRange & range, bool try_until_succeed = true);
 
     /// Merge delta into the stable layer for all segments.
     ///
diff --git a/dbms/src/Storages/DeltaMerge/DeltaTree.h b/dbms/src/Storages/DeltaMerge/DeltaTree.h
index 47674ab2cfc..29e127fe35f 100644
--- a/dbms/src/Storages/DeltaMerge/DeltaTree.h
+++ b/dbms/src/Storages/DeltaMerge/DeltaTree.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <Common/TargetSpecific.h>
 #include <Core/Types.h>
 #include <IO/WriteHelpers.h>
 #include <Storages/DeltaMerge/Tuple.h>
@@ -810,6 +811,20 @@ class DeltaTree
     template <typename T>
     InternPtr afterNodeUpdated(T * node);
 
+#ifdef __x86_64__
+    template <typename T>
+    InternPtr afterNodeUpdatedGeneric(T * node);
+
+    template <typename T>
+    InternPtr afterNodeUpdatedAVX512(T * node);
+
+    template <typename T>
+    InternPtr afterNodeUpdatedAVX(T * node);
+
+    template <typename T>
+    InternPtr afterNodeUpdatedSSE4(T * node);
+#endif
+
     inline void afterLeafUpdated(LeafPtr leaf)
     {
         if (leaf->count == 0 && isRootOnly())
@@ -1348,158 +1363,86 @@ typename DT_CLASS::InterAndSid DT_CLASS::submitMinSid(T * node, UInt64 subtree_m
     }
 }
 
-DT_TEMPLATE
-template <class T>
-typename DT_CLASS::InternPtr DT_CLASS::afterNodeUpdated(T * node)
+#ifndef __x86_64__
+#define TIFLASH_DT_IMPL_NAME afterNodeUpdated
+#include "DeltaTree.ipp"
+#undef TIFLASH_DT_IMPL_NAME
+#else
+
+// generic implementation
+#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedGeneric
+#include "DeltaTree.ipp"
+#undef TIFLASH_DT_IMPL_NAME
+
+// avx512 implementation
+TIFLASH_BEGIN_AVX512_SPECIFIC_CODE
+#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedAVX512
+#include "DeltaTree.ipp"
+#undef TIFLASH_DT_IMPL_NAME
+TIFLASH_END_TARGET_SPECIFIC_CODE
+
+// avx implementation
+TIFLASH_BEGIN_AVX_SPECIFIC_CODE
+#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedAVX
+#include "DeltaTree.ipp"
+#undef TIFLASH_DT_IMPL_NAME
+TIFLASH_END_TARGET_SPECIFIC_CODE
+
+// sse4 implementation
+TIFLASH_BEGIN_SSE4_SPECIFIC_CODE
+#define TIFLASH_DT_IMPL_NAME afterNodeUpdatedSSE4
+#include "DeltaTree.ipp"
+#undef TIFLASH_DT_IMPL_NAME
+TIFLASH_END_TARGET_SPECIFIC_CODE
+
+namespace Impl
 {
-    if (!node)
-        return {};
-
-    constexpr bool is_leaf = std::is_same<Leaf, T>::value;
+enum class DeltaTreeVariant
+{
+    Generic,
+    SSE4,
+    AVX,
+    AVX512
+};
 
-    if (root == asNode(node) && !isLeaf(root) && node->count == 1)
+static inline DeltaTreeVariant resolveDeltaTreeVariant()
+{
+    if (DB::TargetSpecific::AVX512Checker::runtimeSupport())
     {
-        /// Decrease tree height.
-        root = as(Intern, root)->children[0];
-
-        --(node->count);
-        freeNode<T>(node);
-
-        if (isLeaf(root))
-            as(Leaf, root)->parent = nullptr;
-        else
-            as(Intern, root)->parent = nullptr;
-        --height;
-
-        LOG_FMT_TRACE(log, "height {} -> {}", (height + 1), height);
-
-        return {};
+        return DeltaTreeVariant::AVX512;
     }
-
-    auto parent = node->parent;
-    bool parent_updated = false;
-
-    if (T::overflow(node->count)) // split
+    if (DB::TargetSpecific::AVXChecker::runtimeSupport())
     {
-        if (!parent)
-        {
-            /// Increase tree height.
-            parent = createNode<Intern>();
-            root = asNode(parent);
-
-            parent->deltas[0] = checkDelta(node->getDelta());
-            parent->children[0] = asNode(node);
-            ++(parent->count);
-            parent->refreshChildParent();
-
-            ++height;
-
-            LOG_FMT_TRACE(log, "height {} -> {}", (height - 1), height);
-        }
-
-        auto pos = parent->searchChild(asNode(node));
-
-        T * next_n = createNode<T>();
-
-        UInt64 sep_sid = node->split(next_n);
-
-        // handle parent update
-        parent->shiftEntries(pos + 1, 1);
-        // for current node
-        parent->deltas[pos] = checkDelta(node->getDelta());
-        // for next node
-        parent->sids[pos] = sep_sid;
-        parent->deltas[pos + 1] = checkDelta(next_n->getDelta());
-        parent->children[pos + 1] = asNode(next_n);
-
-        ++(parent->count);
-
-        if constexpr (is_leaf)
-        {
-            if (as(Leaf, node) == right_leaf)
-                right_leaf = as(Leaf, next_n);
-        }
-
-        parent_updated = true;
+        return DeltaTreeVariant::AVX;
     }
-    else if (T::underflow(node->count) && root != asNode(node)) // adopt or merge
+    if (DB::TargetSpecific::SSE4Checker::runtimeSupport())
     {
-        auto pos = parent->searchChild(asNode(node));
-
-        // currently we always adopt from the right one if possible
-        bool is_sibling_left;
-        size_t sibling_pos;
-        T * sibling;
-
-        if (unlikely(parent->count <= 1))
-            throw Exception("Unexpected parent entry count: " + DB::toString(parent->count));
-
-        if (pos == parent->count - 1)
-        {
-            is_sibling_left = true;
-            sibling_pos = pos - 1;
-            sibling = as(T, parent->children[sibling_pos]);
-        }
-        else
-        {
-            is_sibling_left = false;
-            sibling_pos = pos + 1;
-            sibling = as(T, parent->children[sibling_pos]);
-        }
-
-        if (unlikely(sibling->parent != node->parent))
-            throw Exception("parent not the same");
-
-        auto after_adopt = (node->count + sibling->count) / 2;
-        if (T::underflow(after_adopt))
-        {
-            // Do merge.
-            // adoption won't work because the sibling doesn't have enough entries.
-
-            node->merge(sibling, is_sibling_left, pos);
-            freeNode<T>(sibling);
-
-            pos = std::min(pos, sibling_pos);
-            parent->deltas[pos] = checkDelta(node->getDelta());
-            parent->children[pos] = asNode(node);
-            parent->shiftEntries(pos + 2, -1);
-
-            if constexpr (is_leaf)
-            {
-                if (is_sibling_left && (as(Leaf, sibling) == left_leaf))
-                    left_leaf = as(Leaf, node);
-                else if (!is_sibling_left && as(Leaf, sibling) == right_leaf)
-                    right_leaf = as(Leaf, node);
-            }
-            --(parent->count);
-        }
-        else
-        {
-            // Do adoption.
-
-            auto adopt_count = after_adopt - node->count;
-            auto new_sep_sid = node->adopt(sibling, is_sibling_left, adopt_count, pos);
+        return DeltaTreeVariant::SSE4;
+    }
+    return DeltaTreeVariant::Generic;
+}
 
-            parent->sids[std::min(pos, sibling_pos)] = new_sep_sid;
-            parent->deltas[pos] = checkDelta(node->getDelta());
-            parent->deltas[sibling_pos] = checkDelta(sibling->getDelta());
-        }
+static inline DeltaTreeVariant DELTA_TREE_VARIANT = resolveDeltaTreeVariant();
+} // namespace Impl
 
-        parent_updated = true;
-    }
-    else if (parent)
+DT_TEMPLATE
+template <class T>
+typename DT_CLASS::InternPtr DT_CLASS::afterNodeUpdated(T * node)
+{
+    switch (Impl::DELTA_TREE_VARIANT)
     {
-        auto pos = parent->searchChild(asNode(node));
-        auto delta = node->getDelta();
-        parent_updated = parent->deltas[pos] != delta;
-        parent->deltas[pos] = checkDelta(delta);
+    case Impl::DeltaTreeVariant::Generic:
+        return afterNodeUpdatedGeneric(node);
+    case Impl::DeltaTreeVariant::SSE4:
+        return afterNodeUpdatedSSE4(node);
+    case Impl::DeltaTreeVariant::AVX:
+        return afterNodeUpdatedAVX(node);
+    case Impl::DeltaTreeVariant::AVX512:
+        return afterNodeUpdatedAVX512(node);
     }
-
-    if (parent_updated)
-        return parent;
-    else
-        return {};
 }
+#endif
+
 
 #undef as
 #undef asNode
diff --git a/dbms/src/Storages/DeltaMerge/DeltaTree.ipp b/dbms/src/Storages/DeltaMerge/DeltaTree.ipp
new file mode 100644
index 00000000000..27b8a3b96f1
--- /dev/null
+++ b/dbms/src/Storages/DeltaMerge/DeltaTree.ipp
@@ -0,0 +1,165 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+DT_TEMPLATE
+template <class T> // the function name comes from TIFLASH_DT_IMPL_NAME, which the including file defines
+__attribute__((noinline, flatten)) typename DT_CLASS::InternPtr DT_CLASS::TIFLASH_DT_IMPL_NAME(T * node)
+{
+    if (!node) // nothing to rebalance without a node
+        return {};
+
+    constexpr bool is_leaf = std::is_same<Leaf, T>::value; // enables the leaf-only left_leaf/right_leaf bookkeeping
+
+    if (root == asNode(node) && !isLeaf(root) && node->count == 1)
+    {
+        /// Decrease tree height: the non-leaf root has a single child, so that child becomes the new root.
+        root = as(Intern, root)->children[0];
+
+        --(node->count);
+        freeNode<T>(node);
+
+        if (isLeaf(root))
+            as(Leaf, root)->parent = nullptr;
+        else
+            as(Intern, root)->parent = nullptr;
+        --height;
+
+        LOG_FMT_TRACE(log, "height {} -> {}", (height + 1), height);
+
+        return {};
+    }
+
+    auto parent = node->parent;
+    bool parent_updated = false; // decides whether `parent` is returned at the end
+
+    if (T::overflow(node->count)) // split
+    {
+        if (!parent)
+        {
+            /// Increase tree height: the overflowing node has no parent, so create a new root above it.
+            parent = createNode<Intern>();
+            root = asNode(parent);
+
+            parent->deltas[0] = checkDelta(node->getDelta());
+            parent->children[0] = asNode(node);
+            ++(parent->count);
+            parent->refreshChildParent();
+
+            ++height;
+
+            LOG_FMT_TRACE(log, "height {} -> {}", (height - 1), height);
+        }
+
+        auto pos = parent->searchChild(asNode(node));
+
+        T * next_n = createNode<T>();
+
+        UInt64 sep_sid = node->split(next_n);
+
+        // handle parent update
+        parent->shiftEntries(pos + 1, 1);
+        // for current node
+        parent->deltas[pos] = checkDelta(node->getDelta());
+        // for next node
+        parent->sids[pos] = sep_sid;
+        parent->deltas[pos + 1] = checkDelta(next_n->getDelta());
+        parent->children[pos + 1] = asNode(next_n);
+
+        ++(parent->count);
+
+        if constexpr (is_leaf)
+        {
+            if (as(Leaf, node) == right_leaf) // the new node is placed after `node`, so it takes over as right_leaf
+                right_leaf = as(Leaf, next_n);
+        }
+
+        parent_updated = true;
+    }
+    else if (T::underflow(node->count) && root != asNode(node)) // adopt or merge
+    {
+        auto pos = parent->searchChild(asNode(node));
+
+        // Currently we always use the right sibling if possible; only the last child falls back to its left sibling.
+        bool is_sibling_left;
+        size_t sibling_pos;
+        T * sibling;
+
+        if (unlikely(parent->count <= 1))
+            throw Exception("Unexpected parent entry count: " + DB::toString(parent->count));
+
+        if (pos == parent->count - 1)
+        {
+            is_sibling_left = true;
+            sibling_pos = pos - 1;
+            sibling = as(T, parent->children[sibling_pos]);
+        }
+        else
+        {
+            is_sibling_left = false;
+            sibling_pos = pos + 1;
+            sibling = as(T, parent->children[sibling_pos]);
+        }
+
+        if (unlikely(sibling->parent != node->parent))
+            throw Exception("parent not the same");
+
+        auto after_adopt = (node->count + sibling->count) / 2; // entry count this node would hold after an adoption
+        if (T::underflow(after_adopt))
+        {
+            // Do merge.
+            // Adoption won't work because even half of the combined entries would still underflow.
+
+            node->merge(sibling, is_sibling_left, pos);
+            freeNode<T>(sibling);
+
+            pos = std::min(pos, sibling_pos);
+            parent->deltas[pos] = checkDelta(node->getDelta());
+            parent->children[pos] = asNode(node);
+            parent->shiftEntries(pos + 2, -1);
+
+            if constexpr (is_leaf)
+            {
+                if (is_sibling_left && (as(Leaf, sibling) == left_leaf))
+                    left_leaf = as(Leaf, node);
+                else if (!is_sibling_left && as(Leaf, sibling) == right_leaf)
+                    right_leaf = as(Leaf, node);
+            }
+            --(parent->count);
+        }
+        else
+        {
+            // Do adoption: take `adopt_count` entries from the sibling so this node ends up with `after_adopt` entries.
+
+            auto adopt_count = after_adopt - node->count;
+            auto new_sep_sid = node->adopt(sibling, is_sibling_left, adopt_count, pos);
+
+            parent->sids[std::min(pos, sibling_pos)] = new_sep_sid;
+            parent->deltas[pos] = checkDelta(node->getDelta());
+            parent->deltas[sibling_pos] = checkDelta(sibling->getDelta());
+        }
+
+        parent_updated = true;
+    }
+    else if (parent) // no split/merge/adopt: only refresh this node's delta stored in the parent
+    {
+        auto pos = parent->searchChild(asNode(node));
+        auto delta = node->getDelta();
+        parent_updated = parent->deltas[pos] != delta;
+        parent->deltas[pos] = checkDelta(delta);
+    }
+
+    if (parent_updated) // the parent is returned only when it was changed here
+        return parent;
+    else
+        return {};
+}
\ No newline at end of file
diff --git a/dbms/src/Storages/DeltaMerge/Segment.h b/dbms/src/Storages/DeltaMerge/Segment.h
index cccfc5091b9..8058329ae91 100644
--- a/dbms/src/Storages/DeltaMerge/Segment.h
+++ b/dbms/src/Storages/DeltaMerge/Segment.h
@@ -300,6 +300,8 @@ class Segment : private boost::noncopyable
 
     void drop(const FileProviderPtr & file_provider, WriteBatches & wbs);
 
+    bool isFlushing() const { return delta->isFlushing(); }
+
     RowsAndBytes getRowsAndBytesInRange(
         DMContext & dm_context,
         const SegmentSnapshotPtr & segment_snap,
diff --git a/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h b/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h
index b35dae0cbe2..84fafbc46ef 100644
--- a/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h
+++ b/dbms/src/Storages/DeltaMerge/tests/DMTestEnv.h
@@ -273,7 +273,8 @@ class DMTestEnv
                                          DataTypePtr pk_type = EXTRA_HANDLE_COLUMN_INT_TYPE,
                                          bool is_common_handle = false,
                                          size_t rowkey_column_size = 1,
-                                         bool with_internal_columns = true)
+                                         bool with_internal_columns = true,
+                                         bool is_deleted = false)
     {
         Block block;
         const size_t num_rows = (end - beg);
@@ -324,7 +325,7 @@ class DMTestEnv
                 VERSION_COLUMN_ID));
             // tag_col
             block.insert(DB::tests::createColumn<UInt8>(
-                std::vector<UInt64>(num_rows, 0),
+                std::vector<UInt64>(num_rows, is_deleted),
                 TAG_COLUMN_NAME,
                 TAG_COLUMN_ID));
         }
diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp
new file mode 100644
index 00000000000..1c68ba3bb2a
--- /dev/null
+++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp
@@ -0,0 +1,86 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <Common/CurrentMetrics.h>
+#include <DataStreams/OneBlockInputStream.h>
+#include <Storages/DeltaMerge/DeltaMergeStore.h>
+#include <Storages/DeltaMerge/tests/gtest_segment_test_basic.h>
+#include <TestUtils/TiFlashTestBasic.h>
+
+
+namespace DB
+{
+namespace DM
+{
+namespace tests
+{
+class SegmentOperationTest : public SegmentTestBasic // gtest fixture used by the TEST_F cases in this file
+{
+protected:
+    static void SetUpTestCase() {} // empty: no suite-level setup is performed here
+};
+
+TEST_F(SegmentOperationTest, Issue4956)
+try
+{
+    SegmentTestOptions options;
+    reloadWithOptions(options);
+
+    // Write and flush data first, so that the segment can be split.
+    writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID);
+    flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID);
+    // Write data to the cache (no flush) to reproduce https://github.com/pingcap/tiflash/issues/4956
+    writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID);
+    deleteRangeSegment(DELTA_MERGE_FIRST_SEGMENT_ID);
+    auto segment_id = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID);
+    ASSERT_TRUE(segment_id.has_value());
+
+    mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id);
+}
+CATCH
+
+TEST_F(SegmentOperationTest, TestSegment)
+try
+{
+    SegmentTestOptions options;
+    reloadWithOptions(options);
+    writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID);
+    flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID);
+    mergeSegmentDelta(DELTA_MERGE_FIRST_SEGMENT_ID);
+    auto segment_id = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID);
+    ASSERT_TRUE(segment_id.has_value());
+
+    size_t origin_rows = getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID); // row count of the first segment right after the split
+
+    writeSegment(*segment_id); // all writes and deletes from here on target the split-off segment
+    flushSegmentCache(*segment_id);
+    deleteRangeSegment(*segment_id);
+    writeSegmentWithDeletedPack(*segment_id);
+    mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id);
+
+    EXPECT_EQ(getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID), origin_rows); // merging back must leave the row count unchanged
+}
+CATCH
+
+TEST_F(SegmentOperationTest, TestSegmentRandom)
+try
+{
+    SegmentTestOptions options;
+    options.is_common_handle = true; // this test runs with the is_common_handle option turned on
+    reloadWithOptions(options);
+    randomSegmentTest(100); // NOTE(review): 100 is presumably the number of random operations — confirm against randomSegmentTest
+}
+CATCH
+} // namespace tests
+} // namespace DM
+} // namespace DB
diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp
new file mode 100644
index 00000000000..c676f2e08d5
--- /dev/null
+++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp
@@ -0,0 +1,430 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <Common/CurrentMetrics.h>
+#include <DataStreams/OneBlockInputStream.h>
+#include <Storages/DeltaMerge/DMContext.h>
+#include <Storages/DeltaMerge/DeltaMergeStore.h>
+#include <Storages/DeltaMerge/Segment.h>
+#include <Storages/DeltaMerge/tests/DMTestEnv.h>
+#include <Storages/DeltaMerge/tests/gtest_segment_test_basic.h>
+#include <Storages/Transaction/TMTContext.h>
+#include <Storages/tests/TiFlashStorageTestBasic.h>
+#include <TestUtils/TiFlashTestBasic.h>
+
+namespace DB
+{
+namespace DM
+{
+namespace tests
+{
+/// Resets the fixture for the given options: re-runs the base SetUp(), rebuilds the
+/// storage pools and DMContext through reload(), and starts tracking the freshly
+/// created root segment as the only entry of `segments`.
+void SegmentTestBasic::reloadWithOptions(SegmentTestOptions config)
+{
+    TiFlashStorageTestBasic::SetUp();
+
+    // `options` must be assigned before reload(), because setColumns() reads it.
+    options = config;
+    table_columns = std::make_shared<ColumnDefines>();
+    root_segment = reload(options.is_common_handle);
+    ASSERT_EQ(root_segment->segmentId(), DELTA_MERGE_FIRST_SEGMENT_ID);
+
+    // Forget segments of any previous run; only the new root segment is alive now.
+    segments.clear();
+    segments.emplace(DELTA_MERGE_FIRST_SEGMENT_ID, root_segment);
+}
+
+/// Splits `root_segment` and fills the newly created segment with two batches of
+/// 100 rows each (keys [0, 100) and [100, 200)); the first write passes `true` as
+/// the flush argument, the second passes `false`.
+/// Returns the id of the newly created segment.
+/// NOTE(review): the `segments` map is not updated here - confirm whether callers rely on that.
+PageId SegmentTestBasic::createNewSegmentWithSomeData()
+{
+    const size_t batch_rows = 100;
+
+    SegmentPtr split_out;
+    std::tie(root_segment, split_out) = root_segment->split(dmContext(), tableColumns());
+
+    // write to segment and flush
+    Block flushed_batch = DMTestEnv::prepareSimpleWriteBlock(0, batch_rows, false);
+    split_out->write(dmContext(), std::move(flushed_batch), true);
+
+    // write to segment and don't flush
+    Block cached_batch = DMTestEnv::prepareSimpleWriteBlock(batch_rows, 2 * batch_rows, false);
+    split_out->write(dmContext(), std::move(cached_batch), false);
+
+    return split_out->segmentId();
+}
+
+/// Counts every row returned by the segment's raw input stream
+/// (Segment::getInputStreamRaw), reading all table columns.
+size_t SegmentTestBasic::getSegmentRowNumWithoutMVCC(PageId segment_id)
+{
+    auto target = segments[segment_id];
+    auto stream = target->getInputStreamRaw(dmContext(), *tableColumns());
+
+    size_t total_rows = 0;
+    stream->readPrefix();
+    // An empty block signals the end of the stream.
+    for (Block chunk = stream->read(); chunk; chunk = stream->read())
+    {
+        total_rows += chunk.rows();
+    }
+    stream->readSuffix();
+    return total_rows;
+}
+
+/// Counts the rows returned by Segment::getInputStream when reading the segment's
+/// own full row-key range with all table columns.
+size_t SegmentTestBasic::getSegmentRowNum(PageId segment_id)
+{
+    auto target = segments[segment_id];
+    auto stream = target->getInputStream(dmContext(), *tableColumns(), {target->getRowKeyRange()});
+
+    size_t total_rows = 0;
+    stream->readPrefix();
+    // An empty block signals the end of the stream.
+    for (Block chunk = stream->read(); chunk; chunk = stream->read())
+    {
+        total_rows += chunk.rows();
+    }
+    stream->readSuffix();
+    return total_rows;
+}
+
+/// Asserts that reading the whole row-key range of the segment yields exactly
+/// `expected_row_num` rows.
+///
+/// The read itself is delegated to getSegmentRowNum(), which performs the very
+/// same getInputStream() scan this function used to duplicate line by line.
+void SegmentTestBasic::checkSegmentRow(PageId segment_id, size_t expected_row_num)
+{
+    ASSERT_EQ(getSegmentRowNum(segment_id), expected_row_num);
+}
+
+/// Splits the given segment in two.
+///
+/// On success both halves are stored in `segments` (the first half keeps
+/// `segment_id`), the total row count is checked to be unchanged, and the id of
+/// the new segment is returned. Returns std::nullopt when Segment::split() did not
+/// produce a new segment; `segments` is left untouched in that case.
+std::optional<PageId> SegmentTestBasic::splitSegment(PageId segment_id)
+{
+    auto source = segments[segment_id];
+    const size_t rows_before_split = getSegmentRowNum(segment_id);
+
+    SegmentPtr first_half, second_half;
+    std::tie(first_half, second_half) = source->split(dmContext(), tableColumns());
+    if (!second_half)
+    {
+        return std::nullopt;
+    }
+
+    segments[second_half->segmentId()] = second_half;
+    segments[segment_id] = first_half;
+
+    // Splitting must neither lose nor duplicate rows.
+    EXPECT_EQ(rows_before_split, getSegmentRowNum(segment_id) + getSegmentRowNum(second_half->segmentId()));
+    return second_half->segmentId();
+}
+
+/// Merges two tracked segments with Segment::merge(), stores the result under the
+/// merged segment's id, stops tracking the right segment, and checks that the
+/// merged row count equals the sum of both inputs.
+void SegmentTestBasic::mergeSegment(PageId left_segment_id, PageId right_segment_id)
+{
+    auto lhs = segments[left_segment_id];
+    auto rhs = segments[right_segment_id];
+
+    // Row counts must be taken before the merge.
+    const size_t lhs_rows = getSegmentRowNum(left_segment_id);
+    const size_t rhs_rows = getSegmentRowNum(right_segment_id);
+    LOG_FMT_TRACE(&Poco::Logger::root(), "merge in segment:{}:{} and {}:{}", lhs->segmentId(), lhs_rows, rhs->segmentId(), rhs_rows);
+
+    SegmentPtr merged = Segment::merge(dmContext(), tableColumns(), lhs, rhs);
+    segments[merged->segmentId()] = merged;
+    // Erasing by key is a no-op when the key is absent.
+    segments.erase(rhs->segmentId());
+
+    EXPECT_EQ(getSegmentRowNum(merged->segmentId()), lhs_rows + rhs_rows);
+}
+
+/// Runs Segment::mergeDelta() on the segment, tracks the returned segment under its
+/// id, and checks that the readable row count did not change.
+void SegmentTestBasic::mergeSegmentDelta(PageId segment_id)
+{
+    const size_t rows_before = getSegmentRowNum(segment_id);
+
+    SegmentPtr compacted = segments[segment_id]->mergeDelta(dmContext(), tableColumns());
+    segments[compacted->segmentId()] = compacted;
+
+    EXPECT_EQ(getSegmentRowNum(compacted->segmentId()), rows_before);
+}
+
+/// Calls Segment::flushCache() on the segment and checks that the readable row
+/// count is the same before and after.
+void SegmentTestBasic::flushSegmentCache(PageId segment_id)
+{
+    auto target = segments[segment_id];
+    const size_t rows_before = getSegmentRowNum(segment_id);
+
+    target->flushCache(dmContext());
+
+    EXPECT_EQ(getSegmentRowNum(segment_id), rows_before);
+}
+
+/// Returns the {start, end} keys of the segment's row-key range as Int64 values.
+///
+/// For int-handle tables the `int_value` of each bound is returned directly. For
+/// common-handle tables each bound is expected to start with TiDB::CodecFlagInt;
+/// the Int64 is decoded from the bytes following that one-byte flag.
+std::pair<Int64, Int64> SegmentTestBasic::getSegmentKeyRange(SegmentPtr segment)
+{
+    if (!options.is_common_handle)
+    {
+        return {segment->getRowKeyRange().getStart().int_value, segment->getRowKeyRange().getEnd().int_value};
+    }
+
+    EXPECT_EQ(segment->getRowKeyRange().getStart().data[0], TiDB::CodecFlagInt);
+    EXPECT_EQ(segment->getRowKeyRange().getEnd().data[0], TiDB::CodecFlagInt);
+
+    // Skips the leading flag byte (cursor starts at 1) and decodes the Int64 that follows.
+    const auto decode_bound = [](const auto & bound) -> Int64 {
+        size_t cursor = 1;
+        return DecodeInt64(cursor, String(bound.data, bound.size));
+    };
+
+    const Int64 start_key = decode_bound(segment->getRowKeyRange().getStart());
+    const Int64 end_key = decode_bound(segment->getRowKeyRange().getEnd());
+    return {start_key, end_key};
+}
+
+/// Writes `write_rows` rows into the segment without flushing and checks that the
+/// raw (non-MVCC) row count grew by exactly `write_rows`.
+///
+/// Keys start at the segment's start key. When the segment's key range is narrower
+/// than `write_rows`, the remaining rows are written as additional batches over the
+/// same keys; `version` is incremented after every batch.
+///
+/// Does nothing when `write_rows` is 0. Fails the test (instead of looping forever)
+/// when the segment's key range is empty.
+void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows)
+{
+    if (write_rows == 0)
+    {
+        return;
+    }
+    auto segment = segments[segment_id];
+    size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id);
+    std::pair<Int64, Int64> keys = getSegmentKeyRange(segment);
+    Int64 start_key = keys.first;
+    Int64 end_key = keys.second;
+
+    // Compute the range width in unsigned arithmetic: a segment may span
+    // [Int64 min, Int64 max), for which the signed `end_key - start_key` overflows.
+    const UInt64 range_width = static_cast<UInt64>(end_key) - static_cast<UInt64>(start_key);
+    // With an empty range every batch would contain 0 rows and the loop below would never end.
+    ASSERT_TRUE(range_width != 0);
+
+    UInt64 remain_row_num = 0;
+    if (range_width > write_rows)
+    {
+        // The range is wide enough: a single batch of `write_rows` keys suffices.
+        end_key = static_cast<Int64>(static_cast<UInt64>(start_key) + write_rows);
+    }
+    else
+    {
+        // The first batch covers the whole range; the rest is written by the loop below.
+        remain_row_num = write_rows - range_width;
+    }
+    {
+        // write to segment and not flush
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle);
+        segment->write(dmContext(), std::move(block), false);
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, end_key);
+        version++;
+    }
+    while (remain_row_num > 0)
+    {
+        // `end_key` is unchanged on this path, so the batch size is bounded by the range width.
+        UInt64 write_num = std::min(remain_row_num, range_width);
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle);
+        segment->write(dmContext(), std::move(block), false);
+        remain_row_num -= write_num;
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, write_num + start_key);
+        version++;
+    }
+    EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows);
+}
+
+/// Writes DEFAULT_MERGE_BLOCK_SIZE rows into the segment using the "deleted pack"
+/// variant of DMTestEnv::prepareSimpleWriteBlock (trailing arguments `1, true, true`)
+/// and checks that the raw (non-MVCC) row count grew by exactly that many rows.
+///
+/// Unlike writeSegment(), every batch is written with `true` as the last argument of
+/// Segment::write(). When the segment's key range is narrower than the number of
+/// rows to write, the remaining rows are written as additional batches over the
+/// same keys; `version` is incremented after every batch.
+///
+/// Fails the test (instead of looping forever) when the segment's key range is empty.
+void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id)
+{
+    UInt64 write_rows = DEFAULT_MERGE_BLOCK_SIZE;
+    auto segment = segments[segment_id];
+    size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id);
+    std::pair<Int64, Int64> keys = getSegmentKeyRange(segment);
+    Int64 start_key = keys.first;
+    Int64 end_key = keys.second;
+
+    // Compute the range width in unsigned arithmetic: a segment may span
+    // [Int64 min, Int64 max), for which the signed `end_key - start_key` overflows.
+    const UInt64 range_width = static_cast<UInt64>(end_key) - static_cast<UInt64>(start_key);
+    // With an empty range every batch would contain 0 rows and the loop below would never end.
+    ASSERT_TRUE(range_width != 0);
+
+    UInt64 remain_row_num = 0;
+    if (range_width > write_rows)
+    {
+        // The range is wide enough: a single batch of `write_rows` keys suffices.
+        end_key = static_cast<Int64>(static_cast<UInt64>(start_key) + write_rows);
+    }
+    else
+    {
+        // The first batch covers the whole range; the rest is written by the loop below.
+        remain_row_num = write_rows - range_width;
+    }
+    {
+        // First batch; note that `true` is passed as the last argument of write() here.
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true);
+        segment->write(dmContext(), std::move(block), true);
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, end_key);
+        version++;
+    }
+    while (remain_row_num > 0)
+    {
+        // `end_key` is unchanged on this path, so the batch size is bounded by the range width.
+        UInt64 write_num = std::min(remain_row_num, range_width);
+        Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true);
+        segment->write(dmContext(), std::move(block), true);
+        remain_row_num -= write_num;
+        LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, write_num + start_key);
+        version++;
+    }
+    EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows);
+}
+
+/// Writes a delete range covering the segment's whole row-key range and checks that
+/// the segment then reads back as empty.
+void SegmentTestBasic::deleteRangeSegment(PageId segment_id)
+{
+    auto target = segments[segment_id];
+    const auto & whole_range = target->getRowKeyRange();
+    target->write(dmContext(), /*delete_range*/ whole_range);
+
+    EXPECT_EQ(getSegmentRowNum(segment_id), 0);
+}
+
+/// Writes the default number of rows into a randomly chosen tracked segment.
+/// Does nothing when no segment is tracked.
+void SegmentTestBasic::writeRandomSegment()
+{
+    if (!segments.empty())
+    {
+        const PageId target_id = getRandomSegmentId();
+        LOG_FMT_TRACE(&Poco::Logger::root(), "start write segment:{}", target_id);
+        writeSegment(target_id);
+    }
+}
+/// Runs writeSegmentWithDeletedPack() on a randomly chosen tracked segment.
+/// Does nothing when no segment is tracked.
+void SegmentTestBasic::writeRandomSegmentWithDeletedPack()
+{
+    if (!segments.empty())
+    {
+        const PageId target_id = getRandomSegmentId();
+        LOG_FMT_TRACE(&Poco::Logger::root(), "start write segment with deleted pack:{}", target_id);
+        writeSegmentWithDeletedPack(target_id);
+    }
+}
+
+/// Runs deleteRangeSegment() on a randomly chosen tracked segment.
+/// Does nothing when no segment is tracked.
+void SegmentTestBasic::deleteRangeRandomSegment()
+{
+    if (!segments.empty())
+    {
+        const PageId target_id = getRandomSegmentId();
+        LOG_FMT_TRACE(&Poco::Logger::root(), "start delete range segment:{}", target_id);
+        deleteRangeSegment(target_id);
+    }
+}
+
+/// Tries to split a randomly chosen tracked segment; the result of the split
+/// (success or not) is ignored. Does nothing when no segment is tracked.
+void SegmentTestBasic::splitRandomSegment()
+{
+    if (!segments.empty())
+    {
+        const PageId target_id = getRandomSegmentId();
+        LOG_FMT_TRACE(&Poco::Logger::root(), "start split segment:{}", target_id);
+        splitSegment(target_id);
+    }
+}
+
+/// Merges a randomly chosen pair of adjacent tracked segments.
+/// Does nothing when fewer than two segments are tracked.
+void SegmentTestBasic::mergeRandomSegment()
+{
+    if (segments.size() < 2)
+    {
+        return;
+    }
+    const std::pair<PageId, PageId> chosen = getRandomMergeablePair();
+    LOG_FMT_TRACE(&Poco::Logger::root(), "start merge segment:{} and {}", chosen.first, chosen.second);
+    mergeSegment(chosen.first, chosen.second);
+}
+
+/// Runs mergeSegmentDelta() on a randomly chosen tracked segment.
+/// Does nothing when no segment is tracked.
+void SegmentTestBasic::mergeDeltaRandomSegment()
+{
+    if (!segments.empty())
+    {
+        const PageId target_id = getRandomSegmentId();
+        LOG_FMT_TRACE(&Poco::Logger::root(), "start merge delta in segment:{}", target_id);
+        mergeSegmentDelta(target_id);
+    }
+}
+
+/// Runs flushSegmentCache() on a randomly chosen tracked segment.
+/// Does nothing when no segment is tracked.
+void SegmentTestBasic::flushCacheRandomSegment()
+{
+    if (!segments.empty())
+    {
+        const PageId target_id = getRandomSegmentId();
+        LOG_FMT_TRACE(&Poco::Logger::root(), "start flush cache in segment:{}", target_id);
+        flushSegmentCache(target_id);
+    }
+}
+
+/// Executes `operator_count` operations, each picked with random() from
+/// `segment_operator_entries` (indexed by SegmentOperaterType).
+void SegmentTestBasic::randomSegmentTest(size_t operator_count)
+{
+    for (size_t remaining = operator_count; remaining > 0; --remaining)
+    {
+        const auto picked = static_cast<SegmentOperaterType>(random() % SegmentOperaterMax);
+        const auto & action = segment_operator_entries[picked];
+        action();
+    }
+}
+
+/// Returns the id of a randomly chosen tracked segment.
+///
+/// Candidate ids are drawn with random() from [0, largest tracked id] until one of
+/// them is a key of `segments`. The map must not be empty when this is called
+/// (the largest id is read from `segments.rbegin()`).
+PageId SegmentTestBasic::getRandomSegmentId()
+{
+    const auto largest_id = segments.rbegin()->first;
+
+    PageId candidate;
+    do
+    {
+        candidate = random() % (largest_id + 1);
+    } while (segments.find(candidate) == segments.end());
+
+    return candidate;
+}
+
+/// Returns a random {left, right} pair of distinct tracked segment ids that can be
+/// merged: the left segment's range end equals the right segment's range start and
+/// the left segment's nextSegmentId() is the right segment's id.
+///
+/// Keeps drawing pairs until one qualifies.
+std::pair<PageId, PageId> SegmentTestBasic::getRandomMergeablePair()
+{
+    for (;;)
+    {
+        const PageId left_id = getRandomSegmentId();
+
+        // Draw the right-hand id until it differs from the left-hand one.
+        PageId right_id;
+        do
+        {
+            right_id = getRandomSegmentId();
+        } while (right_id == left_id);
+
+        auto left = segments[left_id];
+        auto right = segments[right_id];
+        if (compare(left->getRowKeyRange().getEnd(), right->getRowKeyRange().getStart()) == 0 && left->nextSegmentId() == right->segmentId())
+        {
+            return {left_id, right_id};
+        }
+    }
+}
+
+/// Builds the common-handle row-key range whose bounds are the encoded Int64
+/// minimum and maximum, each prefixed with the TiDB::CodecFlagInt flag byte.
+RowKeyRange SegmentTestBasic::commanHandleKeyRange()
+{
+    // Encodes one bound: the int flag byte followed by the encoded Int64 value.
+    const auto encode_bound = [](Int64 value) -> String {
+        WriteBufferFromOwnString buf;
+        ::DB::EncodeUInt(static_cast<UInt8>(TiDB::CodecFlagInt), buf);
+        ::DB::EncodeInt64(value, buf);
+        return buf.releaseStr();
+    };
+
+    const String start_key = encode_bound(std::numeric_limits<Int64>::min());
+    const String end_key = encode_bound(std::numeric_limits<Int64>::max());
+
+    return RowKeyRange(RowKeyValue(true, std::make_shared<String>(start_key), 0), RowKeyValue(true, std::make_shared<String>(end_key), 0), true, 1);
+}
+
+/// Reloads the base fixture with `db_settings`, recreates the storage path pool and
+/// storage pool for table "test.t1", installs the table columns (the given ones, or
+/// the DMTestEnv defaults for the handle type when none are given) and returns a
+/// brand-new segment covering the whole key range.
+SegmentPtr SegmentTestBasic::reload(bool is_common_handle, const ColumnDefinesPtr & pre_define_columns, DB::Settings && db_settings)
+{
+    TiFlashStorageTestBasic::reload(std::move(db_settings));
+
+    storage_path_pool = std::make_unique<StoragePathPool>(db_context->getPathPool().withTable("test", "t1", false));
+    storage_pool = std::make_unique<StoragePool>(*db_context, /*ns_id*/ 100, *storage_path_pool, "test.t1");
+    storage_pool->restore();
+
+    // Fall back to the default column set when the caller did not provide one.
+    ColumnDefinesPtr cols = pre_define_columns;
+    if (!cols)
+    {
+        cols = DMTestEnv::getDefaultColumns(is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID);
+    }
+    // setColumns() also rebuilds dm_context, which newSegment() below depends on.
+    setColumns(cols);
+
+    const RowKeyRange whole_range = is_common_handle ? commanHandleKeyRange() : RowKeyRange::newAll(is_common_handle, 1);
+    return Segment::newSegment(*dm_context, table_columns, whole_range, storage_pool->newMetaPageId(), 0);
+}
+
+/// Copies `columns` into the shared `table_columns` object and rebuilds `dm_context`
+/// from the current db context, pools, `settings` and `options`.
+void SegmentTestBasic::setColumns(const ColumnDefinesPtr & columns)
+{
+    // Copy the definitions in place so that holders of `table_columns` see the update.
+    *table_columns = *columns;
+
+    dm_context = std::make_unique<DMContext>(
+        *db_context,
+        *storage_path_pool,
+        *storage_pool,
+        0,
+        /*min_version_*/ 0,
+        settings.not_compress_columns,
+        options.is_common_handle,
+        1,
+        db_context->getSettingsRef());
+}
+} // namespace tests
+} // namespace DM
+} // namespace DB
diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h
new file mode 100644
index 00000000000..ab0c7d6d0be
--- /dev/null
+++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h
@@ -0,0 +1,123 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <Storages/DeltaMerge/DMContext.h>
+#include <Storages/DeltaMerge/Segment.h>
+#include <Storages/Transaction/TMTContext.h>
+#include <Storages/tests/TiFlashStorageTestBasic.h>
+#include <TestUtils/TiFlashTestBasic.h>
+
+#include <vector>
+
+namespace DB
+{
+namespace DM
+{
+namespace tests
+{
+// Test fixture offering single-segment operations (write, flush, delete range, split,
+// merge, merge delta) on top of TiFlashStorageTestBasic. Each operation keeps the
+// `segments` map up to date and verifies row counts with EXPECT_*/ASSERT_* checks.
+class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic
+{
+public:
+    // Options applied by reloadWithOptions().
+    struct SegmentTestOptions
+    {
+        // Whether the test table uses a common handle (string row key) instead of an int handle.
+        bool is_common_handle = false;
+    };
+
+public:
+    // Re-initializes the fixture and leaves a single root segment in `segments`.
+    void reloadWithOptions(SegmentTestOptions config);
+
+    // Splits the segment; returns the id of the new segment, or std::nullopt if no split happened.
+    std::optional<PageId> splitSegment(PageId segment_id);
+    // Merges the right segment into the left one and stops tracking the right segment.
+    void mergeSegment(PageId left_segment_id, PageId right_segment_id);
+    // Runs mergeDelta on the segment and tracks the returned segment.
+    void mergeSegmentDelta(PageId segment_id);
+    // Flushes the segment's cache; the readable row count must stay the same.
+    void flushSegmentCache(PageId segment_id);
+    // Writes `write_rows` rows (without flushing), starting at the segment's start key.
+    void writeSegment(PageId segment_id, UInt64 write_rows = 100);
+    // Writes DEFAULT_MERGE_BLOCK_SIZE rows using the "deleted pack" block variant.
+    void writeSegmentWithDeletedPack(PageId segment_id);
+    // Writes a delete range covering the whole segment; the segment must then read as empty.
+    void deleteRangeSegment(PageId segment_id);
+
+
+    // The *Random* helpers below apply the matching operation to a randomly chosen
+    // tracked segment and do nothing when there is no suitable segment.
+    void writeRandomSegment();
+    void writeRandomSegmentWithDeletedPack();
+    void deleteRangeRandomSegment();
+    void splitRandomSegment();
+    void mergeRandomSegment();
+    void mergeDeltaRandomSegment();
+    void flushCacheRandomSegment();
+
+    // Executes `operator_count` randomly chosen operations from `segment_operator_entries`.
+    void randomSegmentTest(size_t operator_count);
+
+    // Splits `root_segment`, writes two batches of 100 rows into the new segment and returns its id.
+    PageId createNewSegmentWithSomeData();
+    // Row count read through Segment::getInputStreamRaw.
+    size_t getSegmentRowNumWithoutMVCC(PageId segment_id);
+    // Row count read through Segment::getInputStream over the segment's own range.
+    size_t getSegmentRowNum(PageId segment_id);
+    // Asserts that the segment reads back exactly `expected_row_num` rows.
+    void checkSegmentRow(PageId segment_id, size_t expected_row_num);
+    // Returns the segment's {start, end} keys as Int64 (decoded for common-handle tables).
+    std::pair<Int64, Int64> getSegmentKeyRange(SegmentPtr segment);
+
+protected:
+    // <segment_id, segment_ptr>
+    std::map<PageId, SegmentPtr> segments;
+
+    // Index into `segment_operator_entries`; the enumerator order must match the
+    // order of the lambdas below. `SegmentOperaterMax` is the number of operations.
+    enum SegmentOperaterType
+    {
+        Write = 0,
+        DeleteRange,
+        Split,
+        Merge,
+        MergeDelta,
+        FlushCache,
+        WriteDeletedPack,
+        SegmentOperaterMax
+    };
+
+    // One entry per SegmentOperaterType value, in enumerator order.
+    const std::vector<std::function<void()>> segment_operator_entries = {
+        [this] { writeRandomSegment(); },
+        [this] { deleteRangeRandomSegment(); },
+        [this] { splitRandomSegment(); },
+        [this] { mergeRandomSegment(); },
+        [this] { mergeDeltaRandomSegment(); },
+        [this] { flushCacheRandomSegment(); },
+        [this] {
+            writeRandomSegmentWithDeletedPack();
+        }};
+
+    // Returns the id of a random tracked segment; `segments` must not be empty.
+    PageId getRandomSegmentId();
+
+    // Returns a random pair of adjacent tracked segments (left's next segment is right).
+    std::pair<PageId, PageId> getRandomMergeablePair();
+
+    // Builds the common-handle key range spanning the encoded Int64 min..max.
+    RowKeyRange commanHandleKeyRange();
+
+    // Recreates pools, columns and dm_context, and returns a new segment covering the whole key range.
+    SegmentPtr reload(bool is_common_handle, const ColumnDefinesPtr & pre_define_columns = {}, DB::Settings && db_settings = DB::Settings());
+
+    // setColumns should update dm_context at the same time
+    void setColumns(const ColumnDefinesPtr & columns);
+
+    const ColumnDefinesPtr & tableColumns() const { return table_columns; }
+
+    DMContext & dmContext() { return *dm_context; }
+
+protected:
+    /// all these var lives as ref in dm_context
+    std::unique_ptr<StoragePathPool> storage_path_pool;
+    std::unique_ptr<StoragePool> storage_pool;
+    /// dm_context
+    std::unique_ptr<DMContext> dm_context;
+    ColumnDefinesPtr table_columns;
+    DM::DeltaMergeStore::Settings settings;
+
+    // Segment created by reload(); replaced by createNewSegmentWithSomeData() when it splits.
+    SegmentPtr root_segment;
+    // Version stamped on written blocks; incremented after every batch written by the write helpers.
+    UInt64 version = 0;
+    // Options of the current run, set by reloadWithOptions().
+    SegmentTestOptions options;
+};
+} // namespace tests
+} // namespace DM
+} // namespace DB
\ No newline at end of file
diff --git a/dbms/src/Storages/IManageableStorage.h b/dbms/src/Storages/IManageableStorage.h
index ebf84c592e4..2ff766a9c6d 100644
--- a/dbms/src/Storages/IManageableStorage.h
+++ b/dbms/src/Storages/IManageableStorage.h
@@ -68,7 +68,7 @@ class IManageableStorage : public IStorage
 
     virtual void flushCache(const Context & /*context*/) {}
 
-    virtual void flushCache(const Context & /*context*/, const DM::RowKeyRange & /*range_to_flush*/) {}
+    virtual bool flushCache(const Context & /*context*/, const DM::RowKeyRange & /*range_to_flush*/, [[maybe_unused]] bool try_until_succeed = true) { return true; }
 
     virtual BlockInputStreamPtr status() { return {}; }
 
diff --git a/dbms/src/Storages/Page/V3/PageDirectory.cpp b/dbms/src/Storages/Page/V3/PageDirectory.cpp
index 5eb275f5af5..951da42de1c 100644
--- a/dbms/src/Storages/Page/V3/PageDirectory.cpp
+++ b/dbms/src/Storages/Page/V3/PageDirectory.cpp
@@ -478,7 +478,7 @@ PageSize VersionedPageEntries::getEntriesByBlobIds(
 bool VersionedPageEntries::cleanOutdatedEntries(
     UInt64 lowest_seq,
     std::map<PageIdV3Internal, std::pair<PageVersion, Int64>> * normal_entries_to_deref,
-    PageEntriesV3 & entries_removed,
+    PageEntriesV3 * entries_removed,
     const PageLock & /*page_lock*/)
 {
     if (type == EditRecordType::VAR_EXTERNAL)
@@ -541,7 +541,10 @@ bool VersionedPageEntries::cleanOutdatedEntries(
             {
                 if (iter->second.being_ref_count == 1)
                 {
-                    entries_removed.emplace_back(iter->second.entry);
+                    if (entries_removed)
+                    {
+                        entries_removed->emplace_back(iter->second.entry);
+                    }
                     iter = entries.erase(iter);
                 }
                 // The `being_ref_count` for this version is valid. While for older versions,
@@ -551,7 +554,10 @@ bool VersionedPageEntries::cleanOutdatedEntries(
             else
             {
                 // else there are newer "entry" in the version list, the outdated entries should be removed
-                entries_removed.emplace_back(iter->second.entry);
+                if (entries_removed)
+                {
+                    entries_removed->emplace_back(iter->second.entry);
+                }
                 iter = entries.erase(iter);
             }
         }
@@ -564,7 +570,7 @@ bool VersionedPageEntries::cleanOutdatedEntries(
     return entries.empty() || (entries.size() == 1 && entries.begin()->second.isDelete());
 }
 
-bool VersionedPageEntries::derefAndClean(UInt64 lowest_seq, PageIdV3Internal page_id, const PageVersion & deref_ver, const Int64 deref_count, PageEntriesV3 & entries_removed)
+bool VersionedPageEntries::derefAndClean(UInt64 lowest_seq, PageIdV3Internal page_id, const PageVersion & deref_ver, const Int64 deref_count, PageEntriesV3 * entries_removed)
 {
     auto page_lock = acquireLock();
     if (type == EditRecordType::VAR_EXTERNAL)
@@ -1239,7 +1245,7 @@ bool PageDirectory::tryDumpSnapshot(const ReadLimiterPtr & read_limiter, const W
     return done_any_io;
 }
 
-PageEntriesV3 PageDirectory::gcInMemEntries()
+PageEntriesV3 PageDirectory::gcInMemEntries(bool return_removed_entries)
 {
     UInt64 lowest_seq = sequence.load();
 
@@ -1303,7 +1309,7 @@ PageEntriesV3 PageDirectory::gcInMemEntries()
         const bool all_deleted = iter->second->cleanOutdatedEntries(
             lowest_seq,
             &normal_entries_to_deref,
-            all_del_entries,
+            return_removed_entries ? &all_del_entries : nullptr,
             iter->second->acquireLock());
 
         {
@@ -1342,7 +1348,7 @@ PageEntriesV3 PageDirectory::gcInMemEntries()
             page_id,
             /*deref_ver=*/deref_counter.first,
             /*deref_count=*/deref_counter.second,
-            all_del_entries);
+            return_removed_entries ? &all_del_entries : nullptr);
 
         if (all_deleted)
         {
diff --git a/dbms/src/Storages/Page/V3/PageDirectory.h b/dbms/src/Storages/Page/V3/PageDirectory.h
index bd7c433022f..2f0f09f4e42 100644
--- a/dbms/src/Storages/Page/V3/PageDirectory.h
+++ b/dbms/src/Storages/Page/V3/PageDirectory.h
@@ -223,14 +223,14 @@ class VersionedPageEntries
     bool cleanOutdatedEntries(
         UInt64 lowest_seq,
         std::map<PageIdV3Internal, std::pair<PageVersion, Int64>> * normal_entries_to_deref,
-        PageEntriesV3 & entries_removed,
+        PageEntriesV3 * entries_removed,
         const PageLock & page_lock);
     bool derefAndClean(
         UInt64 lowest_seq,
         PageIdV3Internal page_id,
         const PageVersion & deref_ver,
         Int64 deref_count,
-        PageEntriesV3 & entries_removed);
+        PageEntriesV3 * entries_removed);
 
     void collapseTo(UInt64 seq, PageIdV3Internal page_id, PageEntriesEdit & edit);
 
@@ -360,7 +360,9 @@ class PageDirectory
 
     bool tryDumpSnapshot(const ReadLimiterPtr & read_limiter = nullptr, const WriteLimiterPtr & write_limiter = nullptr);
 
-    PageEntriesV3 gcInMemEntries();
+    // Perform a GC for in-memory entries and return the removed entries.
+    // If `return_removed_entries` is false, then just return an empty set.
+    PageEntriesV3 gcInMemEntries(bool return_removed_entries = true);
 
     std::set<PageId> getAliveExternalIds(NamespaceId ns_id) const;
 
diff --git a/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp b/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
index 483c5073ab5..968049a3273 100644
--- a/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
+++ b/dbms/src/Storages/Page/V3/PageDirectoryFactory.cpp
@@ -44,7 +44,8 @@ PageDirectoryPtr PageDirectoryFactory::createFromReader(String storage_name, WAL
 
     // After restoring from the disk, we need cleanup all invalid entries in memory, or it will
     // try to run GC again on some entries that are already marked as invalid in BlobStore.
-    dir->gcInMemEntries();
+    // There is no need to remove the expired entries in BlobStore, so skip filling removed_entries to improve performance.
+    dir->gcInMemEntries(/*return_removed_entries=*/false);
     LOG_FMT_INFO(DB::Logger::get("PageDirectoryFactory", storage_name), "PageDirectory restored [max_page_id={}] [max_applied_ver={}]", dir->getMaxId(), dir->sequence);
 
     if (blob_stats)
@@ -84,7 +85,8 @@ PageDirectoryPtr PageDirectoryFactory::createFromEdit(String storage_name, FileP
 
     // After restoring from the disk, we need cleanup all invalid entries in memory, or it will
     // try to run GC again on some entries that are already marked as invalid in BlobStore.
-    dir->gcInMemEntries();
+    // There is no need to remove the expired entries in BlobStore when restoring, so there is no need to fill removed_entries.
+    dir->gcInMemEntries(/*return_removed_entries=*/false);
 
     if (blob_stats)
     {
diff --git a/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp b/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp
index 83e07f75d37..6d6ef41630f 100644
--- a/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp
+++ b/dbms/src/Storages/Page/V3/tests/gtest_page_directory.cpp
@@ -644,14 +644,14 @@ class VersionedEntriesTest : public ::testing::Test
     {
         DerefCounter deref_counter;
         PageEntriesV3 removed_entries;
-        bool all_removed = entries.cleanOutdatedEntries(seq, &deref_counter, removed_entries, entries.acquireLock());
+        bool all_removed = entries.cleanOutdatedEntries(seq, &deref_counter, &removed_entries, entries.acquireLock());
         return {all_removed, removed_entries, deref_counter};
     }
 
     std::tuple<bool, PageEntriesV3> runDeref(UInt64 seq, PageVersion ver, Int64 decrease_num)
     {
         PageEntriesV3 removed_entries;
-        bool all_removed = entries.derefAndClean(seq, buildV3Id(TEST_NAMESPACE_ID, page_id), ver, decrease_num, removed_entries);
+        bool all_removed = entries.derefAndClean(seq, buildV3Id(TEST_NAMESPACE_ID, page_id), ver, decrease_num, &removed_entries);
         return {all_removed, removed_entries};
     }
 
diff --git a/dbms/src/Storages/StorageDeltaMerge.cpp b/dbms/src/Storages/StorageDeltaMerge.cpp
index 67d32c73a05..a6de4efb3ac 100644
--- a/dbms/src/Storages/StorageDeltaMerge.cpp
+++ b/dbms/src/Storages/StorageDeltaMerge.cpp
@@ -775,12 +775,12 @@ void StorageDeltaMerge::checkStatus(const Context & context)
 
 void StorageDeltaMerge::flushCache(const Context & context)
 {
-    flushCache(context, DM::RowKeyRange::newAll(is_common_handle, rowkey_column_size));
+    flushCache(context, DM::RowKeyRange::newAll(is_common_handle, rowkey_column_size), /* try_until_succeed */ true);
 }
 
-void StorageDeltaMerge::flushCache(const Context & context, const DM::RowKeyRange & range_to_flush)
+bool StorageDeltaMerge::flushCache(const Context & context, const DM::RowKeyRange & range_to_flush, bool try_until_succeed)
 {
-    getAndMaybeInitStore()->flushCache(context, range_to_flush);
+    return getAndMaybeInitStore()->flushCache(context, range_to_flush, try_until_succeed);
 }
 
 void StorageDeltaMerge::mergeDelta(const Context & context)
diff --git a/dbms/src/Storages/StorageDeltaMerge.h b/dbms/src/Storages/StorageDeltaMerge.h
index 79ee225d237..9e4ab12ad4f 100644
--- a/dbms/src/Storages/StorageDeltaMerge.h
+++ b/dbms/src/Storages/StorageDeltaMerge.h
@@ -73,7 +73,7 @@ class StorageDeltaMerge
 
     void flushCache(const Context & context) override;
 
-    void flushCache(const Context & context, const DM::RowKeyRange & range_to_flush) override;
+    bool flushCache(const Context & context, const DM::RowKeyRange & range_to_flush, bool try_until_succeed) override;
 
     /// Merge delta into the stable layer for all segments.
     ///
diff --git a/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h b/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h
index e8e0610326c..b0cacefe6f4 100644
--- a/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h
+++ b/dbms/src/Storages/Transaction/DecodingStorageSchemaSnapshot.h
@@ -77,10 +77,12 @@ struct DecodingStorageSchemaSnapshot
         , decoding_schema_version{decoding_schema_version_}
     {
         std::unordered_map<ColumnID, size_t> column_lut;
+        std::unordered_map<String, ColumnID> column_name_id_map;
         for (size_t i = 0; i < table_info_.columns.size(); i++)
         {
             const auto & ci = table_info_.columns[i];
             column_lut.emplace(ci.id, i);
+            column_name_id_map.emplace(ci.name, ci.id);
         }
         for (size_t i = 0; i < column_defines->size(); i++)
         {
@@ -88,7 +90,7 @@ struct DecodingStorageSchemaSnapshot
             sorted_column_id_with_pos.insert({cd.id, i});
             if (cd.id != TiDBPkColumnID && cd.id != VersionColumnID && cd.id != DelMarkColumnID)
             {
-                auto & columns = table_info_.columns;
+                const auto & columns = table_info_.columns;
                 column_infos.push_back(columns[column_lut.at(cd.id)]);
             }
             else
@@ -100,10 +102,14 @@ struct DecodingStorageSchemaSnapshot
         // create pk related metadata if needed
         if (is_common_handle)
         {
-            const auto & primary_index_info = table_info_.getPrimaryIndexInfo();
-            for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++)
+            /// We do not update the IndexInfo for any DDL action except RENAME.
+            /// When an add column / drop column action happens, the offset of each column may change.
+            /// Thus, we should not use the offset to find the column we want;
+            /// instead, we look the column up by its name to get the column id.
+            const auto & primary_index_cols = table_info_.getPrimaryIndexInfo().idx_cols;
+            for (const auto & col : primary_index_cols)
             {
-                auto pk_column_id = table_info_.columns[primary_index_info.idx_cols[i].offset].id;
+                auto pk_column_id = column_name_id_map[col.name];
                 pk_column_ids.emplace_back(pk_column_id);
                 pk_pos_map.emplace(pk_column_id, reinterpret_cast<size_t>(std::numeric_limits<size_t>::max()));
             }
diff --git a/dbms/src/Storages/Transaction/KVStore.cpp b/dbms/src/Storages/Transaction/KVStore.cpp
index 318a04c6ed9..f9d6d01955e 100644
--- a/dbms/src/Storages/Transaction/KVStore.cpp
+++ b/dbms/src/Storages/Transaction/KVStore.cpp
@@ -129,7 +129,7 @@ void KVStore::traverseRegions(std::function<void(RegionID, const RegionPtr &)> &
         callback(region.first, region.second);
 }
 
-void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log)
+bool KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log, bool try_until_succeed)
 {
     auto table_id = region.getMappedTableID();
     auto storage = tmt.getStorages().get(table_id);
@@ -139,7 +139,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
                         "tryFlushRegionCacheInStorage can not get table for region {} with table id {}, ignored",
                         region.toString(),
                         table_id);
-        return;
+        return true;
     }
 
     try
@@ -151,7 +151,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
             region.getRange()->getMappedTableID(),
             storage->isCommonHandle(),
             storage->getRowKeyColumnSize());
-        storage->flushCache(tmt.getContext(), rowkey_range);
+        return storage->flushCache(tmt.getContext(), rowkey_range, try_until_succeed);
     }
     catch (DB::Exception & e)
     {
@@ -159,6 +159,7 @@ void KVStore::tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & regi
         if (e.code() != ErrorCodes::TABLE_IS_DROPPED)
             throw;
     }
+    return true;
 }
 
 void KVStore::tryPersist(RegionID region_id)
@@ -366,12 +367,12 @@ EngineStoreApplyRes KVStore::handleUselessAdminRaftCmd(
             if (rows >= region_compact_log_min_rows.load(std::memory_order_relaxed)
                 || size_bytes >= region_compact_log_min_bytes.load(std::memory_order_relaxed))
             {
-                // if rows or bytes more than threshold, flush cache and perist mem data.
+                // if rows or bytes more than threshold, try to flush cache and persist mem data.
                 return true;
             }
             else
             {
-                // if thhere is little data in mem, wait until time interval reached threshold.
+                // if there is little data in mem, wait until time interval reached threshold.
                 // use random period so that lots of regions will not be persisted at same time.
                 auto compact_log_period = std::rand() % region_compact_log_period.load(std::memory_order_relaxed); // NOLINT
                 return !(curr_region.lastCompactLogTime() + Seconds{compact_log_period} > Clock::now());
@@ -381,11 +382,17 @@ EngineStoreApplyRes KVStore::handleUselessAdminRaftCmd(
 
     if (check_sync_log())
     {
-        tryFlushRegionCacheInStorage(tmt, curr_region, log);
-        persistRegion(curr_region, region_task_lock, "compact raft log");
-        curr_region.markCompactLog();
-        curr_region.cleanApproxMemCacheInfo();
-        return EngineStoreApplyRes::Persist;
+        if (tryFlushRegionCacheInStorage(tmt, curr_region, log, /* try_until_succeed */ false))
+        {
+            persistRegion(curr_region, region_task_lock, "compact raft log");
+            curr_region.markCompactLog();
+            curr_region.cleanApproxMemCacheInfo();
+            return EngineStoreApplyRes::Persist;
+        }
+        else
+        {
+            return EngineStoreApplyRes::None;
+        }
     }
     return EngineStoreApplyRes::None;
 }
diff --git a/dbms/src/Storages/Transaction/KVStore.h b/dbms/src/Storages/Transaction/KVStore.h
index bb45e65d18b..66e2fe32b75 100644
--- a/dbms/src/Storages/Transaction/KVStore.h
+++ b/dbms/src/Storages/Transaction/KVStore.h
@@ -91,7 +91,7 @@ class KVStore final : private boost::noncopyable
 
     void tryPersist(RegionID region_id);
 
-    static void tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log);
+    static bool tryFlushRegionCacheInStorage(TMTContext & tmt, const Region & region, Poco::Logger * log, bool try_until_succeed = true);
 
     size_t regionSize() const;
     EngineStoreApplyRes handleAdminRaftCmd(raft_cmdpb::AdminRequest && request,
diff --git a/dbms/src/Storages/Transaction/ReadIndexWorker.cpp b/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
index 3223c815989..7de79dd5c6d 100644
--- a/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
+++ b/dbms/src/Storages/Transaction/ReadIndexWorker.cpp
@@ -880,7 +880,7 @@ BatchReadIndexRes ReadIndexWorkerManager::batchReadIndex(
         }
     }
     { // if meet timeout, which means part of regions can not get response from leader, try to poll rest tasks
-        TEST_LOG_FMT("rest {}, poll rest tasks onece", tasks.size());
+        TEST_LOG_FMT("rest {}, poll rest tasks once", tasks.size());
 
         while (!tasks.empty())
         {
diff --git a/dbms/src/Storages/Transaction/RegionBlockReader.cpp b/dbms/src/Storages/Transaction/RegionBlockReader.cpp
index a9384e4a14d..2ec690c467b 100644
--- a/dbms/src/Storages/Transaction/RegionBlockReader.cpp
+++ b/dbms/src/Storages/Transaction/RegionBlockReader.cpp
@@ -208,6 +208,8 @@ bool RegionBlockReader::readImpl(Block & block, const RegionDataReadInfoList & d
         }
         index++;
     }
+    block.checkNumberOfRows();
+
     return true;
 }
 
diff --git a/dbms/src/Storages/Transaction/RegionTable.cpp b/dbms/src/Storages/Transaction/RegionTable.cpp
index c855d5b3226..5ae36a4bd64 100644
--- a/dbms/src/Storages/Transaction/RegionTable.cpp
+++ b/dbms/src/Storages/Transaction/RegionTable.cpp
@@ -230,7 +230,7 @@ void removeObsoleteDataInStorage(
         auto rowkey_range
             = DM::RowKeyRange::fromRegionRange(handle_range, table_id, table_id, storage->isCommonHandle(), storage->getRowKeyColumnSize());
         dm_storage->deleteRange(rowkey_range, context->getSettingsRef());
-        dm_storage->flushCache(*context, rowkey_range); // flush to disk
+        dm_storage->flushCache(*context, rowkey_range, /*try_until_succeed*/ true); // flush to disk
     }
     catch (DB::Exception & e)
     {
diff --git a/dbms/src/Storages/Transaction/TiDB.cpp b/dbms/src/Storages/Transaction/TiDB.cpp
index 15bf2a3fb58..dc7f1f3e348 100644
--- a/dbms/src/Storages/Transaction/TiDB.cpp
+++ b/dbms/src/Storages/Transaction/TiDB.cpp
@@ -631,8 +631,8 @@ catch (const Poco::Exception & e)
 ///////////////////////
 
 IndexColumnInfo::IndexColumnInfo(Poco::JSON::Object::Ptr json)
-    : offset(0)
-    , length(0)
+    : length(0)
+    , offset(0)
 {
     deserialize(json);
 }
diff --git a/dbms/src/Storages/Transaction/TiDB.h b/dbms/src/Storages/Transaction/TiDB.h
index f67bfb332c7..4c28a614857 100644
--- a/dbms/src/Storages/Transaction/TiDB.h
+++ b/dbms/src/Storages/Transaction/TiDB.h
@@ -179,7 +179,6 @@ struct ColumnInfo
 
     ColumnID id = -1;
     String name;
-    Int32 offset = -1;
     Poco::Dynamic::Var origin_default_value;
     Poco::Dynamic::Var default_value;
     Poco::Dynamic::Var default_bit_value;
@@ -212,6 +211,12 @@ struct ColumnInfo
     static Int64 getTimeValue(const String &);
     static Int64 getYearValue(const String &);
     static UInt64 getBitValue(const String &);
+
+private:
+    /// Please be very careful when you have to use offset,
+    /// because we never update offset when a DDL action changes the columns.
+    /// Thus, offset may not correspond to the actual order of columns.
+    Int32 offset = -1;
 };
 
 enum PartitionType
@@ -298,8 +303,13 @@ struct IndexColumnInfo
     void deserialize(Poco::JSON::Object::Ptr json);
 
     String name;
-    Int32 offset;
     Int32 length;
+
+private:
+    /// Please be very careful when you have to use offset,
+    /// because we never update offset when a DDL action changes the columns.
+    /// Thus, offset may not correspond to the actual order of columns.
+    Int32 offset;
 };
 struct IndexInfo
 {
@@ -385,7 +395,12 @@ struct TableInfo
 
     bool isLogicalPartitionTable() const { return is_partition_table && belonging_table_id == DB::InvalidTableID && partition.enable; }
 
-    /// should not be called if is_common_handle = false
+    /// should not be called if is_common_handle = false.
+    /// When using IndexInfo, please avoid relying on the offset info:
+    /// the offset value may be wrong in some cases,
+    /// because we do not update IndexInfo for any DDL action except RENAME,
+    /// while DDL actions like add column / drop column may change the offset of columns.
+    /// Thus, please be very careful if you really have to use the offset information!
     const IndexInfo & getPrimaryIndexInfo() const { return index_infos[0]; }
 
     IndexInfo & getPrimaryIndexInfo() { return index_infos[0]; }
diff --git a/dbms/src/Storages/Transaction/TiKVRecordFormat.h b/dbms/src/Storages/Transaction/TiKVRecordFormat.h
index c507616f6e9..10a7f7220e9 100644
--- a/dbms/src/Storages/Transaction/TiKVRecordFormat.h
+++ b/dbms/src/Storages/Transaction/TiKVRecordFormat.h
@@ -154,9 +154,16 @@ inline TiKVKey genKey(const TiDB::TableInfo & table_info, std::vector<Field> key
     memcpy(key.data() + 1, reinterpret_cast<const char *>(&big_endian_table_id), 8);
     memcpy(key.data() + 1 + 8, RecordKVFormat::RECORD_PREFIX_SEP, 2);
     WriteBufferFromOwnString ss;
+
+    std::unordered_map<String, size_t> column_name_columns_index_map;
+    for (size_t i = 0; i < table_info.columns.size(); i++)
+    {
+        column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+    }
     for (size_t i = 0; i < keys.size(); i++)
     {
-        DB::EncodeDatum(keys[i], table_info.columns[table_info.getPrimaryIndexInfo().idx_cols[i].offset].getCodecFlag(), ss);
+        auto idx = column_name_columns_index_map[table_info.getPrimaryIndexInfo().idx_cols[i].name];
+        DB::EncodeDatum(keys[i], table_info.columns[idx].getCodecFlag(), ss);
     }
     return encodeAsTiKVKey(key + ss.releaseStr());
 }
diff --git a/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h b/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
index 20b395a9952..34e0d3d4104 100644
--- a/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
+++ b/dbms/src/Storages/Transaction/tests/RowCodecTestUtils.h
@@ -237,14 +237,14 @@ std::pair<TableInfo, std::vector<Field>> getTableInfoAndFields(ColumnIDs handle_
     {
         table_info.is_common_handle = true;
         TiDB::IndexInfo index_info;
-        for (size_t i = 0; i < handle_ids.size(); i++)
+        for (auto handle_id : handle_ids)
         {
             TiDB::IndexColumnInfo index_column_info;
-            for (size_t pos = 0; pos < table_info.columns.size(); pos++)
+            for (auto & column : table_info.columns)
             {
-                if (table_info.columns[pos].id == handle_ids[i])
+                if (column.id == handle_id)
                 {
-                    index_column_info.offset = pos;
+                    index_column_info.name = column.name;
                     break;
                 }
             }
diff --git a/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp b/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp
new file mode 100644
index 00000000000..05ab637de7f
--- /dev/null
+++ b/dbms/src/Storages/Transaction/tests/bench_region_block_reader.cpp
@@ -0,0 +1,171 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Storages/DeltaMerge/DeltaMergeDefines.h>
+#include <Storages/Transaction/RegionBlockReader.h>
+#include <benchmark/benchmark.h>
+#include <gtest/gtest.h>
+
+#include "RowCodecTestUtils.h"
+
+using TableInfo = TiDB::TableInfo;
+namespace DB::tests
+{
+using ColumnIDs = std::vector<ColumnID>;
+class RegionBlockReaderBenchTest : public benchmark::Fixture
+{
+protected:
+    Int64 handle_value = 100;
+    UInt8 del_mark_value = 0;
+    UInt64 version_value = 100;
+
+    RegionDataReadInfoList data_list_read;
+    std::unordered_map<ColumnID, Field> fields_map;
+
+    enum RowEncodeVersion
+    {
+        RowV1,
+        RowV2
+    };
+
+protected:
+    void SetUp(const benchmark::State & /*state*/) override
+    {
+        data_list_read.clear();
+        fields_map.clear();
+    }
+
+    /// Encode one row from `fields` (where `fields[i]` is the value of `table_info.columns[i]`)
+    /// and append it `num_rows` times to `data_list_read`; the values are also kept in `fields_map` by column id.
+    /// Throws a LOGICAL_ERROR exception for an unknown `row_version`, std::out_of_range for an unknown PK column name.
+    void encodeColumns(TableInfo & table_info, std::vector<Field> & fields, RowEncodeVersion row_version, size_t num_rows)
+    {
+        // one pass: record values for later check, map column name -> position, and split pk / value fields
+        std::unordered_map<String, size_t> column_name_columns_index_map;
+        std::vector<Field> value_fields;
+        std::vector<Field> pk_fields;
+        for (size_t i = 0; i < table_info.columns.size(); i++)
+        {
+            fields_map.emplace(table_info.columns[i].id, fields[i]);
+            column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+            if (!table_info.columns[i].hasPriKeyFlag())
+                value_fields.emplace_back(fields[i]);
+            else
+                pk_fields.emplace_back(fields[i]);
+        }
+
+        // create PK
+        WriteBufferFromOwnString pk_buf;
+        if (table_info.is_common_handle)
+        {
+            const auto & primary_index_info = table_info.getPrimaryIndexInfo();
+            for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++)
+            {
+                // look up by name with at(): operator[] would silently insert index 0 for an unknown column name
+                auto idx = column_name_columns_index_map.at(primary_index_info.idx_cols[i].name);
+                EncodeDatum(pk_fields[i], table_info.columns[idx].getCodecFlag(), pk_buf);
+            }
+        }
+        else
+        {
+            DB::EncodeInt64(handle_value, pk_buf);
+        }
+        RawTiDBPK pk{std::make_shared<String>(pk_buf.releaseStr())};
+        // create value
+        WriteBufferFromOwnString value_buf;
+        if (row_version == RowEncodeVersion::RowV1)
+        {
+            encodeRowV1(table_info, value_fields, value_buf);
+        }
+        else if (row_version == RowEncodeVersion::RowV2)
+        {
+            encodeRowV2(table_info, value_fields, value_buf);
+        }
+        else
+        {
+            throw Exception("Unknown row format " + std::to_string(row_version), ErrorCodes::LOGICAL_ERROR);
+        }
+        auto row_value = std::make_shared<const TiKVValue>(std::move(value_buf.str()));
+        for (size_t i = 0; i < num_rows; i++)
+            data_list_read.emplace_back(pk, del_mark_value, version_value, row_value);
+    }
+
+    bool decodeColumns(DecodingStorageSchemaSnapshotConstPtr decoding_schema, bool force_decode) const
+    {
+        RegionBlockReader reader{decoding_schema};
+        Block block = createBlockSortByColumnID(decoding_schema);
+        return reader.read(block, data_list_read, force_decode);
+    }
+
+    std::pair<TableInfo, std::vector<Field>> getNormalTableInfoFields(const ColumnIDs & handle_ids, bool is_common_handle) const
+    {
+        return getTableInfoAndFields(
+            handle_ids,
+            is_common_handle,
+            ColumnIDValue(2, handle_value),
+            ColumnIDValue(3, std::numeric_limits<UInt64>::max()),
+            ColumnIDValue(4, std::numeric_limits<Float32>::min()),
+            ColumnIDValue(9, String("aaa")),
+            ColumnIDValue(10, DecimalField(ToDecimal<UInt64, Decimal64>(12345678910ULL, 4), 4)),
+            ColumnIDValueNull<UInt64>(11));
+    }
+};
+
+/// Benchmark: decode `state.range(0)` RowV2-encoded rows of a common-handle table whose primary index covers column ids 2, 3, 4.
+BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, CommonHandle)
+(benchmark::State & state)
+{
+    size_t num_rows = state.range(0);
+    auto [table_info, fields] = getNormalTableInfoFields({2, 3, 4}, true);
+    encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows);
+    auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info);
+    // only the decode loop is timed; DoNotOptimize keeps the otherwise unused result observable
+    for (auto _ : state)
+        benchmark::DoNotOptimize(decodeColumns(decoding_schema, true));
+}
+
+
+/// Benchmark: decode `state.range(0)` RowV2-encoded rows of a non-common-handle table created with handle id EXTRA_HANDLE_COLUMN_ID.
+BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, PKIsNotHandle)
+(benchmark::State & state)
+{
+    size_t num_rows = state.range(0);
+    auto [table_info, fields] = getNormalTableInfoFields({EXTRA_HANDLE_COLUMN_ID}, false);
+    encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows);
+    auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info);
+    // only the decode loop is timed; DoNotOptimize keeps the otherwise unused result observable
+    for (auto _ : state)
+        benchmark::DoNotOptimize(decodeColumns(decoding_schema, true));
+}
+
+/// Benchmark: decode `state.range(0)` RowV2-encoded rows of a non-common-handle table created with handle column id 2.
+BENCHMARK_DEFINE_F(RegionBlockReaderBenchTest, PKIsHandle)
+(benchmark::State & state)
+{
+    size_t num_rows = state.range(0);
+    auto [table_info, fields] = getNormalTableInfoFields({2}, false);
+    encodeColumns(table_info, fields, RowEncodeVersion::RowV2, num_rows);
+    auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info);
+    // only the decode loop is timed; DoNotOptimize keeps the otherwise unused result observable
+    for (auto _ : state)
+        benchmark::DoNotOptimize(decodeColumns(decoding_schema, true));
+}
+
+constexpr size_t num_iterations_test = 1000;
+
+BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, PKIsHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100);
+BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, CommonHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100);
+BENCHMARK_REGISTER_F(RegionBlockReaderBenchTest, PKIsNotHandle)->Iterations(num_iterations_test)->Arg(1)->Arg(10)->Arg(100);
+
+} // namespace DB::tests
diff --git a/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp b/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp
new file mode 100644
index 00000000000..1de9809ecad
--- /dev/null
+++ b/dbms/src/Storages/Transaction/tests/gtest_decoding_storage_schema_snapshot.cpp
@@ -0,0 +1,65 @@
+// Copyright 2022 PingCAP, Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Storages/Transaction/DecodingStorageSchemaSnapshot.h>
+#include <TestUtils/TiFlashTestBasic.h>
+#include <gtest/gtest.h>
+
+#include "RowCodecTestUtils.h"
+
+namespace DB::tests
+{
+static TableInfo getTableInfoByJson(const String & json_table_info)
+{
+    return TableInfo(json_table_info);
+}
+TEST(DecodingStorageSchemaSnapshotTest, CheckPKInfosUnderClusteredIndex)
+{
+    // table with columns [A,B,C,D], primary key columns [A,C]
+    const String json_table_info = R"json({"id":75,"name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"A","L":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":2,"name":{"O":"B","L":"b"},"offset":1,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":15,"Flag":0,"Flen":20,"Decimal":0,"Charset":"utf8mb4","Collate":"utf8mb4_bin","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":3,"name":{"O":"C","L":"c"},"offset":2,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":4,"name":{"O":"D","L":"d"},"offset":3,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"A","L":"a"},"offset":0,"length":-1},{"name":{"O":"C","L":"c"},"offset":2,"length":-1}],"state":5,"comment":"","index_type":1,"is_unique":true,"is_primary":true,"is_invisible":false,"is_global":false}],"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":true,"common_handle_version":1,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":4,"max_idx_id":1,"max_cst_id":0,"update_timestamp":434039123413303302,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"pre_split_regions":0,"partition":null,"compression":"","view":null,"sequence":null,"Lock":null,"version":4,"tiflash_replica":{"Count":1,"LocationLabels":[],"Available":false,"AvailablePartitionIDs":null},"is_columnar":false,"temp_table_type":0,"cache_table_status":0,"policy_ref_info":null,"stats_options":null})json";
+    auto table_info = getTableInfoByJson(json_table_info);
+    auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info);
+
+    // check decoding_schema->pk_column_ids: the column ids of A and C, in primary index order
+    ASSERT_EQ(decoding_schema->pk_column_ids.size(), 2);
+    ASSERT_EQ(decoding_schema->pk_column_ids[0], 1);
+    ASSERT_EQ(decoding_schema->pk_column_ids[1], 3);
+
+    // check decoding_schema->pk_pos_map: one entry per primary key column
+    ASSERT_EQ(decoding_schema->pk_column_ids.size(), decoding_schema->pk_pos_map.size());
+    // there are three hidden columns in the decoded block, so the positions of A,C are 3,5
+    ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[0]), 3);
+    ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[1]), 5);
+}
+
+TEST(DecodingStorageSchemaSnapshotTest, CheckPKInfosUnderClusteredIndexAfterDropColumn)
+{
+    // drop column B from [A,B,C,D]: table with columns [A,C,D], primary key columns [A,C]; the stored offsets are left stale on purpose
+    const String json_table_info = R"json({"id":75,"name":{"O":"test","L":"test"},"charset":"utf8mb4","collate":"utf8mb4_bin","cols":[{"id":1,"name":{"O":"A","L":"a"},"offset":0,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":3,"name":{"O":"C","L":"c"},"offset":2,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":4099,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2},{"id":4,"name":{"O":"D","L":"d"},"offset":3,"origin_default":null,"origin_default_bit":null,"default":null,"default_bit":null,"default_is_expr":false,"generated_expr_string":"","generated_stored":false,"dependences":null,"type":{"Tp":3,"Flag":0,"Flen":11,"Decimal":0,"Charset":"binary","Collate":"binary","Elems":null},"state":5,"comment":"","hidden":false,"change_state_info":null,"version":2}],"index_info":[{"id":1,"idx_name":{"O":"PRIMARY","L":"primary"},"tbl_name":{"O":"","L":""},"idx_cols":[{"name":{"O":"A","L":"a"},"offset":0,"length":-1},{"name":{"O":"C","L":"c"},"offset":2,"length":-1}],"state":5,"comment":"","index_type":1,"is_unique":true,"is_primary":true,"is_invisible":false,"is_global":false}],"constraint_info":null,"fk_info":null,"state":5,"pk_is_handle":false,"is_common_handle":true,"common_handle_version":1,"comment":"","auto_inc_id":0,"auto_id_cache":0,"auto_rand_id":0,"max_col_id":4,"max_idx_id":1,"max_cst_id":0,"update_timestamp":434039123413303302,"ShardRowIDBits":0,"max_shard_row_id_bits":0,"auto_random_bits":0,"pre_split_regions":0,"partition":null,"compression":"","view":null,"sequence":null,"Lock":null,"version":4,"tiflash_replica":{"Count":1,"LocationLabels":[],"Available":false,"AvailablePartitionIDs":null},"is_columnar":false,"temp_table_type":0,"cache_table_status":0,"policy_ref_info":null,"stats_options":null})json";
+    auto table_info = getTableInfoByJson(json_table_info);
+    auto decoding_schema = getDecodingStorageSchemaSnapshot(table_info);
+
+    // check decoding_schema->pk_column_ids: the column ids of A and C, in primary index order
+    ASSERT_EQ(decoding_schema->pk_column_ids.size(), 2);
+    ASSERT_EQ(decoding_schema->pk_column_ids[0], 1);
+    ASSERT_EQ(decoding_schema->pk_column_ids[1], 3);
+
+    // check decoding_schema->pk_pos_map: one entry per primary key column
+    ASSERT_EQ(decoding_schema->pk_column_ids.size(), decoding_schema->pk_pos_map.size());
+    // there are three hidden columns in the decoded block, so the positions of A,C are 3,4
+    ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[0]), 3);
+    ASSERT_EQ(decoding_schema->pk_pos_map.at(decoding_schema->pk_column_ids[1]), 4);
+}
+
+} // namespace DB::tests
diff --git a/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp b/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp
index 6a883230854..d08b4dd3738 100644
--- a/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp
+++ b/dbms/src/Storages/Transaction/tests/gtest_region_block_reader.cpp
@@ -26,13 +26,13 @@ using ColumnIDs = std::vector<ColumnID>;
 class RegionBlockReaderTestFixture : public ::testing::Test
 {
 protected:
-    Int64 handle_value_ = 100;
-    UInt8 del_mark_value_ = 0;
-    UInt64 version_value_ = 100;
-    size_t rows_ = 3;
+    Int64 handle_value = 100;
+    UInt8 del_mark_value = 0;
+    UInt64 version_value = 100;
+    size_t rows = 3;
 
-    RegionDataReadInfoList data_list_read_;
-    std::unordered_map<ColumnID, Field> fields_map_;
+    RegionDataReadInfoList data_list_read;
+    std::unordered_map<ColumnID, Field> fields_map;
 
     enum RowEncodeVersion
     {
@@ -43,8 +43,8 @@ class RegionBlockReaderTestFixture : public ::testing::Test
 protected:
     void SetUp() override
     {
-        data_list_read_.clear();
-        fields_map_.clear();
+        data_list_read.clear();
+        fields_map.clear();
     }
 
     void TearDown() override {}
@@ -52,8 +52,12 @@ class RegionBlockReaderTestFixture : public ::testing::Test
     void encodeColumns(TableInfo & table_info, std::vector<Field> & fields, RowEncodeVersion row_version)
     {
         // for later check
+        std::unordered_map<String, size_t> column_name_columns_index_map;
         for (size_t i = 0; i < table_info.columns.size(); i++)
-            fields_map_.emplace(table_info.columns[i].id, fields[i]);
+        {
+            fields_map.emplace(table_info.columns[i].id, fields[i]);
+            column_name_columns_index_map.emplace(table_info.columns[i].name, i);
+        }
 
         std::vector<Field> value_fields;
         std::vector<Field> pk_fields;
@@ -72,13 +76,13 @@ class RegionBlockReaderTestFixture : public ::testing::Test
             auto & primary_index_info = table_info.getPrimaryIndexInfo();
             for (size_t i = 0; i < primary_index_info.idx_cols.size(); i++)
             {
-                size_t pk_offset = primary_index_info.idx_cols[i].offset;
-                EncodeDatum(pk_fields[i], table_info.columns[pk_offset].getCodecFlag(), pk_buf);
+                auto idx = column_name_columns_index_map[primary_index_info.idx_cols[i].name];
+                EncodeDatum(pk_fields[i], table_info.columns[idx].getCodecFlag(), pk_buf);
             }
         }
         else
         {
-            DB::EncodeInt64(handle_value_, pk_buf);
+            DB::EncodeInt64(handle_value, pk_buf);
         }
         RawTiDBPK pk{std::make_shared<String>(pk_buf.releaseStr())};
         // create value
@@ -96,44 +100,44 @@ class RegionBlockReaderTestFixture : public ::testing::Test
             throw Exception("Unknown row format " + std::to_string(row_version), ErrorCodes::LOGICAL_ERROR);
         }
         auto row_value = std::make_shared<const TiKVValue>(std::move(value_buf.str()));
-        for (size_t i = 0; i < rows_; i++)
-            data_list_read_.emplace_back(pk, del_mark_value_, version_value_, row_value);
+        for (size_t i = 0; i < rows; i++)
+            data_list_read.emplace_back(pk, del_mark_value, version_value, row_value);
     }
 
     void checkBlock(DecodingStorageSchemaSnapshotConstPtr decoding_schema, const Block & block) const
     {
         ASSERT_EQ(block.columns(), decoding_schema->column_defines->size());
-        for (size_t row = 0; row < rows_; row++)
+        for (size_t row = 0; row < rows; row++)
         {
             for (size_t pos = 0; pos < block.columns(); pos++)
             {
-                auto & column_element = block.getByPosition(pos);
+                const auto & column_element = block.getByPosition(pos);
                 if (row == 0)
                 {
-                    ASSERT_EQ(column_element.column->size(), rows_);
+                    ASSERT_EQ(column_element.column->size(), rows);
                 }
                 if (column_element.name == EXTRA_HANDLE_COLUMN_NAME)
                 {
                     if (decoding_schema->is_common_handle)
                     {
-                        ASSERT_EQ((*column_element.column)[row], Field(*std::get<0>(data_list_read_[row])));
+                        ASSERT_EQ((*column_element.column)[row], Field(*std::get<0>(data_list_read[row])));
                     }
                     else
                     {
-                        ASSERT_EQ((*column_element.column)[row], Field(handle_value_));
+                        ASSERT_EQ((*column_element.column)[row], Field(handle_value));
                     }
                 }
                 else if (column_element.name == VERSION_COLUMN_NAME)
                 {
-                    ASSERT_EQ((*column_element.column)[row], Field(version_value_));
+                    ASSERT_EQ((*column_element.column)[row], Field(version_value));
                 }
                 else if (column_element.name == TAG_COLUMN_NAME)
                 {
-                    ASSERT_EQ((*column_element.column)[row], Field(NearestFieldType<UInt8>::Type(del_mark_value_)));
+                    ASSERT_EQ((*column_element.column)[row], Field(NearestFieldType<UInt8>::Type(del_mark_value)));
                 }
                 else
                 {
-                    ASSERT_EQ((*column_element.column)[row], fields_map_.at(column_element.column_id));
+                    ASSERT_EQ((*column_element.column)[row], fields_map.at(column_element.column_id));
                 }
             }
         }
@@ -143,7 +147,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test
     {
         RegionBlockReader reader{decoding_schema};
         Block block = createBlockSortByColumnID(decoding_schema);
-        if (!reader.read(block, data_list_read_, force_decode))
+        if (!reader.read(block, data_list_read, force_decode))
             return false;
 
         checkBlock(decoding_schema, block);
@@ -155,7 +159,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test
         return getTableInfoAndFields(
             handle_ids,
             is_common_handle,
-            ColumnIDValue(2, handle_value_),
+            ColumnIDValue(2, handle_value),
             ColumnIDValue(3, std::numeric_limits<UInt64>::max()),
             ColumnIDValue(4, std::numeric_limits<Float32>::min()),
             ColumnIDValue(9, String("aaa")),
@@ -170,7 +174,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test
             handle_ids,
             is_common_handle,
             ColumnIDValue(1, String("")),
-            ColumnIDValue(2, handle_value_),
+            ColumnIDValue(2, handle_value),
             ColumnIDValue(3, std::numeric_limits<UInt64>::max()),
             ColumnIDValue(4, std::numeric_limits<Float32>::min()),
             ColumnIDValue(8, String("")),
@@ -182,12 +186,12 @@ class RegionBlockReaderTestFixture : public ::testing::Test
         // add default value for missing column
         std::vector<ColumnID> missing_column_ids{1, 8, 13};
         String missing_column_default_value = String("default");
-        for (size_t i = 0; i < table_info.columns.size(); i++)
+        for (auto & column : table_info.columns)
         {
-            if (std::find(missing_column_ids.begin(), missing_column_ids.end(), table_info.columns[i].id) != missing_column_ids.end())
+            if (std::find(missing_column_ids.begin(), missing_column_ids.end(), column.id) != missing_column_ids.end())
             {
-                table_info.columns[i].origin_default_value = missing_column_default_value;
-                fields_map_.emplace(table_info.columns[i].id, Field(missing_column_default_value));
+                column.origin_default_value = missing_column_default_value;
+                fields_map.emplace(column.id, Field(missing_column_default_value));
             }
         }
         return table_info;
@@ -199,7 +203,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test
         std::tie(table_info, std::ignore) = getTableInfoAndFields(
             handle_ids,
             is_common_handle,
-            ColumnIDValue(2, handle_value_),
+            ColumnIDValue(2, handle_value),
             ColumnIDValue(4, std::numeric_limits<Float32>::min()),
             ColumnIDValue(9, String("aaa")),
             ColumnIDValue(10, DecimalField(ToDecimal<UInt64, Decimal64>(12345678910ULL, 4), 4)));
@@ -212,7 +216,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test
         std::tie(table_info, std::ignore) = getTableInfoAndFields(
             handle_ids,
             is_common_handle,
-            ColumnIDValue(2, handle_value_),
+            ColumnIDValue(2, handle_value),
             ColumnIDValue(3, std::numeric_limits<UInt8>::max()),
             ColumnIDValue(4, std::numeric_limits<Float32>::min()),
             ColumnIDValue(9, String("aaa")),
@@ -227,7 +231,7 @@ class RegionBlockReaderTestFixture : public ::testing::Test
         std::tie(table_info, std::ignore) = getTableInfoAndFields(
             handle_ids,
             is_common_handle,
-            ColumnIDValue(2, handle_value_),
+            ColumnIDValue(2, handle_value),
             ColumnIDValue(3, std::numeric_limits<UInt64>::max()),
             ColumnIDValue(4, std::numeric_limits<Float32>::min()),
             ColumnIDValue(9, String("aaa")),
diff --git a/dbms/src/TestUtils/FunctionTestUtils.cpp b/dbms/src/TestUtils/FunctionTestUtils.cpp
index 637fbf51c00..7fb526aeb01 100644
--- a/dbms/src/TestUtils/FunctionTestUtils.cpp
+++ b/dbms/src/TestUtils/FunctionTestUtils.cpp
@@ -108,14 +108,15 @@ void blockEqual(
     const Block & actual)
 {
     size_t columns = actual.columns();
+    size_t expected_columns = expected.columns();
 
-    ASSERT_TRUE(expected.columns() == columns);
+    ASSERT_EQ(expected_columns, columns);
 
     for (size_t i = 0; i < columns; ++i)
     {
         const auto & expected_col = expected.getByPosition(i);
         const auto & actual_col = actual.getByPosition(i);
-        ASSERT_TRUE(actual_col.type->getName() == expected_col.type->getName());
+        ASSERT_EQ(actual_col.type->getName(), expected_col.type->getName());
         ASSERT_COLUMN_EQ(expected_col.column, actual_col.column);
     }
 }
diff --git a/dbms/src/TestUtils/mockExecutor.cpp b/dbms/src/TestUtils/mockExecutor.cpp
index 2cf8a939b58..9a6e92dd9c1 100644
--- a/dbms/src/TestUtils/mockExecutor.cpp
+++ b/dbms/src/TestUtils/mockExecutor.cpp
@@ -219,6 +219,11 @@ DAGRequestBuilder & DAGRequestBuilder::project(MockAsts exprs)
 }
 
 DAGRequestBuilder & DAGRequestBuilder::project(MockColumnNames col_names)
+{
+    return project(MockColumnNamesVec(col_names));
+}
+
+DAGRequestBuilder & DAGRequestBuilder::project(MockColumnNamesVec col_names)
 {
     assert(root);
     auto exp_list = std::make_shared<ASTExpressionList>();
diff --git a/dbms/src/TestUtils/mockExecutor.h b/dbms/src/TestUtils/mockExecutor.h
index c11635ac93e..bad92c4226d 100644
--- a/dbms/src/TestUtils/mockExecutor.h
+++ b/dbms/src/TestUtils/mockExecutor.h
@@ -31,6 +31,7 @@ using MockOrderByItems = std::initializer_list<MockOrderByItem>;
 using MockPartitionByItem = std::pair<String, bool>;
 using MockPartitionByItems = std::initializer_list<MockPartitionByItem>;
 using MockColumnNames = std::initializer_list<String>;
+using MockColumnNamesVec = std::vector<String>;
 using MockAsts = std::initializer_list<ASTPtr>;
 using MockWindowFrame = mock::MockWindowFrame;
 
@@ -84,6 +85,7 @@ class DAGRequestBuilder
     DAGRequestBuilder & project(const String & col_name);
     DAGRequestBuilder & project(MockAsts expr);
     DAGRequestBuilder & project(MockColumnNames col_names);
+    DAGRequestBuilder & project(MockColumnNamesVec col_names);
 
     DAGRequestBuilder & exchangeSender(tipb::ExchangeType exchange_type);
 
@@ -181,8 +183,8 @@ MockWindowFrame buildDefaultRowsFrame();
 #define gt(expr1, expr2) makeASTFunction("greater", (expr1), (expr2))
 #define And(expr1, expr2) makeASTFunction("and", (expr1), (expr2))
 #define Or(expr1, expr2) makeASTFunction("or", (expr1), (expr2))
-#define NOT(expr) makeASTFunction("not", (expr1), (expr2))
-#define Max(expr) makeASTFunction("max", expr)
+#define NOT(expr) makeASTFunction("not", (expr))
+#define Max(expr) makeASTFunction("max", (expr))
 /// Window functions
 #define RowNumber() makeASTFunction("RowNumber")
 #define Rank() makeASTFunction("Rank")
diff --git a/dbms/src/TiDB/Schema/SchemaGetter.cpp b/dbms/src/TiDB/Schema/SchemaGetter.cpp
index 7f52f9301b1..6e333d6ba87 100644
--- a/dbms/src/TiDB/Schema/SchemaGetter.cpp
+++ b/dbms/src/TiDB/Schema/SchemaGetter.cpp
@@ -19,7 +19,6 @@
 
 namespace DB
 {
-
 namespace ErrorCodes
 {
 extern const int SCHEMA_SYNC_ERROR;
@@ -188,18 +187,26 @@ Int64 SchemaGetter::getVersion()
     return std::stoll(ver);
 }
 
+bool SchemaGetter::checkSchemaDiffExists(Int64 ver)
+{
+    String key = getSchemaDiffKey(ver);
+    String data = TxnStructure::get(snap, key);
+    return !data.empty();
+}
+
 String SchemaGetter::getSchemaDiffKey(Int64 ver)
 {
     return std::string(schemaDiffPrefix) + ":" + std::to_string(ver);
 }
 
-SchemaDiff SchemaGetter::getSchemaDiff(Int64 ver)
+std::optional<SchemaDiff> SchemaGetter::getSchemaDiff(Int64 ver)
 {
     String key = getSchemaDiffKey(ver);
     String data = TxnStructure::get(snap, key);
     if (data.empty())
     {
-        throw TiFlashException("cannot find schema diff for version: " + std::to_string(ver), Errors::Table::SyncError);
+        LOG_FMT_WARNING(log, "The schema diff for version {}, key {} is empty.", ver, key);
+        return std::nullopt;
     }
     SchemaDiff diff;
     diff.deserialize(data);
diff --git a/dbms/src/TiDB/Schema/SchemaGetter.h b/dbms/src/TiDB/Schema/SchemaGetter.h
index 02d2f7a7c88..fe0ecd59af0 100644
--- a/dbms/src/TiDB/Schema/SchemaGetter.h
+++ b/dbms/src/TiDB/Schema/SchemaGetter.h
@@ -26,6 +26,8 @@
 
 #include <common/logger_useful.h>
 
+#include <optional>
+
 namespace DB
 {
 // The enum results are completely the same as the DDL Action listed in the "parser/model/ddl.go" of TiDB codebase, which must be keeping in sync.
@@ -138,7 +140,9 @@ struct SchemaGetter
 
     Int64 getVersion();
 
-    SchemaDiff getSchemaDiff(Int64 ver);
+    bool checkSchemaDiffExists(Int64 ver);
+
+    std::optional<SchemaDiff> getSchemaDiff(Int64 ver);
 
     static String getSchemaDiffKey(Int64 ver);
 
diff --git a/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h b/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h
index 4fdba195acb..a23aeab139f 100644
--- a/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h
+++ b/dbms/src/TiDB/Schema/TiDBSchemaSyncer.h
@@ -106,21 +106,31 @@ struct TiDBSchemaSyncer : public SchemaSyncer
         Stopwatch watch;
         SCOPE_EXIT({ GET_METRIC(tiflash_schema_apply_duration_seconds).Observe(watch.elapsedSeconds()); });
 
-        LOG_FMT_INFO(log, "start to sync schemas. current version is: {} and try to sync schema version to: {}", cur_version, version);
+        LOG_FMT_INFO(log, "Start to sync schemas. current version is: {} and try to sync schema version to: {}", cur_version, version);
 
         // Show whether the schema mutex is held for a long time or not.
         GET_METRIC(tiflash_schema_applying).Set(1.0);
         SCOPE_EXIT({ GET_METRIC(tiflash_schema_applying).Set(0.0); });
 
         GET_METRIC(tiflash_schema_apply_count, type_diff).Increment();
-        if (!tryLoadSchemaDiffs(getter, version, context))
+        // Since the concurrent DDL feature, TiDB does `update schema version` before `set schema diff`, and they are done in separate transactions.
+        // So TiFlash may see a schema version X but no schema diff X, meaning that the transaction of schema diff X has not been committed or has
+        // been aborted.
+        // However, TiDB makes sure that if we get a schema version X, then the schema diff X-1 must exist. Otherwise the transaction of schema diff
+        // X-1 is aborted and we can safely ignore it.
+        // Since TiDB cannot guarantee that the schema diff of the latest schema version X is not empty, in that situation we should set `cur_version`
+        // to X-1 and try to fetch the schema diff X next time.
+        Int64 version_after_load_diff = 0;
+        if (version_after_load_diff = tryLoadSchemaDiffs(getter, version, context); version_after_load_diff == -1)
         {
             GET_METRIC(tiflash_schema_apply_count, type_full).Increment();
             loadAllSchema(getter, version, context);
+            // After loadAllSchema, set `version_after_load_diff` according to whether the schema diff of the latest version exists.
+            version_after_load_diff = getter.checkSchemaDiffExists(version) ? version : version - 1;
         }
-        cur_version = version;
+        cur_version = version_after_load_diff;
         GET_METRIC(tiflash_schema_version).Set(cur_version);
-        LOG_FMT_INFO(log, "end sync schema, version has been updated to {}", cur_version);
+        LOG_FMT_INFO(log, "End sync schema, version has been updated to {}{}", cur_version, cur_version == version ? "" : "(latest diff is empty)");
         return true;
     }
 
@@ -144,30 +154,60 @@ struct TiDBSchemaSyncer : public SchemaSyncer
         return it->second;
     }
 
-    bool tryLoadSchemaDiffs(Getter & getter, Int64 version, Context & context)
+    // Return Values
+    // - if latest schema diff is not empty, return the (latest_version)
+    // - if latest schema diff is empty, return the (latest_version - 1)
+    // - if error happened, return (-1)
+    Int64 tryLoadSchemaDiffs(Getter & getter, Int64 latest_version, Context & context)
     {
-        if (isTooOldSchema(cur_version, version))
+        if (isTooOldSchema(cur_version, latest_version))
         {
-            return false;
+            return -1;
         }
 
-        LOG_FMT_DEBUG(log, "try load schema diffs.");
+        LOG_FMT_DEBUG(log, "Try load schema diffs.");
 
-        SchemaBuilder<Getter, NameMapper> builder(getter, context, databases, version);
+        SchemaBuilder<Getter, NameMapper> builder(getter, context, databases, latest_version);
 
         Int64 used_version = cur_version;
-        std::vector<SchemaDiff> diffs;
-        while (used_version < version)
+        // First fetch all schema diffs from `cur_version + 1` to `latest_version`. Only apply the schema diff(s) if all of
+        // them were fetched without any exception.
+        std::vector<std::optional<SchemaDiff>> diffs;
+        while (used_version < latest_version)
         {
             used_version++;
             diffs.push_back(getter.getSchemaDiff(used_version));
         }
-        LOG_FMT_DEBUG(log, "end load schema diffs with total {} entries.", diffs.size());
+        LOG_FMT_DEBUG(log, "End load schema diffs with total {} entries.", diffs.size());
+
         try
         {
-            for (const auto & diff : diffs)
+            for (size_t diff_index = 0; diff_index < diffs.size(); ++diff_index)
             {
-                builder.applyDiff(diff);
+                const auto & schema_diff = diffs[diff_index];
+
+                if (!schema_diff)
+                {
+                    // If the schema diff of `latest_version` is empty, we do not advance to
+                    // `latest_version`; we only advance to `latest_version - 1`.
+                    // If a schema diff in the range (`cur_version`, `latest_version - 1`] is empty,
+                    // we just skip it.
+                    //
+                    // example:
+                    //  - `cur_version` is 1, `latest_version` is 10
+                    //  - The schema diffs of schema versions [2,4,6] are empty, so we just skip them.
+                    //  - The schema diff of schema version 10 is empty, so we only advance the version to 9.
+                    if (diff_index != diffs.size() - 1)
+                    {
+                        LOG_FMT_WARNING(log, "Skip the schema diff from version {}. ", cur_version + diff_index + 1);
+                        continue;
+                    }
+
+                    // The latest diff is empty (`used_version` equals `latest_version` here), so report `latest_version - 1`.
+                    return used_version - 1;
+                }
+
+                builder.applyDiff(*schema_diff);
             }
         }
         catch (TiFlashException & e)
@@ -177,7 +217,7 @@ struct TiDBSchemaSyncer : public SchemaSyncer
                 GET_METRIC(tiflash_schema_apply_count, type_failed).Increment();
             }
             LOG_FMT_WARNING(log, "apply diff meets exception : {} \n stack is {}", e.displayText(), e.getStackTrace().toString());
-            return false;
+            return -1;
         }
         catch (Exception & e)
         {
@@ -187,21 +227,22 @@ struct TiDBSchemaSyncer : public SchemaSyncer
             }
             GET_METRIC(tiflash_schema_apply_count, type_failed).Increment();
             LOG_FMT_WARNING(log, "apply diff meets exception : {} \n stack is {}", e.displayText(), e.getStackTrace().toString());
-            return false;
+            return -1;
         }
         catch (Poco::Exception & e)
         {
             GET_METRIC(tiflash_schema_apply_count, type_failed).Increment();
             LOG_FMT_WARNING(log, "apply diff meets exception : {}", e.displayText());
-            return false;
+            return -1;
         }
         catch (std::exception & e)
         {
             GET_METRIC(tiflash_schema_apply_count, type_failed).Increment();
             LOG_FMT_WARNING(log, "apply diff meets exception : {}", e.what());
-            return false;
+            return -1;
         }
-        return true;
+
+        return used_version;
     }
 
     void loadAllSchema(Getter & getter, Int64 version, Context & context)
diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt
index 5fd25c5d238..2bedb312d07 100644
--- a/libs/libcommon/CMakeLists.txt
+++ b/libs/libcommon/CMakeLists.txt
@@ -198,3 +198,7 @@ if (ARCH_AMD64)
             src/crc64_sse2_asimd.cpp
             APPEND COMPILE_FLAGS "-mpclmul")
 endif()
+
+if (ARCH_AARCH64 AND ARCH_LINUX)
+    target_link_libraries (common PUBLIC tiflash-aarch64-string tiflash-aarch64-math)
+endif()
diff --git a/tests/fullstack-test-dt/clustered_index/ddl.test b/tests/fullstack-test-dt/clustered_index/ddl.test
index 8abe450c11a..6c4925c9619 100644
--- a/tests/fullstack-test-dt/clustered_index/ddl.test
+++ b/tests/fullstack-test-dt/clustered_index/ddl.test
@@ -66,3 +66,89 @@ mysql> set session tidb_isolation_read_engines='tiflash'; select * from test.t_2
 
 mysql> drop table test.t_1;
 mysql> drop table test.t_2;
+
+### Issue 5154: check whether add column/drop column affects the clustered index decoding
+### drop the column between two columns that are clustered index columns
+
+mysql> drop table if exists test.t_3;
+mysql> create table test.t_3 (A int, B varchar(20), C int, D int, PRIMARY KEY(A,C) CLUSTERED);
+mysql> insert into test.t_3 values (1,'1',1,1),(2,'2',2,2);
+
+mysql> alter table test.t_3 set tiflash replica 1;
+
+func> wait_table test t_3
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3;
++---+---+---+---+
+| A | B | C | D |
++---+---+---+---+
+| 1 | 1 | 1 | 1 |
+| 2 | 2 | 2 | 2 |
++---+---+---+---+
+
+mysql> alter table test.t_3 drop column B;
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3;
++---+---+---+
+| A | C | D |
++---+---+---+
+| 1 | 1 | 1 |
+| 2 | 2 | 2 |
++---+---+---+
+
+# insert some rows
+mysql> insert into test.t_3 values (3,3,3),(4,4,4);
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_3;
++---+---+---+
+| A | C | D |
++---+---+---+
+| 1 | 1 | 1 |
+| 2 | 2 | 2 |
+| 3 | 3 | 3 |
+| 4 | 4 | 4 |
++---+---+---+
+
+mysql> drop table test.t_3;
+
+### add the column between two columns that are clustered index columns
+mysql> drop table if exists test.t_4
+mysql> create table test.t_4 (A int, B varchar(20), C int, D int, PRIMARY KEY(A,C) CLUSTERED);
+
+mysql> insert into test.t_4 values (1,'1',1,1),(2,'2',2,2);
+
+mysql> alter table test.t_4 set tiflash replica 1;
+
+func> wait_table test t_4
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_4;
++---+---+---+---+
+| A | B | C | D |
++---+---+---+---+
+| 1 | 1 | 1 | 1 |
+| 2 | 2 | 2 | 2 |
++---+---+---+---+
+
+mysql> alter table test.t_4 Add column E int after B;
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_4;
++---+---+------+---+---+
+| A | B |  E   | C | D |
++---+---+------+---+---+
+| 1 | 1 | NULL | 1 | 1 |
+| 2 | 2 | NULL | 2 | 2 |
++---+---+------+---+---+
+
+mysql> insert into test.t_4 values (3,'3',3,3,3),(4,'4',4,4,4);
+
+mysql> set session tidb_isolation_read_engines='tiflash';select * from test.t_4;
++---+---+------+------+------+
+| A | B |  E   |  C   |   D  |
++---+---+------+------+------+
+| 1 | 1 | NULL |  1   |   1  |
+| 2 | 2 | NULL |  2   |   2  |
+| 3 | 3 |    3 |  3   |   3  |
+| 4 | 4 |    4 |  4   |   4  |
++---+---+------+------+------+
+
+mysql> drop table test.t_4;
\ No newline at end of file
diff --git a/tests/fullstack-test/mpp/issue_2471.test b/tests/fullstack-test/mpp/issue_2471.test
index 4a1528595e8..497ce605893 100644
--- a/tests/fullstack-test/mpp/issue_2471.test
+++ b/tests/fullstack-test/mpp/issue_2471.test
@@ -35,7 +35,15 @@ mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_opt_bro
 => DBGInvoke __enable_fail_point(exception_in_creating_set_input_stream)
 
 mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_opt_broadcast_cartesian_join=2; select * from a as t1 left join a as t2 on t1.id = t2.id;
-ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_in_creating_set_input_stream is triggered.
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_in_creating_set_input_stream is triggered., e.what() = DB::Exception, Stack trace:
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
 
 => DBGInvoke __disable_fail_point(exception_in_creating_set_input_stream)
 
diff --git a/tests/fullstack-test/mpp/mpp_fail.test b/tests/fullstack-test/mpp/mpp_fail.test
index 7af5fef3f89..e03c6150be6 100644
--- a/tests/fullstack-test/mpp/mpp_fail.test
+++ b/tests/fullstack-test/mpp/mpp_fail.test
@@ -71,20 +71,44 @@ ERROR 1105 (HY000) at line 1: DB::Exception: Fail point FailPoints::exception_be
 ## exception during mpp run non root task
 => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run)
 mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id;
-ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered.
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception, Stack trace:
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
 => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run)
 
 ## exception during mpp run root task
 => DBGInvoke __enable_fail_point(exception_during_mpp_root_task_run)
 mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id;
-ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_during_mpp_root_task_run is triggered.
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_root_task_run is triggered., e.what() = DB::Exception, Stack trace:
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
 => DBGInvoke __disable_fail_point(exception_during_mpp_root_task_run)
 
 ## exception during mpp write err to tunnel
 => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run)
 => DBGInvoke __enable_fail_point(exception_during_mpp_write_err_to_tunnel)
 mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id;
-ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : Failed to write error msg to tunnel
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Failed to write error msg to tunnel, e.what() = DB::Exception, Stack trace:
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
 => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run)
 => DBGInvoke __disable_fail_point(exception_during_mpp_write_err_to_tunnel)
 
@@ -92,7 +116,14 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchang
 => DBGInvoke __enable_fail_point(exception_during_mpp_non_root_task_run)
 => DBGInvoke __enable_fail_point(exception_during_mpp_close_tunnel)
 mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; select count(value), id from t group by id;
-ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchange receiver meet error : DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered.
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 0, e.displayText() = DB::Exception: Exchange receiver meet error : Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_during_mpp_non_root_task_run is triggered., e.what() = DB::Exception, Stack trace:
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
 => DBGInvoke __disable_fail_point(exception_during_mpp_non_root_task_run)
 => DBGInvoke __disable_fail_point(exception_during_mpp_close_tunnel)
 
@@ -125,7 +156,16 @@ ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Exchang
 ## ensure build1, build2-probe1, probe2 in the CreatingSets, test the bug where build1 throw exception but not change the build state, thus block the build2-probe1, at last this query hangs.
 => DBGInvoke __enable_fail_point(exception_mpp_hash_build)
 mysql> use test; set @@tidb_isolation_read_engines='tiflash'; set @@tidb_allow_mpp=1; set @@tidb_broadcast_join_threshold_count=0; set @@tidb_broadcast_join_threshold_size=0; select t1.id from test.t t1 join test.t t2 on t1.id = t2.id and t1.id <2 join (select id from test.t group by id) t3 on t2.id=t3.id;
-ERROR 1105 (HY000) at line 1: other error for mpp stream: DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered.
+ERROR 1105 (HY000) at line 1: other error for mpp stream: Code: 10007, e.displayText() = DB::Exception: Fail point FailPoints::exception_mpp_hash_build is triggered., e.what() = DB::Exception, Stack trace:
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
+{#LINE}
 => DBGInvoke __disable_fail_point(exception_mpp_hash_build)
 
 # Clean up.
diff --git a/tests/run-test.py b/tests/run-test.py
index 843fe7c79b4..a2bcee0ce99 100644
--- a/tests/run-test.py
+++ b/tests/run-test.py
@@ -29,6 +29,7 @@
 UNFINISHED_1_PREFIX = '\t'
 UNFINISHED_2_PREFIX = '   '
 WORD_PH = '{#WORD}'
+LINE_PH = '{#LINE}'
 CURL_TIDB_STATUS_PREFIX = 'curl_tidb> '
 
 verbose = False
@@ -138,18 +139,22 @@ def match_ph_word(line):
 
 # TODO: Support more place holders, eg: {#NUMBER}
 def compare_line(line, template):
-    while True:
-        i = template.find(WORD_PH)
-        if i < 0:
-            return line == template
-        else:
-            if line[:i] != template[:i]:
-                return False
-            j = match_ph_word(line[i:])
-            if j == 0:
-                return False
-            template = template[i + len(WORD_PH):]
-            line = line[i + j:]
+    l = template.find(LINE_PH)
+    if l >= 0:
+        return True
+    else:
+        while True:
+            i = template.find(WORD_PH)
+            if i < 0:
+                return line == template
+            else:
+                if line[:i] != template[:i]:
+                    return False
+                j = match_ph_word(line[i:])
+                if j == 0:
+                    return False
+                template = template[i + len(WORD_PH):]
+                line = line[i + j:]
 
 
 class MySQLCompare:
@@ -194,11 +199,14 @@ def matched(outputs, matches):
             b = MySQLCompare.parse_excepted_outputs(matches)
             return a == b
         else:
-            if len(outputs) != len(matches):
+            if len(outputs) > len(matches):
                 return False
             for i in range(0, len(outputs)):
                 if not compare_line(outputs[i], matches[i]):
                     return False
+            for i in range(len(outputs), len(matches)):
+                if not compare_line("", matches[i]):
+                    return False
             return True
 
 
@@ -212,11 +220,14 @@ def matched(outputs, matches, fuzz):
         b = parse_table_parts(matches, fuzz)
         return a == b
     else:
-        if len(outputs) != len(matches):
+        if len(outputs) > len(matches):
             return False
         for i in range(0, len(outputs)):
             if not compare_line(outputs[i], matches[i]):
                 return False
+        for i in range(len(outputs), len(matches)):
+            if not compare_line("", matches[i]):
+                return False
         return True