diff --git a/runtime/core/cmake/xpu.cmake b/runtime/core/cmake/xpu.cmake
new file mode 100644
index 000000000..38418671b
--- /dev/null
+++ b/runtime/core/cmake/xpu.cmake
@@ -0,0 +1,37 @@
+# ANSI colour escapes for configure messages; left unset (empty) on Windows.
+if(NOT WIN32)
+  string(ASCII 27 Esc)
+  set(ColourReset "${Esc}[m")
+  set(ColourBold  "${Esc}[1m")
+  set(Red         "${Esc}[31m")
+  set(Green       "${Esc}[32m")
+  set(Yellow      "${Esc}[33m")
+  set(Blue        "${Esc}[34m")
+  set(Magenta     "${Esc}[35m")
+  set(Cyan        "${Esc}[36m")
+  set(White       "${Esc}[37m")
+  set(BoldRed     "${Esc}[1;31m")
+  set(BoldGreen   "${Esc}[1;32m")
+  set(BoldYellow  "${Esc}[1;33m")
+  set(BoldBlue    "${Esc}[1;34m")
+  set(BoldMagenta "${Esc}[1;35m")
+  set(BoldCyan    "${Esc}[1;36m")
+  set(BoldWhite   "${Esc}[1;37m")
+endif()
+
+if(XPU)
+  set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR})
+  message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n")
+  set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu)
+  # An unset or empty XPU_API_PATH would turn the SDK paths below into "/output/...".
+  if("$ENV{XPU_API_PATH}" STREQUAL "")
+    message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n")
+  endif()
+  set(XPU_API_PATH "$ENV{XPU_API_PATH}")
+  message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.")
+
+  include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/
+                      ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include)
+  link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/)
+  add_definitions(-DUSE_XPU)
+endif()
diff --git a/runtime/core/decoder/CMakeLists.txt b/runtime/core/decoder/CMakeLists.txt
index cfa439f42..098fdcdb5 100644
--- a/runtime/core/decoder/CMakeLists.txt
+++ b/runtime/core/decoder/CMakeLists.txt
@@ -7,8 +7,8 @@ set(decoder_srcs
   ctc_endpoint.cc
 )
 
-if(NOT TORCH AND NOT ONNX)
-  message(FATAL_ERROR "Please build with TORCH or ONNX!!!")
+if(NOT TORCH AND NOT ONNX AND NOT XPU)
+  message(FATAL_ERROR "Please build with TORCH or ONNX or XPU!!!")
 endif()
 if(TORCH)
   list(APPEND decoder_srcs torch_asr_model.cc)
@@ -18,7 +18,8 @@ if(ONNX)
 endif()
 
 add_library(decoder STATIC ${decoder_srcs})
-target_link_libraries(decoder PUBLIC kaldi-decoder frontend post_processor utils)
+target_link_libraries(decoder PUBLIC kaldi-decoder frontend
+                      post_processor utils)
 
 if(ANDROID)
   target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY})
@@ -29,4 +30,7 @@ else()
   if(ONNX)
     target_link_libraries(decoder PUBLIC onnxruntime)
   endif()
+  if(XPU)
+    target_link_libraries(decoder PUBLIC xpu_conformer)
+  endif()
 endif()
diff --git a/runtime/core/decoder/params.h b/runtime/core/decoder/params.h
index dcabaeadc..ede5cfbee 100644
--- a/runtime/core/decoder/params.h
+++ b/runtime/core/decoder/params.h
@@ -13,7 +13,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-
 #ifndef DECODER_PARAMS_H_
 #define DECODER_PARAMS_H_
 
@@ -29,17 +28,24 @@
 #ifdef USE_TORCH
 #include "decoder/torch_asr_model.h"
 #endif
+#ifdef USE_XPU
+#include "xpu/xpu_asr_model.h"
+#endif
 #include "frontend/feature_pipeline.h"
 #include "post_processor/post_processor.h"
 #include "utils/flags.h"
 #include "utils/string.h"
 
 DEFINE_int32(num_threads, 1, "num threads for ASR model");
+DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model");
 
 // TorchAsrModel flags
 DEFINE_string(model_path, "", "pytorch exported model path");
 // OnnxAsrModel flags
 DEFINE_string(onnx_dir, "", "directory where the onnx model is saved");
+// XPUAsrModel flags
+DEFINE_string(xpu_model_dir, "",
+              "directory where the XPU model and weights is saved");
 
 // FeaturePipelineConfig flags
 DEFINE_int32(num_bins, 80, "num mel bins for fbank feature");
@@ -66,7 +72,8 @@ DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search");
 DEFINE_double(blank_skip_thresh, 1.0,
               "blank skip thresh for ctc wfst search, 1.0 means no skip");
-DEFINE_double(length_penalty, 0.0, "length penalty ctc wfst search, will not"
+DEFINE_double(length_penalty, 0.0,
+              "length penalty ctc wfst search, will not"
               "apply on self-loop arc, for balancing the del/ins ratio, "
               "suggest set to -3.0");
 DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search");
@@ -130,7 +137,7 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
 #else
     LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'.";
 #endif
-  } else {
+  } else if (!FLAGS_model_path.empty()) {
 #ifdef USE_TORCH
     LOG(INFO) << "Reading torch model " << FLAGS_model_path;
     TorchAsrModel::InitEngineThreads(FLAGS_num_threads);
@@ -140,6 +147,19 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
 #else
     LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'.";
 #endif
+  } else if (!FLAGS_xpu_model_dir.empty()) {
+#ifdef USE_XPU
+    LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir;
+    auto model = std::make_shared<XPUAsrModel>();
+    model->SetEngineThreads(FLAGS_num_threads);
+    model->SetDeviceId(FLAGS_device_id);
+    model->Read(FLAGS_xpu_model_dir);
+    resource->model = model;
+#else
+    LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'.";
+#endif
+  } else {
+    LOG(FATAL) << "Please set ONNX, TORCH or XPU model path!!!";
   }
 
   LOG(INFO) << "Reading unit table " << FLAGS_unit_path;
diff --git a/runtime/kunlun/.gitignore b/runtime/kunlun/.gitignore
new file mode 100644
index 000000000..c6767241c
--- /dev/null
+++ b/runtime/kunlun/.gitignore
@@ -0,0 +1,2 @@
+build/
+fc_base/
diff --git a/runtime/kunlun/CMakeLists.txt b/runtime/kunlun/CMakeLists.txt
new file mode 100644
index 000000000..71628eb7f
--- /dev/null
+++ b/runtime/kunlun/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+project(wenet VERSION 0.1)
+
+option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF)
+option(GRAPH_TOOLS "whether to build TLG graph tools" OFF)
+option(BUILD_TESTING "whether to build unit test" OFF)
+
+option(GRPC "whether to build with gRPC" OFF)
+# WebSocket support is OFF by default: enabling it pulls in boost (see the
+# WEBSOCKET section below), which is a very big library.
+option(WEBSOCKET "whether to build with websocket" OFF)
+option(XPU "whether to build with XPU" ON)
+
+set(CMAKE_VERBOSE_MAKEFILE OFF)
+# FetchContent downloads are kept in <this directory>/fc_base.
+include(FetchContent)
+set(FETCHCONTENT_QUIET OFF)
+get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(FETCHCONTENT_BASE_DIR ${fc_base})
+
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC")
+
+# Dependencies, resolved through the CMAKE_MODULE_PATH entry added above.
+include(openfst)
+# This CMakeLists.txt only targets Kunlun XPU, so it contains no ONNX,
+# libtorch, GPU or Windows handling.
+include(xpu)
+# Builds the xpu_conformer static library and the xpu_conformer_test binary.
+add_subdirectory(xpu)
+
+include_directories(
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/kaldi
+)
+
+# Build all libraries
+add_subdirectory(utils)
+add_subdirectory(frontend)
+add_subdirectory(post_processor)
+add_subdirectory(kaldi)  # kaldi: wfst based decoder
+add_subdirectory(decoder)
+add_subdirectory(api)
+
+# Optional: WebSocket server/client (requires boost).
+if(WEBSOCKET)
+  include(boost)
+  add_subdirectory(websocket)
+endif()
+
+# Optional: gRPC server/client.
+if(GRPC)
+  include(grpc)
+  add_subdirectory(grpc)
+endif()
+
+# Build all bins
+add_subdirectory(bin)
+
+# Unit tests, only when BUILD_TESTING is ON.
+if(BUILD_TESTING)
+  include(gtest)
+  add_subdirectory(test)
+endif()
diff --git a/runtime/kunlun/README.md b/runtime/kunlun/README.md
new file mode 100644
index 000000000..2e096b796
--- /dev/null
+++ b/runtime/kunlun/README.md
@@ -0,0 +1,83 @@
+# 在昆仑芯片上运行Wenet
+## 介绍
+下面的示例展示了如何在XPU上部署WeNet离线或在线的ASR模型。XPU是一种由昆仑芯100%自主研发的通用人工智能计算核心架构。
+
+## 准备XPU运行环境
+
+在开始之前，请确认您获得以下必须的环境。
+
+    XRE(XPU Runtime Environment):昆仑芯片的基础运行环境，包括芯片驱动程序、runtime api库、固件FW工具等功能模块。
+    XDNN(XPU Deep Neural Network Library):加速深度神经网络的昆仑芯片库，提供应用程序中使用的高性能DNN功能库。
+
+如果您需要任何帮助，或是想要进一步了解昆仑芯片，请通过官方网址联系我们：
+https://www.kunlunxin.com.cn/
+
+## 操作步骤
+- 第一步：构建，需要cmake 3.14及以上版本
+
+``` sh
+export CXX=${your_g++_path}
+export CC=${your_gcc_path}
+export XPU_API_PATH=${your_api_path}
+
+# -r : release version; -d : debug version
+bash ./compile.sh -r
+```
+
+- 第二步：测试，测试结果将在控制台输出
+
+``` sh
+## set KUNLUN XPU visible device
+export XPU_VISIBLE_DEVICES=0
+export XPUSIM_DEVICE_MODEL=KUNLUN2
+## set logging level
+export GLOG_logtostderr=1
+export GLOG_v=3
+## set speech wav and model/weight path
+wav_path=${your_test_wav_path}
+xpu_model_dir=${your_xpu_weight_dir}
+units=${your_units.txt}
+## run the decoder
+./build/bin/decoder_main \
+    --chunk_size -1 \
+    --wav_path ${wav_path} \
+    --xpu_model_dir ${xpu_model_dir} \
+    --unit_path ${units}   \
+    --device_id 0           \
+    --nbest  3  2>&1 | tee log.txt
+```
+
+单条语音执行结果如下所示:
+
+``` sh
+XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
+I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
+I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
+I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
+I1027 06:06:23.832749 111767 xpu_asr_model.cc:66]       subsampling_rate 4
+I1027 06:06:23.832777 111767 xpu_asr_model.cc:67]       right_context 6
+I1027 06:06:23.832789 111767 xpu_asr_model.cc:68]       sos 5538
+I1027 06:06:23.832795 111767 xpu_asr_model.cc:69]       eos 5538
+I1027 06:06:23.832799 111767 xpu_asr_model.cc:70]       is bidirectional decoder 1
+I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
+I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
+I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
+I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
+I1027 06:06:23.843616 111776 xpu_asr_model.cc:173]       max_seqlen is 418
+I1027 06:06:23.843619 111776 xpu_asr_model.cc:174]       q_seqlen   is 103
+I1027 06:06:23.843623 111776 xpu_asr_model.cc:175]       att_dim    is 512
+I1027 06:06:23.843626 111776 xpu_asr_model.cc:176]       ctc_dim    is 5538
+I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
+I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
+I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
+I1027 06:06:23.852537 111776 xpu_asr_model.cc:248]       num_hyps  is 3
+I1027 06:06:23.852541 111776 xpu_asr_model.cc:249]       beam_size is 3
+I1027 06:06:23.852545 111776 xpu_asr_model.cc:250]       new_bs    is 3
+I1027 06:06:23.852545 111776 xpu_asr_model.cc:251]       max_hyps_len is 14
+I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
+I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
+test 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
+```
diff --git a/runtime/kunlun/README_EN.md b/runtime/kunlun/README_EN.md
new file mode 100644
index 000000000..ff78792f9
--- /dev/null
+++ b/runtime/kunlun/README_EN.md
@@ -0,0 +1,87 @@
+# WeNet running on KUNLUNXIN XPU device
+## Introduction
+The below example shows how to deploy WeNet offline and online ASR models on XPUs.
+XPU is a core architecture 100% independently developed by KUNLUNXIN for general artificial intelligence computing.
+
+## Setup environment for XPU device
+
+Before you start, make sure the following components are available:
+
+    XRE (XPU Runtime Environment): the basic runtime environment for XPUs,
+    including functional modules such as chip drivers, the runtime API library, and firmware tools.
+
+    XDNN(XPU Deep Neural Network Library): XPU library for accelerating deep neural networks, providing high-performance DNN function library used in applications.
+
+If you would like to know more about XPUs or need any help, please contact us through the official website:
+
+https://www.kunlunxin.com.cn/
+
+## Instruction
+- Step 1. Build, the build requires cmake 3.14 or above.
+
+``` sh
+export CXX=${your_g++_path}
+export CC=${your_gcc_path}
+export XPU_API_PATH=${your_api_path}
+
+# -r : release version; -d : debug version
+bash ./compile.sh -r
+```
+
+- Step 2. Testing, the result is shown in the console.
+
+``` sh
+## set KUNLUN XPU visible device
+export XPU_VISIBLE_DEVICES=0
+export XPUSIM_DEVICE_MODEL=KUNLUN2
+## set logging level
+export GLOG_logtostderr=1
+export GLOG_v=3
+## set speech wav and model/weight/units path
+wav_path=${your_test_wav_path}
+xpu_model_dir=${your_xpu_weight_dir}
+units=${your_units.txt}
+## run the decoder
+./build/bin/decoder_main \
+    --chunk_size -1 \
+    --wav_path $wav_path \
+    --xpu_model_dir $xpu_model_dir \
+    --unit_path $units   \
+    --device_id 0           \
+    --nbest  3  2>&1 | tee log.txt
+```
+
+A typical output looks as follows:
+
+``` sh
+XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
+I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
+I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
+I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
+I1027 06:06:23.832749 111767 xpu_asr_model.cc:66]       subsampling_rate 4
+I1027 06:06:23.832777 111767 xpu_asr_model.cc:67]       right_context 6
+I1027 06:06:23.832789 111767 xpu_asr_model.cc:68]       sos 5538
+I1027 06:06:23.832795 111767 xpu_asr_model.cc:69]       eos 5538
+I1027 06:06:23.832799 111767 xpu_asr_model.cc:70]       is bidirectional decoder 1
+I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
+I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
+I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
+I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
+I1027 06:06:23.843616 111776 xpu_asr_model.cc:173]       max_seqlen is 418
+I1027 06:06:23.843619 111776 xpu_asr_model.cc:174]       q_seqlen   is 103
+I1027 06:06:23.843623 111776 xpu_asr_model.cc:175]       att_dim    is 512
+I1027 06:06:23.843626 111776 xpu_asr_model.cc:176]       ctc_dim    is 5538
+I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
+I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
+I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
+I1027 06:06:23.852537 111776 xpu_asr_model.cc:248]       num_hyps  is 3
+I1027 06:06:23.852541 111776 xpu_asr_model.cc:249]       beam_size is 3
+I1027 06:06:23.852545 111776 xpu_asr_model.cc:250]       new_bs    is 3
+I1027 06:06:23.852545 111776 xpu_asr_model.cc:251]       max_hyps_len is 14
+I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
+I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
+test 甚至出现交易几乎停滞的情况
+I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
+```
diff --git a/runtime/kunlun/api b/runtime/kunlun/api
new file mode 120000
index 000000000..5c1acaccc
--- /dev/null
+++ b/runtime/kunlun/api
@@ -0,0 +1 @@
+../core/api
\ No newline at end of file
diff --git a/runtime/kunlun/bin b/runtime/kunlun/bin
new file mode 120000
index 000000000..938df7215
--- /dev/null
+++ b/runtime/kunlun/bin
@@ -0,0 +1 @@
+../core/bin
\ No newline at end of file
diff --git a/runtime/kunlun/cmake b/runtime/kunlun/cmake
new file mode 120000
index 000000000..17afee87d
--- /dev/null
+++ b/runtime/kunlun/cmake
@@ -0,0 +1 @@
+../core/cmake
\ No newline at end of file
diff --git a/runtime/kunlun/compile.sh b/runtime/kunlun/compile.sh
new file mode 100755
index 000000000..d64a6f050
--- /dev/null
+++ b/runtime/kunlun/compile.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+set -e
+
+# Print the command-line help, then abort with a non-zero exit status.
+usage() {
+    echo "Usage: bash compile.sh [-r] [-d] [-c cache|all] [-h]"
+    echo "Description:"
+    echo "-r, build release."
+    echo "-d, build debug."
+    echo "-c, 'cache' removes cmake cache files, 'all' empties the build dir, then build."
+    echo "Example 1:"
+    echo "  ./compile.sh -r "
+    echo "  means: remove cache files in build dir, then build release."
+    echo "Example 2:"
+    echo "  ./compile.sh -d -c all "
+    echo "  means: remove all files in build dir, then build debug."
+    exit 1
+}
+
+if [ -z $CXX ]; then
+  echo -e "\033[31m [WARNING]: NO CXX in your env. Suggest setting CXX variable to support C++14. \033[0m"
+  sleep 2
+fi
+
+build_type='Release'
+clean_type='cache'
+
+while getopts 'rdc:h' OPT; do
+    case $OPT in
+        r) build_type="Release";;
+        d) build_type="Debug";;
+        c) clean_type="$OPTARG";;
+        h) usage;;
+        ?) usage;;
+    esac
+done
+
+if [ ! -d ./build ];then
+  mkdir build
+fi
+
+if [ "$clean_type" = "all" ];then
+  pushd build
+  rm -rf ./*
+  popd
+else
+  pushd build
+  rm -rf CMakeFiles/ cmake_install.cmake CMakeCache.txt CPackSourceConfig.cmake
+  popd
+fi
+
+build_cmd="cd build && cmake -DINTTYPES_FORMAT:STRING=C99 "
+
+if [ "$build_type" = "Release" ];then
+  build_cmd="${build_cmd} -DCMAKE_BUILD_TYPE=Release .. && cmake --build ./ "
+else
+  build_cmd="${build_cmd} -DCMAKE_BUILD_TYPE=Debug .. && cmake --build ./ "
+fi
+
+echo "build command is ${build_cmd}"
+
+eval ${build_cmd}
diff --git a/runtime/kunlun/decoder b/runtime/kunlun/decoder
new file mode 120000
index 000000000..3088ea48b
--- /dev/null
+++ b/runtime/kunlun/decoder
@@ -0,0 +1 @@
+../core/decoder
\ No newline at end of file
diff --git a/runtime/kunlun/frontend b/runtime/kunlun/frontend
new file mode 120000
index 000000000..0292335d1
--- /dev/null
+++ b/runtime/kunlun/frontend
@@ -0,0 +1 @@
+../core/frontend
\ No newline at end of file
diff --git a/runtime/kunlun/grpc b/runtime/kunlun/grpc
new file mode 120000
index 000000000..57533a588
--- /dev/null
+++ b/runtime/kunlun/grpc
@@ -0,0 +1 @@
+../core/grpc
\ No newline at end of file
diff --git a/runtime/kunlun/kaldi b/runtime/kunlun/kaldi
new file mode 120000
index 000000000..764a9d445
--- /dev/null
+++ b/runtime/kunlun/kaldi
@@ -0,0 +1 @@
+../core/kaldi
\ No newline at end of file
diff --git a/runtime/kunlun/patch b/runtime/kunlun/patch
new file mode 120000
index 000000000..69789fa5e
--- /dev/null
+++ b/runtime/kunlun/patch
@@ -0,0 +1 @@
+../core/patch
\ No newline at end of file
diff --git a/runtime/kunlun/post_processor b/runtime/kunlun/post_processor
new file mode 120000
index 000000000..4e434a5cc
--- /dev/null
+++ b/runtime/kunlun/post_processor
@@ -0,0 +1 @@
+../core/post_processor
\ No newline at end of file
diff --git a/runtime/kunlun/test b/runtime/kunlun/test
new file mode 120000
index 000000000..e60cf87a7
--- /dev/null
+++ b/runtime/kunlun/test
@@ -0,0 +1 @@
+../core/test
\ No newline at end of file
diff --git a/runtime/kunlun/utils b/runtime/kunlun/utils
new file mode 120000
index 000000000..9e19e7af5
--- /dev/null
+++ b/runtime/kunlun/utils
@@ -0,0 +1 @@
+../core/utils
\ No newline at end of file
diff --git a/runtime/kunlun/websocket b/runtime/kunlun/websocket
new file mode 120000
index 000000000..18f5de12c
--- /dev/null
+++ b/runtime/kunlun/websocket
@@ -0,0 +1 @@
+../core/websocket
\ No newline at end of file
diff --git a/runtime/kunlun/xpu/CMakeLists.txt b/runtime/kunlun/xpu/CMakeLists.txt
new file mode 100644
index 000000000..380e23204
--- /dev/null
+++ b/runtime/kunlun/xpu/CMakeLists.txt
@@ -0,0 +1,25 @@
+message("cmake build type is ${CMAKE_BUILD_TYPE} .")
+
+set(CMAKE_VERBOSE_MAKEFILE OFF)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -msse2 -fpermissive")
+
+# Both targets need the XPU SDK, whose include/link directories are only
+# configured by cmake/xpu.cmake when XPU is ON, so they are declared only then.
+if(XPU)
+  # Static library with the XPU model implementation; linked by `decoder`.
+  set(xpu_conformer_srcs ./xpu_asr_model.cc ./xpu_conformer.cpp ./xpu_util.cpp)
+  message(STATUS "Use src_files: [ ${xpu_conformer_srcs} ] to compile xpu_conformer.a .")
+  add_library(xpu_conformer STATIC ${xpu_conformer_srcs})
+  target_link_libraries(xpu_conformer PUBLIC xpuapi xpurt)
+
+  # Stand-alone test binary. System libraries are linked via
+  # target_link_libraries so that they follow the object files on the link
+  # line; passing them through CMAKE_EXE_LINKER_FLAGS put them in front and
+  # also overwrote any linker flags supplied by the user.
+  set(SRC_FILES ./conformer_test.cpp ./xpu_conformer.cpp ./xpu_util.cpp)
+  message(STATUS "Use src_files: [ ${SRC_FILES} ] to compile xpu_conformer_test.")
+  add_executable(xpu_conformer_test ${SRC_FILES})
+  target_link_libraries(xpu_conformer_test
+    PRIVATE xpuapi xpurt pthread rt m ${CMAKE_DL_LIBS})
+endif()
diff --git a/runtime/kunlun/xpu/conformer_test.cpp b/runtime/kunlun/xpu/conformer_test.cpp
new file mode 100644
index 000000000..1d9fd672a
--- /dev/null
+++ b/runtime/kunlun/xpu/conformer_test.cpp
@@ -0,0 +1,276 @@
+// Copyright (c) 2022 KUNLUNXIN Inc.
+//               2022 Han Qi (qihan@baidu.com)
+//                    Hehe Pan (panhehe@baidu.com)
+//                    Zikui Yan (yanzikui@baidu.com)
+//                    Chaolin Li (lichaolin@baidu.com)
+// All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>
+#include <mutex>
+#include <thread>
+#include <tuple>
+#include "xpu_conformer.h"  // NOLINT
+#include "xpu_util.h"       // NOLINT
+namespace api = baidu::xpu::api;
+namespace wenet = xpu::wenet;
+
+// Runs encoder -> CTC prefix beam search -> decoder rescoring for every sample
+// in data_dir on XPU dev_id, using threads_number workers (<= 0 means 1) and
+// weights from params_dir. The best hypothesis per utterance is appended to
+// ./token_id.txt; the total wall-clock time is printed to stdout.
+template <typename T, typename TW, typename TGEMM>
+static void conformer_test(const std::string& data_dir,
+                           const std::string& params_dir, int threads_number,
+                           int dev_id) {
+  typedef std::vector<
+      std::tuple<std::tuple<float*, std::vector<int>>,
+                 std::tuple<std::vector<int>, std::vector<int>>>>
+      Dtype;
+  ConformerEncoderParam<T, TW> encoder_param;
+  init_encoder_params<T, TW>(params_dir, encoder_param);
+  ConformerDecoderParam<T, TW> decoder_param;
+  init_decoder_params<T, TW>(params_dir, decoder_param);
+  int real_threads_number = threads_number <= 0 ? 1 : threads_number;
+  std::cout << "Encoder + Decoder MultiStreamTest threads:"
+            << real_threads_number << std::endl;
+  // init test data
+  std::vector<int> ids = get_all_ids(data_dir);
+  Dtype data_list;
+  for (auto index_id : ids) {
+    std::string input_lenghts_prefix =
+        data_dir + std::to_string(index_id) + "_len";
+    std::string input_prefix = data_dir + std::to_string(index_id);
+    auto input_lenghts_cpu_info =
+        read_cpu_data_from_file<int>(input_lenghts_prefix, 1);
+    auto input_xpu_info = read_xpu_data_from_file<float>(input_prefix, 3);
+    data_list.push_back(
+        std::make_tuple(input_xpu_info, input_lenghts_cpu_info));
+  }
+  bool write_res = true;
+  // one XPU context and one stream per worker thread
+  int ret = 0;
+  std::vector<api::Context*> ctx_xpu_ptrs(real_threads_number);
+  std::vector<XPUStream> streams(real_threads_number);
+  int nsdnn = real_threads_number > 1 ? 2 : 6;
+  int ncluster = real_threads_number > 1 ? 2 : 8;
+  for (int i = 0; i < real_threads_number; i++) {
+    ret = xpu_stream_create(&streams[i]);
+    CHECK_RET(ret);
+    ctx_xpu_ptrs[i] = new api::Context(api::kXPU2);
+    ctx_xpu_ptrs[i]->xpu_stream = streams[i];
+    ctx_xpu_ptrs[i]->set_nsdnn(nsdnn);
+    ctx_xpu_ptrs[i]->set_ncluster(ncluster);
+  }
+  // threads
+  std::vector<std::thread> threads;
+  int data_counter = 0;
+  std::mutex data_mutex;
+  std::vector<float> time_info(real_threads_number, 0.0f);
+  auto f = [&](int thread_id) {
+    // Per-thread status: the outer `ret` is shared by reference between workers.
+    int ret = 0;
+    xpu_set_device(dev_id);
+    api::Context* ctx_xpu = ctx_xpu_ptrs[thread_id];
+    api::ctx_guard RAII_GUARD(ctx_xpu);
+    while (true) {
+      int data_index = -1;
+      {
+        // Claim the next unprocessed sample; unlocks when the scope ends.
+        std::lock_guard<std::mutex> lock(data_mutex);
+        if (data_counter < static_cast<int>(data_list.size())) {
+          data_index = data_counter++;
+        }
+      }
+      if (data_index < 0) {
+        break;
+      }
+      auto start_time = std::chrono::system_clock::now();
+      // get input data
+      auto& input_xpu_info = std::get<0>(data_list[data_index]);
+      auto& input_lenghts_info = std::get<1>(data_list[data_index]);
+      auto& input_xpu_data = std::get<0>(input_xpu_info);
+      auto& speech_shape = std::get<1>(input_xpu_info);
+      int batch = speech_shape[0];
+      int max_seqlen = speech_shape[1];
+      auto xpu_mask_info_float = create_mask_according_speech_length<float>(
+          std::get<0>(input_lenghts_info), max_seqlen, ctx_xpu->xpu_stream);
+      ret = xpu_wait(ctx_xpu->xpu_stream);
+      CHECK_RET(ret);
+      int q_seqlen = ((max_seqlen - 1) / 2 - 1) / 2;
+      // encoder run
+      int att_dim = encoder_param.head_num * encoder_param.head_dim;
+      int ctc_dim = encoder_param.ctc_dim;
+      T* encoder_out = RAII_GUARD.alloc<T>(batch * q_seqlen * att_dim);
+      T* ctc_probs = RAII_GUARD.alloc<T>(batch * q_seqlen * ctc_dim);
+      // get encoder_out & ctc_probs
+      ret = wenet::conformer_encoder_wenet<T, TW, TGEMM>(
+          ctx_xpu, input_xpu_data, speech_shape, encoder_out, ctc_probs,
+          encoder_param, xpu_mask_info_float);
+      CHECK_RET(ret);
+      ret = xpu_wait(ctx_xpu->xpu_stream);
+      CHECK_RET(ret);
+      // ctc_prefix_beamsearch implement in cpu
+      int beam_size = encoder_param.beam_size;
+      int new_bs = batch * beam_size;
+      std::vector<int> hyps_len(new_bs);
+      std::vector<float> ctc_scores(new_bs);
+      std::vector<int> hyps_cpu;
+      int* hyps = RAII_GUARD.alloc<int>(new_bs * q_seqlen);
+      ret = wenet::ctc_prefix_beamsearch<T>(ctx_xpu, ctc_probs, hyps_cpu,
+                                            hyps_len, ctc_scores, batch,
+                                            beam_size, q_seqlen, ctc_dim);
+      CHECK_RET(ret);
+      ret = xpu_wait(ctx_xpu->xpu_stream);
+      CHECK_RET(ret);
+      int max_target_len =
+          padding_target(hyps_cpu, hyps_len, beam_size, decoder_param.eos_id);
+      ret = xpu_memcpy(hyps, reinterpret_cast<void*>(&hyps_cpu[0]),
+                       max_target_len * new_bs * sizeof(int),
+                       XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+      CHECK_RET(ret);  // check the copy before xpu_wait overwrites ret
+      ret = xpu_wait(ctx_xpu->xpu_stream);
+      CHECK_RET(ret);
+      // decoder
+      int pad_target_len = decoder_param.add_sos_num + max_target_len;
+      float* character_scores =
+          RAII_GUARD.alloc<float>(new_bs * pad_target_len * ctc_dim);
+      ret = wenet::conformer_decoder_wenet<T, TW, TGEMM>(
+          ctx_xpu, encoder_out, {batch, q_seqlen, att_dim},
+          std::get<0>(xpu_mask_info_float), hyps, {new_bs, max_target_len},
+          character_scores, decoder_param);
+      CHECK_RET(ret);
+      ret = xpu_wait(ctx_xpu->xpu_stream);
+      CHECK_RET(ret);
+      // Only use decoder score for rescoring
+      std::vector<float> best_score(batch, -std::numeric_limits<float>::max());
+      std::vector<int> best_index(batch, 0);
+      float ctc_weight = 0.5;
+      std::vector<float> decoder_out(new_bs * pad_target_len * ctc_dim);
+      ret = xpu_memcpy(&decoder_out[0], character_scores,
+                       new_bs * max_target_len * ctc_dim * sizeof(float),
+                       XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+      CHECK_RET(ret);
+      ret = xpu_wait(ctx_xpu->xpu_stream);
+      CHECK_RET(ret);
+      // cal score && output; the wav list is named after the sample id
+      std::string wav_prefix =
+          data_dir + std::to_string(ids[data_index]) + "_wav.txt";
+      std::string res_prefix = "./token_id.txt";
+      std::string wav_name;
+      std::vector<std::string> wav_info;
+      if (write_res) {
+        std::ifstream wav(wav_prefix.c_str());
+        if (!wav.is_open()) {
+          std::cout << "wav file open fail" << std::endl;
+          exit(1);
+        }
+        while (getline(wav, wav_name)) {
+          wav_info.push_back(wav_name);
+        }
+      }
+      for (int i = 0; i < batch; i++) {
+        for (int j = 0; j < beam_size; j++) {
+          // Row `hyp` of hyps_cpu / decoder_out holds max_target_len entries.
+          const int hyp = i * beam_size + j;
+          const int row = hyp * max_target_len;
+          T score = 0.0;
+          for (int k = 0; k < hyps_len[hyp]; k++) {
+            score += decoder_out[(row + k) * ctc_dim + hyps_cpu[row + k]];
+          }
+          score += decoder_out[(row + hyps_len[hyp]) * ctc_dim + ctc_dim - 1];
+          // add ctc score
+          score += ctc_weight * ctc_scores[hyp];
+          if (score > best_score[i]) {
+            best_score[i] = score;
+            best_index[i] = j;
+          }
+        }
+        int token_index = best_index[i] + i * beam_size;
+        if (write_res) {
+          std::lock_guard<std::mutex> lock(data_mutex);
+          std::ofstream res(res_prefix, std::ios::app);
+          if (!res.is_open()) {
+            std::cout << "res file open fail" << std::endl;
+            exit(1);
+          }
+          res << wav_info[i] << ":";
+          for (int k = 0; k < hyps_len[token_index]; k++)
+            res << hyps_cpu[token_index * max_target_len + k] << " ";
+          res << std::endl;
+        }
+      }
+      auto end_time = std::chrono::system_clock::now();
+      auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
+          end_time - start_time);
+      time_info[thread_id] += static_cast<float>(duration.count()) / 1000;
+      ret = xpu_free(std::get<0>(input_xpu_info));
+      CHECK_RET(ret);
+      ret = xpu_free(std::get<0>(xpu_mask_info_float));
+      CHECK_RET(ret);
+    }
+  };
+  auto all_start = std::chrono::system_clock::now();
+  for (auto i = 0; i < real_threads_number; i++) {
+    threads.emplace_back(f, i);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  auto all_end = std::chrono::system_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
+      all_end - all_start);
+  float total_time = static_cast<float>(duration.count()) / 1000;
+  std::cout << "Total time cost:" << total_time << std::endl;
+  for (int i = 0; i < real_threads_number; i++) {
+    if (ctx_xpu_ptrs[i]) delete ctx_xpu_ptrs[i];
+  }
+}
+
+// Command-line driver for conformer_test(). Expects exactly five arguments:
+//   <mode> <params_dir> <data_dir> <dev_id> <threads_number>
+// Only mode "all" is implemented. Bad usage or an unknown mode returns a
+// non-zero status so that calling scripts can detect the failure.
+int main(int argc, char* argv[]) {
+  if (argc != 6) {
+    std::cout << "Expected exactly 5 arguments:" << std::endl;
+    std::cout << "\t" << argv[0]
+              << " all [params_dir] [data_dir] [dev_id] [threads_number]"
+              << std::endl;
+    return 1;
+  }
+  std::string mode = argv[1];
+  std::string params_dir = argv[2];
+  std::string data_dir = argv[3];
+  int dev_id = std::stoi(argv[4]);
+  int threads_number = std::stoi(argv[5]);
+  if (mode != "all") {
+    std::cout << "Unknown test mode: " << mode << std::endl;
+    return 1;
+  }
+  // conformer_test() builds file names by plain string concatenation, so both
+  // directories are passed through add_separator_when_necessary() first.
+  add_separator_when_necessary(params_dir);
+  add_separator_when_necessary(data_dir);
+  xpu_set_device(dev_id);
+
+  // NOTE(review): presumably T is the activation type, TW the weight type and
+  // TGEMM the GEMM compute type -- confirm against xpu_conformer.h.
+  typedef float16 T;
+  typedef int16_t TW;
+  typedef int16_t TGEMM;
+
+  conformer_test<T, TW, TGEMM>(data_dir, params_dir, threads_number, dev_id);
+  return 0;
+}
diff --git a/runtime/kunlun/xpu/xpu_asr_model.cc b/runtime/kunlun/xpu/xpu_asr_model.cc
new file mode 100644
index 000000000..71b60bd15
--- /dev/null
+++ b/runtime/kunlun/xpu/xpu_asr_model.cc
@@ -0,0 +1,318 @@
+// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu)
+//               2022 Han Qi (qihan@baidu.com, Kunlunxin Inc)
+//                    Hehe Pan (panhehe@baidu.com, Kunlunxin Inc)
+//                    Zikui Yan (yanzikui@baidu.com, Kunlunxin Inc)
+// All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "xpu_asr_model.h"  // NOLINT
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <utility>
+
+#include "utils/string.h"
+
+namespace wenet {
+
+// Stores the requested engine thread count in real_threads_number.
+void XPUAsrModel::SetEngineThreads(int num_threads) {
+  this->real_threads_number = num_threads;
+}
+
+// Records the XPU device id to use for this model.
+void XPUAsrModel::SetDeviceId(int dev_id) {
+  this->device_id_ = dev_id;
+}
+
+// Loads the XPU model whose per-layer weights live under
+// <model_dir>/model_weights/.
+//
+// Creates the XPU context and its allocation guard, checks that
+// weights_info.txt can be opened (LOG(FATAL) otherwise), loads the encoder
+// and decoder weights, and sets the model metadata, which is hard-coded for
+// now (see the TODO below).
+void XPUAsrModel::Read(const std::string& model_dir) {
+  // init xpu runtime params
+  ctx_xpu_ptr = std::make_shared<api::Context>(api::kXPU2);
+  RAII_GUARD.reset(new api::ctx_guard(ctx_xpu_ptr.get()));
+
+  // For XPU, model_dir is params_dir, which is used to store weights for every
+  // layer.
+  std::string weight_dir = model_dir + "/model_weights/";
+  std::string weight_info_txt_path = weight_dir + "/weights_info.txt";
+
+  // "\033" is the portable spelling of ESC; "\e" is a compiler extension that
+  // yields the same byte.
+  LOG(INFO) << "\033[1;34mXPU weight_dir is: " << weight_dir << "\033[0m\n";
+  if (!std::ifstream(weight_info_txt_path.c_str()).good()) {
+    LOG(FATAL) << "weight_info_txt: " << weight_info_txt_path
+               << " NOT exist !!!\n";
+  }
+
+  // 1. Load weight for every layer
+  init_encoder_params<T, TW>(weight_dir, encoder_param);
+  init_decoder_params<T, TW>(weight_dir, decoder_param);
+
+  // 2. Read metadata
+  // TODO(panhehe): Load following parameters from config file or
+  // encoder/decoder params.
+  subsampling_rate_ = 4;
+  right_context_ = 6;
+  sos_ = 5538;
+  eos_ = 5538;
+  is_bidirectional_decoder_ = 1;
+
+  LOG(INFO) << "======= XPU Kunlun Model Info: =======";
+  LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_;
+  LOG(INFO) << "\tright_context " << right_context_;
+  LOG(INFO) << "\tsos " << sos_;
+  LOG(INFO) << "\teos " << eos_;
+  LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_;
+}
+
+// Copy constructor.
+//
+// Copies the model metadata, the encoder/decoder parameter objects and the
+// XPU runtime handles of `other`. The context and the allocation guard are
+// std::shared_ptr members, so the new object shares them with `other`.
+// Per-utterance buffers are left at their defaults.
+XPUAsrModel::XPUAsrModel(const XPUAsrModel& other) {
+  // XPU runtime handles.
+  ctx_xpu_ptr = other.ctx_xpu_ptr;
+  RAII_GUARD = other.RAII_GUARD;
+  stream = other.stream;
+  l3ptr = other.l3ptr;
+  device_id_ = other.device_id_;
+  real_threads_number = other.real_threads_number;
+
+  // Encoder / decoder weights.
+  encoder_param = other.encoder_param;
+  decoder_param = other.decoder_param;
+
+  // Model info.
+  subsampling_rate_ = other.subsampling_rate_;
+  right_context_ = other.right_context_;
+  sos_ = other.sos_;
+  eos_ = other.eos_;
+  is_bidirectional_decoder_ = other.is_bidirectional_decoder_;
+  chunk_size_ = other.chunk_size_;
+  num_left_chunks_ = other.num_left_chunks_;
+  offset_ = other.offset_;
+  // other member variables may not need to copy here
+}
+
+// Returns a new XPUAsrModel copy-constructed from this one, with its decoding
+// state cleared through Reset().
+std::shared_ptr<AsrModel> XPUAsrModel::Copy() const {
+  std::shared_ptr<XPUAsrModel> clone = std::make_shared<XPUAsrModel>(*this);
+  clone->Reset();  // start the copy from a clean decoding state
+  return clone;
+}
+
+// Clears all per-utterance state so the model can decode a new input.
+void XPUAsrModel::Reset() {
+  // Empty the attention / convolution caches and the cached features.
+  att_cache_.clear();
+  cnn_cache_.clear();
+  cached_feature_.clear();
+  // Forget the device outputs of the previous run and rewind the offset.
+  ctc_probs = nullptr;
+  encoder_out = nullptr;
+  offset_ = 0;
+}
+
+// Runs the conformer encoder (including the CTC activation) on one chunk.
+//
+// chunk_feats: feature frames of the current chunk, one vector per frame.
+// out_prob:    resized to q_seqlen rows of ctc_dim floats holding the CTC
+//              output copied back from the device; cleared when chunk_feats
+//              is empty.
+//
+// Side effects: updates batch, max_seqlen, q_seqlen, encoder_out, ctc_probs
+// and the input / mask tuples, which AttentionRescoring reads afterwards.
+void XPUAsrModel::ForwardEncoderFunc(
+    const std::vector<std::vector<float>>& chunk_feats,
+    std::vector<std::vector<float>>* out_prob) {
+  // Nothing to encode; chunk_feats[0] below would be out of bounds.
+  if (chunk_feats.empty()) {
+    out_prob->clear();
+    return;
+  }
+
+  // Set Device Id
+  LOG(INFO) << "Now Use XPU:" << device_id_ << "!\n";
+  xpu_set_device(device_id_);
+
+  // 1. Prepare XPU required data, splice cached_feature_ and chunk_feats
+  // The first dimension is for batchsize, which is 1.
+  int num_frames = cached_feature_.size() + chunk_feats.size();
+  const int feature_dim = chunk_feats[0].size();
+
+  std::vector<int> feats_length_shape = {1};
+  std::vector<int> feats_length_data = {num_frames};
+  input_lenghts_cpu_info =
+      std::make_tuple(feats_length_data, feats_length_shape);
+
+  std::vector<int> feats_data_shape = {1, num_frames, feature_dim};
+  std::vector<float> feats_data_cpu;
+  feats_data_cpu.reserve(1 * num_frames * feature_dim);
+  // Flatten the cached frames first and then the new chunk, so the host
+  // buffer really holds the num_frames rows that feats_data_shape declares.
+  for (const auto& row : cached_feature_) {
+    feats_data_cpu.insert(feats_data_cpu.end(), row.cbegin(), row.cend());
+  }
+  for (const auto& row : chunk_feats) {
+    feats_data_cpu.insert(feats_data_cpu.end(), row.cbegin(), row.cend());
+  }
+
+  float* input_xpu_data = get_xpu_data<float>("wav_test", feats_data_cpu);
+  input_xpu_info = std::make_tuple(input_xpu_data, feats_data_shape);
+
+  // init L3 Memory
+  int ret = 0;
+  // NOTE(review): this overrides whatever SetEngineThreads() stored.
+  real_threads_number = 1;
+  int nsdnn = real_threads_number > 1 ? 2 : 6;
+  int ncluster = real_threads_number > 1 ? 2 : 8;
+  for (int i = 0; i < real_threads_number; i++) {
+    // NOTE(review): a stream is created on every call and never destroyed in
+    // this function -- confirm who owns it.
+    ret = xpu_stream_create(&stream);
+    CHECK_RET(ret);
+    ctx_xpu_ptr->xpu_stream = stream;
+    ctx_xpu_ptr->set_nsdnn(nsdnn);
+    ctx_xpu_ptr->set_ncluster(ncluster);
+  }
+
+  std::shared_ptr<api::Context> ctx_xpu = ctx_xpu_ptr;
+
+  // get input speech info and data
+  batch = feats_data_shape.at(0);  // batch = 1
+  max_seqlen = feats_data_shape.at(1);
+
+  xpu_mask_info_float = create_mask_according_speech_length<float>(
+      feats_length_data, max_seqlen, ctx_xpu->xpu_stream);
+
+  ret = xpu_wait(ctx_xpu->xpu_stream);
+  CHECK_RET(ret);
+
+  // Two stride-2 subsampling steps, each mapping n to (n - 1) / 2.
+  q_seqlen = ((max_seqlen - 1) / 2 - 1) / 2;
+
+  // Encoder run
+  int att_dim = encoder_param.head_num * encoder_param.head_dim;
+  int ctc_dim = encoder_param.ctc_dim;
+
+  LOG(INFO) << "\t max_seqlen is " << max_seqlen << "\n";
+  LOG(INFO) << "\t q_seqlen   is " << q_seqlen << "\n";
+  LOG(INFO) << "\t att_dim    is " << att_dim << "\n";
+  LOG(INFO) << "\t ctc_dim    is " << ctc_dim << "\n";
+
+  // T is float16
+  encoder_out = RAII_GUARD->alloc<T>(batch * q_seqlen * att_dim);
+  ctc_probs = RAII_GUARD->alloc<T>(batch * q_seqlen * ctc_dim);
+
+  // 2. Encoder chunk forward, including ctc_activation
+  // get encoder_out & ctc_probs
+  ret = xpu::wenet::conformer_encoder_wenet<T, TW, int16_t>(
+      ctx_xpu.get(), input_xpu_data, feats_data_shape, encoder_out, ctc_probs,
+      encoder_param, xpu_mask_info_float);
+  CHECK_RET(ret);
+
+  // Copy to output(cpu)
+  int num_outputs = q_seqlen;
+  int output_dim = ctc_dim;
+  out_prob->resize(num_outputs);
+
+  float* logp = RAII_GUARD->alloc<float>(batch * q_seqlen * ctc_dim);
+  // cast T to float32
+  ret = api::cast_v2<T, float>(ctx_xpu.get(), ctc_probs, logp,
+                               batch * q_seqlen * ctc_dim);
+  CHECK_RET(ret);
+  ret = xpu_wait(ctx_xpu->xpu_stream);
+  CHECK_RET(ret);
+
+  // xpu_memcpy logp from device to host, one output row at a time
+  for (int i = 0; i < num_outputs; i++) {
+    (*out_prob)[i].resize(output_dim);
+    ret = xpu_memcpy(reinterpret_cast<void*>((*out_prob)[i].data()),
+                     logp + output_dim * i, output_dim * sizeof(float),
+                     XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+    CHECK_RET(ret);
+  }
+}
+
+// Sums the decoder scores along one hypothesis.
+//
+// `prob` is read as consecutive rows of `decode_out_len` floats: row j
+// contributes the entry at column hyp[j], and the row following the last
+// token contributes the entry at column `eos`.
+float XPUAsrModel::ComputeAttentionScore(const float* prob,
+                                         const std::vector<int>& hyp, int eos,
+                                         int decode_out_len) {
+  const size_t hyp_len = hyp.size();
+  float total = 0.0f;
+  for (size_t step = 0; step < hyp_len; ++step) {
+    const float* row = prob + step * decode_out_len;
+    total += row[hyp[step]];
+  }
+  const float* eos_row = prob + hyp_len * decode_out_len;
+  total += eos_row[eos];
+  return total;
+}
+
+// Rescores the given hypotheses with the attention decoder.
+//
+// hyps:            token id sequences to rescore (without sos/eos).
+// reverse_weight:  weight of the right-to-left score; that score is always
+//                  0 here because the right-to-left pass is not computed.
+// rescoring_score: resized to hyps.size(); receives one score per
+//                  hypothesis. Left at 0 when there are no hypotheses or the
+//                  encoder has not produced an output yet.
+//
+// Fails a CHECK when rescoring_score is null or when there are more
+// hypotheses than decoder rows (batch * beam_size).
+void XPUAsrModel::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                                     float reverse_weight,
+                                     std::vector<float>* rescoring_score) {
+  CHECK(rescoring_score != nullptr);
+  const int num_hyps = hyps.size();
+  rescoring_score->resize(num_hyps, 0.0f);
+
+  if (num_hyps == 0) {
+    return;
+  }
+
+  if (encoder_out == nullptr) {
+    return;
+  }
+
+  int beam_size = encoder_param.beam_size;
+  int new_bs = batch * beam_size;
+  // The decoder runs on exactly new_bs rows; more hypotheses than that would
+  // be read past the end of the decoder output below.
+  CHECK(num_hyps <= new_bs) << "num_hyps (" << num_hyps
+                            << ") exceeds batch * beam_size (" << new_bs << ")";
+
+  // Every padded row is sos followed by the hypothesis tokens.
+  int max_hyps_len = 0;
+  for (const std::vector<int>& hyp : hyps) {
+    max_hyps_len = std::max(static_cast<int>(hyp.size()) + 1, max_hyps_len);
+  }
+  LOG(INFO) << "\t num_hyps  is " << num_hyps << "\n";
+  LOG(INFO) << "\t beam_size is " << beam_size << "\n";
+  LOG(INFO) << "\t new_bs    is " << new_bs << "\n";
+  LOG(INFO) << "\t max_hyps_len is " << max_hyps_len << "\n";
+
+  // pad hyps: row i is [sos, hyp_i..., 0, ...]; rows past num_hyps stay 0.
+  // The buffer is written by index so that the data starts at element 0 and
+  // always covers the new_bs * max_target_len ints copied to the device.
+  int max_target_len = max_hyps_len;
+  std::vector<int> hyps_pad_cpu(
+      static_cast<size_t>(new_bs) * max_target_len, 0);
+  for (int i = 0; i < num_hyps; ++i) {
+    int* row = hyps_pad_cpu.data() + static_cast<size_t>(i) * max_target_len;
+    row[0] = sos_;
+    std::copy(hyps[i].begin(), hyps[i].end(), row + 1);
+  }
+  // Never smaller than the original q_seqlen-based size, and always large
+  // enough for the max_target_len ints per row copied below.
+  int* hyps_xpu =
+      RAII_GUARD->alloc<int>(new_bs * std::max(q_seqlen, max_target_len));
+  // xpu_memcpy hyps_pad_cpu to device
+  int ret = xpu_memcpy(hyps_xpu, reinterpret_cast<void*>(hyps_pad_cpu.data()),
+                       max_target_len * new_bs * sizeof(int),
+                       XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  CHECK_RET(ret);
+
+  // Decoder
+  int att_dim = encoder_param.head_num * encoder_param.head_dim;
+  int ctc_dim = encoder_param.ctc_dim;
+  int pad_target_len = decoder_param.add_sos_num + max_target_len;
+  float* character_scores =
+      RAII_GUARD->alloc<float>(new_bs * pad_target_len * ctc_dim);
+  ret = xpu::wenet::conformer_decoder_wenet<T, TW, int16_t>(
+      ctx_xpu_ptr.get(), encoder_out, {batch, q_seqlen, att_dim},
+      std::get<0>(xpu_mask_info_float), hyps_xpu, {new_bs, max_target_len},
+      character_scores, decoder_param);
+  CHECK_RET(ret);
+  ret = xpu_wait(ctx_xpu_ptr->xpu_stream);
+  CHECK_RET(ret);
+
+  // xpu_memcpy from xpu device to host
+  std::vector<float> decoder_out(new_bs * pad_target_len * ctc_dim);
+  ret = xpu_memcpy(&decoder_out[0], character_scores,
+                   new_bs * max_target_len * ctc_dim * sizeof(float),
+                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  CHECK_RET(ret);
+  ret = xpu_wait(ctx_xpu_ptr->xpu_stream);
+  CHECK_RET(ret);
+
+  // cal score
+  float* decoder_outs_data = decoder_out.data();
+  for (int i = 0; i < num_hyps; ++i) {
+    const std::vector<int>& hyp = hyps[i];
+    // left to right decoder score
+    // ctc_dim maybe equal to decode_out_len
+    float score = ComputeAttentionScore(
+        decoder_outs_data + max_target_len * ctc_dim * i, hyp, eos_, ctc_dim);
+    // The right-to-left score is not computed, so it contributes 0.
+    float r_score = 0.0f;
+    // combined left-to-right and right-to-left score
+    (*rescoring_score)[i] =
+        score * (1 - reverse_weight) + r_score * reverse_weight;
+  }
+}
+
+}  // namespace wenet
diff --git a/runtime/kunlun/xpu/xpu_asr_model.h b/runtime/kunlun/xpu/xpu_asr_model.h
new file mode 100644
index 000000000..500081ad9
--- /dev/null
+++ b/runtime/kunlun/xpu/xpu_asr_model.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu)
+//               2022 Han Qi (qihan@baidu.com, Kunlunxin Inc)
+//                    Hehe Pan (panhehe@baidu.com, Kunlunxin Inc)
+//                    Zikui Yan (yanzikui@baidu.com, Kunlunxin Inc)
+// All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_
+#define RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "decoder/asr_model.h"
+#include "utils/log.h"
+#include "utils/utils.h"
+
+#include "xpu_conformer.h"  // NOLINT
+
+namespace wenet {
+
+// AsrModel implementation that runs the conformer encoder and the attention
+// decoder through the Kunlun XPU API.
+class XPUAsrModel : public AsrModel {
+  typedef float16 T;   // activation type
+  typedef int16_t TW;  // weight type
+
+ public:
+  XPUAsrModel() = default;
+  XPUAsrModel(const XPUAsrModel& other);
+
+  // Note: Do not call the SetEngineThreads function more than once.
+  void SetEngineThreads(int num_threads = 1);
+  // Sets the XPU device id used by this model.
+  void SetDeviceId(int dev_id);
+  // Loads the weights stored under model_dir and creates the XPU context.
+  void Read(const std::string& model_dir);
+  void Reset() override;
+  void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                          float reverse_weight,
+                          std::vector<float>* rescoring_score) override;
+  std::shared_ptr<AsrModel> Copy() const override;
+
+ protected:
+  void ForwardEncoderFunc(const std::vector<std::vector<float>>& chunk_feats,
+                          std::vector<std::vector<float>>* ctc_prob) override;
+
+  float ComputeAttentionScore(const float* prob, const std::vector<int>& hyp,
+                              int eos, int decode_out_len);
+
+ private:
+  int encoder_output_size_ = 0;
+  int num_blocks_ = 0;
+  int cnn_module_kernel_ = 0;
+  int head_ = 0;
+
+  // XPU device id
+  int device_id_ = 0;
+  int real_threads_number = 1;
+
+  // XPU Conformer EncoderParam and DecoderParam
+  ConformerEncoderParam<T, TW> encoder_param;
+  ConformerDecoderParam<T, TW> decoder_param;
+
+  // XPU input and weights params
+  using INPUT_LENGTH_CPU_TUPLE = std::tuple<std::vector<int>, std::vector<int>>;
+  using INPUT_XPU_INFO_TUPLE = std::tuple<float*, std::vector<int>>;
+  INPUT_LENGTH_CPU_TUPLE input_lenghts_cpu_info;
+  INPUT_XPU_INFO_TUPLE input_xpu_info;
+  INPUT_XPU_INFO_TUPLE xpu_mask_info_float;
+
+  // XPU encoder and decoder outputs
+  T* encoder_out = nullptr;
+  T* ctc_probs = nullptr;
+
+  // XPU runtime params
+  void* l3ptr = nullptr;
+  // Value-initialised so the handle is never read uninitialised.
+  XPUStream stream{};
+  std::shared_ptr<api::Context> ctx_xpu_ptr;
+  std::shared_ptr<api::ctx_guard> RAII_GUARD;
+
+  // Shape of the most recent encoder run. Initialised to 0 because the copy
+  // constructor does not copy them.
+  int batch = 0, max_seqlen = 0, q_seqlen = 0;
+
+  // caches
+  std::vector<float> att_cache_;
+  std::vector<float> cnn_cache_;
+};
+
+}  // namespace wenet
+
+#endif  // RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_
diff --git a/runtime/kunlun/xpu/xpu_conformer.cpp b/runtime/kunlun/xpu/xpu_conformer.cpp
new file mode 100644
index 000000000..f5fd562a6
--- /dev/null
+++ b/runtime/kunlun/xpu/xpu_conformer.cpp
@@ -0,0 +1,971 @@
+// Copyright (c) 2022 KUNLUNXIN Inc.
+//               2022 Han Qi (qihan@baidu.com)
+//                    Hehe Pan (panhehe@baidu.com)
+//                    Zikui Yan (yanzikui@baidu.com)
+//                    Chaolin Li (lichaolin@baidu.com)
+// All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "xpu_conformer.h"  // NOLINT
+#include <chrono>
+#include <mutex>
+#include <thread>
+#include <tuple>
+
+namespace xpu {
+namespace wenet {
+// Index into mem_single of the first scratch buffer used for the FFN
+// intermediates and the convolution module (mem_single[X4_BEGIN + k]).
+// NOTE(review): presumably these are the buffers sized for the expanded FFN
+// width -- confirm against the allocation in conformer_encoder_wenet.
+const int X4_BEGIN = 8;
+// Embedding front-end of the encoder: casts the float input to T, applies two
+// 3x3 stride-2 convolutions with ReLU, transposes the result, applies a fully
+// connected layer and scales the output by sqrt(att_dim) into `y`.
+//
+// Returns api::SUCCESS; failures are handled by WRAPPER_ASSERT_SUCCESS.
+template <typename T, typename TW>
+static int encoder_embed(api::Context* ctx_xpu, const float* x, T* y, int batch,
+                         int max_seqlen, int seq_dim, int att_dim,
+                         const ConformerEncoderParam<T, TW>& param) {
+  api::ctx_guard RAII_GUARD(ctx_xpu);
+  // Each convolution maps a length n to (n - 1) / 2.
+  const int h_seqlen = (max_seqlen - 1) / 2;
+  const int q_seqlen = (h_seqlen - 1) / 2;
+  const int h_dim = (seq_dim - 1) / 2;
+  const int q_dim = (h_dim - 1) / 2;
+  const int out_channels = att_dim;
+  const float xscale = std::sqrt(att_dim);
+
+  // Two scratch buffers used alternately as input and output of each step.
+  // buf_a must fit both the cast input and the second convolution's output.
+  const int size_a = std::max(batch * max_seqlen * seq_dim,
+                              batch * out_channels * q_seqlen * q_dim);
+  const int size_b = batch * out_channels * h_seqlen * h_dim;
+  T* buf_a = RAII_GUARD.alloc<T>(size_a);
+  T* buf_b = RAII_GUARD.alloc<T>(size_b);
+
+  int ret = 0;
+  // float -> T
+  ret = api::cast_v2<float, T>(ctx_xpu, x, buf_a, batch * max_seqlen * seq_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret);
+  // conv 1: (batch, 1, max_seqlen, seq_dim) -> buf_b
+  ret = api::conv2d_fusion<T, TW, T, int16_t>(
+      ctx_xpu, buf_a, param.emb_conv_w_list[0], buf_b, batch, 1, max_seqlen,
+      seq_dim, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, 1, nullptr,
+      param.emb_conv_maxw_list[0], nullptr, true, param.emb_conv_bias_list[0],
+      nullptr, api::Activation_t::RELU, nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret);
+  // conv 2: (batch, out_channels, h_seqlen, h_dim) -> buf_a
+  ret = api::conv2d_fusion<T, TW, T, int16_t>(
+      ctx_xpu, buf_b, param.emb_conv_w_list[1], buf_a, batch, out_channels,
+      h_seqlen, h_dim, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, 1, nullptr,
+      param.emb_conv_maxw_list[1], nullptr, true, param.emb_conv_bias_list[1],
+      nullptr, api::Activation_t::RELU, nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret);
+  // (batch, out_channels, q_seqlen, q_dim) -> (batch, q_seqlen, out_channels,
+  // q_dim)
+  ret = api::transpose<T>(ctx_xpu, buf_a, buf_b,
+                          {batch, out_channels, q_seqlen, q_dim}, {0, 2, 1, 3});
+  WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret);
+  // linear projection to att_dim
+  ret = api::fc_fusion<T, TW, T, int16_t>(
+      ctx_xpu, buf_b, param.emb_fc_w_list[0], buf_a, batch * q_seqlen, att_dim,
+      out_channels * q_dim, false, true, nullptr, param.emb_fc_maxw_list[0],
+      nullptr, out_channels * q_dim, out_channels * q_dim, att_dim, 1.0f, 0.0f,
+      param.emb_fc_bias_list[0], api::Activation_t::LINEAR);
+  WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret);
+  // y = xscale * projection
+  ret = api::scale<T>(ctx_xpu, buf_a, y, batch * q_seqlen * out_channels,
+                      false, xscale, 0);
+  WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret);
+  ret = xpu_wait(ctx_xpu->xpu_stream);
+  WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret);
+  return api::SUCCESS;
+}
+
+// Feed-forward module of a conformer layer.
+//
+// Pipeline: layer_norm(x) -> fc -> x * sigmoid(x) -> fc -> residual add with
+// `x`, written to `y`. The second fc is called with 0.5f where the first one
+// passes 1.0f (presumably the output scale, matching the caller's
+// "x = residual + 0.5*ff(x)" note -- TODO confirm). When `with_endln` is true
+// the residual add is fused with a final layer norm that uses the scale/bias
+// at index ln_begin + 1.
+//
+// ln_begin / fc_begin select the first layer-norm and fc entries in the
+// parameter lists. Scratch buffers come from `mem_single`.
+// Returns api::SUCCESS; failures are handled by WRAPPER_ASSERT_SUCCESS.
+template <typename T, typename TW, typename TGEMM>
+static int ffn(api::Context* ctx, int batch, int q_seqlen, int hidden_dim,
+               bool with_endln, const T* x, T* y, int ln_begin, int fc_begin,
+               std::vector<const float*> ln_scale_list,
+               std::vector<const float*> ln_bias_list,
+               std::vector<const TW*> fc_w_list,
+               std::vector<const float*> fc_maxw_list,
+               std::vector<const float*> fc_bias_list,
+               std::vector<T*> mem_single, int ffn_factor) {
+  api::ctx_guard RAII_GUARD(ctx);
+  int ret = api::SUCCESS;
+  // Several names share one buffer on purpose: a buffer is reused once its
+  // previous content is no longer needed.
+  std::unordered_map<std::string, T*> buf_mapping = {
+      {"ffn_ln", mem_single[1]},          {"ffn_fc0", mem_single[X4_BEGIN]},
+      {"tmp0", mem_single[X4_BEGIN + 1]}, {"tmp1", mem_single[X4_BEGIN]},
+      {"ffn_fc1", mem_single[1]},
+  };
+  int ffn1_out_dim = hidden_dim * ffn_factor;
+  int ffn2_input_dim = ffn1_out_dim;
+  ret = api::layer_norm<T>(ctx, x, buf_mapping["ffn_ln"], batch * q_seqlen,
+                           hidden_dim, 1e-5, ln_scale_list[ln_begin],
+                           ln_bias_list[ln_begin], nullptr, nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::fc_fusion<T, TW, T, TGEMM>(
+      ctx, buf_mapping["ffn_ln"], fc_w_list[fc_begin], buf_mapping["ffn_fc0"],
+      batch * q_seqlen, ffn1_out_dim, hidden_dim, false, true, nullptr,
+      fc_maxw_list[fc_begin], nullptr, hidden_dim, hidden_dim, ffn1_out_dim,
+      1.0f, 0.0f, fc_bias_list[fc_begin], api::Activation_t::LINEAR);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // tmp1 = fc0 * sigmoid(fc0)
+  ret = api::sigmoid<T>(ctx, buf_mapping["ffn_fc0"], buf_mapping["tmp0"],
+                        batch * q_seqlen * hidden_dim * ffn_factor);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::mul<T>(ctx, buf_mapping["ffn_fc0"], buf_mapping["tmp0"],
+                    buf_mapping["tmp1"],
+                    batch * q_seqlen * hidden_dim * ffn_factor);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::fc_fusion<T, TW, T, TGEMM>(
+      ctx, buf_mapping["tmp1"], fc_w_list[fc_begin + 1], buf_mapping["ffn_fc1"],
+      batch * q_seqlen, hidden_dim, ffn2_input_dim, false, true, nullptr,
+      fc_maxw_list[fc_begin + 1], nullptr, ffn2_input_dim, ffn2_input_dim,
+      hidden_dim, 0.5f, 0.0f, fc_bias_list[fc_begin + 1],
+      api::Activation_t::LINEAR);
+  // Check this result before `ret` is overwritten by the residual add.
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  if (with_endln) {
+    ret = api::add_layer_norm_fusion<T>(
+        ctx, x, buf_mapping["ffn_fc1"], y, batch * q_seqlen, hidden_dim, 1e-5,
+        ln_scale_list[ln_begin + 1], ln_bias_list[ln_begin + 1]);
+  } else {
+    ret = api::add<T>(ctx, x, buf_mapping["ffn_fc1"], y,
+                      batch * q_seqlen * hidden_dim);
+  }
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  return api::SUCCESS;
+}
+
+// Runs one conformer encoder layer on `x` and writes the result to `y`.
+//
+// Order of the sub-modules: macaron feed-forward -> multi-headed
+// self-attention with relative position bias -> convolution module ->
+// feed-forward with the final layer norm.
+//
+// ln_begin / fc_begin / attn_pos_begin / conv_begin are the indices of this
+// layer's first entry in the corresponding parameter lists of `param`.
+// mem_single / mem_double / mem_float are caller-provided scratch buffers;
+// mask_score is added to the attention scores.
+// Returns api::SUCCESS; failures are handled by WRAPPER_ASSERT_SUCCESS.
+template <typename T, typename TW, typename TGEMM>
+int wenet_encoder_layer(api::Context* ctx,
+                        api::ctx_guard& RAII_GUARD,  // NOLINT
+                        int batch, int q_seqlen, int hidden_dim, int ln_begin,
+                        int fc_begin, int attn_pos_begin, int conv_begin,
+                        const T* x, T* y,
+                        ConformerEncoderParam<T, TW>& param,  // NOLINT
+                        std::vector<T*>& mem_single,          // NOLINT
+                        std::vector<T*>& mem_double,          // NOLINT
+                        float* mem_float, float* mask_score) {
+  WRAPPER_CHECK_CTX(ctx);
+  int max_size = ctx->max_ptr_size();
+  int ret = api::SUCCESS;
+  // Several names share one buffer on purpose: a buffer is reused once its
+  // previous content is no longer needed.
+  std::unordered_map<std::string, T*> buf_mapping = {
+      {"ffn0_out", mem_single[1]},
+      {"swp0", mem_single[2]},
+      {"swp1", mem_single[3]},
+      {"matrix_bd_pre", mem_double[0]},
+      {"soft_scores", mem_double[0]},
+      {"qkv", mem_single[2]},
+      {"qkv_add", mem_single[1]},
+      {"conv_p1", mem_single[X4_BEGIN + 2]},
+      {"conv_glu0", mem_single[X4_BEGIN + 3]},
+      {"conv_glu1", mem_single[X4_BEGIN + 4]},
+      {"conv_d1", mem_single[X4_BEGIN + 3]},
+      {"conv_p2", mem_single[X4_BEGIN + 2]},
+      {"conv_after", mem_single[0]},
+  };
+
+  // Bind the weight lists by reference; they are only read here, so copying
+  // every vector on each layer call is unnecessary.
+  const auto& ln_scale_list = param.ln_scale_list;
+  const auto& ln_bias_list = param.ln_bias_list;
+
+  const auto& fc_w_list = param.fc_w_list;
+  const auto& fc_maxw_list = param.fc_maxw_list;
+  const auto& fc_bias_list = param.fc_bias_list;
+
+  const auto& attn_pos_uv_bias_list = param.attn_pos_uv_bias_list;
+
+  const auto& conv_w_list = param.conv_w_list;
+  const auto& conv_maxw_list = param.conv_maxw_list;
+  const auto& conv_bias_list = param.conv_bias_list;
+
+  auto kernel_size = param.conv_param.kernel_size;
+  auto lorder = param.conv_param.lorder;
+  auto padding = param.conv_param.padding;
+  auto head_num = param.head_num;
+  auto head_dim = param.head_dim;
+  /*
+  ** feed forward macaron-style module
+  ** x = residual + 0.5*ff(x)
+  */
+  ret = ffn<T, TW, TGEMM>(ctx, batch, q_seqlen, hidden_dim, false, x,
+                          buf_mapping["ffn0_out"], ln_begin, fc_begin,
+                          ln_scale_list, ln_bias_list, fc_w_list, fc_maxw_list,
+                          fc_bias_list, mem_single, param.ffn_factor);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  /*
+  ** multi-headed self-attention module
+  ** qkv_list[0-4]: q, k, v, q + bias 0, q + bias 1
+  ** q_pos_attention: computes matrix_bd from the sliced position embedding;
+  ** the result is added to mask_score and passed to qk_attention as its mask.
+  **/
+  T* qkv_list[5] = {mem_single[6], mem_single[3], mem_single[4], mem_single[5],
+                    mem_single[2]};
+  ret = api::layer_norm<T>(ctx, buf_mapping["ffn0_out"], buf_mapping["swp0"],
+                           batch * q_seqlen, hidden_dim, 1e-5,
+                           ln_scale_list[ln_begin + 1],
+                           ln_bias_list[ln_begin + 1], nullptr, nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // One fused fc produces q, k and v.
+  ret = api::fc_fusion_3c<T, TW, T, TGEMM>(
+      ctx, buf_mapping["swp0"], fc_w_list[fc_begin + 2], qkv_list[0],
+      qkv_list[1], qkv_list[2], batch * q_seqlen, hidden_dim * 3, hidden_dim,
+      false, true, nullptr, fc_maxw_list[fc_begin + 2], nullptr, hidden_dim,
+      hidden_dim, hidden_dim * 3, 1.0f, 0.0f, fc_bias_list[fc_begin + 2],
+      api::Activation_t::LINEAR);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // qkv_list[3] = q + bias[2 * attn_pos_begin], qkv_list[4] = q + the next one
+  for (int i = 0; i < 2; i++) {
+    ret = api::broadcast_add<T>(
+        ctx, qkv_list[0], attn_pos_uv_bias_list[attn_pos_begin * 2 + i],
+        qkv_list[i + 3], {batch, q_seqlen, hidden_dim}, {1, 1, hidden_dim});
+    WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  }
+  int pos_emb_dim = 2 * q_seqlen - 1;
+  T* pos_emb_sliced = RAII_GUARD.alloc<T>(pos_emb_dim * hidden_dim);
+  // NOTE(review): 5000 is the hard-coded first dimension of the stored
+  // position embedding table; pos_emb_dim must not exceed it.
+  ret = api::slice<T>(ctx, param.pos_emb[attn_pos_begin], pos_emb_sliced,
+                      {5000, head_num, head_dim}, {0, 0, 0},
+                      {pos_emb_dim, head_num, head_dim});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  int tmp_sliced_len = batch * head_num * q_seqlen * q_seqlen;
+  float* tmp_mask = RAII_GUARD.alloc<float>(tmp_sliced_len);
+  ret = api::q_pos_attention<T, T, T, TGEMM>(
+      ctx, qkv_list[4], pos_emb_sliced, buf_mapping["matrix_bd_pre"], batch,
+      q_seqlen, head_num, head_dim, 1.0f / std::sqrt(head_dim), nullptr,
+      nullptr, nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // Keep the first q_seqlen columns of matrix_bd, cast them to float and add
+  // mask_score; mem_float then holds the mask given to qk_attention.
+  ret = api::slice<T>(ctx, buf_mapping["matrix_bd_pre"],
+                      reinterpret_cast<T*>(mem_float),
+                      {batch, head_num, q_seqlen, pos_emb_dim}, {0, 0, 0, 0},
+                      {batch, head_num, q_seqlen, q_seqlen});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::cast_v2<T, float>(ctx, reinterpret_cast<T*>(mem_float), tmp_mask,
+                               batch * head_num * q_seqlen * q_seqlen);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::broadcast_add<float>(ctx, tmp_mask, mask_score, mem_float,
+                                  {batch, head_num, q_seqlen, q_seqlen},
+                                  {batch, q_seqlen});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  api::QKVAttnParam loop_p(batch, q_seqlen, head_num, head_dim,
+                           {batch, head_num, q_seqlen, q_seqlen},
+                           api::Activation_t::LINEAR, -1, false, hidden_dim);
+  float* qk_maxptr = RAII_GUARD.alloc<float>(max_size);
+  ret = api::qk_attention<T, T, T, TGEMM>(
+      ctx, qkv_list[3], qkv_list[1], buf_mapping["soft_scores"], nullptr,
+      nullptr, qk_maxptr, loop_p, mem_float);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  float* qkv_maxptr = RAII_GUARD.alloc<float>(max_size);
+  ret = api::qk_v_attention<T, T, T, TGEMM>(
+      ctx, buf_mapping["soft_scores"], qkv_list[2], buf_mapping["qkv"],
+      qk_maxptr, nullptr, qkv_maxptr, loop_p);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // Output projection, then residual add with the macaron FFN output.
+  ret = api::fc_fusion<T, TW, T, TGEMM>(
+      ctx, buf_mapping["qkv"], fc_w_list[fc_begin + 3], buf_mapping["swp1"],
+      batch * q_seqlen, hidden_dim, hidden_dim, false, true, qkv_maxptr,
+      fc_maxw_list[fc_begin + 3], nullptr, hidden_dim, hidden_dim, hidden_dim,
+      1.0f, 0.0f, fc_bias_list[fc_begin + 3], api::Activation_t::LINEAR);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::add<T>(ctx, buf_mapping["ffn0_out"], buf_mapping["swp1"],
+                    buf_mapping["qkv_add"], batch * q_seqlen * hidden_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  /*
+  ** Conv conv_p1-conv_d1-conv_p2
+  */
+  ret = api::layer_norm<T>(ctx, buf_mapping["qkv_add"], buf_mapping["swp1"],
+                           batch * q_seqlen, hidden_dim, 1e-5,
+                           ln_scale_list[ln_begin + 2],
+                           ln_bias_list[ln_begin + 2], nullptr, nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::transpose<T>(ctx, buf_mapping["swp1"], buf_mapping["swp0"],
+                          {batch, q_seqlen, hidden_dim}, {0, 2, 1});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  int pad_seqlen = q_seqlen;
+  if (lorder > 0) {
+    // Pad `lorder` positions in front of the time axis.
+    ret = api::pad<T>(ctx, buf_mapping["swp0"], buf_mapping["swp1"],
+                      {batch, hidden_dim, q_seqlen}, {0, 0, lorder}, {0, 0, 0},
+                      padding);
+    WRAPPER_ASSERT_SUCCESS(ctx, ret);
+    pad_seqlen += lorder;
+  }
+  // Pointwise conv to 2 * hidden_dim channels, then a gated linear unit.
+  ret = api::conv2d_fusion<T, TW, T, TGEMM>(
+      ctx, buf_mapping["swp1"], conv_w_list[conv_begin], buf_mapping["swp0"],
+      batch, hidden_dim, 1, pad_seqlen, hidden_dim * 2, {1, 1}, {1, 1},
+      {0, 0, 0, 0}, {1, 1}, 1, nullptr, conv_maxw_list[conv_begin], nullptr,
+      true, conv_bias_list[conv_begin], nullptr, api::Activation_t::LINEAR,
+      nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::split<T>(ctx, buf_mapping["swp0"],
+                      {buf_mapping["conv_glu0"], buf_mapping["conv_glu1"]},
+                      {batch, hidden_dim * 2, pad_seqlen},
+                      {hidden_dim, hidden_dim}, 1);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::sigmoid(ctx, buf_mapping["conv_glu1"], buf_mapping["conv_glu1"],
+                     batch * pad_seqlen * hidden_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::mul(ctx, buf_mapping["conv_glu0"], buf_mapping["conv_glu1"],
+                 buf_mapping["conv_p1"], batch * pad_seqlen * hidden_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // Depthwise conv (groups == hidden_dim) along the time axis.
+  ret = api::conv1d_fusion<T, TW, T, TGEMM>(
+      ctx, buf_mapping["conv_p1"], conv_w_list[conv_begin + 1],
+      buf_mapping["conv_d1"], batch, hidden_dim, pad_seqlen, hidden_dim,
+      kernel_size, 1, {0}, 1, hidden_dim, nullptr,
+      conv_maxw_list[conv_begin + 1], nullptr, true,
+      conv_bias_list[conv_begin + 1], nullptr, api::Activation_t::LINEAR,
+      nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+
+  // Layer norm over hidden_dim, then x * sigmoid(x).
+  ret = api::transpose<T>(ctx, buf_mapping["conv_d1"], buf_mapping["swp0"],
+                          {batch, hidden_dim, q_seqlen}, {0, 2, 1});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::layer_norm<T>(ctx, buf_mapping["swp0"], buf_mapping["swp1"],
+                           batch * q_seqlen, hidden_dim, 1e-5,
+                           ln_scale_list[ln_begin + 3],
+                           ln_bias_list[ln_begin + 3], nullptr, nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::sigmoid<T>(ctx, buf_mapping["swp1"], buf_mapping["swp0"],
+                        batch * q_seqlen * hidden_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::mul<T>(ctx, buf_mapping["swp0"], buf_mapping["swp1"],
+                    buf_mapping["conv_p1"], batch * q_seqlen * hidden_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::transpose<T>(ctx, buf_mapping["conv_p1"], buf_mapping["conv_d1"],
+                          {batch, q_seqlen, hidden_dim}, {0, 2, 1});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // Second pointwise conv, back to hidden_dim channels.
+  ret = api::conv2d_fusion<T, TW, T, TGEMM>(
+      ctx, buf_mapping["conv_d1"], conv_w_list[conv_begin + 2],
+      buf_mapping["conv_p2"], batch, hidden_dim, 1, q_seqlen, hidden_dim,
+      {1, 1}, {1, 1}, {0, 0, 0, 0}, {1, 1}, 1, nullptr,
+      conv_maxw_list[conv_begin + 2], nullptr, true,
+      conv_bias_list[conv_begin + 2], nullptr, api::Activation_t::LINEAR,
+      nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::transpose<T>(ctx, buf_mapping["conv_p2"], buf_mapping["swp0"],
+                          {batch, hidden_dim, q_seqlen}, {0, 2, 1});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // Residual add with the attention output.
+  ret = api::add<T>(ctx, buf_mapping["swp0"], buf_mapping["qkv_add"],
+                    buf_mapping["conv_after"], batch * q_seqlen * hidden_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  /*
+  ** feed forward module
+  ** x = residual + 0.5*ff(x)
+  */
+  ret = ffn<T, TW, TGEMM>(
+      ctx, batch, q_seqlen, hidden_dim, true, buf_mapping["conv_after"], y,
+      ln_begin + 4, fc_begin + 4, ln_scale_list, ln_bias_list, fc_w_list,
+      fc_maxw_list, fc_bias_list, mem_single, param.ffn_factor);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  return api::SUCCESS;
+}
+
+// Runs the WeNet conformer encoder on XPU.
+// Pipeline: CMVN -> embedding -> encoder layer * N -> layer norm -> CTC head
+// (fc -> softmax -> log).
+//
+// ctx           : XDNN context used for every kernel call and allocation.
+// x             : input features, read with shape data_shape
+//                 ({batch, max_seqlen, seq_dim}).
+// data_shape    : shape of x; elements 0..2 are read.
+// encoder_out   : receives the final layer-norm output
+//                 (batch * q_seqlen * att_dim elements).
+// ctc_probs     : receives log(softmax(fc(encoder_out))) cast to T
+//                 (batch * q_seqlen * ctc_dim elements).
+// param         : model weights and hyper-parameters.
+// xpu_mask_info : element 0 is the mask buffer that is scaled into the mask
+//                 scores handed to every layer; element 1 is not used here.
+// Here q_seqlen = ((max_seqlen - 1) / 2 - 1) / 2 and
+// att_dim = head_num * head_dim.
+// Returns api::SUCCESS once the whole pipeline has been issued.
+template <typename T, typename TW, typename TGEMM>
+int conformer_encoder_wenet(
+    api::Context* ctx, float* x, const std::vector<int>& data_shape,
+    T* encoder_out, T* ctc_probs,
+    ConformerEncoderParam<T, TW>& param,  // NOLINT
+    const std::tuple<float*, std::vector<int>>& xpu_mask_info) {
+  // Embedding -> Encoder_layer * N -> Layernorm -> Ctc_loss
+  int ret = 0;
+  int fc_num_per_layer = param.fc_num_per_layer;
+  int conv_num_per_layer = param.conv_num_per_layer;
+  int ln_num_per_layer = param.ln_num_per_layer;
+  int ffn_factor = param.ffn_factor;
+  int head_num = param.head_num;
+  int head_dim = param.head_dim;
+  int att_dim = head_num * head_dim;
+  int ctc_dim = param.ctc_dim;
+  int batch = data_shape[0];
+  int max_seqlen = data_shape[1];
+  int seq_dim = data_shape[2];
+  int h_seqlen = (max_seqlen - 1) / 2;
+  int q_seqlen = (h_seqlen - 1) / 2;
+
+  WRAPPER_ASSERT_GT(ctx, param.layer_num, 0);
+  WRAPPER_ASSERT_GT(ctx, batch, 0);
+  WRAPPER_ASSERT_GT(ctx, head_num, 0);
+  WRAPPER_ASSERT_GT(ctx, ctc_dim, 0);
+  WRAPPER_ASSERT_GT(ctx, head_dim, 0);
+  // Initial GM: scratch buffers shared by all encoder layers.
+  api::ctx_guard RAII_GUARD(ctx);
+  std::vector<T*> mem_double;
+  std::vector<T*> mem_single;
+  int base_len = batch * (q_seqlen + 14) * (att_dim + 14);
+  for (int i = 0; i < 8; i++) {
+    mem_single.push_back(RAII_GUARD.alloc<T>(base_len));
+  }
+  mem_single.push_back(RAII_GUARD.alloc<T>(base_len * ffn_factor));
+  mem_single.push_back(RAII_GUARD.alloc<T>(base_len * ffn_factor));
+  mem_single.push_back(RAII_GUARD.alloc<T>(base_len * 4));
+  mem_single.push_back(RAII_GUARD.alloc<T>(base_len * 4));
+  mem_single.push_back(RAII_GUARD.alloc<T>(base_len * 2));
+  mem_double.push_back(
+      RAII_GUARD.alloc<T>(batch * head_num * q_seqlen * q_seqlen * 3));
+  mem_double.push_back(
+      RAII_GUARD.alloc<T>(batch * head_num * q_seqlen * q_seqlen));
+  int lens =
+      batch * param.head_num * q_seqlen * q_seqlen * sizeof(float) / sizeof(T);
+  float* mem_float = RAII_GUARD.alloc<float>(lens);
+  // Layer input and output deliberately alias the same scratch buffer.
+  T* calx = mem_single[0];
+  T* caly = mem_single[0];
+
+  // embedding + mask
+  float* emb = RAII_GUARD.alloc<float>(batch * max_seqlen * seq_dim);
+  float* emb_nm = RAII_GUARD.alloc<float>(batch * max_seqlen * seq_dim);
+  // NOTE(review): emb_fc is never read below; the allocation is kept so the
+  // sequence of guard allocations is unchanged. Confirm it can be dropped.
+  T* emb_fc = RAII_GUARD.alloc<T>(batch * q_seqlen * att_dim);
+  // CMVN: emb_nm = (x - cmvn_mean) * cmvn_istd, broadcast over the feature
+  // axis. The broadcast shape follows seq_dim instead of a literal 80.
+  ret = api::broadcast_sub<float>(ctx, x, param.cmvn_mean, emb, data_shape,
+                                  {1, 1, seq_dim});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::broadcast_mul<float>(ctx, emb, param.cmvn_istd, emb_nm, data_shape,
+                                  {1, 1, seq_dim});
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = encoder_embed<T, TW>(ctx, emb_nm, calx, batch, max_seqlen, seq_dim,
+                             att_dim, param);
+  // Previously this result was overwritten by the next call without a check.
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  float* mask_scores = RAII_GUARD.alloc<float>(batch * q_seqlen);
+  ret = api::scale<float>(ctx, std::get<0>(xpu_mask_info), mask_scores,
+                          batch * q_seqlen, false, 1e4, -1);
+  CHECK_RET(ret);
+  // encoder * N
+  for (int i = 0; i < param.layer_num; i++) {
+    int ln_begin = i * ln_num_per_layer;
+    int fc_begin = i * fc_num_per_layer;
+    int attn_pos_begin = i;
+    int conv_begin = i * conv_num_per_layer;
+    // Forward the caller's GEMM type rather than hard-coding int16_t.
+    ret = wenet_encoder_layer<T, TW, TGEMM>(
+        ctx, RAII_GUARD, batch, q_seqlen, att_dim, ln_begin, fc_begin,
+        attn_pos_begin, conv_begin, calx, caly, param, mem_single, mem_double,
+        mem_float, mask_scores);
+    WRAPPER_ASSERT_SUCCESS(ctx, ret);
+    calx = caly;
+  }
+  // Final Layer_Norm: its weights sit right after the per-layer weights.
+  int ln_begin = param.layer_num * param.ln_num_per_layer;
+  int fc_begin = param.layer_num * param.fc_num_per_layer;
+  auto final_ln_scale = param.ln_scale_list[ln_begin];
+  auto final_ln_bias = param.ln_bias_list[ln_begin];
+  ret = api::layer_norm(ctx, caly, encoder_out, batch * q_seqlen, att_dim, 1e-5,
+                        final_ln_scale, final_ln_bias, nullptr, nullptr);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // Ctc_Loss + log_softmax
+  auto ctc_fc_w = param.fc_w_list[fc_begin];
+  auto ctc_fc_maxw = param.fc_maxw_list[fc_begin];
+  auto ctc_fc_bias = param.fc_bias_list[fc_begin];
+  float* ctc_buffer = RAII_GUARD.alloc<float>(batch * q_seqlen * ctc_dim);
+  ret = api::fc_fusion<T, TW, float, TGEMM>(
+      ctx, encoder_out, ctc_fc_w, ctc_buffer, batch * q_seqlen, ctc_dim,
+      att_dim, false, true, nullptr, ctc_fc_maxw, nullptr, att_dim, att_dim,
+      ctc_dim, 1.0f, 0.0f, ctc_fc_bias, api::Activation_t::LINEAR);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  float* softmax_out = RAII_GUARD.alloc<float>(batch * q_seqlen * ctc_dim);
+  ret = api::softmax<float>(ctx, ctc_buffer, softmax_out,
+                            {batch, q_seqlen, ctc_dim}, 2);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  float* log_out = RAII_GUARD.alloc<float>(batch * q_seqlen * ctc_dim);
+  ret = api::log<float>(ctx, softmax_out, log_out, batch * q_seqlen * ctc_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::cast_v2<float, T>(ctx, log_out, ctc_probs,
+                               batch * q_seqlen * ctc_dim);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  return api::SUCCESS;
+}
+
+// Explicit instantiation of the encoder for T = float16, TW = int16_t,
+// TGEMM = int16_t. Written out directly, like the other explicit
+// instantiations in this file, instead of going through a single-use macro.
+template int conformer_encoder_wenet<float16, int16_t, int16_t>(
+    api::Context*, float*, const std::vector<int>&, float16*, float16*,
+    ConformerEncoderParam<float16, int16_t>&,
+    const std::tuple<float*, std::vector<int>>&);
+
+// Largest finite float. Its negation is used as the log-domain value for
+// "zero probability" by logadd and PrefixScore.
+const float kFloatMax = std::numeric_limits<float>::max();
+
+// Numerically stable log(sum_i exp(x[i])).
+// The maximum is factored out before exponentiating so exp() never sees a
+// positive argument. Returns -kFloatMax when x is empty or when every entry
+// is already at (or below) -kFloatMax.
+float logadd(std::vector<float> const& x) {
+  // Dereferencing max_element on an empty range is undefined behaviour.
+  if (x.empty()) {
+    return -kFloatMax;
+  }
+  const float xmax = *std::max_element(x.begin(), x.end());
+  if (xmax <= -kFloatMax) {
+    return -kFloatMax;
+  }
+  float sum = 0.0;
+  for (float v : x) {
+    sum += std::exp(v - xmax);
+  }
+  return std::log(sum) + xmax;
+}
+
+// Log-domain scores attached to one CTC prefix.
+//   s  : updated by the blank (id == 0) branch of the beam search.
+//   ns : updated by the non-blank branches of the beam search.
+// Both start at -kFloatMax.
+struct PrefixScore {
+  float s = -kFloatMax;
+  float ns = -kFloatMax;
+
+  // Combined score of the prefix: logadd over both components.
+  float score() const {
+    const std::vector<float> parts{s, ns};
+    return logadd(parts);
+  }
+
+  // Debug helper: prints both components to stdout.
+  void check() const {
+    std::cout << "score " << s << std::endl << "nscore " << ns << std::endl;
+  }
+};
+
+// Hash functor that lets a token-id sequence be used as an unordered_map key.
+struct PrefixHash {
+  // Polynomial (BKDR-style) hash with multiplier 31, folded left to right
+  // over the ids. An empty prefix hashes to 0.
+  size_t operator()(const std::vector<int>& prefix) const {
+    size_t acc = 0;
+    for (auto it = prefix.begin(); it != prefix.end(); ++it) {
+      acc = acc * 31 + *it;
+    }
+    return acc;
+  }
+};
+
+// Ordering predicate for (prefix, score) pairs: `a` sorts before `b` when its
+// combined score is strictly larger, i.e. best prefix first.
+static bool PrefixScoreCompare(
+    const std::pair<std::vector<int>, PrefixScore>& a,
+    const std::pair<std::vector<int>, PrefixScore>& b) {
+  const float score_a = a.second.score();
+  const float score_b = b.second.score();
+  return score_b < score_a;
+}
+
+// CTC prefix beam search over the log-probabilities in `ctc_probs`.
+//
+// ctx        : XDNN context.
+// ctc_probs  : log-probabilities, batch * max_len * ctc_dim elements of T.
+// hyps       : output; token ids of every surviving prefix are appended back
+//              to back, best prefix first.
+// hyps_len   : output; hyps_len[i] is the length of the i-th prefix.
+// ctc_scores : output; ctc_scores[i] is the combined score of the i-th
+//              prefix. hyps_len and ctc_scores are indexed, not resized, so
+//              the caller must pre-size them (at most beam_size entries are
+//              written).
+// batch, beam_size, max_len, ctc_dim : problem sizes.
+// Returns api::SUCCESS on completion.
+//
+// NOTE(review): buffers are sized for `batch` sequences, but sorted_topk is
+// called with max_len rows and the search only reads rows [0, max_len), so
+// only the first sequence is decoded - confirm batch is always 1.
+template <typename T>
+int ctc_prefix_beamsearch(api::Context* ctx, T* ctc_probs,
+                          std::vector<int>& hyps,                     // NOLINT
+                          std::vector<int>& hyps_len,                 // NOLINT
+                          std::vector<float>& ctc_scores, int batch,  // NOLINT
+                          int beam_size, int max_len, int ctc_dim) {
+  // 0. get topk
+  api::ctx_guard RAII_GUARD(ctx);
+  int data_len = batch * max_len * beam_size;
+  int* topk_index_buf = RAII_GUARD.alloc<int>(data_len);
+  float* topk_score_buf = RAII_GUARD.alloc<float>(data_len);
+  float* logp = RAII_GUARD.alloc<float>(batch * max_len * ctc_dim);
+  int ret =
+      api::cast_v2<T, float>(ctx, ctc_probs, logp, batch * max_len * ctc_dim);
+  // Previously overwritten by the next call without being checked.
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::sorted_topk<float>(ctx, logp, topk_score_buf, topk_index_buf,
+                                max_len, ctc_dim, beam_size, true);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // Wait for the kernels above before copying their results to the host.
+  ret = xpu_wait(ctx->xpu_stream);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  std::vector<int> topk_index(data_len);
+  std::vector<float> topk_score(data_len);
+  ret = xpu_memcpy(reinterpret_cast<void*>(&topk_index[0]), topk_index_buf,
+                   data_len * sizeof(int), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  CHECK_RET(ret);
+  ret = xpu_memcpy(reinterpret_cast<void*>(&topk_score[0]), topk_score_buf,
+                   data_len * sizeof(float), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  CHECK_RET(ret);
+  using PrefixMap =
+      std::unordered_map<std::vector<int>, PrefixScore, PrefixHash>;
+  using PrefixEntry = std::pair<std::vector<int>, PrefixScore>;
+  // Start from the empty prefix with blank score log(1) = 0.
+  PrefixMap cur_hyps;
+  PrefixScore init_score;
+  init_score.s = 0.0;
+  init_score.ns = -kFloatMax;
+  cur_hyps[std::vector<int>()] = init_score;
+  for (int t = 0; t < max_len; ++t) {
+    int offset = beam_size * t;
+    PrefixMap next_hyps;
+    // 1. Token passing
+    for (int i = 0; i < beam_size; ++i) {
+      int id = topk_index[i + offset];
+      float prob = topk_score[i + offset];
+      for (const auto& it : cur_hyps) {
+        const std::vector<int>& prefix = it.first;
+        const PrefixScore& cur_score = it.second;
+        // operator[] returns a reference into next_hyps, so the updates
+        // below are applied in place; no write-back is needed.
+        if (id == 0) {
+          // Case 0: *a + ε => *a
+          PrefixScore& next_score = next_hyps[prefix];
+          next_score.s = logadd(
+              {next_score.s, cur_score.s + prob, cur_score.ns + prob});
+        } else if (!prefix.empty() && id == prefix.back()) {
+          // Case 1: *a + a => *a
+          PrefixScore& next_score = next_hyps[prefix];
+          next_score.ns = logadd({next_score.ns, cur_score.ns + prob});
+          // Case 2: *aε + a => *aa
+          std::vector<int> new_prefix(prefix);
+          new_prefix.emplace_back(id);
+          PrefixScore& next_score1 = next_hyps[new_prefix];
+          next_score1.ns = logadd({next_score1.ns, cur_score.s + prob});
+        } else {
+          // Case 3: *a + b => *ab, *aε + b => *ab
+          std::vector<int> new_prefix(prefix);
+          new_prefix.emplace_back(id);
+          PrefixScore& next_score = next_hyps[new_prefix];
+          next_score.ns = logadd(
+              {next_score.ns, cur_score.s + prob, cur_score.ns + prob});
+        }
+      }
+    }
+    // 2. Second beam prune, only keep top n best paths
+    std::vector<PrefixEntry> arr(next_hyps.begin(), next_hyps.end());
+    // Clamp to the number of candidates: advancing begin() past end() is
+    // undefined, and resize() beyond size() would add bogus empty prefixes.
+    const int keep =
+        std::max(0, std::min(static_cast<int>(arr.size()), beam_size));
+    std::nth_element(arr.begin(), arr.begin() + keep, arr.end(),
+                     PrefixScoreCompare);
+    arr.resize(keep);
+    std::sort(arr.begin(), arr.end(), PrefixScoreCompare);
+    // 3. Update cur_hyps and get new result
+    cur_hyps.clear();
+    for (const auto& entry : arr) {
+      cur_hyps[entry.first] = entry.second;
+    }
+  }
+  std::vector<PrefixEntry> arr(cur_hyps.begin(), cur_hyps.end());
+  std::sort(arr.begin(), arr.end(), PrefixScoreCompare);
+  // Emit results best-first; iterate by reference to avoid copying prefixes.
+  int beam = 0;
+  for (const auto& entry : arr) {
+    const std::vector<int>& vec = entry.first;
+    hyps_len[beam] = vec.size();
+    ctc_scores[beam] = entry.second.score();
+    hyps.insert(hyps.end(), vec.begin(), vec.end());
+    beam++;
+  }
+  return api::SUCCESS;
+}
+
+// Explicit instantiation of ctc_prefix_beamsearch for T = float16, so the
+// template definition in this file is emitted for that type.
+template int ctc_prefix_beamsearch<float16>(
+    api::Context* ctx, float16* logp,
+    std::vector<int>& hyps,          // NOLINT
+    std::vector<int>& hyps_len,      // NOLINT
+    std::vector<float>& ctc_scores,  // NOLINT
+    int batch, int beam_size, int max_len, int ctc_dim);
+
+// Host-side clamp: returns min when x <= min, max when x >= max, otherwise x.
+// The lower bound is tested first.
+static int clip_cpu(int x, int min, int max) {
+  return (x <= min) ? min : ((x >= max) ? max : x);
+}
+
+// Copies the padded targets from device to host and rewrites them into
+// `pad_target`, leaving room for `add_sos_num` leading sos slots per row.
+//
+// ctx                : XDNN context (its stream is synchronised first).
+// target             : batch_size * target_seq_len token ids, read with a
+//                      device-to-host copy.
+// pad_target         : output, rows of max_target_seq_len ids. Only slots
+//                      [add_sos_num, add_sos_num + target_seq_len) of each
+//                      row are written; the leading slots keep whatever the
+//                      caller put there.
+// pad_target_lod     : output, cumulative valid lengths;
+//                      pad_target_lod[i + 1] is derived from
+//                      pad_target_lod[i], so entry 0 must be set by the
+//                      caller.
+// eos_id, ignored_id : every eos_id is replaced by ignored_id and is not
+//                      counted as valid.
+// vocab_size         : other ids are clipped to [0, vocab_size].
+// Returns api::SUCCESS, or leaves through WRAPPER_ASSERT_SUCCESS when the
+// stream wait or the copy fails (previously both results were ignored).
+//
+// NOTE(review): the upper clip bound is vocab_size, not vocab_size - 1 -
+// confirm this is intended.
+static int add_sos_and_pad_ignored_id(
+    api::Context* ctx, const int* target,
+    std::vector<int>& pad_target,      // NOLINT
+    std::vector<int>& pad_target_lod,  // NOLINT
+    int batch_size, int target_seq_len, int max_target_seq_len, int eos_id,
+    int ignored_id, int add_sos_num, int vocab_size) {
+  const int target_data_len = batch_size * target_seq_len;
+  std::vector<int> target_cpu(target_data_len);
+  int ret = xpu_wait(ctx->xpu_stream);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = xpu_memcpy(reinterpret_cast<void*>(target_cpu.data()), target,
+                   target_data_len * sizeof(int),
+                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  for (int i = 0; i < batch_size; i++) {
+    // The sos slots always count towards the valid length.
+    int valid_target_len = add_sos_num;
+    for (int j = 0; j < target_seq_len; j++) {
+      const int token = target_cpu[i * target_seq_len + j];
+      const int dst = i * max_target_seq_len + j + add_sos_num;
+      if (token == eos_id) {
+        pad_target[dst] = ignored_id;
+      } else {
+        pad_target[dst] = clip_cpu(token, 0, vocab_size);
+        valid_target_len++;
+      }
+    }
+    pad_target_lod[i + 1] = pad_target_lod[i] + valid_target_len;
+  }
+  return api::SUCCESS;
+}
+
+// Decoder forward pass: scores the padded target hypotheses against the
+// encoder output `x` and writes log(softmax(logits)) to `character_scores`.
+//
+// Visible flow:
+//   1. Copy x_mask to host and build src_lod_vec (cumulative source lengths).
+//   2. Broadcast x from batch_size rows to new_bs = batch_size * beam_size.
+//   3. Rewrite the targets with add_sos_and_pad_ignored_id.
+//   4. Embedding, scale, then add the positional embedding param.pe.
+//   5. layer_num decoder layers (self attention, source attention, FFN);
+//      weights are consumed strictly in order through the fc_*/ln_* iterators.
+//   6. Final layer_norm -> fc to vocab_size -> softmax -> log.
+//
+// x                : encoder output, broadcast from {batch_size, max_seq_len,
+//                    dim} with dim = head_num * head_dim.
+// x_shape          : elements 0 and 1 are read (batch_size, max_seq_len).
+// x_mask           : new_bs * max_seq_len floats are copied to the host.
+// padded_target    : target ids handed to add_sos_and_pad_ignored_id.
+// target_shape     : element 1 is read as target_seq_len.
+// character_scores : output, new_bs * max_target_seq_len * vocab_size floats.
+// param            : decoder weights and hyper-parameters.
+// Returns api::SUCCESS at the end.
+//
+// NOTE(review): many `ret` values below are overwritten without being
+// checked, including the result of add_sos_and_pad_ignored_id.
+template <typename T, typename TW, typename TGEMM>
+int conformer_decoder_wenet(api::Context* ctx, const T* x,
+                            const std::vector<int32_t>& x_shape,
+                            const float* x_mask, const int* padded_target,
+                            const std::vector<int32_t>& target_shape,
+                            float* character_scores,
+                            const ConformerDecoderParam<T, TW>& param) {
+  int layer_num = param.layer_num;
+  int batch_size = x_shape[0];
+  int beam_size = param.beam_size;
+  int head_num = param.head_num;
+  int head_dim = param.head_dim;
+  int vocab_size = param.vocab_size;
+  int dim = head_num * head_dim;
+  int add_sos_num = param.add_sos_num;
+  int new_bs = batch_size * beam_size;
+  int sos_id = param.sos_id;
+  int eos_id = param.eos_id;
+  int ignored_id = param.ignored_id;
+  WRAPPER_CHECK_CTX(ctx);
+  WRAPPER_ASSERT_GT(ctx, layer_num, 0);
+  WRAPPER_ASSERT_GT(ctx, batch_size, 0);
+  WRAPPER_ASSERT_GT(ctx, head_num, 0);
+  WRAPPER_ASSERT_GT(ctx, vocab_size, 0);
+  WRAPPER_ASSERT_GT(ctx, dim, 0);
+
+  api::ctx_guard RAII_GUARD(ctx);
+  const int max_seq_len = x_shape[1];
+  WRAPPER_ASSERT_GT(ctx, max_seq_len, 0);
+  const int ffn1_out_dim = param.ffn_dim;
+  // if ffn_act is glu
+  const int ffn2_input_dim = ffn1_out_dim;
+  const int d_k = dim / head_num;
+  WRAPPER_ASSERT_GT(ctx, d_k, 0);
+  int target_seq_len = target_shape[1];
+  WRAPPER_ASSERT_GT(ctx, target_seq_len, 1);
+  int max_target_seq_len = target_seq_len + add_sos_num;  // add sos
+  WRAPPER_ASSERT_GT(ctx, max_seq_len, max_target_seq_len);
+
+  int seqlen_sum = new_bs * max_seq_len;
+  // NOTE(review): new_x is not used anywhere in the visible body.
+  T* new_x = const_cast<T*>(x);
+  int ret = -1;
+  // get src_attn vsl input
+  std::vector<float> cpu_mask_data(new_bs * max_seq_len, 0);
+  std::vector<int> src_lod_vec(new_bs + 1, 0);
+  ret = xpu_wait(ctx->xpu_stream);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = xpu_memcpy(reinterpret_cast<void*>(&cpu_mask_data.front()), x_mask,
+                   new_bs * max_seq_len * sizeof(float),
+                   XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // Build cumulative source lengths by counting mask entries equal to 1.
+  // NOTE(review): cpu_mask_data is indexed with `idx` only, without an offset
+  // for row b - 1, so every entry gets the length of the first mask row.
+  // Confirm whether (b - 1) * max_seq_len + idx was intended.
+  for (int b = 1; b < src_lod_vec.size(); b++) {
+    int curr_seqlen = 0;
+    for (int idx = 0; idx < max_seq_len; idx++) {
+      if (static_cast<int>(cpu_mask_data[idx]) == 1) {
+        curr_seqlen++;
+      }
+    }
+    src_lod_vec[b] = src_lod_vec[b - 1] + curr_seqlen;
+  }
+  api::VectorParam<int> src_qk_lods = {
+      src_lod_vec.data(), static_cast<int>(src_lod_vec.size()), nullptr};
+  src_qk_lods = src_qk_lods.to_xpu(RAII_GUARD);
+  seqlen_sum = src_qk_lods.cpu[new_bs];
+
+  // Replicate the encoder output once per hypothesis row.
+  T* broadcast_x = RAII_GUARD.alloc<T>(new_bs * max_seq_len * dim);
+  ret = api::broadcast<T>(ctx, x, broadcast_x, {batch_size, max_seq_len, dim},
+                          {new_bs, max_seq_len, dim});
+
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  // add sos_id and pad ignored_id
+  // Every slot starts as sos_id; the helper overwrites the slots after the
+  // first add_sos_num of each row.
+  std::vector<int> real_target_cpu(max_target_seq_len * new_bs, sos_id);
+  std::vector<int> real_target_lod(new_bs + 1, 0);
+
+  ret = add_sos_and_pad_ignored_id(ctx, padded_target, real_target_cpu,
+                                   real_target_lod, batch_size * beam_size,
+                                   target_seq_len, max_target_seq_len, eos_id,
+                                   ignored_id, add_sos_num, vocab_size);
+
+  // get self/src QKVParam
+  int target_seq_sum = real_target_lod[new_bs];
+  api::VectorParam<int> self_qk_lods = {
+      real_target_lod.data(), static_cast<int>(real_target_lod.size()),
+      nullptr};
+  self_qk_lods = self_qk_lods.to_xpu(RAII_GUARD);
+  // NOTE(review): self_qkv_param is not referenced again in the visible body.
+  api::QKVAttnParam self_qkv_param(self_qk_lods, head_num, d_k,
+                                   api::Activation_t::LINEAR);
+  api::ConformerQKVParam src_qkv_param(self_qk_lods, src_qk_lods, head_num, d_k,
+                                       false, -1);
+
+  // q/k/v buffers must fit the larger of the source and target totals.
+  seqlen_sum = seqlen_sum > target_seq_sum ? seqlen_sum : target_seq_sum;
+  // Sizes are element counts of T; the order must match buffer_map below.
+  // NOTE(review): sizeof(int) / sizeof(T) is an integer division and would be
+  // 0 for a T wider than int - confirm T is never wider than 4 bytes.
+  std::vector<int> buf_sizes = {
+      new_bs * max_target_seq_len *
+          static_cast<int>(sizeof(int) / sizeof(T)),  // padded_target
+      new_bs * max_target_seq_len * dim,              // embedding_out
+      new_bs * max_target_seq_len * dim,              // mid_a
+      new_bs * max_target_seq_len * dim,              // mid_b
+      new_bs * max_target_seq_len *
+          dim,  // attention_out, result of src_attention qk_v
+      new_bs * max_target_seq_len * dim,  // residual
+      // ffn buffer
+      new_bs * max_target_seq_len * ffn1_out_dim,    // ffn1_out
+      new_bs * max_target_seq_len * ffn2_input_dim,  // ffn_glu_out
+      new_bs * max_target_seq_len * ffn2_input_dim,  // ffn_glu_a
+      new_bs * max_target_seq_len * ffn2_input_dim,  // ffn_glu_b
+      new_bs * max_target_seq_len * ffn2_input_dim,  // ffn_glu_sigmoid
+      // feature buffer
+      new_bs * max_target_seq_len * dim * 3,  // feature_in buffer
+      new_bs * max_target_seq_len * dim * 2,  // feature_out buffer
+      new_bs * max_target_seq_len * 2,        // final_out
+      seqlen_sum * dim,                       // q
+      seqlen_sum * dim,                       // k
+      seqlen_sum * dim,                       // v
+      new_bs * max_seq_len * dim,             // src_x
+      // attention buffer
+      new_bs * max_seq_len * max_seq_len * dim,  // src_qk
+  };
+  std::vector<T*> buffer_ptrs(buf_sizes.size());
+  for (int i = 0; i < buf_sizes.size(); i++) {
+    buffer_ptrs[i] = RAII_GUARD.alloc<T>(buf_sizes[i]);
+  }
+  // Name each buffer; entries are listed in the same order as buf_sizes.
+  int b_id = 0;
+  std::unordered_map<std::string, T*> buffer_map = {
+      {"padded_target", buffer_ptrs[b_id++]},
+      {"embedding_out", buffer_ptrs[b_id++]},
+      {"mid_a", buffer_ptrs[b_id++]},
+      {"mid_b", buffer_ptrs[b_id++]},
+      {"attention_out", buffer_ptrs[b_id++]},
+      {"residual", buffer_ptrs[b_id++]},
+      {"ffn1_out", buffer_ptrs[b_id++]},
+      {"ffn_glu_out", buffer_ptrs[b_id++]},
+      {"ffn_glu_a", buffer_ptrs[b_id++]},
+      {"ffn_glu_b", buffer_ptrs[b_id++]},
+      {"ffn_glu_sigmoid", buffer_ptrs[b_id++]},
+      {"feature_in", buffer_ptrs[b_id++]},
+      {"feature_out", buffer_ptrs[b_id++]},
+      {"final_out", buffer_ptrs[b_id++]},
+      {"q", buffer_ptrs[b_id++]},
+      {"k", buffer_ptrs[b_id++]},
+      {"v", buffer_ptrs[b_id++]},
+      {"src_x", buffer_ptrs[b_id++]},
+      {"src_qk", buffer_ptrs[b_id++]},
+  };
+  // maxptr buffer
+  // One allocation split into six max_size-float slices.
+  int max_size = ctx->max_ptr_size();
+  float* max_buffer = RAII_GUARD.alloc<float>(6 * max_size);
+  float* max_x = max_buffer;
+  float* max_q = max_buffer + max_size;
+  float* max_k = max_buffer + 2 * max_size;
+  float* max_v = max_buffer + 3 * max_size;
+  float* max_qk = max_buffer + 4 * max_size;
+  float* max_qkv = max_buffer + 5 * max_size;
+  // copy pad_sos target to xpu
+  int* new_paded_target = reinterpret_cast<int*>(buffer_map["padded_target"]);
+  ret = api::do_host2device(ctx, real_target_cpu.data(), new_paded_target,
+                            max_target_seq_len * new_bs * sizeof(int));
+  T* embedding_out = buffer_map["embedding_out"];
+  T* attention_out = buffer_map["attention_out"];
+  T* mid_a = buffer_map["mid_a"];
+  T* mid_b = buffer_map["mid_b"];
+  T* q = buffer_map["q"];
+  T* k = buffer_map["k"];
+  T* v = buffer_map["v"];
+  T* src_qk = buffer_map["src_qk"];
+  T* residual = buffer_map["residual"];
+  T* ffn1_out = buffer_map["ffn1_out"];
+  T* ffn_glu_a = buffer_map["ffn_glu_a"];
+  T* ffn_glu_b = buffer_map["ffn_glu_b"];
+  T* ffn_glu_sigmoid = buffer_map["ffn_glu_sigmoid"];
+  T* ffn_glu_out = buffer_map["ffn_glu_out"];
+  // 1.1 embedding input: target{3,14} out:{3,14,512}
+  ret =
+      api::embedding<T, int>(ctx, param.embed_table, new_paded_target, residual,
+                             vocab_size, dim, new_bs * max_target_seq_len, -1);
+  float logit_scale = 1.0f;
+  ret =
+      api::scale<T>(ctx, residual, embedding_out,
+                    new_bs * max_target_seq_len * dim, true, logit_scale, 0.0f);
+  // 1.2 pos_embed, pos=[1, 5000, dim]
+  ret = api::broadcast_add<T>(ctx, embedding_out, param.pe, residual,
+                              {new_bs, max_target_seq_len, dim},
+                              {1, max_target_seq_len, dim});
+  // 2. decoder
+  // The iterators walk the weight lists in call order; every kernel below
+  // that takes a weight advances them, so statement order matters.
+  auto fc_weight_itr = param.fc_w_list.begin();
+  auto fc_bias_itr = param.fc_bias_list.begin();
+  auto fc_w_maxptr_itr = param.fc_maxw_list.begin();
+  auto ln_scale_itr = param.ln_scale_list.begin();
+  auto ln_bias_itr = param.ln_bias_list.begin();
+  const float eps = 1e-5f;
+
+  // Host-side mask: 0 at and below the diagonal, -kFloatMax above it, so
+  // position j cannot attend to any position k > j.
+  std::vector<float> mask_cpu(max_target_seq_len * max_target_seq_len, 0.0);
+  const float kFloatMax = std::numeric_limits<float>::max();
+  for (int j = 0; j < max_target_seq_len; j++) {
+    for (int k = j + 1; k < max_target_seq_len; k++)
+      mask_cpu[j * max_target_seq_len + k] = -kFloatMax;
+  }
+  float* mask_xpu;
+  mask_xpu = reinterpret_cast<float*>(
+      RAII_GUARD.alloc<float>(max_target_seq_len * max_target_seq_len));
+  float* tg_mask;
+  tg_mask = reinterpret_cast<float*>(RAII_GUARD.alloc<float>(
+      new_bs * head_num * max_target_seq_len * max_target_seq_len));
+  ret = xpu_memcpy(mask_xpu, reinterpret_cast<void*>(&mask_cpu[0]),
+                   max_target_seq_len * max_target_seq_len * sizeof(float),
+                   XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  // Replicate the mask for every hypothesis row and attention head.
+  ret = api::broadcast<float>(
+      ctx, mask_xpu, tg_mask, {1, 1, max_target_seq_len, max_target_seq_len},
+      {new_bs, head_num, max_target_seq_len, max_target_seq_len});
+  for (int j = 0; j < layer_num; j++) {
+    // 2.1 self attention
+    ret = api::layer_norm<T>(ctx, residual, mid_b, new_bs * max_target_seq_len,
+                             dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr,
+                             nullptr);
+    // One fused fc produces q, k and v (output width dim * 3).
+    ret = api::fc_fusion_3c<T, TW, T, TGEMM>(
+        ctx, mid_b, *fc_weight_itr++, q, k, v, target_seq_sum, dim * 3, dim,
+        false, true, nullptr, *fc_w_maxptr_itr++, max_q, dim, dim, dim * 3,
+        1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR);
+
+    api::QKVAttnParam loop_p(
+        new_bs, max_target_seq_len, head_num, d_k,
+        {new_bs, head_num, max_target_seq_len, max_target_seq_len},
+        api::Activation_t::LINEAR, -1, false, dim);
+
+    ret = api::qk_attention<T, T, T, TGEMM>(ctx, q, k, src_qk, nullptr, nullptr,
+                                            max_qk, loop_p, tg_mask);
+    ret = api::qk_v_attention<T, T, T, TGEMM>(ctx, src_qk, v, mid_a, max_qk,
+                                              nullptr, max_qkv, loop_p);
+    // x + residual fused with fc
+    ret = api::fc_fusion<T, TW, T, TGEMM>(
+        ctx, mid_a, *fc_weight_itr++, residual, new_bs * max_target_seq_len,
+        dim, dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim,
+        dim, 1.0f, 1.0f, *fc_bias_itr++, api::Activation_t::LINEAR);
+    // 2.2 src attention
+    ret = api::layer_norm<T>(ctx, residual, mid_a, new_bs * max_target_seq_len,
+                             dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr,
+                             nullptr);
+    // Query projection of the normalized decoder state into mid_b.
+    ret = api::fc_fusion<T, TW, T, TGEMM>(
+        ctx, mid_a, *fc_weight_itr++, mid_b, new_bs * max_target_seq_len, dim,
+        dim, false, true, nullptr, *fc_w_maxptr_itr++, max_q, dim, dim, dim,
+        1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR);
+    // get k,v use encoder_out
+    ret = api::fc_fusion<T, TW, T, TGEMM>(
+        ctx, broadcast_x, *fc_weight_itr++, k, new_bs * max_seq_len, dim, dim,
+        false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, dim, 1.0f,
+        0.0f, *fc_bias_itr++, api::Activation_t::LINEAR);
+    ret = api::fc_fusion<T, TW, T, TGEMM>(
+        ctx, broadcast_x, *fc_weight_itr++, v, new_bs * max_seq_len, dim, dim,
+        false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, dim, 1.0f,
+        0.0f, *fc_bias_itr++, api::Activation_t::LINEAR);
+    ret = api::qk_attention<T, T, T, TGEMM>(ctx, mid_b, k, src_qk, nullptr,
+                                            nullptr, max_qk, src_qkv_param);
+
+    ret = api::qk_v_attention<T, T, T, TGEMM>(ctx, src_qk, v, mid_a, max_qk,
+                                              nullptr, max_qkv, src_qkv_param);
+    // x = x + residual fused with fc
+    ret = api::fc_fusion<T, TW, T, TGEMM>(
+        ctx, mid_a, *fc_weight_itr++, residual, new_bs * max_target_seq_len,
+        dim, dim, false, true, max_qkv, *fc_w_maxptr_itr++, nullptr, dim, dim,
+        dim, 1.0f, 1.0f, *fc_bias_itr++, api::Activation_t::LINEAR);
+    // normalize before
+    ret = api::layer_norm<T>(ctx, residual, mid_a, new_bs * max_target_seq_len,
+                             dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr,
+                             nullptr);
+    // ffn1
+    ret = api::fc_fusion<T, TW, T, TGEMM>(
+        ctx, mid_a, *fc_weight_itr++, ffn1_out, new_bs * max_target_seq_len,
+        ffn1_out_dim, dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr,
+        dim, dim, ffn1_out_dim, 1.0, 0.0, *fc_bias_itr++,
+        api::Activation_t::RELU);
+    // ffn2
+    ret = api::fc_fusion<T, TW, T, TGEMM>(
+        ctx, ffn1_out, *fc_weight_itr++, residual, new_bs * max_target_seq_len,
+        dim, ffn2_input_dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr,
+        ffn2_input_dim, ffn2_input_dim, dim, 1.0, 1.0, *fc_bias_itr++,
+        api::Activation_t::LINEAR);
+  }
+
+  // Final layer norm, then projection to the vocabulary.
+  ret =
+      api::layer_norm(ctx, residual, mid_a, new_bs * max_target_seq_len, dim,
+                      1e-5, *ln_scale_itr++, *ln_bias_itr++, nullptr, nullptr);
+  int ctc_dim = param.vocab_size;
+  // NOTE(review): mid_b was allocated with new_bs * max_target_seq_len * dim
+  // elements but receives ... * ctc_dim here - confirm it cannot overflow
+  // when vocab_size > dim.
+  ret = api::fc_fusion<T, TW, T, TGEMM>(
+      ctx, mid_a, *fc_weight_itr++, mid_b, new_bs * max_target_seq_len, ctc_dim,
+      dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, ctc_dim,
+      1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR);
+  // log_softmax
+  // NOTE(review): log_out is allocated but never used; the log is written
+  // straight into character_scores.
+  int data_len = new_bs * max_target_seq_len * ctc_dim;
+  float* softmax_in = RAII_GUARD.alloc<float>(data_len);
+  float* softmax_out = RAII_GUARD.alloc<float>(data_len);
+  float* log_out = RAII_GUARD.alloc<float>(data_len);
+  ret = api::cast_v2<T, float>(ctx, mid_b, softmax_in, data_len);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::softmax<float>(ctx, softmax_in, softmax_out,
+                            {new_bs, max_target_seq_len, ctc_dim}, 2);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+  ret = api::log<float>(ctx, softmax_out, character_scores, data_len);
+  WRAPPER_ASSERT_SUCCESS(ctx, ret);
+
+  return api::SUCCESS;
+}
+
+// Explicit instantiation of the decoder for T = float16, TW = int16_t,
+// TGEMM = int16_t, so the template definition in this file is emitted for
+// that combination.
+template int conformer_decoder_wenet<float16, int16_t, int16_t>(
+    api::Context* ctx, const float16* x, const std::vector<int32_t>& x_shape,
+    const float* x_mask, const int* padded_target,
+    const std::vector<int32_t>& target_shape, float* character_scores,
+    const ConformerDecoderParam<float16, int16_t>& param);
+
+}  // namespace wenet
+}  // namespace xpu
diff --git a/runtime/kunlun/xpu/xpu_conformer.h b/runtime/kunlun/xpu/xpu_conformer.h
new file mode 100644
index 000000000..c20af03e1
--- /dev/null
+++ b/runtime/kunlun/xpu/xpu_conformer.h
@@ -0,0 +1,781 @@
+// Copyright (c) 2022 KUNLUNXIN Inc.
+//               2022 Han Qi (qihan@baidu.com)
+//                    Hehe Pan (panhehe@baidu.com)
+//                    Zikui Yan (yanzikui@baidu.com)
+//                    Chaolin Li (lichaolin@baidu.com)
+// All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "xpu/runtime.h"
+#include "xpu/xdnn.h"
+#include "xpu_util.h"  // NOLINT
+
+namespace api = baidu::xpu::api;
+// Hyper-parameters and weight pointers for the conformer encoder.
+//   T  - element type of pos_emb, attn_pos_uv_bias_list and the conv padding.
+//   TW - element type of the quantized weight buffers.
+// Most fields are populated by init_encoder_params(); the struct itself owns
+// nothing and has no destructor, so the pointed-to buffers are not released
+// here.
+template <typename T, typename TW>
+class ConformerEncoderParam {
+ public:
+  // Number of "encoder.encoders.<i>" layers loaded by init_encoder_params().
+  int layer_num;
+  // Strides used to index the flat fc / conv / layer-norm lists per layer.
+  int fc_num_per_layer;
+  int conv_num_per_layer;
+  int ln_num_per_layer;
+  // Taken from the shape of "self_attn.pos_bias_u": [head_num, head_dim].
+  int head_num;
+  int head_dim;
+  // Length of "ctc.ctc_lo.bias".
+  int ctc_dim;
+  // Ratio of the two dimensions of "feed_forward.w_1.weight".
+  int ffn_factor;
+  int beam_size;
+  // Sizes of the input embedding ("encoder.embed.*") sub-network.
+  struct Embedding {
+    int conv_num;
+    int fc_num;
+    int embed_dim;
+  } emb_param;
+  // Settings of the per-layer convolution module.
+  struct ConvBlock {
+    // NOTE(review): "casual" is presumably a typo for "causal"; the name is
+    // part of the public interface, so it is left unchanged.
+    bool is_casual;
+    // Third dimension of "conv_module.depthwise_conv.weight".
+    int kernel_size;
+    // Set to kernel_size - 1 by init_encoder_params().
+    int lorder;
+    T padding;
+  } conv_param;
+
+  // One buffer per layer, computed in init_encoder_params() by applying the
+  // layer's linear_pos weight to `pe` with fc_fusion.
+  std::vector<const T*> pos_emb;
+  // Embedding convolutions / fc: quantized weight, its max pointer, and bias.
+  std::vector<const TW*> emb_conv_w_list;
+  std::vector<const float*> emb_conv_maxw_list;
+  std::vector<const float*> emb_conv_bias_list;
+  std::vector<const TW*> emb_fc_w_list;
+  std::vector<const float*> emb_fc_maxw_list;
+  std::vector<const float*> emb_fc_bias_list;
+
+  // Per layer: pointwise_conv1, depthwise_conv, pointwise_conv2.
+  std::vector<const TW*> conv_w_list;
+  std::vector<const float*> conv_maxw_list;
+  std::vector<const float*> conv_bias_list;
+
+  // Per layer: norm_ff_macaron, norm_mha, norm_conv, conv_module.norm,
+  // norm_ff, norm_final; the last entry is "encoder.after_norm".
+  std::vector<const float*> ln_scale_list;
+  std::vector<const float*> ln_bias_list;
+
+  // Per layer: feed_forward_macaron w_1, w_2, fused self-attention q/k/v,
+  // self_attn.linear_out, feed_forward w_1, w_2; the last entry is
+  // "ctc.ctc_lo".
+  std::vector<const TW*> fc_w_list;
+  std::vector<const float*> fc_maxw_list;
+  std::vector<const float*> fc_bias_list;
+
+  // Quantized "self_attn.linear_pos.weight" per layer and its max pointers.
+  std::vector<const TW*> attn_pos_w_list_;
+  // NOTE(review): not populated by init_encoder_params() in this header;
+  // confirm whether it is still used.
+  std::vector<const T*> attn_pos_w_list;
+  std::vector<const float*> attn_pos_maxw_list;
+  // Two entries per layer (pos_bias_u, pos_bias_v), converted to T.
+  std::vector<const T*> attn_pos_uv_bias_list;
+
+  // Buffers loaded with get_xpu_data from "encoder.global_cmvn.istd",
+  // "encoder.global_cmvn.mean" and "encoder.pe".
+  const float* cmvn_istd{nullptr};
+  const float* cmvn_mean{nullptr};
+  const float* pe{nullptr};
+  // Not assigned by init_encoder_params(); presumably set by the caller.
+  float* mask{nullptr};
+};
+
+// Hyper-parameters and weight pointers for the attention decoder
+// ("decoder.left_decoder.*").
+//   T  - element type of embed_table and pe.
+//   TW - element type of the quantized fc weights.
+// Populated by init_decoder_params(); the struct does not release any of the
+// buffers it points to.
+template <typename T, typename TW>
+class ConformerDecoderParam {
+ public:
+  // Number of decoder layers and the strides used to index the flat fc /
+  // layer-norm lists per layer.
+  int layer_num;
+  int fc_num_per_layer;
+  int ln_num_per_layer;
+
+  int head_num;
+  int head_dim;
+  int vocab_size;
+  int sos_id;
+  int eos_id;
+  int ignored_id;
+  int beam_size;
+  int max_token_num;
+  int add_sos_num;
+  int ffn_dim;
+
+  // Embedding table, pre-multiplied by sqrt(head_num * head_dim) and
+  // converted to T by init_decoder_params().
+  const T* embed_table{nullptr};
+  // "encoder.pe" converted to T.
+  const T* pe{nullptr};
+  // Per layer: fused self-attention q/k/v, self_attn.linear_out,
+  // src_attn linear_q / linear_k / linear_v / linear_out,
+  // feed_forward w_1, w_2; the last entry is the output layer.
+  std::vector<const TW*> fc_w_list;
+  std::vector<const float*> fc_maxw_list;
+  std::vector<const float*> fc_bias_list;
+  // Per layer: norm1, norm2, norm3; the last entry is after_norm.
+  std::vector<const float*> ln_scale_list;
+  std::vector<const float*> ln_bias_list;
+};
+
+// Multiplies all elements of `data` together, accumulating in an int64_t.
+// An empty vector yields 0.
+template <typename T>
+static int64_t vec_prod(const std::vector<T>& data) {
+  if (data.empty()) {
+    return 0;
+  }
+  auto it = data.begin();
+  int64_t product = *it;
+  for (++it; it != data.end(); ++it) {
+    product *= *it;
+  }
+  return product;
+}
+
+// Collects the `data_` pointer of every quantized tensor, preserving order.
+template <typename T>
+static std::vector<const T*> get_w_list_from(
+    const std::vector<XPUQunatData<T>>& quant_data_list) {
+  std::vector<const T*> weights;
+  weights.reserve(quant_data_list.size());
+  for (const auto& quant : quant_data_list) {
+    weights.push_back(quant.data_);
+  }
+  return weights;
+}
+
+// Collects the `max_ptr_` pointer of every quantized tensor, preserving
+// order.
+template <typename T>
+static std::vector<const float*> get_w_maxptr_list_from(
+    const std::vector<XPUQunatData<T>>& quant_data_list) {
+  std::vector<const float*> max_ptrs;
+  max_ptrs.reserve(quant_data_list.size());
+  for (const auto& quant : quant_data_list) {
+    max_ptrs.push_back(quant.max_ptr_);
+  }
+  return max_ptrs;
+}
+
+// Loads one fully-connected layer.
+//   weights_len_info - lengths keyed by "<fc_name_prefix>weight" / "...bias".
+//   params_dir       - prepended to the prefix to form the file names.
+//   fc_w             - receives the quantized weight.
+//   fc_bias          - receives the bias buffer, or nullptr if !has_bias.
+template <typename TW>
+void get_fc_param(const std::unordered_map<std::string, int>& weights_len_info,
+                  const std::string& params_dir,
+                  const std::string& fc_name_prefix,
+                  XPUQunatData<TW>& fc_w,                         // NOLINT
+                  const float*& fc_bias, bool has_bias = true) {  // NOLINT
+  const std::string file_stem = params_dir + fc_name_prefix;
+  const int weight_len = weights_len_info.at(fc_name_prefix + "weight");
+  fc_w = get_xpu_quant_data<float, TW>(file_stem + "weight", weight_len);
+  if (!has_bias) {
+    fc_bias = nullptr;
+    return;
+  }
+  const int bias_len = weights_len_info.at(fc_name_prefix + "bias");
+  fc_bias = get_xpu_data<float>(file_stem + "bias", bias_len);
+}
+
+// Loads one convolution layer.
+//   weights_len_info - lengths keyed by "<conv_name_prefix>weight" /
+//                      "...bias".
+//   params_dir       - prepended to the prefix to form the file names.
+//   conv_w           - receives the quantized weight.
+//   conv_b           - receives the bias buffer, or nullptr if !has_bias.
+template <typename TW>
+void get_conv_param(
+    const std::unordered_map<std::string, int>& weights_len_info,
+    const std::string& params_dir, const std::string& conv_name_prefix,
+    XPUQunatData<TW>& conv_w, const float*& conv_b,  // NOLINT
+    bool has_bias = true) {                          // NOLINT
+  const std::string file_stem = params_dir + conv_name_prefix;
+  const int weight_len = weights_len_info.at(conv_name_prefix + "weight");
+  conv_w = get_xpu_quant_data<float, TW>(file_stem + "weight", weight_len);
+  if (!has_bias) {
+    conv_b = nullptr;
+    return;
+  }
+  const int bias_len = weights_len_info.at(conv_name_prefix + "bias");
+  conv_b = get_xpu_data<float>(file_stem + "bias", bias_len);
+}
+
+// Concatenates several fully-connected layers (e.g. the q/k/v projections)
+// into one fused layer.
+//   weights_len_info - lengths keyed by "<prefix>weight" / "<prefix>bias".
+//   params_dir       - prepended to each prefix to form the file names.
+//   fc_name_prefixs  - layers to fuse; weights and biases are appended in
+//                      this order.
+//   _fc_w            - receives the quantized fused weight.
+//   _fc_b            - receives the fused bias, or nullptr when has_bias is
+//                      false (same contract as get_fc_param, instead of
+//                      uploading an empty buffer).
+template <typename TW>
+void get_fc_fused_param(
+    const std::unordered_map<std::string, int>& weights_len_info,
+    const std::string& params_dir,
+    const std::vector<std::string>& fc_name_prefixs,
+    XPUQunatData<TW>& _fc_w,                      // NOLINT
+    const float*& _fc_b, bool has_bias = true) {  // NOLINT
+  std::vector<float> fused_w;
+  std::vector<float> fused_b;
+  for (const std::string& prefix : fc_name_prefixs) {
+    const std::string fc_file_prefix = params_dir + prefix;
+    const int wlen = weights_len_info.at(prefix + "weight");
+    const std::vector<float> fc_w =
+        get_cpu_data<float>(fc_file_prefix + "weight", wlen);
+    fused_w.insert(fused_w.end(), fc_w.begin(), fc_w.end());
+    if (has_bias) {
+      const int blen = weights_len_info.at(prefix + "bias");
+      const std::vector<float> fc_b =
+          get_cpu_data<float>(fc_file_prefix + "bias", blen);
+      fused_b.insert(fused_b.end(), fc_b.begin(), fc_b.end());
+    }
+  }
+  _fc_w = get_xpu_quant_data<float, TW>("fused_fc_weight", fused_w);
+  if (has_bias) {
+    _fc_b = get_xpu_data<float>("fused_fc_bias", fused_b);
+  } else {
+    _fc_b = nullptr;
+  }
+}
+
+// Fuses each fully-connected layer with the layer norm that precedes it and
+// concatenates the results into one fused layer.
+//
+// For y = W * (x_hat * ln_scale + ln_bias) + b the folded parameters are
+//   W'[i][j] = W[i][j] * ln_scale[j]
+//   b'[i]    = b[i] + sum_j W[i][j] * ln_bias[j]
+// where W is stored row-major as [row][col] and col is the layer-norm width.
+//
+//   weights_len_info - lengths keyed by "<prefix>weight" / "<prefix>bias".
+//   params_dir       - prepended to each prefix to form the file names.
+//   fc_name_prefixs  - fc layers to fuse, in output order.
+//   ln_name_prefixs  - layer norm paired with each fc layer; must provide at
+//                      least one entry per fc prefix.
+//   _fc_w / _fc_b    - receive the quantized fused weight and fused bias. A
+//                      bias is always produced, even when has_bias is false,
+//                      because the layer-norm bias folds into it.
+// Inconsistent inputs are reported on stdout and terminate the process,
+// matching the error handling of the other loaders in this header.
+template <typename TW>
+void get_fc_ln_fused_param(
+    const std::unordered_map<std::string, int>& weights_len_info,
+    const std::string& params_dir,
+    const std::vector<std::string>& fc_name_prefixs,
+    const std::vector<std::string>& ln_name_prefixs,
+    XPUQunatData<TW>& _fc_w,                      // NOLINT
+    const float*& _fc_b, bool has_bias = true) {  // NOLINT
+  if (ln_name_prefixs.size() < fc_name_prefixs.size()) {
+    std::cout << "ERR: expected one layer norm prefix per fc prefix, but got "
+              << ln_name_prefixs.size() << " for " << fc_name_prefixs.size()
+              << std::endl;
+    std::exit(1);
+  }
+  std::vector<float> fc_ws;
+  std::vector<float> fc_bs;
+  for (size_t ids = 0; ids < fc_name_prefixs.size(); ids++) {
+    // get cpu fc params
+    const std::string fc_file_prefix = params_dir + fc_name_prefixs[ids];
+    int wlen = weights_len_info.at(fc_name_prefixs[ids] + "weight");
+    std::vector<float> fc_w =
+        get_cpu_data<float>(fc_file_prefix + "weight", wlen);
+    std::vector<float> fc_b;
+    if (has_bias) {
+      int blen = weights_len_info.at(fc_name_prefixs[ids] + "bias");
+      fc_b = get_cpu_data<float>(fc_file_prefix + "bias", blen);
+    }
+    // get cpu ln params
+    const std::string ln_file_prefix = params_dir + ln_name_prefixs[ids];
+    wlen = weights_len_info.at(ln_name_prefixs[ids] + "weight");
+    int blen = weights_len_info.at(ln_name_prefixs[ids] + "bias");
+    const std::vector<float> ln_scale =
+        get_cpu_data<float>(ln_file_prefix + "weight", wlen);
+    const std::vector<float> ln_bias =
+        get_cpu_data<float>(ln_file_prefix + "bias", blen);
+    const int col = ln_scale.size();
+    if (col == 0 || ln_bias.size() < ln_scale.size()) {
+      std::cout << "ERR: invalid layer norm params for "
+                << ln_name_prefixs[ids] << std::endl;
+      std::exit(1);
+    }
+    const int row = static_cast<int>(fc_w.size()) / col;
+    if (!has_bias) {
+      // Start from a zero bias so the layer-norm bias can be folded in.
+      fc_b.assign(row, 0.f);
+    } else if (static_cast<int>(fc_b.size()) < row) {
+      std::cout << "ERR: fc bias is shorter than the weight row count for "
+                << fc_name_prefixs[ids] << std::endl;
+      std::exit(1);
+    }
+    for (int i = 0; i < row; i++) {
+      float* w_row = fc_w.data() + i * col;
+      float b = fc_b[i];
+      for (int j = 0; j < col; j++) {
+        // The bias uses the original weight, so accumulate before scaling.
+        b += w_row[j] * ln_bias[j];
+        w_row[j] = w_row[j] * ln_scale[j];
+      }
+      fc_b[i] = b;
+    }
+    fc_ws.insert(fc_ws.end(), fc_w.begin(), fc_w.end());
+    fc_bs.insert(fc_bs.end(), fc_b.begin(), fc_b.end());
+  }
+  _fc_w = get_xpu_quant_data<float, TW>("fused_fc_weight", fc_ws);
+  _fc_b = get_xpu_data<float>("fused_fc_bias", fc_bs);
+}
+
+// Folds a batch norm into the convolution that precedes it.
+//
+// With s[c] = bn_scale[c] / sqrt(bn_var[c] + eps) the fused parameters are
+//   W'[c][k] = W[c][k] * s[c]
+//   b'[c]    = bn_bias[c] + (b[c] - bn_mean[c]) * s[c]
+// where b[c] is taken as 0 when the convolution has no bias.
+//
+//   weights_len_info - lengths keyed by "<prefix>weight", "<prefix>bias",
+//                      "<prefix>running_mean", "<prefix>running_var".
+//   params_dir       - prepended to each prefix to form the file names.
+//   _conv_w/_conv_b  - receive the quantized fused weight and fused bias.
+template <typename TW>
+void get_conv_bn_fused_param(
+    const std::unordered_map<std::string, int>& weights_len_info,
+    const std::string& params_dir, const std::string& conv_name_prefix,
+    const std::string& bn_name_prefix, XPUQunatData<TW>& _conv_w,  // NOLINT
+    const float*& _conv_b, bool has_bias = true) {                 // NOLINT
+  // Convolution parameters on the host.
+  const std::string conv_stem = params_dir + conv_name_prefix;
+  const int conv_w_len = weights_len_info.at(conv_name_prefix + "weight");
+  std::vector<float> weights =
+      get_cpu_data<float>(conv_stem + "weight", conv_w_len);
+  std::vector<float> conv_bias;
+  if (has_bias) {
+    const int conv_b_len = weights_len_info.at(conv_name_prefix + "bias");
+    conv_bias = get_cpu_data<float>(conv_stem + "bias", conv_b_len);
+  }
+  // Batch-norm parameters on the host.
+  const std::string bn_stem = params_dir + bn_name_prefix;
+  const int gamma_len = weights_len_info.at(bn_name_prefix + "weight");
+  const int beta_len = weights_len_info.at(bn_name_prefix + "bias");
+  const int mean_len = weights_len_info.at(bn_name_prefix + "running_mean");
+  const int var_len = weights_len_info.at(bn_name_prefix + "running_var");
+  const std::vector<float> gamma =
+      get_cpu_data<float>(bn_stem + "weight", gamma_len);
+  std::vector<float> fused_bias =
+      get_cpu_data<float>(bn_stem + "bias", beta_len);
+  const std::vector<float> mean =
+      get_cpu_data<float>(bn_stem + "running_mean", mean_len);
+  const std::vector<float> var =
+      get_cpu_data<float>(bn_stem + "running_var", var_len);
+  // Fold the batch norm channel by channel.
+  const int channels = gamma.size();
+  const int per_channel = static_cast<int>(weights.size()) / channels;
+  const float eps = 1e-5f;  // assume eps is 1e-5;
+  for (int c = 0; c < channels; ++c) {
+    const float factor = gamma[c] / std::sqrt(var[c] + eps);
+    for (int k = 0; k < per_channel; ++k) {
+      weights[c * per_channel + k] *= factor;
+    }
+    const float base = has_bias ? conv_bias[c] : 0.f;
+    fused_bias[c] += ((base - mean[c]) * factor);
+  }
+  _conv_w = get_xpu_quant_data<float, TW>("fused_conv_weight", weights);
+  _conv_b = get_xpu_data<float>("fused_conv_bias", fused_bias);
+}
+
+// Reads a tensor dumped as "<prefix>.dat" plus its shape in
+// "<prefix>_shape.txt".
+//
+// The shape file is expected to hold a parenthesised, comma-separated tuple
+// such as "(16, 523, 80)" or "(160, 1)".
+//
+//   data_file_prefix - path without the ".dat" / "_shape.txt" suffix.
+//   shape_ndim       - number of dimensions to parse; must be 1, 2 or 3.
+// Returns the host data together with the parsed shape. Any failure (bad
+// rank, unreadable or malformed shape file, element count that does not fit
+// in an int) is reported on stdout and terminates the process.
+template <typename T>
+static std::tuple<std::vector<T>, std::vector<int>> read_cpu_data_from_file(
+    const std::string& data_file_prefix, int shape_ndim) {
+  // Validate the rank before it is used to size the shape vector; a negative
+  // value would otherwise throw from the vector constructor.
+  if (shape_ndim < 1 || shape_ndim > 3) {
+    std::cout << "ERR: only support shape ndim == 1, 2 or 3, but got "
+              << shape_ndim << std::endl;
+    std::exit(1);
+  }
+  const std::string data_file = data_file_prefix + ".dat";
+  const std::string shape_file = data_file_prefix + "_shape.txt";
+  std::ifstream inF(shape_file);
+  if (!inF) {
+    std::cout << "ERR: open file failed! " << shape_file << std::endl;
+    std::exit(1);
+  }
+  std::vector<int> inshape(shape_ndim, 0);
+  char delimiter;  // receives '(', ',' or ')'; its value is not used
+  inF >> delimiter;
+  for (int d = 0; d < shape_ndim; ++d) {
+    if (!(inF >> inshape[d])) {
+      std::cout << "ERR: parse shape failed! " << shape_file << std::endl;
+      std::exit(1);
+    }
+    // Skip the separator after the number; a missing trailing character is
+    // tolerated because every dimension has already been read.
+    inF >> delimiter;
+  }
+
+  const int64_t total_len = vec_prod(inshape);
+  if (total_len < 0 || total_len > std::numeric_limits<int>::max()) {
+    std::cout << "ERR: invalid element count " << total_len << " in "
+              << shape_file << std::endl;
+    std::exit(1);
+  }
+  std::vector<T> res_data =
+      get_cpu_data<T>(data_file, static_cast<int>(total_len));
+  return std::make_tuple(std::move(res_data), std::move(inshape));
+}
+
+// Reads a tensor with read_cpu_data_from_file() and passes the host data to
+// get_xpu_data(); returns the resulting pointer together with the shape.
+template <typename T>
+static std::tuple<T*, std::vector<int>> read_xpu_data_from_file(
+    const std::string& data_file_prefix, int shape_ndim) {
+  std::vector<T> host_data;
+  std::vector<int> shape;
+  std::tie(host_data, shape) =
+      read_cpu_data_from_file<T>(data_file_prefix, shape_ndim);
+  T* device_data = get_xpu_data<T>(data_file_prefix, host_data);
+  return std::make_tuple(device_data, shape);
+}
+
+// Builds the subsampled padding mask for a batch of utterances and copies it
+// to a buffer obtained from xpu_malloc.
+//
+//   speech_length - valid frame count per utterance; values are clamped to
+//                   [0, max_seqlen] so an over-long entry cannot spill into
+//                   the next row.
+//   max_seqlen    - padded frame count of the batch.
+//   xpu_stream    - stream passed to xpu_wait before the copy.
+// Returns the buffer pointer and its shape {batch, 1, sub_seqlen}, where
+// sub_seqlen = ((max_seqlen - 1) / 2 - 1) / 2. The buffer is not freed here;
+// the caller takes ownership. Runtime failures are reported on stdout and
+// terminate the process.
+template <typename T>
+static std::tuple<T*, std::vector<int>> create_mask_according_speech_length(
+    const std::vector<int>& speech_length, int max_seqlen,
+    void* xpu_stream = nullptr) {
+  const int batch = speech_length.size();
+  // Computed directly rather than as subsample_mask_len / batch, which
+  // divides by zero for an empty batch.
+  const int sub_seqlen = ((max_seqlen - 1) / 2 - 1) / 2;
+  const int mask_len = batch * max_seqlen;
+  const int subsample_mask_len = batch * sub_seqlen;
+  std::vector<T> mask_cpu(mask_len, 0);
+  std::vector<T> subsample_mask_cpu(subsample_mask_len, 0);
+  // create mask, equal to 'masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)'
+  for (int b = 0; b < batch; ++b) {
+    const int curr_seqlen =
+        std::min(std::max(speech_length[b], 0), max_seqlen);
+    for (int idx = 0; idx < curr_seqlen; ++idx) {
+      mask_cpu.at(b * max_seqlen + idx) = 1;
+    }
+  }
+  // create subsample_mask, equal to 'x_mask[:, :, :-2:2][:, :, :-2:2]'
+  for (int b = 0; b < batch; ++b) {
+    for (int idx = 0; idx < sub_seqlen; ++idx) {
+      subsample_mask_cpu.at(b * sub_seqlen + idx) =
+          mask_cpu.at(b * max_seqlen + idx * 4);
+    }
+  }
+  // copy to xpu
+  T* subsample_mask_xpu = nullptr;
+  int r = xpu_malloc(reinterpret_cast<void**>(&subsample_mask_xpu),
+                     subsample_mask_len * sizeof(T));
+  if (r != 0) {
+    std::cout << "ERR: xpu_malloc failed!" << std::endl;
+    std::exit(1);
+  }
+  r = xpu_wait(xpu_stream);
+  if (r != 0) {
+    std::cout << "ERR: xpu_wait failed!" << std::endl;
+    std::exit(1);
+  }
+  r = xpu_memcpy(subsample_mask_xpu, subsample_mask_cpu.data(),
+                 subsample_mask_len * sizeof(T),
+                 XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  if (r != 0) {
+    std::cout << "ERR: xpu_memcpy failed!" << std::endl;
+    std::exit(1);
+  }
+
+  std::vector<int> subsample_mask_shape{batch, 1, sub_seqlen};
+  return std::make_tuple(subsample_mask_xpu, subsample_mask_shape);
+}
+
+// Loads every encoder weight found under `params_dir` and fills
+// `encoder_param`.
+//
+//   params_dir    - directory prefix; it is concatenated directly with the
+//                   file names, so it must already end with a separator.
+//   encoder_param - receives the model dimensions, the weight / bias
+//                   pointers, the per-layer positional embeddings and the
+//                   pos_bias_u / pos_bias_v buffers converted to T.
+// Returns 0 on success. If an xpu_malloc or api:: call made while preparing
+// the derived buffers fails, the failure is printed and its non-zero status
+// is returned (the original overwrote these codes and always returned 0).
+// Buffers allocated here are never freed by this function.
+template <typename T, typename TW>
+int init_encoder_params(
+    const std::string& params_dir,
+    ConformerEncoderParam<T, TW>& encoder_param) {  // NOLINT
+  std::unordered_map<std::string, int> weights_len_info =
+      get_weights_lens(params_dir + "weights_info.txt");
+  std::unordered_map<std::string, std::vector<int>> weights_shape_info =
+      get_weights_shape(params_dir + "weights_info.txt");
+
+  // model struct param
+  auto& head_num = encoder_param.head_num;
+  auto& head_dim = encoder_param.head_dim;
+  auto& ffn_factor = encoder_param.ffn_factor;
+  auto& conv_param = encoder_param.conv_param;
+  auto& emb_param = encoder_param.emb_param;
+  auto& ctc_dim = encoder_param.ctc_dim;
+  auto& encoder_layer_num = encoder_param.layer_num;
+  auto& fc_num_per_layer = encoder_param.fc_num_per_layer;
+  auto& conv_num_per_layer = encoder_param.conv_num_per_layer;
+  auto& ln_num_per_layer = encoder_param.ln_num_per_layer;
+  encoder_layer_num = 12;
+  fc_num_per_layer = 6;
+  conv_num_per_layer = 3;
+  ln_num_per_layer = 6;
+  emb_param.conv_num = 2;
+  emb_param.fc_num = 1;
+  emb_param.embed_dim = 512;
+  ffn_factor =
+      weights_shape_info.at("encoder.encoders.0.feed_forward.w_1.weight")[0] /
+      weights_shape_info.at("encoder.encoders.0.feed_forward.w_1.weight")[1];
+  head_dim =
+      weights_shape_info.at("encoder.encoders.0.self_attn.pos_bias_u")[1];
+  head_num =
+      weights_shape_info.at("encoder.encoders.0.self_attn.pos_bias_u")[0];
+  conv_param.kernel_size = weights_shape_info.at(
+      "encoder.encoders.0.conv_module.depthwise_conv.weight")[2];
+  conv_param.lorder = conv_param.kernel_size - 1;
+  conv_param.padding = 0.0;
+  conv_param.is_casual = true;
+  ctc_dim = weights_len_info.at("ctc.ctc_lo.bias");
+  encoder_param.beam_size = 3;
+
+  // init encoder cmvn
+  auto& pe = encoder_param.pe;
+  auto& cmvn_istd = encoder_param.cmvn_istd;
+  auto& cmvn_mean = encoder_param.cmvn_mean;
+  int pe_len = weights_len_info.at("encoder.pe");
+  int mlen = weights_len_info.at("encoder.global_cmvn.mean");
+  int ilen = weights_len_info.at("encoder.global_cmvn.istd");
+  pe = get_xpu_data<float>(params_dir + "encoder.pe", pe_len);
+  cmvn_mean =
+      get_xpu_data<float>(params_dir + "encoder.global_cmvn.mean", mlen);
+  cmvn_istd =
+      get_xpu_data<float>(params_dir + "encoder.global_cmvn.istd", ilen);
+
+  // init encoder embedding param
+  std::vector<XPUQunatData<TW>> emb_conv_w_list;
+  auto& emb_conv_bias_list = encoder_param.emb_conv_bias_list;
+  std::vector<XPUQunatData<TW>> emb_fc_w_list;
+  auto& emb_fc_bias_list = encoder_param.emb_fc_bias_list;
+  emb_conv_w_list.resize(emb_param.conv_num);
+  emb_conv_bias_list.resize(emb_param.conv_num);
+  emb_fc_w_list.resize(emb_param.fc_num);
+  emb_fc_bias_list.resize(emb_param.fc_num);
+  for (int i = 0; i < emb_param.conv_num; ++i) {
+    // The convolutions are stored as "encoder.embed.conv.0." and ".2.".
+    std::string conv_name_prefix =
+        "encoder.embed.conv." + std::to_string(i * 2) + ".";
+    get_conv_param<TW>(weights_len_info, params_dir, conv_name_prefix,
+                       emb_conv_w_list[i], emb_conv_bias_list[i]);
+  }
+  get_fc_param<TW>(weights_len_info, params_dir, "encoder.embed.out.0.",
+                   emb_fc_w_list[0], emb_fc_bias_list[0]);
+
+  // encoder_param_layer
+  // The "+ 1" slots hold ctc.ctc_lo (fc) and encoder.after_norm (ln).
+  int enc_fc_num = encoder_layer_num * fc_num_per_layer + 1;
+  int enc_conv_num = encoder_layer_num * conv_num_per_layer;
+  int enc_ln_num = encoder_layer_num * ln_num_per_layer + 1;
+
+  std::vector<XPUQunatData<TW>> fc_w_list;
+  auto& fc_bias_list = encoder_param.fc_bias_list;
+
+  std::vector<XPUQunatData<TW>> conv_w_list;
+  auto& conv_bias_list = encoder_param.conv_bias_list;
+
+  auto& ln_scale_list = encoder_param.ln_scale_list;
+  auto& ln_bias_list = encoder_param.ln_bias_list;
+
+  std::vector<XPUQunatData<TW>> attn_pos_w_list;
+  std::vector<const float*> attn_pos_uv_bias_list;
+  // w_param need to be quanted & get maxw
+  fc_w_list.resize(enc_fc_num);
+  fc_bias_list.resize(enc_fc_num);
+  conv_w_list.resize(enc_conv_num);
+  conv_bias_list.resize(enc_conv_num);
+  ln_scale_list.resize(enc_ln_num);
+  ln_bias_list.resize(enc_ln_num);
+  attn_pos_w_list.resize(encoder_layer_num);
+  attn_pos_uv_bias_list.resize(encoder_layer_num *
+                               2);  // pos_bias_u, pos_bias_v
+  for (int i = 0; i < encoder_layer_num; ++i) {
+    std::string enc_prefix = "encoder.encoders." + std::to_string(i) + ".";
+    int fc_offset = i * fc_num_per_layer;
+    int conv_offset = i * conv_num_per_layer;
+    int ln_offset = i * ln_num_per_layer;
+    // init FeedForwardParam macaron
+    get_fc_param<TW>(weights_len_info, params_dir,
+                     enc_prefix + "feed_forward_macaron.w_1.",
+                     fc_w_list[fc_offset], fc_bias_list[fc_offset]);
+    get_fc_param<TW>(weights_len_info, params_dir,
+                     enc_prefix + "feed_forward_macaron.w_2.",
+                     fc_w_list[fc_offset + 1], fc_bias_list[fc_offset + 1]);
+    // q, k and v projections are concatenated into one fused fc.
+    get_fc_fused_param<TW>(
+        weights_len_info, params_dir,
+        {enc_prefix + "self_attn.linear_q.", enc_prefix + "self_attn.linear_k.",
+         enc_prefix + "self_attn.linear_v."},
+        fc_w_list[fc_offset + 2], fc_bias_list[fc_offset + 2]);
+    get_fc_param<TW>(
+        weights_len_info, params_dir, enc_prefix + "self_attn.linear_out.",
+        fc_w_list[fc_offset + 3], fc_bias_list[fc_offset + 3], true);
+    // get pos w, pos u bias, pos v bias
+    std::string pos_w_name = enc_prefix + "self_attn.linear_pos.weight";
+    std::string pos_ubias_name = enc_prefix + "self_attn.pos_bias_u";
+    std::string pos_vbias_name = enc_prefix + "self_attn.pos_bias_v";
+    int pos_wlen = weights_len_info.at(pos_w_name);
+    int pos_ublen = weights_len_info.at(pos_ubias_name);
+    int pos_vblen = weights_len_info.at(pos_vbias_name);
+    attn_pos_w_list[i] =
+        get_xpu_quant_data<float, TW>(params_dir + pos_w_name, pos_wlen);
+    attn_pos_uv_bias_list[i * 2] =
+        get_xpu_data<float>(params_dir + pos_ubias_name, pos_ublen);
+    attn_pos_uv_bias_list[i * 2 + 1] =
+        get_xpu_data<float>(params_dir + pos_vbias_name, pos_vblen);
+    // init ConvModuleParam
+    get_conv_param<TW>(weights_len_info, params_dir,
+                       enc_prefix + "conv_module.pointwise_conv1.",
+                       conv_w_list[conv_offset], conv_bias_list[conv_offset],
+                       true);
+    get_conv_param<TW>(weights_len_info, params_dir,
+                       enc_prefix + "conv_module.depthwise_conv.",
+                       conv_w_list[conv_offset + 1],
+                       conv_bias_list[conv_offset + 1], true);
+    get_conv_param<TW>(weights_len_info, params_dir,
+                       enc_prefix + "conv_module.pointwise_conv2.",
+                       conv_w_list[conv_offset + 2],
+                       conv_bias_list[conv_offset + 2], true);
+    // init FeedForwardParam
+    get_fc_param<TW>(weights_len_info, params_dir,
+                     enc_prefix + "feed_forward.w_1.", fc_w_list[fc_offset + 4],
+                     fc_bias_list[fc_offset + 4]);
+    get_fc_param<TW>(weights_len_info, params_dir,
+                     enc_prefix + "feed_forward.w_2.", fc_w_list[fc_offset + 5],
+                     fc_bias_list[fc_offset + 5]);
+    // init LayerNormParam
+    get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_ff_macaron.",
+                 ln_scale_list[ln_offset], ln_bias_list[ln_offset]);
+    get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_mha.",
+                 ln_scale_list[ln_offset + 1], ln_bias_list[ln_offset + 1]);
+    get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_conv.",
+                 ln_scale_list[ln_offset + 2], ln_bias_list[ln_offset + 2]);
+    get_ln_param(weights_len_info, params_dir, enc_prefix + "conv_module.norm.",
+                 ln_scale_list[ln_offset + 3], ln_bias_list[ln_offset + 3]);
+    get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_ff.",
+                 ln_scale_list[ln_offset + 4], ln_bias_list[ln_offset + 4]);
+    get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_final.",
+                 ln_scale_list[ln_offset + 5], ln_bias_list[ln_offset + 5]);
+  }
+  get_ln_param(weights_len_info, params_dir, "encoder.after_norm.",
+               ln_scale_list[enc_ln_num - 1], ln_bias_list[enc_ln_num - 1]);
+  get_fc_param<TW>(weights_len_info, params_dir, "ctc.ctc_lo.",
+                   fc_w_list[enc_fc_num - 1], fc_bias_list[enc_fc_num - 1]);
+  /* get maxw && w */
+  encoder_param.emb_conv_w_list = get_w_list_from<TW>(emb_conv_w_list);
+  encoder_param.emb_conv_maxw_list =
+      get_w_maxptr_list_from<TW>(emb_conv_w_list);
+  encoder_param.emb_fc_w_list = get_w_list_from<TW>(emb_fc_w_list);
+  encoder_param.emb_fc_maxw_list = get_w_maxptr_list_from<TW>(emb_fc_w_list);
+
+  encoder_param.conv_w_list = get_w_list_from<TW>(conv_w_list);
+  encoder_param.conv_maxw_list = get_w_maxptr_list_from<TW>(conv_w_list);
+
+  encoder_param.fc_w_list = get_w_list_from<TW>(fc_w_list);
+  encoder_param.fc_maxw_list = get_w_maxptr_list_from<TW>(fc_w_list);
+
+  encoder_param.attn_pos_w_list_ = get_w_list_from<TW>(attn_pos_w_list);
+  encoder_param.attn_pos_maxw_list =
+      get_w_maxptr_list_from<TW>(attn_pos_w_list);
+  /* prepare params */
+  api::Context ctx_xpu(api::kXPU2);
+  api::ctx_guard RAII_GUARD(&ctx_xpu);
+  // Prints a message for a non-zero status and reports whether it failed.
+  const auto failed = [](int status, const char* what) {
+    if (status != 0) {
+      std::cout << "ERR: " << what
+                << " failed in init_encoder_params, ret = " << status
+                << std::endl;
+    }
+    return status != 0;
+  };
+  int ret = 0;
+  int hidden_dim = head_num * head_dim;
+  // Precompute linear_pos(pe) per layer; 5000 rows of `pe` are projected.
+  // NOTE(review): 5000 is hard-coded and assumes "encoder.pe" holds at least
+  // 5000 * hidden_dim values - confirm against the exported weights.
+  encoder_param.pos_emb.resize(encoder_layer_num);
+  for (int i = 0; i < encoder_layer_num; i++) {
+    ret = xpu_malloc((void**)&(encoder_param.pos_emb[i]),  // NOLINT
+                     5000 * hidden_dim * sizeof(T));
+    if (failed(ret, "xpu_malloc(pos_emb)")) {
+      return ret;
+    }
+    ret = api::fc_fusion<float, TW, T, int16_t>(
+        &ctx_xpu, encoder_param.pe, encoder_param.attn_pos_w_list_[i],
+        const_cast<T*>(encoder_param.pos_emb[i]), 5000, hidden_dim, hidden_dim,
+        false, true, nullptr, encoder_param.attn_pos_maxw_list[i], nullptr,
+        hidden_dim, hidden_dim, hidden_dim, 1.0f, 0.0f, nullptr,
+        api::Activation_t::LINEAR);
+    if (failed(ret, "fc_fusion(pos_emb)")) {
+      return ret;
+    }
+  }
+  // Halve, in place, the biases of feed_forward_macaron.w_2 (slot 1) and
+  // feed_forward.w_2 (slot 5) of every layer.
+  for (int i = 0; i < encoder_layer_num; i++) {
+    ret = api::scale<float>(
+        &ctx_xpu, encoder_param.fc_bias_list[i * fc_num_per_layer + 1],
+        const_cast<float*>(
+            encoder_param.fc_bias_list[i * fc_num_per_layer + 1]),
+        hidden_dim, true, 0.5f, 0.0f);
+    if (failed(ret, "scale(feed_forward_macaron.w_2.bias)")) {
+      return ret;
+    }
+    ret = api::scale<float>(
+        &ctx_xpu, encoder_param.fc_bias_list[i * fc_num_per_layer + 5],
+        const_cast<float*>(
+            encoder_param.fc_bias_list[i * fc_num_per_layer + 5]),
+        hidden_dim, true, 0.5f, 0.0f);
+    if (failed(ret, "scale(feed_forward.w_2.bias)")) {
+      return ret;
+    }
+  }
+  // Convert pos_bias_u / pos_bias_v from float to T.
+  for (size_t i = 0; i < attn_pos_uv_bias_list.size(); i++) {
+    T* tmppos = nullptr;
+    ret = xpu_malloc(reinterpret_cast<void**>(&tmppos), hidden_dim * sizeof(T));
+    if (failed(ret, "xpu_malloc(pos_bias)")) {
+      return ret;
+    }
+    ret = api::cast_v2<float, T>(&ctx_xpu, attn_pos_uv_bias_list[i], tmppos,
+                                 hidden_dim);
+    if (failed(ret, "cast_v2(pos_bias)")) {
+      return ret;
+    }
+    encoder_param.attn_pos_uv_bias_list.push_back(tmppos);
+  }
+  return 0;
+}
+
+// Loads every "decoder.left_decoder.*" weight found under `params_dir` and
+// fills `decoder_param`.
+//
+//   params_dir    - directory prefix; it is concatenated directly with the
+//                   file names, so it must already end with a separator.
+//   decoder_param - receives the (hard-coded) model dimensions, the embedding
+//                   table and positional encoding converted to T, and the
+//                   fc / layer-norm pointers.
+// Always returns 0.
+template <typename T, typename TW>
+int init_decoder_params(
+    const std::string& params_dir,
+    ConformerDecoderParam<T, TW>& decoder_param) {  // NOLINT
+  std::unordered_map<std::string, int> weights_len_info =
+      get_weights_lens(params_dir + "weights_info.txt");
+
+  // Model dimensions.
+  decoder_param.layer_num = 3;
+  decoder_param.fc_num_per_layer = 8;
+  decoder_param.ln_num_per_layer = 3;
+  decoder_param.head_num = 8;
+  decoder_param.head_dim = 64;
+  decoder_param.vocab_size = 5538;
+  decoder_param.sos_id = 5537;
+  decoder_param.eos_id = 5537;
+  decoder_param.ignored_id = 2;
+  decoder_param.beam_size = 3;
+  decoder_param.max_token_num = 200;
+  decoder_param.add_sos_num = 1;
+  decoder_param.ffn_dim = 2048;
+
+  const int num_layers = decoder_param.layer_num;
+  const int fcs_per_layer = decoder_param.fc_num_per_layer;
+  const int lns_per_layer = decoder_param.ln_num_per_layer;
+  // The extra slot holds the output layer (fc) / after_norm (ln).
+  const int total_fc = num_layers * fcs_per_layer + 1;
+  const int total_ln = num_layers * lns_per_layer + 1;
+
+  std::vector<XPUQunatData<TW>> quant_fc_weights(total_fc);
+  decoder_param.fc_bias_list.resize(total_fc);
+  decoder_param.ln_scale_list.resize(total_ln);
+  decoder_param.ln_bias_list.resize(total_ln);
+
+  // Embedding table, pre-multiplied by sqrt(attention dim).
+  const auto att_dim = decoder_param.head_num * decoder_param.head_dim;
+  const std::string embed_table_name = "decoder.left_decoder.embed.0.weight";
+  const std::vector<float> embed_host = get_cpu_data<float>(
+      params_dir + embed_table_name, weights_len_info.at(embed_table_name));
+  const auto embed_scale = std::sqrt(att_dim);
+  std::vector<T> embed_host_t;
+  embed_host_t.reserve(embed_host.size());
+  for (const float value : embed_host) {
+    embed_host_t.push_back(static_cast<T>(value * embed_scale));
+  }
+  decoder_param.embed_table = get_xpu_data<T>(embed_table_name, embed_host_t);
+
+  // Positional encoding, converted to T.
+  const std::string pe_name = "encoder.pe";
+  const std::vector<float> pe_host =
+      get_cpu_data<float>(params_dir + pe_name, weights_len_info.at(pe_name));
+  std::vector<T> pe_host_t;
+  pe_host_t.reserve(pe_host.size());
+  for (const float value : pe_host) {
+    pe_host_t.push_back(static_cast<T>(value));
+  }
+  decoder_param.pe = get_xpu_data<T>(pe_name, pe_host_t);
+
+  // Slots 1..7 of every layer, in order; slot 0 is the fused q/k/v fc.
+  static const char* const kSingleFcNames[] = {
+      "self_attn.linear_out.", "src_attn.linear_q.",   "src_attn.linear_k.",
+      "src_attn.linear_v.",    "src_attn.linear_out.", "feed_forward.w_1.",
+      "feed_forward.w_2."};
+  static const char* const kLayerNormNames[] = {"norm1.", "norm2.", "norm3."};
+
+  for (int layer = 0; layer < num_layers; ++layer) {
+    const std::string dec_prefix =
+        "decoder.left_decoder.decoders." + std::to_string(layer) + ".";
+    const int fc_base = layer * fcs_per_layer;
+    // self attention qkv fc
+    get_fc_fused_param<TW>(weights_len_info, params_dir,
+                           {
+                               dec_prefix + "self_attn.linear_q.",
+                               dec_prefix + "self_attn.linear_k.",
+                               dec_prefix + "self_attn.linear_v.",
+                           },
+                           quant_fc_weights[fc_base],
+                           decoder_param.fc_bias_list[fc_base], true);
+    for (int k = 0; k < 7; ++k) {
+      const int slot = fc_base + 1 + k;
+      get_fc_param<TW>(weights_len_info, params_dir,
+                       dec_prefix + kSingleFcNames[k], quant_fc_weights[slot],
+                       decoder_param.fc_bias_list[slot], true);
+    }
+    // init ln param
+    const int ln_base = layer * lns_per_layer;
+    for (int k = 0; k < 3; ++k) {
+      get_ln_param(weights_len_info, params_dir,
+                   dec_prefix + kLayerNormNames[k],
+                   decoder_param.ln_scale_list[ln_base + k],
+                   decoder_param.ln_bias_list[ln_base + k]);
+    }
+  }
+  // init after ln
+  get_ln_param(weights_len_info, params_dir, "decoder.left_decoder.after_norm.",
+               decoder_param.ln_scale_list[total_ln - 1],
+               decoder_param.ln_bias_list[total_ln - 1]);
+  // init output layer fc
+  get_fc_param<TW>(weights_len_info, params_dir,
+                   "decoder.left_decoder.output_layer.",
+                   quant_fc_weights[total_fc - 1],
+                   decoder_param.fc_bias_list[total_fc - 1], true);
+  decoder_param.fc_w_list = get_w_list_from<TW>(quant_fc_weights);
+  decoder_param.fc_maxw_list = get_w_maxptr_list_from<TW>(quant_fc_weights);
+  return 0;
+}
+
+// Pads the concatenated hypotheses in `hyps` into a dense
+// [beam_size, max_target_len] matrix, filling unused positions with eos_id.
+//
+//   hyps      - in: all hypotheses back to back; out: the padded matrix,
+//               row-major.
+//   hyps_len  - length of each hypothesis, in the order they appear in hyps.
+//   beam_size - number of rows to produce.
+//   eos_id    - padding token.
+// Returns max_target_len, the largest entry of hyps_len (0 if it is empty).
+// Rows with no corresponding hyps_len entry, and tokens missing from a
+// too-short hyps, are filled with eos_id instead of being read out of
+// bounds.
+static int padding_target(std::vector<int>& hyps,      // NOLINT
+                          std::vector<int>& hyps_len,  // NOLINT
+                          int beam_size, int eos_id) {
+  int max_target_len = 0;
+  if (!hyps_len.empty()) {
+    max_target_len =
+        std::max(0, *std::max_element(hyps_len.begin(), hyps_len.end()));
+  }
+  const size_t rows = beam_size > 0 ? static_cast<size_t>(beam_size) : 0;
+  const size_t cols = static_cast<size_t>(max_target_len);
+  std::vector<int> pad(rows * cols, eos_id);
+  size_t offset = 0;
+  for (size_t i = 0; i < rows && i < hyps_len.size(); ++i) {
+    const size_t len = hyps_len[i] > 0 ? static_cast<size_t>(hyps_len[i]) : 0;
+    for (size_t j = 0; j < len && offset + j < hyps.size(); ++j) {
+      pad[i * cols + j] = hyps[offset + j];
+    }
+    offset += len;
+  }
+  hyps.swap(pad);
+  return max_target_len;
+}
+
+namespace xpu {
+namespace wenet {
+
+// Conformer encoder forward pass. Only declared here; the definition lives
+// in a separate translation unit.
+//   T / TW / TGEMM - activation, weight and GEMM element types.
+//   x, data_shape  - input features and their shape.
+//   encoder_out    - output buffer for the encoder states.
+//   ctc_probs      - output buffer for the CTC scores.
+//   xpu_mask_info  - mask pointer and shape, in the form returned by
+//                    create_mask_according_speech_length().
+// NOTE(review): buffer sizes and layouts are not visible from this header -
+// confirm against the definition.
+template <typename T, typename TW, typename TGEMM>
+int conformer_encoder_wenet(
+    api::Context* ctx, float* x, const std::vector<int>& data_shape,
+    T* encoder_out, T* ctc_probs,
+    ConformerEncoderParam<T, TW>& param,  // NOLINT
+    const std::tuple<float*, std::vector<int>>& xpu_mask_info);
+// CTC prefix beam search over `ctc_probs`. Only declared here.
+// hyps, hyps_len and ctc_scores are output parameters; presumably the
+// concatenated hypotheses, their lengths and their scores - confirm against
+// the definition.
+template <typename T>
+int ctc_prefix_beamsearch(api::Context* ctx, T* ctc_probs,
+                          std::vector<int>& hyps,          // NOLINT
+                          std::vector<int>& hyps_len,      // NOLINT
+                          std::vector<float>& ctc_scores,  // NOLINT
+                          int batch_size, int beam_size, int max_len,
+                          int ctc_dim);
+
+// Attention decoder forward pass. Only declared here; the definition lives
+// in a separate translation unit.
+//   x, x_shape                  - encoder output and its shape.
+//   x_mask                      - encoder mask.
+//   padded_target, target_shape - padded hypotheses (see padding_target())
+//                                 and their shape.
+//   character_scores            - output buffer for the per-token scores.
+template <typename T, typename TW, typename TGEMM>
+int conformer_decoder_wenet(api::Context* ctx, const T* x,
+                            const std::vector<int32_t>& x_shape,
+                            const float* x_mask, const int* padded_target,
+                            const std::vector<int32_t>& target_shape,
+                            float* character_scores,
+                            const ConformerDecoderParam<T, TW>& param);
+}  // namespace wenet
+}  // namespace xpu
diff --git a/runtime/kunlun/xpu/xpu_util.cpp b/runtime/kunlun/xpu/xpu_util.cpp
new file mode 100644
index 000000000..b18cd12b7
--- /dev/null
+++ b/runtime/kunlun/xpu/xpu_util.cpp
@@ -0,0 +1,491 @@
+// Copyright (c) 2022 KUNLUNXIN Inc.
+//               2022 Han Qi (qihan@baidu.com)
+//                    Hehe Pan (panhehe@baidu.com)
+//                    Zikui Yan (yanzikui@baidu.com)
+//                    Chaolin Li (lichaolin@baidu.com)
+// All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "xpu_util.h"  // NOLINT
+
+// Returns the sum of all elements of `data`, accumulated as a double.
+// Each element is converted with static_cast<double> before being added.
+template <typename T>
+static double vec_sum(const std::vector<T>& data) {
+  double total = 0;
+  for (const T& value : data) {
+    total += static_cast<double>(value);
+  }
+  return total;
+}
+
+// Returns the product of all entries of `shape`; an empty shape yields 1.
+// The multiplication is done in plain int, exactly like a hand-written loop.
+int vector_prod(std::vector<int> shape) {
+  return std::accumulate(shape.begin(), shape.end(), 1,
+                         [](int lhs, int rhs) { return lhs * rhs; });
+}
+// Makes sure a non-empty path string ends with '/', appending one if needed.
+//
+// An empty string is left untouched: indexing the last character of an empty
+// string is undefined behaviour, and turning "" into "/" would silently
+// redirect a relative path to the filesystem root.
+void add_separator_when_necessary(std::string& str) {  // NOLINT
+  const char separator = '/';
+  if (!str.empty() && str.back() != separator) {
+    str.push_back(separator);
+  }
+}
+
+// Formats a short preview of `data` for debug logging.
+//
+// At most 8 elements are shown: the first dump_len / 2 and the remaining
+// ones taken from the end of the vector, separated by ", ". A " ... " marker
+// is emitted after the leading half. An empty vector yields an empty string
+// (the previous code indexed element -1 in that case).
+template <typename T>
+static std::string print_vec(const std::vector<T>& data) {
+  std::stringstream ss;
+  if (data.empty()) {
+    return ss.str();
+  }
+  const int dump_len = data.size() > 8 ? 8 : static_cast<int>(data.size());
+  const int head_len = dump_len / 2;
+  const int tail_len = dump_len - head_len;
+  // Gather the leading and trailing slices into one contiguous preview.
+  std::vector<T> dump_data(data.cbegin(), data.cbegin() + head_len);
+  dump_data.insert(dump_data.end(), data.cend() - tail_len, data.cend());
+  for (int i = 0; i < dump_len - 1; ++i) {
+    ss << dump_data[i] << ", ";
+    if ((i + 1) == head_len) {
+      ss << " ... ";
+    }
+  }
+  ss << dump_data[dump_len - 1];
+  return ss.str();
+}
+
+// Converts one textual token into a value of type T.
+//
+// The primary template simply returns the token itself, so it only works for
+// types that are implicitly constructible from std::string. Numeric types
+// are handled by the explicit specialisations below, which forward to the
+// matching std::sto* function (and therefore throw on unparsable input).
+template <typename T>
+static T parse_string(const std::string& str) {
+  return str;
+}
+
+// float: parsed with std::stof.
+template <>
+float parse_string<float>(const std::string& str) {
+  const float parsed = std::stof(str);
+  return parsed;
+}
+// double: parsed with std::stod.
+template <>
+double parse_string<double>(const std::string& str) {
+  const double parsed = std::stod(str);
+  return parsed;
+}
+// int: parsed with std::stoi.
+template <>
+int parse_string<int>(const std::string& str) {
+  const int parsed = std::stoi(str);
+  return parsed;
+}
+// int64_t: parsed with std::stoll.
+template <>
+int64_t parse_string<int64_t>(const std::string& str) {
+  return std::stoll(str);
+}
+
+// Splits `str` on `separator` and converts every token with
+// parse_string<T>.
+//
+// Leading separator characters and runs of separators are skipped, so no
+// empty tokens are produced. Token ends are located with
+// std::string::find(separator), while token starts are located with
+// find_first_not_of(separator); the two agree for the single-character
+// separators used in this file.
+//
+// An empty `separator` cannot split anything: the whole string is returned
+// as a single token (or nothing for an empty string). Previously this case
+// never terminated, because find("") matches without advancing.
+template <typename T>
+std::vector<T> Split(const std::string& str, const std::string& separator) {
+  std::vector<T> res;
+  if (separator.empty()) {
+    if (!str.empty()) {
+      res.emplace_back(parse_string<T>(str));
+    }
+    return res;
+  }
+  std::string::size_type token_begin = str.find_first_not_of(separator);
+  while (std::string::npos != token_begin) {
+    const std::string::size_type token_end = str.find(separator, token_begin);
+    if (std::string::npos == token_end) {
+      // Last token: runs to the end of the string.
+      res.emplace_back(parse_string<T>(str.substr(token_begin)));
+      break;
+    }
+    res.emplace_back(
+        parse_string<T>(str.substr(token_begin, token_end - token_begin)));
+    token_begin = str.find_first_not_of(separator, token_end);
+  }
+  return res;
+}
+
+// Reads a weight description file and returns a map from weight name to its
+// element count.
+//
+// Each line is split on ':'; field 0 is used as the weight name and field 3
+// is parsed as the length with std::stoi. Blank lines are skipped. A line
+// with fewer than four fields, or a file that cannot be opened, is reported
+// on stdout and terminates the process with exit code 1 (previously a short
+// or blank line indexed past the end of the token vector).
+std::unordered_map<std::string, int> get_weights_lens(
+    const std::string& file_path) {
+  std::unordered_map<std::string, int> res;
+  std::ifstream inF(file_path, std::ifstream::in);
+  if (!inF) {
+    std::cout << "ERR: read failed, " << file_path << std::endl;
+    std::exit(1);
+  }
+
+  std::string buffer;
+  while (std::getline(inF, buffer)) {
+    const std::vector<std::string> weight_info =
+        Split<std::string>(buffer, ":");
+    if (weight_info.empty()) {
+      continue;  // blank line (or only separators): nothing to record
+    }
+    if (weight_info.size() < 4) {
+      std::cout << "ERR: malformed line in " << file_path << ": " << buffer
+                << std::endl;
+      std::exit(1);
+    }
+    const std::string& w_name = weight_info[0];
+    const int w_len = std::stoi(weight_info[3]);
+    res.insert(std::make_pair(w_name, w_len));
+  }
+
+  return res;
+}
+
+// Reads a weight description file and returns a map from weight name to its
+// shape.
+//
+// Each line is split on ':'; field 0 is used as the weight name and field 2
+// is the shape text, e.g. "(512, 1, 3, 3)". The first and last characters of
+// that field (the brackets) are dropped and the remainder is split on ','
+// into ints. Blank lines are skipped. A line with fewer than three fields, a
+// shape field shorter than two characters, or a file that cannot be opened
+// is reported on stdout and terminates the process with exit code 1
+// (previously such lines caused out-of-range accesses).
+std::unordered_map<std::string, std::vector<int>> get_weights_shape(
+    const std::string& file_path) {
+  std::unordered_map<std::string, std::vector<int>> res;
+  std::ifstream inF(file_path, std::ifstream::in);
+  if (!inF) {
+    std::cout << "ERR: read failed, " << file_path << std::endl;
+    std::exit(1);
+  }
+
+  std::string buffer;
+  while (std::getline(inF, buffer)) {
+    const std::vector<std::string> weight_info =
+        Split<std::string>(buffer, ":");
+    if (weight_info.empty()) {
+      continue;  // blank line (or only separators): nothing to record
+    }
+    // The bracket stripping below needs at least "(" and ")".
+    if (weight_info.size() < 3 || weight_info[2].size() < 2) {
+      std::cout << "ERR: malformed line in " << file_path << ": " << buffer
+                << std::endl;
+      std::exit(1);
+    }
+    const std::string& w_name = weight_info[0];
+    const std::string& w_shape_str = weight_info[2];  // (512, 1, 3, 3)
+    const std::string w_shape_str_without_bracket(
+        w_shape_str.begin() + 1,
+        w_shape_str.end() - 1);  // 512, 1, 3, 3
+    const std::vector<int> w_shape =
+        Split<int>(w_shape_str_without_bracket, ",");
+    res.insert(std::make_pair(w_name, w_shape));
+  }
+
+  return res;
+}
+
+// Reads `len` elements of type T as raw bytes from the binary file
+// `file_path` and returns them as a host vector.
+//
+// If the file cannot be opened, or fewer than len * sizeof(T) bytes can be
+// read, an error is printed to stdout and the process exits with code 1.
+template <typename T>
+std::vector<T> get_cpu_data(const std::string& file_path, int len) {
+  std::vector<T> host_buf(len, 0);
+  std::ifstream stream(file_path, std::ifstream::binary);
+  if (!stream) {
+    std::cout << "ERR: std::ifstream init failed! " << file_path << std::endl;
+    std::exit(1);
+  }
+  char* raw_bytes = reinterpret_cast<char*>(host_buf.data());
+  stream.read(raw_bytes, len * sizeof(T));
+  // A short read leaves the stream in a failed state.
+  if (!stream) {
+    std::cout << "ERR: something wrong: " << file_path << ", len=" << len
+              << std::endl;
+    std::exit(1);
+  }
+  return host_buf;
+}
+
+// Explicit instantiations for the element types that are loaded from disk.
+template std::vector<float> get_cpu_data<float>(const std::string&, int len);
+template std::vector<float16> get_cpu_data<float16>(const std::string&,
+                                                    int len);
+template std::vector<int64_t> get_cpu_data<int64_t>(const std::string&,
+                                                    int len);
+template std::vector<int> get_cpu_data<int>(const std::string&, int len);
+
+// Copies `cpu_data` into a newly xpu_malloc'ed buffer and returns the
+// pointer to that buffer.
+//
+// `data_name` is only used in log and error messages. Any failing XPU
+// runtime call (xpu_malloc, xpu_wait, xpu_memcpy) prints an error to stdout
+// and exits the process with code 1. The returned buffer is not released by
+// this function.
+template <typename T>
+T* get_xpu_data(const std::string& data_name, const std::vector<T>& cpu_data) {
+  const int count = cpu_data.size();
+#ifdef TEST_DEBUG
+  std::cout << "DEBUG: file_path=" << data_name << ", len=" << count
+            << ", vec_sum=" << vec_sum(cpu_data)
+            << ", details: " << print_vec(cpu_data) << std::endl;
+#endif
+
+  T* device_ptr = nullptr;
+  int status =
+      xpu_malloc(reinterpret_cast<void**>(&device_ptr), count * sizeof(T));
+  if (status != 0) {
+    std::cout << "ERR: xpu_malloc failed! " << data_name << std::endl;
+    std::exit(1);
+  }
+
+  status = xpu_wait();
+  if (status != 0) {
+    std::cout << "ERR: xpu_wait failed!" << std::endl;
+    std::exit(1);
+  }
+
+  status = xpu_memcpy(device_ptr, cpu_data.data(), count * sizeof(T),
+                      XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  if (status != 0) {
+    std::cout << "ERR: xpu_memcpy failed! " << data_name << std::endl;
+    std::exit(1);
+  }
+
+#ifdef TEST_DEBUG
+  std::cout << "DEBUG: xpu_data=" << device_ptr << std::endl;
+#endif
+
+  return device_ptr;
+}
+
+// Explicit instantiations for the supported element types.
+template float* get_xpu_data(const std::string&, const std::vector<float>&);
+template float16* get_xpu_data(const std::string&, const std::vector<float16>&);
+template int64_t* get_xpu_data(const std::string&, const std::vector<int64_t>&);
+template int* get_xpu_data(const std::string&, const std::vector<int>&);
+
+// Convenience overload: reads `len` elements of T from the binary file
+// `file_path` with get_cpu_data and uploads them with the vector overload of
+// get_xpu_data. The file path doubles as the name used in log messages.
+template <typename T>
+T* get_xpu_data(const std::string& file_path, int len) {
+  const std::vector<T> host_copy = get_cpu_data<T>(file_path, len);
+  T* device_ptr = get_xpu_data<T>(file_path, host_copy);
+  return device_ptr;
+}
+
+// Explicit instantiations for the supported element types.
+template float* get_xpu_data<float>(const std::string&, int);
+template float16* get_xpu_data<float16>(const std::string&, int);
+template int64_t* get_xpu_data<int64_t>(const std::string&, int);
+template int* get_xpu_data<int>(const std::string&, int);
+
+// Quantizes `cpu_data` from TX to TY on the host by running
+// api::quantization on a CPU context (api::kCPU), with a null last argument.
+// A non-zero status prints an error to stdout and exits the process with
+// code 1.
+template <typename TX, typename TY>
+std::vector<TY> quant_cpu(const std::vector<TX>& cpu_data) {
+  const int count = cpu_data.size();
+  std::vector<TY> quantized(count, 0);
+  api::Context host_ctx(api::kCPU);
+  const int status = api::quantization<TX, TY>(
+      &host_ctx, cpu_data.data(), quantized.data(), count, nullptr);
+  if (status != 0) {
+    std::cout << "ERR: quantization failed!" << std::endl;
+    std::exit(1);
+  }
+  return quantized;
+}
+
+// float -> float needs no quantization: the input is returned as a copy.
+template <>
+std::vector<float> quant_cpu<float, float>(const std::vector<float>& cpu_data) {
+  std::vector<float> passthrough(cpu_data);
+  return passthrough;
+}
+
+// Quantizes `cpu_data` (TX -> TY) on the host and uploads the result.
+//
+// Two buffers are allocated with xpu_malloc and filled with host-to-device
+// copies:
+//   * data_    - the quantized values, cpu_data.size() elements of TY;
+//   * max_ptr_ - `max_ptr_len` (6) floats, each set to the largest absolute
+//                value found in `cpu_data`. The scan only runs when TX is
+//                float or float16; otherwise the floor value 1e-30f is used.
+//
+// `data_name` is only used in log and error messages. Any failing XPU
+// runtime call prints an error to stdout and exits the process with code 1.
+// The returned buffers are not released by this function.
+template <typename TX, typename TY>
+XPUQunatData<TY> get_xpu_quant_data(const std::string& data_name,
+                                    const std::vector<TX>& cpu_data) {
+  XPUQunatData<TY> xpu_quant_data;
+
+  int len = cpu_data.size();
+  // quant
+  std::vector<TY> cpu_quant_data = quant_cpu<TX, TY>(cpu_data);
+  // findmax; starts from a tiny positive floor so the result is never zero
+  float abs_max = 1e-30f;
+  if (std::is_same<TX, float>::value || std::is_same<TX, float16>::value) {
+    for (int i = 0; i < len; ++i) {
+      float abs_val = std::fabs(static_cast<float>(cpu_data[i]));
+      abs_max = std::max<float>(abs_max, abs_val);
+    }
+  }
+
+  constexpr int max_ptr_len = 6;  // for xpu2
+  std::vector<float> cpu_max(max_ptr_len, abs_max);
+  // xpu malloc
+  TY* xpu_data = nullptr;
+  float* xpu_max_ptr = nullptr;
+  int r = xpu_malloc(reinterpret_cast<void**>(&xpu_data), len * sizeof(TY));
+  if (r != 0) {
+    std::cout << "ERR: xpu_malloc failed! " << data_name << std::endl;
+    std::exit(1);
+  }
+  r = xpu_malloc(reinterpret_cast<void**>(&xpu_max_ptr),
+                 max_ptr_len * sizeof(float));
+  if (r != 0) {
+    std::cout << "ERR: xpu_malloc failed! " << data_name << std::endl;
+    std::exit(1);
+  }
+
+#ifdef TEST_DEBUG
+  std::cout << "DEBUG: file_path=" << data_name << ", len=" << len
+            << ", data vec_sum=" << vec_sum(cpu_data)
+            << ", quant_data vec_sum=" << vec_sum(cpu_quant_data)
+            << ", details: " << print_vec(cpu_quant_data) << std::endl;
+#endif
+  r = xpu_wait();
+  if (r != 0) {
+    std::cout << "ERR: xpu_wait failed!" << std::endl;
+    std::exit(1);
+  }
+  // xpu memcpy
+  r = xpu_memcpy(xpu_data, cpu_quant_data.data(), len * sizeof(TY),
+                 XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  if (r != 0) {
+    std::cout << "ERR: xpu_memcpy failed!" << std::endl;
+    std::exit(1);
+  }
+#ifdef TEST_DEBUG
+  std::cout << "DEBUG: max is " << print_vec(cpu_max) << std::endl;
+#endif
+  r = xpu_memcpy(xpu_max_ptr, cpu_max.data(), max_ptr_len * sizeof(float),
+                 XPUMemcpyKind::XPU_HOST_TO_DEVICE);
+  if (r != 0) {
+    // This is the copy of the max buffer; it used to be misreported as an
+    // xpu_malloc failure.
+    std::cout << "ERR: xpu_memcpy failed!" << std::endl;
+    std::exit(1);
+  }
+
+#ifdef TEST_DEBUG
+  std::cout << "DEBUG: xpu_data=" << xpu_data << ", xpu_max_ptr=" << xpu_max_ptr
+            << std::endl;
+#endif
+  xpu_quant_data.data_ = xpu_data;
+  xpu_quant_data.max_ptr_ = xpu_max_ptr;
+  return xpu_quant_data;
+}
+
+// Explicit instantiations: float pass-through and float -> int16 weights.
+template XPUQunatData<float> get_xpu_quant_data<float, float>(
+    const std::string&, const std::vector<float>&);
+template XPUQunatData<int16_t> get_xpu_quant_data<float, int16_t>(
+    const std::string&, const std::vector<float>&);
+
+// Convenience overload: reads `len` elements of TX from the binary file
+// `file_path` with get_cpu_data, then quantizes and uploads them through the
+// vector overload of get_xpu_quant_data. The file path doubles as the name
+// used in log messages.
+template <typename TX, typename TY>
+XPUQunatData<TY> get_xpu_quant_data(const std::string& file_path, int len) {
+  const std::vector<TX> host_copy = get_cpu_data<TX>(file_path, len);
+  XPUQunatData<TY> uploaded = get_xpu_quant_data<TX, TY>(file_path, host_copy);
+  return uploaded;
+}
+
+// Explicit instantiations: float pass-through and float -> int16 weights.
+template XPUQunatData<float> get_xpu_quant_data<float, float>(
+    const std::string&, int);
+template XPUQunatData<int16_t> get_xpu_quant_data<float, int16_t>(
+    const std::string&, int);
+
+// Collects the distinct integer ids encoded in the file names of `dir_in`.
+//
+// For every directory entry whose name does not start with '.', the text
+// before the first '_' (or the whole name if there is none) is parsed with
+// std::stoi. The ids are returned in ascending order without duplicates.
+// An empty vector is returned when `dir_in` cannot be stat'ed, is not a
+// directory, or cannot be opened.
+//
+// NOTE(review): std::stoi throws for names that do not start with an
+// integer; the exception propagates to the caller and the directory handle
+// is not closed in that case.
+std::vector<int> get_all_ids(const std::string& dir_in) {
+  struct stat s;
+  // Only inspect `s` when stat() succeeded; on failure its contents are
+  // unspecified.
+  if (stat(dir_in.c_str(), &s) != 0 || !S_ISDIR(s.st_mode)) {
+    return std::vector<int>();
+  }
+  DIR* open_dir = opendir(dir_in.c_str());
+  if (nullptr == open_dir) {
+    return std::vector<int>();
+  }
+  std::set<int> ids_set;  // keeps the ids unique and sorted
+  dirent* p = nullptr;
+  while ((p = readdir(open_dir)) != nullptr) {
+    if (p->d_name[0] == '.') {
+      continue;  // skips ".", ".." and hidden entries
+    }
+    const std::string filename(p->d_name);
+    // npos (no '_') makes substr() return the whole name.
+    const std::string::size_type end_pos = filename.find('_');
+    ids_set.insert(std::stoi(filename.substr(0, end_pos)));
+  }
+  closedir(open_dir);
+  return std::vector<int>(ids_set.begin(), ids_set.end());
+}
+
+// Loads the scale and bias of one layer-norm into XPU memory.
+//
+// The element counts are looked up in `weights_len_info` under the keys
+// "<ln_name_prefix>weight" and "<ln_name_prefix>bias" (a missing key makes
+// unordered_map::at throw). The data is read from the files
+// "<params_dir><ln_name_prefix>weight" / "...bias" via get_xpu_data<float>,
+// and the resulting pointers are stored in `ln_scale` and `ln_bias`.
+void get_ln_param(const std::unordered_map<std::string, int>& weights_len_info,
+                  const std::string& params_dir,
+                  const std::string& ln_name_prefix,
+                  const float*& ln_scale,   // NOLINT
+                  const float*& ln_bias) {  // NOLINT
+  const std::string scale_key = ln_name_prefix + "weight";
+  const std::string bias_key = ln_name_prefix + "bias";
+  // Resolve both lengths first so a missing key fails before any upload.
+  const int scale_len = weights_len_info.at(scale_key);
+  const int bias_len = weights_len_info.at(bias_key);
+  ln_scale = get_xpu_data<float>(params_dir + scale_key, scale_len);
+  ln_bias = get_xpu_data<float>(params_dir + bias_key, bias_len);
+}
+
+// Debug helper: copies an XPU buffer back to the host and prints every
+// element to stdout.
+//
+// Output is the name, the shape, and then the data as rows of shape.back()
+// elements each. An empty shape is treated as a single scalar. When the
+// shape describes no elements (or has a non-positive last dimension) only
+// the name/shape header is printed; previously these cases indexed an empty
+// vector or divided by zero. A failing device-to-host copy is reported and
+// nothing further is printed.
+template <typename T>
+void print_xpu_data_all(api::Context* ctx, const T* data,
+                        std::vector<int> shape, std::string name) {
+  const int data_len = vector_prod(shape);
+  std::cout << name;
+  std::cout << " shape:";
+  for (auto i : shape) {
+    std::cout << i << " ";
+  }
+  std::cout << std::endl;
+
+  const int col = shape.empty() ? 1 : shape.back();
+  if (data_len <= 0 || col <= 0) {
+    return;  // nothing to copy or print
+  }
+
+  std::vector<T> cpu_data(data_len);
+  // Wait for work queued on the context's stream before reading the buffer.
+  xpu_wait(ctx->xpu_stream);
+  const int r = xpu_memcpy(cpu_data.data(), data, data_len * sizeof(T),
+                           XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  if (r != 0) {
+    std::cout << "ERR: xpu_memcpy failed! " << name << std::endl;
+    return;
+  }
+
+  // For rank 0 and rank 1 this evaluates to a single row.
+  const int row = data_len / col;
+  for (int i = 0; i < row; i++) {
+    for (int j = 0; j < col; j++) {
+      std::cout << cpu_data[i * col + j] << " ";
+    }
+    std::cout << std::endl;
+  }
+}
+// Debug helper: copies an XPU buffer back to the host and prints it to
+// stdout.
+//
+// Output is the name and the shape, followed by either
+//   * for more than 1000 elements: the mean plus the first and last 8
+//     elements, or
+//   * otherwise: every element, as rows of shape.back() elements each.
+// An empty shape is treated as a single scalar. When the shape describes no
+// elements (or has a non-positive last dimension) only the name/shape header
+// is printed; previously these cases indexed an empty vector or divided by
+// zero. A failing device-to-host copy is reported and nothing further is
+// printed.
+template <typename T>
+void print_xpu_data(api::Context* ctx, const T* data, std::vector<int> shape,
+                    std::string name) {
+  const int data_len = vector_prod(shape);
+
+  std::cout << name;
+  std::cout << " shape:";
+  for (auto i : shape) {
+    std::cout << i << " ";
+  }
+  std::cout << std::endl;
+
+  const int col = shape.empty() ? 1 : shape.back();
+  if (data_len <= 0 || col <= 0) {
+    return;  // nothing to copy or print
+  }
+
+  std::vector<T> cpu_data(data_len);
+  // Same stream wait as print_xpu_data_all, so pending work is finished
+  // before the buffer is read. `ctx` was not dereferenced here before, so a
+  // null context is still accepted.
+  if (ctx != nullptr) {
+    xpu_wait(ctx->xpu_stream);
+  }
+  const int r = xpu_memcpy(cpu_data.data(), data, data_len * sizeof(T),
+                           XPUMemcpyKind::XPU_DEVICE_TO_HOST);
+  if (r != 0) {
+    std::cout << "ERR: xpu_memcpy failed! " << name << std::endl;
+    return;
+  }
+
+  if (data_len > 1000) {
+    // Large buffers: print a summary instead of every element.
+    double mean = 0;
+    for (auto val : cpu_data) {
+      mean += static_cast<double>(val);
+    }
+    mean /= data_len;
+    std::cout << "mean=" << mean << std::endl;
+    std::cout << "details: ";
+    for (int i = 0; i < 8; ++i) {
+      std::cout << cpu_data[i] << " ";
+    }
+    std::cout << "...";
+    for (int i = data_len - 8; i < data_len; ++i) {
+      std::cout << cpu_data[i] << " ";
+    }
+    std::cout << std::endl;
+    return;
+  }
+
+  // For rank 0 and rank 1 this evaluates to a single row.
+  const int row = data_len / col;
+  for (int i = 0; i < row; i++) {
+    for (int j = 0; j < col; j++) {
+      std::cout << cpu_data[i * col + j] << " ";
+    }
+    std::cout << std::endl;
+  }
+}
+// Debug helper: prints a host buffer to stdout.
+//
+// Output is the name, the shape, and then the data as rows of shape.back()
+// elements each. `data` must hold at least vector_prod(shape) elements. An
+// empty shape is treated as a single scalar. When the shape describes no
+// elements (or has a non-positive last dimension) only the name/shape header
+// is printed; previously these cases read shape.back() of an empty vector or
+// divided by zero.
+template <typename T>
+void print_cpu_data(const T* data, std::vector<int> shape, std::string name) {
+  const int data_len = vector_prod(shape);
+  std::cout << name;
+  std::cout << " shape:";
+  for (auto i : shape) {
+    std::cout << i << " ";
+  }
+  std::cout << std::endl;
+
+  const int col = shape.empty() ? 1 : shape.back();
+  if (data_len <= 0 || col <= 0) {
+    return;  // nothing to print
+  }
+
+  // For rank 0 and rank 1 this evaluates to a single row.
+  const int row = data_len / col;
+  for (int i = 0; i < row; i++) {
+    for (int j = 0; j < col; j++) {
+      std::cout << *(data + i * col + j) << " ";
+    }
+    std::cout << std::endl;
+  }
+}
+
+// Debug helper: prints the name, length, element sum and every element of
+// `data` on one line of stdout.
+//
+// The sum is accumulated in int64_t for integral T and in double otherwise.
+// The previous std::accumulate call used an int initial value, which
+// truncated every partial sum for floating-point vectors. An empty vector
+// prints the header with no elements instead of indexing element -1.
+template <typename T>
+void print_vec(const std::vector<T>& data, const std::string& data_name) {
+  typedef typename std::conditional<std::is_integral<T>::value, int64_t,
+                                    double>::type SumType;
+  const int len = static_cast<int>(data.size());
+  SumType sum = 0;
+  for (const T& value : data) {
+    sum += static_cast<SumType>(value);
+  }
+  std::cout << "DEBUG: data_name is " << data_name << ", len=" << len
+            << ", sum=" << sum << ", ";
+  for (int i = 0; i < len - 1; ++i) {
+    std::cout << data[i] << ", ";
+  }
+  if (len > 0) {
+    std::cout << data[len - 1];
+  }
+  std::cout << std::endl;
+}
+
+// Explicitly instantiates the four debug printers (print_vec,
+// print_cpu_data, print_xpu_data, print_xpu_data_all) for element type T.
+// The templates are defined in this .cpp file, so these instantiations are
+// what makes them available to other translation units.
+// Note: the expansion already ends in ';', so the ';' written after each use
+// below adds an empty declaration.
+#define INSTANTIATION_PRINT(T)                                           \
+  template void print_vec<T>(const std::vector<T>&, const std::string&); \
+  template void print_cpu_data<T>(const T*, std::vector<int>,            \
+                                  std::string name);                     \
+  template void print_xpu_data<T>(api::Context * ctx, const T*,          \
+                                  std::vector<int>, std::string);        \
+  template void print_xpu_data_all<T>(api::Context * ctx, const T*,      \
+                                      std::vector<int> shape, std::string);
+
+// Element types for which the printers are available.
+INSTANTIATION_PRINT(int);
+INSTANTIATION_PRINT(int16_t);
+INSTANTIATION_PRINT(int8_t);
+INSTANTIATION_PRINT(float);
+INSTANTIATION_PRINT(float16);
diff --git a/runtime/kunlun/xpu/xpu_util.h b/runtime/kunlun/xpu/xpu_util.h
new file mode 100644
index 000000000..e0b02dc60
--- /dev/null
+++ b/runtime/kunlun/xpu/xpu_util.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2022 KUNLUNXIN Inc.
+//               2022 Han Qi (qihan@baidu.com)
+//                    Hehe Pan (panhehe@baidu.com)
+//                    Zikui Yan (yanzikui@baidu.com)
+//                    Chaolin Li (lichaolin@baidu.com)
+// All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <dirent.h>
+#include <sys/stat.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <set>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "xpu/runtime.h"
+#include "xpu/xdnn.h"
+
+#pragma once
+namespace api = baidu::xpu::api;
+// Plain holder for a pair of raw pointers: `data_` (elements of type T) and
+// `max_ptr_` (floats). Both members are public and start out as nullptr.
+// The class does not allocate or release the memory the pointers refer to.
+template <typename T>
+class XPUQunatData {
+ public:
+  // Leaves both pointers at their nullptr default member initializers.
+  XPUQunatData() {}
+  // Stores the two pointers as given.
+  XPUQunatData(T* data, float* max_ptr) : data_(data), max_ptr_(max_ptr) {}
+
+  T* data_ = nullptr;
+  float* max_ptr_ = nullptr;
+};
+
+// Returns the product of all entries of `shape`.
+int vector_prod(std::vector<int> shape);
+// Appends '/' to `str` unless its last character already is '/'.
+void add_separator_when_necessary(std::string& str);  // NOLINT
+
+// NOTE(review): declared here but not defined in xpu_util.cpp; presumably a
+// test driver implemented in another source file -- confirm.
+template <typename T, typename TW>
+void conformer_test(const std::string& data_dir, const std::string& params_dir,
+                    int threads_number, int dev_id);
+
+// Splits `str` on `separator` and converts each token to T.
+template <typename T>
+std::vector<T> Split(const std::string& str, const std::string& separator);
+
+// Parses a ':'-separated description file. Maps field 0 of each line (the
+// weight name) to field 3 parsed as an int. Exits the process if the file
+// cannot be opened.
+std::unordered_map<std::string, int> get_weights_lens(
+    const std::string& file_path);
+// Parses a ':'-separated description file. Maps field 0 of each line (the
+// weight name) to the ints found in field 2, e.g. "(512, 1, 3, 3)". Exits
+// the process if the file cannot be opened.
+std::unordered_map<std::string, std::vector<int>> get_weights_shape(
+    const std::string& file_path);
+
+// Reads `len` elements of T as raw bytes from a binary file. Exits the
+// process if the file cannot be opened or is too short.
+template <typename T>
+std::vector<T> get_cpu_data(const std::string& file_path, int len);
+
+// Reads `len` elements of T from a binary file, copies them into a buffer
+// obtained from xpu_malloc and returns that buffer. The buffer is not
+// released by this function.
+template <typename T>
+T* get_xpu_data(const std::string& file_path, int len);
+
+// Same as above, but takes the host data directly; `data_name` is only used
+// in log and error messages.
+template <typename T>
+T* get_xpu_data(const std::string& data_name, const std::vector<T>& cpu_data);
+
+// Reads `len` elements of TX from a binary file, quantizes them to TY on the
+// host and uploads the result together with a small buffer holding the
+// maximum absolute input value.
+template <typename TX, typename TY>
+XPUQunatData<TY> get_xpu_quant_data(const std::string& file_path, int len);
+
+// Same as above, but takes the host data directly; `data_name` is only used
+// in log and error messages.
+template <typename TX, typename TY>
+XPUQunatData<TY> get_xpu_quant_data(const std::string& data_name,
+                                    const std::vector<TX>& cpu_data);
+
+// Returns the sorted, de-duplicated integer prefixes (the text before the
+// first '_') of the non-hidden entries of directory `dir_in`; empty if
+// `dir_in` is not a directory or cannot be opened.
+std::vector<int> get_all_ids(const std::string& dir_in);
+
+// Uploads "<params_dir><ln_name_prefix>weight" and "...bias" via
+// get_xpu_data<float> and stores the resulting pointers in `ln_scale` and
+// `ln_bias`. The lengths are looked up in `weights_len_info`.
+void get_ln_param(const std::unordered_map<std::string, int>& weights_len_info,
+                  const std::string& params_dir,
+                  const std::string& ln_name_prefix,
+                  const float*& ln_scale,  // NOLINT
+                  const float*& ln_bias);  // NOLINT
+
+// Debug printers writing to stdout. They are defined in xpu_util.cpp and
+// explicitly instantiated there for int, int16_t, int8_t, float and float16.
+//
+// Prints name, length, sum and all elements of `data` on one line.
+template <typename T>
+void print_vec(const std::vector<T>& data, const std::string& data_name);
+// Prints a host buffer as rows of shape.back() elements.
+template <typename T>
+void print_cpu_data(const T* data, std::vector<int> shape, std::string name);
+// Copies an XPU buffer to the host and prints it; buffers with more than
+// 1000 elements are summarised (mean plus first/last 8 elements).
+template <typename T>
+void print_xpu_data(api::Context* ctx, const T* data, std::vector<int> shape,
+                    std::string name);
+// Copies an XPU buffer to the host and prints every element.
+template <typename T>
+void print_xpu_data_all(api::Context* ctx, const T* data,
+                        std::vector<int> shape, std::string name);
+
+// Status-checking helpers.
+// NOTE(review): each macro expands to a bare `if (...) { ... }` statement,
+// so it cannot be used as the body of an unbraced `if` that has an `else`.
+// Wrapping them in do { } while (0) would fix that but requires every call
+// site to end in ';' -- check the callers before changing the shape.
+
+// Prints "ERR", the file and the line to stdout and terminates the process
+// with exit code 1 when `ret` is non-zero.
+#define CHECK_RET(ret)                                    \
+  if ((ret) != 0) {                                       \
+    std::cout << "ERR" << __FILE__ << ":" << __LINE__     \
+              << ", check failed, ret != 0" << std::endl; \
+    std::exit(1);                                         \
+  }
+// Returns api::INVALID_PARAM from the enclosing function when `ctx` is null.
+// `ctx` is expanded without parentheses, so pass a plain pointer expression.
+#define WRAPPER_CHECK_CTX(ctx) \
+  if (ctx == nullptr) {        \
+    return api::INVALID_PARAM; \
+  }
+// Returns api::INVALID_PARAM from the enclosing function unless
+// `expra > exprb`. The `ctx` argument is not used by the expansion.
+#define WRAPPER_ASSERT_GT(ctx, expra, exprb) \
+  if (!((expra) > (exprb))) {                \
+    return api::INVALID_PARAM;               \
+  }
+// Returns `ret` from the enclosing function unless it equals api::SUCCESS.
+// `ret` is expanded twice on the failure path, so pass a variable rather
+// than a call expression. The `ctx` argument is not used by the expansion.
+#define WRAPPER_ASSERT_SUCCESS(ctx, ret) \
+  if (!((ret) == api::SUCCESS)) {        \
+    return ret;                          \
+  }