diff --git a/runtime/core/cmake/xpu.cmake b/runtime/core/cmake/xpu.cmake new file mode 100644 index 0000000000..38418671b0 --- /dev/null +++ b/runtime/core/cmake/xpu.cmake @@ -0,0 +1,37 @@ +if(NOT WIN32) + string(ASCII 27 Esc) + set(ColourReset "${Esc}[m") + set(ColourBold "${Esc}[1m") + set(Red "${Esc}[31m") + set(Green "${Esc}[32m") + set(Yellow "${Esc}[33m") + set(Blue "${Esc}[34m") + set(Magenta "${Esc}[35m") + set(Cyan "${Esc}[36m") + set(White "${Esc}[37m") + set(BoldRed "${Esc}[1;31m") + set(BoldGreen "${Esc}[1;32m") + set(BoldYellow "${Esc}[1;33m") + set(BoldBlue "${Esc}[1;34m") + set(BoldMagenta "${Esc}[1;35m") + set(BoldCyan "${Esc}[1;36m") + set(BoldWhite "${Esc}[1;37m") +endif() + +if(XPU) + set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR}) + message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n") + set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu) + if(NOT DEFINED ENV{XPU_API_PATH}) + message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n") + else() + set(XPU_API_PATH $ENV{XPU_API_PATH}) + message("set XPU_API_PATH from env_var. 
Val is $ENV{XPU_API_PATH}.") + endif() + + include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/ + ${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include) + link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/) + + add_definitions(-DUSE_XPU) +endif() diff --git a/runtime/core/decoder/CMakeLists.txt b/runtime/core/decoder/CMakeLists.txt index cfa439f421..098fdcdb5e 100644 --- a/runtime/core/decoder/CMakeLists.txt +++ b/runtime/core/decoder/CMakeLists.txt @@ -7,8 +7,8 @@ set(decoder_srcs ctc_endpoint.cc ) -if(NOT TORCH AND NOT ONNX) - message(FATAL_ERROR "Please build with TORCH or ONNX!!!") +if(NOT TORCH AND NOT ONNX AND NOT XPU) + message(FATAL_ERROR "Please build with TORCH or ONNX or XPU!!!") endif() if(TORCH) list(APPEND decoder_srcs torch_asr_model.cc) @@ -18,7 +18,8 @@ if(ONNX) endif() add_library(decoder STATIC ${decoder_srcs}) -target_link_libraries(decoder PUBLIC kaldi-decoder frontend post_processor utils) +target_link_libraries(decoder PUBLIC kaldi-decoder frontend + post_processor utils) if(ANDROID) target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY}) @@ -29,4 +30,7 @@ else() if(ONNX) target_link_libraries(decoder PUBLIC onnxruntime) endif() + if(XPU) + target_link_libraries(decoder PUBLIC xpu_conformer) + endif() endif() diff --git a/runtime/core/decoder/params.h b/runtime/core/decoder/params.h index dcabaeadc8..ede5cfbee4 100644 --- a/runtime/core/decoder/params.h +++ b/runtime/core/decoder/params.h @@ -13,7 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
- #ifndef DECODER_PARAMS_H_ #define DECODER_PARAMS_H_ @@ -29,17 +28,24 @@ #ifdef USE_TORCH #include "decoder/torch_asr_model.h" #endif +#ifdef USE_XPU +#include "xpu/xpu_asr_model.h" +#endif #include "frontend/feature_pipeline.h" #include "post_processor/post_processor.h" #include "utils/flags.h" #include "utils/string.h" DEFINE_int32(num_threads, 1, "num threads for ASR model"); +DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model"); // TorchAsrModel flags DEFINE_string(model_path, "", "pytorch exported model path"); // OnnxAsrModel flags DEFINE_string(onnx_dir, "", "directory where the onnx model is saved"); +// XPUAsrModel flags +DEFINE_string(xpu_model_dir, "", + "directory where the XPU model and weights is saved"); // FeaturePipelineConfig flags DEFINE_int32(num_bins, 80, "num mel bins for fbank feature"); @@ -66,7 +72,8 @@ DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search"); DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search"); DEFINE_double(blank_skip_thresh, 1.0, "blank skip thresh for ctc wfst search, 1.0 means no skip"); -DEFINE_double(length_penalty, 0.0, "length penalty ctc wfst search, will not" +DEFINE_double(length_penalty, 0.0, + "length penalty ctc wfst search, will not" "apply on self-loop arc, for balancing the del/ins ratio, " "suggest set to -3.0"); DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); @@ -130,7 +137,7 @@ std::shared_ptr InitDecodeResourceFromFlags() { #else LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'."; #endif - } else { + } else if (!FLAGS_model_path.empty()) { #ifdef USE_TORCH LOG(INFO) << "Reading torch model " << FLAGS_model_path; TorchAsrModel::InitEngineThreads(FLAGS_num_threads); @@ -140,6 +147,19 @@ std::shared_ptr InitDecodeResourceFromFlags() { #else LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'."; #endif + } else if (!FLAGS_xpu_model_dir.empty()) { +#ifdef USE_XPU + LOG(INFO) << "Reading XPU WeNet model weight from " << 
FLAGS_xpu_model_dir; + auto model = std::make_shared(); + model->SetEngineThreads(FLAGS_num_threads); + model->SetDeviceId(FLAGS_device_id); + model->Read(FLAGS_xpu_model_dir); + resource->model = model; +#else + LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'."; +#endif + } else { + LOG(FATAL) << "Please set ONNX, TORCH or XPU model path!!!"; } LOG(INFO) << "Reading unit table " << FLAGS_unit_path; diff --git a/runtime/kunlun/.gitignore b/runtime/kunlun/.gitignore new file mode 100644 index 0000000000..c6767241c3 --- /dev/null +++ b/runtime/kunlun/.gitignore @@ -0,0 +1,2 @@ +build/ +fc_base/ diff --git a/runtime/kunlun/CMakeLists.txt b/runtime/kunlun/CMakeLists.txt new file mode 100644 index 0000000000..55266ad3ea --- /dev/null +++ b/runtime/kunlun/CMakeLists.txt @@ -0,0 +1,69 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(wenet VERSION 0.1) + +option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF) +option(GRAPH_TOOLS "whether to build TLG graph tools" OFF) +option(BUILD_TESTING "whether to build unit test" OFF) + +option(GRPC "whether to build with gRPC" OFF) +# TODO(Binbin Zhang): Change websocket to OFF since it depends on boost +# which is a very big library +option(WEBSOCKET "whether to build with websocket" OFF) +option(XPU "whether to build with XPU" ON) + +set(CMAKE_VERBOSE_MAKEFILE OFF) + +include(FetchContent) +set(FETCHCONTENT_QUIET OFF) +get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") +set(FETCHCONTENT_BASE_DIR ${fc_base}) + +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC") + +# Include all dependency +include(openfst) +# This CMakeLists.txt is only used for kunlun xpu, so remove the contents +# about onnx, libtorch, gpu and windows. 
+include(xpu) +# Compile xpu_conformer.a and conformer_test +add_subdirectory(xpu) + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/kaldi +) + +# Build all libraries +add_subdirectory(utils) +if(NOT MSVC) + add_dependencies(utils openfst) +endif() +add_subdirectory(frontend) +add_subdirectory(post_processor) +add_subdirectory(kaldi) # kaldi: wfst based decoder +add_subdirectory(decoder) +add_subdirectory(api) + +# Optionally, you can build with websocket +if(WEBSOCKET) + include(boost) + add_subdirectory(websocket) +endif() + +# Optionally, you can build with gRPC +if(GRPC) + include(grpc) + add_subdirectory(grpc) +endif() + +# Build all bins +add_subdirectory(bin) + +# Unit Test +if(BUILD_TESTING) + include(gtest) + add_subdirectory(test) +endif() diff --git a/runtime/kunlun/README.md b/runtime/kunlun/README.md new file mode 100644 index 0000000000..2e096b796a --- /dev/null +++ b/runtime/kunlun/README.md @@ -0,0 +1,83 @@ +# 在昆仑芯片上运行Wenet +## 介绍 +下面的示例展示了如何在XPU上部署WeNet离线或在线的ASR模型。XPU是一种由昆仑芯100%自主研发的通用人工智能计算核心架构。 + +## 准备XPU运行环境 + +在开始之前,请确认您获得以下必须的环境。 + + XRE(XPU Runtime Environment):昆仑芯片的基础运行环境,包括芯片驱动程序、runtime api库、固件FW工具等功能模块。 + XDNN(XPU Deep Neural Network Library):加速深度神经网络的昆仑芯片库,提供应用程序中使用的高性能DNN功能库。 + +如果您需要任何帮助,或是想要进一步了解昆仑芯片,请通过官方网址联系我们: +https://www.kunlunxin.com.cn/ + +## 操作步骤 +- 第一步:构建,需要cmake 3.14及以上版本 + +``` sh +export CXX=${your_g++_path} +export CC=${your_gcc_path} +export XPU_API_PATH=${your_api_path} + +# -r : release version; -d : debug version +bash ./compile.sh -r +``` + +- 第二步:测试,测试结果将在控制台输出 + +``` sh +## set KUNLUN XPU visible device +export XPU_VISIBLE_DEVICES=0 +export XPUSIM_DEVICE_MODEL=KUNLUN2 +## set logging level +export GLOG_logtostderr=1 +export GLOG_v=3 +## set speech wav and model/weight path +wav_path=${your_test_wav_path} +xpu_model_dir=${your_xpu_weight_dir} +units=${your_units.txt} +## executive command +./build/bin/decoder_main \ + --chunk_size -1 \ + --wav_path ${wav_path} \ + --xpu_model_dir 
${xpu_model_dir} \ + --unit_path ${units} \ + --device_id 0 \ + --nbest 3 2>&1 | tee log.txt +``` + +单条语音执行结果如下所示: + +``` sh +XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded +I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/ +I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/ +I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: ======= +I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4 +I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6 +I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538 +I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538 +I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1 +I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict +I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418 +I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418 +I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0! 
+I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418 +I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103 +I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512 +I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538 +I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms +I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况 +I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况 +I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3 +I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3 +I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3 +I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14 +I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms. +I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况 +I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况 +I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms. +test 甚至出现交易几乎停滞的情况 +I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms. +``` diff --git a/runtime/kunlun/README_EN.md b/runtime/kunlun/README_EN.md new file mode 100644 index 0000000000..ff78792f99 --- /dev/null +++ b/runtime/kunlun/README_EN.md @@ -0,0 +1,87 @@ +# WeNet running on KUNLUNXIN XPU device +## Introduction +The below example shows how to deploy WeNet offline and online ASR models on XPUs. +XPU is a core architecture 100% independently developed by KUNLUNXIN for general artificial intelligence computing. + +## Setup environment for XPU device + +Before the start, make sure you have these necessary environments + + XRE(XPU Runtime Environment):The basic operating environment of the XPUs + includes functional modules such as chip drivers, runtime api library, and firmware tools. 
+ + XDNN(XPU Deep Neural Network Library): XPU library for accelerating deep neural networks, providing high-performance DNN function library used in applications. + +If you would like to know more about XPUs or need any help, please contact us through the official website: + +https://www.kunlunxin.com.cn/ + +## Instruction +- Step 1. Build, the build requires cmake 3.14 or above. + +``` sh +export CXX=${your_g++_path} +export CC=${your_gcc_path} +export XPU_API_PATH=${your_api_path} + +# -r : release version; -d : debug version +bash ./compile.sh -r +``` + +- Step 2. Testing, the result is shown in the console. + +``` sh +## set KUNLUN XPU visible device +export XPU_VISIBLE_DEVICES=0 +export XPUSIM_DEVICE_MODEL=KUNLUN2 +## set logging level +export GLOG_logtostderr=1 +export GLOG_v=3 +## set speech wav and model/weight/units path +wav_path=${your_test_wav_path} +xpu_model_dir=${your_xpu_weight_dir} +units=${your_units.txt} +## executive command +./build/bin/decoder_main \ + --chunk_size -1 \ + --wav_path $wav_path \ + --xpu_model_dir $xpu_model_dir \ + --unit_path $units \ + --device_id 0 \ + --nbest 3 2>&1 | tee log.txt +``` + +A typical output result is as following: + +``` sh +XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded +I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/ +I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/ +I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: ======= +I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4 +I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6 +I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538 +I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538 +I1027 06:06:23.832799 111767 
xpu_asr_model.cc:70] is bidirectional decoder 1 +I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict +I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418 +I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418 +I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0! +I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418 +I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103 +I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512 +I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538 +I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms +I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况 +I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况 +I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3 +I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3 +I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3 +I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14 +I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms. +I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况 +I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况 +I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms. +test 甚至出现交易几乎停滞的情况 +I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms. 
+``` diff --git a/runtime/kunlun/api b/runtime/kunlun/api new file mode 120000 index 0000000000..5c1acaccc3 --- /dev/null +++ b/runtime/kunlun/api @@ -0,0 +1 @@ +../core/api \ No newline at end of file diff --git a/runtime/kunlun/bin b/runtime/kunlun/bin new file mode 120000 index 0000000000..938df72152 --- /dev/null +++ b/runtime/kunlun/bin @@ -0,0 +1 @@ +../core/bin \ No newline at end of file diff --git a/runtime/kunlun/cmake b/runtime/kunlun/cmake new file mode 120000 index 0000000000..17afee87dc --- /dev/null +++ b/runtime/kunlun/cmake @@ -0,0 +1 @@ +../core/cmake \ No newline at end of file diff --git a/runtime/kunlun/compile.sh b/runtime/kunlun/compile.sh new file mode 100755 index 0000000000..d64a6f0501 --- /dev/null +++ b/runtime/kunlun/compile.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -e + +usage() { + echo "Usage:" + echo "bash compile.sh [-r] [-d] [-c]" + echo "Description:" + echo "-r, build release." + echo "-d, build debug." + echo "-c, remove cmakecache or build dir, then build." + echo "Example 1:" + echo " ./compile.sh -r " + echo " means: remove cache files in build dir, then build release." + echo "Example 2:" + echo " ./compile.sh -d -c all " + echo " means: remove all files in build dir, then build debug." + exit -1 +} + +if [ -z $CXX ]; then + echo -e "\033[31m [WARNING]: NO CXX in your env. Suggest setting CXX variable to support C++14. \033[0m" + sleep 2 +fi + +build_type='Release' +clean_type='cache' + +while getopts 'rdc:h' OPT; do + case $OPT in + r) build_type="Release";; + d) build_type="Debug";; + c) clean_type="$OPTARG";; + h) usage;; + ?) usage;; + esac +done + +if [ ! 
-d ./build ];then + mkdir build +fi + +if [ "$clean_type" = "all" ];then + pushd build + rm -rf ./* + popd +else + pushd build + rm -rf CMakeFiles/ cmake_install.cmake CMakeCache.txt CPackSourceConfig.cmake + popd +fi + +build_cmd="cd build && cmake -DINTTYPES_FORMAT:STRING=C99 " + +if [ "$build_type" = "Release" ];then + build_cmd="${build_cmd} -DCMAKE_BUILD_TYPE=Release .. && cmake --build ./ " +else + build_cmd="${build_cmd} -DCMAKE_BUILD_TYPE=Debug .. && cmake --build ./ " +fi + +echo "build command is ${build_cmd}" + +eval ${build_cmd} diff --git a/runtime/kunlun/decoder b/runtime/kunlun/decoder new file mode 120000 index 0000000000..3088ea48b2 --- /dev/null +++ b/runtime/kunlun/decoder @@ -0,0 +1 @@ +../core/decoder \ No newline at end of file diff --git a/runtime/kunlun/frontend b/runtime/kunlun/frontend new file mode 120000 index 0000000000..0292335d13 --- /dev/null +++ b/runtime/kunlun/frontend @@ -0,0 +1 @@ +../core/frontend \ No newline at end of file diff --git a/runtime/kunlun/grpc b/runtime/kunlun/grpc new file mode 120000 index 0000000000..57533a588c --- /dev/null +++ b/runtime/kunlun/grpc @@ -0,0 +1 @@ +../core/grpc \ No newline at end of file diff --git a/runtime/kunlun/kaldi b/runtime/kunlun/kaldi new file mode 120000 index 0000000000..764a9d445d --- /dev/null +++ b/runtime/kunlun/kaldi @@ -0,0 +1 @@ +../core/kaldi \ No newline at end of file diff --git a/runtime/kunlun/patch b/runtime/kunlun/patch new file mode 120000 index 0000000000..69789fa5e4 --- /dev/null +++ b/runtime/kunlun/patch @@ -0,0 +1 @@ +../core/patch \ No newline at end of file diff --git a/runtime/kunlun/post_processor b/runtime/kunlun/post_processor new file mode 120000 index 0000000000..4e434a5cca --- /dev/null +++ b/runtime/kunlun/post_processor @@ -0,0 +1 @@ +../core/post_processor \ No newline at end of file diff --git a/runtime/kunlun/test b/runtime/kunlun/test new file mode 120000 index 0000000000..e60cf87a7f --- /dev/null +++ b/runtime/kunlun/test @@ -0,0 +1 @@ 
+../core/test \ No newline at end of file diff --git a/runtime/kunlun/utils b/runtime/kunlun/utils new file mode 120000 index 0000000000..9e19e7af5a --- /dev/null +++ b/runtime/kunlun/utils @@ -0,0 +1 @@ +../core/utils \ No newline at end of file diff --git a/runtime/kunlun/websocket b/runtime/kunlun/websocket new file mode 120000 index 0000000000..18f5de12cf --- /dev/null +++ b/runtime/kunlun/websocket @@ -0,0 +1 @@ +../core/websocket \ No newline at end of file diff --git a/runtime/kunlun/xpu/CMakeLists.txt b/runtime/kunlun/xpu/CMakeLists.txt new file mode 100644 index 0000000000..380e23204b --- /dev/null +++ b/runtime/kunlun/xpu/CMakeLists.txt @@ -0,0 +1,25 @@ +message("cmake build type is ${CMAKE_BUILD_TYPE} .") + +if(XPU) + list(APPEND xpu_conformer_srcs ./xpu_asr_model.cc) + list(APPEND xpu_conformer_srcs ./xpu_conformer.cpp) + list(APPEND xpu_conformer_srcs ./xpu_util.cpp) + message(STATUS "Use src_files: [ ${xpu_conformer_srcs} ] to compile xpu_conformer.a .") + + # compile xpu_conformer.a + add_library(xpu_conformer STATIC ${xpu_conformer_srcs}) + target_link_libraries(xpu_conformer PUBLIC xpuapi xpurt) +endif() + +set(CMAKE_VERBOSE_MAKEFILE OFF) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpermissive") +set(CMAKE_EXE_LINKER_FLAGS "-lpthread -lrt -lm -ldl") + +set(SRC_FILES ./conformer_test.cpp ./xpu_conformer.cpp ./xpu_util.cpp) +message(STATUS "Use src_files: [ ${SRC_FILES} ] to compile xpu_conformer_test.") + +add_executable(xpu_conformer_test ${SRC_FILES}) +target_link_libraries(xpu_conformer_test -lxpuapi -lxpurt) diff --git a/runtime/kunlun/xpu/conformer_test.cpp b/runtime/kunlun/xpu/conformer_test.cpp new file mode 100644 index 0000000000..1d9fd672a3 --- /dev/null +++ b/runtime/kunlun/xpu/conformer_test.cpp @@ -0,0 +1,276 @@ +// Copyright (c) 2022 KUNLUNXIN Inc. 
+// 2022 Han Qi (qihan@baidu.com) +// Hehe Pan (panhehe@baidu.com) +// Zikui Yan (yanzikui@baidu.com) +// Chaolin Li (lichaolin@baidu.com) +// All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "xpu_conformer.h" // NOLINT +#include "xpu_util.h" // NOLINT +namespace api = baidu::xpu::api; +namespace wenet = xpu::wenet; + +template +static void conformer_test(const std::string& data_dir, + const std::string& params_dir, int threads_number, + int dev_id) { + typedef std::vector< + std::tuple>, + std::tuple, std::vector>>> + Dtype; + ConformerEncoderParam encoder_param; + init_encoder_params(params_dir, encoder_param); + ConformerDecoderParam decoder_param; + init_decoder_params(params_dir, decoder_param); + int real_threads_number = threads_number <= 0 ? 
1 : threads_number; + std::cout << "Encoder + Decoder MultiStreamTest threads:" + << real_threads_number << std::endl; + // init test data + std::vector ids = get_all_ids(data_dir); + Dtype data_list; + for (auto index_id : ids) { + std::string input_lenghts_prefix = + data_dir + std::to_string(index_id) + "_len"; + std::string input_prefix = data_dir + std::to_string(index_id); + auto input_lenghts_cpu_info = + read_cpu_data_from_file(input_lenghts_prefix, 1); + auto input_xpu_info = read_xpu_data_from_file(input_prefix, 3); + data_list.push_back( + std::make_tuple(input_xpu_info, input_lenghts_cpu_info)); + } + bool write_res = true; + // init mem + int ret = 0; + std::vector ctx_xpu_ptrs(real_threads_number); + std::vector streams(real_threads_number); + + int nsdnn = real_threads_number > 1 ? 2 : 6; + int ncluster = real_threads_number > 1 ? 2 : 8; + for (int i = 0; i < real_threads_number; i++) { + ret = xpu_stream_create(&streams[i]); + ctx_xpu_ptrs[i] = new api::Context(api::kXPU2); + ctx_xpu_ptrs[i]->xpu_stream = streams[i]; + ctx_xpu_ptrs[i]->set_nsdnn(nsdnn); + ctx_xpu_ptrs[i]->set_ncluster(ncluster); + } + // threads + std::vector thread_times(real_threads_number); + std::vector threads; + int data_counter = 0; + std::mutex data_mutex; + std::vector time_info(real_threads_number, 0.0f); + auto f = [&](int thread_id) { + xpu_set_device(dev_id); + api::Context* ctx_xpu = ctx_xpu_ptrs[thread_id]; + api::ctx_guard RAII_GUARD(ctx_xpu); + while (true) { + int data_index = -1; + data_mutex.lock(); + if (data_counter >= data_list.size()) { + data_mutex.unlock(); + break; + } + data_index = data_counter++; + data_mutex.unlock(); + if (data_index < 0) { + continue; + } + auto start_time = std::chrono::system_clock::now(); + // get input data + auto& input_xpu_info = std::get<0>(data_list[data_index]); + auto& input_lenghts_info = std::get<1>(data_list[data_index]); + auto& input_xpu_data = std::get<0>(input_xpu_info); + auto& speech_shape = 
std::get<1>(input_xpu_info); + int batch = speech_shape[0]; + int max_seqlen = speech_shape[1]; + auto xpu_mask_info_float = create_mask_according_speech_length( + std::get<0>(input_lenghts_info), max_seqlen, ctx_xpu->xpu_stream); + ret = xpu_wait(ctx_xpu->xpu_stream); + CHECK_RET(ret); + int q_seqlen = ((max_seqlen - 1) / 2 - 1) / 2; + // encoder run + int att_dim = encoder_param.head_num * encoder_param.head_dim; + int ctc_dim = encoder_param.ctc_dim; + T* encoder_out = RAII_GUARD.alloc(batch * q_seqlen * att_dim); + T* ctc_probs = RAII_GUARD.alloc(batch * q_seqlen * ctc_dim); + // get encoder_out & ctc_probs + ret = wenet::conformer_encoder_wenet( + ctx_xpu, input_xpu_data, speech_shape, encoder_out, ctc_probs, + encoder_param, xpu_mask_info_float); + CHECK_RET(ret); + ret = xpu_wait(ctx_xpu->xpu_stream); + CHECK_RET(ret); + // ctc_prefix_beamsearch implement in cpu + int beam_size = encoder_param.beam_size; + int new_bs = batch * beam_size; + std::vector hyps_len(new_bs); + std::vector ctc_scores(new_bs); + std::vector hyps_cpu; + int* hyps = RAII_GUARD.alloc(new_bs * q_seqlen); + ret = wenet::ctc_prefix_beamsearch(ctx_xpu, ctc_probs, hyps_cpu, + hyps_len, ctc_scores, batch, + beam_size, q_seqlen, ctc_dim); + CHECK_RET(ret); + ret = xpu_wait(ctx_xpu->xpu_stream); + CHECK_RET(ret); + int max_target_len = + padding_target(hyps_cpu, hyps_len, beam_size, decoder_param.eos_id); + ret = xpu_memcpy(hyps, reinterpret_cast(&hyps_cpu[0]), + max_target_len * new_bs * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + ret = xpu_wait(ctx_xpu->xpu_stream); + CHECK_RET(ret); + // decoder + int pad_target_len = decoder_param.add_sos_num + max_target_len; + float* character_scores = + RAII_GUARD.alloc(new_bs * pad_target_len * ctc_dim); + ret = wenet::conformer_decoder_wenet( + ctx_xpu, encoder_out, {batch, q_seqlen, att_dim}, + std::get<0>(xpu_mask_info_float), hyps, {new_bs, max_target_len}, + character_scores, decoder_param); + CHECK_RET(ret); + ret = 
xpu_wait(ctx_xpu->xpu_stream); + CHECK_RET(ret); + // Only use decoder score for rescoring + std::vector best_score(batch, -std::numeric_limits::max()); + std::vector best_index(batch, 0); + float ctc_weight = 0.5; + std::vector decoder_out(new_bs * pad_target_len * ctc_dim); + ret = xpu_memcpy(&decoder_out[0], character_scores, + new_bs * max_target_len * ctc_dim * sizeof(float), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + xpu_wait(ctx_xpu->xpu_stream); + CHECK_RET(ret); + // cal score && output + std::string wav_prefix = + data_dir + std::to_string(data_index) + "_wav.txt"; + std::string res_prefix = "./token_id.txt"; + std::ofstream res; + std::string wav_name; + std::vector wav_info; + if (write_res) { + std::ifstream wav(wav_prefix.c_str()); + if (!wav.is_open()) { + std::cout << "wav file open fail" << std::endl; + exit(0); + } + while (getline(wav, wav_name)) { + wav_info.push_back(wav_name); + } + wav.close(); + } + for (int i = 0; i < batch; i++) { + for (int j = 0; j < beam_size; j++) { + T score = 0.0; + for (int k = 0; k < hyps_len[i * beam_size + j]; k++) { + int index = i * beam_size * max_target_len * ctc_dim + + j * max_target_len * ctc_dim + k * ctc_dim + + hyps_cpu[k]; + score += decoder_out[index]; + } + score += decoder_out[i * beam_size * max_target_len * ctc_dim + + j * max_target_len * ctc_dim + + hyps_len[i * batch + j] * ctc_dim + ctc_dim - 1]; + // add ctc score + score += ctc_weight * ctc_scores[i * beam_size + j]; + if (score > best_score[i]) { + best_score[i] = score; + best_index[i] = j; + } + } + int token_index = best_index[i] + i * beam_size; + if (write_res) { + data_mutex.lock(); + res.open(res_prefix, std::ios::app); + if (!res.is_open()) { + std::cout << "res file open fail" << std::endl; + exit(0); + } + res << wav_info[i] << ":"; + for (int k = 0; k < hyps_len[token_index]; k++) + res << hyps_cpu[k] << " "; + res << std::endl; + res.close(); + data_mutex.unlock(); + } + } + auto end_time = std::chrono::system_clock::now(); + auto 
duration = std::chrono::duration_cast( + end_time - start_time); + time_info[thread_id] += static_cast(duration.count()) / 1000; + ret = xpu_free(std::get<0>(input_xpu_info)); + CHECK_RET(ret); + ret = xpu_free(std::get<0>(xpu_mask_info_float)); + CHECK_RET(ret); + } + }; + auto all_start = std::chrono::system_clock::now(); + for (auto i = 0; i < real_threads_number; i++) { + std::thread t(f, i); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } + auto all_end = std::chrono::system_clock::now(); + auto duration = std::chrono::duration_cast( + all_end - all_start); + float total_time = static_cast(duration.count()) / 1000; + std::cout << "Total time cost:" << total_time << std::endl; + for (int i = 0; i < real_threads_number; i++) { + if (ctx_xpu_ptrs[i]) delete ctx_xpu_ptrs[i]; + } +} + +int main(int argc, char* argv[]) { + if (argc != 6) { + std::cout << "Only support the following three params:" << std::endl; + std::cout + << "\t1. " << argv[0] + << " encoder_test [params_dir] [data_dir] [dev_id] [threads_number]" + << std::endl; + std::cout + << "\t2. " << argv[0] + << " decoder_test [params_dir] [data_dir] [dev_id] [threads_number]" + << std::endl; + std::cout << "\t3. 
" << argv[0] + << " all [params_dir] [data_dir] [dev_id] [threads_number]" + << std::endl; + return 0; + } + std::string mode = argv[1]; + std::string params_dir = argv[2]; + std::string data_dir = argv[3]; + int dev_id = std::stoi(argv[4]); + int threads_number = std::stoi(argv[5]); + add_separator_when_necessary(params_dir); + add_separator_when_necessary(data_dir); + xpu_set_device(dev_id); + + typedef float16 T; + typedef int16_t TW; + typedef int16_t TGEMM; + + if (mode == "all") { + conformer_test(data_dir, params_dir, threads_number, dev_id); + } else { + std::cout << "Unkown test mode: " << mode << std::endl; + std::exit(1); + } +} diff --git a/runtime/kunlun/xpu/xpu_asr_model.cc b/runtime/kunlun/xpu/xpu_asr_model.cc new file mode 100644 index 0000000000..71b60bd156 --- /dev/null +++ b/runtime/kunlun/xpu/xpu_asr_model.cc @@ -0,0 +1,318 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) +// 2022 Han Qi (qihan@baidu.com, Kunlunxin Inc) +// Hehe Pan (panhehe@baidu.com, Kunlunxin Inc) +// Zikui Yan (yanzikui@baidu.com, Kunlunxin Inc) +// All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "xpu_asr_model.h" // NOLINT + +#include +#include +#include +#include + +#include "utils/string.h" + +namespace wenet { + +void XPUAsrModel::SetEngineThreads(int num_threads) { + real_threads_number = num_threads; +} + +void XPUAsrModel::SetDeviceId(int dev_id) { device_id_ = dev_id; } + +void XPUAsrModel::Read(const std::string& model_dir) { + // init xpu runtime params + ctx_xpu_ptr = std::make_shared(api::kXPU2); + RAII_GUARD.reset(new api::ctx_guard(ctx_xpu_ptr.get())); + + // For XPU, model_dir is params_dir, which is used to store weights for every + // layer. + std::string weight_dir = model_dir + "/model_weights/"; + std::string weight_info_txt_path = weight_dir + "/weights_info.txt"; + + LOG(INFO) << "\e[1;34mXPU weight_dir is: " << weight_dir << "\e[0m\n"; + if (!std::ifstream(weight_info_txt_path.c_str()).good()) { + LOG(FATAL) << "weight_info_txt: " << weight_info_txt_path + << " NOT exist !!!\n"; + } + + // 1. Load weight for every layer + init_encoder_params(weight_dir, encoder_param); + init_decoder_params(weight_dir, decoder_param); + + // 2. Read metadata + // TODO(panhehe): Load following parameters from config file or + // encoder/decoder params. + subsampling_rate_ = 4; + right_context_ = 6; + sos_ = 5538; + eos_ = 5538; + is_bidirectional_decoder_ = 1; + + LOG(INFO) << "======= XPU Kunlun Model Info: ======="; + LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; + LOG(INFO) << "\tright_context " << right_context_; + LOG(INFO) << "\tsos " << sos_; + LOG(INFO) << "\teos " << eos_; + LOG(INFO) << "\tis bidirectional decoder " << is_bidirectional_decoder_; +} + +XPUAsrModel::XPUAsrModel(const XPUAsrModel& other) { + // 1. 
Init the model info + right_context_ = other.right_context_; + subsampling_rate_ = other.subsampling_rate_; + sos_ = other.sos_; + eos_ = other.eos_; + is_bidirectional_decoder_ = other.is_bidirectional_decoder_; + chunk_size_ = other.chunk_size_; + num_left_chunks_ = other.num_left_chunks_; + offset_ = other.offset_; + + l3ptr = other.l3ptr; + real_threads_number = other.real_threads_number; + device_id_ = other.device_id_; + ctx_xpu_ptr = other.ctx_xpu_ptr; + RAII_GUARD = other.RAII_GUARD; + encoder_param = other.encoder_param; + decoder_param = other.decoder_param; + stream = other.stream; + // other member variables may not need to copy here +} + +std::shared_ptr XPUAsrModel::Copy() const { + auto asr_model = std::make_shared(*this); + // Reset the inner states for new decoding + asr_model->Reset(); + return asr_model; +} + +void XPUAsrModel::Reset() { + offset_ = 0; + encoder_out = nullptr; + ctc_probs = nullptr; + cached_feature_.clear(); + // Reset att_cache + att_cache_.resize(0, 0.0); + cnn_cache_.resize(0, 0.0); +} + +void XPUAsrModel::ForwardEncoderFunc( + const std::vector>& chunk_feats, + std::vector>* out_prob) { + // Set Device Id + LOG(INFO) << "Now Use XPU:" << device_id_ << "!\n"; + xpu_set_device(device_id_); + + // 1. Prepare XPU required data, splice cached_feature_ and chunk_feats + // The first dimension is for batchsize, which is 1. 
+ // chunk + + int num_frames = cached_feature_.size() + chunk_feats.size(); + const int feature_dim = chunk_feats[0].size(); + + std::vector feats_length_shape = {1}; + std::vector feats_length_data = {num_frames}; + input_lenghts_cpu_info = + std::make_tuple(feats_length_data, feats_length_shape); + + std::vector feats_data_shape = {1, num_frames, feature_dim}; + std::vector feats_data_cpu; + feats_data_cpu.reserve(1 * num_frames * feature_dim); + // convert 2d-vector to 1d-vector + for (auto& row : chunk_feats) { + auto end_iter = feats_data_cpu.end(); + feats_data_cpu.insert(end_iter, row.cbegin(), row.cend()); + } + + float* input_xpu_data = get_xpu_data("wav_test", feats_data_cpu); + input_xpu_info = std::make_tuple(input_xpu_data, feats_data_shape); + + // init L3 Memory + int ret = 0; + real_threads_number = 1; + int nsdnn = real_threads_number > 1 ? 2 : 6; + int ncluster = real_threads_number > 1 ? 2 : 8; + for (int i = 0; i < real_threads_number; i++) { + ret = xpu_stream_create(&stream); + ctx_xpu_ptr->xpu_stream = stream; + ctx_xpu_ptr->set_nsdnn(nsdnn); + ctx_xpu_ptr->set_ncluster(ncluster); + } + + std::shared_ptr ctx_xpu = ctx_xpu_ptr; + + // get input speech info and data + batch = feats_data_shape.at(0); // batch = 1 + max_seqlen = feats_data_shape.at(1); + + xpu_mask_info_float = create_mask_according_speech_length( + feats_length_data, max_seqlen, ctx_xpu->xpu_stream); + + ret = xpu_wait(ctx_xpu->xpu_stream); + CHECK_RET(ret); + + q_seqlen = ((max_seqlen - 1) / 2 - 1) / 2; + + // Encoder run + int att_dim = encoder_param.head_num * encoder_param.head_dim; + int ctc_dim = encoder_param.ctc_dim; + + LOG(INFO) << "\t max_seqlen is " << max_seqlen << "\n"; + LOG(INFO) << "\t q_seqlen is " << q_seqlen << "\n"; + LOG(INFO) << "\t att_dim is " << att_dim << "\n"; + LOG(INFO) << "\t ctc_dim is " << ctc_dim << "\n"; + + // T is float16 + encoder_out = RAII_GUARD->alloc(batch * q_seqlen * att_dim); + ctc_probs = RAII_GUARD->alloc(batch * q_seqlen * 
ctc_dim); + + // 2. Encoder chunk forward, including ctc_activation + // get encoder_out & ctc_probs + ret = xpu::wenet::conformer_encoder_wenet( + ctx_xpu.get(), input_xpu_data, feats_data_shape, encoder_out, ctc_probs, + encoder_param, xpu_mask_info_float); + CHECK_RET(ret); + + // Copy to output(cpu) + int num_outputs = q_seqlen; + int output_dim = ctc_dim; + out_prob->resize(num_outputs); + + float* logp = RAII_GUARD->alloc(batch * q_seqlen * ctc_dim); + // cast T to float32 + ret = api::cast_v2(ctx_xpu.get(), ctc_probs, logp, + batch * q_seqlen * ctc_dim); + CHECK_RET(ret); + ret = xpu_wait(ctx_xpu->xpu_stream); + CHECK_RET(ret); + + // xpu_memcpy logp from device to host + for (int i = 0; i < num_outputs; i++) { + (*out_prob)[i].resize(output_dim); + ret = xpu_memcpy(reinterpret_cast((*out_prob)[i].data()), + logp + output_dim * i, output_dim * sizeof(float), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + CHECK_RET(ret); + } +} + +float XPUAsrModel::ComputeAttentionScore(const float* prob, + const std::vector& hyp, int eos, + int decode_out_len) { + float score = 0.0f; + for (size_t j = 0; j < hyp.size(); ++j) { + score += *(prob + j * decode_out_len + hyp[j]); + } + score += *(prob + hyp.size() * decode_out_len + eos); + return score; +} + +void XPUAsrModel::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) { + CHECK(rescoring_score != nullptr); + int num_hyps = hyps.size(); + rescoring_score->resize(num_hyps, 0.0f); + + if (num_hyps == 0) { + return; + } + + if (encoder_out == nullptr) { + return; + } + + int beam_size = encoder_param.beam_size; + int new_bs = batch * beam_size; + + std::vector hyps_lens; + int max_hyps_len = 0; + for (size_t i = 0; i < num_hyps; ++i) { + int length = hyps[i].size() + 1; + max_hyps_len = std::max(length, max_hyps_len); + hyps_lens.emplace_back(static_cast(length)); + } + LOG(INFO) << "\t num_hyps is " << num_hyps << "\n"; + LOG(INFO) << "\t beam_size is " << beam_size << "\n"; + 
LOG(INFO) << "\t new_bs is " << new_bs << "\n"; + LOG(INFO) << "\t max_hyps_len is " << max_hyps_len << "\n"; + + // pad hyps + std::vector hyps_pad_cpu(max_hyps_len * beam_size); + for (size_t i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + hyps_pad_cpu.emplace_back(sos_); + size_t j = 0; + for (; j < hyp.size(); ++j) { + hyps_pad_cpu.emplace_back(hyp[j]); + } + if (j == max_hyps_len - 1) { + continue; + } + for (; j < max_hyps_len - 1; ++j) { + hyps_pad_cpu.emplace_back(0); + } + } + int* hyps_xpu = RAII_GUARD->alloc(new_bs * q_seqlen); + int max_target_len = max_hyps_len; + // xpu_memcpy hyps_pad_cpu to device + int ret = xpu_memcpy(hyps_xpu, reinterpret_cast(hyps_pad_cpu.data()), + max_target_len * new_bs * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + CHECK_RET(ret); + + // Decoder + int att_dim = encoder_param.head_num * encoder_param.head_dim; + int ctc_dim = encoder_param.ctc_dim; + int pad_target_len = decoder_param.add_sos_num + max_target_len; + float* character_scores = + RAII_GUARD->alloc(new_bs * pad_target_len * ctc_dim); + ret = xpu::wenet::conformer_decoder_wenet( + ctx_xpu_ptr.get(), encoder_out, {batch, q_seqlen, att_dim}, + std::get<0>(xpu_mask_info_float), hyps_xpu, {new_bs, max_target_len}, + character_scores, decoder_param); + CHECK_RET(ret); + ret = xpu_wait(ctx_xpu_ptr->xpu_stream); + CHECK_RET(ret); + + // xpu_memcpy from xpu device to host + std::vector decoder_out(new_bs * pad_target_len * ctc_dim); + ret = xpu_memcpy(&decoder_out[0], character_scores, + new_bs * max_target_len * ctc_dim * sizeof(float), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + CHECK_RET(ret); + ret = xpu_wait(ctx_xpu_ptr->xpu_stream); + CHECK_RET(ret); + + // calculate score + float* decoder_outs_data = decoder_out.data(); + for (size_t i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + float score = 0.0f; + // left to right decoder score + // ctc_dim maybe equal to decode_out_len + score = ComputeAttentionScore( + decoder_outs_data + 
max_target_len * ctc_dim * i, hyp, eos_, ctc_dim); + // Optional: Used for right to left score + float r_score = 0.0f; + // reverse_weight is 0, so the code in the if-condition is ignored. + // combined left-to-right and right-to-left score + (*rescoring_score)[i] = + score * (1 - reverse_weight) + r_score * reverse_weight; + } +} + +} // namespace wenet diff --git a/runtime/kunlun/xpu/xpu_asr_model.h b/runtime/kunlun/xpu/xpu_asr_model.h new file mode 100644 index 0000000000..500081ad9d --- /dev/null +++ b/runtime/kunlun/xpu/xpu_asr_model.h @@ -0,0 +1,101 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) +// 2022 Han Qi (qihan@baidu.com, Kunlunxin Inc) +// Hehe Pan (panhehe@baidu.com, Kunlunxin Inc) +// Zikui Yan (yanzikui@baidu.com, Kunlunxin Inc) +// All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_ +#define RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_ + +#include +#include +#include +#include + +#include "decoder/asr_model.h" +#include "utils/log.h" +#include "utils/utils.h" + +#include "xpu_conformer.h" // NOLINT + +namespace wenet { + +class XPUAsrModel : public AsrModel { + typedef float16 T; + typedef int16_t TW; + + public: + // Note: Do not call the InitEngineThreads function more than once. 
+ void SetEngineThreads(int num_threads = 1); + + public: + XPUAsrModel() = default; + XPUAsrModel(const XPUAsrModel& other); + void SetDeviceId(int dev_id); + void Read(const std::string& model_dir); + void Reset() override; + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override; + std::shared_ptr Copy() const override; + + protected: + void ForwardEncoderFunc(const std::vector>& chunk_feats, + std::vector>* ctc_prob) override; + + float ComputeAttentionScore(const float* prob, const std::vector& hyp, + int eos, int decode_out_len); + + private: + int encoder_output_size_ = 0; + int num_blocks_ = 0; + int cnn_module_kernel_ = 0; + int head_ = 0; + + // XPU device id + int device_id_ = 0; + int real_threads_number = 1; + + // XPU Conformer EncoderParam and DecoderParam + ConformerEncoderParam encoder_param; + ConformerDecoderParam decoder_param; + + // XPU input and weights params + using INPUT_LENGTH_CPU_TUPLE = std::tuple, std::vector>; + using INPUT_XPU_INFO_TUPLE = std::tuple>; + INPUT_LENGTH_CPU_TUPLE input_lenghts_cpu_info; + INPUT_XPU_INFO_TUPLE input_xpu_info; + INPUT_XPU_INFO_TUPLE xpu_mask_info_float; + + // XPU encoder and decoder outputs + T* encoder_out = nullptr; + T* ctc_probs = nullptr; + + // XPU runtime params + void* l3ptr = nullptr; + XPUStream stream; + std::shared_ptr ctx_xpu_ptr; + std::shared_ptr RAII_GUARD; + + int batch, max_seqlen, q_seqlen; + + // caches + std::vector att_cache_; + std::vector cnn_cache_; +}; + +} // namespace wenet + +#endif // RUNTIME_KUNLUN_XPU_XPU_ASR_MODEL_H_ diff --git a/runtime/kunlun/xpu/xpu_conformer.cpp b/runtime/kunlun/xpu/xpu_conformer.cpp new file mode 100644 index 0000000000..f5fd562a62 --- /dev/null +++ b/runtime/kunlun/xpu/xpu_conformer.cpp @@ -0,0 +1,971 @@ +// Copyright (c) 2022 KUNLUNXIN Inc. 
+// 2022 Han Qi (qihan@baidu.com) +// Hehe Pan (panhehe@baidu.com) +// Zikui Yan (yanzikui@baidu.com) +// Chaolin Li (lichaolin@baidu.com) +// All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "xpu_conformer.h" // NOLINT +#include +#include +#include +#include + +namespace xpu { +namespace wenet { +const int X4_BEGIN = 8; +template +static int encoder_embed(api::Context* ctx_xpu, const float* x, T* y, int batch, + int max_seqlen, int seq_dim, int att_dim, + const ConformerEncoderParam& param) { + api::ctx_guard RAII_GUARD(ctx_xpu); + int ret = 0; + int h_seqlen = (max_seqlen - 1) / 2; + int q_seqlen = (h_seqlen - 1) / 2; + int out_channels = att_dim; + int h_dim = (seq_dim - 1) / 2; + int q_dim = (h_dim - 1) / 2; + + float xscale = std::sqrt(att_dim); + std::vector sizes = {std::max(batch * max_seqlen * seq_dim, + batch * out_channels * q_seqlen * q_dim), + batch * out_channels * h_seqlen * h_dim}; + std::vector ptrs; + for (auto size_ind : sizes) { + ptrs.push_back(RAII_GUARD.alloc(size_ind)); + } + + auto& emb_conv_w_list = param.emb_conv_w_list; + auto& emb_conv_maxw_list = param.emb_conv_maxw_list; + auto& emb_conv_bias_list = param.emb_conv_bias_list; + auto& emb_fc_w = param.emb_fc_w_list; + auto& emb_fc_maxw = param.emb_fc_maxw_list; + auto& emb_fc_bias = param.emb_fc_bias_list; + + ret = + api::cast_v2(ctx_xpu, x, ptrs[0], batch * max_seqlen * seq_dim); + WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); + ret = 
api::conv2d_fusion( + ctx_xpu, ptrs[0], emb_conv_w_list[0], ptrs[1], batch, 1, max_seqlen, + seq_dim, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, 1, nullptr, + emb_conv_maxw_list[0], nullptr, true, emb_conv_bias_list[0], nullptr, + api::Activation_t::RELU, nullptr); + WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); + ret = api::conv2d_fusion( + ctx_xpu, ptrs[1], emb_conv_w_list[1], ptrs[0], batch, out_channels, + h_seqlen, h_dim, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, 1, nullptr, + emb_conv_maxw_list[1], nullptr, true, emb_conv_bias_list[1], nullptr, + api::Activation_t::RELU, nullptr); + WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); + ret = api::transpose(ctx_xpu, ptrs[0], ptrs[1], + {batch, out_channels, q_seqlen, q_dim}, {0, 2, 1, 3}); + WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); + ret = api::fc_fusion( + ctx_xpu, ptrs[1], emb_fc_w[0], ptrs[0], batch * q_seqlen, att_dim, + out_channels * q_dim, false, true, nullptr, emb_fc_maxw[0], nullptr, + out_channels * q_dim, out_channels * q_dim, att_dim, 1.0f, 0.0f, + emb_fc_bias[0], api::Activation_t::LINEAR); + WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); + ret = api::scale(ctx_xpu, ptrs[0], y, batch * q_seqlen * out_channels, + false, xscale, 0); + WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); + ret = xpu_wait(ctx_xpu->xpu_stream); + WRAPPER_ASSERT_SUCCESS(ctx_xpu, ret); + return api::SUCCESS; +} + +template +static int ffn(api::Context* ctx, int batch, int q_seqlen, int hidden_dim, + bool with_endln, const T* x, T* y, int ln_begin, int fc_begin, + std::vector ln_scale_list, + std::vector ln_bias_list, + std::vector fc_w_list, + std::vector fc_maxw_list, + std::vector fc_bias_list, + std::vector mem_single, int ffn_factor) { + api::ctx_guard RAII_GUARD(ctx); + int ret = api::SUCCESS; + std::unordered_map buf_mapping = { + {"ffn_ln", mem_single[1]}, {"ffn_fc0", mem_single[X4_BEGIN]}, + {"tmp0", mem_single[X4_BEGIN + 1]}, {"tmp1", mem_single[X4_BEGIN]}, + {"ffn_fc1", mem_single[1]}, + }; + int ffn1_out_dim = hidden_dim * ffn_factor; + int 
ffn2_input_dim = ffn1_out_dim; + ret = api::layer_norm(ctx, x, buf_mapping["ffn_ln"], batch * q_seqlen, + hidden_dim, 1e-5, ln_scale_list[ln_begin], + ln_bias_list[ln_begin], nullptr, nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::fc_fusion( + ctx, buf_mapping["ffn_ln"], fc_w_list[fc_begin], buf_mapping["ffn_fc0"], + batch * q_seqlen, ffn1_out_dim, hidden_dim, false, true, nullptr, + fc_maxw_list[fc_begin], nullptr, hidden_dim, hidden_dim, ffn1_out_dim, + 1.0f, 0.0f, fc_bias_list[fc_begin], api::Activation_t::LINEAR); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::sigmoid(ctx, buf_mapping["ffn_fc0"], buf_mapping["tmp0"], + batch * q_seqlen * hidden_dim * ffn_factor); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::mul(ctx, buf_mapping["ffn_fc0"], buf_mapping["tmp0"], + buf_mapping["tmp1"], + batch * q_seqlen * hidden_dim * ffn_factor); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::fc_fusion( + ctx, buf_mapping["tmp1"], fc_w_list[fc_begin + 1], buf_mapping["ffn_fc1"], + batch * q_seqlen, hidden_dim, ffn2_input_dim, false, true, nullptr, + fc_maxw_list[fc_begin + 1], nullptr, ffn2_input_dim, ffn2_input_dim, + hidden_dim, 0.5f, 0.0f, fc_bias_list[fc_begin + 1], + api::Activation_t::LINEAR); + if (with_endln) { + ret = api::add_layer_norm_fusion( + ctx, x, buf_mapping["ffn_fc1"], y, batch * q_seqlen, hidden_dim, 1e-5, + ln_scale_list[ln_begin + 1], ln_bias_list[ln_begin + 1]); + } else { + ret = api::add(ctx, x, buf_mapping["ffn_fc1"], y, + batch * q_seqlen * hidden_dim); + } + WRAPPER_ASSERT_SUCCESS(ctx, ret); + return api::SUCCESS; +} + +template +int wenet_encoder_layer(api::Context* ctx, + api::ctx_guard& RAII_GUARD, // NOLINT + int batch, int q_seqlen, int hidden_dim, int ln_begin, + int fc_begin, int attn_pos_begin, int conv_begin, + const T* x, T* y, + ConformerEncoderParam& param, // NOLINT + std::vector& mem_single, // NOLINT + std::vector& mem_double, // NOLINT + float* mem_float, float* mask_score) { + WRAPPER_CHECK_CTX(ctx); + int max_size 
= ctx->max_ptr_size(); + int ret = api::SUCCESS; + std::unordered_map buf_mapping = { + {"ffn0_out", mem_single[1]}, + {"swp0", mem_single[2]}, + {"swp1", mem_single[3]}, + {"matrix_bd_pre", mem_double[0]}, + {"soft_scores", mem_double[0]}, + {"qkv", mem_single[2]}, + {"qkv_add", mem_single[1]}, + {"conv_p1", mem_single[X4_BEGIN + 2]}, + {"conv_glu0", mem_single[X4_BEGIN + 3]}, + {"conv_glu1", mem_single[X4_BEGIN + 4]}, + {"conv_d1", mem_single[X4_BEGIN + 3]}, + {"conv_p2", mem_single[X4_BEGIN + 2]}, + {"conv_after", mem_single[0]}, + }; + + auto ln_scale_list = param.ln_scale_list; + auto ln_bias_list = param.ln_bias_list; + + auto fc_w_list = param.fc_w_list; + auto fc_maxw_list = param.fc_maxw_list; + auto fc_bias_list = param.fc_bias_list; + + auto attn_pos_w_list = param.attn_pos_w_list; + auto attn_pos_maxw_list = param.attn_pos_maxw_list; + auto attn_pos_uv_bias_list = param.attn_pos_uv_bias_list; + + auto conv_w_list = param.conv_w_list; + auto conv_maxw_list = param.conv_maxw_list; + auto conv_bias_list = param.conv_bias_list; + + auto kernel_size = param.conv_param.kernel_size; + auto lorder = param.conv_param.lorder; + auto padding = param.conv_param.padding; + auto head_num = param.head_num; + auto head_dim = param.head_dim; + /* + ** feed forward macaron-style module + ** x = residual + 0.5*ff(x) + */ + ret = ffn(ctx, batch, q_seqlen, hidden_dim, false, x, + buf_mapping["ffn0_out"], ln_begin, fc_begin, + ln_scale_list, ln_bias_list, fc_w_list, fc_maxw_list, + fc_bias_list, mem_single, param.ffn_factor); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + /* + ** multi-headed self-attention module + ** qkv_list[0-4]: q,k,v,qu,qv mapping single[2-6] + ** attn_pos_uv_bias_list : float -> float16 + ** q_pos_attention : get pos_emb before cal + ** q_pos_attention : cal matrix_bd to qk_attention's mask ,when cal + *qk_attention, mask will be added + **/ + T* qkv_list[5] = {mem_single[6], mem_single[3], mem_single[4], mem_single[5], + mem_single[2]}; + ret = 
api::layer_norm(ctx, buf_mapping["ffn0_out"], buf_mapping["swp0"], + batch * q_seqlen, hidden_dim, 1e-5, + ln_scale_list[ln_begin + 1], + ln_bias_list[ln_begin + 1], nullptr, nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::fc_fusion_3c( + ctx, buf_mapping["swp0"], fc_w_list[fc_begin + 2], qkv_list[0], + qkv_list[1], qkv_list[2], batch * q_seqlen, hidden_dim * 3, hidden_dim, + false, true, nullptr, fc_maxw_list[fc_begin + 2], nullptr, hidden_dim, + hidden_dim, hidden_dim * 3, 1.0f, 0.0f, fc_bias_list[fc_begin + 2], + api::Activation_t::LINEAR); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + for (int i = 0; i < 2; i++) { + ret = api::broadcast_add( + ctx, qkv_list[0], attn_pos_uv_bias_list[attn_pos_begin * 2 + i], + qkv_list[i + 3], {batch, q_seqlen, hidden_dim}, {1, 1, hidden_dim}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + } + int pos_emb_dim = 2 * q_seqlen - 1; + T* pos_emb_sliced = RAII_GUARD.alloc(pos_emb_dim * hidden_dim); + ret = api::slice(ctx, param.pos_emb[attn_pos_begin], pos_emb_sliced, + {5000, head_num, head_dim}, {0, 0, 0}, + {pos_emb_dim, head_num, head_dim}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + int tmp_sliced_len = batch * head_num * q_seqlen * q_seqlen; + float* tmp_mask = RAII_GUARD.alloc(tmp_sliced_len); + ret = api::q_pos_attention( + ctx, qkv_list[4], pos_emb_sliced, buf_mapping["matrix_bd_pre"], batch, + q_seqlen, head_num, head_dim, 1.0f / std::sqrt(head_dim), nullptr, + nullptr, nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::slice(ctx, buf_mapping["matrix_bd_pre"], + reinterpret_cast(mem_float), + {batch, head_num, q_seqlen, pos_emb_dim}, {0, 0, 0, 0}, + {batch, head_num, q_seqlen, q_seqlen}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::cast_v2(ctx, reinterpret_cast(mem_float), tmp_mask, + batch * head_num * q_seqlen * q_seqlen); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::broadcast_add(ctx, tmp_mask, mask_score, mem_float, + {batch, head_num, q_seqlen, q_seqlen}, + {batch, q_seqlen}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); 
+ api::QKVAttnParam loop_p(batch, q_seqlen, head_num, head_dim, + {batch, head_num, q_seqlen, q_seqlen}, + api::Activation_t::LINEAR, -1, false, hidden_dim); + float* qk_maxptr = RAII_GUARD.alloc(max_size); + ret = api::qk_attention( + ctx, qkv_list[3], qkv_list[1], buf_mapping["soft_scores"], nullptr, + nullptr, qk_maxptr, loop_p, mem_float); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + float* qkv_maxptr = RAII_GUARD.alloc(max_size); + ret = api::qk_v_attention( + ctx, buf_mapping["soft_scores"], qkv_list[2], buf_mapping["qkv"], + qk_maxptr, nullptr, qkv_maxptr, loop_p); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::fc_fusion( + ctx, buf_mapping["qkv"], fc_w_list[fc_begin + 3], buf_mapping["swp1"], + batch * q_seqlen, hidden_dim, hidden_dim, false, true, qkv_maxptr, + fc_maxw_list[fc_begin + 3], nullptr, hidden_dim, hidden_dim, hidden_dim, + 1.0f, 0.0f, fc_bias_list[fc_begin + 3], api::Activation_t::LINEAR); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::add(ctx, buf_mapping["ffn0_out"], buf_mapping["swp1"], + buf_mapping["qkv_add"], batch * q_seqlen * hidden_dim); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + /* + ** Conv conv_p1-conv_d1-conv_p2 + */ + ret = api::layer_norm(ctx, buf_mapping["qkv_add"], buf_mapping["swp1"], + batch * q_seqlen, hidden_dim, 1e-5, + ln_scale_list[ln_begin + 2], + ln_bias_list[ln_begin + 2], nullptr, nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::transpose(ctx, buf_mapping["swp1"], buf_mapping["swp0"], + {batch, q_seqlen, hidden_dim}, {0, 2, 1}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + int pad_seqlen = q_seqlen; + if (lorder > 0) { + ret = api::pad(ctx, buf_mapping["swp0"], buf_mapping["swp1"], + {batch, hidden_dim, q_seqlen}, {0, 0, lorder}, {0, 0, 0}, + padding); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + pad_seqlen += lorder; + } + ret = api::conv2d_fusion( + ctx, buf_mapping["swp1"], conv_w_list[conv_begin], buf_mapping["swp0"], + batch, hidden_dim, 1, pad_seqlen, hidden_dim * 2, {1, 1}, {1, 1}, + {0, 0, 0, 0}, {1, 1}, 1, nullptr, 
conv_maxw_list[conv_begin], nullptr, + true, conv_bias_list[conv_begin], nullptr, api::Activation_t::LINEAR, + nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::split(ctx, buf_mapping["swp0"], + {buf_mapping["conv_glu0"], buf_mapping["conv_glu1"]}, + {batch, hidden_dim * 2, pad_seqlen}, + {hidden_dim, hidden_dim}, 1); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::sigmoid(ctx, buf_mapping["conv_glu1"], buf_mapping["conv_glu1"], + batch * pad_seqlen * hidden_dim); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::mul(ctx, buf_mapping["conv_glu0"], buf_mapping["conv_glu1"], + buf_mapping["conv_p1"], batch * pad_seqlen * hidden_dim); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::conv1d_fusion( + ctx, buf_mapping["conv_p1"], conv_w_list[conv_begin + 1], + buf_mapping["conv_d1"], batch, hidden_dim, pad_seqlen, hidden_dim, + kernel_size, 1, {0}, 1, hidden_dim, nullptr, + conv_maxw_list[conv_begin + 1], nullptr, true, + conv_bias_list[conv_begin + 1], nullptr, api::Activation_t::LINEAR, + nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + + ret = api::transpose(ctx, buf_mapping["conv_d1"], buf_mapping["swp0"], + {batch, hidden_dim, q_seqlen}, {0, 2, 1}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::layer_norm(ctx, buf_mapping["swp0"], buf_mapping["swp1"], + batch * q_seqlen, hidden_dim, 1e-5, + ln_scale_list[ln_begin + 3], + ln_bias_list[ln_begin + 3], nullptr, nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::sigmoid(ctx, buf_mapping["swp1"], buf_mapping["swp0"], + batch * q_seqlen * hidden_dim); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::mul(ctx, buf_mapping["swp0"], buf_mapping["swp1"], + buf_mapping["conv_p1"], batch * q_seqlen * hidden_dim); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::transpose(ctx, buf_mapping["conv_p1"], buf_mapping["conv_d1"], + {batch, q_seqlen, hidden_dim}, {0, 2, 1}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::conv2d_fusion( + ctx, buf_mapping["conv_d1"], conv_w_list[conv_begin + 2], + 
buf_mapping["conv_p2"], batch, hidden_dim, 1, q_seqlen, hidden_dim, + {1, 1}, {1, 1}, {0, 0, 0, 0}, {1, 1}, 1, nullptr, + conv_maxw_list[conv_begin + 2], nullptr, true, + conv_bias_list[conv_begin + 2], nullptr, api::Activation_t::LINEAR, + nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::transpose(ctx, buf_mapping["conv_p2"], buf_mapping["swp0"], + {batch, hidden_dim, q_seqlen}, {0, 2, 1}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::add(ctx, buf_mapping["swp0"], buf_mapping["qkv_add"], + buf_mapping["conv_after"], batch * q_seqlen * hidden_dim); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + /* + ** feed forward module + ** x = residual + 0.5*ff(x) + */ + ret = ffn( + ctx, batch, q_seqlen, hidden_dim, true, buf_mapping["conv_after"], y, + ln_begin + 4, fc_begin + 4, ln_scale_list, ln_bias_list, fc_w_list, + fc_maxw_list, fc_bias_list, mem_single, param.ffn_factor); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + return api::SUCCESS; +} + +template +int conformer_encoder_wenet( + api::Context* ctx, float* x, const std::vector& data_shape, + T* encoder_out, T* ctc_probs, + ConformerEncoderParam& param, // NOLINT + const std::tuple>& xpu_mask_info) { + // Embedding -> Encoder_layer * N -> Layernorm -> Ctc_loss + int ret = 0; + int fc_num_per_layer = param.fc_num_per_layer; + int conv_num_per_layer = param.conv_num_per_layer; + int ln_num_per_layer = param.ln_num_per_layer; + int ffn_factor = param.ffn_factor; + int head_num = param.head_num; + int head_dim = param.head_dim; + int att_dim = head_num * head_dim; + int ctc_dim = param.ctc_dim; + int batch = data_shape[0]; + int max_seqlen = data_shape[1]; + int seq_dim = data_shape[2]; + int h_seqlen = (max_seqlen - 1) / 2; + int q_seqlen = (h_seqlen - 1) / 2; + + WRAPPER_ASSERT_GT(ctx, param.layer_num, 0); + WRAPPER_ASSERT_GT(ctx, batch, 0); + WRAPPER_ASSERT_GT(ctx, head_num, 0); + WRAPPER_ASSERT_GT(ctx, ctc_dim, 0); + WRAPPER_ASSERT_GT(ctx, head_dim, 0); + // Inital GM + api::ctx_guard RAII_GUARD(ctx); + std::vector 
mem_double; + std::vector mem_single; + int base_len = batch * (q_seqlen + 14) * (att_dim + 14); + for (int i = 0; i < 8; i++) { + mem_single.push_back(RAII_GUARD.alloc(base_len)); + } + mem_single.push_back(RAII_GUARD.alloc(base_len * ffn_factor)); + mem_single.push_back(RAII_GUARD.alloc(base_len * ffn_factor)); + mem_single.push_back(RAII_GUARD.alloc(base_len * 4)); + mem_single.push_back(RAII_GUARD.alloc(base_len * 4)); + mem_single.push_back(RAII_GUARD.alloc(base_len * 2)); + mem_double.push_back( + RAII_GUARD.alloc(batch * head_num * q_seqlen * q_seqlen * 3)); + mem_double.push_back( + RAII_GUARD.alloc(batch * head_num * q_seqlen * q_seqlen)); + int ind_len = base_len * 6 + batch * param.head_num * q_seqlen * q_seqlen * 2; + int lens = + batch * param.head_num * q_seqlen * q_seqlen * sizeof(float) / sizeof(T); + float* mem_float = RAII_GUARD.alloc(lens); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + T* calx = mem_single[0]; + T* caly = mem_single[0]; + + // embedding + mask + float* emb = RAII_GUARD.alloc(batch * max_seqlen * seq_dim); + float* emb_nm = RAII_GUARD.alloc(batch * max_seqlen * seq_dim); + T* emb_fc = RAII_GUARD.alloc(batch * q_seqlen * att_dim); + ret = api::broadcast_sub(ctx, x, param.cmvn_mean, emb, data_shape, + {1, 1, 80}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::broadcast_mul(ctx, emb, param.cmvn_istd, emb_nm, data_shape, + {1, 1, 80}); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = encoder_embed(ctx, emb_nm, calx, batch, max_seqlen, seq_dim, + att_dim, param); + float* mask_scores = RAII_GUARD.alloc(batch * q_seqlen); + ret = api::scale(ctx, std::get<0>(xpu_mask_info), mask_scores, + batch * q_seqlen, false, 1e4, -1); + CHECK_RET(ret); + // encoder * N + for (int i = 0; i < param.layer_num; i++) { + int ln_begin = i * ln_num_per_layer; + int fc_begin = i * fc_num_per_layer; + int attn_pos_begin = i; + int conv_begin = i * conv_num_per_layer; + ret = wenet_encoder_layer( + ctx, RAII_GUARD, batch, q_seqlen, att_dim, ln_begin, fc_begin, + 
attn_pos_begin, conv_begin, calx, caly, param, mem_single, mem_double, + mem_float, mask_scores); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + calx = caly; + } + // Final Layer_Norm + int ln_begin = param.layer_num * param.ln_num_per_layer; + int fc_begin = param.layer_num * param.fc_num_per_layer; + auto final_ln_scale = param.ln_scale_list[ln_begin]; + auto final_ln_bias = param.ln_bias_list[ln_begin]; + ret = api::layer_norm(ctx, caly, encoder_out, batch * q_seqlen, att_dim, 1e-5, + final_ln_scale, final_ln_bias, nullptr, nullptr); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + // Ctc_Loss + log_sofmax + auto ctc_fc_w = param.fc_w_list[fc_begin]; + auto ctc_fc_maxw = param.fc_maxw_list[fc_begin]; + auto ctc_fc_bias = param.fc_bias_list[fc_begin]; + float* ctc_buffer = RAII_GUARD.alloc(batch * q_seqlen * ctc_dim); + ret = api::fc_fusion( + ctx, encoder_out, ctc_fc_w, ctc_buffer, batch * q_seqlen, ctc_dim, + att_dim, false, true, nullptr, ctc_fc_maxw, nullptr, att_dim, att_dim, + ctc_dim, 1.0f, 0.0f, ctc_fc_bias, api::Activation_t::LINEAR); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + float* softmax_out = RAII_GUARD.alloc(batch * q_seqlen * ctc_dim); + ret = api::softmax(ctx, ctc_buffer, softmax_out, + {batch, q_seqlen, ctc_dim}, 2); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + float* log_out = RAII_GUARD.alloc(batch * q_seqlen * ctc_dim); + ret = api::log(ctx, softmax_out, log_out, batch * q_seqlen * ctc_dim); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::cast_v2(ctx, log_out, ctc_probs, + batch * q_seqlen * ctc_dim); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + return api::SUCCESS; +} + +#define INSTANTIATION_CONSFORMER_WENET(T, TW, TGEMM) \ + template int conformer_encoder_wenet( \ + api::Context*, float*, const std::vector&, T*, T*, \ + ConformerEncoderParam&, \ + const std::tuple>&); +INSTANTIATION_CONSFORMER_WENET(float16, int16_t, int16_t); + +const float kFloatMax = std::numeric_limits::max(); +float logadd(std::vector const& x) { + float xmax = *max_element(x.begin(), x.end()); + if (xmax 
<= -kFloatMax) { + return -kFloatMax; + } + float sum = 0.0; + for (auto& it : x) { + sum += std::exp(it - xmax); + } + return std::log(sum) + xmax; +} + +struct PrefixScore { + float s = -kFloatMax; + float ns = -kFloatMax; + float score() const { return logadd({s, ns}); } + void check() const { + std::cout << "score " << s << std::endl; + std::cout << "nscore " << ns << std::endl; + } +}; + +struct PrefixHash { + size_t operator()(const std::vector& prefix) const { + size_t hash_code = 0; + // here we use BKDR hash code + for (int id : prefix) { + hash_code = id + 31 * hash_code; + } + return hash_code; + } +}; + +static bool PrefixScoreCompare( + const std::pair, PrefixScore>& a, + const std::pair, PrefixScore>& b) { + return a.second.score() > b.second.score(); +} + +template +int ctc_prefix_beamsearch(api::Context* ctx, T* ctc_probs, + std::vector& hyps, // NOLINT + std::vector& hyps_len, // NOLINT + std::vector& ctc_scores, int batch, // NOLINT + int beam_size, int max_len, int ctc_dim) { + // 0. 
get topk + api::ctx_guard RAII_GUARD(ctx); + int data_len = batch * max_len * beam_size; + int* topk_index_buf = RAII_GUARD.alloc(data_len); + float* topk_score_buf = RAII_GUARD.alloc(data_len); + float* logp = RAII_GUARD.alloc(batch * max_len * ctc_dim); + int ret = + api::cast_v2(ctx, ctc_probs, logp, batch * max_len * ctc_dim); + ret = api::sorted_topk(ctx, logp, topk_score_buf, topk_index_buf, + max_len, ctc_dim, beam_size, true); + xpu_wait(ctx->xpu_stream); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + std::vector topk_index(data_len); + std::vector topk_score(data_len); + ret = xpu_memcpy(reinterpret_cast(&topk_index[0]), topk_index_buf, + data_len * sizeof(int), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + CHECK_RET(ret); + ret = xpu_memcpy(reinterpret_cast(&topk_score[0]), topk_score_buf, + data_len * sizeof(float), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + CHECK_RET(ret); + std::unordered_map, PrefixScore, PrefixHash> cur_hyps; + PrefixScore prefix_score; + prefix_score.s = 0.0; + prefix_score.ns = -kFloatMax; + std::vector empty; + cur_hyps[empty] = prefix_score; + for (int t = 0; t < max_len; ++t) { + int offset = beam_size * t; + std::unordered_map, PrefixScore, PrefixHash> next_hyps; + // 1. Token passing + for (int i = 0; i < beam_size; ++i) { + int id = topk_index[i + offset]; + float prob = topk_score[i + offset]; + for (const auto& it : cur_hyps) { + const std::vector& prefix = it.first; + const PrefixScore& prefix_score = it.second; + if (id == 0) { + // Case 0: *a + ε => *a + PrefixScore& next_score = next_hyps[prefix]; + next_score.s = logadd( + {next_score.s, prefix_score.s + prob, prefix_score.ns + prob}); + // Prefix not changed, copy the context from prefix. 
+ next_hyps[prefix] = next_score; + } else if (!prefix.empty() && id == prefix.back()) { + // Case 1: *a + a => *a + PrefixScore& next_score = next_hyps[prefix]; + next_score.ns = logadd({next_score.ns, prefix_score.ns + prob}); + next_hyps[prefix] = next_score; + // Case 2: *aε + a => *aa + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score1 = next_hyps[new_prefix]; + next_score1.ns = logadd({next_score1.ns, prefix_score.s + prob}); + next_hyps[new_prefix] = next_score1; + } else { + // Case 3: *a + b => *ab, *aε + b => *ab + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score = next_hyps[new_prefix]; + next_score.ns = logadd( + {next_score.ns, prefix_score.s + prob, prefix_score.ns + prob}); + next_hyps[new_prefix] = next_score; + } + } + } + // 2. Second beam prune, only keep top n best paths + std::vector, PrefixScore>> arr(next_hyps.begin(), + next_hyps.end()); + std::nth_element(arr.begin(), arr.begin() + beam_size, arr.end(), + PrefixScoreCompare); + arr.resize(beam_size); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + // 3. 
Update cur_hyps and get new result + cur_hyps.clear(); + for (int k = 0; k < beam_size; k++) { + cur_hyps[arr[k].first] = arr[k].second; + } + } + std::vector, PrefixScore>> arr(cur_hyps.begin(), + cur_hyps.end()); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + int beam = 0; + for (auto it : arr) { + auto vec = it.first; + hyps_len[beam] = vec.size(); + ctc_scores[beam] = it.second.score(); + hyps.insert(hyps.end(), vec.begin(), vec.end()); + beam++; + } + return api::SUCCESS; +} + +template int ctc_prefix_beamsearch( + api::Context* ctx, float16* logp, + std::vector& hyps, // NOLINT + std::vector& hyps_len, // NOLINT + std::vector& ctc_scores, // NOLINT + int batch, int beam_size, int max_len, int ctc_dim); + +static int clip_cpu(int x, int min, int max) { + if (x <= min) return min; + if (x >= max) return max; + return x; +} + +static int add_sos_and_pad_ignored_id( + api::Context* ctx, const int* target, + std::vector& pad_target, // NOLINT + std::vector& pad_target_lod, // NOLINT + int batch_size, int target_seq_len, int max_target_seq_len, int eos_id, + int ignored_id, int add_sos_num, int vocab_size) { + int ret = -1; + int target_data_len = batch_size * target_seq_len; + std::vector target_cpu(target_data_len); + ret = xpu_wait(ctx->xpu_stream); + ret = xpu_memcpy(reinterpret_cast(target_cpu.data()), target, + target_data_len * sizeof(int), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + for (int i = 0; i < batch_size; i++) { + int valid_target_len = add_sos_num; + for (int j = 0; j < target_seq_len; j++) { + if (target_cpu[i * target_seq_len + j] == eos_id) { + pad_target[i * max_target_seq_len + j + add_sos_num] = ignored_id; + } else { + pad_target[i * max_target_seq_len + j + add_sos_num] = + clip_cpu(target_cpu[i * target_seq_len + j], 0, vocab_size); + valid_target_len++; + } + } + pad_target_lod[i + 1] = pad_target_lod[i] + valid_target_len; + } + return api::SUCCESS; +} + +template +int conformer_decoder_wenet(api::Context* ctx, const T* x, + 
const std::vector& x_shape, + const float* x_mask, const int* padded_target, + const std::vector& target_shape, + float* character_scores, + const ConformerDecoderParam& param) { + int layer_num = param.layer_num; + int batch_size = x_shape[0]; + int beam_size = param.beam_size; + int head_num = param.head_num; + int head_dim = param.head_dim; + int vocab_size = param.vocab_size; + int dim = head_num * head_dim; + int add_sos_num = param.add_sos_num; + int new_bs = batch_size * beam_size; + int sos_id = param.sos_id; + int eos_id = param.eos_id; + int ignored_id = param.ignored_id; + WRAPPER_CHECK_CTX(ctx); + WRAPPER_ASSERT_GT(ctx, layer_num, 0); + WRAPPER_ASSERT_GT(ctx, batch_size, 0); + WRAPPER_ASSERT_GT(ctx, head_num, 0); + WRAPPER_ASSERT_GT(ctx, vocab_size, 0); + WRAPPER_ASSERT_GT(ctx, dim, 0); + + api::ctx_guard RAII_GUARD(ctx); + const int max_seq_len = x_shape[1]; + WRAPPER_ASSERT_GT(ctx, max_seq_len, 0); + const int ffn1_out_dim = param.ffn_dim; + // if ffn_act is glu + const int ffn2_input_dim = ffn1_out_dim; + const int d_k = dim / head_num; + WRAPPER_ASSERT_GT(ctx, d_k, 0); + int target_seq_len = target_shape[1]; + WRAPPER_ASSERT_GT(ctx, target_seq_len, 1); + int max_target_seq_len = target_seq_len + add_sos_num; // add sos + WRAPPER_ASSERT_GT(ctx, max_seq_len, max_target_seq_len); + + int seqlen_sum = new_bs * max_seq_len; + T* new_x = const_cast(x); + int ret = -1; + // get src_attn vsl input + std::vector cpu_mask_data(new_bs * max_seq_len, 0); + std::vector src_lod_vec(new_bs + 1, 0); + ret = xpu_wait(ctx->xpu_stream); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = xpu_memcpy(reinterpret_cast(&cpu_mask_data.front()), x_mask, + new_bs * max_seq_len * sizeof(float), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + for (int b = 1; b < src_lod_vec.size(); b++) { + int curr_seqlen = 0; + for (int idx = 0; idx < max_seq_len; idx++) { + if (static_cast(cpu_mask_data[idx]) == 1) { + curr_seqlen++; + } + } + src_lod_vec[b] = 
src_lod_vec[b - 1] + curr_seqlen; + } + api::VectorParam src_qk_lods = { + src_lod_vec.data(), static_cast(src_lod_vec.size()), nullptr}; + src_qk_lods = src_qk_lods.to_xpu(RAII_GUARD); + seqlen_sum = src_qk_lods.cpu[new_bs]; + + T* broadcast_x = RAII_GUARD.alloc(new_bs * max_seq_len * dim); + ret = api::broadcast(ctx, x, broadcast_x, {batch_size, max_seq_len, dim}, + {new_bs, max_seq_len, dim}); + + WRAPPER_ASSERT_SUCCESS(ctx, ret); + // add sos_id and pad ignored_id + std::vector real_target_cpu(max_target_seq_len * new_bs, sos_id); + std::vector real_target_lod(new_bs + 1, 0); + + ret = add_sos_and_pad_ignored_id(ctx, padded_target, real_target_cpu, + real_target_lod, batch_size * beam_size, + target_seq_len, max_target_seq_len, eos_id, + ignored_id, add_sos_num, vocab_size); + + // get self/src QKVParam + int target_seq_sum = real_target_lod[new_bs]; + api::VectorParam self_qk_lods = { + real_target_lod.data(), static_cast(real_target_lod.size()), + nullptr}; + self_qk_lods = self_qk_lods.to_xpu(RAII_GUARD); + api::QKVAttnParam self_qkv_param(self_qk_lods, head_num, d_k, + api::Activation_t::LINEAR); + api::ConformerQKVParam src_qkv_param(self_qk_lods, src_qk_lods, head_num, d_k, + false, -1); + + seqlen_sum = seqlen_sum > target_seq_sum ? 
seqlen_sum : target_seq_sum; + std::vector buf_sizes = { + new_bs * max_target_seq_len * + static_cast(sizeof(int) / sizeof(T)), // padded_target + new_bs * max_target_seq_len * dim, // embedding_out + new_bs * max_target_seq_len * dim, // mid_a + new_bs * max_target_seq_len * dim, // mid_b + new_bs * max_target_seq_len * + dim, // attention_out, src_attention qk_v的结果 + new_bs * max_target_seq_len * dim, // residual + // ffn buffer + new_bs * max_target_seq_len * ffn1_out_dim, // ffn1_out + new_bs * max_target_seq_len * ffn2_input_dim, // ffn_glu_out + new_bs * max_target_seq_len * ffn2_input_dim, // ffn_glu_a + new_bs * max_target_seq_len * ffn2_input_dim, // ffn_glu_b + new_bs * max_target_seq_len * ffn2_input_dim, // ffn_glu_sigmoid + // feature buffer + new_bs * max_target_seq_len * dim * 3, // feature_in buffer + new_bs * max_target_seq_len * dim * 2, // feature_out buffer + new_bs * max_target_seq_len * 2, // final_out + seqlen_sum * dim, // q + seqlen_sum * dim, // k + seqlen_sum * dim, // v + new_bs * max_seq_len * dim, // src_x + // attention buffer + new_bs * max_seq_len * max_seq_len * dim, // src_qk + }; + std::vector buffer_ptrs(buf_sizes.size()); + for (int i = 0; i < buf_sizes.size(); i++) { + buffer_ptrs[i] = RAII_GUARD.alloc(buf_sizes[i]); + } + int b_id = 0; + std::unordered_map buffer_map = { + {"padded_target", buffer_ptrs[b_id++]}, + {"embedding_out", buffer_ptrs[b_id++]}, + {"mid_a", buffer_ptrs[b_id++]}, + {"mid_b", buffer_ptrs[b_id++]}, + {"attention_out", buffer_ptrs[b_id++]}, + {"residual", buffer_ptrs[b_id++]}, + {"ffn1_out", buffer_ptrs[b_id++]}, + {"ffn_glu_out", buffer_ptrs[b_id++]}, + {"ffn_glu_a", buffer_ptrs[b_id++]}, + {"ffn_glu_b", buffer_ptrs[b_id++]}, + {"ffn_glu_sigmoid", buffer_ptrs[b_id++]}, + {"feature_in", buffer_ptrs[b_id++]}, + {"feature_out", buffer_ptrs[b_id++]}, + {"final_out", buffer_ptrs[b_id++]}, + {"q", buffer_ptrs[b_id++]}, + {"k", buffer_ptrs[b_id++]}, + {"v", buffer_ptrs[b_id++]}, + {"src_x", 
buffer_ptrs[b_id++]}, + {"src_qk", buffer_ptrs[b_id++]}, + }; + // maxptr buffer + int max_size = ctx->max_ptr_size(); + float* max_buffer = RAII_GUARD.alloc(6 * max_size); + float* max_x = max_buffer; + float* max_q = max_buffer + max_size; + float* max_k = max_buffer + 2 * max_size; + float* max_v = max_buffer + 3 * max_size; + float* max_qk = max_buffer + 4 * max_size; + float* max_qkv = max_buffer + 5 * max_size; + // copy pad_sos target to xpu + int* new_paded_target = reinterpret_cast(buffer_map["padded_target"]); + ret = api::do_host2device(ctx, real_target_cpu.data(), new_paded_target, + max_target_seq_len * new_bs * sizeof(int)); + T* embedding_out = buffer_map["embedding_out"]; + T* attention_out = buffer_map["attention_out"]; + T* mid_a = buffer_map["mid_a"]; + T* mid_b = buffer_map["mid_b"]; + T* q = buffer_map["q"]; + T* k = buffer_map["k"]; + T* v = buffer_map["v"]; + T* src_qk = buffer_map["src_qk"]; + T* residual = buffer_map["residual"]; + T* ffn1_out = buffer_map["ffn1_out"]; + T* ffn_glu_a = buffer_map["ffn_glu_a"]; + T* ffn_glu_b = buffer_map["ffn_glu_b"]; + T* ffn_glu_sigmoid = buffer_map["ffn_glu_sigmoid"]; + T* ffn_glu_out = buffer_map["ffn_glu_out"]; + // 1.1 embedding input: target{3,14} out:{3,14,512} + ret = + api::embedding(ctx, param.embed_table, new_paded_target, residual, + vocab_size, dim, new_bs * max_target_seq_len, -1); + float logit_scale = 1.0f; + ret = + api::scale(ctx, residual, embedding_out, + new_bs * max_target_seq_len * dim, true, logit_scale, 0.0f); + // 1.2 pos_embed, pos=[1, 5000, dim] + ret = api::broadcast_add(ctx, embedding_out, param.pe, residual, + {new_bs, max_target_seq_len, dim}, + {1, max_target_seq_len, dim}); + // 2. 
decoder + auto fc_weight_itr = param.fc_w_list.begin(); + auto fc_bias_itr = param.fc_bias_list.begin(); + auto fc_w_maxptr_itr = param.fc_maxw_list.begin(); + auto ln_scale_itr = param.ln_scale_list.begin(); + auto ln_bias_itr = param.ln_bias_list.begin(); + const float eps = 1e-5f; + + std::vector mask_cpu(max_target_seq_len * max_target_seq_len, 0.0); + const float kFloatMax = std::numeric_limits::max(); + for (int j = 0; j < max_target_seq_len; j++) { + for (int k = j + 1; k < max_target_seq_len; k++) + mask_cpu[j * max_target_seq_len + k] = -kFloatMax; + } + float* mask_xpu; + mask_xpu = reinterpret_cast( + RAII_GUARD.alloc(max_target_seq_len * max_target_seq_len)); + float* tg_mask; + tg_mask = reinterpret_cast(RAII_GUARD.alloc( + new_bs * head_num * max_target_seq_len * max_target_seq_len)); + ret = xpu_memcpy(mask_xpu, reinterpret_cast(&mask_cpu[0]), + max_target_seq_len * max_target_seq_len * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + ret = api::broadcast( + ctx, mask_xpu, tg_mask, {1, 1, max_target_seq_len, max_target_seq_len}, + {new_bs, head_num, max_target_seq_len, max_target_seq_len}); + for (int j = 0; j < layer_num; j++) { + // 2.1 self attention + ret = api::layer_norm(ctx, residual, mid_b, new_bs * max_target_seq_len, + dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr, + nullptr); + ret = api::fc_fusion_3c( + ctx, mid_b, *fc_weight_itr++, q, k, v, target_seq_sum, dim * 3, dim, + false, true, nullptr, *fc_w_maxptr_itr++, max_q, dim, dim, dim * 3, + 1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); + + api::QKVAttnParam loop_p( + new_bs, max_target_seq_len, head_num, d_k, + {new_bs, head_num, max_target_seq_len, max_target_seq_len}, + api::Activation_t::LINEAR, -1, false, dim); + + ret = api::qk_attention(ctx, q, k, src_qk, nullptr, nullptr, + max_qk, loop_p, tg_mask); + ret = api::qk_v_attention(ctx, src_qk, v, mid_a, max_qk, + nullptr, max_qkv, loop_p); + // x + residual fused with fc + ret = api::fc_fusion( + ctx, mid_a, 
*fc_weight_itr++, residual, new_bs * max_target_seq_len, + dim, dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, + dim, 1.0f, 1.0f, *fc_bias_itr++, api::Activation_t::LINEAR); + // 2.2 src attention + ret = api::layer_norm(ctx, residual, mid_a, new_bs * max_target_seq_len, + dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr, + nullptr); + ret = api::fc_fusion( + ctx, mid_a, *fc_weight_itr++, mid_b, new_bs * max_target_seq_len, dim, + dim, false, true, nullptr, *fc_w_maxptr_itr++, max_q, dim, dim, dim, + 1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); + // get k,v use encoder_out + ret = api::fc_fusion( + ctx, broadcast_x, *fc_weight_itr++, k, new_bs * max_seq_len, dim, dim, + false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, dim, 1.0f, + 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); + ret = api::fc_fusion( + ctx, broadcast_x, *fc_weight_itr++, v, new_bs * max_seq_len, dim, dim, + false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, dim, 1.0f, + 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); + ret = api::qk_attention(ctx, mid_b, k, src_qk, nullptr, + nullptr, max_qk, src_qkv_param); + + ret = api::qk_v_attention(ctx, src_qk, v, mid_a, max_qk, + nullptr, max_qkv, src_qkv_param); + // x = x + residual fused with fc + ret = api::fc_fusion( + ctx, mid_a, *fc_weight_itr++, residual, new_bs * max_target_seq_len, + dim, dim, false, true, max_qkv, *fc_w_maxptr_itr++, nullptr, dim, dim, + dim, 1.0f, 1.0f, *fc_bias_itr++, api::Activation_t::LINEAR); + // normalize before + ret = api::layer_norm(ctx, residual, mid_a, new_bs * max_target_seq_len, + dim, eps, *ln_scale_itr++, *ln_bias_itr++, nullptr, + nullptr); + // ffn1 + ret = api::fc_fusion( + ctx, mid_a, *fc_weight_itr++, ffn1_out, new_bs * max_target_seq_len, + ffn1_out_dim, dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, + dim, dim, ffn1_out_dim, 1.0, 0.0, *fc_bias_itr++, + api::Activation_t::RELU); + // ffn2 + ret = api::fc_fusion( + ctx, ffn1_out, 
*fc_weight_itr++, residual, new_bs * max_target_seq_len, + dim, ffn2_input_dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, + ffn2_input_dim, ffn2_input_dim, dim, 1.0, 1.0, *fc_bias_itr++, + api::Activation_t::LINEAR); + } + + ret = + api::layer_norm(ctx, residual, mid_a, new_bs * max_target_seq_len, dim, + 1e-5, *ln_scale_itr++, *ln_bias_itr++, nullptr, nullptr); + int ctc_dim = param.vocab_size; + ret = api::fc_fusion( + ctx, mid_a, *fc_weight_itr++, mid_b, new_bs * max_target_seq_len, ctc_dim, + dim, false, true, nullptr, *fc_w_maxptr_itr++, nullptr, dim, dim, ctc_dim, + 1.0f, 0.0f, *fc_bias_itr++, api::Activation_t::LINEAR); + // log_softmax + int data_len = new_bs * max_target_seq_len * ctc_dim; + float* softmax_in = RAII_GUARD.alloc(data_len); + float* softmax_out = RAII_GUARD.alloc(data_len); + float* log_out = RAII_GUARD.alloc(data_len); + ret = api::cast_v2(ctx, mid_b, softmax_in, data_len); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::softmax(ctx, softmax_in, softmax_out, + {new_bs, max_target_seq_len, ctc_dim}, 2); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + ret = api::log(ctx, softmax_out, character_scores, data_len); + WRAPPER_ASSERT_SUCCESS(ctx, ret); + + return api::SUCCESS; +} + +template int conformer_decoder_wenet( + api::Context* ctx, const float16* x, const std::vector& x_shape, + const float* x_mask, const int* padded_target, + const std::vector& target_shape, float* character_scores, + const ConformerDecoderParam& param); + +} // namespace wenet +} // namespace xpu diff --git a/runtime/kunlun/xpu/xpu_conformer.h b/runtime/kunlun/xpu/xpu_conformer.h new file mode 100644 index 0000000000..c20af03e11 --- /dev/null +++ b/runtime/kunlun/xpu/xpu_conformer.h @@ -0,0 +1,781 @@ +// Copyright (c) 2022 KUNLUNXIN Inc. +// 2022 Han Qi (qihan@baidu.com) +// Hehe Pan (panhehe@baidu.com) +// Zikui Yan (yanzikui@baidu.com) +// Chaolin Li (lichaolin@baidu.com) +// All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "xpu/runtime.h" +#include "xpu/xdnn.h" +#include "xpu_util.h" // NOLINT +#pragma once + +namespace api = baidu::xpu::api; +template +class ConformerEncoderParam { + public: + int layer_num; + int fc_num_per_layer; + int conv_num_per_layer; + int ln_num_per_layer; + int head_num; + int head_dim; + int ctc_dim; + int ffn_factor; + int beam_size; + struct Embedding { + int conv_num; + int fc_num; + int embed_dim; + } emb_param; + struct ConvBlock { + bool is_casual; + int kernel_size; + int lorder; + T padding; + } conv_param; + + std::vector pos_emb; + std::vector emb_conv_w_list; + std::vector emb_conv_maxw_list; + std::vector emb_conv_bias_list; + std::vector emb_fc_w_list; + std::vector emb_fc_maxw_list; + std::vector emb_fc_bias_list; + + std::vector conv_w_list; + std::vector conv_maxw_list; + std::vector conv_bias_list; + + std::vector ln_scale_list; + std::vector ln_bias_list; + + std::vector fc_w_list; + std::vector fc_maxw_list; + std::vector fc_bias_list; + + std::vector attn_pos_w_list_; + std::vector attn_pos_w_list; + std::vector attn_pos_maxw_list; + std::vector attn_pos_uv_bias_list; + + const float* cmvn_istd{nullptr}; + const float* cmvn_mean{nullptr}; + const float* pe{nullptr}; + float* mask{nullptr}; +}; + +template +class ConformerDecoderParam { + public: + int layer_num; + int fc_num_per_layer; + int 
ln_num_per_layer; + + int head_num; + int head_dim; + int vocab_size; + int sos_id; + int eos_id; + int ignored_id; + int beam_size; + int max_token_num; + int add_sos_num; + int ffn_dim; + + const T* embed_table{nullptr}; + const T* pe{nullptr}; + std::vector fc_w_list; + std::vector fc_maxw_list; + std::vector fc_bias_list; + std::vector ln_scale_list; + std::vector ln_bias_list; +}; + +template +static int64_t vec_prod(const std::vector& data) { + int len = data.size(); + if (len < 1) { + return 0; + } + int64_t prod = data[0]; + for (int i = 1; i < len; ++i) { + prod *= data[i]; + } + return prod; +} + +template +static std::vector get_w_list_from( + const std::vector>& quant_data_list) { + int len = quant_data_list.size(); + std::vector w_list(len, nullptr); + for (int i = 0; i < len; ++i) { + w_list[i] = quant_data_list[i].data_; + } + return w_list; +} + +template +static std::vector get_w_maxptr_list_from( + const std::vector>& quant_data_list) { + int len = quant_data_list.size(); + std::vector w_maxptr_list(len, nullptr); + for (int i = 0; i < len; ++i) { + w_maxptr_list[i] = quant_data_list[i].max_ptr_; + } + return w_maxptr_list; +} + +template +void get_fc_param(const std::unordered_map& weights_len_info, + const std::string& params_dir, + const std::string& fc_name_prefix, + XPUQunatData& fc_w, // NOLINT + const float*& fc_bias, bool has_bias = true) { // NOLINT + const std::string fc_file_prefix = params_dir + fc_name_prefix; + int wlen = weights_len_info.at(fc_name_prefix + "weight"); + fc_w = get_xpu_quant_data(fc_file_prefix + "weight", wlen); + if (has_bias) { + int blen = weights_len_info.at(fc_name_prefix + "bias"); + fc_bias = get_xpu_data(fc_file_prefix + "bias", blen); + } else { + fc_bias = nullptr; + } +} + +template +void get_conv_param( + const std::unordered_map& weights_len_info, + const std::string& params_dir, const std::string& conv_name_prefix, + XPUQunatData& conv_w, const float*& conv_b, // NOLINT + bool has_bias = true) { // 
NOLINT + std::string conv_file_prefix = params_dir + conv_name_prefix; + int wlen = weights_len_info.at(conv_name_prefix + "weight"); + conv_w = get_xpu_quant_data(conv_file_prefix + "weight", wlen); + if (has_bias) { + int blen = weights_len_info.at(conv_name_prefix + "bias"); + conv_b = get_xpu_data(conv_file_prefix + "bias", blen); + } else { + conv_b = nullptr; + } +} + +template +void get_fc_fused_param( + const std::unordered_map& weights_len_info, + const std::string& params_dir, + const std::vector fc_name_prefixs, + XPUQunatData& _fc_w, // NOLINT + const float*& _fc_b, bool has_bias = true) { // NOLINT + // get cpu fc params + std::vector fc_ws; + std::vector fc_bs; + for (int ids = 0; ids < fc_name_prefixs.size(); ids++) { + std::string fc_file_prefix = params_dir + fc_name_prefixs[ids]; + int wlen = weights_len_info.at(fc_name_prefixs[ids] + "weight"); + std::vector fc_w = + get_cpu_data(fc_file_prefix + "weight", wlen); + std::vector fc_b; + if (has_bias) { + int blen = weights_len_info.at(fc_name_prefixs[ids] + "bias"); + fc_b = get_cpu_data(fc_file_prefix + "bias", blen); + } + fc_ws.insert(fc_ws.end(), fc_w.begin(), fc_w.end()); + fc_bs.insert(fc_bs.end(), fc_b.begin(), fc_b.end()); + } + _fc_w = get_xpu_quant_data("fused_fc_weight", fc_ws); + _fc_b = get_xpu_data("fused_fc_bias", fc_bs); +} + +template +void get_fc_ln_fused_param( + const std::unordered_map& weights_len_info, + const std::string& params_dir, + const std::vector fc_name_prefixs, + std::vector ln_name_prefixs, + XPUQunatData& _fc_w, // NOLINT + const float*& _fc_b, bool has_bias = true) { // NOLINT + // get cpu fc params + std::vector fc_ws; + std::vector fc_bs; + for (int ids = 0; ids < fc_name_prefixs.size(); ids++) { + std::string fc_file_prefix = params_dir + fc_name_prefixs[ids]; + int wlen = weights_len_info.at(fc_name_prefixs[ids] + "weight"); + std::vector fc_w = + get_cpu_data(fc_file_prefix + "weight", wlen); + std::vector fc_b; + if (has_bias) { + int blen = 
weights_len_info.at(fc_name_prefixs[ids] + "bias"); + fc_b = get_cpu_data(fc_file_prefix + "bias", blen); + } + // get cpu ln params + std::string ln_file_prefix = params_dir + ln_name_prefixs[ids]; + wlen = weights_len_info.at(ln_name_prefixs[ids] + "weight"); + int blen = weights_len_info.at(ln_name_prefixs[ids] + "bias"); + std::vector ln_scale = + get_cpu_data(ln_file_prefix + "weight", wlen); + std::vector ln_bias = + get_cpu_data(ln_file_prefix + "bias", blen); + int col = ln_scale.size(); + int row = static_cast(fc_w.size()) / col; + if (!has_bias) { + fc_b.resize(row); + } + // get new fc_bias + for (int i = 0; i < row; i++) { + float b = has_bias ? fc_b[i] : 0.f; + for (int j = 0; j < col; j++) { + b += fc_w[i * col + j] * ln_bias[j]; + } + fc_b[i] = b; + } + // get new fc_weight + for (int i = 0; i < row; i++) { + for (int j = 0; j < col; j++) { + fc_w[i * col + j] = fc_w[i * col + j] * ln_scale[j]; + } + } + fc_ws.insert(fc_ws.end(), fc_w.begin(), fc_w.end()); + fc_bs.insert(fc_bs.end(), fc_b.begin(), fc_b.end()); + } + _fc_w = get_xpu_quant_data("fused_fc_weight", fc_ws); + _fc_b = get_xpu_data("fused_fc_bias", fc_bs); +} + +template +void get_conv_bn_fused_param( + const std::unordered_map& weights_len_info, + const std::string& params_dir, const std::string& conv_name_prefix, + const std::string& bn_name_prefix, XPUQunatData& _conv_w, // NOLINT + const float*& _conv_b, bool has_bias = true) { // NOLINT + // get cpu conv params + std::string conv_file_prefix = params_dir + conv_name_prefix; + int wlen = weights_len_info.at(conv_name_prefix + "weight"); + std::vector conv_w = + get_cpu_data(conv_file_prefix + "weight", wlen); + std::vector conv_b; + if (has_bias) { + int blen = weights_len_info.at(conv_name_prefix + "bias"); + conv_b = get_cpu_data(conv_file_prefix + "bias", blen); + } + // get cpu bn params + std::string bn_file_prefix = params_dir + bn_name_prefix; + wlen = weights_len_info.at(bn_name_prefix + "weight"); + int blen = 
weights_len_info.at(bn_name_prefix + "bias"); + int mlen = weights_len_info.at(bn_name_prefix + "running_mean"); + int vlen = weights_len_info.at(bn_name_prefix + "running_var"); + std::vector bn_scale = + get_cpu_data(bn_file_prefix + "weight", wlen); + std::vector bn_bias = + get_cpu_data(bn_file_prefix + "bias", blen); + std::vector bn_mean = + get_cpu_data(bn_file_prefix + "running_mean", mlen); + std::vector bn_var = + get_cpu_data(bn_file_prefix + "running_var", vlen); + // fuse conv, bn, new weight is conv_w, new bias is bn_bias + int h = bn_scale.size(); + int w = static_cast(conv_w.size()) / h; + float eps = 1e-5f; // assume eps is 1e-5; + for (int i = 0; i < h; ++i) { + bn_scale[i] = bn_scale[i] / std::sqrt(bn_var[i] + eps); + } + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + conv_w[i * w + j] *= bn_scale[i]; + } + } + for (int i = 0; i < h; ++i) { + float b = has_bias ? conv_b[i] : 0.f; + bn_bias[i] += ((b - bn_mean[i]) * bn_scale[i]); + } + _conv_w = get_xpu_quant_data("fused_conv_weight", conv_w); + _conv_b = get_xpu_data("fused_conv_bias", bn_bias); +} + +template +static std::tuple, std::vector> read_cpu_data_from_file( + const std::string& data_file_prefix, int shape_ndim) { + std::vector res_data; + std::string data_file = data_file_prefix + ".dat"; + std::string shape_file = data_file_prefix + "_shape.txt"; + std::ifstream inF(shape_file); + if (!inF) { + std::cout << "ERR: open file failed! 
" << shape_file << std::endl; + std::exit(1); + } + char useless; // (16, 523, 80) or (160, 1) + std::vector inshape(shape_ndim, 0); + if (shape_ndim == 3) { + inF >> useless >> inshape[0] >> useless >> inshape[1] >> useless >> + inshape[2] >> useless; + } else if (shape_ndim == 2) { + inF >> useless >> inshape[0] >> useless >> inshape[1] >> useless; + } else if (shape_ndim == 1) { + inF >> useless >> inshape[0] >> useless >> useless; + } else { + std::cout << "ERR: only support shape ndim == 1, 2 or 3, but got " + << shape_ndim << std::endl; + std::exit(1); + } + + int data_len = vec_prod(inshape); + res_data = get_cpu_data(data_file, data_len); + return std::make_tuple(res_data, inshape); +} + +template +static std::tuple> read_xpu_data_from_file( + const std::string& data_file_prefix, int shape_ndim) { + auto cpu_data_info = read_cpu_data_from_file(data_file_prefix, shape_ndim); + T* xpu_data = get_xpu_data(data_file_prefix, std::get<0>(cpu_data_info)); + return std::make_tuple(xpu_data, std::get<1>(cpu_data_info)); +} + +template +static std::tuple> create_mask_according_speech_length( + const std::vector& speech_length, int max_seqlen, + void* xpu_stream = nullptr) { + int batch = speech_length.size(); + int mask_len = batch * max_seqlen; + int subsample_mask_len = batch * (((max_seqlen - 1) / 2 - 1) / 2); + std::vector mask_cpu(mask_len, 0); + std::vector subsample_mask_cpu(subsample_mask_len, 0); + // create mask, equal to 'masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)' + for (int b = 0; b < batch; ++b) { + int curr_seqlen = speech_length[b]; + for (int idx = 0; idx < curr_seqlen; ++idx) { + mask_cpu.at(b * max_seqlen + idx) = 1; + } + } + // create subsample_mask, equal to 'x_mask[:, :, :-2:2][:, :, :-2:2]' + int sub_seqlen = subsample_mask_len / batch; + for (int b = 0; b < batch; ++b) { + for (int idx = 0; idx < sub_seqlen; ++idx) { + subsample_mask_cpu.at(b * sub_seqlen + idx) = + mask_cpu.at(b * max_seqlen + idx * 4); + } + } + // copy to xpu + T* 
subsample_mask_xpu = nullptr; + int r = xpu_malloc(reinterpret_cast(&subsample_mask_xpu), + subsample_mask_len * sizeof(T)); + if (r != 0) { + std::cout << "ERR: xpu_malloc failed!" << std::endl; + std::exit(1); + } + r = xpu_wait(xpu_stream); + if (r != 0) { + std::cout << "ERR: xpu_wait failed!" << std::endl; + std::exit(1); + } + r = xpu_memcpy(subsample_mask_xpu, subsample_mask_cpu.data(), + subsample_mask_len * sizeof(T), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + if (r != 0) { + std::cout << "ERR: xpu_memcpy failed!" << std::endl; + std::exit(1); + } + + std::vector subsample_mask_shape{batch, 1, sub_seqlen}; + return std::make_tuple(subsample_mask_xpu, subsample_mask_shape); +} + +template +int init_encoder_params( + const std::string& params_dir, + ConformerEncoderParam& encoder_param) { // NOLINT + std::unordered_map weights_len_info = + get_weights_lens(params_dir + "weights_info.txt"); + std::unordered_map> weights_shape_info = + get_weights_shape(params_dir + "weights_info.txt"); + + // model struct param + auto& head_num = encoder_param.head_num; + auto& head_dim = encoder_param.head_dim; + auto& ffn_factor = encoder_param.ffn_factor; + auto& conv_param = encoder_param.conv_param; + auto& emb_param = encoder_param.emb_param; + auto& ctc_dim = encoder_param.ctc_dim; + auto& encoder_layer_num = encoder_param.layer_num; + auto& fc_num_per_layer = encoder_param.fc_num_per_layer; + auto& conv_num_per_layer = encoder_param.conv_num_per_layer; + auto& ln_num_per_layer = encoder_param.ln_num_per_layer; + encoder_layer_num = 12; + fc_num_per_layer = 6; + conv_num_per_layer = 3; + ln_num_per_layer = 6; + emb_param.conv_num = 2; + emb_param.fc_num = 1; + emb_param.embed_dim = 512; + ffn_factor = + weights_shape_info.at("encoder.encoders.0.feed_forward.w_1.weight")[0] / + weights_shape_info.at("encoder.encoders.0.feed_forward.w_1.weight")[1]; + head_dim = + weights_shape_info.at("encoder.encoders.0.self_attn.pos_bias_u")[1]; + head_num = + 
weights_shape_info.at("encoder.encoders.0.self_attn.pos_bias_u")[0]; + conv_param.kernel_size = weights_shape_info.at( + "encoder.encoders.0.conv_module.depthwise_conv.weight")[2]; + conv_param.lorder = conv_param.kernel_size - 1; + conv_param.padding = 0.0; + conv_param.is_casual = true; + ctc_dim = weights_len_info.at("ctc.ctc_lo.bias"); + encoder_param.beam_size = 3; + + // init encoder cmvn + auto& pe = encoder_param.pe; + auto& cmvn_istd = encoder_param.cmvn_istd; + auto& cmvn_mean = encoder_param.cmvn_mean; + int pe_len = weights_len_info.at("encoder.pe"); + int mlen = weights_len_info.at("encoder.global_cmvn.mean"); + int ilen = weights_len_info.at("encoder.global_cmvn.istd"); + pe = get_xpu_data(params_dir + "encoder.pe", pe_len); + cmvn_mean = + get_xpu_data(params_dir + "encoder.global_cmvn.mean", mlen); + cmvn_istd = + get_xpu_data(params_dir + "encoder.global_cmvn.istd", ilen); + + // init encoder embedding param + std::vector> emb_conv_w_list; + auto& emb_conv_bias_list = encoder_param.emb_conv_bias_list; + std::vector> emb_fc_w_list; + auto& emb_fc_bias_list = encoder_param.emb_fc_bias_list; + emb_conv_w_list.resize(emb_param.conv_num); + emb_conv_bias_list.resize(emb_param.conv_num); + emb_fc_w_list.resize(emb_param.fc_num); + emb_fc_bias_list.resize(emb_param.fc_num); + for (int i = 0; i < emb_param.conv_num; ++i) { + std::string conv_name_prefix = + "encoder.embed.conv." 
+ std::to_string(i * 2) + "."; + get_conv_param(weights_len_info, params_dir, conv_name_prefix, + emb_conv_w_list[i], emb_conv_bias_list[i]); + } + get_fc_param(weights_len_info, params_dir, "encoder.embed.out.0.", + emb_fc_w_list[0], emb_fc_bias_list[0]); + + // encoder_param_layer + int enc_fc_num = encoder_layer_num * fc_num_per_layer + 1; + int enc_conv_num = encoder_layer_num * conv_num_per_layer; + int enc_ln_num = encoder_layer_num * ln_num_per_layer + 1; + + std::vector> fc_w_list; + auto& fc_bias_list = encoder_param.fc_bias_list; + + std::vector> conv_w_list; + auto& conv_bias_list = encoder_param.conv_bias_list; + + auto& ln_scale_list = encoder_param.ln_scale_list; + auto& ln_bias_list = encoder_param.ln_bias_list; + + std::vector> attn_pos_w_list; + std::vector attn_pos_uv_bias_list; + // w_param need to be quanted & get maxw + fc_w_list.resize(enc_fc_num); + fc_bias_list.resize(enc_fc_num); + conv_w_list.resize(enc_conv_num); + conv_bias_list.resize(enc_conv_num); + ln_scale_list.resize(enc_ln_num); + ln_bias_list.resize(enc_ln_num); + attn_pos_w_list.resize(encoder_layer_num); + attn_pos_uv_bias_list.resize(encoder_layer_num * + 2); // pos_bias_u, pos_bias_v + for (int i = 0; i < encoder_layer_num; ++i) { + std::string enc_prefix = "encoder.encoders." 
+ std::to_string(i) + "."; + int fc_offset = i * fc_num_per_layer; + int conv_offset = i * conv_num_per_layer; + int ln_offset = i * ln_num_per_layer; + // init FeedForwardParam macaron + get_fc_param(weights_len_info, params_dir, + enc_prefix + "feed_forward_macaron.w_1.", + fc_w_list[fc_offset], fc_bias_list[fc_offset]); + get_fc_param(weights_len_info, params_dir, + enc_prefix + "feed_forward_macaron.w_2.", + fc_w_list[fc_offset + 1], fc_bias_list[fc_offset + 1]); + get_fc_fused_param( + weights_len_info, params_dir, + {enc_prefix + "self_attn.linear_q.", enc_prefix + "self_attn.linear_k.", + enc_prefix + "self_attn.linear_v."}, + fc_w_list[fc_offset + 2], fc_bias_list[fc_offset + 2]); + get_fc_param( + weights_len_info, params_dir, enc_prefix + "self_attn.linear_out.", + fc_w_list[fc_offset + 3], fc_bias_list[fc_offset + 3], true); + // get pos w, pos u bias, pos v bias + std::string pos_w_name = enc_prefix + "self_attn.linear_pos.weight"; + std::string pos_ubias_name = enc_prefix + "self_attn.pos_bias_u"; + std::string pos_vbias_name = enc_prefix + "self_attn.pos_bias_v"; + int pos_wlen = weights_len_info.at(pos_w_name); + int pos_ublen = weights_len_info.at(pos_ubias_name); + int pos_vblen = weights_len_info.at(pos_vbias_name); + attn_pos_w_list[i] = + get_xpu_quant_data(params_dir + pos_w_name, pos_wlen); + attn_pos_uv_bias_list[i * 2] = + get_xpu_data(params_dir + pos_ubias_name, pos_ublen); + attn_pos_uv_bias_list[i * 2 + 1] = + get_xpu_data(params_dir + pos_vbias_name, pos_vblen); + // init ConvModuleParam + get_conv_param(weights_len_info, params_dir, + enc_prefix + "conv_module.pointwise_conv1.", + conv_w_list[conv_offset], conv_bias_list[conv_offset], + true); + get_conv_param(weights_len_info, params_dir, + enc_prefix + "conv_module.depthwise_conv.", + conv_w_list[conv_offset + 1], + conv_bias_list[conv_offset + 1], true); + get_conv_param(weights_len_info, params_dir, + enc_prefix + "conv_module.pointwise_conv2.", + conv_w_list[conv_offset + 2], + 
conv_bias_list[conv_offset + 2], true); + // init FeedForwardParam + get_fc_param(weights_len_info, params_dir, + enc_prefix + "feed_forward.w_1.", fc_w_list[fc_offset + 4], + fc_bias_list[fc_offset + 4]); + get_fc_param(weights_len_info, params_dir, + enc_prefix + "feed_forward.w_2.", fc_w_list[fc_offset + 5], + fc_bias_list[fc_offset + 5]); + // init LayerNormParam + get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_ff_macaron.", + ln_scale_list[ln_offset], ln_bias_list[ln_offset]); + get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_mha.", + ln_scale_list[ln_offset + 1], ln_bias_list[ln_offset + 1]); + get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_conv.", + ln_scale_list[ln_offset + 2], ln_bias_list[ln_offset + 2]); + get_ln_param(weights_len_info, params_dir, enc_prefix + "conv_module.norm.", + ln_scale_list[ln_offset + 3], ln_bias_list[ln_offset + 3]); + get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_ff.", + ln_scale_list[ln_offset + 4], ln_bias_list[ln_offset + 4]); + get_ln_param(weights_len_info, params_dir, enc_prefix + "norm_final.", + ln_scale_list[ln_offset + 5], ln_bias_list[ln_offset + 5]); + } + get_ln_param(weights_len_info, params_dir, "encoder.after_norm.", + ln_scale_list[enc_ln_num - 1], ln_bias_list[enc_ln_num - 1]); + get_fc_param(weights_len_info, params_dir, "ctc.ctc_lo.", + fc_w_list[enc_fc_num - 1], fc_bias_list[enc_fc_num - 1]); + /* get maxw && w */ + encoder_param.emb_conv_w_list = get_w_list_from(emb_conv_w_list); + encoder_param.emb_conv_maxw_list = + get_w_maxptr_list_from(emb_conv_w_list); + encoder_param.emb_fc_w_list = get_w_list_from(emb_fc_w_list); + encoder_param.emb_fc_maxw_list = get_w_maxptr_list_from(emb_fc_w_list); + + encoder_param.conv_w_list = get_w_list_from(conv_w_list); + encoder_param.conv_maxw_list = get_w_maxptr_list_from(conv_w_list); + + encoder_param.fc_w_list = get_w_list_from(fc_w_list); + encoder_param.fc_maxw_list = get_w_maxptr_list_from(fc_w_list); + 
+ encoder_param.attn_pos_w_list_ = get_w_list_from(attn_pos_w_list); + encoder_param.attn_pos_maxw_list = + get_w_maxptr_list_from(attn_pos_w_list); + /* prepare params */ + api::Context ctx_xpu(api::kXPU2); + api::ctx_guard RAII_GUARD(&ctx_xpu); + int ret = 0; + int hidden_dim = head_num * head_dim; + encoder_param.pos_emb.resize(encoder_layer_num); + for (int i = 0; i < encoder_layer_num; i++) { + ret = xpu_malloc((void**)&(encoder_param.pos_emb[i]), // NOLINT + 5000 * hidden_dim * sizeof(T)); + ret = api::fc_fusion( + &ctx_xpu, encoder_param.pe, encoder_param.attn_pos_w_list_[i], + const_cast(encoder_param.pos_emb[i]), 5000, hidden_dim, hidden_dim, + false, true, nullptr, encoder_param.attn_pos_maxw_list[i], nullptr, + hidden_dim, hidden_dim, hidden_dim, 1.0f, 0.0f, nullptr, + api::Activation_t::LINEAR); + } + for (int i = 0; i < encoder_layer_num; i++) { + ret = api::scale( + &ctx_xpu, encoder_param.fc_bias_list[i * fc_num_per_layer + 1], + const_cast( + encoder_param.fc_bias_list[i * fc_num_per_layer + 1]), + hidden_dim, true, 0.5f, 0.0f); + ret = api::scale( + &ctx_xpu, encoder_param.fc_bias_list[i * fc_num_per_layer + 5], + const_cast( + encoder_param.fc_bias_list[i * fc_num_per_layer + 5]), + hidden_dim, true, 0.5f, 0.0f); + } + for (int i = 0; i < attn_pos_uv_bias_list.size(); i++) { + T* tmppos = nullptr; + ret = xpu_malloc(reinterpret_cast(&tmppos), hidden_dim * sizeof(T)); + ret = api::cast_v2(&ctx_xpu, attn_pos_uv_bias_list[i], tmppos, + hidden_dim); + encoder_param.attn_pos_uv_bias_list.push_back(tmppos); + } + return 0; +} + +template +int init_decoder_params( + const std::string& params_dir, + ConformerDecoderParam& decoder_param) { // NOLINT + std::unordered_map weights_len_info = + get_weights_lens(params_dir + "weights_info.txt"); + + // init DecoderLayerParam + auto& decoder_layer_num = decoder_param.layer_num; + auto& fc_num_per_layer = decoder_param.fc_num_per_layer; + auto& ln_num_per_layer = decoder_param.ln_num_per_layer; + std::vector> 
fc_w_list; + auto& fc_bias_list = decoder_param.fc_bias_list; + auto& ln_scale_list = decoder_param.ln_scale_list; + auto& ln_bias_list = decoder_param.ln_bias_list; + decoder_layer_num = 3; + fc_num_per_layer = 8; + ln_num_per_layer = 3; + int dec_fc_num = decoder_layer_num * fc_num_per_layer + 1; + int dec_ln_num = decoder_layer_num * ln_num_per_layer + 1; + fc_w_list.resize(dec_fc_num); + fc_bias_list.resize(dec_fc_num); + ln_scale_list.resize(dec_ln_num); + ln_bias_list.resize(dec_ln_num); + decoder_param.head_num = 8; + decoder_param.head_dim = 64; + decoder_param.vocab_size = 5538; + decoder_param.sos_id = 5537; + decoder_param.eos_id = 5537; + decoder_param.ignored_id = 2; + decoder_param.beam_size = 3; + decoder_param.max_token_num = 200; + decoder_param.add_sos_num = 1; + decoder_param.ffn_dim = 2048; + auto att_dim = decoder_param.head_num * decoder_param.head_dim; + + // init EmbeddingParam + std::string embed_table_name = "decoder.left_decoder.embed.0.weight"; + std::vector embed_table_cpu = get_cpu_data( + params_dir + embed_table_name, weights_len_info.at(embed_table_name)); + std::vector embed_table_cpu_t(embed_table_cpu.size(), 0); + for (int i = 0; i < static_cast(embed_table_cpu.size()); ++i) { + embed_table_cpu_t[i] = + static_cast(embed_table_cpu[i] * std::sqrt(att_dim)); + } + decoder_param.embed_table = + get_xpu_data(embed_table_name, embed_table_cpu_t); + + // init pe + std::string pe_name = "encoder.pe"; + std::vector pe_cpu = + get_cpu_data(params_dir + pe_name, weights_len_info.at(pe_name)); + std::vector pe_cpu_t(pe_cpu.size(), 0); + for (int i = 0; i < static_cast(pe_cpu.size()); ++i) { + pe_cpu_t[i] = static_cast(pe_cpu[i]); + } + decoder_param.pe = get_xpu_data(pe_name, pe_cpu_t); + for (int i = 0; i < decoder_layer_num; ++i) { + std::string dec_prefix = + "decoder.left_decoder.decoders." 
+ std::to_string(i) + "."; + int offset = i * fc_num_per_layer; + // init fc param + // self attention qkv fc + get_fc_fused_param(weights_len_info, params_dir, + { + dec_prefix + "self_attn.linear_q.", + dec_prefix + "self_attn.linear_k.", + dec_prefix + "self_attn.linear_v.", + }, + fc_w_list[offset], fc_bias_list[offset], true); + get_fc_param(weights_len_info, params_dir, + dec_prefix + "self_attn.linear_out.", + fc_w_list[offset + 1], fc_bias_list[offset + 1], true); + get_fc_param(weights_len_info, params_dir, + dec_prefix + "src_attn.linear_q.", fc_w_list[offset + 2], + fc_bias_list[offset + 2], true); + get_fc_param(weights_len_info, params_dir, + dec_prefix + "src_attn.linear_k.", fc_w_list[offset + 3], + fc_bias_list[offset + 3], true); + get_fc_param(weights_len_info, params_dir, + dec_prefix + "src_attn.linear_v.", fc_w_list[offset + 4], + fc_bias_list[offset + 4], true); + get_fc_param(weights_len_info, params_dir, + dec_prefix + "src_attn.linear_out.", fc_w_list[offset + 5], + fc_bias_list[offset + 5], true); + get_fc_param(weights_len_info, params_dir, + dec_prefix + "feed_forward.w_1.", fc_w_list[offset + 6], + fc_bias_list[offset + 6]); + get_fc_param(weights_len_info, params_dir, + dec_prefix + "feed_forward.w_2.", fc_w_list[offset + 7], + fc_bias_list[offset + 7]); + // init ln param + offset = i * ln_num_per_layer; + get_ln_param(weights_len_info, params_dir, dec_prefix + "norm1.", + ln_scale_list[offset], ln_bias_list[offset]); + get_ln_param(weights_len_info, params_dir, dec_prefix + "norm2.", + ln_scale_list[offset + 1], ln_bias_list[offset + 1]); + get_ln_param(weights_len_info, params_dir, dec_prefix + "norm3.", + ln_scale_list[offset + 2], ln_bias_list[offset + 2]); + } + // init after ln + get_ln_param(weights_len_info, params_dir, "decoder.left_decoder.after_norm.", + ln_scale_list[dec_ln_num - 1], ln_bias_list[dec_ln_num - 1]); + // init output layer fc + get_fc_param( + weights_len_info, params_dir, 
"decoder.left_decoder.output_layer.", + fc_w_list[dec_fc_num - 1], fc_bias_list[dec_fc_num - 1], true); + decoder_param.fc_w_list = get_w_list_from(fc_w_list); + decoder_param.fc_maxw_list = get_w_maxptr_list_from(fc_w_list); + return 0; +} + +static int padding_target(std::vector& hyps, // NOLINT + std::vector& hyps_len, // NOLINT + int beam_size, int eos_id) { + int max_target_len = *max_element(hyps_len.begin(), hyps_len.end()); + std::vector pad(max_target_len * beam_size); + int offset = 0; + for (int i = 0; i < beam_size; i++) { + for (int j = 0; j < max_target_len; j++) { + pad[i * max_target_len + j] = j < hyps_len[i] ? hyps[j + offset] : eos_id; + } + offset += hyps_len[i]; + } + hyps.swap(pad); + return max_target_len; +} + +namespace xpu { +namespace wenet { + +template +int conformer_encoder_wenet( + api::Context* ctx, float* x, const std::vector& data_shape, + T* encoder_out, T* ctc_probs, + ConformerEncoderParam& param, // NOLINT + const std::tuple>& xpu_mask_info); +template +int ctc_prefix_beamsearch(api::Context* ctx, T* ctc_probs, + std::vector& hyps, // NOLINT + std::vector& hyps_len, // NOLINT + std::vector& ctc_scores, // NOLINT + int batch_size, int beam_size, int max_len, + int ctc_dim); + +template +int conformer_decoder_wenet(api::Context* ctx, const T* x, + const std::vector& x_shape, + const float* x_mask, const int* padded_target, + const std::vector& target_shape, + float* character_scores, + const ConformerDecoderParam& param); +} // namespace wenet +} // namespace xpu diff --git a/runtime/kunlun/xpu/xpu_util.cpp b/runtime/kunlun/xpu/xpu_util.cpp new file mode 100644 index 0000000000..b18cd12b7e --- /dev/null +++ b/runtime/kunlun/xpu/xpu_util.cpp @@ -0,0 +1,491 @@ +// Copyright (c) 2022 KUNLUNXIN Inc. +// 2022 Han Qi (qihan@baidu.com) +// Hehe Pan (panhehe@baidu.com) +// Zikui Yan (yanzikui@baidu.com) +// Chaolin Li (lichaolin@baidu.com) +// All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "xpu_util.h" // NOLINT + +template +static double vec_sum(const std::vector& data) { + double res = 0; + for (int i = 0; i < static_cast(data.size()); ++i) { + res += static_cast(data[i]); + } + return res; +} + +int vector_prod(std::vector shape) { + int accumlate = 1; + for (auto a : shape) { + accumlate *= a; + } + return accumlate; +} +void add_separator_when_necessary(std::string& str) { // NOLINT + int len = str.size(); + char ch = '/'; + if (str[len - 1] != ch) { + str.append(1, ch); + } +} + +template +static std::string print_vec(const std::vector& data) { + std::stringstream ss; + const int dump_len = data.size() > 8 ? 8 : data.size(); + std::vector dump_data(dump_len, 0); + int half_dump_len = dump_len / 2; + std::copy(data.cbegin(), data.cbegin() + half_dump_len, dump_data.begin()); + std::copy(data.cend() - (dump_len - half_dump_len), data.cend(), + dump_data.begin() + half_dump_len); + for (int i = 0; i < dump_len - 1; ++i) { + ss << dump_data[i] << ", "; + if ((i + 1) == dump_len / 2) { + ss << " ... 
"; + } + } + ss << dump_data[dump_len - 1]; + return ss.str(); +} + +template +static T parse_string(const std::string& str) { + return str; +} + +template <> +float parse_string(const std::string& str) { + return std::stof(str); +} +template <> +double parse_string(const std::string& str) { + return std::stod(str); +} +template <> +int parse_string(const std::string& str) { + return std::stoi(str); +} +template <> +int64_t parse_string(const std::string& str) { + return std::stoll(str); +} + +template +std::vector Split(const std::string& str, const std::string& separator) { + std::vector res; + std::string::size_type pos1, pos2; + pos1 = str.find_first_not_of(separator); + pos2 = str.find(separator, pos1); + while (std::string::npos != pos1 && std::string::npos != pos2) { + res.emplace_back(parse_string(str.substr(pos1, pos2 - pos1))); + pos1 = str.find_first_not_of(separator, pos2); + pos2 = str.find(separator, pos1); + } + if (std::string::npos != pos1 && pos1 < str.length()) { + res.emplace_back(parse_string(str.substr(pos1))); + } + return res; +} + +std::unordered_map get_weights_lens( + const std::string& file_path) { + std::unordered_map res; + std::ifstream inF(file_path, std::ifstream::in); + if (inF) { + // std::cout << "read success from " << file_path << std::endl; + std::string buffer; + while (std::getline(inF, buffer)) { + std::vector weight_info = Split(buffer, ":"); + std::string w_name = weight_info[0]; + int w_len = std::stoi(weight_info[3]); + res.insert(std::make_pair(w_name, w_len)); + } + } else { + std::cout << "ERR: read failed, " << file_path << std::endl; + std::exit(1); + } + + return res; +} + +std::unordered_map> get_weights_shape( + const std::string& file_path) { + std::unordered_map> res; + std::ifstream inF(file_path, std::ifstream::in); + if (inF) { + // std::cout << "read success from " << file_path << std::endl; + std::string buffer; + while (std::getline(inF, buffer)) { + std::vector weight_info = Split(buffer, ":"); + 
std::string w_name = weight_info[0]; + std::string w_shape_str = weight_info[2]; // example: (512, 1, 3, 3) + std::string w_shape_str_without_bracket( + w_shape_str.begin() + 1, + w_shape_str.end() - 1); // example: 512, 1, 3, 3 + std::vector w_shape = Split(w_shape_str_without_bracket, ","); + res.insert(std::make_pair(w_name, w_shape)); + } + } else { + std::cout << "ERR: read failed, " << file_path << std::endl; + std::exit(1); + } + + return res; +} + +template +std::vector get_cpu_data(const std::string& file_path, int len) { + std::vector result(len, 0); + std::ifstream inF(file_path, std::ifstream::binary); + if (!inF) { + std::cout << "ERR: std::ifstream init failed! " << file_path << std::endl; + std::exit(1); + } + if (inF.read(reinterpret_cast(result.data()), len * sizeof(T))) { + // std::cout << "read success from " << file_path << std::endl; + } else { + std::cout << "ERR: something wrong: " << file_path << ", len=" << len + << std::endl; + std::exit(1); + } + return result; +} + +template std::vector get_cpu_data(const std::string&, int len); +template std::vector get_cpu_data(const std::string&, + int len); +template std::vector get_cpu_data(const std::string&, + int len); +template std::vector get_cpu_data(const std::string&, int len); + +template +T* get_xpu_data(const std::string& data_name, const std::vector& cpu_data) { + int len = cpu_data.size(); +#ifdef TEST_DEBUG + std::cout << "DEBUG: file_path=" << data_name << ", len=" << len + << ", vec_sum=" << vec_sum(cpu_data) + << ", details: " << print_vec(cpu_data) << std::endl; +#endif + + T* xpu_data = nullptr; + int r = xpu_malloc(reinterpret_cast(&xpu_data), len * sizeof(T)); + if (r != 0) { + std::cout << "ERR: xpu_malloc failed! " << data_name << std::endl; + std::exit(1); + } + + r = xpu_wait(); + if (r != 0) { + std::cout << "ERR: xpu_wait failed!" 
<< std::endl; + std::exit(1); + } + r = xpu_memcpy(xpu_data, cpu_data.data(), len * sizeof(T), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + if (r != 0) { + std::cout << "ERR: xpu_memcpy failed! " << data_name << std::endl; + std::exit(1); + } + +#ifdef TEST_DEBUG + std::cout << "DEBUG: xpu_data=" << xpu_data << std::endl; +#endif + + return xpu_data; +} + +template float* get_xpu_data(const std::string&, const std::vector&); +template float16* get_xpu_data(const std::string&, const std::vector&); +template int64_t* get_xpu_data(const std::string&, const std::vector&); +template int* get_xpu_data(const std::string&, const std::vector&); + +template +T* get_xpu_data(const std::string& file_path, int len) { + std::vector cpu_data = get_cpu_data(file_path, len); + return get_xpu_data(file_path, cpu_data); +} + +template float* get_xpu_data(const std::string&, int); +template float16* get_xpu_data(const std::string&, int); +template int64_t* get_xpu_data(const std::string&, int); +template int* get_xpu_data(const std::string&, int); + +template +std::vector quant_cpu(const std::vector& cpu_data) { + int len = cpu_data.size(); + std::vector cpu_quant_data(len, 0); + api::Context ctx(api::kCPU); + int r = api::quantization(&ctx, cpu_data.data(), + cpu_quant_data.data(), len, nullptr); + if (r != 0) { + std::cout << "ERR: quantization failed!" 
<< std::endl; + std::exit(1); + } + return cpu_quant_data; +} + +template <> +std::vector quant_cpu(const std::vector& cpu_data) { + return cpu_data; +} + +template +XPUQunatData get_xpu_quant_data(const std::string& data_name, + const std::vector& cpu_data) { + XPUQunatData xpu_quant_data; + + int len = cpu_data.size(); + // quant + std::vector cpu_quant_data = quant_cpu(cpu_data); + // findmax + float abs_max = 1e-30f; + if (std::is_same::value || std::is_same::value) { + for (int i = 0; i < len; ++i) { + float abs_val = std::fabs(static_cast(cpu_data[i])); + abs_max = std::max(abs_max, abs_val); + } + } + + constexpr int max_ptr_len = 6; // for xpu2 + std::vector cpu_max(max_ptr_len, abs_max); + // xpu malloc + TY* xpu_data = nullptr; + float* xpu_max_ptr = nullptr; + int r = xpu_malloc(reinterpret_cast(&xpu_data), len * sizeof(TY)); + if (r != 0) { + std::cout << "ERR: xpu_malloc failed! " << data_name << std::endl; + std::exit(1); + } + r = xpu_malloc(reinterpret_cast(&xpu_max_ptr), + max_ptr_len * sizeof(float)); + if (r != 0) { + std::cout << "ERR: xpu_malloc failed! " << data_name << std::endl; + std::exit(1); + } + +#ifdef TEST_DEBUG + std::cout << "DEBUG: file_path=" << data_name << ", len=" << len + << ", data vec_sum=" << vec_sum(cpu_data) + << ", quant_data vec_sum=" << vec_sum(cpu_quant_data) + << ", details: " << print_vec(cpu_quant_data) << std::endl; +#endif + r = xpu_wait(); + if (r != 0) { + std::cout << "ERR: xpu_wait failed!" << std::endl; + std::exit(1); + } + // xpu memcpy + r = xpu_memcpy(xpu_data, cpu_quant_data.data(), len * sizeof(TY), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + if (r != 0) { + std::cout << "ERR: xpu_memcpy failed!" << std::endl; + std::exit(1); + } +#ifdef TEST_DEBUG + std::cout << "DEBUG: max is " << print_vec(cpu_max) << std::endl; +#endif + r = xpu_memcpy(xpu_max_ptr, cpu_max.data(), max_ptr_len * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + if (r != 0) { + std::cout << "ERR: xpu_malloc failed!" 
<< std::endl; + std::exit(1); + } + +#ifdef TEST_DEBUG + std::cout << "DEBUG: xpu_data=" << xpu_data << ", xpu_max_ptr=" << xpu_max_ptr + << std::endl; +#endif + xpu_quant_data.data_ = xpu_data; + xpu_quant_data.max_ptr_ = xpu_max_ptr; + return xpu_quant_data; +} + +template XPUQunatData get_xpu_quant_data( + const std::string&, const std::vector&); +template XPUQunatData get_xpu_quant_data( + const std::string&, const std::vector&); + +template +XPUQunatData get_xpu_quant_data(const std::string& file_path, int len) { + std::vector cpu_data = get_cpu_data(file_path, len); + return get_xpu_quant_data(file_path, cpu_data); +} + +template XPUQunatData get_xpu_quant_data( + const std::string&, int); +template XPUQunatData get_xpu_quant_data( + const std::string&, int); + +std::vector get_all_ids(const std::string& dir_in) { + std::vector ids; + std::set ids_set; + struct stat s; + stat(dir_in.c_str(), &s); + if (!S_ISDIR(s.st_mode)) { + return ids; + } + DIR* open_dir = opendir(dir_in.c_str()); + if (nullptr == open_dir) { + return ids; + } + dirent* p = nullptr; + while ((p = readdir(open_dir)) != nullptr) { + if (p->d_name[0] != '.') { + std::string filename = std::string(p->d_name); + int end_pos = filename.find('_'); + + int qid = std::stoi(filename.substr(0, end_pos)); + ids_set.insert(qid); + } + } + closedir(open_dir); + ids.resize(ids_set.size()); + ids.assign(ids_set.begin(), ids_set.end()); + return ids; +} + +void get_ln_param(const std::unordered_map& weights_len_info, + const std::string& params_dir, + const std::string& ln_name_prefix, + const float*& ln_scale, // NOLINT + const float*& ln_bias) { // NOLINT + std::string ln_file_prefix = params_dir + ln_name_prefix; + int wlen = weights_len_info.at(ln_name_prefix + "weight"); + int blen = weights_len_info.at(ln_name_prefix + "bias"); + ln_scale = get_xpu_data(ln_file_prefix + "weight", wlen); + ln_bias = get_xpu_data(ln_file_prefix + "bias", blen); +} + +template +void print_xpu_data_all(api::Context* 
ctx, const T* data, + std::vector shape, std::string name) { + int data_len = vector_prod(shape); + std::vector cpu_data(data_len); + xpu_wait(ctx->xpu_stream); + xpu_memcpy(reinterpret_cast(&cpu_data.front()), data, + data_len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + std::cout << name; + std::cout << " shape:"; + for (auto i : shape) { + std::cout << i << " "; + } + std::cout << std::endl; + int row = 1; + int col = shape.back(); + if (shape.size() >= 2) { + row = data_len / col; + } + T* cpu_data_ptr = &cpu_data.front(); + for (int i = 0; i < row; i++) { + for (int j = 0; j < col; j++) { + std::cout << *(cpu_data_ptr + i * col + j) << " "; + } + std::cout << std::endl; + } +} +template +void print_xpu_data(api::Context* ctx, const T* data, std::vector shape, + std::string name) { + int data_len = vector_prod(shape); + + std::vector cpu_data(data_len); + xpu_memcpy(reinterpret_cast(&cpu_data.front()), data, + data_len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + std::cout << name; + std::cout << " shape:"; + for (auto i : shape) { + std::cout << i << " "; + } + std::cout << std::endl; + if (data_len > 1000) { + double mean = 0; + for (auto val : cpu_data) { + mean += static_cast(val); + } + mean /= data_len; + std::cout << "mean=" << mean << std::endl; + std::cout << "details: "; + for (int i = 0; i < 8; ++i) { + std::cout << cpu_data[i] << " "; + } + std::cout << "..."; + for (int i = data_len - 8; i < data_len; ++i) { + std::cout << cpu_data[i] << " "; + } + std::cout << std::endl; + return; + } + int row = 1; + int col = shape.back(); + if (shape.size() >= 2) { + row = data_len / col; + } + T* cpu_data_ptr = &cpu_data.front(); + for (int i = 0; i < row; i++) { + for (int j = 0; j < col; j++) { + std::cout << *(cpu_data_ptr + i * col + j) << " "; + } + std::cout << std::endl; + } +} +template +void print_cpu_data(const T* data, std::vector shape, std::string name) { + int data_len = vector_prod(shape); + std::cout << name; + std::cout << " 
shape:"; + for (auto i : shape) { + std::cout << i << " "; + } + std::cout << std::endl; + int row = 1; + int col = shape.back(); + if (shape.size() >= 2) { + row = data_len / col; + } + for (int i = 0; i < row; i++) { + for (int j = 0; j < col; j++) { + std::cout << *(data + i * col + j) << " "; + } + std::cout << std::endl; + } +} + +template +void print_vec(const std::vector& data, const std::string& data_name) { + int len = static_cast(data.size()); + T sum = std::accumulate(data.begin(), data.end(), 0); + std::cout << "DEBUG: data_name is " << data_name << ", len=" << len + << ", sum=" << sum << ", "; + for (int i = 0; i < len - 1; ++i) { + std::cout << data[i] << ", "; + } + std::cout << data[len - 1] << std::endl; +} + +#define INSTANTIATION_PRINT(T) \ + template void print_vec(const std::vector&, const std::string&); \ + template void print_cpu_data(const T*, std::vector, \ + std::string name); \ + template void print_xpu_data(api::Context * ctx, const T*, \ + std::vector, std::string); \ + template void print_xpu_data_all(api::Context * ctx, const T*, \ + std::vector shape, std::string); + +INSTANTIATION_PRINT(int); +INSTANTIATION_PRINT(int16_t); +INSTANTIATION_PRINT(int8_t); +INSTANTIATION_PRINT(float); +INSTANTIATION_PRINT(float16); diff --git a/runtime/kunlun/xpu/xpu_util.h b/runtime/kunlun/xpu/xpu_util.h new file mode 100644 index 0000000000..e0b02dc600 --- /dev/null +++ b/runtime/kunlun/xpu/xpu_util.h @@ -0,0 +1,118 @@ +// Copyright (c) 2022 KUNLUNXIN Inc. +// 2022 Han Qi (qihan@baidu.com) +// Hehe Pan (panhehe@baidu.com) +// Zikui Yan (yanzikui@baidu.com) +// Chaolin Li (lichaolin@baidu.com) +// All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xpu/runtime.h" +#include "xpu/xdnn.h" + +#pragma once +namespace api = baidu::xpu::api; +template +class XPUQunatData { + public: + XPUQunatData() : data_(nullptr), max_ptr_(nullptr) {} + XPUQunatData(T* data, float* max_ptr) : data_(data), max_ptr_(max_ptr) {} + T* data_{nullptr}; + float* max_ptr_{nullptr}; +}; + +int vector_prod(std::vector shape); +void add_separator_when_necessary(std::string& str); // NOLINT + +template +void conformer_test(const std::string& data_dir, const std::string& params_dir, + int threads_number, int dev_id); + +template +std::vector Split(const std::string& str, const std::string& separator); + +std::unordered_map get_weights_lens( + const std::string& file_path); +std::unordered_map> get_weights_shape( + const std::string& file_path); + +template +std::vector get_cpu_data(const std::string& file_path, int len); + +template +T* get_xpu_data(const std::string& file_path, int len); + +template +T* get_xpu_data(const std::string& data_name, const std::vector& cpu_data); + +template +XPUQunatData get_xpu_quant_data(const std::string& file_path, int len); + +template +XPUQunatData get_xpu_quant_data(const std::string& data_name, + const std::vector& cpu_data); + +std::vector get_all_ids(const std::string& dir_in); + +void get_ln_param(const std::unordered_map& weights_len_info, + const std::string& params_dir, + const std::string& ln_name_prefix, + const 
float*& ln_scale, // NOLINT + const float*& ln_bias); // NOLINT + +template +void print_vec(const std::vector& data, const std::string& data_name); +template +void print_cpu_data(const T* data, std::vector shape, std::string name); +template +void print_xpu_data(api::Context* ctx, const T* data, std::vector shape, + std::string name); +template +void print_xpu_data_all(api::Context* ctx, const T* data, + std::vector shape, std::string name); + +#define CHECK_RET(ret) \ + if ((ret) != 0) { \ + std::cout << "ERR" << __FILE__ << ":" << __LINE__ \ + << ", check failed, ret != 0" << std::endl; \ + std::exit(1); \ + } +#define WRAPPER_CHECK_CTX(ctx) \ + if (ctx == nullptr) { \ + return api::INVALID_PARAM; \ + } +#define WRAPPER_ASSERT_GT(ctx, expra, exprb) \ + if (!((expra) > (exprb))) { \ + return api::INVALID_PARAM; \ + } +#define WRAPPER_ASSERT_SUCCESS(ctx, ret) \ + if (!((ret) == api::SUCCESS)) { \ + return ret; \ + }