From a9d3cef942ebfb09e722199cbc882d5abeb54154 Mon Sep 17 00:00:00 2001 From: Zhenghai Zhang <65210872+ccsuzzh@users.noreply.github.com> Date: Thu, 10 Aug 2023 20:31:07 +0800 Subject: [PATCH 001/246] [cmake] add isl third_party cache (#55631) * add isl third_party cache * fix bug * Update cinn.cmake --- cmake/cinn.cmake | 2 +- cmake/cinn/external/isl.cmake | 55 +++++++++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake index e2ab8c6c5c9f3..ff3731f380540 100644 --- a/cmake/cinn.cmake +++ b/cmake/cinn.cmake @@ -331,7 +331,7 @@ set(ABSL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/absl/include") include_directories(${ABSL_INCLUDE_DIR}) # Add isl -set(ISL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/isl/include") +set(ISL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/third_party/install/isl/include") include_directories(${ISL_INCLUDE_DIR}) # Add LLVM diff --git a/cmake/cinn/external/isl.cmake b/cmake/cinn/external/isl.cmake index da957c6f632b8..6c05e425b4831 100644 --- a/cmake/cinn/external/isl.cmake +++ b/cmake/cinn/external/isl.cmake @@ -7,25 +7,62 @@ include(ExternalProject) # static build # CPPFLAGS="-fPIC -DPIC" ./configure --with-gmp-prefix= --with-clang-prefix= --enable-shared=no --enable-static=yes +set(ISL_FILE + "isl-6a1760fe.tar.gz" + CACHE STRING "" FORCE) set(ISL_DOWNLOAD_URL - https://paddle-inference-dist.bj.bcebos.com/CINN/isl-6a1760fe.tar.gz) -set(ISL_MD5 fff10083fb79d394b8a7b7b2089f6183) + "https://paddle-inference-dist.bj.bcebos.com/CINN/${ISL_FILE}") +set(ISL_URL_MD5 fff10083fb79d394b8a7b7b2089f6183) +set(ISL_DOWNLOAD_DIR ${PADDLE_SOURCE_DIR}/third_party/isl) +set(ISL_PREFIX_DIR ${THIRD_PARTY_PATH}/isl) +set(ISL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/isl) + +function(download_isl) + message( + STATUS "Downloading ${ISL_DOWNLOAD_URL} to ${ISL_DOWNLOAD_DIR}/${ISL_FILE}") + file( + DOWNLOAD ${ISL_DOWNLOAD_URL} ${ISL_DOWNLOAD_DIR}/${ISL_FILE} + EXPECTED_MD5 ${ISL_URL_MD5} + STATUS 
ERR) + if(ERR EQUAL 0) + message(STATUS "Download ${ISL_FILE} success") + else() + message( + FATAL_ERROR + "Download failed, error: ${ERR}\n You can try downloading ${ISL_FILE} again" + ) + endif() +endfunction() + +# Download and check isl. +if(EXISTS ${ISL_DOWNLOAD_DIR}/${ISL_FILE}) + file(MD5 ${ISL_DOWNLOAD_DIR}/${ISL_FILE} ISL_MD5) + if(NOT ISL_MD5 STREQUAL ISL_URL_MD5) + # clean build file + file(REMOVE_RECURSE ${ISL_PREFIX_DIR}) + file(REMOVE_RECURSE ${ISL_INSTALL_DIR}) + download_isl() + endif() +else() + download_isl() +endif() ExternalProject_Add( external_isl ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${ISL_DOWNLOAD_URL} - URL_MD5 ${ISL_MD5} - PREFIX ${THIRD_PARTY_PATH}/isl - SOURCE_DIR ${THIRD_PARTY_PATH}/install/isl + URL ${ISL_DOWNLOAD_DIR}/${ISL_FILE} + URL_MD5 ${ISL_URL_MD5} + DOWNLOAD_DIR ${ISL_DOWNLOAD_DIR} + PREFIX ${ISL_PREFIX_DIR} + SOURCE_DIR ${ISL_INSTALL_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" UPDATE_COMMAND "" INSTALL_COMMAND "" - BUILD_BYPRODUCTS ${THIRD_PARTY_PATH}/install/isl/lib/libisl.a) + BUILD_BYPRODUCTS ${ISL_INSTALL_DIR}/lib/libisl.a) add_library(isl STATIC IMPORTED GLOBAL) set_property(TARGET isl PROPERTY IMPORTED_LOCATION - ${THIRD_PARTY_PATH}/install/isl/lib/libisl.a) + ${ISL_INSTALL_DIR}/lib/libisl.a) add_dependencies(isl external_isl) -include_directories(${THIRD_PARTY_PATH}/install/isl/include) +include_directories(${ISL_INSTALL_DIR}/include) From 988b252ab8bfdd85b9fd822863e52c8ddc88b051 Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Thu, 10 Aug 2023 20:32:37 +0800 Subject: [PATCH 002/246] [cmake] add absl third_party cache (#55549) * add absl to third_party * fix * ci test * ci test * ci test * rollback --- cmake/cinn.cmake | 4 ---- cmake/cinn/external/absl.cmake | 27 ++++++++++++++++++++++----- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake index ff3731f380540..de13f71526c8a 100644 --- a/cmake/cinn.cmake +++ b/cmake/cinn.cmake @@ -326,10 
+326,6 @@ set(CINN_LIB "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") # Add CINN's dependencies header files ###################################### -# Add absl -set(ABSL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/absl/include") -include_directories(${ABSL_INCLUDE_DIR}) - # Add isl set(ISL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/third_party/install/isl/include") include_directories(${ISL_INCLUDE_DIR}) diff --git a/cmake/cinn/external/absl.cmake b/cmake/cinn/external/absl.cmake index 466d531780736..051a83c956882 100644 --- a/cmake/cinn/external/absl.cmake +++ b/cmake/cinn/external/absl.cmake @@ -1,13 +1,31 @@ include(ExternalProject) -set(ABSL_SOURCES_DIR ${THIRD_PARTY_PATH}/absl) +set(ABSL_SOURCES_DIR ${PADDLE_SOURCE_DIR}/third_party/absl) set(ABSL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/absl) - +set(ABSL_PREFIX_DIR ${THIRD_PARTY_PATH}/absl) set(ABSL_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) set(ABSL_REPOSITORY "https://github.com/abseil/abseil-cpp.git") set(ABSL_TAG "20210324.2") +if(NOT EXISTS ${ABSL_SOURCES_DIR}) + message( + STATUS "Download absl source from ${ABSL_REPOSITORY} to ${ABSL_SOURCES_DIR}") + execute_process(COMMAND ${GIT_EXECUTABLE} clone -b ${ABSL_TAG} + ${ABSL_REPOSITORY} ${ABSL_SOURCES_DIR}) +else() + # check git tag + execute_process( + COMMAND ${GIT_EXECUTABLE} -C ${ABSL_SOURCES_DIR} describe --tags + OUTPUT_VARIABLE CURRENT_TAG + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT "${CURRENT_TAG}" STREQUAL "${ABSL_TAG}") + message(STATUS "Checkout absl to ${ABSL_TAG}") + execute_process(COMMAND ${GIT_EXECUTABLE} -C ${ABSL_SOURCES_DIR} checkout + -q ${ABSL_TAG}) + endif() +endif() + set(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" @@ -22,9 +40,8 @@ ExternalProject_Add( external_absl ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags - GIT_REPOSITORY ${ABSL_REPOSITORY} - GIT_TAG ${ABSL_TAG} - PREFIX ${ABSL_SOURCES_DIR} + PREFIX ${ABSL_PREFIX_DIR} + SOURCE_DIR ${ABSL_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS ${OPTIONAL_ARGS} 
-DCMAKE_INSTALL_PREFIX=${ABSL_INSTALL_DIR} From dfe97dc82cba5463e552fed5550391dd8292b630 Mon Sep 17 00:00:00 2001 From: umiswing Date: Fri, 11 Aug 2023 09:09:51 +0800 Subject: [PATCH 003/246] Add flash attention to ci gpups. (#56117) --- tools/gpups_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh index d0f9dd19341bd..fe2f93a34c91e 100644 --- a/tools/gpups_test.sh +++ b/tools/gpups_test.sh @@ -46,6 +46,7 @@ parallel_list="^init_phi_test$|\ ^test_dist_fleet_ps11$|\ ^test_dist_fleet_ps12$|\ ^test_executor_feed_non_tensor$|\ +^test_flash_attention$|\ ^test_fused_adam_op$|\ ^test_fused_attention_no_dropout$|\ ^test_fused_attention_op$|\ From 460e4fc6e82221be4639e061d4e11bd9332a5f21 Mon Sep 17 00:00:00 2001 From: hong19860320 <9973393+hong19860320@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:13:25 +0800 Subject: [PATCH 004/246] [XPU] Add fast_gather_nd plugin (#56103) --- paddle/phi/kernels/xpu/gather_nd_kernel.cc | 21 ++ .../kernels/xpu/plugin/include/xpu/plugin.h | 24 ++ .../src/kernel/kunlun2cpp/fast_gather_nd.xpu | 259 ++++++++++++++++ .../xpu/plugin/src/wrapper/fast_gather_nd.cpp | 281 ++++++++++++++++++ 4 files changed, 585 insertions(+) create mode 100644 paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/fast_gather_nd.xpu create mode 100644 paddle/phi/kernels/xpu/plugin/src/wrapper/fast_gather_nd.cpp diff --git a/paddle/phi/kernels/xpu/gather_nd_kernel.cc b/paddle/phi/kernels/xpu/gather_nd_kernel.cc index 9966d3795d504..43581963987c9 100644 --- a/paddle/phi/kernels/xpu/gather_nd_kernel.cc +++ b/paddle/phi/kernels/xpu/gather_nd_kernel.cc @@ -87,6 +87,7 @@ void GatherNdKernel(const Context &ctx, x_shape.data(), static_cast(x_shape.size()), nullptr}; int ret = XPU_SUCCESS; +#ifndef PADDLE_WITH_XPU_PLUGIN if (index_type == DataType::INT32) { ret = xpu::gather_nd( ctx.x_context(), @@ -105,6 +106,26 @@ void GatherNdKernel(const Context &ctx, index_shape); } PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gather_nd"); 
+#else + if (index_type == DataType::INT32) { + ret = xpu::plugin::fast_gather_nd( + ctx.x_context(), + reinterpret_cast(x.data()), + index.data(), + reinterpret_cast(out->data()), + x_vec, + index_shape); + } else { + ret = xpu::plugin::fast_gather_nd( + ctx.x_context(), + reinterpret_cast(x.data()), + index.data(), + reinterpret_cast(out->data()), + x_vec, + index_shape); + } + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "fast_gather_nd"); +#endif } } // namespace phi diff --git a/paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h b/paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h index 712c02977de67..eb7588252a640 100644 --- a/paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h +++ b/paddle/phi/kernels/xpu/plugin/include/xpu/plugin.h @@ -31,6 +31,30 @@ DLL_EXPORT int fast_where(Context* ctx, const T* y, T* out, int64_t len); +template +DLL_EXPORT int fast_gather_nd(Context* ctx, + const T* x, + const TID* index, + T* y, + const VectorParam& xshape, + const std::vector& index_shape); +template +static inline int fast_gather_nd(Context* ctx, + const T* x, + const TID* index, + T* y, + const VectorParam& xshape, + const std::vector& index_shape) { + auto deleter = [](int64_t* ptr) { delete[] ptr; }; + std::shared_ptr xshape_i64(new int64_t[xshape.len], deleter); + return fast_gather_nd( + ctx, + x, + index, + y, + vpi32_to_vpi64(xshape, xshape_i64.get()), + std::vector(index_shape.begin(), index_shape.end())); +} } // namespace plugin } // namespace api diff --git a/paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/fast_gather_nd.xpu b/paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/fast_gather_nd.xpu new file mode 100644 index 0000000000000..09c69561d732f --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/src/kernel/kunlun2cpp/fast_gather_nd.xpu @@ -0,0 +1,259 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + * copyright (C) 2022 KUNLUNXIN, Inc + */ + +#include "xpu/kernel/cluster.h" +#include "xpu/kernel/cluster_partition.h" +#include "xpu/kernel/cluster_primitive.h" + +namespace xpu2 { +namespace plugin { + +template +__global__ void fast_gather1d(const int8_t* x, + const TID* index, + int64_t count, + int64_t x_dim0, + int64_t x_stride0, + int8_t* y) { + int cid = core_id(); + int tid = core_id() * cluster_num() + cluster_id(); + int nthreads = core_num() * cluster_num(); + const int index_len = 320 / sizeof(TID); + __simd__ TID local_index[index_len]; + const int buf_len = 5824 / sizeof(int8_t); + __simd__ int8_t local_x[buf_len]; + if (x_stride0 > buf_len) { + for (int64_t i = tid; i < count; i += nthreads) { + GM2LM(index + i, local_index, sizeof(TID)); + int64_t offset = ((local_index[0] + x_dim0) % x_dim0) * x_stride0; + for (int64_t j = 0; j < x_stride0; j += buf_len) { + int read_len = min(static_cast(buf_len), x_stride0 - j); + GM2LM(x + offset + j, local_x, read_len); + LM2GM(local_x, y + i * x_stride0 + j, read_len); + } + } + } else { + int64_t count_per_thread = min(index_len, buf_len / x_stride0); + for (int64_t i = tid * count_per_thread; i < count; + i += nthreads * count_per_thread) { + int count_in_thread = + min(static_cast(count_per_thread), count - i); + GM2LM(index + i, local_index, count_in_thread * sizeof(TID)); + for (int64_t j = 0; j < count_in_thread; j++) { + int64_t offset = ((local_index[j] + x_dim0) % x_dim0) * x_stride0; + GM2LM_ASYNC(x + offset, local_x + j * x_stride0, x_stride0); + } + mfence_lm(); + 
LM2GM(local_x, y + i * x_stride0, x_stride0 * count_in_thread); + } + } +} + +template +__global__ void fast_gather2d(const int8_t* x, + const TID* index, + int64_t count, + int64_t x_dim0, + int64_t x_dim1, + int64_t x_stride0, + int64_t x_stride1, + int8_t* y) { + int cid = core_id(); + int tid = core_id() * cluster_num() + cluster_id(); + int nthreads = core_num() * cluster_num(); + const int index_len = 640 / sizeof(TID); + __simd__ TID local_index[index_len]; + const int buf_len = 5504 / sizeof(int8_t); + __simd__ int8_t local_x[buf_len]; + if (x_stride1 > buf_len) { + for (int64_t i = tid; i < count; i += nthreads) { + GM2LM(index + i * 2, local_index, 2 * sizeof(TID)); + int64_t offset = ((local_index[0] + x_dim0) % x_dim0) * x_stride0 + + ((local_index[1] + x_dim1) % x_dim1) * x_stride1; + for (int64_t j = 0; j < x_stride1; j += buf_len) { + int read_len = min(static_cast(buf_len), x_stride1 - j); + GM2LM(x + offset + j, local_x, read_len); + LM2GM(local_x, y + i * x_stride1 + j, read_len); + } + } + } else { + int64_t count_per_thread = min(index_len / 2, buf_len / x_stride1); + for (int64_t i = tid * count_per_thread; i < count; + i += nthreads * count_per_thread) { + int count_in_thread = + min(static_cast(count_per_thread), count - i); + GM2LM(index + i * 2, local_index, 2 * count_in_thread * sizeof(TID)); + for (int64_t j = 0; j < count_in_thread; j++) { + int64_t offset = + ((local_index[j * 2] + x_dim0) % x_dim0) * x_stride0 + + ((local_index[j * 2 + 1] + x_dim1) % x_dim1) * x_stride1; + GM2LM_ASYNC(x + offset, local_x + j * x_stride1, x_stride1); + } + mfence_lm(); + LM2GM(local_x, y + i * x_stride1, x_stride1 * count_in_thread); + } + } +} + +template +__global__ void fast_gather3d(const int8_t* x, + const TID* index, + int64_t count, + int64_t x_dim0, + int64_t x_dim1, + int64_t x_dim2, + int64_t x_stride0, + int64_t x_stride1, + int64_t x_stride2, + int8_t* y) { + int cid = core_id(); + int tid = core_id() * cluster_num() + cluster_id(); + int 
nthreads = core_num() * cluster_num(); + const int index_len = 960 / sizeof(TID); + __simd__ TID local_index[index_len]; + const int buf_len = 5184 / sizeof(int8_t); + __simd__ int8_t local_x[buf_len]; + if (x_stride2 > buf_len) { + for (int64_t i = tid; i < count; i += nthreads) { + GM2LM(index + i * 3, local_index, 3 * sizeof(TID)); + int64_t offset = ((local_index[0] + x_dim0) % x_dim0) * x_stride0 + + ((local_index[1] + x_dim1) % x_dim1) * x_stride1 + + ((local_index[2] + x_dim2) % x_dim2) * x_stride2; + for (int64_t j = 0; j < x_stride2; j += buf_len) { + int read_len = min(static_cast(buf_len), x_stride2 - j); + GM2LM(x + offset + j, local_x, read_len); + LM2GM(local_x, y + i * x_stride2 + j, read_len); + } + } + } else { + int64_t count_per_thread = min(index_len / 3, buf_len / x_stride2); + for (int64_t i = tid * count_per_thread; i < count; + i += nthreads * count_per_thread) { + int count_in_thread = + min(static_cast(count_per_thread), count - i); + GM2LM(index + i * 3, local_index, 3 * count_in_thread * sizeof(TID)); + for (int64_t j = 0; j < count_in_thread; j++) { + int64_t offset = + ((local_index[j * 3] + x_dim0) % x_dim0) * x_stride0 + + ((local_index[j * 3 + 1] + x_dim1) % x_dim1) * x_stride1 + + ((local_index[j * 3 + 2] + x_dim2) % x_dim2) * x_stride2; + GM2LM_ASYNC(x + offset, local_x + j * x_stride2, x_stride2); + } + mfence_lm(); + LM2GM(local_x, y + i * x_stride2, x_stride2 * count_in_thread); + } + } +} + +template +__global__ void fast_gather4d(const int8_t* x, + const TID* index, + int64_t count, + int64_t x_dim0, + int64_t x_dim1, + int64_t x_dim2, + int64_t x_dim3, + int64_t x_stride0, + int64_t x_stride1, + int64_t x_stride2, + int64_t x_stride3, + int8_t* y) { + int cid = core_id(); + int tid = core_id() * cluster_num() + cluster_id(); + int nthreads = core_num() * cluster_num(); + const int index_len = 1280 / sizeof(TID); + __simd__ TID local_index[index_len]; + const int buf_len = 4864 / sizeof(int8_t); + __simd__ int8_t 
local_x[buf_len]; + if (x_stride3 > buf_len) { + for (int64_t i = tid; i < count; i += nthreads) { + GM2LM(index + i * 4, local_index, 4 * sizeof(TID)); + int64_t offset = ((local_index[0] + x_dim0) % x_dim0) * x_stride0 + + ((local_index[1] + x_dim1) % x_dim1) * x_stride1 + + ((local_index[2] + x_dim2) % x_dim2) * x_stride2 + + ((local_index[3] + x_dim3) % x_dim3) * x_stride3; + for (int64_t j = 0; j < x_stride3; j += buf_len) { + int read_len = min(static_cast(buf_len), x_stride3 - j); + GM2LM(x + offset + j, local_x, read_len); + LM2GM(local_x, y + i * x_stride3 + j, read_len); + } + } + } else { + int64_t count_per_thread = min(index_len / 4, buf_len / x_stride3); + for (int64_t i = tid * count_per_thread; i < count; + i += nthreads * count_per_thread) { + int count_in_thread = + min(static_cast(count_per_thread), count - i); + GM2LM(index + i * 4, local_index, 4 * count_in_thread * sizeof(TID)); + for (int64_t j = 0; j < count_in_thread; j++) { + int64_t offset = + ((local_index[j * 4] + x_dim0) % x_dim0) * x_stride0 + + ((local_index[j * 4 + 1] + x_dim1) % x_dim1) * x_stride1 + + ((local_index[j * 4 + 2] + x_dim2) % x_dim2) * x_stride2 + + ((local_index[j * 4 + 3] + x_dim3) % x_dim3) * x_stride3; + GM2LM_ASYNC(x + offset, local_x + j * x_stride3, x_stride3); + } + mfence_lm(); + LM2GM(local_x, y + i * x_stride3, x_stride3 * count_in_thread); + } + } +} + +#define _XPU_DEF__FAST_GATHERND_(IDTYPE) \ + template __global__ void fast_gather1d(const int8_t* x, \ + const IDTYPE* index, \ + int64_t count, \ + int64_t x_dim0, \ + int64_t x_stride0, \ + int8_t* y); \ + template __global__ void fast_gather2d(const int8_t* x, \ + const IDTYPE* index, \ + int64_t count, \ + int64_t x_dim0, \ + int64_t x_dim1, \ + int64_t x_stride0, \ + int64_t x_stride1, \ + int8_t* y); \ + template __global__ void fast_gather3d(const int8_t* x, \ + const IDTYPE* index, \ + int64_t count, \ + int64_t x_dim0, \ + int64_t x_dim1, \ + int64_t x_dim2, \ + int64_t x_stride0, \ + int64_t 
x_stride1, \ + int64_t x_stride2, \ + int8_t* y); \ + template __global__ void fast_gather4d(const int8_t* x, \ + const IDTYPE* index, \ + int64_t count, \ + int64_t x_dim0, \ + int64_t x_dim1, \ + int64_t x_dim2, \ + int64_t x_dim3, \ + int64_t x_stride0, \ + int64_t x_stride1, \ + int64_t x_stride2, \ + int64_t x_stride3, \ + int8_t* y); +_XPU_DEF__FAST_GATHERND_(int); +_XPU_DEF__FAST_GATHERND_(int8_t); +_XPU_DEF__FAST_GATHERND_(int64_t); +_XPU_DEF__FAST_GATHERND_(bool); + +} // namespace plugin +} // namespace xpu2 diff --git a/paddle/phi/kernels/xpu/plugin/src/wrapper/fast_gather_nd.cpp b/paddle/phi/kernels/xpu/plugin/src/wrapper/fast_gather_nd.cpp new file mode 100644 index 0000000000000..24215092768db --- /dev/null +++ b/paddle/phi/kernels/xpu/plugin/src/wrapper/fast_gather_nd.cpp @@ -0,0 +1,281 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/* + * copyright (C) 2022 KUNLUNXIN, Inc + */ + +#include "xpu/plugin.h" +#include "xpu/refactor/impl_public/wrapper_check.h" + +namespace xpu2 { +namespace plugin { +template +__attribute__((global)) void fast_gather1d(const int8_t* x, + const TID* index, + int64_t count, + int64_t x_dim0, + int64_t x_stride0, + int8_t* y); +template +__attribute__((global)) void fast_gather2d(const int8_t* x, + const TID* index, + int64_t count, + int64_t x_dim0, + int64_t x_dim1, + int64_t x_stride0, + int64_t x_stride1, + int8_t* y); +template +__attribute__((global)) void fast_gather3d(const int8_t* x, + const TID* index, + int64_t count, + int64_t x_dim0, + int64_t x_dim1, + int64_t x_dim2, + int64_t x_stride0, + int64_t x_stride1, + int64_t x_stride2, + int8_t* y); +template +__attribute__((global)) void fast_gather4d(const int8_t* x, + const TID* index, + int64_t count, + int64_t x_dim0, + int64_t x_dim1, + int64_t x_dim2, + int64_t x_dim3, + int64_t x_stride0, + int64_t x_stride1, + int64_t x_stride2, + int64_t x_stride3, + int8_t* y); +} // namespace plugin +} // namespace xpu2 + +namespace baidu { +namespace xpu { +namespace api { +namespace plugin { + +template +static int cpu_wrapper(Context* ctx, + const T* x, + const TID* index, + T* y, + const VectorParam& x_shape, + const std::vector& index_shape) { + int64_t x_shape_size = x_shape.len; + int64_t index_shape_size = index_shape.size(); + int64_t gather_time = 1; + for (int64_t i = 0; i < index_shape_size - 1; i++) { + gather_time *= index_shape[i]; + } + int64_t end_size = index_shape.back(); + int64_t gather_size = 1; + for (int64_t i = end_size; i < x_shape_size; i++) { + gather_size *= x_shape.cpu[i]; + } + const int64_t gather_bytes = gather_size * sizeof(T); + for (int64_t i = 0; i < gather_time; i++) { + int64_t x_index = 0; + int64_t step = 1; + for (int64_t j = end_size - 1; j >= 0; j--) { + x_index += (index[i * end_size + j] * step); + step *= x_shape.cpu[j]; + } + memcpy(y, x + x_index * gather_size, 
gather_bytes); + y += gather_size; + } + return api::SUCCESS; +} + +template +static int xpu2_wrapper(Context* ctx, + const T* x, + const TID* index, + T* y, + const VectorParam& x_shape, + const std::vector& index_shape) { + using XPU_TID = typename XPUIndexType::type; + int64_t x_shape_size = x_shape.len; + int64_t index_shape_size = index_shape.size(); + int64_t end_size = index_shape.back(); + int64_t gather_time = 1; + for (int64_t i = 0; i < index_shape_size - 1; i++) { + gather_time *= index_shape[i]; + } + std::vector gather_strides(end_size); + gather_strides[end_size - 1] = sizeof(T); + for (int64_t i = end_size; i < x_shape_size; i++) { + gather_strides[end_size - 1] *= x_shape.cpu[i]; + } + for (int64_t i = end_size - 2; i >= 0; i--) { + gather_strides[i] = gather_strides[i + 1] * x_shape.cpu[i + 1]; + } + auto casted_x = static_cast(static_cast(x)); + auto casted_index = + static_cast(static_cast(index)); + auto casted_y = static_cast(static_cast(y)); + switch (end_size) { + case 1: + xpu2::plugin::fast_gather1d + <<ncluster(), 64, ctx->xpu_stream>>>(casted_x, + casted_index, + gather_time, + x_shape.cpu[0], + gather_strides[0], + casted_y); + return api::SUCCESS; + case 2: + xpu2::plugin::fast_gather2d + <<ncluster(), 64, ctx->xpu_stream>>>(casted_x, + casted_index, + gather_time, + x_shape.cpu[0], + x_shape.cpu[1], + gather_strides[0], + gather_strides[1], + casted_y); + return api::SUCCESS; + case 3: + xpu2::plugin::fast_gather3d + <<ncluster(), 64, ctx->xpu_stream>>>(casted_x, + casted_index, + gather_time, + x_shape.cpu[0], + x_shape.cpu[1], + x_shape.cpu[2], + gather_strides[0], + gather_strides[1], + gather_strides[2], + casted_y); + return api::SUCCESS; + case 4: + xpu2::plugin::fast_gather4d + <<ncluster(), 64, ctx->xpu_stream>>>(casted_x, + casted_index, + gather_time, + x_shape.cpu[0], + x_shape.cpu[1], + x_shape.cpu[2], + x_shape.cpu[3], + gather_strides[0], + gather_strides[1], + gather_strides[2], + gather_strides[3], + casted_y); + 
return api::SUCCESS; + default: + break; + } + return gather_nd(ctx, x, index, y, x_shape, index_shape); +} + +template +int fast_gather_nd(Context* ctx, + const T* x, + const TID* index, + T* y, + const VectorParam& x_shape, + const std::vector& index_shape) { + WRAPPER_CHECK_CTX(ctx); + WRAPPER_DUMP_FUNCTION_T2(ctx, "fast_gather_nd", T, TID); + WRAPPER_DUMP_PARAM6( + ctx, x, index, y, x_shape, index_shape, ctx->_l3_mgr.get_size()); + WRAPPER_DUMP(ctx); + WRAPPER_ASSERT_GT(ctx, x_shape.len, 0); + WRAPPER_ASSERT_LE(ctx, x_shape.len, 32); + WRAPPER_ASSERT_GT(ctx, index_shape.size(), 0); + int64_t x_len = 1; + for (int64_t i = 0; i < x_shape.len; i++) { + x_len *= x_shape.cpu[i]; + } + WRAPPER_CHECK_PTR(ctx, T, x_len, x); + int64_t index_len = -1; + WRAPPER_CHECK_SHAPE(ctx, &index_len, index_shape); + WRAPPER_CHECK_PTR(ctx, TID, index_len, index); + // index.shape[-1] <= x.rank + WRAPPER_ASSERT_LE(ctx, index_shape.back(), x_shape.len); + std::vector y_shape; + for (int64_t i = 0; i < index_shape.size() - 1; i++) { + y_shape.push_back(index_shape[i]); + } + for (int64_t i = index_shape.back(); i < x_shape.len; i++) { + y_shape.push_back(x_shape.cpu[i]); + } + int64_t y_len = -1; + WRAPPER_CHECK_SHAPE(ctx, &y_len, y_shape); + WRAPPER_CHECK_PTR(ctx, T, y_len, y); + if (ctx->dev().type() == api::kCPU) { + return cpu_wrapper(ctx, x, index, y, x_shape, index_shape); + } + if (ctx->dev().type() == api::kXPU2) { + return xpu2_wrapper(ctx, x, index, y, x_shape, index_shape); + } + WRAPPER_UNIMPLEMENTED(ctx); +} + +template int fast_gather_nd(Context*, + const float*, + const int*, + float*, + const VectorParam&, + const std::vector&); +template int fast_gather_nd(Context*, + const int*, + const int*, + int*, + const VectorParam&, + const std::vector&); +template int fast_gather_nd(Context*, + const int64_t*, + const int*, + int64_t*, + const VectorParam&, + const std::vector&); +template int fast_gather_nd(Context*, + const float16*, + const int*, + float16*, + const 
VectorParam&, + const std::vector&); +template int fast_gather_nd(Context*, + const float*, + const int64_t*, + float*, + const VectorParam&, + const std::vector&); +template int fast_gather_nd(Context*, + const int*, + const int64_t*, + int*, + const VectorParam&, + const std::vector&); +template int fast_gather_nd(Context*, + const int64_t*, + const int64_t*, + int64_t*, + const VectorParam&, + const std::vector&); +template int fast_gather_nd(Context*, + const float16*, + const int64_t*, + float16*, + const VectorParam&, + const std::vector&); + +} // namespace plugin +} // namespace api +} // namespace xpu +} // namespace baidu From c00320c573d2b5c433322bfd457ed477cd7100df Mon Sep 17 00:00:00 2001 From: Candy2Tang <141831089+Candy2Tang@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:28:03 +0800 Subject: [PATCH 005/246] [NewIR]Remove framework_proto DEPS in pd_dialect (#56152) --- paddle/fluid/ir/dialect/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/ir/dialect/CMakeLists.txt b/paddle/fluid/ir/dialect/CMakeLists.txt index df0061b0111d0..ee08169892995 100644 --- a/paddle/fluid/ir/dialect/CMakeLists.txt +++ b/paddle/fluid/ir/dialect/CMakeLists.txt @@ -52,8 +52,7 @@ file(GLOB PD_DIALECT_SRCS "*.cc") cc_library( pd_dialect SRCS ${PD_DIALECT_SRCS} ${op_source_file} - DEPS framework_proto - phi + DEPS phi phi_utils pd_interface pd_trait From 0c7fdda9bdf07dea46b432ddcbc6053883fa4d06 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:29:10 +0800 Subject: [PATCH 006/246] [IR] Fix dialect lower bug (#56130) * fix bug * fix bug * fix bug * fix bug * fix bug * fix bug * fix bug * fix bug * refine code * fix bug --- .../new_executor/new_ir_interpreter.cc | 10 +- .../ir/transforms/pd_op_to_kernel_pass.cc | 180 +++++++++++++++++- 2 files changed, 180 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc 
b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 1b1f45128e5cd..836cc490221e2 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1537,7 +1537,12 @@ void NewIRInterpreter::BuildInstruction() { size_t op_idx = 0; for (auto& op : *ir_program_->block()) { VLOG(6) << "Build Instruction for op: " << op_idx; - if (op->dialect()->name() == "pd_kernel") { + if (op->dialect()->name() == "builtin") { + if (interpreter::GetSpecialOpNames().count(op->name())) { + VLOG(6) << "skip process " << op->name(); + continue; + } + } else if (op->dialect()->name() == "pd_kernel") { auto op_name = op->attributes() .at("op_name") .dyn_cast<::ir::StrAttribute>() @@ -1546,6 +1551,7 @@ void NewIRInterpreter::BuildInstruction() { VLOG(6) << "skip process " << op_name; continue; } + VLOG(6) << "process " << op_name; if (op_name == "pd.fused_softmax_mask_upper_triangle" || op_name == "pd.fused_softmax_mask_upper_triangle_grad") { @@ -1571,7 +1577,7 @@ void NewIRInterpreter::BuildInstruction() { } } else { PADDLE_THROW(platform::errors::Unimplemented( - "Now only support pd_kernel dialect.")); + "Now only support pd or pd_kernel dialect.")); } } } diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc index c211812f569bd..3346d7525470d 100644 --- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc @@ -364,14 +364,180 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, std::unordered_map map_op_pair; std::unordered_map map_value_pair; - std::string op_name = paddle::dialect::PhiKernelOp::name(); - - ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); + std::string phi_kernel_op_name = paddle::dialect::PhiKernelOp::name(); + ir::OpInfo phi_kernel_op_info = ctx->GetRegisteredOpInfo(phi_kernel_op_name); for (auto op_item : *block) { VLOG(6) << "op name " << 
op_item->name(); + + if (op_item->name() == "builtin.combine") { + std::vector out_places; + // Copy op inputs + std::vector vec_inputs; + if (op_item->num_operands() > 0) { + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + if (!cur_in) { + vec_inputs.emplace_back(); + continue; + } + PADDLE_ENFORCE_EQ(map_value_pair.count(cur_in), + true, + phi::errors::PreconditionNotMet( + "[%d]'s input of [%s] op MUST in map pair", + i, + op_item->name())); + auto new_in = map_value_pair.at(cur_in); + vec_inputs.push_back(new_in); + if (new_in.type().isa()) { + out_places.push_back( + new_in.type() + .dyn_cast() + .place()); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "only support dense tensor type for now")); + } + } + } + // Copy op output type + std::vector op_output_types; + if (op_item->num_results() > 0) { + for (size_t i = 0; i < op_item->num_results(); ++i) { + auto result_type = op_item->result(i).type(); + if (!result_type) { + op_output_types.push_back(result_type); + } else if (result_type.isa()) { + std::vector vec_inner_types; + auto base_types = result_type.dyn_cast().data(); + for (size_t idx = 0; idx < base_types.size(); idx++) { + auto& base_type = base_types[idx]; + if (base_type) { + if (base_type.isa()) { + auto allocated_dense_tensor_dtype = + paddle::dialect::AllocatedDenseTensorType::get( + ctx, + out_places[idx], + base_type.dyn_cast()); + vec_inner_types.push_back(allocated_dense_tensor_dtype); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "only support dense tensor in vector type for now")); + } + } else { + // NOTE(phlrain), kernel not support a nullptr in output + ir::Type fp32_dtype = ir::Float32Type::get(ctx); + phi::DDim dims = {}; + phi::DataLayout data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{}}; + size_t offset = 0; + auto dense_tensor_dtype = paddle::dialect::DenseTensorType::get( + ctx, fp32_dtype, dims, data_layout, lod, offset); + 
vec_inner_types.push_back(dense_tensor_dtype); + } + } + ir::Type t1 = ir::VectorType::get(ctx, vec_inner_types); + op_output_types.push_back(t1); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "builtin.combine Result type only support " + "VectorType")); + } + } + } + // Get op info + ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); + // Generate new op + ir::Operation* op = ir::Operation::Create( + vec_inputs, op_item->attributes(), op_output_types, op_info); + program->block()->push_back(op); + map_op_pair[op_item] = op; + // only deal with single output + if (op_item->num_results() > 0) { + for (size_t i = 0; i < op_item->num_results(); ++i) { + map_value_pair[op_item->result(i)] = op->result(i); + } + } + VLOG(6) << "Deep copy a new builtin op: " << op_item->name(); + continue; + } + + if (op_item->name() == "builtin.slice") { + phi::Place out_place = place; + // Copy op inputs + std::vector vec_inputs; + if (op_item->num_operands() > 0) { + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + if (!cur_in) { + vec_inputs.emplace_back(); + continue; + } + PADDLE_ENFORCE_EQ(map_value_pair.count(cur_in), + true, + phi::errors::PreconditionNotMet( + "[%d]'s input of [%s] op MUST in map pair", + i, + op_item->name())); + auto new_in = map_value_pair.at(cur_in); + vec_inputs.push_back(new_in); + + if (new_in.type().isa()) { + auto vec_types = new_in.type().dyn_cast().data(); + out_place = + vec_types[op_item->attributes() + .at("index") + .dyn_cast() + .data()] + .dyn_cast() + .place(); + } else { + PADDLE_THROW( + phi::errors::Unimplemented("only support vector type for now")); + } + } + } + // Copy op output type + std::vector op_output_types; + if (op_item->num_results() > 0) { + for (size_t i = 0; i < op_item->num_results(); ++i) { + auto result_type = op_item->result(i).type(); + if (!result_type) { + op_output_types.push_back(result_type); + } else if (result_type.isa()) { + auto 
allocated_dense_tensor_dtype = + paddle::dialect::AllocatedDenseTensorType::get( + ctx, + out_place, + result_type.dyn_cast()); + op_output_types.push_back(allocated_dense_tensor_dtype); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "builtin.combine Result type only support DenseTensorType")); + } + } + } + // Get op info + ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); + // Generate new op + ir::Operation* op = ir::Operation::Create( + vec_inputs, op_item->attributes(), op_output_types, op_info); + program->block()->push_back(op); + map_op_pair[op_item] = op; + // only deal with single output + if (op_item->num_results() > 0) { + for (size_t i = 0; i < op_item->num_results(); ++i) { + map_value_pair[op_item->result(i)] = op->result(i); + } + } + VLOG(6) << "Deep copy a new builtin op: " << op_item->name(); + continue; + } + + // Lower from PaddleDialect to KernelDialect paddle::dialect::OpYamlInfoInterface op_info_interface = op_item->dyn_cast(); + std::unique_ptr op_info_parser(nullptr); if (op_info_interface) { op_info_parser = @@ -399,7 +565,6 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, // need update new kernel key layout and data tyep std::vector op_output_types; - if (op_item->num_results() > 0) { auto phi_kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( kernel_fn_str, kernel_key); @@ -484,7 +649,6 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, // constuct input std::vector vec_inputs; - if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { auto cur_in = op_item->operand_source(i); @@ -563,7 +727,7 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, } ir::Operation* op = ir::Operation::Create( - vec_inputs, op_attribute, op_output_types, op_info); + vec_inputs, op_attribute, op_output_types, phi_kernel_op_info); map_op_pair[op_item] = op; @@ -593,8 +757,8 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, 
phi::TransToPhiPlace(shadow_key.backend()), op_item->result(0).type().dyn_cast()); - ir::Operation* shadow_op = - ir::Operation::Create({op->result(0)}, attr_map, {out_type}, op_info); + ir::Operation* shadow_op = ir::Operation::Create( + {op->result(0)}, attr_map, {out_type}, phi_kernel_op_info); map_op_pair[op_item] = shadow_op; program->block()->push_back(shadow_op); From 1e5fec393856e9348393d0b2da39bdbd90234165 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:52:59 +0800 Subject: [PATCH 007/246] [Prim] Fix get var in prim when list of single tensor (#56114) * fix get var in prim * fix stack test case --- python/paddle/incubate/autograd/utils.py | 22 ++++++++++++++++------ test/legacy_test/test_stack_op.py | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index e79c27f30e1d0..0de52c68bb61b 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import collections import typing import paddle @@ -132,11 +133,13 @@ def disable_prim(): } -def get_var_block(block, names): +def get_var_block(block, names, is_tensor_list=None): assert isinstance(names, list) if len(names) == 0: return None elif len(names) == 1: + if is_tensor_list: + return [block.var(names[0])] return block.var(names[0]) else: return [block.var(name) for name in names] @@ -179,7 +182,7 @@ def _get_args_values(op, phi_name): "get attrs' values for api args' values" args = op_info[phi_name] args_list = args["args"].split(",") - inputs = [] + inputs = collections.OrderedDict() attrs = [] for item in args_list: @@ -212,9 +215,9 @@ def _get_args_values(op, phi_name): "inputs" in op_content.keys() and arg_name in op_content["inputs"].keys() ): - inputs.append(op_content["inputs"][arg_name]) + inputs[op_content["inputs"][arg_name]] = arg_type else: - inputs.append(arg_name) + inputs[arg_name] = arg_type else: attr_value = _get_attr_value(op, arg_type, arg_name) attrs.append(attr_value) @@ -237,9 +240,16 @@ def prepare_python_api_arguments(op): phi_name = op.type inputs, attrs = _get_args_values(op, phi_name) res = [] - for item in inputs: + for item, tensor_type in inputs.items(): if item in op.input_names: - res.append(get_var_block(op.block, op.input(item))) + if tensor_type == "Tensor[]": + res.append( + get_var_block( + op.block, op.input(item), is_tensor_list=True + ) + ) + else: + res.append(get_var_block(op.block, op.input(item))) else: # Note: in some cases, inputs may be optional, thus assign None. Such case must be recorded. 
res.append(None) diff --git a/test/legacy_test/test_stack_op.py b/test/legacy_test/test_stack_op.py index fea3183512083..5c5e653dbaeb6 100644 --- a/test/legacy_test/test_stack_op.py +++ b/test/legacy_test/test_stack_op.py @@ -375,5 +375,20 @@ def test_dygraph(self): paddle.enable_static() +class TestStackListOfSingleTensor(unittest.TestCase): + def setUp(self): + paddle.disable_static() + paddle.seed(2022) + self.x = [paddle.randn((4, 2, 6), dtype="float32")] + + def test_list_single_tensor(self): + expect = paddle.stack(self.x) + paddle.fluid.core._set_prim_all_enabled(True) + st_model = paddle.jit.to_static(paddle.stack) + actual = st_model(self.x) + np.testing.assert_allclose(expect, actual) + paddle.enable_static() + + if __name__ == '__main__': unittest.main() From eafc9889d2b84bde1baefffc638d60e84fd8d4a2 Mon Sep 17 00:00:00 2001 From: Difer <707065510@qq.com> Date: Fri, 11 Aug 2023 11:07:56 +0800 Subject: [PATCH 008/246] move some fluid apis (#55986) * move fluid apis * fix type error * remove static exponential_decay * fix some import error * remove nn.py * fix some error * fix type error --- python/paddle/__init__.py | 2 +- .../meta_optimizers/localsgd_optimizer.py | 4 +- .../distributed/passes/ps_server_pass.py | 8 +- python/paddle/fluid/initializer.py | 1 - python/paddle/fluid/layers/__init__.py | 5 - .../fluid/layers/learning_rate_scheduler.py | 604 ------------------ python/paddle/fluid/layers/nn.py | 114 ---- .../fleet/parameter_server/ir/public.py | 2 - python/paddle/nn/initializer/initializer.py | 4 +- .../{fluid => nn/initializer}/lazy_init.py | 2 +- python/paddle/optimizer/lr.py | 604 ++++++++++++++++++ python/paddle/static/__init__.py | 3 - python/paddle/static/nn/common.py | 4 +- .../fleet/parallel_dygraph_se_resnext.py | 2 +- test/legacy_test/dist_se_resnext.py | 2 +- test/legacy_test/test_dist_transpiler.py | 4 +- .../test_imperative_ocr_attention_model.py | 4 +- test/legacy_test/test_imperative_resnet.py | 2 +- 
.../test_imperative_resnet_sorted_gradient.py | 2 +- ..._imperative_transformer_sorted_gradient.py | 2 +- .../test_learning_rate_scheduler.py | 54 +- 21 files changed, 662 insertions(+), 767 deletions(-) delete mode 100644 python/paddle/fluid/layers/learning_rate_scheduler.py delete mode 100644 python/paddle/fluid/layers/nn.py rename python/paddle/{fluid => nn/initializer}/lazy_init.py (99%) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 103a996443e30..06549901e17b1 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -42,7 +42,6 @@ from .framework import enable_static # noqa: F401 from .framework import in_dynamic_mode # noqa: F401 from .fluid.dataset import * # noqa: F401, F403 -from .fluid.lazy_init import LazyGuard # noqa: F401 from .framework.dtype import iinfo # noqa: F401 from .framework.dtype import finfo # noqa: F401 @@ -437,6 +436,7 @@ import paddle.vision # noqa: F401 from .tensor.random import check_shape # noqa: F401 +from .nn.initializer.lazy_init import LazyGuard # noqa: F401 # CINN has to set a flag to include a lib if is_compiled_with_cinn(): diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index f3be337fedb77..4889bf5f701f6 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -113,7 +113,7 @@ def minimize_impl( p2s = self.create_snapshot_vars(main_block.program) with program_guard(main_block.program, startup_program): - step = paddle.fluid.layers.autoincreased_step_counter(begin=1) + step = paddle.optimizer.lr.autoincreased_step_counter(begin=1) k_steps = paddle.static.create_global_var( name="k_steps", shape=[1], @@ -330,7 +330,7 @@ def minimize_impl( p2s = self.create_snapshot_vars(main_block.program) with program_guard(main_block.program, startup_program): - step = 
paddle.fluid.layers.autoincreased_step_counter(begin=1) + step = paddle.optimizer.lr.autoincreased_step_counter(begin=1) k_steps = paddle.static.create_global_var( name="k_steps", diff --git a/python/paddle/distributed/passes/ps_server_pass.py b/python/paddle/distributed/passes/ps_server_pass.py index 4e4377f328f3d..c68746366f48d 100755 --- a/python/paddle/distributed/passes/ps_server_pass.py +++ b/python/paddle/distributed/passes/ps_server_pass.py @@ -15,17 +15,15 @@ import logging import paddle -from paddle.fluid.layers.learning_rate_scheduler import ( - exponential_decay, - inverse_time_decay, - noam_decay, -) from paddle.optimizer.lr import ( ExponentialDecay, InverseTimeDecay, LRScheduler, NaturalExpDecay, NoamDecay, + exponential_decay, + inverse_time_decay, + noam_decay, ) from ..ps.utils.public import ( diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 6eb88d8f8ef3d..5eead87a995c9 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -21,7 +21,6 @@ default_main_program, _current_expected_place, ) -from .lazy_init import lazy_init_helper from .framework import program_guard import numpy as np from .core import VarDesc diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index c5eb01ff76383..9c6ce9aed0892 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -12,17 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import nn -from .nn import * from . import io from .io import * from . 
import math_op_patch from .math_op_patch import * -from .learning_rate_scheduler import * from ..layer_helper import LayerHelper __all__ = [] -__all__ += nn.__all__ __all__ += io.__all__ -__all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py deleted file mode 100644 index 59f25c63b744a..0000000000000 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ /dev/null @@ -1,604 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -When training a model, it's often useful to decay the -learning rate during training process, this is called -learning_rate_decay. There are many strategies to do -this, this module will provide some classical method. -User can also implement their own learning_rate_decay -strategy according to this module. -""" - -import math -import numbers - -import paddle -from . 
import nn -from ..framework import ( - default_main_program, - Parameter, - unique_name, - name_scope, - in_dygraph_mode, -) -from ..framework import Variable -from ..dygraph import learning_rate_scheduler as imperate_lr -from ..data_feeder import check_variable_and_dtype, check_type - -__all__ = [ - 'exponential_decay', - 'natural_exp_decay', - 'inverse_time_decay', - 'polynomial_decay', - 'piecewise_decay', - 'noam_decay', - 'cosine_decay', - 'linear_lr_warmup', -] - - -def _decay_step_counter(begin=0): - # the first global step is zero in learning rate decay - global_step = nn.autoincreased_step_counter( - counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1 - ) - global_step = paddle.cast(global_step, 'float32') - return global_step - - -def noam_decay(d_model, warmup_steps, learning_rate=1.0): - """ - - Noam decay method. The numpy implementation of noam decay as follows. - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - # set hyper parameters - base_lr = 0.01 - d_model = 2 - current_steps = 20 - warmup_steps = 200 - # compute - lr_value = base_lr * np.power(d_model, -0.5) * np.min([ - np.power(current_steps, -0.5), - np.power(warmup_steps, -1.5) * current_steps]) - - Please reference `attention is all you need - `_. - - Args: - d_model(Variable): The dimensionality of input and output of model. - - warmup_steps(Variable): A super parameter. - - learning_rate(Variable|float|int): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be - float32 or float64. It also can be set to python int number. Default 1.0 - - Returns: - The decayed learning rate. - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - warmup_steps = 100 - learning_rate = 0.01 - lr = fluid.layers.learning_rate_scheduler.noam_decay( - 1/(warmup_steps *(learning_rate ** 2)), - warmup_steps, - learning_rate) - """ - with default_main_program()._lr_schedule_guard(): - if in_dygraph_mode(): - decay = paddle.optimizer.lr.NoamDecay( - d_model, warmup_steps, learning_rate=learning_rate - ) - return decay - else: - global_step = _decay_step_counter(1) - - a = global_step**-0.5 - b = (warmup_steps**-1.5) * global_step - lr_value = learning_rate * (d_model**-0.5) * paddle.minimum(a, b) - - return lr_value - - -def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): - """ - - Applies exponential decay to the learning rate. - - When training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, the learning rate will be decayed by - 'decay_rate' every 'decay_steps' steps. - - Decayed learning rate calculates as follows: - - >>> if staircase == True: - >>> decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps) - >>> else: - >>> decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) - - Args: - learning_rate(Variable|float): The initial learning rate. It should be a Variable - or a float - decay_steps(int): The learning rate decay steps. See the decay computation above. - decay_rate(float): The learning rate decay rate. See the decay computation above. - staircase(bool): If True, decay the learning rate at discrete intervals, which - means the learning rate will be decayed by `decay_rate` every - `decay_steps`. If False, learning rate will be decayed continuously - and following the formula above. Default: False - - Returns: - Variable: The decayed learning rate. The data type is float32. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - - paddle.enable_static() - base_lr = 0.1 - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - - """ - with default_main_program()._lr_schedule_guard(): - if in_dygraph_mode(): - decay = paddle.optimizer.lr.ExponentialDecay( - learning_rate, decay_rate - ) - return decay - else: - global_step = _decay_step_counter() - - div_res = global_step / decay_steps - if staircase: - div_res = paddle.floor(div_res) - decayed_lr = learning_rate * (decay_rate**div_res) - - return decayed_lr - - -def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): - """ - - Applies natural exponential decay to the initial learning rate. - - When training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, the learning rate will be decayed by - natural exponential power 'decay_rate' every 'decay_steps' steps. - - Decayed learning rate calculates as follows: - - >>> if not staircase: - >>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) - >>> else: - >>> decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps)) - - Args: - learning_rate(Variable|float): The initial learning rate. It should be a Variable - or a float - decay_steps(int): The learning rate decay steps. See the decay computation above. - decay_rate(float): The learning rate decay rate. See the decay computation above. - staircase(bool): If True, decay the learning rate at discrete intervals, which - means the learning rate will be decayed by natural exponential power - `decay_rate` every `decay_steps`. If False, learning rate will be - decayed continuously and following the formula above. Default: False - - Returns: - The decayed learning rate. The data type is float32. 
- - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - - paddle.enable_static() - base_lr = 0.1 - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - - """ - with default_main_program()._lr_schedule_guard(): - if in_dygraph_mode(): - decay = paddle.optimizer.lr.NaturalExpDecay( - learning_rate, decay_rate - ) - return decay - else: - global_step = _decay_step_counter() - - div_res = global_step / decay_steps - if staircase: - div_res = paddle.floor(div_res) - decayed_lr = learning_rate * paddle.exp(-1 * decay_rate * div_res) - - return decayed_lr - - -def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): - """ - - Applies inverse time decay to the initial learning rate. - - When training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, an inverse decay function will be - applied to the initial learning rate. - - Decayed learning rate calculates as follows: - - >>> if staircase == True: - >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) - >>> else: - >>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) - - Args: - learning_rate(Variable|float): The initial learning rate. It should be a Variable - or a float - decay_steps(int): The learning rate decay steps. See the decay computation above. - decay_rate(float): The learning rate decay rate. See the decay computation above. - staircase(bool): If True, decay the learning rate at discrete intervals, which - means the learning rate will be decayed by `decay_rate` times - every `decay_steps`. If False, learning rate will be decayed - continuously and following the formula above. Default: False - - Returns: - Variable: The decayed learning rate. The data type is float32. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - base_lr = 0.1 - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.inverse_time_decay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - """ - with default_main_program()._lr_schedule_guard(): - if in_dygraph_mode(): - decay = paddle.optimizer.lr.InverseTimeDecay( - learning_rate, decay_rate - ) - return decay - else: - global_step = _decay_step_counter() - - div_res = global_step / decay_steps - if staircase: - div_res = paddle.floor(div_res) - - decayed_lr = learning_rate / (1 + decay_rate * div_res) - - return decayed_lr - - -def polynomial_decay( - learning_rate, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False -): - """ - Applies polynomial decay to the initial learning rate. - - .. code-block:: text - - if cycle: - decay_steps = decay_steps * ceil(global_step / decay_steps) - else: - global_step = min(global_step, decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ power + end_learning_rate - - Args: - learning_rate(Variable|float32): A scalar float32 value or a Variable. This - will be the initial learning rate during training. - decay_steps(int32): A Python `int32` number. - end_learning_rate(float): A Python `float` number. - power(float): A Python `float` number. - cycle(bool): If set true, decay the learning rate every decay_steps. - - Returns: - Variable: The decayed learning rate - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - start_lr = 0.01 - total_step = 5000 - end_lr = 0 - lr = fluid.layers.polynomial_decay( - start_lr, total_step, end_lr, power=1) - - """ - with default_main_program()._lr_schedule_guard(): - if in_dygraph_mode(): - decay = paddle.optimizer.lr.PolynomialDecay( - learning_rate, decay_steps, end_learning_rate, power, cycle - ) - return decay - else: - global_step = _decay_step_counter() - - if cycle: - div_res = paddle.ceil(global_step / decay_steps) - zero_var = paddle.tensor.fill_constant( - shape=[1], dtype='float32', value=0.0 - ) - one_var = paddle.tensor.fill_constant( - shape=[1], dtype='float32', value=1.0 - ) - - div_val = paddle.static.nn.cond( - global_step == zero_var, lambda: one_var, lambda: div_res - ) - paddle.assign(div_val, output=div_res) - - decay_steps = decay_steps * div_res - else: - decay_steps_var = paddle.tensor.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps) - ) - global_step = paddle.minimum(x=global_step, y=decay_steps_var) - - decayed_lr = (learning_rate - end_learning_rate) * ( - (1 - global_step / decay_steps) ** power - ) + end_learning_rate - return decayed_lr - - -def piecewise_decay(boundaries, values): - """ - - Applies piecewise decay to the initial learning rate. - - The algorithm can be described as the code below. - - .. code-block:: text - - boundaries = [10000, 20000] - values = [1.0, 0.5, 0.1] - if step < 10000: - learning_rate = 1.0 - elif 10000 <= step < 20000: - learning_rate = 0.5 - else: - learning_rate = 0.1 - Args: - boundaries: A list of steps numbers. - values: A list of learning rate values that will be picked during - different step boundaries. - - Returns: - The decayed learning rate. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - boundaries = [10000, 20000] - values = [1.0, 0.5, 0.1] - optimizer = paddle.optimizer.Momentum( - momentum=0.9, - learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries, values), - weight_decay=paddle.regularizer.L2Decay(1e-4)) - - - """ - with default_main_program()._lr_schedule_guard(): - if len(values) - len(boundaries) != 1: - raise ValueError("len(values) - len(boundaries) should be 1") - - if in_dygraph_mode(): - decay = paddle.optimizer.lr.PiecewiseDecay(boundaries, values) - return decay - else: - global_step = _decay_step_counter() - - lr = paddle.static.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate", - ) - with paddle.static.nn.control_flow.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = paddle.tensor.fill_constant( - shape=[1], - dtype='float32', - value=float(boundaries[i]), - force_cpu=True, - ) - with switch.case(global_step < boundary_val): - paddle.tensor.fill_constant( - shape=[1], - dtype="float32", - value=float(values[i]), - out=lr, - ) - with switch.default(): - paddle.tensor.fill_constant( - shape=[1], - dtype="float32", - value=float(values[len(values) - 1]), - out=lr, - ) - return lr - - -def cosine_decay(learning_rate, step_each_epoch, epochs): - r""" - - Applies cosine decay to the learning rate. - - when training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, the learning rate will be decayed by - following cosine decay strategy. - - .. math:: - - decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1) - - Args: - learning_rate(Variable|float): The initial learning rate. - step_each_epoch(int): the number of steps in an epoch. - epochs(int): the number of epochs. - - Returns: - Variable: The decayed learning rate. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - base_lr = 0.1 - lr = fluid.layers.cosine_decay( - learning_rate = base_lr, step_each_epoch=10000, epochs=120) - """ - check_type( - learning_rate, 'learning_rate', (float, Variable), 'cosine_decay' - ) - - with default_main_program()._lr_schedule_guard(): - if in_dygraph_mode(): - decay = paddle.optimizer.lr.CosineAnnealingDecay( - learning_rate, epochs - ) - return decay - else: - global_step = _decay_step_counter() - - cur_epoch = paddle.floor(global_step / step_each_epoch) - decayed_lr = ( - learning_rate - * 0.5 - * (paddle.cos(cur_epoch * math.pi / epochs) + 1) - ) - return decayed_lr - - -def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): - """ - - This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling. - For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ - - When global_step < warmup_steps, learning rate is updated as: - - .. code-block:: text - - linear_step = end_lr - start_lr - lr = start_lr + linear_step * (global_step / warmup_steps) - - where start_lr is the initial learning rate, and end_lr is the final learning rate; - - When global_step >= warmup_steps, learning rate is updated as: - - .. code-block:: text - - lr = learning_rate - - where lr is the learning_rate after warm-up. - - Args: - learning_rate (Variable|float): Learning_rate after warm-up, it could be 1D-Tensor or single value with the data type of float32. - warmup_steps (int): Steps for warm up. - start_lr (float): Initial learning rate of warm up. - end_lr (float): Final learning rate of warm up. - - Returns: - Variable: Warm-up learning rate with the same data type as learning_rate. - - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - - boundaries = [100, 200] - lr_steps = [0.1, 0.01, 0.001] - learning_rate = fluid.layers.piecewise_decay(boundaries, lr_steps) #case1, 1D-Tensor - #learning_rate = 0.1 #case2, single-value - warmup_steps = 50 - start_lr = 1. / 3. - end_lr = 0.1 - decayed_lr = fluid.layers.linear_lr_warmup(learning_rate, - warmup_steps, start_lr, end_lr) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - out, = exe.run(fetch_list=[decayed_lr.name]) - print(out) - # case1: [0.33333334] - # case2: [0.33333334] - """ - dtype = 'float32' - if isinstance(learning_rate, Variable): - dtype = learning_rate.dtype - - linear_step = float(end_lr) - float(start_lr) - with default_main_program()._lr_schedule_guard(): - if in_dygraph_mode(): - lr = paddle.optimizer.lr.LinearWarmup( - learning_rate, warmup_steps, start_lr, end_lr - ) - return lr - else: - lr = paddle.static.create_global_var( - shape=[1], - value=0.0, - dtype=dtype, - persistable=True, - name="learning_rate_warmup", - ) - - global_step = _decay_step_counter() - if not isinstance(learning_rate, Variable): - learning_rate = paddle.tensor.fill_constant( - shape=[1], dtype=dtype, value=float(learning_rate) - ) - lr_val = paddle.static.nn.case( - pred_fn_pairs=[ - ( - global_step < warmup_steps, - lambda: start_lr - + linear_step * (global_step / float(warmup_steps)), - ) - ], - default=lambda: learning_rate, - ) - paddle.assign(lr_val, lr) - return lr diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py deleted file mode 100644 index a4a770a97829a..0000000000000 --- a/python/paddle/fluid/layers/nn.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -All layers just related to the neural network. -""" -import os -import inspect -import warnings - -import numpy as np - -import paddle -from ..layer_helper import LayerHelper -from ..framework import ( - Variable, - OpProtoHolder, - dygraph_only, - _dygraph_tracer, - default_main_program, - _create_tensor, - static_only, - _global_flags, - in_dygraph_mode, -) -from ..framework import _current_expected_place -from .. import dygraph_utils -from ..param_attr import ParamAttr -from .layer_function_generator import ( - autodoc, - templatedoc, - _generate_doc_string_, -) - -from .. import unique_name -from .. import core -from ...utils import deprecated -from ..data_feeder import ( - convert_dtype, - check_variable_and_dtype, - check_type, - check_dtype, -) -from paddle.utils import deprecated -from paddle import _C_ops, _legacy_C_ops -from collections.abc import Iterable - - -__all__ = [ - 'autoincreased_step_counter', -] - - -def autoincreased_step_counter(counter_name=None, begin=1, step=1): - """ - :api_attr: Static Graph - - Create an auto-increase variable. which will be automatically increased - by 1 in every iteration. By default, the first return of this counter is 1, - and the step size is 1. - - Args: - counter_name(str, optional): The counter name. Default '@STEP_COUNTER@'. - begin(int, optional): The first return value of this counter. Default 1. - step(int, optional): The step size. Default 1. - - Returns: - Variable: The auto-increased Variable with data type int64. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - global_step = fluid.layers.autoincreased_step_counter( - counter_name='@LR_DECAY_COUNTER@', begin=0, step=1) - """ - helper = LayerHelper('global_step_counter') - if counter_name is None: - counter_name = '@STEP_COUNTER@' - counter, is_new_var = helper.create_or_get_global_variable( - name=counter_name, - dtype='int64', - shape=[1], - persistable=True, - belong_to_optimizer=True, - ) - if is_new_var: - helper.set_variable_initializer( - counter, - initializer=paddle.nn.initializer.ConstantInitializer( - value=begin - 1, force_cpu=True - ), - ) - helper.main_program.global_block()._prepend_op( - type='increment', - inputs={'X': [counter]}, - outputs={'Out': [counter]}, - attrs={'step': float(step)}, - ) - counter.stop_gradient = True - - return counter diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/ir/public.py b/python/paddle/incubate/distributed/fleet/parameter_server/ir/public.py index 8fc55869f54f3..75d65dc079e09 100755 --- a/python/paddle/incubate/distributed/fleet/parameter_server/ir/public.py +++ b/python/paddle/incubate/distributed/fleet/parameter_server/ir/public.py @@ -1409,8 +1409,6 @@ def _get_lr_scheduler_program(lr_scheduler, lr_param_dict, lr_decay_steps): InverseTimeDecay, NaturalExpDecay, NoamDecay, - ) - from paddle.static.learning_rate_scheduler import ( exponential_decay, inverse_time_decay, natural_exp_decay, diff --git a/python/paddle/nn/initializer/initializer.py b/python/paddle/nn/initializer/initializer.py index 7d04e8d7cbc71..9d5880aa09561 100644 --- a/python/paddle/nn/initializer/initializer.py +++ b/python/paddle/nn/initializer/initializer.py @@ -18,7 +18,7 @@ import numpy as np from ...fluid.framework import default_main_program, in_dygraph_mode -from ...fluid.lazy_init import lazy_init_helper +from .lazy_init import lazy_init_helper __all__ = [] @@ -42,7 +42,7 @@ def __call__(self, param, block=None): return 
self._lazy_init(param, block) def forward(self, param, block=None): - """Add corresponding initialization operations to the network""" + """Add corresponding initialization operations to the network.""" raise NotImplementedError() def _lazy_init(self, param, block=None): diff --git a/python/paddle/fluid/lazy_init.py b/python/paddle/nn/initializer/lazy_init.py similarity index 99% rename from python/paddle/fluid/lazy_init.py rename to python/paddle/nn/initializer/lazy_init.py index 36f36161e6f27..e2321f682f77e 100644 --- a/python/paddle/fluid/lazy_init.py +++ b/python/paddle/nn/initializer/lazy_init.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import framework +from ...fluid import framework __all__ = ["LazyGuard"] diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index e628509e52afc..0e7b8fe735339 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -17,8 +17,16 @@ import numpy +import paddle from paddle import Tensor from paddle.fluid import core +from paddle.fluid.data_feeder import check_type +from paddle.fluid.framework import ( + Variable, + default_main_program, + in_dygraph_mode, +) +from paddle.fluid.layer_helper import LayerHelper __all__ = [ # noqa 'LRScheduler', @@ -2227,3 +2235,599 @@ def get_lr(self): lr = self.base_lr + base_height * self.scale_fn(eval(self.scale_mode)) return lr + + +def autoincreased_step_counter(counter_name=None, begin=1, step=1): + """ + :api_attr: Static Graph + + Create an auto-increase variable. which will be automatically increased + by 1 in every iteration. By default, the first return of this counter is 1, + and the step size is 1. + + Args: + counter_name(str, optional): The counter name. Default '@STEP_COUNTER@'. + begin(int, optional): The first return value of this counter. Default 1. + step(int, optional): The step size. Default 1. 
+ + Returns: + Variable: The auto-increased Variable with data type int64. + + Examples: + .. code-block:: python + + import paddle + paddle.enable_static() + global_step = paddle.optimizer.lr.autoincreased_step_counter( + counter_name='@LR_DECAY_COUNTER@', begin=0, step=1) + """ + helper = LayerHelper('global_step_counter') + if counter_name is None: + counter_name = '@STEP_COUNTER@' + counter, is_new_var = helper.create_or_get_global_variable( + name=counter_name, + dtype='int64', + shape=[1], + persistable=True, + belong_to_optimizer=True, + ) + if is_new_var: + helper.set_variable_initializer( + counter, + initializer=paddle.nn.initializer.ConstantInitializer( + value=begin - 1, force_cpu=True + ), + ) + helper.main_program.global_block()._prepend_op( + type='increment', + inputs={'X': [counter]}, + outputs={'Out': [counter]}, + attrs={'step': float(step)}, + ) + counter.stop_gradient = True + + return counter + + +def _decay_step_counter(begin=0): + # the first global step is zero in learning rate decay + global_step = autoincreased_step_counter( + counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1 + ) + global_step = paddle.cast(global_step, 'float32') + return global_step + + +def noam_decay(d_model, warmup_steps, learning_rate=1.0): + """ + + Noam decay method. The numpy implementation of noam decay as follows. + + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + # set hyper parameters + base_lr = 0.01 + d_model = 2 + current_steps = 20 + warmup_steps = 200 + # compute + lr_value = base_lr * np.power(d_model, -0.5) * np.min([ + np.power(current_steps, -0.5), + np.power(warmup_steps, -1.5) * current_steps]) + + Please reference `attention is all you need + `_. + + Args: + d_model(Variable): The dimensionality of input and output of model. + + warmup_steps(Variable): A super parameter. + + learning_rate(Variable|float|int): The initial learning rate. 
If the type + is Variable, it's a tensor with shape [1], the data type can be + float32 or float64. It also can be set to python int number. Default 1.0 + + Returns: + The decayed learning rate. + Examples: + .. code-block:: python + + import paddle + warmup_steps = 100 + learning_rate = 0.01 + lr = paddle.optimizer.lr.noam_decay( + 1/(warmup_steps *(learning_rate ** 2)), + warmup_steps, + learning_rate) + """ + with default_main_program()._lr_schedule_guard(): + if in_dygraph_mode(): + decay = paddle.optimizer.lr.NoamDecay( + d_model, warmup_steps, learning_rate=learning_rate + ) + return decay + else: + global_step = _decay_step_counter(1) + + a = global_step**-0.5 + b = (warmup_steps**-1.5) * global_step + lr_value = learning_rate * (d_model**-0.5) * paddle.minimum(a, b) + + return lr_value + + +def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): + """ + + Applies exponential decay to the learning rate. + + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, the learning rate will be decayed by + 'decay_rate' every 'decay_steps' steps. + + Decayed learning rate calculates as follows: + + >>> if staircase == True: + >>> decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps) + >>> else: + >>> decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) + + Args: + learning_rate(Variable|float): The initial learning rate. It should be a Variable + or a float + decay_steps(int): The learning rate decay steps. See the decay computation above. + decay_rate(float): The learning rate decay rate. See the decay computation above. + staircase(bool): If True, decay the learning rate at discrete intervals, which + means the learning rate will be decayed by `decay_rate` every + `decay_steps`. If False, learning rate will be decayed continuously + and following the formula above. 
Default: False + + Returns: + Variable: The decayed learning rate. The data type is float32. + + Examples: + .. code-block:: python + + import paddle + + paddle.enable_static() + base_lr = 0.1 + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=paddle.optimizer.lr.exponential_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + + """ + with default_main_program()._lr_schedule_guard(): + if in_dygraph_mode(): + decay = ExponentialDecay(learning_rate, decay_rate) + return decay + else: + global_step = _decay_step_counter() + + div_res = global_step / decay_steps + if staircase: + div_res = paddle.floor(div_res) + decayed_lr = learning_rate * (decay_rate**div_res) + + return decayed_lr + + +def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): + """ + + Applies natural exponential decay to the initial learning rate. + + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, the learning rate will be decayed by + natural exponential power 'decay_rate' every 'decay_steps' steps. + + Decayed learning rate calculates as follows: + + >>> if not staircase: + >>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) + >>> else: + >>> decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps)) + + Args: + learning_rate(Variable|float): The initial learning rate. It should be a Variable + or a float + decay_steps(int): The learning rate decay steps. See the decay computation above. + decay_rate(float): The learning rate decay rate. See the decay computation above. + staircase(bool): If True, decay the learning rate at discrete intervals, which + means the learning rate will be decayed by natural exponential power + `decay_rate` every `decay_steps`. If False, learning rate will be + decayed continuously and following the formula above. 
Default: False + + Returns: + The decayed learning rate. The data type is float32. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + + paddle.enable_static() + base_lr = 0.1 + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=paddle.optimizer.lr.natural_exp_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + + """ + with default_main_program()._lr_schedule_guard(): + if in_dygraph_mode(): + decay = NaturalExpDecay(learning_rate, decay_rate) + return decay + else: + global_step = _decay_step_counter() + + div_res = global_step / decay_steps + if staircase: + div_res = paddle.floor(div_res) + decayed_lr = learning_rate * paddle.exp(-1 * decay_rate * div_res) + + return decayed_lr + + +def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): + """ + + Applies inverse time decay to the initial learning rate. + + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, an inverse decay function will be + applied to the initial learning rate. + + Decayed learning rate calculates as follows: + + >>> if staircase == True: + >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) + >>> else: + >>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) + + Args: + learning_rate(Variable|float): The initial learning rate. It should be a Variable + or a float + decay_steps(int): The learning rate decay steps. See the decay computation above. + decay_rate(float): The learning rate decay rate. See the decay computation above. + staircase(bool): If True, decay the learning rate at discrete intervals, which + means the learning rate will be decayed by `decay_rate` times + every `decay_steps`. If False, learning rate will be decayed + continuously and following the formula above. 
Default: False + + Returns: + Variable: The decayed learning rate. The data type is float32. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import paddle + paddle.enable_static() + base_lr = 0.1 + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=paddle.optimizer.lr.inverse_time_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + """ + with default_main_program()._lr_schedule_guard(): + if in_dygraph_mode(): + decay = InverseTimeDecay(learning_rate, decay_rate) + return decay + else: + global_step = _decay_step_counter() + + div_res = global_step / decay_steps + if staircase: + div_res = paddle.floor(div_res) + + decayed_lr = learning_rate / (1 + decay_rate * div_res) + + return decayed_lr + + +def polynomial_decay( + learning_rate, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False +): + """ + Applies polynomial decay to the initial learning rate. + + .. code-block:: text + + if cycle: + decay_steps = decay_steps * ceil(global_step / decay_steps) + else: + global_step = min(global_step, decay_steps) + decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ power + end_learning_rate + + Args: + learning_rate(Variable|float32): A scalar float32 value or a Variable. This + will be the initial learning rate during training. + decay_steps(int32): A Python `int32` number. + end_learning_rate(float): A Python `float` number. + power(float): A Python `float` number. + cycle(bool): If set true, decay the learning rate every decay_steps. + + Returns: + Variable: The decayed learning rate + + Examples: + .. 
code-block:: python + + import paddle + start_lr = 0.01 + total_step = 5000 + end_lr = 0 + lr = paddle.optimizer.lr.polynomial_decay( + start_lr, total_step, end_lr, power=1) + + """ + with default_main_program()._lr_schedule_guard(): + if in_dygraph_mode(): + decay = PolynomialDecay( + learning_rate, decay_steps, end_learning_rate, power, cycle + ) + return decay + else: + global_step = _decay_step_counter() + + if cycle: + div_res = paddle.ceil(global_step / decay_steps) + zero_var = paddle.tensor.fill_constant( + shape=[1], dtype='float32', value=0.0 + ) + one_var = paddle.tensor.fill_constant( + shape=[1], dtype='float32', value=1.0 + ) + + div_val = paddle.static.nn.cond( + global_step == zero_var, lambda: one_var, lambda: div_res + ) + paddle.assign(div_val, output=div_res) + + decay_steps = decay_steps * div_res + else: + decay_steps_var = paddle.tensor.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps) + ) + global_step = paddle.minimum(x=global_step, y=decay_steps_var) + + decayed_lr = (learning_rate - end_learning_rate) * ( + (1 - global_step / decay_steps) ** power + ) + end_learning_rate + return decayed_lr + + +def piecewise_decay(boundaries, values): + """ + + Applies piecewise decay to the initial learning rate. + + The algorithm can be described as the code below. + + .. code-block:: text + + boundaries = [10000, 20000] + values = [1.0, 0.5, 0.1] + if step < 10000: + learning_rate = 1.0 + elif 10000 <= step < 20000: + learning_rate = 0.5 + else: + learning_rate = 0.1 + Args: + boundaries: A list of steps numbers. + values: A list of learning rate values that will be picked during + different step boundaries. + + Returns: + The decayed learning rate. + + Examples: + .. 
code-block:: python + + import paddle + paddle.enable_static() + boundaries = [10000, 20000] + values = [1.0, 0.5, 0.1] + optimizer = paddle.optimizer.Momentum( + momentum=0.9, + learning_rate=paddle.optimizer.lr.PiecewiseDecay(boundaries, values), + weight_decay=paddle.regularizer.L2Decay(1e-4)) + + + """ + with default_main_program()._lr_schedule_guard(): + if len(values) - len(boundaries) != 1: + raise ValueError("len(values) - len(boundaries) should be 1") + + if in_dygraph_mode(): + decay = PiecewiseDecay(boundaries, values) + return decay + else: + global_step = _decay_step_counter() + + lr = paddle.static.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate", + ) + with paddle.static.nn.control_flow.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = paddle.tensor.fill_constant( + shape=[1], + dtype='float32', + value=float(boundaries[i]), + force_cpu=True, + ) + with switch.case(global_step < boundary_val): + paddle.tensor.fill_constant( + shape=[1], + dtype="float32", + value=float(values[i]), + out=lr, + ) + with switch.default(): + paddle.tensor.fill_constant( + shape=[1], + dtype="float32", + value=float(values[len(values) - 1]), + out=lr, + ) + return lr + + +def cosine_decay(learning_rate, step_each_epoch, epochs): + r""" + + Applies cosine decay to the learning rate. + + when training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, the learning rate will be decayed by + following cosine decay strategy. + + .. math:: + + decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1) + + Args: + learning_rate(Variable|float): The initial learning rate. + step_each_epoch(int): the number of steps in an epoch. + epochs(int): the number of epochs. + + Returns: + Variable: The decayed learning rate. + + Examples: + .. 
code-block:: python + + import paddle + base_lr = 0.1 + lr = paddle.optimizer.lr.cosine_decay( + learning_rate = base_lr, step_each_epoch=10000, epochs=120) + """ + check_type( + learning_rate, 'learning_rate', (float, Variable), 'cosine_decay' + ) + + with default_main_program()._lr_schedule_guard(): + if in_dygraph_mode(): + decay = CosineAnnealingDecay(learning_rate, epochs) + return decay + else: + global_step = _decay_step_counter() + + cur_epoch = paddle.floor(global_step / step_each_epoch) + decayed_lr = ( + learning_rate + * 0.5 + * (paddle.cos(cur_epoch * math.pi / epochs) + 1) + ) + return decayed_lr + + +def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): + """ + + This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling. + For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ + + When global_step < warmup_steps, learning rate is updated as: + + .. code-block:: text + + linear_step = end_lr - start_lr + lr = start_lr + linear_step * (global_step / warmup_steps) + + where start_lr is the initial learning rate, and end_lr is the final learning rate; + + When global_step >= warmup_steps, learning rate is updated as: + + .. code-block:: text + + lr = learning_rate + + where lr is the learning_rate after warm-up. + + Args: + learning_rate (Variable|float): Learning_rate after warm-up, it could be 1D-Tensor or single value with the data type of float32. + warmup_steps (int): Steps for warm up. + start_lr (float): Initial learning rate of warm up. + end_lr (float): Final learning rate of warm up. + + Returns: + Variable: Warm-up learning rate with the same data type as learning_rate. + + + Examples: + + .. 
code-block:: python + + import paddle.fluid as fluid + + boundaries = [100, 200] + lr_steps = [0.1, 0.01, 0.001] + learning_rate = fluid.layers.piecewise_decay(boundaries, lr_steps) #case1, 1D-Tensor + #learning_rate = 0.1 #case2, single-value + warmup_steps = 50 + start_lr = 1. / 3. + end_lr = 0.1 + decayed_lr = fluid.layers.linear_lr_warmup(learning_rate, + warmup_steps, start_lr, end_lr) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + out, = exe.run(fetch_list=[decayed_lr.name]) + print(out) + # case1: [0.33333334] + # case2: [0.33333334] + """ + dtype = 'float32' + if isinstance(learning_rate, Variable): + dtype = learning_rate.dtype + + linear_step = float(end_lr) - float(start_lr) + with default_main_program()._lr_schedule_guard(): + if in_dygraph_mode(): + lr = LinearWarmup(learning_rate, warmup_steps, start_lr, end_lr) + return lr + else: + lr = paddle.static.create_global_var( + shape=[1], + value=0.0, + dtype=dtype, + persistable=True, + name="learning_rate_warmup", + ) + + global_step = _decay_step_counter() + if not isinstance(learning_rate, Variable): + learning_rate = paddle.tensor.fill_constant( + shape=[1], dtype=dtype, value=float(learning_rate) + ) + lr_val = paddle.static.nn.case( + pred_fn_pairs=[ + ( + global_step < warmup_steps, + lambda: start_lr + + linear_step * (global_step / float(warmup_steps)), + ) + ], + default=lambda: learning_rate, + ) + paddle.assign(lr_val, lr) + return lr diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 1a55745a81b52..d8247cf6561bb 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -72,8 +72,6 @@ from ..fluid.param_attr import WeightNormParamAttr # noqa: F401 from ..fluid.optimizer import Optimizer # noqa: F401 -from ..fluid.layers import exponential_decay # noqa: F401 -from ..fluid.layers import learning_rate_scheduler # noqa: F401 from .nn.metric import auc # noqa: F401 from 
.nn.metric import accuracy # noqa: F401 @@ -135,5 +133,4 @@ 'create_parameter', 'set_ipu_shard', 'ctr_metric_bundle', - 'exponential_decay', ] diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index f04dc277e4e0d..41ec17f0567a6 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -24,7 +24,7 @@ check_type, check_variable_and_dtype, ) -from paddle.fluid import core, layers, unique_name +from paddle.fluid import core, unique_name from paddle.fluid.data_feeder import check_dtype from paddle.fluid.framework import ( Program, @@ -4210,7 +4210,7 @@ def update(self): Update Exponential Moving Average. Should only call this method in train program. """ - global_step = layers.autoincreased_step_counter( + global_step = paddle.optimizer.lr.autoincreased_step_counter( counter_name=self._step_counter_name ) param_master_emas = [] diff --git a/test/collective/fleet/parallel_dygraph_se_resnext.py b/test/collective/fleet/parallel_dygraph_se_resnext.py index 05e9088c9c980..c24e4e7ebef3d 100644 --- a/test/collective/fleet/parallel_dygraph_se_resnext.py +++ b/test/collective/fleet/parallel_dygraph_se_resnext.py @@ -67,7 +67,7 @@ def optimizer_setting(params, parameter_list=None): ) else: optimizer = paddle.optimizer.Momentum( - learning_rate=fluid.layers.cosine_decay( + learning_rate=paddle.optimizer.lr.cosine_decay( learning_rate=lr, step_each_epoch=step, epochs=num_epochs ), momentum=momentum_rate, diff --git a/test/legacy_test/dist_se_resnext.py b/test/legacy_test/dist_se_resnext.py index ddc79809e80a0..f7b31d315722f 100644 --- a/test/legacy_test/dist_se_resnext.py +++ b/test/legacy_test/dist_se_resnext.py @@ -248,7 +248,7 @@ def get_model(self, batch_size=2, use_dgc=False): else: optimizer = ( paddle.distributed.fleet.meta_optimizers.DGCMomentumOptimizer( - learning_rate=fluid.layers.piecewise_decay( + learning_rate=paddle.optimizer.lr.piecewise_decay( boundaries=bd, values=lr ), momentum=0.9, diff --git 
a/test/legacy_test/test_dist_transpiler.py b/test/legacy_test/test_dist_transpiler.py index 8f5565ee7b73d..b3a2f95aef78c 100644 --- a/test/legacy_test/test_dist_transpiler.py +++ b/test/legacy_test/test_dist_transpiler.py @@ -477,7 +477,7 @@ def net_conf(self): cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) sgd_optimizer = paddle.optimizer.SGD( - learning_rate=fluid.layers.piecewise_decay( + learning_rate=paddle.optimizer.lr.piecewise_decay( [10000, 20000], [1.0, 0.5, 1.0] ) ) @@ -581,7 +581,7 @@ def net_conf(self): bd = [1, 10, 20, 30] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] sgd_optimizer = paddle.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( + learning_rate=paddle.optimizer.lr.piecewise_decay( boundaries=bd, values=lr ), momentum=0.9, diff --git a/test/legacy_test/test_imperative_ocr_attention_model.py b/test/legacy_test/test_imperative_ocr_attention_model.py index 30c00600ae772..8b07c7652fad9 100644 --- a/test/legacy_test/test_imperative_ocr_attention_model.py +++ b/test/legacy_test/test_imperative_ocr_attention_model.py @@ -451,7 +451,7 @@ def run_dygraph(): ocr_attention = OCRAttention() if Config.learning_rate_decay == "piecewise_decay": - learning_rate = fluid.layers.piecewise_decay( + learning_rate = paddle.optimizer.lr.piecewise_decay( [50000], [Config.LR, Config.LR * 0.01] ) else: @@ -527,7 +527,7 @@ def run_dygraph(): ocr_attention = OCRAttention() if Config.learning_rate_decay == "piecewise_decay": - learning_rate = fluid.layers.piecewise_decay( + learning_rate = paddle.optimizer.lr.piecewise_decay( [50000], [Config.LR, Config.LR * 0.01] ) else: diff --git a/test/legacy_test/test_imperative_resnet.py b/test/legacy_test/test_imperative_resnet.py index 4bb78f64d3125..41e270c67958a 100644 --- a/test/legacy_test/test_imperative_resnet.py +++ b/test/legacy_test/test_imperative_resnet.py @@ -67,7 +67,7 @@ def optimizer_setting(params, parameter_list=None): # 
TODO(minqiyang): Add learning rate scheduler support to dygraph mode # optimizer = fluid.optimizer.Momentum( # learning_rate=params["lr"], - # learning_rate=fluid.layers.piecewise_decay( + # learning_rate=paddle.optimizer.lr.piecewise_decay( # boundaries=bd, values=lr), # momentum=0.9, # regularization=paddle.regularizer.L2Decay(1e-4)) diff --git a/test/legacy_test/test_imperative_resnet_sorted_gradient.py b/test/legacy_test/test_imperative_resnet_sorted_gradient.py index 98bdd0c8ccb07..ba71a803fb650 100644 --- a/test/legacy_test/test_imperative_resnet_sorted_gradient.py +++ b/test/legacy_test/test_imperative_resnet_sorted_gradient.py @@ -63,7 +63,7 @@ def optimizer_setting(params, parameter_list=None): # TODO(minqiyang): Add learning rate scheduler support to dygraph mode # optimizer = fluid.optimizer.Momentum( # learning_rate=params["lr"], - # learning_rate=fluid.layers.piecewise_decay( + # learning_rate=paddle.optimizer.lr.piecewise_decay( # boundaries=bd, values=lr), # momentum=0.9, # regularization=paddle.regularizer.L2Decay(1e-4)) diff --git a/test/legacy_test/test_imperative_transformer_sorted_gradient.py b/test/legacy_test/test_imperative_transformer_sorted_gradient.py index 2d724c080cb7e..f85986fab3584 100644 --- a/test/legacy_test/test_imperative_transformer_sorted_gradient.py +++ b/test/legacy_test/test_imperative_transformer_sorted_gradient.py @@ -1137,7 +1137,7 @@ def run_dygraph(): is_sparse=is_sparse, ) if sync: - lr_decay = fluid.layers.learning_rate_scheduler.noam_decay( + lr_decay = paddle.optimizer.lr.noam_decay( ModelHyperParams.d_model, TrainTaskConfig.warmup_steps ) with fluid.default_main_program()._lr_schedule_guard(): diff --git a/test/legacy_test/test_learning_rate_scheduler.py b/test/legacy_test/test_learning_rate_scheduler.py index 0ff27eec5ad20..8898fb59b87b1 100644 --- a/test/legacy_test/test_learning_rate_scheduler.py +++ b/test/legacy_test/test_learning_rate_scheduler.py @@ -20,7 +20,7 @@ import paddle from paddle import fluid -from 
paddle.fluid import core, framework, layers +from paddle.fluid import core, framework def exponential_decay( @@ -239,7 +239,9 @@ def test_NoamDecay(self): d_model = 0.01 warmup_steps = 200 learning_rate = 2.0 - lr = fluid.layers.noam_decay(d_model, warmup_steps, learning_rate) + lr = paddle.optimizer.lr.noam_decay( + d_model, warmup_steps, learning_rate + ) for step in range(5): step += 1 right_result = noam_decay( @@ -278,7 +280,7 @@ def test_LinearLrWarmup(self): np.testing.assert_allclose(t, right_result[i], rtol=1e-05) with self.assertRaises(TypeError): - lr = fluid.layers.linear_lr_warmup( + lr = paddle.optimizer.lr.linear_lr_warmup( learning_rate="fake_lr", warmup_steps=2, start_lr=0.0, @@ -443,39 +445,59 @@ def test_decay(self): common_kwargs_false["staircase"] = False decay_fns = [ - (exponential_decay, layers.exponential_decay, common_kwargs_true), - (exponential_decay, layers.exponential_decay, common_kwargs_false), - (natural_exp_decay, layers.natural_exp_decay, common_kwargs_true), - (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false), - (inverse_time_decay, layers.inverse_time_decay, common_kwargs_true), + ( + exponential_decay, + paddle.optimizer.lr.exponential_decay, + common_kwargs_true, + ), + ( + exponential_decay, + paddle.optimizer.lr.exponential_decay, + common_kwargs_false, + ), + ( + natural_exp_decay, + paddle.optimizer.lr.natural_exp_decay, + common_kwargs_true, + ), + ( + natural_exp_decay, + paddle.optimizer.lr.natural_exp_decay, + common_kwargs_false, + ), + ( + inverse_time_decay, + paddle.optimizer.lr.inverse_time_decay, + common_kwargs_true, + ), ( inverse_time_decay, - layers.inverse_time_decay, + paddle.optimizer.lr.inverse_time_decay, common_kwargs_false, ), ( polynomial_decay, - layers.polynomial_decay, + paddle.optimizer.lr.polynomial_decay, {"learning_rate": 1.0, "decay_steps": 5, "cycle": True}, ), ( polynomial_decay, - layers.polynomial_decay, + paddle.optimizer.lr.polynomial_decay, {"learning_rate": 1.0, 
"decay_steps": 5, "cycle": False}, ), ( piecewise_decay, - layers.piecewise_decay, + paddle.optimizer.lr.piecewise_decay, {"boundaries": [3, 6, 9], "values": [0.1, 0.2, 0.3, 0.4]}, ), ( cosine_decay, - layers.cosine_decay, + paddle.optimizer.lr.cosine_decay, {"learning_rate": 0.1, "step_each_epoch": 100, "epochs": 120}, ), ( noam_decay, - layers.noam_decay, + paddle.optimizer.lr.noam_decay, {"d_model": 0.01, "warmup_steps": 200, "learning_rate": 2.0}, ), ] @@ -507,7 +529,7 @@ def check_decay_with_place( end_lr = 0.1 with fluid.program_guard(main_prog, startup_prog): - decayed_lr = layers.linear_lr_warmup( + decayed_lr = paddle.optimizer.lr.linear_lr_warmup( fluid_decay_fn(**kwargs), warmup_steps, start_lr, end_lr ) @@ -548,7 +570,7 @@ def run_scalar_lr(self, place, lr, start_lr, end_lr): warmup_steps = 10 with fluid.program_guard(main_prog, startup_prog): - decayed_lr = layers.linear_lr_warmup( + decayed_lr = paddle.optimizer.lr.linear_lr_warmup( lr, warmup_steps, start_lr, end_lr ) From bfc6480152139098fb3c73ab62fd20f6b1e02e26 Mon Sep 17 00:00:00 2001 From: Difer <707065510@qq.com> Date: Fri, 11 Aug 2023 11:09:21 +0800 Subject: [PATCH 009/246] repacle fluid.io.load_inference_model, fluid.io.save_inference_model in fluid with 2.0 version (#55345) * repacle fluid.io.load_inference_model * replace fluid.io.save_inference_model * fix some bug * fix some bugs of load & save model * fix some bug * fix test_inference_model_io bug * fix word2vec_inference_model bug * fix some bug * fix valueError bug * fix some bug * fix a warning error * for debug * for debug * fix io error * fix test_wordvec_book error * remove debug print * fix load_var bug * for debug cinn test * revert cinn & fix inference_pass_test in windows * fix some bugs * revert cinn & fix inference_pass_test in windows * for debug vars * for debug * fix quant_dequant_test * fix some path errors * remove fluid save/load * fix incubate-fleet save * move some from fluid.io to static.io --- 
.../fleet/runtime/parameter_server_runtime.py | 21 +- python/paddle/fluid/io.py | 441 +----------------- python/paddle/hapi/model.py | 10 +- .../incubate/distributed/fleet/collective.py | 28 +- .../incubate/distributed/fleet/fleet_util.py | 26 +- .../incubate/distributed/fleet/utils.py | 5 +- python/paddle/jit/api.py | 42 +- python/paddle/static/io.py | 71 ++- test/book/notest_understand_sentiment.py | 8 +- test/book/test_image_classification.py | 21 +- test/book/test_recognize_digits.py | 24 +- test/book/test_recommender_system.py | 29 +- test/book/test_word2vec_book.py | 14 +- test/cinn/fake_model/resnet_model.py | 6 +- test/cinn/test_paddle_model_convertor.py | 8 +- .../contrib/test_image_classification_fp16.py | 14 +- test/dygraph_to_static/test_bert.py | 2 +- test/dygraph_to_static/test_bmn.py | 2 +- test/dygraph_to_static/test_lac.py | 2 +- test/dygraph_to_static/test_mnist.py | 4 +- test/dygraph_to_static/test_mobile_net.py | 2 +- test/dygraph_to_static/test_resnet.py | 2 +- .../test_save_inference_model.py | 4 +- test/dygraph_to_static/test_se_resnet.py | 2 +- test/ir/inference/inference_pass_test.py | 43 +- test/ir/inference/program_config.py | 15 +- test/ir/inference/quant_dequant_test.py | 42 +- test/legacy_test/test_dist_base.py | 23 +- .../legacy_test/test_dist_mnist_fleet_save.py | 1 - .../test_eager_deletion_padding_rnn.py | 4 +- .../test_executor_and_use_program_cache.py | 15 +- test/legacy_test/test_inference_model_io.py | 52 ++- test/legacy_test/test_io_save_load.py | 10 +- .../test_load_state_dict_from_old_format.py | 6 +- .../test_save_model_without_var.py | 12 +- test/quantization/convert_model2dot.py | 4 +- ...t2_int8_image_classification_comparison.py | 4 +- ...nt_int8_image_classification_comparison.py | 4 +- .../paddle_benchmark/paddle_save_model.py | 6 +- .../cinn/paddle_benchmark/test_paddle_ops.py | 4 +- 40 files changed, 362 insertions(+), 671 deletions(-) diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py 
b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 946045f26582e..ee3bd60b46b9e 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -714,7 +714,7 @@ def _ps_inference_save_inference_model( self, executor, dirname, - feeded_var_names, + feeded_vars, target_vars, main_program=None, export_for_deployment=True, @@ -735,28 +735,21 @@ def _ps_inference_save_inference_model( raise TypeError( "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed" ) - paddle.fluid.io.save_inference_model( + paddle.static.io.save_inference_model( dirname, - feeded_var_names, + feeded_vars, target_vars, executor, - main_program, - None, - None, - export_for_deployment, + program=main_program, legacy_format=legacy_format, ) else: - paddle.fluid.io.save_inference_model( + paddle.static.save_inference_model( dirname, - feeded_var_names, + feeded_vars, target_vars, executor, - self.origin_main_program, - None, - None, - export_for_deployment, - True, + program=self.origin_main_program, legacy_format=legacy_format, ) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index e0e102e2393f9..027e1ad7c438e 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -59,448 +59,9 @@ from paddle.utils import deprecated from paddle.fluid.framework import static_only -__all__ = [ - 'save_inference_model', - 'load_inference_model', -] + reader.__all__ +__all__ = reader.__all__ _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' ) - - -def prepend_feed_ops( - inference_program, feed_target_names, feed_holder_name='feed' -): - if len(feed_target_names) == 0: - return - - global_block = inference_program.global_block() - feed_var = global_block.create_var( - name=feed_holder_name, - type=core.VarDesc.VarType.FEED_MINIBATCH, - persistable=True, - ) - - for 
i, name in enumerate(feed_target_names): - if not global_block.has_var(name): - raise ValueError( - "The feeded_var_names[{i}]: '{name}' doesn't exist in pruned inference program. " - "Please check whether '{name}' is a valid feed_var name, or remove it from feeded_var_names " - "if '{name}' is not involved in the target_vars calculation.".format( - i=i, name=name - ) - ) - out = global_block.var(name) - global_block._prepend_op( - type='feed', - inputs={'X': [feed_var]}, - outputs={'Out': [out]}, - attrs={'col': i}, - ) - - -def append_fetch_ops( - inference_program, fetch_target_names, fetch_holder_name='fetch' -): - global_block = inference_program.global_block() - fetch_var = global_block.create_var( - name=fetch_holder_name, - type=core.VarDesc.VarType.FETCH_LIST, - persistable=True, - ) - - for i, name in enumerate(fetch_target_names): - global_block.append_op( - type='fetch', - inputs={'X': [name]}, - outputs={'Out': [fetch_var]}, - attrs={'col': i}, - ) - - -@static_only -@deprecated(since="2.0.0", update_to="paddle.static.save_inference_model") -def save_inference_model( - dirname, - feeded_var_names, - target_vars, - executor, - main_program=None, - model_filename=None, - params_filename=None, - export_for_deployment=True, - program_only=False, - clip_extra=True, - legacy_format=False, -): - """ - Prune the given `main_program` to build a new program especially for inference, - and then save it and all related parameters to given `dirname` . - If you just want to save parameters of your trained model, please use the - :ref:`api_fluid_io_save_params` . You can refer to :ref:`api_guide_model_save_reader_en` - for more details. - - Note: - The :code:`dirname` is used to specify the folder where inference model - structure and parameters are going to be saved. 
If you would like to save params of - Program in separate files, set `params_filename` None; if you would like to save all - params of Program in a single file, use `params_filename` to specify the file name. - - Args: - dirname(str): The directory path to save the inference model. - feeded_var_names(list[str]): list of string. Names of variables that need to be fed - data during inference. - target_vars(list[Variable]): list of Variable. Variables from which we can get - inference results. - executor(Executor): The executor that saves the inference model. You can refer - to :ref:`api_guide_executor_en` for more details. - main_program(Program, optional): The original program, which will be pruned to - build the inference model. If is set None, - the global default :code:`_main_program_` will be used. - Default: None. - model_filename(str, optional): The name of file to save the inference program - itself. If is set None, a default filename - :code:`__model__` will be used. - params_filename(str, optional): The name of file to save all related parameters. - If it is set None, parameters will be saved - in separate files . - export_for_deployment(bool, optional): If True, programs are modified to only support - direct inference deployment. Otherwise, - more information will be stored for flexible - optimization and re-training. Currently, only - True is supported. - Default: True. - program_only(bool, optional): If True, It will save inference program only, and do not - save params of Program. - Default: False. - legacy_format(bool, optional): Whether to save program in legacy format. - Default: False. - - Returns: - list, The fetch variables' name list. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - - paddle.enable_static() - path = "./infer_model" - - # User defined network, here a softmax regession example - image = paddle.static.data(name='img', shape=[None, 28, 28], dtype='float32') - label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') - feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace()) - predict = paddle.static.nn.fc(x=image, size=10, activation='softmax') - - loss = paddle.nn.functional.cross_entropy( - input=predict, label=label, - reduction='none', use_softmax=False - ) - avg_loss = paddle.mean(loss) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - # Feed data and train process - - # Save inference model. Note we don't save label and loss in this example - fluid.io.save_inference_model(dirname=path, - feeded_var_names=['img'], - target_vars=[predict], - executor=exe) - - # In this example, the save_inference_mode inference will prune the default - # main program according to the network's input node (img) and output node(predict). - # The pruned inference program is going to be saved in the "./infer_model/__model__" - # and parameters are going to be saved in separate files under folder - # "./infer_model". 
- - """ - if isinstance(feeded_var_names, str): - feeded_var_names = [feeded_var_names] - elif export_for_deployment: - if len(feeded_var_names) > 0: - # TODO(paddle-dev): polish these code blocks - if not ( - bool(feeded_var_names) - and all(isinstance(name, str) for name in feeded_var_names) - ): - raise ValueError("'feed_var_names' should be a list of str.") - - if isinstance(target_vars, Variable): - target_vars = [target_vars] - elif export_for_deployment: - if not ( - bool(target_vars) - and all(isinstance(var, Variable) for var in target_vars) - ): - raise ValueError("'target_vars' should be a list of Variable.") - - main_program = paddle.static.io._get_valid_program(main_program) - - # remind user to set auc_states to zeros if the program contains auc op - all_ops = main_program.global_block().ops - for op in all_ops: - # clear device of Op - device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() - op._set_attr(device_attr_name, "") - if op.type == 'auc': - warnings.warn( - "please ensure that you have set the auc states to zeros before saving inference model" - ) - break - - with program_guard(main_program): - uniq_target_vars = [] - for i, var in enumerate(target_vars): - uniq_target_vars.append(var) - target_vars = uniq_target_vars - target_var_name_list = [var.name for var in target_vars] - - # when a pserver and a trainer running on the same machine, mkdir may conflict - save_dirname = dirname - try: - save_dirname = os.path.normpath(dirname) - os.makedirs(save_dirname) - except OSError as e: - if e.errno != errno.EEXIST: - raise - - if model_filename is not None: - model_basename = os.path.basename(model_filename) - else: - model_basename = "__model__" - model_basename = os.path.join(save_dirname, model_basename) - - # When export_for_deployment is true, we modify the program online so that - # it can only be loaded for inference directly. 
If it's false, the whole - # original program and related meta are saved so that future usage can be - # more flexible. - - origin_program = main_program.clone() - - if export_for_deployment: - main_program = main_program.clone() - global_block = main_program.global_block() - need_to_remove_op_index = [] - for i, op in enumerate(global_block.ops): - op.desc.set_is_target(False) - if op.type == "feed" or op.type == "fetch": - need_to_remove_op_index.append(i) - - for index in need_to_remove_op_index[::-1]: - global_block._remove_op(index) - - main_program.desc.flush() - - main_program = main_program._prune_with_input( - feeded_var_names=feeded_var_names, targets=target_vars - ) - main_program = main_program._inference_optimize(prune_read_op=True) - fetch_var_names = [v.name for v in target_vars] - - for target_v in target_vars: - if not main_program.global_block().has_var(target_v.name): - main_program.global_block().create_var( - name=target_v.name, - shape=target_v.shape, - dtype=target_v.dtype, - persistable=target_v.persistable, - ) - - prepend_feed_ops(main_program, feeded_var_names) - append_fetch_ops(main_program, fetch_var_names) - - with open(model_basename, "wb") as f: - f.write( - main_program._remove_training_info( - clip_extra=clip_extra - ).desc.serialize_to_string() - ) - else: - # TODO(panyx0718): Save more information so that it can also be used - # for training and more flexible post-processing. - with open(model_basename + ".main_program", "wb") as f: - f.write( - main_program._remove_training_info( - clip_extra=clip_extra - ).desc.serialize_to_string() - ) - - if program_only: - warnings.warn( - "save_inference_model specified the param `program_only` to True, It will not save params of Program." 
- ) - return target_var_name_list - - main_program._copy_dist_param_info_from(origin_program) - - if params_filename is not None: - params_filename = os.path.basename(params_filename) - - paddle.distributed.io.save_persistables( - executor, save_dirname, main_program, params_filename - ) - return target_var_name_list - - -@static_only -@deprecated(since="2.0.0", update_to="paddle.static.load_inference_model") -def load_inference_model( - dirname, - executor, - model_filename=None, - params_filename=None, - pserver_endpoints=None, -): - """ - Load the inference model from a given directory. By this API, you can get the model - structure(Inference Program) and model parameters. If you just want to load - parameters of the pre-trained model, please use the :ref:`api_fluid_io_load_params` API. - You can refer to :ref:`api_guide_model_save_reader_en` for more details. - - Args: - dirname(str): One of the following: - - The given directory path. - - Set to None when reading the model from memory. - executor(Executor): The executor to run for loading inference model. - See :ref:`api_guide_executor_en` for more details about it. - model_filename(str, optional): One of the following: - - The name of file to load the inference program. - - If it is None, the default filename ``__model__`` will be used. - - When ``dirname`` is ``None``, it must be set to a string containing model. - Default: ``None``. - params_filename(str, optional): It is only used for the case that all - parameters were saved in a single binary file. One of the following: - - The name of file to load all parameters. - - When ``dirname`` is ``None``, it must be set to a string containing all the parameters. - - If parameters were saved in separate files, set it as ``None``. - Default: ``None``. - - pserver_endpoints(list, optional): It is only needed by the distributed inference. - If using a distributed look up table during the training, - this table is also needed by the inference process. 
Its value is - a list of pserver endpoints. - - Returns: - list: The return of this API is a list with three elements: - (program, feed_target_names, fetch_targets). The `program` is a - ``Program`` (refer to :ref:`api_guide_Program_en`), which is used for inference. - The `feed_target_names` is a list of ``str``, which contains names of variables - that need to feed data in the inference program. The `fetch_targets` is a list of - ``Variable`` (refer to :ref:`api_guide_Program_en`). It contains variables from which - we can get inference results. - - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - paddle.enable_static() - # Build the model - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data = paddle.static.data(name="img", shape=[-1, 64, 784]) - w = paddle.create_parameter(shape=[784, 200], dtype='float32') - b = paddle.create_parameter(shape=[200], dtype='float32') - hidden_w = paddle.matmul(x=data, y=w) - hidden_b = paddle.add(hidden_w, b) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - - # Save the inference model - path = "./infer_model" - fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'], - target_vars=[hidden_b], executor=exe, main_program=main_prog) - - # Demo one. Not need to set the distributed look up table, because the - # training doesn't use a distributed look up table. - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model(dirname=path, executor=exe)) - tensor_img = np.array(np.random.random((1, 64, 784)), dtype=np.float32) - results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - - # Demo two. If the training uses a distributed look up table, the pserver - # endpoints list should be supported when loading the inference model. - # The below is just an example. 
- endpoints = ["127.0.0.1:2023","127.0.0.1:2024"] - [dist_inference_program, dist_feed_target_names, dist_fetch_targets] = ( - fluid.io.load_inference_model(dirname=path, - executor=exe, - pserver_endpoints=endpoints)) - - # In this example, the inference program was saved in the file - # "./infer_model/__model__" and parameters were saved in - # separate files under the directory "./infer_model". - # By the inference program, feed_target_names and - # fetch_targets, we can use an executor to run the inference - # program for getting the inference result. - """ - load_from_memory = False - if dirname is not None: - load_dirname = os.path.normpath(dirname) - if not os.path.isdir(load_dirname): - raise ValueError("There is no directory named '%s'" % dirname) - - if model_filename is None: - model_filename = '__model__' - - model_filename = os.path.join( - load_dirname, os.path.basename(model_filename) - ) - - if params_filename is not None: - params_filename = os.path.basename(params_filename) - - with open(model_filename, "rb") as f: - program_desc_str = f.read() - else: - load_from_memory = True - if params_filename is None: - raise ValueError( - "The path of params cannot be None when the directory path is None." - ) - load_dirname = dirname - program_desc_str = model_filename - params_filename = params_filename - - program = Program.parse_from_string(program_desc_str) - if not core._is_program_version_supported(program._version()): - raise ValueError( - "Unsupported program version: %d\n" % program._version() - ) - # Binary data also need versioning. 
- paddle.distributed.io.load_persistables( - executor, load_dirname, program, params_filename - ) - - if pserver_endpoints: - program = _endpoints_replacement(program, pserver_endpoints) - - feed_target_names = program.desc.get_feed_target_names() - fetch_target_names = program.desc.get_fetch_target_names() - fetch_targets = [ - program.global_block().var(name) for name in fetch_target_names - ] - - return [program, feed_target_names, fetch_targets] - - -def _endpoints_replacement(program, endpoints): - ENDPOINT_MAP = "epmap" - for op in program.global_block().ops: - if op.has_attr(ENDPOINT_MAP): - op.set_attr(ENDPOINT_MAP, endpoints) - program._sync_with_cpp() - return program diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index b029c4d3a1c3a..d89ae807c07b6 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -2268,17 +2268,15 @@ def _save_inference_model(self, path): infer_prog = prog.clone(for_test=True) - input_names = [v.name for v in self._adapter._input_vars['test']] + inputs = list(self._adapter._input_vars['test']) endpoints = self._adapter._endpoints['test']['output'] - fluid.io.save_inference_model( + paddle.static.save_inference_model( model_path, - input_names, + inputs, endpoints, self._adapter._executor, - main_program=infer_prog, - model_filename=model_filename, - params_filename=params_filename, + program=infer_prog, ) def _run_one_epoch( diff --git a/python/paddle/incubate/distributed/fleet/collective.py b/python/paddle/incubate/distributed/fleet/collective.py index 3c0d783189884..6b32ef85de3e5 100644 --- a/python/paddle/incubate/distributed/fleet/collective.py +++ b/python/paddle/incubate/distributed/fleet/collective.py @@ -18,7 +18,6 @@ import paddle.distributed.transpiler.distribute_transpiler as dist_transpiler from paddle import fluid from paddle.distributed.fleet.meta_optimizers import RawProgramOptimizer -from paddle.fluid import io from paddle.fluid.compiler import CompiledProgram from 
paddle.fluid.executor import Executor from paddle.fluid.framework import Program @@ -31,6 +30,7 @@ Fleet, Mode, ) +from paddle.static import io class Collective(Fleet): @@ -77,11 +77,10 @@ def distributed_optimizer(self, optimizer, strategy=None): def save_inference_model( self, executor, - dirname, - feeded_var_names=None, - target_vars=None, - main_program=None, - export_for_deployment=True, + path_prefix, + feeded_vas=None, + fetch_vars=None, + program=None, legacy_format=False, ): """ @@ -94,22 +93,19 @@ def save_inference_model( " Executor type." ) - if main_program is None: - main_program = self._origin_program - assert isinstance(main_program, Program), ( + if program is None: + program = self._origin_program + assert isinstance(program, Program), ( "In fleet.save_inference_model() function, main_program " "must be as Program type." ) io.save_inference_model( - dirname, - feeded_var_names, - target_vars, + path_prefix, + feeded_vas, + fetch_vars, executor, - main_program, - None, - None, - export_for_deployment, + program=program, legacy_format=legacy_format, ) diff --git a/python/paddle/incubate/distributed/fleet/fleet_util.py b/python/paddle/incubate/distributed/fleet/fleet_util.py index 23777390cc891..f809a536ca241 100644 --- a/python/paddle/incubate/distributed/fleet/fleet_util.py +++ b/python/paddle/incubate/distributed/fleet/fleet_util.py @@ -965,28 +965,26 @@ def save_paddle_inference_model( """ day = str(day) pass_id = str(pass_id) - feeded_var_names = [i.name for i in feeded_vars] model_name = "inference_model" # pull dense before save self.pull_all_dense_params(scope, program) if fleet.worker_index() == 0: with fluid.scope_guard(scope): if save_combine: - fluid.io.save_inference_model( - dirname=model_name, - feeded_var_names=feeded_var_names, - target_vars=target_vars, - executor=executor, - main_program=program.clone(), - params_filename="params", + paddle.static.io.save_inference_model( + model_name, + feeded_vars, + target_vars, + executor, + 
program=program.clone(), ) else: - fluid.io.save_inference_model( - dirname=model_name, - feeded_var_names=feeded_var_names, - target_vars=target_vars, - executor=executor, - main_program=program.clone(), + paddle.static.io.save_inference_model( + model_name, + feeded_vars, + target_vars, + executor, + program=program.clone(), ) configs = { diff --git a/python/paddle/incubate/distributed/fleet/utils.py b/python/paddle/incubate/distributed/fleet/utils.py index 45cd520d67e7f..ca30f1caff128 100644 --- a/python/paddle/incubate/distributed/fleet/utils.py +++ b/python/paddle/incubate/distributed/fleet/utils.py @@ -238,11 +238,12 @@ def try_load_model_vars( dump_prog_fn = program_type_trans( dump_dir, dump_prog_fn, is_text_dump_program ) - ( + + [ inference_program, feed_target_names, fetch_targets, - ) = fluid.io.load_inference_model( + ] = paddle.static.io.load_inference_model( dump_dir, exe, model_filename=dump_prog_fn, diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 372a61ddc3fdd..6769b770db04e 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -74,7 +74,7 @@ ) from paddle.fluid.framework import dygraph_only from paddle.fluid.wrapped_decorator import wrap_decorator -from paddle.fluid.io import save_inference_model +from paddle.static.io import save_inference_model from paddle.framework import in_dynamic_mode @@ -1222,23 +1222,25 @@ def save(layer, path, input_spec=None, **configs): if 'forward' == attr_func or not isinstance(layer, Layer): model_filename = file_prefix + INFER_MODEL_SUFFIX params_filename = file_prefix + INFER_PARAMS_SUFFIX + path_prefix = file_prefix else: model_filename = file_prefix + '.' + attr_func + INFER_MODEL_SUFFIX params_filename = ( file_prefix + '.' + attr_func + INFER_PARAMS_SUFFIX ) - + file_prefix = file_prefix + '.' 
+ attr_func + file_prefix = os.path.join(model_path, file_prefix) with scope_guard(scope): + input_vars = [] + for var in concrete_program.main_program.clone().list_vars(): + if var.name in input_var_names: + input_vars.append(var) save_inference_model( - dirname=model_path, - feeded_var_names=input_var_names, - target_vars=output_vars, + path_prefix=file_prefix, + feed_vars=input_vars, + fetch_vars=output_vars, executor=Executor(_current_expected_place()), - main_program=concrete_program.main_program.clone(), - model_filename=model_filename, - params_filename=params_filename, - export_for_deployment=configs._export_for_deployment, - program_only=configs._program_only, + program=concrete_program.main_program.clone(), clip_extra=configs.clip_extra, ) @@ -1893,24 +1895,24 @@ def get_feed_fetch(all_vars, partial_vars): with scope_guard(self._scope): feeded_var_names = get_feed_fetch(self._feed_names, feed) target_var_names = get_feed_fetch(self._fetch_names, fetch) + feed_vars = [] + for name in feeded_var_names: + feed_var = self._program.global_block().vars.get(name, None) + assert feed_var is not None, f"{name} cannot be found" + feed_vars.append(feed_var) target_vars = [] for name in target_var_names: target_var = self._program.global_block().vars.get(name, None) assert target_var is not None, f"{name} cannot be found" target_vars.append(target_var) - - model_filename = file_prefix + INFER_MODEL_SUFFIX - params_filename = file_prefix + INFER_PARAMS_SUFFIX - legacy_format = kwargs.get('legacy_format', False) + file_prefix = os.path.join(dirname, file_prefix) save_inference_model( - dirname=dirname, - feeded_var_names=feeded_var_names, - target_vars=target_vars, + path_prefix=file_prefix, + feed_vars=feed_vars, + fetch_vars=target_vars, executor=self._exe, - main_program=self._program.clone(), - model_filename=model_filename, - params_filename=params_filename, + program=self._program.clone(), clip_extra=clip_extra, legacy_format=legacy_format, ) diff --git 
a/python/paddle/static/io.py b/python/paddle/static/io.py index 5a58fe5440b73..88be89e3d4403 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -34,7 +34,6 @@ ) from paddle.fluid.executor import Executor, global_scope from paddle.fluid.framework import Parameter, dygraph_not_support, static_only -from paddle.fluid.io import append_fetch_ops, prepend_feed_ops from paddle.fluid.log_helper import get_logger from paddle.framework.io_utils import ( _clone_var_in_block_, @@ -138,6 +137,56 @@ def _clone_var_in_block(block, var): ) +def prepend_feed_ops( + inference_program, feed_target_names, feed_holder_name='feed' +): + if len(feed_target_names) == 0: + return + + global_block = inference_program.global_block() + feed_var = global_block.create_var( + name=feed_holder_name, + type=core.VarDesc.VarType.FEED_MINIBATCH, + persistable=True, + ) + + for i, name in enumerate(feed_target_names): + if not global_block.has_var(name): + raise ValueError( + "The feeded_var_names[{i}]: '{name}' doesn't exist in pruned inference program. 
" + "Please check whether '{name}' is a valid feed_var name, or remove it from feeded_var_names " + "if '{name}' is not involved in the target_vars calculation.".format( + i=i, name=name + ) + ) + out = global_block.var(name) + global_block._prepend_op( + type='feed', + inputs={'X': [feed_var]}, + outputs={'Out': [out]}, + attrs={'col': i}, + ) + + +def append_fetch_ops( + inference_program, fetch_target_names, fetch_holder_name='fetch' +): + global_block = inference_program.global_block() + fetch_var = global_block.create_var( + name=fetch_holder_name, + type=core.VarDesc.VarType.FETCH_LIST, + persistable=True, + ) + + for i, name in enumerate(fetch_target_names): + global_block.append_op( + type='fetch', + inputs={'X': [name]}, + outputs={'Out': [fetch_var]}, + attrs={'col': i}, + ) + + def normalize_program(program, feed_vars, fetch_vars): """ @@ -200,8 +249,7 @@ def normalize_program(program, feed_vars, fetch_vars): op._set_attr(device_attr_name, "") if op.type == 'auc': warnings.warn( - "Be sure that you have set auc states to 0 " - "before saving inference model." + "Be sure that you have set auc states to 0 before saving inference model." 
) break @@ -521,15 +569,23 @@ def save_inference_model( program = _get_valid_program(kwargs.get('program', None)) clip_extra = kwargs.get('clip_extra', True) program = normalize_program(program, feed_vars, fetch_vars) + # serialize and save program legacy_format = kwargs.get('legacy_format', False) program_bytes = _serialize_program( program._remove_training_info(clip_extra=clip_extra), legacy_format=legacy_format, ) + save_to_file(model_path, program_bytes) vars = list(filter(is_persistable, program.list_vars())) + + if len(list(vars)) == 0: + warnings.warn( + "no variable in your model, please ensure there are any variables in your model to save" + ) + if len(vars) > 0: save_dirname = os.path.dirname(params_path) params_filename = os.path.basename(params_path) @@ -832,7 +888,9 @@ def load_inference_model(path_prefix, executor, **kwargs): else: # check and norm path_prefix path_prefix = _normalize_path_prefix(path_prefix) - + dir_path = os.path.dirname(path_prefix) + if not os.path.isdir(dir_path): + raise ValueError(f"There is no directory named {dir_path}") # set model_path and params_path in new way, # path_prefix represents a file path without suffix in this case. 
if not kwargs: @@ -867,6 +925,7 @@ def load_inference_model(path_prefix, executor, **kwargs): model_path, params_path ) ) + program_bytes = load_from_file(model_path) # deserialize bytes to program @@ -876,6 +935,7 @@ def load_inference_model(path_prefix, executor, **kwargs): if len(vars) > 0: load_dirname = os.path.dirname(params_path) params_filename = os.path.basename(params_path) + load_vars( executor, dirname=load_dirname, @@ -1139,6 +1199,9 @@ def name_has_fc(var): else: vars_from_memory = True + if filename == '': + filename = None + if vars is None: if main_program is None: main_program = default_main_program() diff --git a/test/book/notest_understand_sentiment.py b/test/book/notest_understand_sentiment.py index 8d7cde66bce56..606c0f2542d85 100644 --- a/test/book/notest_understand_sentiment.py +++ b/test/book/notest_understand_sentiment.py @@ -110,8 +110,8 @@ def train_loop(main_program): print("cost=" + str(cost_val) + " acc=" + str(acc_val)) if cost_val < 0.4 and acc_val > 0.8: if save_dirname is not None: - fluid.io.save_inference_model( - save_dirname, ["words"], prediction, exe + paddle.static.io.save_inference_model( + save_dirname, data, prediction, exe ) return if math.isnan(float(cost_val)): @@ -153,7 +153,7 @@ def infer(word_dict, use_cuda, save_dirname=None): inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, + # Use paddle.static.io.load_inference_model to obtain the inference program desc, # the feed_target_names (the names of variables that will be fed # data using feed operators), and the fetch_targets (variables that # we want to obtain data from using fetch operators). 
@@ -161,7 +161,7 @@ def infer(word_dict, use_cuda, save_dirname=None): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(save_dirname, exe) + ] = paddle.static.io.load_inference_model(save_dirname, exe) word_dict_len = len(word_dict) diff --git a/test/book/test_image_classification.py b/test/book/test_image_classification.py index 9396e15fa03a4..bb8c45ea9011c 100644 --- a/test/book/test_image_classification.py +++ b/test/book/test_image_classification.py @@ -188,8 +188,8 @@ def train_loop(main_program): ) if acc_value > 0.01: # Low threshold for speeding up CI - fluid.io.save_inference_model( - save_dirname, ["pixel"], [predict], exe + paddle.static.io.save_inference_model( + save_dirname, images, [predict], exe ) return @@ -228,7 +228,7 @@ def infer(use_cuda, save_dirname=None): inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, + # Use paddle.static.io.load_inference_model to obtain the inference program desc, # the feed_target_names (the names of variables that will be fed # data using feed operators), and the fetch_targets (variables that # we want to obtain data from using fetch operators). @@ -236,7 +236,7 @@ def infer(use_cuda, save_dirname=None): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(save_dirname, exe) + ] = paddle.static.io.load_inference_model(save_dirname, exe) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range [0, 1.0]. 
@@ -252,13 +252,16 @@ def infer(use_cuda, save_dirname=None): ) print("infer results: ", results[0]) - - fluid.io.save_inference_model( + feeded_vars = [ + inference_program.global_block().var(name) + for name in feed_target_names + ] + paddle.static.io.save_inference_model( save_dirname, - feed_target_names, + feeded_vars, fetch_targets, exe, - inference_program, + program=inference_program, ) @@ -269,7 +272,7 @@ def main(net_type, use_cuda, is_local=True): # Directory for saving the trained model temp_dir = tempfile.TemporaryDirectory() save_dirname = os.path.join( - temp_dir.name, "image_classification_" + net_type + ".inference.model" + temp_dir.name, "image_classification_" + net_type + "_inference_model" ) train(net_type, use_cuda, save_dirname, is_local) diff --git a/test/book/test_recognize_digits.py b/test/book/test_recognize_digits.py index 5a64bcd3ed572..aff66af640031 100644 --- a/test/book/test_recognize_digits.py +++ b/test/book/test_recognize_digits.py @@ -137,23 +137,18 @@ def train_loop(main_program): if float(acc_val) > 0.2 or pass_id == (PASS_NUM - 1): # Smaller value to increase CI speed if save_dirname is not None: - fluid.io.save_inference_model( + paddle.static.io.save_inference_model( save_dirname, - ["img"], + img, [prediction], exe, - model_filename=model_filename, - params_filename=params_filename, ) if save_full_dirname is not None: - fluid.io.save_inference_model( + paddle.static.save_inference_model( save_full_dirname, [], [], exe, - model_filename=model_filename, - params_filename=params_filename, - export_for_deployment=False, ) return else: @@ -206,7 +201,7 @@ def infer( inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, + # Use paddle.static.io.load_inference_model to obtain the inference program desc, # the feed_target_names (the names of variables that will be feeded # data using feed operators), and the fetch_targets (variables 
that # we want to obtain data from using fetch operators). @@ -214,8 +209,9 @@ def infer( inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - save_dirname, exe, model_filename, params_filename + ] = paddle.static.io.load_inference_model( + save_dirname, + exe, ) # The input's dimension of conv should be 4-D or 5-D. @@ -241,11 +237,13 @@ def main(use_cuda, parallel, nn_type, combine): model_filename = None params_filename = None if not use_cuda and not parallel: - save_dirname = "recognize_digits_" + nn_type + ".inference.model" - save_full_dirname = "recognize_digits_" + nn_type + ".train.model" + save_dirname = "recognize_digits_" + nn_type + "_inference_model" + save_full_dirname = "recognize_digits_" + nn_type + "_train_model" if combine: model_filename = "__model_combined__" params_filename = "__params_combined__" + save_dirname = save_dirname + model_filename + save_full_dirname = params_filename + params_filename # call train() with is_local argument to run distributed train train( diff --git a/test/book/test_recommender_system.py b/test/book/test_recommender_system.py index 5087e8a0b370f..8912467413be1 100644 --- a/test/book/test_recommender_system.py +++ b/test/book/test_recommender_system.py @@ -213,6 +213,15 @@ def train(use_cuda, save_dirname, is_local=True): 'movie_title', 'score', ] + feed_infer_order = [ + 'user_id', + 'gender_id', + 'age_id', + 'job_id', + 'movie_id', + 'category_id', + 'movie_title', + ] def train_loop(main_program): exe.run(framework.default_startup_program()) @@ -220,6 +229,10 @@ def train_loop(main_program): feed_list = [ main_program.global_block().var(var_name) for var_name in feed_order ] + feed_infer_list = [ + main_program.global_block().var(var_name) + for var_name in feed_infer_order + ] feeder = fluid.DataFeeder(feed_list, place) PASS_NUM = 100 @@ -248,17 +261,9 @@ def train_loop(main_program): if test_avg_cost < 6.0: # if avg_cost less than 6.0, we think our code is good. 
if save_dirname is not None: - fluid.io.save_inference_model( + paddle.static.io.save_inference_model( save_dirname, - [ - "user_id", - "gender_id", - "age_id", - "job_id", - "movie_id", - "category_id", - "movie_title", - ], + feed_infer_list, [scale_infer], exe, ) @@ -302,7 +307,7 @@ def infer(use_cuda, save_dirname=None): inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, + # Use paddle.static.io.load_inference_model to obtain the inference program desc, # the feed_target_names (the names of variables that will be fed # data using feed operators), and the fetch_targets (variables that # we want to obtain data from using fetch operators). @@ -310,7 +315,7 @@ def infer(use_cuda, save_dirname=None): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(save_dirname, exe) + ] = paddle.static.io.load_inference_model(save_dirname, exe) # Use the first data from paddle.dataset.movielens.test() as input assert feed_target_names[0] == "user_id" diff --git a/test/book/test_word2vec_book.py b/test/book/test_word2vec_book.py index 2a511191743d5..f971db41dbbc3 100644 --- a/test/book/test_word2vec_book.py +++ b/test/book/test_word2vec_book.py @@ -159,9 +159,9 @@ def train_loop(main_program): ) if avg_cost_np[0] < 5.0: if save_dirname is not None and not pure_bf16: - fluid.io.save_inference_model( + paddle.static.io.save_inference_model( save_dirname, - ['firstw', 'secondw', 'thirdw', 'forthw'], + [first_word, second_word, third_word, forth_word], [predict_word], exe, ) @@ -205,15 +205,16 @@ def infer(target, save_dirname=None): exe = fluid.Executor(place) inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, + # Use paddle.static.io.load_inference_model to obtain the inference program desc, # the feed_target_names (the names of variables that 
will be fed # data using feed operators), and the fetch_targets (variables that # we want to obtain data from using fetch operators). + [ inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(save_dirname, exe) + ] = paddle.static.io.load_inference_model(save_dirname, exe) word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) @@ -272,7 +273,8 @@ def to_infer_tensor(lod_tensor): infer_inputs = [to_infer_tensor(t) for t in infer_inputs] infer_config = fluid.core.NativeConfig() - infer_config.model_dir = save_dirname + infer_config.prog_file = save_dirname + ".pdmodel" + infer_config.param_file = save_dirname + ".pdiparams" if target == "cuda": infer_config.use_gpu = True infer_config.device = 0 @@ -300,7 +302,7 @@ def main(target, is_sparse, is_parallel, use_bf16, pure_bf16): temp_dir = tempfile.TemporaryDirectory() if not is_parallel: - save_dirname = os.path.join(temp_dir.name, "word2vec.inference.model") + save_dirname = os.path.join(temp_dir.name, "word2vec_inference_model") else: save_dirname = None diff --git a/test/cinn/fake_model/resnet_model.py b/test/cinn/fake_model/resnet_model.py index 1a9bf25ece435..4e61e46db462d 100644 --- a/test/cinn/fake_model/resnet_model.py +++ b/test/cinn/fake_model/resnet_model.py @@ -14,7 +14,7 @@ import paddle -from paddle import fluid, static +from paddle import static paddle.enable_static() @@ -48,7 +48,5 @@ exe.run(static.default_startup_program()) static.io.save_inference_model("./resnet_model", [resnet_input], [temp7], exe) -fluid.io.save_inference_model( - "./resnet_model_1", [resnet_input.name], [temp7], exe -) +static.io.save_inference_model("./resnet_model_1", [resnet_input], [temp7], exe) print('res', temp7.name) diff --git a/test/cinn/test_paddle_model_convertor.py b/test/cinn/test_paddle_model_convertor.py index af80736f4662c..5e696785fb50f 100644 --- a/test/cinn/test_paddle_model_convertor.py +++ b/test/cinn/test_paddle_model_convertor.py @@ -81,7 +81,7 @@ # 
exe.run(paddle.static.default_startup_program()) # prog = paddle.static.default_main_program() -# paddle.fluid.io.save_inference_model("./stack", [x.name, y.name], [prediction], exe, prog) +# paddle.static.io.save_inference_model("./stack", [x.name, y.name], [prediction], exe, prog) # ``` # Second load and run model like: # ``` @@ -150,11 +150,9 @@ def load_paddle_program(self): self.inference_program, self.feed_names, self.fetch_targets, - ] = paddle.fluid.io.load_inference_model( - dirname=self.model_dir, + ] = paddle.static.io.load_inference_model( + path_prefix=self.model_dir, executor=self.exe, - model_filename=self.model_filename, - params_filename=self.params_filename, ) self.param_vars = paddle.load( diff --git a/test/contrib/test_image_classification_fp16.py b/test/contrib/test_image_classification_fp16.py index 67e1189c0e5d6..9aac48b5fe2dd 100644 --- a/test/contrib/test_image_classification_fp16.py +++ b/test/contrib/test_image_classification_fp16.py @@ -217,12 +217,12 @@ def train_loop(main_program): ) if acc_value > 0.08: # Low threshold for speeding up CI - fluid.io.save_inference_model( + paddle.static.io.save_inference_model( save_dirname, - ["pixel"], + images, [predict], exe, - main_program=train_program, + program=train_program, clip_extra=True, ) return @@ -262,7 +262,7 @@ def infer(use_cuda, save_dirname=None): inference_scope = fluid.core.Scope() with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, + # Use paddle.static.io.load_inference_model to obtain the inference program desc, # the feed_target_names (the names of variables that will be fed # data using feed operators), and the fetch_targets (variables that # we want to obtain data from using fetch operators). 
@@ -270,7 +270,7 @@ def infer(use_cuda, save_dirname=None): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model(save_dirname, exe) + ] = paddle.static.io.load_inference_model(save_dirname, exe) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range [0, 1.0]. @@ -287,12 +287,12 @@ def infer(use_cuda, save_dirname=None): print("infer results: ", results[0]) - fluid.io.save_inference_model( + paddle.static.save_inference_model( save_dirname, feed_target_names, fetch_targets, exe, - inference_program, + parogram=inference_program, clip_extra=True, ) diff --git a/test/dygraph_to_static/test_bert.py b/test/dygraph_to_static/test_bert.py index 8d26ee57a441d..431c9425c8f60 100644 --- a/test/dygraph_to_static/test_bert.py +++ b/test/dygraph_to_static/test_bert.py @@ -181,7 +181,7 @@ def predict_static(self, data): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( + ] = paddle.static.io.load_inference_model( self.model_save_dir, executor=exe, model_filename=self.model_filename, diff --git a/test/dygraph_to_static/test_bmn.py b/test/dygraph_to_static/test_bmn.py index 8203bcec450a8..4c0987941f9cc 100644 --- a/test/dygraph_to_static/test_bmn.py +++ b/test/dygraph_to_static/test_bmn.py @@ -855,7 +855,7 @@ def predict_static(self, data): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( + ] = paddle.static.io.load_inference_model( self.model_save_dir, executor=exe, model_filename=self.model_filename, diff --git a/test/dygraph_to_static/test_lac.py b/test/dygraph_to_static/test_lac.py index 720727e4a4fe4..edcb49452277b 100644 --- a/test/dygraph_to_static/test_lac.py +++ b/test/dygraph_to_static/test_lac.py @@ -667,7 +667,7 @@ def predict_static(self, batch): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( + ] = paddle.static.io.load_inference_model( 
self.model_save_dir, executor=exe, model_filename=self.model_filename, diff --git a/test/dygraph_to_static/test_mnist.py b/test/dygraph_to_static/test_mnist.py index 2cb556680427b..0f276f52d2485 100644 --- a/test/dygraph_to_static/test_mnist.py +++ b/test/dygraph_to_static/test_mnist.py @@ -292,8 +292,8 @@ def jit_load_and_run_inference_static( inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - dirname=model_path, + ] = paddle.static.io.load_inference_model( + path_prefix=model_path, executor=exe, model_filename=model_filename, params_filename=params_filename, diff --git a/test/dygraph_to_static/test_mobile_net.py b/test/dygraph_to_static/test_mobile_net.py index db8d420094e15..649f00c3eec0a 100644 --- a/test/dygraph_to_static/test_mobile_net.py +++ b/test/dygraph_to_static/test_mobile_net.py @@ -608,7 +608,7 @@ def predict_static(args, data): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( + ] = paddle.static.io.load_inference_model( args.model_save_dir, executor=exe, model_filename=args.model_filename, diff --git a/test/dygraph_to_static/test_resnet.py b/test/dygraph_to_static/test_resnet.py index 9299922f91dc0..72d0e19518f10 100644 --- a/test/dygraph_to_static/test_resnet.py +++ b/test/dygraph_to_static/test_resnet.py @@ -355,7 +355,7 @@ def predict_static(self, data): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( + ] = paddle.static.io.load_inference_model( self.model_save_dir, executor=exe, model_filename=self.model_filename, diff --git a/test/dygraph_to_static/test_save_inference_model.py b/test/dygraph_to_static/test_save_inference_model.py index 231dd38bccdfb..b050c201c13ca 100644 --- a/test/dygraph_to_static/test_save_inference_model.py +++ b/test/dygraph_to_static/test_save_inference_model.py @@ -130,8 +130,8 @@ def load_and_run_inference( inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - 
dirname=model_path, + ] = paddle.static.io.load_inference_model( + path_prefix=model_path, executor=exe, model_filename=model_filename, params_filename=params_filename, diff --git a/test/dygraph_to_static/test_se_resnet.py b/test/dygraph_to_static/test_se_resnet.py index 5e0eb039188d9..0aeac4ac80b80 100644 --- a/test/dygraph_to_static/test_se_resnet.py +++ b/test/dygraph_to_static/test_se_resnet.py @@ -493,7 +493,7 @@ def predict_static(self, data): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( + ] = paddle.static.io.load_inference_model( self.model_save_dir, executor=exe, model_filename=self.model_filename, diff --git a/test/ir/inference/inference_pass_test.py b/test/ir/inference/inference_pass_test.py index cee27eca7d28a..5cbf69b3527be 100644 --- a/test/ir/inference/inference_pass_test.py +++ b/test/ir/inference/inference_pass_test.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import errno import os import random import tempfile @@ -57,13 +58,36 @@ def _save_models( self, dirname, feeded_var_names, target_vars, executor, program, scope ): with fluid.scope_guard(scope): - # save models as combined to ensure that - # there won't be too many useless files - # after finishing a couple of tests. - fluid.io.save_inference_model( - dirname, feeded_var_names, target_vars, executor, program + # save models as combined but sometimes params is null + # To adapt to this situation, the path needs to be adjusted to the old version format. 
+ feeded_vars = [] + for var in program.list_vars(): + if var.name in feeded_var_names: + feeded_vars.append(var) + + paddle.static.io.save_inference_model( + dirname, + feeded_vars, + target_vars, + executor, + program=program, ) + # if the param save is null + # replace model_path to old version + param_file = dirname + ".pdiparams" + if not os.path.exists(param_file): + model_path = dirname + ".pdmodel" + try: + save_dirname = os.path.normpath(dirname) + os.makedirs(save_dirname) + except OSError as e: + if e.errno != errno.EEXIST: + raise + model_path_old = os.path.join(save_dirname, "__model__") + if not os.path.exists(model_path_old): + os.rename(model_path, model_path_old) + def _get_paddle_outs(self, executor, program, scope): ''' Return PaddlePaddle outputs. @@ -109,7 +133,14 @@ def _get_analysis_config( ''' Return a new object of AnalysisConfig. ''' - config = AnalysisConfig(self.path) + # To adapt to save_inference_model + param_file = self.path + ".pdiparams" + if not os.path.exists(param_file): + config = AnalysisConfig(self.path) + else: + config = AnalysisConfig( + self.path + ".pdmodel", self.path + ".pdiparams" + ) config.disable_gpu() config.switch_specify_input_names(True) config.switch_ir_optim(True) diff --git a/test/ir/inference/program_config.py b/test/ir/inference/program_config.py index 601ddfdc23833..9df3359c3cc30 100644 --- a/test/ir/inference/program_config.py +++ b/test/ir/inference/program_config.py @@ -432,7 +432,7 @@ def create_quant_model( inference_program, feed_target_names, fetch_targets, - ] = paddle.static.load_inference_model( + ] = paddle.static.io.load_inference_model( path_prefix=None, executor=exe, model_filename=model, @@ -596,18 +596,19 @@ def _get_op_output_var_names(op): tensor = scope.var(var_name).get_tensor() tensor.set(np.ones(tensor.shape(), dtype=np.float32), place) + feed_vars = [ + main_program.global_block().var(name) for name in feed_target_names + ] + if save: - fluid.io.save_inference_model( + 
paddle.static.io.save_inference_model( 'test_inference_model', - feed_target_names, + feed_vars, fetch_targets, exe, - main_program=main_program, + program=main_program, ) - feed_vars = [ - main_program.global_block().var(name) for name in feed_target_names - ] serialized_program = paddle.static.serialize_program( feed_vars, fetch_targets, program=main_program ) diff --git a/test/ir/inference/quant_dequant_test.py b/test/ir/inference/quant_dequant_test.py index d2b8667ee67b8..32155b8d22d38 100644 --- a/test/ir/inference/quant_dequant_test.py +++ b/test/ir/inference/quant_dequant_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import errno +import os import random import unittest import warnings @@ -23,7 +25,7 @@ from paddle.fluid import Program, Variable, core from paddle.fluid.core import AnalysisConfig, create_paddle_predictor from paddle.fluid.framework import IrGraph -from paddle.fluid.io import append_fetch_ops, prepend_feed_ops +from paddle.static.io import append_fetch_ops, prepend_feed_ops from paddle.static.quantization import ( AddQuantDequantPass, OutScaleForInferencePass, @@ -51,7 +53,7 @@ def __init__(self, methodName='runTest'): self.dynamic_shape_params = None self.enable_lite = False self.lite_parameters = None - self.path = "./inference_pass/" + self.__class__.__name__ + "/" + self.path = "./inference_pass/" + self.__class__.__name__ self.data = None self.label = None self.result = None @@ -118,15 +120,36 @@ def _normalize_program(self, program, feed_vars, fetch_vars): def _save_models( self, dirname, feeded_var_names, target_vars, executor, program, scope ): + # save models as combined but sometimes params is null + # To adapt to this situation, the path needs to be adjusted to the old version format. 
+ feeded_vars = [] + for var in program.list_vars(): + if var.name in feeded_var_names: + feeded_vars.append(var) + with fluid.scope_guard(scope): - fluid.io.save_inference_model( + paddle.static.io.save_inference_model( dirname, - feeded_var_names, + feeded_vars, target_vars, executor, - program, + program=program, clip_extra=True, ) + # if the param save is null + # replace model_path to old version + param_file = dirname + ".pdiparams" + if not os.path.exists(param_file): + model_path = dirname + ".pdmodel" + try: + save_dirname = os.path.normpath(dirname) + os.makedirs(save_dirname) + except OSError as e: + if e.errno != errno.EEXIST: + raise + model_path_old = os.path.join(save_dirname, "__model__") + if not os.path.exists(model_path_old): + os.rename(model_path, model_path_old) def _get_paddle_outs(self, feed, fetch_list, executor, program, scope): ''' @@ -172,7 +195,14 @@ def _get_analysis_config( ''' Return a new object of AnalysisConfig. ''' - config = AnalysisConfig(self.path) + # To adapt to save_inference_model + param_file = self.path + ".pdiparams" + if not os.path.exists(param_file): + config = AnalysisConfig(self.path) + else: + config = AnalysisConfig( + self.path + ".pdmodel", self.path + ".pdiparams" + ) config.disable_gpu() config.switch_specify_input_names(True) config.switch_ir_optim(True) diff --git a/test/legacy_test/test_dist_base.py b/test/legacy_test/test_dist_base.py index 5f3b335e03e32..a1ea2d40a4b0f 100755 --- a/test/legacy_test/test_dist_base.py +++ b/test/legacy_test/test_dist_base.py @@ -435,10 +435,10 @@ def get_data(): model_save_dir, "fleet_persistables" ) infer_save_dir_fluid = os.path.join( - model_save_dir, "fluid_infer" + model_save_dir, "fluid_infer/infer" ) infer_save_dir_fleet = os.path.join( - model_save_dir, "fleet_infer" + model_save_dir, "fleet_infer/infer" ) else: model_save_dir_fluid = os.path.join( @@ -448,25 +448,24 @@ def get_data(): model_save_dir, "fleet_persistables_2" ) infer_save_dir_fluid = os.path.join( - 
model_save_dir, "fluid_infer_2" + model_save_dir, "fluid_infer_2/infer_2" ) infer_save_dir_fleet = os.path.join( - model_save_dir, "fleet_infer_2" + model_save_dir, "fleet_infer_2/infer_2" ) paddle.distributed.io.save_persistables( exe, model_save_dir_fluid, fleet._origin_program ) fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet) - feeded_var_names = [var.name for var in feed_var_list] - fluid.io.save_inference_model( - infer_save_dir_fluid, - feeded_var_names, - [avg_cost], - exe, - fleet._origin_program, + paddle.static.io.save_inference_model( + path_prefix=infer_save_dir_fluid, + feed_vars=feed_var_list, + fetch_vars=[avg_cost], + executor=exe, + program=fleet._origin_program, ) fleet.save_inference_model( - exe, infer_save_dir_fleet, feeded_var_names, [avg_cost] + exe, infer_save_dir_fleet, feed_var_list, [avg_cost] ) def run_trainer(self, args): diff --git a/test/legacy_test/test_dist_mnist_fleet_save.py b/test/legacy_test/test_dist_mnist_fleet_save.py index c00fffd9d36f2..0cee8d58cc677 100644 --- a/test/legacy_test/test_dist_mnist_fleet_save.py +++ b/test/legacy_test/test_dist_mnist_fleet_save.py @@ -68,7 +68,6 @@ def _test_saved_files(self, dirname): if fluid_persistables[i] != fleet_persistables[i]: self._rm_temp_files(dirname) raise ValueError("Test Failed.") - if len(fluid_infer_files) != len(fleet_infer_files): self._rm_temp_files(dirname) raise ValueError("Test Failed.") diff --git a/test/legacy_test/test_eager_deletion_padding_rnn.py b/test/legacy_test/test_eager_deletion_padding_rnn.py index 3faa050fcea97..874731e35d27a 100644 --- a/test/legacy_test/test_eager_deletion_padding_rnn.py +++ b/test/legacy_test/test_eager_deletion_padding_rnn.py @@ -314,7 +314,7 @@ def encoder_static( paddle.assign(last_cell, output=init_cell) paddle.assign(last_hidden, output=init_hidden) - feeding_list = ['x', 'y', 'init_hidden', 'init_cell'] + feeding_list = [x, y, init_hidden, init_cell] return loss, last_hidden, last_cell, feeding_list @@ -365,7 
+365,7 @@ def _prepare_program(self, config): self.loss, self.last_hidden, self.last_cell, - self.feed_order, + self.feed_list, ) = res_vars paddle.nn.clip.set_gradient_clip( diff --git a/test/legacy_test/test_executor_and_use_program_cache.py b/test/legacy_test/test_executor_and_use_program_cache.py index f4cc24e0604a5..f209407fcf2f2 100644 --- a/test/legacy_test/test_executor_and_use_program_cache.py +++ b/test/legacy_test/test_executor_and_use_program_cache.py @@ -88,13 +88,12 @@ def train_and_save_inference_program( config = RNNConfig("test", rnn_model) with fluid.scope_guard(fluid.Scope()): self.train(config, use_program_cache) - fluid.io.save_inference_model( - main_program=self.main_program, - feeded_var_names=self.feed_order, - target_vars=[self.loss, self.last_hidden, self.last_cell], + paddle.static.io.save_inference_model( + path_prefix="padding_rnn." + rnn_model + ".inference_model", + feed_vars=self.feed_list, + fetch_vars=[self.loss, self.last_hidden, self.last_cell], executor=self.exe, - dirname="padding_rnn." 
+ rnn_model + ".inference_model", - params_filename="__params__", + program=self.main_program, ) def test_inference_output(self): @@ -134,8 +133,8 @@ def test_inference_output(self): inference_program, feed_target_names, fetch_targets, - ] = fluid.io.load_inference_model( - save_dirname, self.exe, params_filename="__params__" + ] = paddle.static.io.load_inference_model( + save_dirname, self.exe ) results = self.exe.run( diff --git a/test/legacy_test/test_inference_model_io.py b/test/legacy_test/test_inference_model_io.py index f2e86a351d6fc..7db7a113b5284 100644 --- a/test/legacy_test/test_inference_model_io.py +++ b/test/legacy_test/test_inference_model_io.py @@ -29,7 +29,7 @@ from paddle.fluid import core, executor from paddle.fluid.compiler import CompiledProgram from paddle.fluid.framework import Program, program_guard -from paddle.fluid.io import load_inference_model, save_inference_model +from paddle.static.io import load_inference_model, save_inference_model paddle.enable_static() @@ -82,15 +82,15 @@ def test_fit_line_inference_model(self): ) # Separated model and unified model - save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) + save_inference_model( + MODEL_DIR, [x, y], [avg_cost], exe, program=program + ) save_inference_model( UNI_MODEL_DIR, - ["x", "y"], + [x, y], [avg_cost], exe, - program, - 'model', - 'params', + program=program, ) main_program = program.clone()._prune_with_input( feeded_var_names=["x", "y"], targets=[avg_cost] @@ -104,12 +104,22 @@ def test_fit_line_inference_model(self): importlib.reload(executor) # reload to build a new scope model_0 = InferModel(load_inference_model(MODEL_DIR, exe)) - with open(os.path.join(UNI_MODEL_DIR, 'model'), "rb") as f: + with open((UNI_MODEL_DIR + '.pdmodel'), "rb") as f: model_str = f.read() - model_1 = InferModel( - load_inference_model(None, exe, model_str, params_str) + model_1 = InferModel(load_inference_model(UNI_MODEL_DIR, exe)) + + # To be compatible with 
load_inference_model_distributed function + tmp_model_filename = MODEL_DIR + '.pdmodel' + tmp_params_filename = MODEL_DIR + '.pdiparams' + model_2 = InferModel( + load_inference_model_distributed( + root_path.name, + exe, + model_filename=tmp_model_filename, + params_filename=tmp_params_filename, + ) ) - model_2 = InferModel(load_inference_model_distributed(MODEL_DIR, exe)) + model_3 = InferModel( load_inference_model_distributed(None, exe, model_str, params_str) ) @@ -134,11 +144,11 @@ def test_fit_line_inference_model(self): self.assertRaises( ValueError, - fluid.io.load_inference_model, + paddle.static.io.load_inference_model, None, exe, - model_str, - None, + model_filename=model_str, + params_filename=None, ) self.assertRaises( ValueError, @@ -173,7 +183,9 @@ def test_save_inference_model(self): exe = executor.Executor(place) exe.run(init_program, feed={}, fetch_list=[]) - save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) + save_inference_model( + MODEL_DIR, [x, y], [avg_cost], exe, program=program + ) root_path.cleanup() def test_save_inference_model_with_auc(self): @@ -202,10 +214,10 @@ def test_save_inference_model_with_auc(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") save_inference_model( - MODEL_DIR, ["x", "y"], [avg_cost], exe, program + MODEL_DIR, [x, y], [avg_cost], exe, program=program ) root_path.cleanup() - expected_warn = "please ensure that you have set the auc states to zeros before saving inference model" + expected_warn = "Be sure that you have set auc states to 0 before saving inference model." 
self.assertTrue(len(w) > 0) self.assertTrue(expected_warn == str(w[0].message)) @@ -237,11 +249,13 @@ def test_save_inference_model(self): cp_prog = CompiledProgram(program) - save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, cp_prog) + save_inference_model( + MODEL_DIR, [x, y], [avg_cost], exe, program=cp_prog + ) self.assertRaises( TypeError, save_inference_model, - [MODEL_DIR, ["x", "y"], [avg_cost], [], cp_prog], + [MODEL_DIR, [x, y], [avg_cost], [], cp_prog], ) root_path.cleanup() @@ -535,7 +549,7 @@ def test_load_model_not_exist(self): place = core.CPUPlace() exe = executor.Executor(place) self.assertRaises( - ValueError, load_inference_model, './test_not_exist_dir', exe + ValueError, load_inference_model, './test_not_exist_dir/model', exe ) self.assertRaises( ValueError, diff --git a/test/legacy_test/test_io_save_load.py b/test/legacy_test/test_io_save_load.py index e09573b52c3f6..a6e125d3d3298 100644 --- a/test/legacy_test/test_io_save_load.py +++ b/test/legacy_test/test_io_save_load.py @@ -79,12 +79,12 @@ def test_useless_feeded_var_names(self): with self.assertRaisesRegex( ValueError, "not involved in the target_vars calculation" ): - fluid.io.save_inference_model( - dirname=os.path.join(self.temp_dir.name, 'model'), - feeded_var_names=['x', 'y'], - target_vars=[z], + paddle.static.io.save_inference_model( + path_prefix=os.path.join(self.temp_dir.name, 'model'), + feed_vars=[x, y], + fetch_vars=[z], executor=exe, - main_program=main_prog, + program=main_prog, ) diff --git a/test/legacy_test/test_load_state_dict_from_old_format.py b/test/legacy_test/test_load_state_dict_from_old_format.py index 4cb19ae90e01b..e107a03baff6c 100644 --- a/test/legacy_test/test_load_state_dict_from_old_format.py +++ b/test/legacy_test/test_load_state_dict_from_old_format.py @@ -125,13 +125,11 @@ def train_and_save_model(self): param.name ) - fluid.io.save_inference_model( + paddle.static.io.save_inference_model( self.save_dirname, - ["img"], + [img], [prediction], 
exe, - model_filename=self.model_filename, - params_filename=self.params_filename, ) return static_param_dict diff --git a/test/legacy_test/test_save_model_without_var.py b/test/legacy_test/test_save_model_without_var.py index eaeaefe0d56f2..04437ab783499 100644 --- a/test/legacy_test/test_save_model_without_var.py +++ b/test/legacy_test/test_save_model_without_var.py @@ -35,13 +35,11 @@ def test_no_var_save(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - fluid.io.save_inference_model( - dirname='test', - feeded_var_names=['data'], - target_vars=[data_plus], - executor=exe, - model_filename='model', - params_filename='params', + paddle.static.io.save_inference_model( + 'test', + data, + [data_plus], + exe, ) expected_warn = "no variable in your model, please ensure there are any variables in your model to save" self.assertTrue(len(w) > 0) diff --git a/test/quantization/convert_model2dot.py b/test/quantization/convert_model2dot.py index 4ae3e838d3141..d4f298ddcd5b9 100644 --- a/test/quantization/convert_model2dot.py +++ b/test/quantization/convert_model2dot.py @@ -56,7 +56,9 @@ def generate_dot_for_model(model_path, save_graph_dir, save_graph_name): inference_program, feed_target_names, fetch_targets, - ] = paddle.fluid.io.load_inference_model(model_path, exe) + ] = paddle.static.io.load_inference_model( + model_path, exe, model_filename='__model__' + ) else: [ inference_program, diff --git a/test/quantization/quant2_int8_image_classification_comparison.py b/test/quantization/quant2_int8_image_classification_comparison.py index 34d91851eafd8..c50a926562d27 100644 --- a/test/quantization/quant2_int8_image_classification_comparison.py +++ b/test/quantization/quant2_int8_image_classification_comparison.py @@ -195,7 +195,9 @@ def _predict( inference_program, feed_target_names, fetch_targets, - ] = paddle.fluid.io.load_inference_model(model_path, exe) + ] = paddle.static.io.load_inference_model( + model_path, exe, 
model_filename=None, params_filename=None + ) else: [ inference_program, diff --git a/test/quantization/quant_int8_image_classification_comparison.py b/test/quantization/quant_int8_image_classification_comparison.py index a1ca602406ac0..74b2b0827d870 100644 --- a/test/quantization/quant_int8_image_classification_comparison.py +++ b/test/quantization/quant_int8_image_classification_comparison.py @@ -173,7 +173,9 @@ def _predict( inference_program, feed_target_names, fetch_targets, - ] = paddle.fluid.io.load_inference_model(model_path, exe) + ] = paddle.static.io.load_inference_model( + model_path, exe, model_filename=None, params_filename=None + ) else: [ inference_program, diff --git a/tools/cinn/paddle_benchmark/paddle_save_model.py b/tools/cinn/paddle_benchmark/paddle_save_model.py index f1ec02b8cf5df..b40c5ff49a724 100755 --- a/tools/cinn/paddle_benchmark/paddle_save_model.py +++ b/tools/cinn/paddle_benchmark/paddle_save_model.py @@ -13,7 +13,7 @@ # limitations under the License. import paddle -from paddle import fluid, static +from paddle import static # For paddlepaddle version >=2.0rc, we need to set paddle.enable_static() paddle.enable_static() @@ -30,7 +30,7 @@ exe.run(static.default_startup_program()) -fluid.io.save_inference_model( - "./elementwise_add_model", [a.name, b.name], [a1], exe +paddle.static.io.save_inference_model( + "./elementwise_add_model", [a, b], [a1], exe ) print('input and output names are: ', a.name, b.name, a1.name) diff --git a/tools/cinn/paddle_benchmark/test_paddle_ops.py b/tools/cinn/paddle_benchmark/test_paddle_ops.py index 3c1465faa3a76..6cb9e806d3096 100755 --- a/tools/cinn/paddle_benchmark/test_paddle_ops.py +++ b/tools/cinn/paddle_benchmark/test_paddle_ops.py @@ -17,7 +17,7 @@ import numpy as np import paddle -from paddle import fluid, static +from paddle import static from paddle.fluid.core import AnalysisConfig, create_paddle_predictor @@ -84,7 +84,7 @@ def create_model(input_names, input_shapes, input_dtypes, fn, 
attrs=None): model_name += "_" + str(input_shapes[0][i]) print("save model:", model_name) - fluid.io.save_inference_model(model_name, input_args_names, [res], exe) + paddle.static.io.save_inference_model(model_name, input_args, [res], exe) print('output name is: ', res.name) From eadc5d07a37c426efcdb0614cb6eb7c91977e0bd Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Fri, 11 Aug 2023 12:56:12 +0800 Subject: [PATCH 010/246] =?UTF-8?q?=E3=80=90New=20IR]=20delete=20print=20p?= =?UTF-8?q?rogram=20in=20test=20and=20delete=20add=5Fn=20attribute=20c++?= =?UTF-8?q?=20interface=20to=20reply=20#56080=20(#56120)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refine program translator * fix warning: not override * fix bug * merge new modifications * modify by reviews * resolve conflicts * resolve conflicts * fix * fix * fix conflicts * pseudocode of backward * modify test * modify register op * clear other code * modify ci build bug * reply review comments * reply review comments * delete print and add_n c++ interface --------- Co-authored-by: kangguangli --- paddle/fluid/ir/dialect/pd_manual_op.h | 18 ------------------ test/ir/new_ir/test_build_op.py | 2 -- test/ir/new_ir/test_ir_backward.py | 10 ++++++++++ 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/ir/dialect/pd_manual_op.h b/paddle/fluid/ir/dialect/pd_manual_op.h index 46e34b74fdb06..ff055ea6edf8a 100644 --- a/paddle/fluid/ir/dialect/pd_manual_op.h +++ b/paddle/fluid/ir/dialect/pd_manual_op.h @@ -48,24 +48,6 @@ class AddNOp : public ir::Op { void Verify(); ir::Value inputs() { return operand_source(0); } ir::OpResult out() { return result(0); } - ir::Attribute attribute(const std::string &name) { - { - PADDLE_ENFORCE( - attributes().count(name) > 0, - phi::errors::PreconditionNotMet("Attribute is not exist.")); - return attributes().at(name); - } - } - template - T attribute(const 
std::string &name) { - { - PADDLE_ENFORCE( - attributes().count(name) > 0 && attributes().at(name).isa(), - phi::errors::PreconditionNotMet("Attribute is not right.")); - return attributes().at(name).dyn_cast(); - } - } - static void InferMeta(phi::InferMetaContext *infer_meta); }; diff --git a/test/ir/new_ir/test_build_op.py b/test/ir/new_ir/test_build_op.py index 5de3b8229a050..c49b0ae14939c 100644 --- a/test/ir/new_ir/test_build_op.py +++ b/test/ir/new_ir/test_build_op.py @@ -43,7 +43,6 @@ def test_build_mean_op(self): paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True}) with paddle.ir.core.program_guard(newir_program): out = paddle.mean(tanh_out) - print(newir_program) self.assertEqual(out.get_defining_op().name(), "pd.mean") self.assertEqual( out.get_defining_op() @@ -65,7 +64,6 @@ def test_build_add_n_op(self): out1 = paddle.mean(tanh_out) out2 = paddle.mean(tanh_out) out = paddle.add_n([out1, out2]) - print(newir_program) self.assertEqual(out.get_defining_op().name(), "pd.add_n") self.assertEqual( out.get_defining_op() diff --git a/test/ir/new_ir/test_ir_backward.py b/test/ir/new_ir/test_ir_backward.py index 8c93162f105e7..2cd375468f263 100644 --- a/test/ir/new_ir/test_ir_backward.py +++ b/test/ir/new_ir/test_ir_backward.py @@ -88,6 +88,16 @@ def test_2(self): print(newir_program) self.assertEqual(newir_program.block().ops[-3].name(), "pd.full") + self.assertEqual(input_grad[0].get_defining_op().name(), "pd.tanh_grad") + self.assertEqual( + input_grad[0] + .get_defining_op() + .operands()[1] + .source() + .get_defining_op() + .name(), + "pd.mean_grad", + ) paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) # TODO(Ruting) test add_n op when add_n api and add_grad finished From ee0034575ca022d212acc9aa1d2b6f64153e8d39 Mon Sep 17 00:00:00 2001 From: wz1qqx <55830058+wz1qqx@users.noreply.github.com> Date: Thu, 10 Aug 2023 22:57:25 -0700 Subject: [PATCH 011/246] [XPU]Add flip kernel (#55932) --- .../ir/xpu/add_layernorm_xpu_fuse_pass.cc | 3 
+- paddle/phi/backends/xpu/xpu2_op_list.cc | 4 +- .../fusion/xpu/add_layernorm_xpu_kernel.cc | 3 +- paddle/phi/kernels/xpu/flip_kernel.cc | 61 +++++++++++++++++++ 4 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 paddle/phi/kernels/xpu/flip_kernel.cc diff --git a/paddle/fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc index 5e50b762e8cd2..7a3a826fc7133 100644 --- a/paddle/fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc @@ -91,7 +91,8 @@ AddLayernormXPUPattern::AddLayernormXPUPattern(PDPattern* pattern, ->AsInput(); auto ele_out = pattern->NewNode(ele_out_repr()) ->assert_is_op_output("elementwise_add", "Out") - ->assert_is_op_input("layer_norm", "X"); + ->assert_is_op_input("layer_norm", "X") + ->assert_has_n_outputs(1); ele_add->LinksFrom({add_x, add_y}).LinksTo({ele_out}); auto l_norm = pattern->NewNode(l_norm_repr())->assert_is_op("layer_norm"); auto norm_bias = pattern->NewNode(norm_bias_repr()) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index bb22e15d43c6a..f74b5d1edc9c3 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -24,7 +24,8 @@ XPUOpMap& get_kl2_ops() { static XPUOpMap s_xpu2_kernels{ {"add_act_xpu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, - {"add_layernorm_xpu", XPUKernelSet({phi::DataType::FLOAT32})}, + {"add_layernorm_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"abs", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"abs_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, @@ -371,6 +372,7 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT32, phi::DataType::INT8, phi::DataType::FLOAT32})}, + {"flip", XPUKernelSet({phi::DataType::FLOAT32})}, {"full_batch_size_like", XPUKernelSet({phi::DataType::INT64, 
phi::DataType::INT32, diff --git a/paddle/phi/kernels/fusion/xpu/add_layernorm_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/add_layernorm_xpu_kernel.cc index 616e81c138c94..a3a524d3e8802 100644 --- a/paddle/phi/kernels/fusion/xpu/add_layernorm_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/add_layernorm_xpu_kernel.cc @@ -119,4 +119,5 @@ PD_REGISTER_KERNEL(add_layernorm_xpu, XPU, ALL_LAYOUT, phi::fusion::AddLayernormXPUKernel, - float) {} + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/flip_kernel.cc b/paddle/phi/kernels/xpu/flip_kernel.cc new file mode 100644 index 0000000000000..3311fce88bc1d --- /dev/null +++ b/paddle/phi/kernels/xpu/flip_kernel.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/flip_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + using XPUInTDType = typename XPUTypeTrait::Type; + int x_rank = x.dims().size(); + std::vector formated_axis(std::begin(axis), std::end(axis)); + for (size_t i = 0; i < axis.size(); i++) { + if (axis[i] < 0) { + formated_axis[i] = static_cast(axis[i] + x_rank); + } + } + dev_ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + if (formated_axis.size() == 0) { + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + return; + } + std::vector x_shape = phi::vectorize(x.dims()); + auto x_data = reinterpret_cast(x.data()); + auto out_data = reinterpret_cast(out->data()); + auto numel = x.numel(); + if (numel <= 0) { + return; + } + int r = xpu::flip( + /* Context* ctx */ dev_ctx.x_context(), + /* const T* x */ x_data, + /* T* y */ out_data, + /* const std::vector& xshape */ x_shape, + /* const std::vector& axis */ formated_axis); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "flip"); +} + +} // namespace phi + +PD_REGISTER_KERNEL(flip, XPU, ALL_LAYOUT, phi::FlipKernel, float) {} From 24771dd630708a198ac3ae13e0a88bb4b8690c0e Mon Sep 17 00:00:00 2001 From: 6clc Date: Fri, 11 Aug 2023 14:01:26 +0800 Subject: [PATCH 012/246] ci(CINN): remove proxy off in scripts,test=document_fix (#56161) --- tools/cinn/build.sh | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tools/cinn/build.sh b/tools/cinn/build.sh index 6f55f185efe92..e2c1d0d0025d2 100755 --- a/tools/cinn/build.sh +++ b/tools/cinn/build.sh @@ -54,12 +54,6 @@ OLD_HTTP_PROXY=$http_proxy &> /dev/null OLD_HTTPS_PROXY=$https_proxy &> /dev/null set -x -function proxy_off { - set +x - unset http_proxy &> /dev/null - unset https_proxy &> /dev/null - set -x -} function proxy_on { set +x export http_proxy=$OLD_HTTP_PROXY 
&> /dev/null @@ -70,10 +64,6 @@ function proxy_on { function prepare_ci { cd $workspace proxy_on - if [[ ! -z ${PULL_ID} ]]; then - # in ci environment, we use aliyun ubuntu mirror, thus turn off proxy - proxy_off - fi if [[ $(command -v python) == $build_dir/ci-env/bin/python ]]; then return @@ -97,7 +87,6 @@ function prepare_ci { function cmake_ { - proxy_off mkdir -p $build_dir cd $build_dir set -x @@ -109,7 +98,6 @@ function cmake_ { } function _download_and_untar { - proxy_off local tar_file=$1 if [[ ! -f $tar_file ]]; then wget https://paddle-inference-dist.bj.bcebos.com/CINN/$tar_file @@ -118,7 +106,6 @@ function _download_and_untar { } function prepare_model { - proxy_off cd $build_dir/third_party _download_and_untar ResNet18.tar.gz @@ -203,7 +190,6 @@ function CINNRT { prepare_ci - proxy_off mkdir -p $build_dir cd $build_dir set -x From 533b62ff563310fe4765c0decd3278f9107bac56 Mon Sep 17 00:00:00 2001 From: Charles-hit <56987902+Charles-hit@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:12:01 +0800 Subject: [PATCH 013/246] [PRIM][IR]fix comment for vjp (#56137) * [prim][newir] add basic framework for primitive * support desctensor in new ir * add vjp interface * support vjp in new ir * support vjp in new ir * polish vjp interface * fix stop_gradients set * fix vjp dispatch * add comment * add vjp test for new ir * add test for tanh vjp * [prim][newir] add basic framework for primitive * support desctensor in new ir * support vjp in new ir * support vjp in new ir * polish vjp interface * fix stop_gradients set * fix vjp dispatch * add comment * add vjp test for new ir * add test for tanh vjp * add eager and static backend for warp lower level api * support call_vjp pybind * polish code and add test for vjp * remove useless code * polish code * remove useless code * support mean vjp * add test for mean vjp and support has_vjp function * fix call_vjp * polish code * add primitive ops set for backend * add vjp test for tanh_ * fix inference CI * fix 
inference ci * modify fluid cmake * remove useless deps * add cmake * fix comment * fix test * polish code * modify backward stop_gradients * modify static_backend.cc * remove useless code --------- Co-authored-by: cxxly Co-authored-by: zhangbo9674 --- .../dialect/op_generator/op_interface_gen.py | 2 +- paddle/fluid/ir/dialect/pd_api.cc | 2 +- paddle/fluid/ir/dialect/pd_api.h | 2 +- paddle/fluid/ir/dialect/pd_dialect.h | 3 ++- paddle/fluid/ir/dialect/pd_op_vjp_manual.cc | 23 ++++++++++--------- paddle/fluid/ir/interface/vjp.h | 8 +++---- paddle/fluid/primitive/backend/CMakeLists.txt | 2 +- .../fluid/primitive/backend/static_backend.cc | 4 ++-- .../fluid/primitive/backend/static_backend.h | 4 +++- paddle/fluid/primitive/rule/vjp/vjp.cc | 12 ++++------ paddle/fluid/primitive/rule/vjp/vjp.h | 9 +++++--- paddle/fluid/primitive/type/desc_tensor.h | 3 --- paddle/fluid/pybind/pybind.cc | 4 ++-- python/paddle/autograd/backward.py | 4 ++-- test/cpp/prim/test_vjp.cc | 6 ++--- test/ir/new_ir/test_ir_vjp.py | 8 +++---- 16 files changed, 49 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/ir/dialect/op_generator/op_interface_gen.py b/paddle/fluid/ir/dialect/op_generator/op_interface_gen.py index fb22aa2e9b25b..4833111c9d2ab 100644 --- a/paddle/fluid/ir/dialect/op_generator/op_interface_gen.py +++ b/paddle/fluid/ir/dialect/op_generator/op_interface_gen.py @@ -40,5 +40,5 @@ def gen_exclusive_interface_str(op_info): " static void InferMeta( phi::InferMetaContext *infer_meta );" ) if op_info.op_phi_name[0] in vjp_interface_gen_op_list: - exclusive_interface_str += "\n static std::vector> Vjp(ir::Operation* op, const std::vector>& out_grads, const std::vector>& stop_gradients);" + exclusive_interface_str += "\n static std::vector> Vjp(ir::Operation* op, const std::vector>& out_grads, const std::vector>& stop_gradients);" return exclusive_interface_str diff --git a/paddle/fluid/ir/dialect/pd_api.cc b/paddle/fluid/ir/dialect/pd_api.cc index df88dd9cc7348..f65b1e25f9c46 100644 
--- a/paddle/fluid/ir/dialect/pd_api.cc +++ b/paddle/fluid/ir/dialect/pd_api.cc @@ -72,7 +72,7 @@ ir::OpResult tanh_grad(ir::OpResult out, ir::OpResult grad_out) { ir::OpResult mean_grad(ir::OpResult x, ir::OpResult out_grad, - std::vector axis, + const std::vector& axis, bool keepdim, bool reduce_all) { paddle::dialect::MeanGradOp mean_grad_op = diff --git a/paddle/fluid/ir/dialect/pd_api.h b/paddle/fluid/ir/dialect/pd_api.h index a44c8bb83a76a..5d3b2376314e1 100644 --- a/paddle/fluid/ir/dialect/pd_api.h +++ b/paddle/fluid/ir/dialect/pd_api.h @@ -44,7 +44,7 @@ ir::OpResult tanh_grad(ir::OpResult out, ir::OpResult grad_out); ir::OpResult mean_grad(ir::OpResult x, ir::OpResult out_grad, - std::vector axis = {}, + const std::vector& axis = {}, bool keepdim = false, bool reduce_all = false); } // namespace dialect diff --git a/paddle/fluid/ir/dialect/pd_dialect.h b/paddle/fluid/ir/dialect/pd_dialect.h index 1e43a40c55f6b..4fa14d394248a 100644 --- a/paddle/fluid/ir/dialect/pd_dialect.h +++ b/paddle/fluid/ir/dialect/pd_dialect.h @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/ir/core/dialect.h" #include "paddle/ir/core/enforce.h" +#include "paddle/ir/core/macros.h" #include "paddle/ir/core/parameter.h" #include "paddle/ir/core/program.h" @@ -92,7 +93,7 @@ class APIBuilder { ctx_->GetOrRegisterDialect(); } - APIBuilder(const APIBuilder&) = delete; + DISABLE_COPY_AND_ASSIGN(APIBuilder); ir::IrContext* ctx_; std::shared_ptr builder_; diff --git a/paddle/fluid/ir/dialect/pd_op_vjp_manual.cc b/paddle/fluid/ir/dialect/pd_op_vjp_manual.cc index 42bb1556aa211..be43ddd60491c 100644 --- a/paddle/fluid/ir/dialect/pd_op_vjp_manual.cc +++ b/paddle/fluid/ir/dialect/pd_op_vjp_manual.cc @@ -17,16 +17,19 @@ #include "paddle/fluid/primitive/rule/vjp/vjp.h" #include "paddle/fluid/primitive/type/desc_tensor.h" #include "paddle/ir/core/op_base.h" +#include "paddle/phi/common/int_array.h" // TODO(wanghao107) // this file will be generated in pd_op.cc namespace 
paddle { namespace dialect { +using IntArray = paddle::experimental::IntArray; + std::vector> TanhOp::Vjp( ir::Operation* op, const std::vector>& out_grads, - const std::vector>& stop_gradients) { + const std::vector>& stop_gradients) { TanhOp op_obj = op->dyn_cast(); Tensor out( std::make_shared(op_obj.out())); @@ -35,7 +38,7 @@ std::vector> TanhOp::Vjp( std::vector> tensor_res = primitive::experimental::tanh_vjp(out, grad_out, stop_gradients); std::vector> res(1, std::vector(1)); - if (!stop_gradients[0][0]) { + if (tensor_res[0][0].defined()) { res[0][0] = std::static_pointer_cast( tensor_res[0][0].impl()) ->getValue() @@ -47,7 +50,7 @@ std::vector> TanhOp::Vjp( std::vector> Tanh_Op::Vjp( ir::Operation* op, const std::vector>& out_grads, - const std::vector>& stop_gradients) { + const std::vector>& stop_gradients) { // TODO(wanghao107) // we don't support inplace now, // so use the non-inplace version instead currently. @@ -60,7 +63,7 @@ std::vector> Tanh_Op::Vjp( std::vector> tensor_res = primitive::experimental::tanh_vjp(out, grad_out, stop_gradients); std::vector> res(1, std::vector(1)); - if (!stop_gradients[0][0]) { + if (tensor_res[0][0].defined()) { res[0][0] = std::static_pointer_cast( tensor_res[0][0].impl()) ->getValue() @@ -72,24 +75,22 @@ std::vector> Tanh_Op::Vjp( std::vector> MeanOp::Vjp( ir::Operation* op, const std::vector>& out_grads, - const std::vector>& stop_gradients) { + const std::vector>& stop_gradients) { MeanOp op_obj = op->dyn_cast(); Tensor x(std::make_shared(op_obj.x())); Tensor out_grad( std::make_shared(out_grads[0][0])); - std::vector axis = - op->attribute("axis") - .dyn_cast() - .data() - .GetData(); + IntArray axis = op->attribute("axis") + .dyn_cast() + .data(); bool keepdim = op->attribute("keepdim").dyn_cast().data(); bool reduce_all = false; std::vector> tensor_res = primitive::experimental::mean_vjp( x, out_grad, axis, keepdim, reduce_all, stop_gradients); std::vector> res(1, std::vector(1)); - if (!stop_gradients[0][0]) { 
+ if (tensor_res[0][0].defined()) { res[0][0] = std::static_pointer_cast( tensor_res[0][0].impl()) ->getValue() diff --git a/paddle/fluid/ir/interface/vjp.h b/paddle/fluid/ir/interface/vjp.h index 07e64da142f73..a373cd0bacca4 100644 --- a/paddle/fluid/ir/interface/vjp.h +++ b/paddle/fluid/ir/interface/vjp.h @@ -23,12 +23,12 @@ class VjpInterface : public ir::OpInterfaceBase { explicit Concept(std::vector> (*vjp)( ir::Operation* op, const std::vector>& out_grads, - const std::vector>& stop_gradients)) + const std::vector>& stop_gradients)) : vjp_(vjp) {} std::vector> (*vjp_)( ir::Operation* op, const std::vector>& out_grads, - const std::vector>& stop_gradients); + const std::vector>& stop_gradients); }; template @@ -36,7 +36,7 @@ class VjpInterface : public ir::OpInterfaceBase { static std::vector> Vjp( ir::Operation* op, const std::vector>& out_grads, - const std::vector>& stop_gradients) { + const std::vector>& stop_gradients) { return ConcreteOp::Vjp(op, out_grads, stop_gradients); } @@ -49,7 +49,7 @@ class VjpInterface : public ir::OpInterfaceBase { std::vector> Vjp( ir::Operation* op, const std::vector>& out_grads, - const std::vector>& stop_gradients) { + const std::vector>& stop_gradients) { return impl_->vjp_(op, out_grads, stop_gradients); } diff --git a/paddle/fluid/primitive/backend/CMakeLists.txt b/paddle/fluid/primitive/backend/CMakeLists.txt index 75e59d0b88163..26855583b46f9 100644 --- a/paddle/fluid/primitive/backend/CMakeLists.txt +++ b/paddle/fluid/primitive/backend/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT (NOT WITH_PYTHON AND ON_INFER)) +if(WITH_PYTHON OR NOT ON_INFER) cc_library( primitive_backend_eager_experimental SRCS eager_backend.cc diff --git a/paddle/fluid/primitive/backend/static_backend.cc b/paddle/fluid/primitive/backend/static_backend.cc index b0a515c0d75af..b041d3710c25d 100644 --- a/paddle/fluid/primitive/backend/static_backend.cc +++ b/paddle/fluid/primitive/backend/static_backend.cc @@ -42,7 +42,7 @@ Tensor tanh_grad(const Tensor& 
out, const Tensor& grad_out) { template <> Tensor mean_grad(const Tensor& x, const Tensor& out_grad, - std::vector axis, + const IntArray& axis, bool keepdim, bool reduce_all) { ir::OpResult x_res = std::static_pointer_cast(x.impl()) @@ -54,7 +54,7 @@ Tensor mean_grad(const Tensor& x, .dyn_cast(); ir::OpResult op_res = paddle::dialect::mean_grad( - x_res, out_grad_res, axis, keepdim, reduce_all); + x_res, out_grad_res, axis.GetData(), keepdim, reduce_all); return Tensor(std::make_shared(op_res)); } diff --git a/paddle/fluid/primitive/backend/static_backend.h b/paddle/fluid/primitive/backend/static_backend.h index bd1fb737b8658..09835bb759674 100644 --- a/paddle/fluid/primitive/backend/static_backend.h +++ b/paddle/fluid/primitive/backend/static_backend.h @@ -18,6 +18,7 @@ #include #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/common/int_array.h" namespace paddle { namespace primitive { @@ -25,6 +26,7 @@ namespace backend { namespace experimental { using Tensor = paddle::Tensor; +using IntArray = paddle::experimental::IntArray; template Tensor tanh_grad(const Tensor& out, const Tensor& grad_out); @@ -32,7 +34,7 @@ Tensor tanh_grad(const Tensor& out, const Tensor& grad_out); template Tensor mean_grad(const Tensor& x, const Tensor& out_grad, - std::vector axis = {}, + const IntArray& axis = {}, bool keepdim = false, bool reduce_all = false); } // namespace experimental diff --git a/paddle/fluid/primitive/rule/vjp/vjp.cc b/paddle/fluid/primitive/rule/vjp/vjp.cc index 28ffff5d9c701..b5f0acf98c1d8 100644 --- a/paddle/fluid/primitive/rule/vjp/vjp.cc +++ b/paddle/fluid/primitive/rule/vjp/vjp.cc @@ -12,12 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include - +#include "paddle/fluid/primitive/rule/vjp/vjp.h" #include "paddle/fluid/ir/dialect/pd_api.h" #include "paddle/fluid/primitive/backend/static_backend.h" -#include "paddle/fluid/primitive/rule/vjp/vjp.h" #include "paddle/fluid/primitive/type/desc_tensor.h" #include "paddle/ir/core/operation.h" // TODO(wanghao107): @@ -26,10 +23,11 @@ namespace paddle { namespace primitive { namespace experimental { + std::vector> tanh_vjp( const Tensor& out, const Tensor& grad_out, - const std::vector>& stop_gradients) { + const std::vector>& stop_gradients) { std::vector> vjp_res( 1, std::vector(1)); // get tanh_grad res. @@ -71,10 +69,10 @@ std::vector> tanh_vjp( std::vector> mean_vjp( const Tensor& x, const Tensor& out_grad, - std::vector axis, + const IntArray& axis, bool keepdim, bool reduce_all, - const std::vector>& stop_gradients) { + const std::vector>& stop_gradients) { std::vector> vjp_res( 1, std::vector(1)); // get mean_grad res. diff --git a/paddle/fluid/primitive/rule/vjp/vjp.h b/paddle/fluid/primitive/rule/vjp/vjp.h index 9da7d57429bc3..48bc2affa9db4 100644 --- a/paddle/fluid/primitive/rule/vjp/vjp.h +++ b/paddle/fluid/primitive/rule/vjp/vjp.h @@ -24,24 +24,27 @@ #include "paddle/fluid/primitive/primitive/primitive.h" #include "paddle/ir/core/value.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/common/int_array.h" namespace paddle { namespace primitive { namespace experimental { + +using IntArray = paddle::experimental::IntArray; // TODO(wanghao107): // op's vjp will be auto generated. 
std::vector> tanh_vjp( const Tensor& out, const Tensor& grad_out, - const std::vector>& stop_gradients); + const std::vector>& stop_gradients); std::vector> mean_vjp( const Tensor& x, const Tensor& out_grad, - std::vector axis, + const IntArray& axis, bool keepdim, bool reduce_all, - const std::vector>& stop_gradients); + const std::vector>& stop_gradients); namespace details { // NOTE: this namespace will store diff --git a/paddle/fluid/primitive/type/desc_tensor.h b/paddle/fluid/primitive/type/desc_tensor.h index 60dc4e01377eb..650b00e58ba7d 100644 --- a/paddle/fluid/primitive/type/desc_tensor.h +++ b/paddle/fluid/primitive/type/desc_tensor.h @@ -43,14 +43,11 @@ class DescTensor : public phi::ExtendedTensor, ir::Value getValue() const { return value_; } - const phi::Place& place() const override { return place_; } - bool initialized() const override { return value_.impl() != nullptr; } private: ir::Value value_; mutable phi::DDim dims_; - phi::Place place_; }; } // namespace experimental diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index bc0df124de960..4a005bc6dd372 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -693,7 +693,7 @@ void BindVjp(pybind11::module *m) { "call_vjp", [](ir::Operation &fwd_op, const std::vector> &out_grads, - const std::vector> &stop_gradients) { + const std::vector> &stop_gradients) { py::list res; ir::IrContext *ctx = ir::IrContext::Instance(); ir::OpInfo fwd_op_info = ctx->GetRegisteredOpInfo(fwd_op.name()); @@ -731,7 +731,7 @@ void BindVjp(pybind11::module *m) { vjp_res[i].size())); py::list sub_res; for (size_t j = 0; j < vjp_res[i].size(); ++j) { - if (stop_gradients[i][j]) { + if (!vjp_res[i][j]) { sub_res.append(nullptr); } else { sub_res.append(vjp_res[i][j]); diff --git a/python/paddle/autograd/backward.py b/python/paddle/autograd/backward.py index ba9d8a7a3f2e0..6ea1c491d4ad5 100644 --- a/python/paddle/autograd/backward.py +++ b/python/paddle/autograd/backward.py 
@@ -377,9 +377,9 @@ def append_backward_ops( input_grad_stopgradient_list = [] for input in op.operands_source(): if input in no_grad_set: - input_grad_stopgradient_list.append([1]) + input_grad_stopgradient_list.append([True]) else: - input_grad_stopgradient_list.append([0]) + input_grad_stopgradient_list.append([False]) before_ops_num = len(block.ops) # prim should be a globel flag, it will make create_grad_op choose diffrient func diff --git a/test/cpp/prim/test_vjp.cc b/test/cpp/prim/test_vjp.cc index 49cb6e29ab12c..9f7633c008176 100644 --- a/test/cpp/prim/test_vjp.cc +++ b/test/cpp/prim/test_vjp.cc @@ -55,7 +55,7 @@ TEST(VJP, TanhBackwardTest) { paddle::dialect::FullOp op3 = builder->Build( std::vector{1}, 2.0, phi::DataType::FLOAT32, phi::CPUPlace()); - std::vector> stop_gradients{{0}}; + std::vector> stop_gradients{{false}}; std::vector> out_grads{{op3.out()}}; ir::OpInfo op2_info = ctx->GetRegisteredOpInfo("pd.tanh"); @@ -109,7 +109,7 @@ TEST(VJP, Tanh_BackwardTest) { paddle::dialect::FullOp op3 = builder->Build( std::vector{1}, 2.0, phi::DataType::FLOAT32, phi::CPUPlace()); - std::vector> stop_gradients{{0}}; + std::vector> stop_gradients{{false}}; std::vector> out_grads{{op3.out()}}; ir::OpInfo op2_info = ctx->GetRegisteredOpInfo("pd.tanh_"); @@ -163,7 +163,7 @@ TEST(VJP, MeanBackwardTest) { paddle::dialect::FullOp op3 = builder->Build( std::vector{}, 1.0, phi::DataType::FLOAT32, phi::CPUPlace()); - std::vector> stop_gradients{{0}}; + std::vector> stop_gradients{{false}}; std::vector> out_grads{{op3.out()}}; ir::OpInfo op2_info = ctx->GetRegisteredOpInfo("pd.mean"); diff --git a/test/ir/new_ir/test_ir_vjp.py b/test/ir/new_ir/test_ir_vjp.py index 12931b89cca2a..45da7162664e4 100644 --- a/test/ir/new_ir/test_ir_vjp.py +++ b/test/ir/new_ir/test_ir_vjp.py @@ -41,7 +41,7 @@ def test_tanh_vjp1(self): tanh_op = newir_program.block().ops[-2] fill_constant_op = newir_program.block().ops[-1] out_grads = [[fill_constant_op.result(0)]] - stop_gradients = [[0]] + 
stop_gradients = [[False]] with paddle.ir.core.program_guard(newir_program): grad_outs = call_vjp(tanh_op, out_grads, stop_gradients) self.assertEqual( @@ -72,7 +72,7 @@ def test_tanh_vjp2(self): tanh_op = newir_program.block().ops[-2] fill_constant_op = newir_program.block().ops[-1] out_grads = [[fill_constant_op.result(0)]] - stop_gradients = [[1]] + stop_gradients = [[True]] with paddle.ir.core.program_guard(newir_program): grad_outs = call_vjp(tanh_op, out_grads, stop_gradients) self.assertEqual(grad_outs[0][0], None) @@ -93,7 +93,7 @@ def test_mean_vjp1(self): fill_constant_op = newir_program.block().ops[-1] mean_op = newir_program.block().ops[-2] out_grads = [[fill_constant_op.result(0)]] - stop_gradients = [[0]] + stop_gradients = [[False]] with paddle.ir.core.program_guard(newir_program): grad_outs = call_vjp(mean_op, out_grads, stop_gradients) self.assertEqual( @@ -133,7 +133,7 @@ def test_mean_vjp2(self): fill_constant_op = newir_program.block().ops[-1] mean_op = newir_program.block().ops[-2] out_grads = [[fill_constant_op.result(0)]] - stop_gradients = [[1]] + stop_gradients = [[True]] with paddle.ir.core.program_guard(newir_program): grad_outs = call_vjp(mean_op, out_grads, stop_gradients) self.assertEqual(grad_outs[0][0], None) From 28f74a0e8fb38b9eb140eb09193032e8afd34ff0 Mon Sep 17 00:00:00 2001 From: kangguangli Date: Fri, 11 Aug 2023 14:30:05 +0800 Subject: [PATCH 014/246] enable FLAGS_apply_pass_to_program_in_default (#55911) --- .../meta_optimizers/raw_program_optimizer.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 8919ded2e245c..a1ab474723527 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -15,7 +15,6 @@ from paddle import static from 
paddle.fluid import core -from paddle.framework import _global_flags from paddle.framework.ir import apply_build_strategy from paddle.utils import unique_name @@ -31,6 +30,14 @@ from .meta_optimizer_base import MetaOptimizerBase +def evaluate_flag_apply_pass_to_program(val: str) -> bool: + val = val.lower() + if val in ('false', 'off', '0'): + return False + else: + return True + + class RawProgramOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super().__init__(optimizer) @@ -153,7 +160,11 @@ def minimize_impl( optimize_ops, params_grads = self.inner_opt.minimize( loss, startup_program, parameter_list, no_grad_set ) - if _global_flags()['FLAGS_apply_pass_to_program']: + # Not apply pass only when FLAGS_apply_pass_to_program explicitly set to False + is_apply_pass_to_program = os.environ.get( + 'FLAGS_apply_pass_to_program', '1' + ) + if evaluate_flag_apply_pass_to_program(is_apply_pass_to_program): pass_attrs = {"use_cuda": True} build_strategy = self.user_defined_strategy.build_strategy._copy() build_strategy.fuse_all_optimizer_ops = False From 81874b929abf31b0cd83bb990865ccc032d8bb5f Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 11 Aug 2023 14:34:32 +0800 Subject: [PATCH 015/246] Disable CINN Test (#56086) * Disable CINN Test * Fix test_group_norm_op;test=document_fix --- test/prim/model/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/prim/model/CMakeLists.txt b/test/prim/model/CMakeLists.txt index c37a25924aa97..52ca91e86a3bc 100644 --- a/test/prim/model/CMakeLists.txt +++ b/test/prim/model/CMakeLists.txt @@ -13,14 +13,14 @@ set_tests_properties(test_bert_prim PROPERTIES TIMEOUT 500) set_tests_properties(test_prim_simplenet_cinn PROPERTIES TIMEOUT 120) if(WITH_CINN) - set_tests_properties(test_resnet_cinn PROPERTIES TIMEOUT 850) - set_tests_properties(test_resnet_prim_cinn PROPERTIES TIMEOUT 850) + #set_tests_properties(test_resnet_cinn PROPERTIES TIMEOUT 850) + 
#set_tests_properties(test_resnet_prim_cinn PROPERTIES TIMEOUT 850) set_tests_properties(test_bert_cinn PROPERTIES TIMEOUT 500) set_tests_properties(test_bert_prim_cinn PROPERTIES TIMEOUT 500) set_tests_properties(test_resnet_prim PROPERTIES LABELS "RUN_TYPE=CINN") - set_tests_properties(test_resnet_cinn PROPERTIES LABELS "RUN_TYPE=CINN") - set_tests_properties(test_resnet_prim_cinn PROPERTIES LABELS "RUN_TYPE=CINN") + #set_tests_properties(test_resnet_cinn PROPERTIES LABELS "RUN_TYPE=CINN") + #set_tests_properties(test_resnet_prim_cinn PROPERTIES LABELS "RUN_TYPE=CINN") set_tests_properties(test_bert_prim PROPERTIES LABELS "RUN_TYPE=CINN") set_tests_properties(test_bert_cinn PROPERTIES LABELS "RUN_TYPE=CINN") set_tests_properties(test_bert_prim_cinn PROPERTIES LABELS "RUN_TYPE=CINN") From 9e6f4433858a99a687e4cd90d050eb04af625768 Mon Sep 17 00:00:00 2001 From: Candy2Tang <141831089+Candy2Tang@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:40:26 +0800 Subject: [PATCH 016/246] [xdoctest][task 105]reformat example code with google style in python/paddle/optimizer/adamax.py (#56167) * [xdoctest][task 105]reformat example code with google style in python/paddle/optimizer/adamax.py * fix word typo test=docs_preview * fix comment test=docs_preview --- python/paddle/optimizer/adamax.py | 96 ++++++++++++++++--------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index bc33c392a4cba..26988e9d3c96a 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -62,18 +62,18 @@ class Adamax(Optimizer): parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. This parameter is required in dygraph mode. And you can specify different options for different parameter groups such as the learning rate, weight decay, etc, - then the parameters are list of dict. 
Note that the learning_rate in paramter groups + then the parameters are list of dict. Note that the learning_rate in parameter groups represents the scale of base learning_rate. The default value is None in static graph mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. - It canbe a float value as coeff of L2 regularization or + It can be a float value as coeff of L2 regularization or :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies + grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three clipping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): Normally there is no need for user to set this property. @@ -86,49 +86,51 @@ class Adamax(Optimizer): Examples: .. 
code-block:: python - import paddle - - inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1) - linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) - out = linear(inp) - loss = paddle.mean(out) - - beta1 = paddle.to_tensor([0.9], dtype="float32") - beta2 = paddle.to_tensor([0.99], dtype="float32") - - adam = paddle.optimizer.Adamax(learning_rate=0.1, - parameters=linear.parameters(), - beta1=beta1, - beta2=beta2, - weight_decay=0.01) - out.backward() - adam.step() - adam.clear_grad() - - - #Note that the learning_rate of linear_2 is 0.01. - linear_1 = paddle.nn.Linear(10, 10) - linear_2 = paddle.nn.Linear(10, 10) - inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) - out = linear_1(inp) - out = linear_2(out) - loss = paddle.mean(out) - adam = paddle.optimizer.Adamax( - learning_rate=0.1, - parameters=[{ - 'params': linear_1.parameters() - }, { - 'params': linear_2.parameters(), - 'weight_decay': 0.001, - 'learning_rate': 0.1, - 'beta1': 0.8 - }], - weight_decay=0.01, - beta1=0.9) - out.backward() - adam.step() - adam.clear_grad() + >>> import paddle + + >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1) + >>> linear = paddle.nn.Linear(10, 10) + >>> inp = paddle.to_tensor(inp) + >>> out = linear(inp) + >>> loss = paddle.mean(out) + + >>> beta1 = paddle.to_tensor([0.9], dtype="float32") + >>> beta2 = paddle.to_tensor([0.99], dtype="float32") + + >>> adam = paddle.optimizer.Adamax(learning_rate=0.1, + ... parameters=linear.parameters(), + ... beta1=beta1, + ... beta2=beta2, + ... weight_decay=0.01 + ... ) + >>> out.backward() + >>> adam.step() + >>> adam.clear_grad() + + + >>> # Note that the learning_rate of linear_2 is 0.01. + >>> linear_1 = paddle.nn.Linear(10, 10) + >>> linear_2 = paddle.nn.Linear(10, 10) + >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) + >>> out = linear_1(inp) + >>> out = linear_2(out) + >>> loss = paddle.mean(out) + >>> adam = paddle.optimizer.Adamax( + ... learning_rate=0.1, + ... 
parameters=[{ + ... 'params': linear_1.parameters() + ... }, { + ... 'params': linear_2.parameters(), + ... 'weight_decay': 0.001, + ... 'learning_rate': 0.1, + ... 'beta1': 0.8 + ... }], + ... weight_decay=0.01, + ... beta1=0.9 + ... ) + >>> out.backward() + >>> adam.step() + >>> adam.clear_grad() """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" From 8cf4b1c23476bb843c6f2e8e425b559b28f5ed7a Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 11 Aug 2023 15:00:48 +0800 Subject: [PATCH 017/246] [NewIR]Polish IR code (#56087) * perfect code * delete __all__ --- .../ir/dialect/op_generator/op_build_gen.py | 4 +- .../fluid/ir/dialect/op_generator/op_gen.py | 6 +- paddle/fluid/ir/dialect/pd_api.cc | 16 +- paddle/fluid/ir/dialect/pd_api.h | 8 +- paddle/fluid/ir/dialect/utils.cc | 171 ++++++++++++++++++ paddle/fluid/ir/dialect/utils.h | 150 +-------------- paddle/fluid/pybind/ir.cc | 146 +++++++++++++-- python/paddle/fluid/framework.py | 28 +-- python/paddle/ir/__init__.py | 11 +- python/paddle/ir/core.py | 18 ++ 10 files changed, 346 insertions(+), 212 deletions(-) create mode 100644 paddle/fluid/ir/dialect/utils.cc diff --git a/paddle/fluid/ir/dialect/op_generator/op_build_gen.py b/paddle/fluid/ir/dialect/op_generator/op_build_gen.py index 5c3696d02c88c..f4d91d5c06821 100644 --- a/paddle/fluid/ir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/ir/dialect/op_generator/op_build_gen.py @@ -65,7 +65,7 @@ def GenBuildInputArgsStr( ] if ( op_attribute_build_arg_type_list[attr_idx] - != "std::string" + != "const std::string&" ): if ( default_value[0] == "'" @@ -106,7 +106,7 @@ def GenBuildInputArgsStr( op_non_mutable_attribute_build_arg_type_list[ attr_idx ] - != "std::string" + != "const std::string&" ): if ( default_value[0] == "'" diff --git a/paddle/fluid/ir/dialect/op_generator/op_gen.py b/paddle/fluid/ir/dialect/op_generator/op_gen.py index ba4424aa7bdd8..d990141add5a0 100644 --- a/paddle/fluid/ir/dialect/op_generator/op_gen.py +++ 
b/paddle/fluid/ir/dialect/op_generator/op_gen.py @@ -258,12 +258,12 @@ def __init__(self, op_yaml_item, op_compat_item): 'ir::ArrayAttribute', 'const std::vecot&', ], - 'str': ['ir::StrAttribute', 'std::string'], + 'str': ['ir::StrAttribute', 'const std::string&'], 'str[]': [ 'ir::ArrayAttribute', 'const std::vector&', ], - 'Place': ['paddle::dialect::PlaceAttribute', 'Place'], + 'Place': ['paddle::dialect::PlaceAttribute', 'const Place&'], 'DataLayout': [ 'paddle::dialect::DataLayoutAttribute', 'DataLayout', @@ -577,7 +577,7 @@ def parse_attribute_build_arg_type_list(self): temp_type = attribute_info['data_type'] if 'IntArray' in temp_type: if 'data_type' in attribute_info: - temp_type = attribute_info['data_type'] + temp_type = "const " + attribute_info['data_type'] + "&" type_list.append(self.get_phi_dtype_name(temp_type)) return type_list diff --git a/paddle/fluid/ir/dialect/pd_api.cc b/paddle/fluid/ir/dialect/pd_api.cc index f65b1e25f9c46..6405f7dce7e80 100644 --- a/paddle/fluid/ir/dialect/pd_api.cc +++ b/paddle/fluid/ir/dialect/pd_api.cc @@ -29,7 +29,9 @@ ir::OpResult add_n(std::vector x) { return add_n_op.out(); } -ir::OpResult mean(ir::OpResult x, std::vector axis, bool keepdim) { +ir::OpResult mean(ir::OpResult x, + const std::vector& axis, + bool keepdim) { paddle::dialect::MeanOp mean_op = APIBuilder::Instance().GetBuilder()->Build( x, axis, keepdim); @@ -37,27 +39,27 @@ ir::OpResult mean(ir::OpResult x, std::vector axis, bool keepdim) { } ir::OpResult sum(ir::OpResult x, - std::vector axis, + const std::vector& axis, phi::DataType dtype, bool keepdim) { - paddle::dialect::SumOp sum_op = + auto sum_op = APIBuilder::Instance().GetBuilder()->Build( x, axis, dtype, keepdim); return sum_op.out(); } ir::OpResult divide(ir::OpResult x, ir::OpResult y) { - paddle::dialect::DivideOp divide_op = + auto divide_op = APIBuilder::Instance().GetBuilder()->Build(x, y); return divide_op.out(); } -ir::OpResult full(std::vector shape, +ir::OpResult full(const std::vector& 
shape, float value, phi::DataType dtype, - phi::Place place) { - paddle::dialect::FullOp full_op = + const phi::Place& place) { + auto full_op = APIBuilder::Instance().GetBuilder()->Build( shape, value, dtype, place); return full_op.out(); diff --git a/paddle/fluid/ir/dialect/pd_api.h b/paddle/fluid/ir/dialect/pd_api.h index 5d3b2376314e1..9581e0a4e7ee1 100644 --- a/paddle/fluid/ir/dialect/pd_api.h +++ b/paddle/fluid/ir/dialect/pd_api.h @@ -25,20 +25,20 @@ namespace dialect { ir::OpResult add_n(std::vector x); ir::OpResult mean(ir::OpResult x, - std::vector axis = {}, + const std::vector& axis = {}, bool keepdim = false); ir::OpResult sum(ir::OpResult x, - std::vector axis = {}, + const std::vector& axis = {}, phi::DataType dtype = phi::DataType::UNDEFINED, bool keepdim = false); ir::OpResult divide(ir::OpResult x, ir::OpResult y); -ir::OpResult full(std::vector shape, +ir::OpResult full(const std::vector& shape, float value, phi::DataType dtype = phi::DataType::FLOAT32, - phi::Place place = phi::CPUPlace()); + const phi::Place& place = phi::CPUPlace()); ir::OpResult tanh_grad(ir::OpResult out, ir::OpResult grad_out); diff --git a/paddle/fluid/ir/dialect/utils.cc b/paddle/fluid/ir/dialect/utils.cc new file mode 100644 index 0000000000000..cd6ff35ef7f4e --- /dev/null +++ b/paddle/fluid/ir/dialect/utils.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/ir/dialect/utils.h" + +namespace paddle { +namespace dialect { + +enum class AttrType { + UNDEFINED = 0, + BOOL, + INT32, + INT64, + + FLOAT, + DOUBLE, + + ARRAY, + INT_ARRAY, + + SCALAR, + DATA_TYPE, + DATA_LAYOUT, + PLACE, + + STRING, + + NUM_ATTR_TYPES, +}; + +static inline AttrType GetAttributeType(const ir::Attribute& attr) { + if (attr.isa()) { + return AttrType::BOOL; + } else if (attr.isa()) { + return AttrType::FLOAT; + } else if (attr.isa()) { + return AttrType::DOUBLE; + } else if (attr.isa()) { + return AttrType::INT32; + } else if (attr.isa()) { + return AttrType::INT64; + } else if (attr.isa()) { + return AttrType::ARRAY; + } else if (attr.isa()) { + return AttrType::STRING; + } else if (attr.isa()) { + return AttrType::INT_ARRAY; + } else if (attr.isa()) { + return AttrType::DATA_TYPE; + } else if (attr.isa()) { + return AttrType::PLACE; + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported ir Attribute type when casting it into " + "AttrType.")); + } +} + +static std::unordered_map> + kAttrCastMap = { + {AttrType::BOOL, + [](const ir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; + }}, + {AttrType::FLOAT, + [](const ir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; + }}, + {AttrType::DOUBLE, + [](const ir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; + }}, + {AttrType::INT32, + [](const ir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; + }}, + {AttrType::INT64, + [](const ir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; + }}, + {AttrType::INT_ARRAY, + [](const ir::Attribute& attr) { + return VariantType{ + attr.dyn_cast() + .data() + .GetData()}; + }}, + {AttrType::STRING, + [](const ir::Attribute& attr) { + return VariantType{attr.dyn_cast().AsString()}; + }}, + {AttrType::DATA_TYPE, + [](const ir::Attribute& attr) { + return VariantType{ + attr.dyn_cast().data()}; + }}, + {AttrType::PLACE, + [](const 
ir::Attribute& attr) { + return VariantType{ + attr.dyn_cast().data()}; + }}, + {AttrType::ARRAY, + [](const ir::Attribute& attr) { + auto attr_vec = attr.dyn_cast().AsVector(); + if (attr_vec.size() == 0) { + return VariantType{std::vector()}; + } + AttrType element_type = GetAttributeType(attr_vec[0]); + + if (element_type == AttrType::BOOL) { + std::vector vec_bools; + for (auto vec_element : attr_vec) { + vec_bools.push_back( + vec_element.dyn_cast().data()); + } + return VariantType{vec_bools}; + } else if (element_type == AttrType::INT32) { + std::vector vec_int32; + for (auto vec_element : attr_vec) { + vec_int32.push_back( + vec_element.dyn_cast().data()); + } + return VariantType{vec_int32}; + } else if (element_type == AttrType::INT64) { + std::vector vec_int64; + for (auto vec_element : attr_vec) { + vec_int64.push_back( + vec_element.dyn_cast().data()); + } + return VariantType{vec_int64}; + } else if (element_type == AttrType::FLOAT) { + std::vector vec_float; + for (auto vec_element : attr_vec) { + vec_float.push_back( + vec_element.dyn_cast().data()); + } + return VariantType{vec_float}; + } else if (element_type == AttrType::DOUBLE) { + std::vector vec_double; + for (auto vec_element : attr_vec) { + vec_double.push_back( + vec_element.dyn_cast().data()); + } + return VariantType{vec_double}; + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported ir Attribute type when casting it into " + "vector.")); + } + }}, +}; + +VariantType GetAttributeData(const ir::Attribute& attr) { + AttrType attr_type = GetAttributeType(attr); + return kAttrCastMap[attr_type](attr); +} + +} // namespace dialect +} // namespace paddle diff --git a/paddle/fluid/ir/dialect/utils.h b/paddle/fluid/ir/dialect/utils.h index 3dbeaa9cc5aa0..a81febc0cbab9 100644 --- a/paddle/fluid/ir/dialect/utils.h +++ b/paddle/fluid/ir/dialect/utils.h @@ -141,155 +141,7 @@ static inline ir::Attribute TransToIrAttribute(phi::Scalar scalar, } } -enum class AttrType { - UNDEFINED = 0, - 
BOOL, - INT32, - INT64, - - FLOAT, - DOUBLE, - - ARRAY, - INT_ARRAY, - - SCALAR, - DATA_TYPE, - DATA_LAYOUT, - PLACE, - - STRING, - - NUM_ATTR_TYPES, -}; - -static inline AttrType GetAttributeType(const ir::Attribute& attr) { - if (attr.isa()) { - return AttrType::BOOL; - } else if (attr.isa()) { - return AttrType::FLOAT; - } else if (attr.isa()) { - return AttrType::DOUBLE; - } else if (attr.isa()) { - return AttrType::INT32; - } else if (attr.isa()) { - return AttrType::INT64; - } else if (attr.isa()) { - return AttrType::ARRAY; - } else if (attr.isa()) { - return AttrType::STRING; - } else if (attr.isa()) { - return AttrType::INT_ARRAY; - } else if (attr.isa()) { - return AttrType::DATA_TYPE; - } else if (attr.isa()) { - return AttrType::PLACE; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported ir Attribute type when casting it into " - "AttrType.")); - } -} - -static std::unordered_map> - attr_cast_map = { - {AttrType::BOOL, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::FLOAT, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::DOUBLE, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::INT32, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::INT64, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::INT_ARRAY, - [](const ir::Attribute& attr) { - return VariantType{ - attr.dyn_cast() - .data() - .GetData()}; - }}, - {AttrType::STRING, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().AsString()}; - }}, - {AttrType::DATA_TYPE, - [](const ir::Attribute& attr) { - return VariantType{ - attr.dyn_cast().data()}; - }}, - {AttrType::PLACE, - [](const ir::Attribute& attr) { - return VariantType{ - attr.dyn_cast().data()}; - }}, - {AttrType::ARRAY, - [](const ir::Attribute& attr) { - auto 
attr_vec = attr.dyn_cast().AsVector(); - if (attr_vec.size() == 0) { - return VariantType{std::vector()}; - } - AttrType element_type = GetAttributeType(attr_vec[0]); - - if (element_type == AttrType::BOOL) { - std::vector vec_bools; - for (auto vec_element : attr_vec) { - vec_bools.push_back( - vec_element.dyn_cast().data()); - } - return VariantType{vec_bools}; - } else if (element_type == AttrType::INT32) { - std::vector vec_int32; - for (auto vec_element : attr_vec) { - vec_int32.push_back( - vec_element.dyn_cast().data()); - } - return VariantType{vec_int32}; - } else if (element_type == AttrType::INT64) { - std::vector vec_int64; - for (auto vec_element : attr_vec) { - vec_int64.push_back( - vec_element.dyn_cast().data()); - } - return VariantType{vec_int64}; - } else if (element_type == AttrType::FLOAT) { - std::vector vec_float; - for (auto vec_element : attr_vec) { - vec_float.push_back( - vec_element.dyn_cast().data()); - } - return VariantType{vec_float}; - } else if (element_type == AttrType::DOUBLE) { - std::vector vec_double; - for (auto vec_element : attr_vec) { - vec_double.push_back( - vec_element.dyn_cast().data()); - } - return VariantType{vec_double}; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported ir Attribute type when casting it into " - "vector.")); - } - }}, -}; - -static inline VariantType GetAttributeData(const ir::Attribute& attr) { - AttrType attr_type = GetAttributeType(attr); - return attr_cast_map[attr_type](attr); -} +VariantType GetAttributeData(const ir::Attribute& attr); } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 2081d327a2395..a6da23bc78e0f 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -57,7 +57,46 @@ PyTypeObject *g_ir_opresult_pytype = nullptr; void BindOpsAPI(pybind11::module *module); void BindProgram(py::module *m) { - py::class_ program(*m, "Program"); + py::class_ program(*m, "Program", R"DOC( + 
Create Python Program. Program is an abstraction of model structure, divided into + computational graphs and weights. The Program has a main block that stores the computational + graphs. + + A set of Program usually contains startup program and main program. + A startup program is set to contain some initial work, eg. initialize the ``Parameter``, and the main + program will contain the network structure and vars for train. + + A set of Program can be used for test or train, in train program , + Paddle will contain all content to build a train network, in test + program Paddle will prune some content which is irrelevant to test, eg. + backward ops and vars. + + **Notes**: + **we have** :ref:`api_paddle_static_default_startup_program` **and** :ref:`api_paddle_static_default_main_program` + **by default, a pair of them will shared the parameters. The** :ref:`api_paddle_static_default_startup_program` **only run once to initialize parameters,** + :ref:`api_paddle_static_default_main_program` **run in every mini batch and adjust the weights.** + + Returns: + Program: An empty Program. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + main_program = static.Program() + startup_program = static.Program() + with static.program_guard(main_program=main_program, startup_program=startup_program): + x = static.data(name="x", shape=[-1, 784], dtype='float32') + y = static.data(name="y", shape=[-1, 1], dtype='int32') + z = static.nn.fc(name="fc", x=x, size=10, activation="relu") + + print("main program is: {}".format(main_program)) + print("start up program is: {}".format(startup_program)) + )DOC"); program .def( "__init__", @@ -78,7 +117,13 @@ void BindProgram(py::module *m) { } void BindBlock(py::module *m) { - py::class_ block(*m, "Block"); + py::class_ block(*m, "Block", R"DOC( + In IR, a Block has a list of Operation and can represent a sub computational graph. 
+ + Notes: + The constructor of Block should not be invoked directly. You can + use `Program.block()` to get a block. + )DOC"); block.def("front", &Block::front, return_value_policy::reference) .def("get_parent_program", [](Block &self) { return self.GetParentOp()->GetParentProgram(); }) @@ -91,14 +136,35 @@ void BindBlock(py::module *m) { } return op_list; }) - .def("remove_op", [](Block &self, Operation *op) { - auto op_iter = std::find(self.begin(), self.end(), op); - self.erase(op_iter); - }); + .def( + "remove_op", + [](Block &self, Operation *op) { + auto op_iter = std::find(self.begin(), self.end(), op); + self.erase(op_iter); + }, + R"DOC( + Remove the specific position operator. + + Args: + index(int): the position that the operator to insert. + + Returns: + None + + )DOC"); } void BindOperation(py::module *m) { - py::class_ op(*m, "Operation"); + py::class_ op(*m, "Operation", R"DOC( + In IR, all the operation are represented by Operation, and Operation + is regarded as a build in an instruction of a Block. Users can call + python api to describe their neural network. + + Notes: + The constructor of operator should not be invoked directly. Use + python api, for example: paddle.mean for building mean operation. + + )DOC"); op.def("name", &Operation::name) .def("get_parent_block", py::overload_cast<>(&Operation::GetParent), @@ -170,7 +236,15 @@ void BindOperation(py::module *m) { } void BindValue(py::module *m) { - py::class_ value(*m, "Value"); + py::class_ value(*m, "Value", R"DOC( + Value class represents the SSA value in the IR system. It is a directed edge + and a base class. + + Notes: + The constructor of Value should not be invoked directly. Value can be automatically constructed + when build network. 
+ + )DOC"); value .def("get_defining_op", &Value::GetDefiningOp, @@ -185,7 +259,16 @@ void BindValue(py::module *m) { } void BindOpOperand(py::module *m) { - py::class_ op_operand(*m, "OpOperand"); + py::class_ op_operand(*m, + "OpOperand", + R"DOC( + OpOperand class represents the op_operand (input) of operation. + + Notes: + The constructor of OpOperand should not be invoked directly. OpOperand can be automatically constructed + when build network. + + )DOC"); op_operand .def("source", [](OpOperand &self) { return self.source().dyn_cast(); }) @@ -228,7 +311,13 @@ void SetStopGradient(const OpResult &self, bool stop_gradient) { } void BindOpResult(py::module *m) { - py::class_ op_result(*m, "OpResult"); + py::class_ op_result(*m, "OpResult", R"DOC( + OpResult class represents the value(output) defined by a result of operation. + + Notes: + The constructor of OpResult should not be invoked directly. OpResult can be automatically constructed + when build network. + )DOC"); g_ir_opresult_pytype = reinterpret_cast(op_result.ptr()); op_result.def("__eq__", &OpResult::operator==) .def("__eq__", @@ -301,7 +390,42 @@ void BindUtils(pybind11::module *m) { []() { APIBuilder::Instance().ResetInsertionPointToStart(); }); m->def("reset_insertion_point_to_end", []() { APIBuilder::Instance().ResetInsertionPointToEnd(); }); - m->def("translate_to_new_ir", &paddle::TranslateLegacyProgramToProgram); + m->def("translate_to_new_ir", &paddle::TranslateLegacyProgramToProgram, R"DOC( + Convert Fluid Program to New IR Program. + + Args: + + legacy_program (ProgramDesc): The Fluid Program that will be converted. + + Returns: + Program: The New IR Program + + Raises: + PreconditionNotMet: If legacy_program has multi block will raise error. + + Examples: + .. 
code-block:: python + + import paddle + from paddle import ir + paddle.enable_static() + + x = paddle.randn([4, 4]) + main_program, start_program = ( + paddle.static.Program(), + paddle.static.Program(), + ) + with paddle.static.program_guard(main_program, start_program): + x_s = paddle.static.data('x', [4, 4], x.dtype) + x_s.stop_gradient = False + y_s = paddle.matmul(x_s, x_s) + z_s = paddle.add(y_s, y_s) + k_s = paddle.tanh(z_s) + newir_program = ir.translate_to_new_ir(main_program.desc) + + print(newir_program) + + )DOC"); } void BindNewIR(pybind11::module *module) { diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 05731baad2471..b375cca76c127 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1016,32 +1016,8 @@ def convert_np_dtype_to_dtype_(np_dtype): dtype = np.dtype(np_dtype) if ir.core._use_new_ir_api(): - if dtype == np.float32: - return core.DataType.FLOAT32 - elif dtype == np.float64: - return core.DataType.FLOAT64 - elif dtype == np.float16: - return core.DataType.FLOAT16 - elif dtype == np.int32: - return core.DataType.INT32 - elif dtype == np.int16: - return core.DataType.INT16 - elif dtype == np.int64: - return core.DataType.INT64 - elif dtype == np.bool_: - return core.DataType.BOOL - elif dtype == np.uint16: - # since there is still no support for bfloat16 in NumPy, - # uint16 is used for casting bfloat16 - return core.DataType.UINT16 - elif dtype == np.uint8: - return core.DataType.UINT8 - elif dtype == np.int8: - return core.DataType.INT8 - elif dtype == np.complex64: - return core.DataType.COMPLEX64 - elif dtype == np.complex128: - return core.DataType.COMPLEX128 + if dtype in ir.core.np_type_to_paddle_type.keys(): + return ir.core.np_type_to_paddle_type[dtype] else: raise ValueError("Not supported numpy dtype %s" % dtype) else: diff --git a/python/paddle/ir/__init__.py b/python/paddle/ir/__init__.py index 0d272cf88ae1c..f023cfc0539fc 100644 --- 
a/python/paddle/ir/__init__.py +++ b/python/paddle/ir/__init__.py @@ -31,13 +31,4 @@ from . import core -__all__ = [ # noqa - 'Program', - 'Block', - 'Operation', - 'Value', - 'OpOperand', - 'OpResult', - 'Type', - 'translate_to_new_ir', -] +__all__ = [] diff --git a/python/paddle/ir/core.py b/python/paddle/ir/core.py index 9310c9b75bf65..ea73d266cca0d 100644 --- a/python/paddle/ir/core.py +++ b/python/paddle/ir/core.py @@ -13,11 +13,29 @@ # limitations under the License. +import numpy as np + import paddle +from paddle.fluid.libpaddle import DataType from paddle.fluid.libpaddle.ir import Program, set_global_program from ..fluid.wrapped_decorator import signature_safe_contextmanager +np_type_to_paddle_type = { + np.dtype("float32"): DataType.FLOAT32, + np.dtype("float64"): DataType.FLOAT64, + np.dtype("float16"): DataType.FLOAT16, + np.dtype("int32"): DataType.INT32, + np.dtype("int16"): DataType.INT16, + np.dtype("int64"): DataType.INT64, + np.dtype("bool_"): DataType.BOOL, + np.dtype("uint16"): DataType.UINT16, + np.dtype("uint8"): DataType.UINT8, + np.dtype("int8"): DataType.INT8, + np.dtype("complex64"): DataType.COMPLEX64, + np.dtype("complex128"): DataType.COMPLEX128, +} + def _use_new_ir_api(): """ From 74eb309396a1c06b655f67e26b93dc8fb0f8cfcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Fri, 11 Aug 2023 15:09:29 +0800 Subject: [PATCH 018/246] [cmake] add ginac third_party cache (#56165) * Update ginac.cmake * Update ginac.cmake * Update ginac.cmake --- cmake/cinn/external/ginac.cmake | 70 +++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/cmake/cinn/external/ginac.cmake b/cmake/cinn/external/ginac.cmake index 39e404c0aa1bd..8e17d39efdaec 100644 --- a/cmake/cinn/external/ginac.cmake +++ b/cmake/cinn/external/ginac.cmake @@ -5,32 +5,68 @@ include(ExternalProject) # ginac-1.8.1 https://www.ginac.de/ginac-1.8.1.tar.bz2 # all build with 
CFLAGS="-fPIC -DPIC" CXXFLAGS="-fPIC -DPIC" --enable-static=yes +set(GINAC_FILE + "ginac-1.8.1_cln-1.3.6_gmp-6.2.1.tar.gz" + CACHE STRING "" FORCE) set(GINAC_DOWNLOAD_URL - https://paddle-inference-dist.bj.bcebos.com/CINN/ginac-1.8.1_cln-1.3.6_gmp-6.2.1.tar.gz -) -set(GINAC_MD5 ebc3e4b7770dd604777ac3f01bfc8b06) + "https://paddle-inference-dist.bj.bcebos.com/CINN/${GINAC_FILE}") +set(GINAC_URL_MD5 ebc3e4b7770dd604777ac3f01bfc8b06) +set(GINAC_DOWNLOAD_DIR ${PADDLE_SOURCE_DIR}/third_party/ginac) +set(GINAC_PREFIX_DIR ${THIRD_PARTY_PATH}/ginac) +set(GINAC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ginac) + +function(download_ginac) + message( + STATUS + "Downloading ${GINAC_DOWNLOAD_URL} to ${GINAC_DOWNLOAD_DIR}/${GINAC_FILE}" + ) + file( + DOWNLOAD ${GINAC_DOWNLOAD_URL} ${GINAC_DOWNLOAD_DIR}/${GINAC_FILE} + EXPECTED_MD5 ${GINAC_URL_MD5} + STATUS ERR) + if(ERR EQUAL 0) + message(STATUS "Download ${GINAC_FILE} success") + else() + message( + FATAL_ERROR + "Download failed, error: ${ERR}\n You can try downloading ${GINAC_FILE} again" + ) + endif() +endfunction() + +# Download and check ginac. 
+if(EXISTS ${GINAC_DOWNLOAD_DIR}/${GINAC_FILE}) + file(MD5 ${GINAC_DOWNLOAD_DIR}/${GINAC_FILE} GINAC_MD5) + if(NOT GINAC_MD5 STREQUAL GINAC_URL_MD5) + # clean build file + file(REMOVE_RECURSE ${GINAC_PREFIX_DIR}) + file(REMOVE_RECURSE ${GINAC_INSTALL_DIR}) + download_ginac() + endif() +else() + download_ginac() +endif() ExternalProject_Add( external_ginac ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${GINAC_DOWNLOAD_URL} - URL_MD5 ${GINAC_MD5} - PREFIX ${THIRD_PARTY_PATH}/ginac - SOURCE_DIR ${THIRD_PARTY_PATH}/install/ginac + URL ${GINAC_DOWNLOAD_DIR}/${GINAC_FILE} + URL_MD5 ${GINAC_URL_MD5} + DOWNLOAD_DIR ${GINAC_DOWNLOAD_DIR} + PREFIX ${GINAC_PREFIX_DIR} + SOURCE_DIR ${GINAC_INSTALL_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" UPDATE_COMMAND "" INSTALL_COMMAND "" - BUILD_BYPRODUCTS ${THIRD_PARTY_PATH}/install/ginac/lib/libginac.a - BUILD_BYPRODUCTS ${THIRD_PARTY_PATH}/install/ginac/lib/libcln.a - BUILD_BYPRODUCTS ${THIRD_PARTY_PATH}/install/ginac/lib/libgmp.a) + BUILD_BYPRODUCTS ${GINAC_INSTALL_DIR}/lib/libginac.a + BUILD_BYPRODUCTS ${GINAC_INSTALL_DIR}/lib/libcln.a + BUILD_BYPRODUCTS ${GINAC_INSTALL_DIR}/lib/libgmp.a) add_library(ginac STATIC IMPORTED GLOBAL) add_dependencies(ginac external_ginac) -set_property( - TARGET ginac PROPERTY IMPORTED_LOCATION - ${THIRD_PARTY_PATH}/install/ginac/lib/libginac.a) -target_link_libraries( - ginac INTERFACE ${THIRD_PARTY_PATH}/install/ginac/lib/libcln.a - ${THIRD_PARTY_PATH}/install/ginac/lib/libgmp.a) -include_directories(${THIRD_PARTY_PATH}/install/ginac/include) +set_property(TARGET ginac PROPERTY IMPORTED_LOCATION + ${GINAC_INSTALL_DIR}/lib/libginac.a) +target_link_libraries(ginac INTERFACE ${GINAC_INSTALL_DIR}/lib/libcln.a + ${GINAC_INSTALL_DIR}/lib/libgmp.a) +include_directories(${GINAC_INSTALL_DIR}/include) From 34e905744eaf4d0fa20592066ae66030bbb61a75 Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Fri, 11 Aug 2023 16:40:46 +0800 Subject: [PATCH 019/246] add indextype (#56112) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IR 的 builtin dialect 中加入 IndexType --- paddle/cinn/utils/attribute_util.h | 1 + paddle/fluid/ir/dialect/utils.h | 4 ++++ paddle/ir/core/builder.cc | 1 + paddle/ir/core/builder.h | 2 ++ paddle/ir/core/builtin_dialect.cc | 1 + paddle/ir/core/builtin_type.cc | 1 + paddle/ir/core/builtin_type.h | 2 ++ paddle/ir/core/ir_context.cc | 4 ++++ paddle/ir/core/ir_printer.cc | 2 ++ paddle/ir/core/type.h | 21 +++++++++++++++++++++ test/cpp/ir/core/ir_builder_test.cc | 1 + test/cpp/ir/core/ir_type_converter_test.cc | 20 ++++++++++++++++++++ test/cpp/ir/core/type_test.cc | 8 ++++++++ 13 files changed, 68 insertions(+) diff --git a/paddle/cinn/utils/attribute_util.h b/paddle/cinn/utils/attribute_util.h index b5caaf667b958..02e7fc1bf2823 100644 --- a/paddle/cinn/utils/attribute_util.h +++ b/paddle/cinn/utils/attribute_util.h @@ -87,6 +87,7 @@ common::Type ConvertIRType(::ir::Type type) { CASE_TYPE(Int16Type, I16) CASE_TYPE(Int32Type, I32) CASE_TYPE(Int64Type, I64) + CASE_TYPE(IndexType, I32) CASE_TYPE(BoolType, UI1) LOG(FATAL) << "unknown ir::Type " << type; diff --git a/paddle/fluid/ir/dialect/utils.h b/paddle/fluid/ir/dialect/utils.h index a81febc0cbab9..13a9f3d7ac8b8 100644 --- a/paddle/fluid/ir/dialect/utils.h +++ b/paddle/fluid/ir/dialect/utils.h @@ -66,6 +66,8 @@ static inline phi::DataType TransToPhiDataType(ir::Type dtype) { return phi::DataType::INT32; } else if (dtype.isa()) { return phi::DataType::INT64; + } else if (dtype.isa()) { + return phi::DataType::INT32; } else if (dtype.isa()) { return phi::DataType::BOOL; } else if (dtype.isa()) { @@ -79,6 +81,8 @@ static inline phi::DataType TransToPhiDataType(ir::Type dtype) { } } +// use phi::DataType::INT32 for IndexType from builtin type to phi::DataType, +// but only use INT32 not IndexType from phi::DataType type to builtin type. 
static inline ir::Type TransToIrDataType(phi::DataType dtype, ir::IrContext* ctx = nullptr) { if (ctx == nullptr) { diff --git a/paddle/ir/core/builder.cc b/paddle/ir/core/builder.cc index 954b46b08f897..1bfbd2e2a8ca8 100644 --- a/paddle/ir/core/builder.cc +++ b/paddle/ir/core/builder.cc @@ -49,6 +49,7 @@ BFloat16Type Builder::bfloat16_type() { return BFloat16Type::get(context_); } Float32Type Builder::float32_type() { return Float32Type::get(context_); } Float64Type Builder::float64_type() { return Float64Type::get(context_); } +IndexType Builder::index_type() { return IndexType::get(context_); } Int16Type Builder::int16_type() { return Int16Type::get(context_); } BoolType Builder::bool_type() { return BoolType::get(context_); } Complex64Type Builder::complex64_type() { return Complex64Type::get(context_); } diff --git a/paddle/ir/core/builder.h b/paddle/ir/core/builder.h index 74856cdaf7c0c..f3ae837ea9723 100644 --- a/paddle/ir/core/builder.h +++ b/paddle/ir/core/builder.h @@ -29,6 +29,7 @@ class BFloat16Type; class Float32Type; class Float64Type; class Int16Type; +class IndexType; class BoolType; class Complex64Type; class Complex128Type; @@ -114,6 +115,7 @@ class Builder { IR_API Int8Type int8_type(); IR_API VectorType vec_type(const std::vector &); IR_API BFloat16Type bfloat16_type(); + IR_API IndexType index_type(); IR_API Float32Type float32_type(); IR_API Float64Type float64_type(); IR_API Int16Type int16_type(); diff --git a/paddle/ir/core/builtin_dialect.cc b/paddle/ir/core/builtin_dialect.cc index a5e9605c2835e..3284a96c8b519 100644 --- a/paddle/ir/core/builtin_dialect.cc +++ b/paddle/ir/core/builtin_dialect.cc @@ -34,6 +34,7 @@ void BuiltinDialect::initialize() { Int16Type, Int32Type, Int64Type, + IndexType, BoolType, Complex64Type, Complex128Type, diff --git a/paddle/ir/core/builtin_type.cc b/paddle/ir/core/builtin_type.cc index 3a8e1030fb07f..8a0aea5745a5b 100644 --- a/paddle/ir/core/builtin_type.cc +++ b/paddle/ir/core/builtin_type.cc @@ -29,6 +29,7 
@@ IR_DEFINE_EXPLICIT_TYPE_ID(ir::Float64Type) IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int16Type) IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int32Type) IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int64Type) +IR_DEFINE_EXPLICIT_TYPE_ID(ir::IndexType) IR_DEFINE_EXPLICIT_TYPE_ID(ir::BoolType) IR_DEFINE_EXPLICIT_TYPE_ID(ir::Complex64Type) IR_DEFINE_EXPLICIT_TYPE_ID(ir::Complex128Type) diff --git a/paddle/ir/core/builtin_type.h b/paddle/ir/core/builtin_type.h index aa043f206d22e..9a2939110deac 100644 --- a/paddle/ir/core/builtin_type.h +++ b/paddle/ir/core/builtin_type.h @@ -73,6 +73,7 @@ class IR_API VectorType : public Type { __macro(Int16Type); \ __macro(Int32Type); \ __macro(Int64Type); \ + __macro(IndexType); \ __macro(BoolType); \ __macro(Complex64Type); \ __macro(Complex128Type); @@ -95,5 +96,6 @@ IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int16Type) IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int32Type) IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int64Type) IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::BoolType) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::IndexType) IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Complex64Type) IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Complex128Type) diff --git a/paddle/ir/core/ir_context.cc b/paddle/ir/core/ir_context.cc index 5c609f183c40d..54865e1ec38bd 100644 --- a/paddle/ir/core/ir_context.cc +++ b/paddle/ir/core/ir_context.cc @@ -156,6 +156,7 @@ class IrContextImpl { Float16Type fp16_type; Float32Type fp32_type; Float64Type fp64_type; + IndexType index_type; UInt8Type uint8_type; Int8Type int8_type; Int16Type int16_type; @@ -203,6 +204,7 @@ IrContext::IrContext() : impl_(new IrContextImpl()) { impl_->int16_type = TypeManager::get(this); impl_->int32_type = TypeManager::get(this); impl_->int64_type = TypeManager::get(this); + impl_->index_type = TypeManager::get(this); impl_->bool_type = TypeManager::get(this); impl_->complex64_type = TypeManager::get(this); impl_->complex128_type = TypeManager::get(this); @@ -343,6 +345,8 @@ Int32Type Int32Type::get(IrContext *ctx) { return 
ctx->impl().int32_type; } Int64Type Int64Type::get(IrContext *ctx) { return ctx->impl().int64_type; } +IndexType IndexType::get(IrContext *ctx) { return ctx->impl().index_type; } + Int8Type Int8Type::get(IrContext *ctx) { return ctx->impl().int8_type; } UInt8Type UInt8Type::get(IrContext *ctx) { return ctx->impl().uint8_type; } diff --git a/paddle/ir/core/ir_printer.cc b/paddle/ir/core/ir_printer.cc index f8549433f75c7..080e0bafc966a 100644 --- a/paddle/ir/core/ir_printer.cc +++ b/paddle/ir/core/ir_printer.cc @@ -59,6 +59,8 @@ void BasicIrPrinter::PrintType(Type type) { os << "i32"; } else if (type.isa()) { os << "i64"; + } else if (type.isa()) { + os << "index"; } else if (type.isa()) { os << "c64"; } else if (type.isa()) { diff --git a/paddle/ir/core/type.h b/paddle/ir/core/type.h index 62dcefdf3ba65..df148f17a2350 100644 --- a/paddle/ir/core/type.h +++ b/paddle/ir/core/type.h @@ -97,6 +97,27 @@ IR_API std::ostream &operator<<(std::ostream &os, Type type); } // namespace ir +/// +/// \brief This class represents the base of a type interface. +/// + +// template +// class TypeInterface : public ir::DialectInterface { +// public: +// using Base = TypeInterface; +// using DialectInterfaceBase = ir::DialectInterface; +// using DialectInterfaceBase::Base; + +// private: +// /// Returns the impl interface instance for the given type. +// static typename InterfaceBase::Concept *getInterfaceFor(Type type) { +// return type.getAbstractType().getInterface(); +// } + +// /// Allow access to 'getInterfaceFor'. +// friend InterfaceBase; +// }; + namespace std { /// /// \brief Enable hashing Type. 
diff --git a/test/cpp/ir/core/ir_builder_test.cc b/test/cpp/ir/core/ir_builder_test.cc index 3b70220a8d309..863bac72da9c2 100644 --- a/test/cpp/ir/core/ir_builder_test.cc +++ b/test/cpp/ir/core/ir_builder_test.cc @@ -31,6 +31,7 @@ TEST(builder_test, type_api) { EXPECT_EQ(ir::BFloat16Type::get(&ctx), builder.bfloat16_type()); EXPECT_EQ(ir::Float32Type::get(&ctx), builder.float32_type()); EXPECT_EQ(ir::Float64Type::get(&ctx), builder.float64_type()); + EXPECT_EQ(ir::IndexType::get(&ctx), builder.index_type()); EXPECT_EQ(ir::Int16Type::get(&ctx), builder.int16_type()); EXPECT_EQ(ir::BoolType::get(&ctx), builder.bool_type()); EXPECT_EQ(ir::Complex64Type::get(&ctx), builder.complex64_type()); diff --git a/test/cpp/ir/core/ir_type_converter_test.cc b/test/cpp/ir/core/ir_type_converter_test.cc index 896c1059dc664..26f4cde589171 100644 --- a/test/cpp/ir/core/ir_type_converter_test.cc +++ b/test/cpp/ir/core/ir_type_converter_test.cc @@ -65,3 +65,23 @@ TEST(TypeConverterTest, paramterless_type) { ir::Complex64Type, ir::Complex128Type>(); } + +void test_index_type() { + ir::IrContext* ctx = ir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + + ir::Type type = ir::IndexType::get(ctx); + std::stringstream ss; + ss << type; + EXPECT_GT(ss.str().size(), 0u); + EXPECT_EQ(ss.str(), "index"); + EXPECT_NE(ss.str(), "<>"); + phi::DataType phi_type = paddle::dialect::TransToPhiDataType(type); + auto& type_translator = paddle::translator::TypeTranslator::instance(); + paddle::framework::VarDesc empty_var_desc("empty"); + auto proto_type = paddle::framework::TransToProtoVarType(phi_type); + ir::Type final_type = type_translator[proto_type](ctx, empty_var_desc); + EXPECT_EQ(paddle::dialect::TransToIrDataType(phi_type), final_type); +} + +TEST(IndexTypeConverterTest, index_type) { test_index_type(); } diff --git a/test/cpp/ir/core/type_test.cc b/test/cpp/ir/core/type_test.cc index a748e1d5db88b..24bf92446c2a0 100644 --- a/test/cpp/ir/core/type_test.cc +++ 
b/test/cpp/ir/core/type_test.cc @@ -89,6 +89,14 @@ TEST(type_test, built_in_type) { &ir::AbstractType::lookup(bfp16_1.type_id(), ctx)); EXPECT_EQ(ir::BFloat16Type::classof(bfp16_1), 1); + ir::Type index_1 = ir::IndexType::get(ctx); + ir::Type index_2 = ir::IndexType::get(ctx); + EXPECT_EQ(index_1, index_2); + EXPECT_EQ(index_1.type_id(), index_2.type_id()); + EXPECT_EQ(&index_1.abstract_type(), + &ir::AbstractType::lookup(index_1.type_id(), ctx)); + EXPECT_EQ(ir::IndexType::classof(index_1), 1); + ir::Type fp16_1 = ir::Float16Type::get(ctx); ir::Type fp16_2 = ir::Float16Type::get(ctx); EXPECT_EQ(fp16_1, fp16_2); From 22dbcecaa1d8627a59401b515a67ed082d541b46 Mon Sep 17 00:00:00 2001 From: JYChen Date: Fri, 11 Aug 2023 17:04:26 +0800 Subject: [PATCH 020/246] remove fluid/tests (#56182) --- python/CMakeLists.txt | 4 ---- python/paddle/fluid/tests/.gitignore | 4 ---- python/paddle/fluid/tests/CMakeLists.txt | 9 --------- python/paddle/fluid/tests/__init__.py | 13 ------------- 4 files changed, 30 deletions(-) delete mode 100644 python/paddle/fluid/tests/.gitignore delete mode 100644 python/paddle/fluid/tests/CMakeLists.txt delete mode 100644 python/paddle/fluid/tests/__init__.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index a96f541580eb3..a915790c57cb2 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -186,10 +186,6 @@ add_custom_target(paddle_python ALL set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) -if(WITH_TESTING) - add_subdirectory(paddle/fluid/tests) -endif() - if(NOT WITH_SETUP_INSTALL) install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} DESTINATION opt/paddle/share/wheels) diff --git a/python/paddle/fluid/tests/.gitignore b/python/paddle/fluid/tests/.gitignore deleted file mode 100644 index 62f82151eb423..0000000000000 --- a/python/paddle/fluid/tests/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -image/ -fit_a_line.model/ -tmp -cuda_profiler.txt diff --git a/python/paddle/fluid/tests/CMakeLists.txt 
b/python/paddle/fluid/tests/CMakeLists.txt deleted file mode 100644 index ec4cf73570456..0000000000000 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -file( - GLOB TEST_OPS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/__init__.py b/python/paddle/fluid/tests/__init__.py deleted file mode 100644 index eca2dce114b06..0000000000000 --- a/python/paddle/fluid/tests/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. From f60c698f06128cd716e860dac16f2e556ced3370 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Fri, 11 Aug 2023 17:20:57 +0800 Subject: [PATCH 021/246] Fix the shape of input sin and cos for fused_rope. (#56132) * Fix the shape of input sin and cos for fused_rope. * Update shape in unittest. 
--- .../fusion/gpu/fused_rope_grad_kernel.cu | 1 + .../kernels/fusion/gpu/fused_rope_kernel.cu | 49 ++++++++++++------- .../test_fused_rotary_position_embedding.py | 20 +++++--- 3 files changed, 46 insertions(+), 24 deletions(-) diff --git a/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu index 4784acf646743..442317eb53d98 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu @@ -19,6 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu index f837793860a70..f6dcbc2a9038f 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu @@ -19,6 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" + namespace phi { namespace fusion { @@ -35,13 +36,14 @@ void FusedRopeKernel(const Context& dev_ctx, int64_t numel = q.numel(); if (numel <= 0) return; dev_ctx.template Alloc(out_q); - // small size for broadcast + + // q.shape: [batch_size, seq_len, num_heads, head_dim] auto batch_size = q.dims()[0]; + auto seq_len = q.dims()[1]; auto num_heads = q.dims()[2]; auto head_dim = q.dims()[3]; - auto seq_len = q.dims()[1]; - PADDLE_ENFORCE_NE(head_dim % 2, - 1, + PADDLE_ENFORCE_EQ(head_dim % 2, + 0, phi::errors::InvalidArgument( "The head_dim of input must be a multiple of 2.")); @@ -85,26 +87,37 @@ void FusedRopeKernel(const Context& dev_ctx, PADDLE_ENFORCE_EQ(sin.get_ptr()->dims(), cos.get_ptr()->dims(), phi::errors::InvalidArgument( - "The dims of sin and cos must be the same.")); + "The dims of sin and 
cos must be the same. But " + "recieved sin's dims is {%s}, cos's dims is {%s}.", + sin.get_ptr()->dims(), + cos.get_ptr()->dims())); + auto sin_dims = sin.get_ptr()->dims(); int dims_size = sin_dims.size(); - PADDLE_ENFORCE_NE((dims_size == 2 || dims_size == 4), - false, - phi::errors::InvalidArgument( - "The dims of sin and cos must be 2 or 4.")); + PADDLE_ENFORCE_EQ( + (dims_size == 2 || dims_size == 4), + true, + phi::errors::InvalidArgument("The dims of sin and cos is expected to " + "be 2 or 4, but recieved %d.", + dims_size)); if (dims_size == 4) { - PADDLE_ENFORCE_NE( - (sin_dims[0] == 1 && sin_dims[1] == 1), - false, + // sin.shape: [1, seq_len, 1, head_dim] + PADDLE_ENFORCE_EQ( + (sin_dims[0] == 1 && sin_dims[2] == 1), + true, phi::errors::InvalidArgument( "The batch_size and num_heads of sin and cos must be 1.")); } - PADDLE_ENFORCE_NE( - (sin_dims[dims_size - 1] == head_dim && - sin_dims[dims_size - 2] == seq_len), - false, - phi::errors::InvalidArgument("The seq_len and head_dim of sin and cos " - "must be the same as those of q.")); + int sin_seq_len_dim = (dims_size) == 4 ? 1 : 0; + PADDLE_ENFORCE_EQ((sin_dims[dims_size - 1] == head_dim && + sin_dims[sin_seq_len_dim] == seq_len), + true, + phi::errors::InvalidArgument( + "The seq_len and head_dim of sin and cos " + "must be the same as those of q. 
But recieved sin's " + "shape is {%s}, q's shape is {%s}.", + sin_dims, + q.dims())); sin_cos_data[0] = sin->data(); sin_cos_data[1] = cos->data(); diff --git a/test/legacy_test/test_fused_rotary_position_embedding.py b/test/legacy_test/test_fused_rotary_position_embedding.py index 737f2850d96cd..9842fbf1f4ee8 100644 --- a/test/legacy_test/test_fused_rotary_position_embedding.py +++ b/test/legacy_test/test_fused_rotary_position_embedding.py @@ -64,27 +64,35 @@ def get_sin_cos_tensor(seq_len, head_dim, sign): tensor_sin = paddle.reshape( paddle.to_tensor(sin_sin), - [1, 1, seq_len, head_dim], + [1, seq_len, 1, head_dim], ) tensor_cos = paddle.reshape( paddle.to_tensor(cos_cos), - [1, 1, seq_len, head_dim], + [1, seq_len, 1, head_dim], ) return tensor_sin, tensor_cos def paddle_fused_rotary_position_embedding(init_q, init_k, init_v): + # permute q, k, v from [batch_size, seq_len, num_heads, head_dim] + # to [batch_size, num_heads, seq_len, head_dim] q, k, v = deal_qkv(init_q, init_k, init_v) sin_tensor, cos_tensor = get_sin_cos_tensor(q.shape[2], q.shape[3], -1) + # permute sin, cos from [1, seq_len, 1, head_dim] + # to [1, 1, seq_len, head_dim] + perm = [0, 2, 1, 3] + sin_tensor = paddle.transpose(x=sin_tensor, perm=perm) + cos_tensor = paddle.transpose(x=cos_tensor, perm=perm) + query = mult_qkv(q, cos_tensor, sin_tensor) value = mult_qkv(v, cos_tensor, sin_tensor) key = mult_qkv(k, cos_tensor, sin_tensor) + # permute the result back to [batch_size, seq_len, num_heads, head_dim] r_query, r_key, r_value = deal_qkv(query, key, value) - return r_query, r_key, r_value @@ -94,7 +102,7 @@ def paddle_fused_rotary_position_embedding(init_q, init_k, init_v): ) class TestFusedRotaryPositionEmbedding(unittest.TestCase): def setUp(self): - self.shape = [1, 16, 1, 16] + self.shape = [1, 8, 2, 16] self.dtype = 'float32' self.training = True self.seed = 1203 @@ -138,7 +146,7 @@ def get_forward_backward(self, rope_function, seed, flag=0): return fw, bw - def 
test_fused_dropout_add(self): + def test_fused_rope(self): p_fw, p_bw = self.get_forward_backward( paddle_fused_rotary_position_embedding, seed=self.seed ) @@ -153,7 +161,7 @@ def test_fused_dropout_add(self): p_bw[i].numpy(), f_bw[i].numpy(), rtol=1e-05 ) - def test_fused_dropout_add_sin_cos(self): + def test_fused_rope_with_sin_cos(self): p_fw, p_bw = self.get_forward_backward( paddle_fused_rotary_position_embedding, seed=self.seed ) From 6eaed2da37dd7de57f132365768ca56db25163c6 Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Fri, 11 Aug 2023 17:29:29 +0800 Subject: [PATCH 022/246] remove the optimizer base and learning rate base (#56099) * remove the optimizer base and learning rate base * fix bug * fix bug --- python/paddle/amp/auto_cast.py | 3 +- .../auto_parallel/static/engine.py | 3 +- .../fleet/meta_optimizers/dgc_optimizer.py | 59 +- python/paddle/fluid/__init__.py | 2 - python/paddle/fluid/dygraph/__init__.py | 3 - .../fluid/dygraph/learning_rate_scheduler.py | 180 -- python/paddle/fluid/optimizer.py | 1448 ----------------- .../paddle/incubate/distributed/fleet/base.py | 6 +- .../distribute_transpiler/__init__.py | 1 - .../optimizer/distributed_fused_lamb.py | 2 +- python/paddle/incubate/optimizer/pipeline.py | 2 - python/paddle/optimizer/optimizer.py | 6 - python/paddle/static/__init__.py | 1 - python/paddle/static/amp/decorator.py | 4 +- test/legacy_test/test_dist_transpiler.py | 2 +- .../test_imperative_save_load_v2.py | 6 +- 16 files changed, 65 insertions(+), 1663 deletions(-) delete mode 100644 python/paddle/fluid/dygraph/learning_rate_scheduler.py delete mode 100755 python/paddle/fluid/optimizer.py diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 19007bccc48ca..b1e9b4c00b24f 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -250,7 +250,6 @@ def _is_valid_optimizer(optimizer): optimizer, ( paddle.optimizer.Optimizer, - paddle.fluid.optimizer.Optimizer, 
DygraphShardingOptimizer, ), ) @@ -260,7 +259,7 @@ def check_optimizers(optimizers): for optimizer in optimizers: if not _is_valid_optimizer(optimizer): raise RuntimeError( - "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.".format( + "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.".format( type(optimizer) ) ) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 853ff7d0aedbf..6bfb094f28346 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -146,11 +146,10 @@ def __init__( if optimizer and not isinstance( optimizer, - (paddle.optimizer.Optimizer, paddle.static.Optimizer), + (paddle.optimizer.Optimizer), ): raise TypeError( "'optimizer' must be object of class `paddle.optimizer.Optimizer`" - " or `paddle.static.Optimizer`." 
) self._optimizer = auto_utils.validate_opt(optimizer) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index f3f3bf950d507..4fa85994269a7 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -20,11 +20,11 @@ import paddle from paddle.common_ops_import import LayerHelper +from paddle.fluid import framework from paddle.fluid.dygraph import base as imperative_base -from paddle.fluid.optimizer import Optimizer from paddle.framework import core, in_dynamic_mode from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops -from paddle.optimizer import Momentum +from paddle.optimizer import Momentum, Optimizer from paddle.regularizer import L1Decay, L2Decay from paddle.static import create_global_var @@ -58,8 +58,8 @@ def __init__( assert momentum is not None super().__init__( learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, + parameters=parameter_list, + weight_decay=regularization, grad_clip=grad_clip, name=name, ) @@ -396,6 +396,55 @@ def _dgc_op( op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name] ) + def _process_distribute_lookuptable(self, param_grads): + """ + Because distribute lookup table only support SGD optimizer for now, not support + other optimizer and regularization, so we should find the table parameter out, + and avoid to add regularization and other op for it, and add sgd optimize op + for it independently. + :param param_grads(list((Var, Var))): list of (param, grad) pair. + :param loss: the loss variable. 
+ :param startup_program: the startup program + """ + from paddle.distributed.distribute_lookup_table import ( + find_distributed_lookup_table, + ) + + program = framework.default_main_program() + global_block = framework.default_main_program().global_block() + table_name = find_distributed_lookup_table(program) + table_param = None + table_grad = None + new_param_grads = [] + for p, g in param_grads: + if p.name == table_name: + if table_param is not None: + raise RuntimeError( + "multi dist table var found, only support one now!" + ) + table_param = p + table_grad = g + else: + new_param_grads.append((p, g)) + sgd_op = None + if table_param is not None: + param_and_grad = [table_param, table_grad] + with table_param.block.program._optimized_guard( + param_and_grad + ), framework.name_scope("optimizer"): + self._create_global_learning_rate() + # create the optimize op + sgd_op = global_block.append_op( + type='sgd', + inputs={ + "Param": table_param, + "Grad": table_grad, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={"ParamOut": param_and_grad[0]}, + ) + return new_param_grads, (table_param, table_grad), sgd_op + @imperative_base.no_grad() def apply_gradients(self, params_grads): # Note: since we can't use all_reduce_op now, @@ -532,7 +581,7 @@ def apply_gradients(self, params_grads): def apply_optimize(self, loss, startup_program, params_grads): self._init_dgc_opt() - return self.dgc_opt.apply_optimize( + return self.dgc_opt._apply_optimize( loss, startup_program=startup_program, params_grads=params_grads ) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d677d4e8d70d1..833576a9c7f8a 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -53,7 +53,6 @@ from .initializer import set_global_initializer from . import layers from . import dygraph -from . import optimizer from . import backward from .backward import gradients from . 
import incubate @@ -109,7 +108,6 @@ 'disable_dygraph', 'enable_imperative', 'disable_imperative', - 'optimizer', 'backward', 'LoDTensor', 'LoDTensorArray', diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index c40262a45d7c3..09cc385b948ed 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -18,9 +18,6 @@ from . import tracer from .tracer import * -from . import learning_rate_scheduler -from .learning_rate_scheduler import * __all__ = [] __all__ += base.__all__ -__all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py deleted file mode 100644 index 90746fa5c3422..0000000000000 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import warnings -import numpy as np - -import paddle -from .. import unique_name -from ..framework import Variable -from ..data_feeder import check_type - -__all__ = [] - - -class LearningRateDecay: - """ - Base class of learning rate decay - - Define the common interface of an LearningRateDecay. - User should not use this class directly, - but need to use one of it's implementation. 
- """ - - def __init__(self, begin=0, step=1, dtype='float32'): - self.step_num = begin - self.step_size = step - self.dtype = dtype - - def __call__(self): - lr = self.step() - if isinstance(lr, float): - lr = self.create_lr_var(lr) - self.step_num += self.step_size - return lr - - def create_lr_var(self, lr): - """ - convert lr from float to variable - - Args: - lr: learning rate - Returns: - learning rate variable - """ - from .. import layers - - lr = paddle.static.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(lr), - dtype=self.dtype, - persistable=False, - ) - return lr - - # Note: If you want to change what optimizer.state_dict stores, just overwrite this functions, - # "self.step_num" will be stored by default. - def state_dict(self): - """ - Returns the state of the scheduler as a :class:`dict`. - - It is a subset of self.__dict__ . - """ - self._state_keys() - state_dict = {} - for key in self.keys: - if key not in self.__dict__: - continue - value = self.__dict__[key] - if isinstance(value, Variable): - assert ( - value.size == 1 - ), "the size of Variable in state_dict must be 1, but its size is {} with shape {}".format( - value.size, value.shape - ) - value = value.item() - state_dict[key] = value - - return state_dict - - def _state_keys(self): - """ - set the keys in self.__dict__ that are needed to be saved. - """ - self.keys = ['step_num'] - - def set_state_dict(self, state_dict): - """ - Loads the schedulers state. - """ - self._state_keys() - for key in self.keys: - if key in state_dict: - self.__dict__[key] = state_dict[key] - else: - raise RuntimeError( - "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".format( - key - ) - ) - if len(state_dict) > len(self.keys): - warnings.warn( - "There are some unused values in state_dict. 
Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict" - ) - - # [aliases] Compatible with old method names - set_dict = set_state_dict - - def step(self): - raise NotImplementedError() - - -class _LearningRateEpochDecay(LearningRateDecay): - """ - :api_attr: imperative - - Base class of learning rate decay, which is updated each epoch. - - Define the common interface of an _LearningRateEpochDecay. - User should not use this class directly, - but need to use one of it's implementation. And invoke method: `epoch()` each epoch. - """ - - def __init__(self, learning_rate, dtype=None): - if not isinstance(learning_rate, (float, int)): - raise TypeError( - "The type of 'learning_rate' must be 'float, int', but received %s." - % type(learning_rate) - ) - if learning_rate < 0: - raise ValueError("Invalid learning rate: {}".format(learning_rate)) - - self.base_lr = float(learning_rate) - - self.epoch_num = -1 - self.dtype = dtype - if dtype is None: - self.dtype = "float32" - self.learning_rate = self.create_lr_var(self.base_lr) - - self.epoch() - - # For those subclass who overload _LearningRateEpochDecay, "self.epoch_num/learning_rate" will be stored by default. - # you can change it for your subclass. - def _state_keys(self): - self.keys = ['epoch_num', 'learning_rate'] - - def __call__(self): - """ - Return last computed learning rate on current epoch. - """ - if not isinstance(self.learning_rate, Variable): - self.learning_rate = self.create_lr_var(self.learning_rate) - return self.learning_rate - - def epoch(self, epoch=None): - """ - compueted learning_rate and update it when invoked. 
- """ - if epoch is None: - self.epoch_num += 1 - else: - self.epoch_num = epoch - - self.learning_rate = self.get_lr() - - def get_lr(self): - raise NotImplementedError diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py deleted file mode 100755 index c986d39aafe46..0000000000000 --- a/python/paddle/fluid/optimizer.py +++ /dev/null @@ -1,1448 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import os -import logging -from collections import defaultdict - -import paddle - - -from paddle.fluid.framework import ( - Program, - Variable, - Parameter, - name_scope, - default_main_program, - default_startup_program, - device_guard, -) - -from . import framework -from . import layers -from . 
import unique_name -from .backward import ( - append_backward, - _some_in_set_, - _append_grad_suffix_, - _get_no_grad_set_name, -) -from .framework import program_guard -from .layer_helper import LayerHelper -from .dygraph import base as imperative_base -from .dygraph import no_grad -from .dygraph.learning_rate_scheduler import ( - LearningRateDecay, - _LearningRateEpochDecay, -) -from paddle.fluid import core -from functools import reduce -from functools import cmp_to_key -from .wrapped_decorator import signature_safe_contextmanager -import warnings -from paddle import _C_ops, _legacy_C_ops -from ..fluid.framework import ( - in_dygraph_mode, - _current_expected_place, -) - -__all__ = [] - - -class Optimizer: - """Optimizer Base class. - - Define the common interface of an optimizer. - User should not use this class directly, - but need to use one of it's implementation. - """ - - @imperative_base.no_grad - def __init__( - self, - learning_rate, - parameter_list=None, - regularization=None, - grad_clip=None, - flatten_param_grads=False, - align_size=-1, - name=None, - ): - """ - Args: - flatten_param_grads (bool, optional): Whether to flatten all the parameters and grads. - If true, the parameters and gradients will be coalesce to contiguous mempry, - and the grad_clip ops / optimizer ops will be fuse to one operator. - """ - # Because of the loop import, so place it in the function body - from paddle.optimizer.lr import LRScheduler - - self._parameter_list = ( - list(parameter_list) if parameter_list is not None else None - ) - self._name = name - if in_dygraph_mode(): - if not isinstance( - learning_rate, (float, LearningRateDecay, LRScheduler) - ): - raise TypeError( - "learning rate should be float or LRScheduler, got %s here" - % type(learning_rate) - ) - if self._parameter_list is None: - raise AttributeError( - "parameter_list argument given to the Optimizer should not be None in dygraph mode." 
- ) - if regularization is not None: - for param in self._parameter_list: - if param.regularizer is not None: - logging.info( - "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " - "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" - % regularization.__str__() - ) - break - else: - if not isinstance( - learning_rate, (float, framework.Variable, LRScheduler) - ): - raise TypeError( - "learning rate should be float or LRScheduler, got %s here" - % type(learning_rate) - ) - - if grad_clip is not None: - if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase): - raise TypeError( - "'grad_clip' should be an instance of GradientClipBase's derived class" - ) - self.regularization = regularization - self._grad_clip = grad_clip - self._learning_rate = learning_rate - self._flatten_param_grads = flatten_param_grads - self._align_size = align_size - - self._dtype = None - # Infer the dtype form parameter - if self._parameter_list: - self._dtype = self._parameter_list[0].dtype - - # each program should have a independent learning rate - # program -> Variable(learning_rate) - self._learning_rate_map = dict() - if isinstance(self._learning_rate, framework.Variable): - self._learning_rate_map[ - framework.default_main_program() - ] = self._learning_rate - # Dictionary of accumulators. Some optimizer subclasses need to - # allocate and manage extra variables associated with the parameters - # to train. These variables are called accumulators. 
- # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} - self._accumulators = defaultdict(lambda: dict()) - # global_accumulator dict, {accum_name : acc_variable, ...} - self._global_accumulators = {} - self.helper = LayerHelper(self.__class__.__name__) - self._opti_name_list = [] - self._accumulators_holder = {} - self._param_device_map = dict() - # NOTE(zhiqiu): sometimes we want to add some variables(Tenosr) to the optimizer for a specific optimization, - # for example, we want to pass 'found_inf' to adam optimizer so it can skip update when found_inf is True. - # And these variables should not be the parameters of Optimizer's construnctor (because not commonly used). - # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. - self._auxiliary_vars = dict() - - @framework.dygraph_only - def state_dict(self): - ''' - Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. - If the optimizer never be called(minimize function), the state_dict is empty. - - Args: None - Return: - state_dict(dict) : dict contains all the variable used by optimizer - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - - with fluid.dygraph.guard(): - emb = paddle.nn.Embedding(10, 10) - - adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) - state_dict = adam.state_dict() - - ''' - from paddle.optimizer.lr import LRScheduler - - state_dict = {} - for k, v in self._accumulators.items(): - for para_name, var_tmp in v.items(): - state_dict[var_tmp.name] = var_tmp - for k, v in self._global_accumulators.items(): - state_dict[v.name] = v - # global step if use lr decay - if isinstance(self._learning_rate, LRScheduler): - state_dict["LR_Scheduler"] = self._learning_rate.state_dict() - return state_dict - if isinstance(self._learning_rate, LearningRateDecay): - state_dict["LR_Scheduler"] = self._learning_rate.state_dict() - - if not isinstance(self._learning_rate, _LearningRateEpochDecay): - var_tmp = None - var_temp = framework._create_tensor( - None, name='global_step', dtype='int32' - ) - - paddle.tensor.fill_constant( - [1], "int32", self._learning_rate.step_num, out=var_temp - ) - - state_dict['global_step'] = var_temp - return state_dict - - @framework.dygraph_only - def set_state_dict(self, state_dict): - ''' - Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. - - Args: - state_dict(dict) : Dict contains all the Variable needed by optimizer - Return: - None - - Examples: - .. 
code-block:: python - - import paddle - - paddle.disable_static() - - emb = paddle.nn.Embedding(10, 10) - - state_dict = emb.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") - - scheduler = paddle.optimizer.lr.NoamDecay( - d_model=0.01, warmup_steps=100, verbose=True) - adam = paddle.optimizer.Adam( - learning_rate=scheduler, - parameters=emb.parameters()) - state_dict = adam.state_dict() - paddle.save(state_dict, "paddle_dy.pdopt") - - para_state_dict = paddle.load("paddle_dy.pdparams") - opti_state_dict = paddle.load("paddle_dy.pdopt") - ''' - from paddle.optimizer.lr import LRScheduler - - if isinstance(self._learning_rate, LRScheduler): - self._learning_rate.set_dict(state_dict["LR_Scheduler"]) - - if isinstance(self._learning_rate, LearningRateDecay): - self._learning_rate.set_dict(state_dict["LR_Scheduler"]) - - if not isinstance(self._learning_rate, _LearningRateEpochDecay): - assert ( - 'global_step' in state_dict - ), 'Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dict' - global_step = state_dict['global_step'] - - if isinstance(global_step, Variable): - step_np = global_step - step_np = np.array(step_np.value().get_tensor()) - assert step_np.shape == ( - 1, - ), "global step shape is (1,), the shape is {}".format( - step_np.shape - ) - - self._learning_rate.step_num = int(step_np[0]) - elif isinstance(global_step, np.ndarray): - assert global_step.shape == ( - 1, - ), "global step shape is (1,), the shape is {}".format( - global_step.shape - ) - self._learning_rate.step_num = global_step[0] - else: - raise RuntimeError( - "Type not supprt, value in state dict must be [Tensor, Variable, numpy], the type is ", - type(global_step), - ) - - def _load_state_para(state_dict, param): - var = param.value() - tensor = var.get_tensor() - model_np = np.array(tensor) - load_para = state_dict[param.name] - if isinstance(load_para, Variable): - load_para_np = load_para.numpy() - elif isinstance(load_para, 
core.eager.Tensor): - load_para_np = load_para.numpy() - elif isinstance(load_para, np.ndarray): - load_para_np = load_para - else: - raise RuntimeError( - "State dict type {} not supprt".format(str(type(load_para))) - ) - - assert ( - model_np.shape == load_para_np.shape - ), "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - param.name, model_np.shape, load_para_np.shape - ) - - assert ( - model_np.dtype == load_para_np.dtype - ), "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - param.name, model_np.dtype, load_para_np.dtype - ) - - tensor.set(load_para_np, framework._current_expected_place()) - - self._accumulators_holder = state_dict - for k, v in self._accumulators.items(): - for para_name, var_tmp in v.items(): - assert ( - var_tmp.name in state_dict - ), "optimizer variable {} not found".format(var_tmp.name) - _load_state_para(state_dict, var_tmp) - - for k, v in self._global_accumulators.items(): - assert ( - v.name in state_dict - ), "optimizer variable {} not found".format(v.name) - _load_state_para(state_dict, v) - - # [aliases] Compatible with old method names - set_dict = set_state_dict - - def get_opti_var_name_list(self): - return self._opti_name_list - - def _set_auxiliary_var(self, key, val): - self._auxiliary_vars[key] = val - - def _get_auxiliary_var(self, key): - if key in self._auxiliary_vars: - return self._auxiliary_vars[key] - else: - return None - - def _create_global_learning_rate(self): - from paddle.optimizer.lr import LRScheduler - - if isinstance(self._learning_rate, LRScheduler): - lr_var = self._global_learning_rate() - # only create global lr_var once - if not isinstance(lr_var, framework.Variable): - lr_name = unique_name.generate('learning_rate') - self._learning_rate._var_name = lr_name - lr_var = self.helper.create_global_variable( - name=lr_name, - shape=[1], - persistable=True, - 
stop_gradient=True, - dtype='float32' if self._dtype is None else self._dtype, - ) - main_prog = framework.default_main_program() - main_prog.lr_scheduler = self._learning_rate - main_prog.lr_var = lr_var - self._learning_rate_map[ - framework.default_main_program() - ] = lr_var - - lr_value = float(self._learning_rate()) - self.helper.set_variable_initializer( - lr_var, - initializer=paddle.nn.initializer.Constant(value=lr_value), - ) - return - - if imperative_base.enabled(): - # create learning rate Variable - if isinstance(self._learning_rate, float): - lr = self._global_learning_rate() - - if isinstance(lr, framework.Variable): - return - else: - self._learning_rate_map[ - framework.default_main_program() - ] = paddle.static.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(self._learning_rate), - dtype='float32' if self._dtype is None else self._dtype, - persistable=True, - ) - # get learning rate Variable from LearningRateDecay - elif isinstance(self._learning_rate, LearningRateDecay): - self._learning_rate_map[ - framework.default_main_program() - ] = self._learning_rate() - else: - raise TypeError( - "optimizer's learning rate must be float or LearningRateDecay" - ) - else: - lr = self._global_learning_rate() - - if isinstance(lr, framework.Variable): - return - else: - if not isinstance(self._learning_rate, float): - raise TypeError( - "learning rate variable is create outside optimizer," - "can not create new learning rate variable for new program" - ) - - # create learning rate in the current main program - self._learning_rate_map[ - framework.default_main_program() - ] = paddle.static.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(self._learning_rate), - dtype='float32' if self._dtype is None else self._dtype, - persistable=True, - ) - - @framework.dygraph_only - def set_lr(self, value): - """ - :api_attr: imperative - - Set the value of the learning rate manually in 
the optimizer. If the optimizer use LearningRateDecay, - this API cannot be invoked, because it will lead to conflict. - - Args: - value (float|Variable): the value of learning rate - - Returns: - None - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import paddle - - with fluid.dygraph.guard(): - linear = paddle.nn.Linear(10, 10) - - adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) - - # set learning rate manually by python float value - lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] - for i in range(5): - adam.set_lr(lr_list[i]) - lr = adam.get_lr() - print("current lr is {}".format(lr)) - # Print: - # current lr is 0.2 - # current lr is 0.3 - # current lr is 0.4 - # current lr is 0.5 - # current lr is 0.6 - - - - - - """ - if not isinstance(value, (framework.Variable, float)): - raise TypeError( - "The type of 'value' in optimizer.set_lr must be (float, Variable), but received %s." - % (type(value)) - ) - if isinstance(self._learning_rate, LearningRateDecay): - raise RuntimeError( - "optimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict." 
- ) - if isinstance(value, float): - self._learning_rate = value - current_lr = self._global_learning_rate() - if current_lr is not None: - if in_dygraph_mode(): - place = _current_expected_place() - _C_ops.full_( - current_lr, - list(current_lr.shape), - float(value), - current_lr.dtype, - place, - ) - else: - global_block = ( - framework.default_main_program().global_block() - ) - global_block.append_op( - type='fill_constant', - outputs={'Out': [current_lr]}, - attrs={ - 'dtype': current_lr.dtype, - 'shape': list(current_lr.shape), - 'value': float(value), - }, - stop_gradient=True, - ) - else: - assert ( - len(value.shape) == 1 and value.shape[0] == 1 - ), "optimizer's learning rate must be 1-D Tensor with shape[1]" - self._learning_rate_map[framework.default_main_program()] = value - - @framework.dygraph_only - def current_step_lr(self): - """ - :api_attr: imperative - - Get current step learning rate. The return value is all the same When LearningRateDecay is not used, - otherwise return the step learning rate. - - Returns: - float: The learning rate of the current step. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - # example1: LearningRateDecay is not used, return value is all the same - with fluid.dygraph.guard(): - emb = paddle.nn.Embedding(10, 10) - adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) - lr = adam.get_lr() - print(lr) # 0.001 - - # example2: PiecewiseDecay is used, return the step learning rate - with fluid.dygraph.guard(): - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") - linear = paddle.nn.Linear(10, 10) - inp = fluid.dygraph.to_variable(inp) - out = linear(inp) - loss = paddle.mean(out) - - bd = [2, 4, 6, 8] - value = [0.2, 0.4, 0.6, 0.8, 1.0] - adam = paddle.optimizer.Adam(paddle.optimizer.lr.PiecewiseDecay(bd, value), - parameters=linear.parameters()) - - # first step: learning rate is 0.2 - np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) # True - - # learning rate for different steps - ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] - for i in range(12): - adam.minimize(loss) - adam.step() - lr = adam.get_lr() - np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True - - """ - current_lr = self._global_learning_rate() - if isinstance(current_lr, framework.Variable): - return float(current_lr) - - if isinstance(self._learning_rate, float): - return self._learning_rate - elif isinstance(self._learning_rate, _LearningRateEpochDecay): - step_lr = self._learning_rate() - return float(step_lr) - else: - step_lr = self._learning_rate.step() - if isinstance(step_lr, (float, int)): - return step_lr - else: - return float(step_lr) - - def _global_learning_rate(self, program=None): - """ - get global decayed learning rate - :return: - """ - if program is None: - program = framework.default_main_program() - return self._learning_rate_map.get(program, None) - - def _append_optimize_op(self, block, param_and_grad): - """append optimize operator to block and return all the added optimize_op""" - raise 
NotImplementedError() - - def _create_param_lr(self, param_and_grad): - # create learning rate variable for every parameter - param = param_and_grad[0] - param_lr = param.optimize_attr['learning_rate'] - if type(param_lr) == Variable: - return param_lr - else: - if param_lr == 1.0: - return self._global_learning_rate() - else: - with default_main_program()._lr_schedule_guard( - is_with_opt=True - ), framework.name_scope('scale_with_param_lr'): - return self._global_learning_rate() * param_lr - - def _is_dtype_fp16_or_bf16(self, dtype): - """ - check the dtype is fp16 or the dtype is bf16 - :param dtype: instance of core.VarDesc.VarType - :return: True if dtype is one of fp16 or bf16, False otherwise - """ - assert isinstance( - dtype, core.VarDesc.VarType - ), "The dtype should be an instance of core.VarDesc.VarType." - return ( - dtype == core.VarDesc.VarType.FP16 - or dtype == core.VarDesc.VarType.BF16 - ) - - def _create_master_weight(self, param): - if param.name in self._master_weights: - var = self._master_weights[param.name] - else: - assert isinstance(self.helper, LayerHelper) - - var_name = param.name + "_fp32_master" - var_name = unique_name.generate(var_name) - var = paddle.static.create_global_var( - name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True, - ) - block = self.helper.startup_program.global_block() - block.append_op( - type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32, - }, - ) - self._master_weights[param.name] = var - return var - - def _create_accumulators(self, block, parameters): - """Create all accumulators needed by the parameters - - Args: - block: the block in which the loss variable is present - parameters: list of parameter variables for the optimizer - """ - pass - - def _finish_update(self, block, parameters_and_grads): - """Finish any custom updates needed - before completing an optimization step - - Args: 
- block: the block in which the loss variable is present - parameters: list of parameter variables for the optimizer - - Returns: - None - """ - pass - - def _add_accumulator( - self, - name, - param, - dtype=None, - fill_value=0.0, - shape=None, - type=None, - device=None, - ): - """Utility function to add an accumulator for a parameter - - Args: - block: the block in which the loss variable is present - name: name of the accumulator - param: parameter variable for which accumulator is to be added - dtype: data type of the accumulator variable - fill_value: value to initialize the accumulator variable - """ - if self._name is not None: - name = self._name + "_" + name - if ( - name in self._accumulators - and param.name in self._accumulators[name] - ): - if in_dygraph_mode(): - return self._accumulators[name][param.name] - raise Exception( - "Accumulator {} already exists for parameter {}".format( - name, param.name - ) - ) - if shape is None: - shape = param.shape - assert isinstance(self.helper, LayerHelper) - - var_name = param.name + "_" + name - var_name = unique_name.generate(var_name) - self._opti_name_list.append(var_name) - - var = self.helper.create_global_variable( - name=var_name, - persistable=True, - dtype=dtype or param.dtype, - type=core.VarDesc.VarType.LOD_TENSOR - if in_dygraph_mode() - else (param.type if type is None else type), - shape=shape, - belong_to_optimizer=True, - ) - if device is None: - device = self._get_device_for_param(param.name) - with device_guard(device): - self.helper.set_variable_initializer( - var, - initializer=paddle.nn.initializer.Constant( - value=float(fill_value) - ), - ) - - if in_dygraph_mode(): - if len(self._accumulators_holder) > 0: - assert ( - var_name in self._accumulators_holder - ), "Optimizer set error, {} should in state dict".format( - var_name - ) - var.set_value(self._accumulators_holder[var_name]) - - self._accumulators[name][param.name] = var - return var - - def _add_global_accumulator( - self, - 
name, - dtype=None, - fill_value=0.0, - shape=None, - type=None, - device=None, - ): - """Utility function to add a global accumulator for all parameters in the model - - Args: - block: the block in which the loss variable is present - name: name of the accumulator - dtype: data type of the accumulator variable - fill_value: value to initialize the accumulator variable - shape: the shape of the accumulator - type: the variable type of the accumulator - device: the target place of the accumulator - """ - if self._name is not None: - name = self._name + "_" + name - if name in self._global_accumulators: - if in_dygraph_mode(): - return self._global_accumulators[name] - raise Exception("Global accumulator {} already exists".format(name)) - if shape is None: - shape = [1] # most case, global accumulator is of shape [1] - assert isinstance(self.helper, LayerHelper) - - var_name = name - var_name = unique_name.generate(var_name) - self._opti_name_list.append(var_name) - - var = self.helper.create_global_variable( - name=var_name, - persistable=True, - dtype=dtype if dtype else self._dtype, - type=type, - shape=shape, - belong_to_optimizer=True, - ) - if device is None: - device = 'cpu' - with device_guard(device): - self.helper.set_variable_initializer( - var, - initializer=paddle.nn.initializer.Constant( - value=float(fill_value) - ), - ) - - if in_dygraph_mode(): - if len(self._accumulators_holder) > 0: - assert ( - var_name in self._accumulators_holder - ), "Optimizer set error, {} should in state dict".format( - var_name - ) - var.set_value(self._accumulators_holder[var_name]) - - self._global_accumulators[name] = var - return var - - def _get_accumulator(self, name, param): - """Utility function to fetch an accumulator for a parameter - - Args: - name: name of the accumulator - param: parameter variable for which accumulator is to be fetched - - Returns: - accumulator variable - """ - if self._name is not None: - name = self._name + "_" + name - if ( - name not in 
self._accumulators - or param.name not in self._accumulators[name] - ): - raise Exception( - "Accumulator {} does not exist for parameter {}".format( - name, param.name - ) - ) - return self._accumulators[name][param.name] - - def _get_accumulator_master(self, name, param): - """Utility function to fetch an accumulator for a parameter - Args: - name: name of the accumulator - param: parameter variable for which accumulator is to be fetched - Returns: - accumulator variable for the parameter - """ - if self._name is not None: - name = self._name + "_" + name - find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( - param.dtype - ) - target_param = ( - self._master_weights[param.name] if find_master else param - ) - target_name = target_param.name - if ( - name not in self._accumulators - or target_name not in self._accumulators[name] - ): - raise Exception( - "Accumulator {} does not exist for parameter {}".format( - name, target_name - ) - ) - return self._accumulators[name][target_name] - - def _get_global_accumulator(self, name): - """Utility function to fetch a global accumulator - - Args: - name: name of the accumulator - - Returns: - accumulator variable - """ - if self._name is not None: - name = self._name + "_" + name - if name not in self._global_accumulators: - raise Exception("Global accumulator {} does not exist".format(name)) - return self._global_accumulators[name] - - def _update_param_device_map(self, parameters_and_grads, target_block): - for param_and_grad in parameters_and_grads: - if param_and_grad[0].trainable is True: - param_name = param_and_grad[0].name - ops = target_block.ops - device_attr_name = ( - core.op_proto_and_checker_maker.kOpDeviceAttrName() - ) - for op in ops: - input_arg_names = op.input_arg_names - if param_name in input_arg_names: - self._param_device_map[param_name] = op.attr( - device_attr_name - ) - break - - def _get_device_for_param(self, param_name): - device = None - if param_name in 
self._param_device_map: - device = self._param_device_map[param_name] - return device - - def _create_optimization_pass(self, parameters_and_grads): - """Add optimization operators to update gradients to variables. - - Args: - parameters_and_grads(list(tuple(Variable, Variable))): - a list of (variable, gradient) pair to update. - - Returns: - return_op_list: a list of operators that will complete one step of - optimization. This will include parameter update ops, global step - update ops and any other custom ops required by subclasses to manage - their internal state. - """ - # This is a default implementation of create_optimization_pass that - # can be shared by most optimizers. This implementation assumes that - # the subclass will implement the _append_optimize_op method and the - # _initialize_tensors method. The subclass can extend the - # _create_accumulators method if it needs to create accumulators - # for parameters and extend _finish_update method to add custom ops. - - # Allways called under program_guard use global block as loss block - # But if current block is in control flow, append optimize op in the - # grad block of current block - - global_block = framework.default_main_program().global_block() - target_block = global_block - current_block = framework.default_main_program().current_block() - if current_block.idx != global_block.idx: - assert ( - current_block.backward_block_idx != -1 - ), "current block is not global_block, but it doesn't have backward block." 
- target_block = framework.default_main_program().blocks[ - current_block.backward_block_idx - ] - - start = len(target_block.ops) - - self._update_param_device_map(parameters_and_grads, target_block) - self._create_accumulators( - target_block, [p[0] for p in parameters_and_grads if p[0].trainable] - ) - self._create_global_learning_rate() - - if in_dygraph_mode(): - found_inf = self._get_auxiliary_var('found_inf') - if found_inf: - if isinstance(found_inf, core.eager.Tensor): - self._set_auxiliary_var('found_inf', True) - else: - if isinstance(found_inf, core.eager.Tensor): - self._set_auxiliary_var('found_inf', False) - for param_and_grad in parameters_and_grads: - if param_and_grad[1] is None: - continue - if param_and_grad[0].trainable is True: - self._append_optimize_op(target_block, param_and_grad) - else: - for param_and_grad in parameters_and_grads: - if param_and_grad[1] is None: - continue - with param_and_grad[0].block.program._optimized_guard( - param_and_grad - ), name_scope("optimizer"): - if param_and_grad[0].trainable is True: - device = self._get_device_for_param( - param_and_grad[0].name - ) - with device_guard(device): - optimize_op = self._append_optimize_op( - target_block, param_and_grad - ) - - # Get custom finish ops for subclasses - # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(target_block, parameters_and_grads) - - end = len(target_block.ops) - return target_block._slice_ops(start, end) - - def _process_distribute_lookuptable(self, param_grads): - """ - Because distribute lookup table only support SGD optimizer for now, not support - other optimizer and regularization, so we should find the table parameter out, - and avoid to add regularization and other op for it, and add sgd optimize op - for it independently. - :param param_grads(list((Var, Var))): list of (param, grad) pair. - :param loss: the loss variable. 
- :param startup_program: the startup program - """ - from paddle.distributed.distribute_lookup_table import ( - find_distributed_lookup_table, - ) - - program = framework.default_main_program() - global_block = framework.default_main_program().global_block() - table_name = find_distributed_lookup_table(program) - table_param = None - table_grad = None - new_param_grads = [] - for p, g in param_grads: - if p.name == table_name: - if table_param is not None: - raise RuntimeError( - "multi dist table var found, only support one now!" - ) - table_param = p - table_grad = g - else: - new_param_grads.append((p, g)) - sgd_op = None - if table_param is not None: - param_and_grad = [table_param, table_grad] - with table_param.block.program._optimized_guard( - param_and_grad - ), framework.name_scope("optimizer"): - self._create_global_learning_rate() - # create the optimize op - sgd_op = global_block.append_op( - type='sgd', - inputs={ - "Param": table_param, - "Grad": table_grad, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={"ParamOut": param_and_grad[0]}, - ) - return new_param_grads, (table_param, table_grad), sgd_op - - def backward( - self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None, - callbacks=None, - ): - """ - The first part of ``minimize``, do auto-diff to append backward operations for - the current program. - - Args: - loss (Variable): ``loss`` variable to run optimizations. - startup_program (Program, optional): :ref:`api_fluid_Program` for - initializing parameters in ``parameter_list``. The default value - is None, at this time :ref:`api_fluid_default_startup_program` will be used. - parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update - to minimize ``loss``. The default value is None, at this time all parameters - will be updated. - no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need - to be updated. The default value is None. 
- callbacks (list, optional): list of callable objects to run when appending backward - operator for one parameter. The default value is None. - - Return: - list: list of (param, grad) variable pairs, param is ``Parameter``, - grad is the gradient value corresponding to the parameter. - - Examples: - See examples in ``apply_gradients``. - """ - act_no_grad_set = None - if in_dygraph_mode(): - pass - else: - act_no_grad_set = self._get_no_grad_set(loss, no_grad_set) - - # Infer dtype by loss if None - if self._dtype is None: - self._dtype = loss.dtype - - if in_dygraph_mode(): - parameter_list = ( - parameter_list if parameter_list else self._parameter_list - ) - - params_grads = [] - for param in parameter_list: - if not param.trainable: - continue - if param._grad_ivar() is not None: - # create gradient variable - grad_var = param._grad_ivar() - params_grads.append((param, grad_var)) - else: - if callbacks is None: - callbacks = [paddle.nn.clip.error_clip_callback] - else: - assert isinstance(callbacks, list) - program = loss.block.program - assert np.prod(loss.shape) == 1, ( - "The number of elements of loss should be 1, but the current loss.shape is {}, whose number of elements is not 1. " - "Maybe that you should call paddle.mean to process the current loss.".format( - loss.shape - ) - ) - parameter_list = ( - parameter_list if parameter_list else self._parameter_list - ) - with program_guard(program, startup_program): - params_grads = append_backward( - loss, parameter_list, act_no_grad_set, callbacks - ) - return params_grads - - def _create_regularization_of_grad(self, param, grad, regularization=None): - """Create and add backward regularization Operators - - Function helper of append_regularization_ops. 
- """ - # If no gradient or no regularization is specified, then we don't need to do anything - if grad is None or ( - ( - not hasattr(param, 'regularizer') - or (hasattr(param, 'regularizer') and param.regularizer is None) - ) - and regularization is None - ): - return grad - regularization_term = None - if hasattr(param, 'regularizer') and param.regularizer is not None: - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad, grad.block) - elif regularization is not None: - regularization_term = regularization(param, grad, grad.block) - - assert regularization_term is not None - - if in_dygraph_mode(): - return _legacy_C_ops.sum([grad, regularization_term]) - - new_grad = grad - if grad.type == core.VarDesc.VarType.SELECTED_ROWS: - # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, - # the grad's type and name will be changed. But the gradient's name - # is used in ParallelExecutor Reduce mode, so I add a flag for - # the new_grad here. - new_grad = grad.block.create_var( - name=grad.name + core.kNewGradSuffix(), - dtype=param.dtype, - shape=param.shape, - lod_level=param.lod_level, - type=core.VarDesc.VarType.LOD_TENSOR, - ) - - inputs = {"X": [grad, regularization_term]} - outputs = {"Out": [new_grad]} - grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) - - return new_grad - - def append_regularization_ops( - self, parameters_and_grads, regularization=None - ): - r"""Create and add backward regularization Operators - - Creates and adds backward regularization operators in the BlockDesc. - This will add gradients of the regularizer function to the gradients - of the parameters and return these modified gradients. This is the - same as implementing weight decay in optimizers for regularization. - - Args: - parameters_and_grads: A list of (parameters, gradients) pairs - that need to be regularized. - regularization: A global regularizer. If the parameter is not - set. 
It will be applied with regularizer. - - Returns: - list[(Variable, Variable)]: list of (parameters, gradients) \ - pair with the regularized gradient - - Raises: - Exception: Unknown regularization type - """ - params_and_grads = [] - if in_dygraph_mode(): - for param, grad in parameters_and_grads: - new_grad = self._create_regularization_of_grad( - param, grad, regularization - ) - params_and_grads.append((param, new_grad)) - else: - repeate_regularizer = False - with framework.name_scope('regularization'): - for param, grad in parameters_and_grads: - if ( - not repeate_regularizer - and getattr(param, 'regularizer', None) is not None - and regularization is not None - ): - repeate_regularizer = True - logging.info( - "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " - "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" - % regularization.__str__() - ) - with param.block.program._optimized_guard([param, grad]): - new_grad = self._create_regularization_of_grad( - param, grad, regularization - ) - params_and_grads.append((param, new_grad)) - return params_and_grads - - def flatten_param_grads(self, params_grads): - need_flatten_params = [] - need_flatten_grads = [] - for p, g in params_grads: - if g is None: - continue - g.persistable = True - if ( - getattr(p, 'need_clip', True) is False - or getattr(p, 'regularizer', None) is not None - ): - warnings.warn( - "flatten_param_grads=True will be discarded since paramter '{}''s need_clip is False or " - "the regularizer is set".format(p.name) - ) - self._flatten_param_grads = False - return params_grads - - need_flatten_params.append(p) - need_flatten_grads.append(g) - - shape = [np.prod(p.shape) for p in need_flatten_params] - block = need_flatten_params[0].block - - flatten_param = self.helper.create_global_variable( - name='flatten_param', - persistable=True, - dtype=need_flatten_params[0].dtype, - 
shape=[np.sum(shape)], - belong_to_optimizer=True, - ) - - flatten_param.trainable = True - flatten_param.optimize_attr = need_flatten_params[0].optimize_attr - flatten_param.regularizer = need_flatten_params[0].regularizer - - flatten_grad = self.helper.create_global_variable( - name='flatten_grad', - persistable=True, - dtype=need_flatten_grads[0].dtype, - shape=[np.sum(shape)], - belong_to_optimizer=True, - ) - - with program_guard(default_main_program()): - block.append_op( - type="coalesce_tensor", - inputs={"Input": need_flatten_params}, - outputs={ - "Output": need_flatten_params, - "FusedOutput": flatten_param, - }, - attrs={ - "copy_data": True, - "use_align": True, - "align_size": self._align_size, - "dtype": need_flatten_params[0].dtype, - }, - ) - - block.append_op( - type="coalesce_tensor", - inputs={"Input": need_flatten_grads}, - outputs={ - "Output": need_flatten_grads, - "FusedOutput": flatten_grad, - }, - attrs={ - "copy_data": True, - "use_align": True, - "align_size": self._align_size, - "dtype": need_flatten_grads[0].dtype, - }, - ) - - # NOTE(zhiqiu): the initializer should be set after coalesce_tensor op, - # so the shape of flatten_param and flatten_grad will be inferred. - self.helper.set_variable_initializer( - flatten_param, - initializer=paddle.nn.initializer.Constant(0.0), - ) - self.helper.set_variable_initializer( - flatten_grad, - initializer=paddle.nn.initializer.Constant(0.0), - ) - - return [(flatten_param, flatten_grad)] - - def apply_gradients(self, params_grads): - """ - Second part of `minimize`, appending optimization operators for - given `params_grads` pairs. - - Args: - params_grads (list): list of (param, grad) pair to do optimization. - - Returns: - list: A list of operators appended to the current program. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - loss = network() - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - params_grads = optimizer.backward(loss) - # you may append operations for params_grads here - # ... - optimizer.apply_gradients(params_grads) - """ - params_grads = sorted(params_grads, key=lambda x: x[0].name) - - # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization. - if self._flatten_param_grads and self.regularization is None: - if self._grad_clip is None or isinstance( - self._grad_clip, paddle.nn.ClipGradByGlobalNorm - ): - params_grads = self.flatten_param_grads(params_grads) - - # 'optimizer(grad_clip)' or 'set_gradient_clip' - if self._grad_clip is not None: - params_grads = self._grad_clip(params_grads) - else: - params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads) - - # Add regularization if any - params_grads = self.append_regularization_ops( - params_grads, self.regularization - ) - - optimize_ops = self._create_optimization_pass(params_grads) - return optimize_ops - - def apply_optimize(self, loss, startup_program, params_grads): - """ - Second part of `minimize`, appending optimization operators for - given `params_grads` pairs. - Args: - loss (Variable): loss variable to run optimizations. - startup_program (Program): startup_program for initializing parameters - in `parameter_list`. - params_grads (list): list of (param, grad) pair to do optimization. - Returns: - list: A list of operators appended to the current program. 
- """ - if in_dygraph_mode(): - with program_guard( - framework.default_main_program(), - framework.default_startup_program(), - ): - if self._grad_clip is not None: - params_grads = self._grad_clip(params_grads) - params_grads = self.append_regularization_ops( - params_grads, self.regularization - ) - optimize_ops = self._create_optimization_pass(params_grads) - else: - program = loss.block.program - with program_guard(program, startup_program): - optimize_ops = self.apply_gradients(params_grads) - return optimize_ops - - def _get_no_grad_set(self, loss, no_grad_set=None): - no_grad_set = _get_no_grad_set_name(no_grad_set) - parameters = loss.block.program.global_block().all_parameters() - param_no_trainable = set( - [param.name for param in parameters if param.trainable is False] - ) - # If the parameter is no trainable, it should not have a gradient. - no_grad_set.update(param_no_trainable) - - return no_grad_set - - @framework.dygraph_only - def clear_gradients(self): - """ - Clear the gradients of all optimized parameters for model. - - If not, new gradient will accumulat on previous gradient. - - Returns: - None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy as np - - with fluid.dygraph.guard(): - value = np.arange(26).reshape(2, 13).astype("float32") - a = fluid.dygraph.to_variable(value) - linear = paddle.nn.Linear(13, 5) - # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, - parameters = linear.parameters()) - out = linear(a) - out.backward() - adam.minimize(out) - adam.clear_gradients() - - """ - for p in self._parameter_list: - if p.trainable: - p.clear_gradient() - - @imperative_base.no_grad - def minimize( - self, loss, startup_program=None, parameter_list=None, no_grad_set=None - ): - """ - Add operations to minimize ``loss`` by updating ``parameter_list``. - - Args: - loss (Variable): A ``Variable`` containing the value to minimize. 
- startup_program (Program, optional): :ref:`api_fluid_Program` for - initializing parameters in ``parameter_list``. The default value - is None, at this time :ref:`api_fluid_default_startup_program` will be used. - parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update - to minimize ``loss``. The default value is None, at this time all parameters - will be updated. - no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need - to be updated. The default value is None. - - Returns: - tuple: tuple (optimize_ops, params_grads), A list of operators appended - by minimize and a list of (param, grad) variable pairs, param is - ``Parameter``, grad is the gradient value corresponding to the parameter. - The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and - ``fetch_list`` before run, see details in ``Executor``. - - Examples: - Please refer to the example of current Optimizer. - """ - assert isinstance(loss, Variable), "The loss should be an Variable." 
- - parameter_list = ( - parameter_list if parameter_list else self._parameter_list - ) - - params_grads = self.backward( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set, - ) - - optimize_ops = self.apply_optimize( - loss, startup_program=startup_program, params_grads=params_grads - ) - - return optimize_ops, params_grads diff --git a/python/paddle/incubate/distributed/fleet/base.py b/python/paddle/incubate/distributed/fleet/base.py index ad00ebdb95e2b..a9eda099f7211 100644 --- a/python/paddle/incubate/distributed/fleet/base.py +++ b/python/paddle/incubate/distributed/fleet/base.py @@ -290,10 +290,8 @@ class DistributedOptimizer(metaclass=abc.ABCMeta): """ def __init__(self, optimizer, strategy=None): - if ( - not isinstance(optimizer, SGD.__bases__) - and not isinstance(optimizer, fluid.optimizer.Optimizer) - and not isinstance(optimizer, OptimizerWithMixedPrecision) + if not isinstance(optimizer, SGD.__bases__) and not isinstance( + optimizer, OptimizerWithMixedPrecision ): raise TypeError("optimizer must be an instance of Optimizer") diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py index 4b2baca637c7c..b3adb88271a75 100644 --- a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py +++ b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py @@ -28,7 +28,6 @@ Executor, ) from paddle.fluid.compiler import CompiledProgram -from paddle.fluid.optimizer import Optimizer from paddle.distributed.transpiler.distribute_transpiler import ( DistributeTranspilerConfig, diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 1efc75cb7d6f9..9f2873fe81b09 100644 --- 
a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -19,8 +19,8 @@ from paddle.fluid.executor import global_scope from paddle.fluid.framework import Variable, name_scope from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.optimizer import Optimizer from paddle.nn import ClipGradByGlobalNorm +from paddle.optimizer import Optimizer def init_communicator(block, rank, ranks, ring_id): diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py index 4e903c07c673b..1769ac62a7e02 100644 --- a/python/paddle/incubate/optimizer/pipeline.py +++ b/python/paddle/incubate/optimizer/pipeline.py @@ -27,7 +27,6 @@ default_startup_program, in_dygraph_mode, ) -from paddle.fluid.optimizer import Optimizer __all__ = [] @@ -99,7 +98,6 @@ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): if in_dygraph_mode(): raise Exception("In dygraph, don't support PipelineOptimizer.") valid_optimizers = ( - Optimizer, paddle.optimizer.Optimizer, paddle.static.amp.decorator.OptimizerWithMixedPrecision, ) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 5becbc8cec22c..c198c436b23e0 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -1114,9 +1114,6 @@ def _create_optimization_pass( end = len(target_block.ops) return target_block._slice_ops(start, end) - def _append_dgc_ops(self, param_and_grad): - pass - def backward( self, loss, @@ -1205,9 +1202,6 @@ def backward( params_grads = append_backward( loss, parameter_list, act_no_grad_set, callbacks ) - # Note: since we can't use all_reduce_op now, - # dgc_op should be the last op of one grad. 
- self._append_dgc_ops(params_grads) return params_grads def apply_gradients(self, params_grads): diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index d8247cf6561bb..88e5b16f6b348 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -70,7 +70,6 @@ from ..fluid.framework import set_ipu_shard # noqa: F401 from .nn.control_flow import Print # noqa: F401 from ..fluid.param_attr import WeightNormParamAttr # noqa: F401 -from ..fluid.optimizer import Optimizer # noqa: F401 from .nn.metric import auc # noqa: F401 diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index 9dfa612600958..3c59e76d03af4 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -37,10 +37,10 @@ def _set_multi_precision(optimizer, multi_precision): if not isinstance( optimizer, - (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer), + (paddle.optimizer.Optimizer), ): raise RuntimeError( - "Current AMP training level is O2, optimizer is expected to be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".format( + "Current AMP training level is O2, optimizer is expected to be paddle.optimizer.Optimizer, but receive {}.".format( type(optimizer) ) ) diff --git a/test/legacy_test/test_dist_transpiler.py b/test/legacy_test/test_dist_transpiler.py index b3a2f95aef78c..094bdc2f6cf68 100644 --- a/test/legacy_test/test_dist_transpiler.py +++ b/test/legacy_test/test_dist_transpiler.py @@ -1126,7 +1126,7 @@ def net_conf(self): y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) optimizer.minimize(avg_cost) def transpiler_test_impl(self): diff --git a/test/legacy_test/test_imperative_save_load_v2.py 
b/test/legacy_test/test_imperative_save_load_v2.py index 74dcbe059cd3f..11f3386df3461 100644 --- a/test/legacy_test/test_imperative_save_load_v2.py +++ b/test/legacy_test/test_imperative_save_load_v2.py @@ -22,9 +22,9 @@ from paddle import fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay from paddle.nn import Embedding from paddle.optimizer import Adam +from paddle.optimizer.lr import LRScheduler class SimpleLSTMRNN(paddle.nn.Layer): @@ -552,7 +552,7 @@ def func_testSetVariable(self): self.assertTrue(np.sum(np.abs(v.numpy())) == 0) - if isinstance(adam._learning_rate, LearningRateDecay): + if isinstance(adam._learning_rate, LRScheduler): adam._learning_rate.step_num = 0 adam.set_state_dict(self.opti_dict) @@ -673,7 +673,7 @@ def func_testSetNumpy(self): else: np_opti_dict[k] = v - if isinstance(adam._learning_rate, LearningRateDecay): + if isinstance(adam._learning_rate, LRScheduler): adam._learning_rate.step_num = 0 adam.set_state_dict(np_opti_dict) From a6a498559c9fbb74c0724cfaf5dde0af29c77918 Mon Sep 17 00:00:00 2001 From: Meteor Liu Date: Fri, 11 Aug 2023 17:34:05 +0800 Subject: [PATCH 023/246] [dy2static]implement tensor.cuda() in static graph (#56092) * [dy2static]implement tensor.cuda() in static graph * [dy2static]implement tensor.cuda() in static graph - change the patch place * [dy2static]implement tensor.cuda() in static graph - fix code-block in comment * [dy2static]implement tensor.cuda() in static graph - add ut for warning branch --- .../interpreter/interpreter_util.cc | 57 +++++++++++++- python/paddle/fluid/layers/math_op_patch.py | 57 ++++++++++++-- .../test_tensor_memcpy_on_cpu.py | 76 +++++++++++++++++++ .../test_tensor_memcpy_on_gpu.py | 76 +++++++++++++++++++ 4 files changed, 258 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc 
b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index b6e55051a45ef..7f5bcff428195 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -476,6 +476,56 @@ void ApplyDeviceGuard(const OperatorBase* op_base, } } +platform::DeviceContext* ConstructDeviceContext(const OperatorBase* op, + const platform::Place& place) { + auto& pool = platform::DeviceContextPool::Instance(); + auto* default_dev_ctx = pool.Get(place); + + // Replace the default_dev_ctx according to dst_place_type for memcpy op if + // needed. + + // NOTE(liudongxue01): + // Please apply the following logic in other Executor/Interpreter modules + // likewise. + // + + // NOTE(liudongxue01): + // The following code aims to fixup the memcpy kernel which does not handle + // some rare case. The case is: + // 1. The default place in the current execution context is not CUDAPlace, + // such as CPUPlace, + // 2. The dst_place_type is 1 which means CUDAPlace, + // 3. The expected result place is CUDAPlace but the actual result is + // CPUPlace. + // When the default place is CPUPlace, we call the tensor.cuda() would + // simply hit such case. + // + // Q: Why we do not add such logic in the memcpy kernel? + // A: (1) To fixup the memcpy kernel, we need to construct a CUDAPlace() and + // corresponding DeviceContext instance which used by the phi::Copy(...) + // api to perform the real memcpy action. (2) We should not access the + // singleton of the DeviceContextPool object in the PHI framework which + // is designed as a standalone module and all context data should passed + // into the kernel API through arguments. (3) So we have no way to + // construct a CUDAPlace() in the memcpy kernel and then pass it + // to the phi::Copy(...) api. 
+ if (!platform::is_gpu_place(place)) { + const auto& op_type = op->Type(); + if (op_type == "memcpy") { + int dst_place_type = op->Attr("dst_place_type"); + if (dst_place_type == 1) { // 1 : CUDAPlace + auto dev_ctx = pool.Get(paddle::DefaultGPUPlace()); + VLOG(4) << "Change the device context for memcpy OP: (" + << default_dev_ctx->type_info().name() << ") -> (" + << dev_ctx->type_info().name() << ")"; + return dev_ctx; + } + } + } + + return default_dev_ctx; +} + void HandleOperatorBase(const platform::Place& place, std::shared_ptr op, OpFuncNode* op_func_node, @@ -537,6 +587,8 @@ void BuildOpFuncList(const platform::Place& place, } auto unused_var_map = GetUnusedVars(block, ops); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + bool flag_log_is_printed = false; for (size_t i = 0; i < ops.size(); ++i) { auto op = ops[i].get(); @@ -655,8 +707,9 @@ void BuildOpFuncList(const platform::Place& place, runtime_scope = local_scope; } - auto& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); + // construct the device context + auto* dev_ctx = ConstructDeviceContext(op, place); + SetDeviceCommContext(op, dev_ctx); auto exec_ctx = ExecutionContext( *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context); diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 684404d79422a..e95fbacf2f66a 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -146,6 +146,7 @@ def cpu(self): In Static Graph Mode: .. 
code-block:: python + import paddle paddle.enable_static() @@ -162,7 +163,7 @@ def cpu(self): persistable=False, stop_gradient=True, ) - # 0 means cpu place, see paddle/fluid/operators/memcpy_op.h + # 0 means cpu place, see paddle/phi/kernels/memcpy_kernel.cc attrs = {'dst_place_type': 0} block.append_op( type='memcpy', @@ -173,13 +174,57 @@ def cpu(self): return output @static_only - def cuda(self): + def cuda(self, device_id=None, blocking=True): """ - Variable should not have cpu() and cuda() interface. - But this interface can greatly facilitate dy2static. - We do nothing here. + In dy2static, Variable also needs cpu() and cuda() interface. + But, the underneath operator has only forward op but not backward one. + + Args: + self(Variable): The variable itself. + device_id(int, optional): The destination GPU device id. Default: None, means current device. + We add this argument for dy2static translation, please do not use it. + blocking(bool, optional): Whether blocking or not, Default: True. + We add this argument for dy2static translation, please do not use it. + + Returns: + The tensor which has copied to cuda place. + + Examples: + In Static Graph Mode: + + .. 
code-block:: python + + import paddle + paddle.enable_static() + + x = paddle.static.data(name="x", shape=[2,2], dtype='float32') + y = x.cpu() + z = y.cuda() """ - return self + if device_id is not None: + warnings.warn("device_id is not supported, and it will be ignored.") + if blocking is not True: + warnings.warn("blocking is not supported, and it will be ignored.") + + block = current_block(self) + tmp_name = unique_tmp_name() + output = block.create_var( + name=tmp_name, + dtype=self.dtype, + shape=self.shape, + type=self.type, + persistable=False, + stop_gradient=True, + ) + # 1 means cuda place, see paddle/phi/kernels/memcpy_kernel.cc + attrs = {'dst_place_type': 1} + block.append_op( + type='memcpy', + inputs={'X': [self]}, + outputs={'Out': [output]}, + attrs=attrs, + ) + return output @static_only def place(self): diff --git a/test/dygraph_to_static/test_tensor_memcpy_on_cpu.py b/test/dygraph_to_static/test_tensor_memcpy_on_cpu.py index a9ea152f7a85b..45f3a0a555018 100644 --- a/test/dygraph_to_static/test_tensor_memcpy_on_cpu.py +++ b/test/dygraph_to_static/test_tensor_memcpy_on_cpu.py @@ -26,6 +26,20 @@ def tensor_copy_to_cpu(x): return y +@paddle.jit.to_static +def tensor_copy_to_cuda(x): + x = paddle.to_tensor(x) + y = x.cuda() + return y + + +@paddle.jit.to_static +def tensor_copy_to_cuda_with_warning(x, device_id=None, blocking=True): + x = paddle.to_tensor(x) + y = x.cuda(device_id, blocking) + return y + + class TestTensorCopyToCpuOnDefaultCPU(unittest.TestCase): def _run(self, to_static): paddle.jit.enable_to_static(to_static) @@ -46,5 +60,67 @@ def test_tensor_cpu_on_default_cpu(self): self.assertTrue(static_place.is_cpu_place()) +class TestTensorCopyToCUDAOnDefaultCPU(unittest.TestCase): + def _run(self, to_static): + paddle.jit.enable_to_static(to_static) + x1 = paddle.ones([1, 2, 3]) + x2 = tensor_copy_to_cuda(x1) + return x1.place, x2.place, x2.numpy() + + def test_tensor_cuda_on_default_cpu(self): + if not 
paddle.fluid.is_compiled_with_cuda(): + return + + """ + Note(liudongxue01): If the following asserts fail to run, + please check the workaround logic for memcpy OP + whether is still taking effect or not. + See ConstructDeviceContext() in interpreter_util.cc. + """ + paddle.fluid.framework._set_expected_place(paddle.CPUPlace()) + dygraph_x1_place, dygraph_place, dygraph_res = self._run( + to_static=False + ) + static_x1_place, static_place, static_res = self._run(to_static=True) + np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) + self.assertTrue(dygraph_x1_place.is_cpu_place()) + self.assertTrue(static_x1_place.is_cpu_place()) + self.assertTrue(dygraph_place.is_gpu_place()) + self.assertTrue(static_place.is_gpu_place()) + + +class TestTensorCopyToCUDAWithWarningOnCPU(unittest.TestCase): + def _run(self, to_static): + paddle.jit.enable_to_static(to_static) + x1 = paddle.ones([1, 2, 3]) + x2 = tensor_copy_to_cuda_with_warning(x1, device_id=1, blocking=False) + return x1.place, x2.place, x2.numpy() + + def test_with_warning_on_cpu(self): + if not paddle.fluid.is_compiled_with_cuda(): + return + + paddle.fluid.framework._set_expected_place(paddle.CPUPlace()) + + x1 = paddle.ones([1, 2, 3]) + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=1, blocking=True + ) + self.assertIn('math_op_patch.py', cm.filename) + + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=None, blocking=False + ) + self.assertIn('math_op_patch.py', cm.filename) + + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=2, blocking=False + ) + self.assertIn('math_op_patch.py', cm.filename) + + if __name__ == '__main__': unittest.main() diff --git a/test/dygraph_to_static/test_tensor_memcpy_on_gpu.py b/test/dygraph_to_static/test_tensor_memcpy_on_gpu.py index 4f7479d680bb4..de642dd4087e8 100644 
--- a/test/dygraph_to_static/test_tensor_memcpy_on_gpu.py +++ b/test/dygraph_to_static/test_tensor_memcpy_on_gpu.py @@ -27,6 +27,20 @@ def tensor_copy_to_cpu(x): return y +@paddle.jit.to_static +def tensor_copy_to_cuda(x): + x = paddle.to_tensor(x) + y = x.cuda() + return y + + +@paddle.jit.to_static +def tensor_copy_to_cuda_with_warning(x, device_id=None, blocking=True): + x = paddle.to_tensor(x) + y = x.cuda(device_id, blocking) + return y + + class TestTensorCopyToCpuOnDefaultGPU(unittest.TestCase): def _run(self, to_static): paddle.jit.enable_to_static(to_static) @@ -53,5 +67,67 @@ def test_tensor_cpu_on_default_gpu(self): self.assertTrue(static_place.is_cpu_place()) +class TestTensorCopyToCUDAOnDefaultGPU(unittest.TestCase): + def _run(self, to_static): + paddle.jit.enable_to_static(to_static) + x1 = paddle.ones([1, 2, 3]) + x2 = tensor_copy_to_cuda(x1) + return x1.place, x2.place, x2.numpy() + + def test_tensor_cuda_on_default_gpu(self): + if paddle.fluid.is_compiled_with_cuda(): + place = paddle.CUDAPlace( + int(os.environ.get('FLAGS_selected_gpus', 0)) + ) + else: + return + paddle.fluid.framework._set_expected_place(place) + dygraph_x1_place, dygraph_place, dygraph_res = self._run( + to_static=False + ) + static_x1_place, static_place, static_res = self._run(to_static=True) + np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) + self.assertTrue(dygraph_x1_place.is_gpu_place()) + self.assertTrue(static_x1_place.is_gpu_place()) + self.assertTrue(dygraph_place.is_gpu_place()) + self.assertTrue(static_place.is_gpu_place()) + + +class TestTensorCopyToCUDAWithWarningOnGPU(unittest.TestCase): + def _run(self, to_static): + paddle.jit.enable_to_static(to_static) + x1 = paddle.ones([1, 2, 3]) + x2 = tensor_copy_to_cuda_with_warning(x1, device_id=1, blocking=False) + return x1.place, x2.place, x2.numpy() + + def test_with_warning_on_gpu(self): + if paddle.fluid.is_compiled_with_cuda(): + place = paddle.CUDAPlace( + 
int(os.environ.get('FLAGS_selected_gpus', 0)) + ) + else: + return + paddle.fluid.framework._set_expected_place(place) + + x1 = paddle.ones([1, 2, 3]) + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=1, blocking=True + ) + self.assertIn('math_op_patch.py', cm.filename) + + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=None, blocking=False + ) + self.assertIn('math_op_patch.py', cm.filename) + + with self.assertWarns(UserWarning, msg="ignored") as cm: + x2 = tensor_copy_to_cuda_with_warning( + x1, device_id=2, blocking=False + ) + self.assertIn('math_op_patch.py', cm.filename) + + if __name__ == '__main__': unittest.main() From f74e32e9b83b1f47c419ce6aefcd2fd15a904d50 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Fri, 11 Aug 2023 18:10:58 +0800 Subject: [PATCH 024/246] [CustomDevice] fix amp op list (#56176) --- paddle/fluid/imperative/amp_auto_cast.cc | 16 +--------------- python/paddle/static/amp/fp16_lists.py | 5 +++++ 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index bf6c32be2f372..cb8f369daf82a 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -51,6 +51,7 @@ OpSupportedInfos(const std::string& place, {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place}, {"XPU", &platform::is_xpu_place}, + {"CUSTOM_DEVICE", &platform::is_custom_place}, }; PADDLE_ENFORCE_NE( is_target_place.count(query_place), @@ -76,12 +77,6 @@ OpSupportedInfos(const std::string& place, } } -#ifdef PADDLE_WITH_CUSTOM_DEVICE - auto is_custom_place = [&](const std::string& place) { - return is_target_place.count(place) && place != "CPU" && place != "GPU" && - place != "XPU"; - }; -#endif auto phi_kernels = phi::KernelFactory::Instance().kernels(); for (auto& kernel_pair : phi_kernels) { auto 
op_type = phi::TransToFluidOpName(kernel_pair.first); @@ -90,15 +85,6 @@ OpSupportedInfos(const std::string& place, all_ops.count(op_type) == 0) { continue; } -#ifdef PADDLE_WITH_CUSTOM_DEVICE - if (info_pair.first.backend() == phi::Backend::CUSTOM) { - if (is_custom_place(query_place)) { - VLOG(4) << op_type << " " << supported_ops.size(); - supported_ops.emplace(op_type); - } - continue; - } -#endif if (is_target_place[query_place]( phi::TransToPhiPlace(info_pair.first.backend(), false))) { VLOG(8) << op_type << " " << supported_ops.size(); diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index c3d8f20b04dd2..0860ec9bcbc1b 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -15,6 +15,7 @@ import copy import logging +import paddle from paddle.amp.amp_lists import ( EXTRA_BLACK_LIST, FP16_BLACK_LIST, @@ -94,6 +95,10 @@ def _get_sys_unsupported_list(dtype): device = None if core.is_compiled_with_xpu(): device = 'XPU' + elif isinstance( + paddle.framework._current_expected_place(), paddle.CustomPlace + ): + device = 'CUSTOM_DEVICE' else: device = 'GPU' all_ops, _, sys_unsupported_list = core.op_supported_infos(device, var_type) From e9c0fe032e81b8d427893a4783fd9b2f9a5ba364 Mon Sep 17 00:00:00 2001 From: Sanbu <96160062+sanbuphy@users.noreply.github.com> Date: Fri, 11 Aug 2023 18:13:31 +0800 Subject: [PATCH 025/246] [cmake] add boost third_party cache (#56168) * [cmake] add boost third_party cache * del boost --- cmake/cinn/external/boost.cmake | 65 --------------------------------- 1 file changed, 65 deletions(-) delete mode 100644 cmake/cinn/external/boost.cmake diff --git a/cmake/cinn/external/boost.cmake b/cmake/cinn/external/boost.cmake deleted file mode 100644 index 4d891bf8d3613..0000000000000 --- a/cmake/cinn/external/boost.cmake +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -include(ExternalProject) - -set(BOOST_PROJECT "extern_boost") -# To release PaddlePaddle as a pip package, we have to follow the -# manylinux1 standard, which features as old Linux kernels and -# compilers as possible and recommends CentOS 5. Indeed, the earliest -# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new -# version of boost, say, 1.66.0, doesn't build on CentOS 6. We -# checked that the devtools package of CentOS 6 installs boost 1.41.0. -# So we use 1.41.0 here. -set(BOOST_VER "1.41.0") -set(BOOST_TAR - "boost_1_41_0" - CACHE STRING "" FORCE) -set(BOOST_URL - "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" - CACHE STRING "" FORCE) - -message(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") - -set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) -set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") - -set(BOOST_INCLUDE_DIR - "${BOOST_DOWNLOAD_DIR}" - CACHE PATH "boost include directory." 
FORCE) -set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) -include_directories(${BOOST_INCLUDE_DIR}) - -ExternalProject_Add( - ${BOOST_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} - URL ${BOOST_URL} - DOWNLOAD_NO_PROGRESS 1 - PREFIX ${BOOST_SOURCES_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "") - -if(${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) - file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") - add_library(boost STATIC ${dummyfile}) -else() - add_library(boost INTERFACE) -endif() - -add_dependencies(boost ${BOOST_PROJECT}) -set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) From 8bdb336d92800112f9ced17a70a6faf26d3b2006 Mon Sep 17 00:00:00 2001 From: umiswing Date: Fri, 11 Aug 2023 20:18:22 +0800 Subject: [PATCH 026/246] [Sparse] Fix bugs in parameter freezing (#56154) * Add enforce for sparse_bn. * Add enforce for sp conv. --- paddle/phi/kernels/funcs/sparse/convolution.h | 25 +++--- .../kernels/sparse/batch_norm_grad_kernel.cc | 15 +++- .../kernels/sparse/gpu/conv_grad_kernel.cu | 88 +++++++++++-------- 3 files changed, 77 insertions(+), 51 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index aa7a5e996f17f..e6f3a573088b2 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -163,17 +163,20 @@ inline void SubmPreProcess(const Context& dev_ctx, DenseTensor* kernel_grad, DenseTensor* x_grad) { auto blas = phi::funcs::GetBlas(dev_ctx); - T* d_kernel_ptr = kernel_grad->data(); - blas.GEMM(CblasTrans, - CblasNoTrans, - x.non_zero_elements().dims()[1], - out_grad.dims()[1], - x.non_zero_elements().dims()[0], - static_cast(1), - x.non_zero_elements().data(), - out_grad.data(), - static_cast(0), - d_kernel_ptr + half_kernel_size * in_channels * out_channels); + const bool 
is_params_freezing = kernel_grad == nullptr; + if (!is_params_freezing) { + T* d_kernel_ptr = kernel_grad->data(); + blas.GEMM(CblasTrans, + CblasNoTrans, + x.non_zero_elements().dims()[1], + out_grad.dims()[1], + x.non_zero_elements().dims()[0], + static_cast(1), + x.non_zero_elements().data(), + out_grad.data(), + static_cast(0), + d_kernel_ptr + half_kernel_size * in_channels * out_channels); + } // call gemm: d_x = out_grad * transpose(kernel) // (n, out_channels) * (out_channels, in_channels) diff --git a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc index ff3173ec0a101..73af07da806e0 100644 --- a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc @@ -42,8 +42,19 @@ void BatchNormCooGradKernel(const Context& dev_ctx, DenseTensor* scale_grad, DenseTensor* bias_grad) { EmptyLikeCooKernel(dev_ctx, x, x_grad); - *scale_grad = phi::EmptyLike(dev_ctx, scale); - *bias_grad = phi::EmptyLike(dev_ctx, bias); + + // TODO(umiswing): add check for parameter freezing automatically + PADDLE_ENFORCE_EQ((scale_grad == nullptr && bias_grad == nullptr) || + (scale_grad != nullptr && bias_grad != nullptr), + true, + phi::errors::InvalidArgument( + "Weight and bias's stop_gradient of BatchNorm must be " + "True or False at the same time.")); + + if (scale_grad && bias_grad) { + *scale_grad = phi::EmptyLike(dev_ctx, scale); + *bias_grad = phi::EmptyLike(dev_ctx, bias); + } phi::BatchNormGradKernel(dev_ctx, x.values(), scale, diff --git a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index cd2472b453cba..d4076e5ef0b5d 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -56,6 +56,7 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, const std::string& key, SparseCooTensor* x_grad, DenseTensor* kernel_grad) { + const bool 
is_params_freezing = kernel_grad == nullptr; const auto& kernel_dims = kernel.dims(); const bool is2D = kernel_dims.size() == 4 ? true : false; const int kernel_size = @@ -79,10 +80,13 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, T* in_features_ptr = in_features.data(); T* d_x_features_ptr = d_x_features.data(); T* out_grad_features_ptr = out_grad_features.data(); - *kernel_grad = phi::EmptyLike(dev_ctx, kernel); - T* d_kernel_ptr = kernel_grad->data(); - phi::backends::gpu::GpuMemsetAsync( - d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); + T* d_kernel_ptr = nullptr; + if (!is_params_freezing) { + *kernel_grad = phi::EmptyLike(dev_ctx, kernel); + d_kernel_ptr = kernel_grad->data(); + phi::backends::gpu::GpuMemsetAsync( + d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); + } int half_kernel_size = kernel_size / 2; auto blas = phi::funcs::GetBlas(dev_ctx); @@ -184,6 +188,8 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, } #endif const T* kernel_ptr = kernel.data(); + T* tmp_d_x_ptr = nullptr; + T* tmp_d_kernel_ptr = nullptr; for (int i = 0; i < kernel_size; i++) { if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) { continue; @@ -195,8 +201,10 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels; const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels; - T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * in_channels; - T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; + tmp_d_x_ptr = d_x_features_ptr + offsets[i] * in_channels; + if (!is_params_freezing) { + tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; + } #if defined(PADDLE_WITH_CUTLASS) && SPCONV_WITH_CUTLASS if (cutlass) { @@ -204,26 +212,28 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, const IntT* scatter_x_indices = rulebook_ptr + 
offsets[i]; const IntT* gather_out_indices = rulebook_ptr + rulebook_len + offsets[i]; const size_t key = autotune::GenKey(M / features_num_range, N, K); - // call gemm: d_kernel = transpose(x) * out_grad - // (in_channels, n) * (n, out_channels) - static cutlass::device_memory::allocation workspace( - workspace_size); - GatherGemmScatterDriver<80, true, false>( - dev_ctx, - key, - x.values().data(), - out_grad.values().data(), - tmp_d_kernel_ptr, - tmp_d_kernel_ptr, - in_channels, - out_channels, - counter_ptr[i], - gather_x_indices, - gather_out_indices, - static_cast(nullptr), - static_cast(1.0), - static_cast(0.0), - &workspace); + if (!is_params_freezing) { + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + static cutlass::device_memory::allocation workspace( + workspace_size); + GatherGemmScatterDriver<80, true, false>( + dev_ctx, + key, + x.values().data(), + out_grad.values().data(), + tmp_d_kernel_ptr, + tmp_d_kernel_ptr, + in_channels, + out_channels, + counter_ptr[i], + gather_x_indices, + gather_out_indices, + static_cast(nullptr), + static_cast(1.0), + static_cast(0.0), + &workspace); + } // call gemm: d_x = out_grad * transpose(kernel) // (n, out_channels) * (out_channels, in_channels) GatherGemmScatterDriver<80, false, true>( @@ -244,18 +254,20 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, nullptr); } else { #endif - // call gemm: d_kernel = transpose(x) * out_grad - // (in_channels, n) * (n, out_channels) - blas.GEMM(CblasTrans, - CblasNoTrans, - K, - N, - M, - static_cast(1), - tmp_in_ptr, - tmp_out_grad_ptr, - static_cast(0), - tmp_d_kernel_ptr); + if (!is_params_freezing) { + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + blas.GEMM(CblasTrans, + CblasNoTrans, + K, + N, + M, + static_cast(1), + tmp_in_ptr, + tmp_out_grad_ptr, + static_cast(0), + tmp_d_kernel_ptr); + } // call gemm: d_x = out_grad * transpose(kernel) // (n, out_channels) * 
(out_channels, in_channels) From 77da237b0312a9f06804159260092ad2d628faa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 14 Aug 2023 09:28:18 +0800 Subject: [PATCH 027/246] [xdoctest] reformat example code with google style in No. 212 (#56211) --- python/paddle/distributed/communicator.py | 38 +++++++++++------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/python/paddle/distributed/communicator.py b/python/paddle/distributed/communicator.py index 9644abfab1936..613ee9c98d999 100755 --- a/python/paddle/distributed/communicator.py +++ b/python/paddle/distributed/communicator.py @@ -53,12 +53,12 @@ def __init__(self, mode, kwargs=None, envs=None): Examples: .. code-block:: python - import paddle + >>> import paddle - prog = paddle.static.Program() - comm = paddle.distributed.communicator.Communicator(prog) - comm.start() - comm.stop() + >>> prog = paddle.static.Program() + >>> comm = paddle.distributed.communicator.Communicator(prog) + >>> comm.start() + >>> comm.stop() """ # set all recv op to not_run mode @@ -136,12 +136,12 @@ def start(self): Examples: .. code-block:: python - import paddle + >>> import paddle - prog = paddle.static.Program() - comm = paddle.distributed.communicator.Communicator(prog) - comm.start() - comm.stop() + >>> prog = paddle.static.Program() + >>> comm = paddle.distributed.communicator.Communicator(prog) + >>> comm.start() + >>> comm.stop() """ if self.communicator_ is None: print('you must call init_with_ctx first to init comm before start') @@ -158,12 +158,12 @@ def stop(self): Examples: .. 
code-block:: python - import paddle + >>> import paddle - prog = paddle.static.Program() - comm = paddle.distributed.communicator.Communicator(prog) - comm.start() - comm.stop() + >>> prog = paddle.static.Program() + >>> comm = paddle.distributed.communicator.Communicator(prog) + >>> comm.start() + >>> comm.stop() """ if self.communicator_ is None: print('you must call init_with_ctx first to init comm before stop') @@ -180,11 +180,11 @@ def is_running(self): Examples: .. code-block:: python - import paddle + >>> import paddle - prog = paddle.static.Program() - comm = paddle.distributed.communicator.Communicator(prog) - comm.is_running() + >>> prog = paddle.static.Program() + >>> comm = paddle.distributed.communicator.Communicator(prog) + >>> comm.is_running() """ if self.communicator_ is None: print('you must call init_with_ctx first to init comm before stop') From 0b6b2d350ec339b68c46caeaa0281bec4a156f5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 14 Aug 2023 09:30:36 +0800 Subject: [PATCH 028/246] [xdoctest] reformat example code with google style in No. 214 (#56212) --- python/paddle/distributed/fleet/base/private_helper_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py index 20858379c2bc2..c5199eb46a747 100644 --- a/python/paddle/distributed/fleet/base/private_helper_function.py +++ b/python/paddle/distributed/fleet/base/private_helper_function.py @@ -31,7 +31,7 @@ def wait_server_ready(endpoints): Examples: .. 
code-block:: python - wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) + >>> wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ assert not isinstance(endpoints, str) while True: From e075a0dd165d2a13c07e78b41a6985190713f4c4 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 14 Aug 2023 10:01:49 +0800 Subject: [PATCH 029/246] Disable cinn test: test_resnet_prim (#56245) disable cinn test Pcard-67012 --- test/prim/model/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/prim/model/CMakeLists.txt b/test/prim/model/CMakeLists.txt index 52ca91e86a3bc..1395be8b3a243 100644 --- a/test/prim/model/CMakeLists.txt +++ b/test/prim/model/CMakeLists.txt @@ -8,7 +8,7 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() -set_tests_properties(test_resnet_prim PROPERTIES TIMEOUT 850) +#set_tests_properties(test_resnet_prim PROPERTIES TIMEOUT 850) set_tests_properties(test_bert_prim PROPERTIES TIMEOUT 500) set_tests_properties(test_prim_simplenet_cinn PROPERTIES TIMEOUT 120) @@ -18,7 +18,7 @@ if(WITH_CINN) set_tests_properties(test_bert_cinn PROPERTIES TIMEOUT 500) set_tests_properties(test_bert_prim_cinn PROPERTIES TIMEOUT 500) - set_tests_properties(test_resnet_prim PROPERTIES LABELS "RUN_TYPE=CINN") + #set_tests_properties(test_resnet_prim PROPERTIES LABELS "RUN_TYPE=CINN") #set_tests_properties(test_resnet_cinn PROPERTIES LABELS "RUN_TYPE=CINN") #set_tests_properties(test_resnet_prim_cinn PROPERTIES LABELS "RUN_TYPE=CINN") set_tests_properties(test_bert_prim PROPERTIES LABELS "RUN_TYPE=CINN") From a97b507e233f80a65193477e16d9677bc2a115ce Mon Sep 17 00:00:00 2001 From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:11:59 +0800 Subject: [PATCH 030/246] [Semi-Auto] Add reshape spmd rule (#55177) * add reshape spmd rule * add unit test for reshape spmd rule * bug fix * replace the print_info function with to_string * 
fix typo * bug fix * add handling for "0" in target shape * remove the part of computing size in dim_trans.cc --- .../auto_parallel/spmd_rules/common.h | 6 +- .../auto_parallel/spmd_rules/dim_trans.cc | 355 ++++++++++++++++++ .../auto_parallel/spmd_rules/dim_trans.h | 160 ++++++++ .../spmd_rules/reshape_spmd_rule.cc | 206 ++++++++++ .../spmd_rules/reshape_spmd_rule.h | 40 ++ .../auto_parallel/spmd_rules/rules.h | 4 + test/auto_parallel/spmd_rules/CMakeLists.txt | 1 + .../spmd_rules/test_reshape_rule.py | 219 +++++++++++ 8 files changed, 988 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.cc create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.cc create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h create mode 100644 test/auto_parallel/spmd_rules/test_reshape_rule.py diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h index f5a49ab0a9f18..26c421eb27e23 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h @@ -125,14 +125,14 @@ std::string GetBroadcastAxes(const int64_t& tenosr_ndim, TensorDistAttr ReplicatedOnMesh(const TensorDistAttr& src_dist_attr); // Check whether the given DistTensorSpec objects are valid. For each -// DistTensorSpec, the rank of its dimsmapping must be equal to the rank of its +// DistTensorSpec, the rank of its dims mapping must be equal to the rank of its // corresponding tensor shape. the parameter op_name is used for logging error // message. void VerifySpecs(const std::vector& specs, const std::string& op_name); -// Get dimsmapping for the given tensors. Return the pair of each -// tensor's einsum notation and the corresponding dimsmapping. 
+// Get dims mapping for the given tensors. Return the pair of each +// tensor's einsum notation and the corresponding dims mapping. std::vector>> GetAxesDimsMappingPair(const std::vector& tensor_axes, const std::vector& specs); diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.cc new file mode 100644 index 0000000000000..993793a7d64ec --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.cc @@ -0,0 +1,355 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h" +#include +#include +#include +#include +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +static std::vector all_dim_trans; + +DimTrans::DimTrans(Type type) : type_(type) {} + +DimTrans::~DimTrans() {} + +DimTrans::Type DimTrans::type() const { return type_; } + +void DimTrans::set_type(Type type) { type_ = type; } + +std::string DimTrans::to_string() { return std::string(""); } + +InputDim::InputDim() : DimTrans(DimTrans::Type::INPUTDIM) { + input_dim_ = -1; + all_dim_trans.emplace_back(this); +} + +InputDim::InputDim(int64_t dim) : DimTrans(DimTrans::Type::INPUTDIM) { + input_dim_ = dim; + all_dim_trans.emplace_back(this); +} + +InputDim::~InputDim() {} + +int64_t InputDim::input_dim() const { return input_dim_; } + +void InputDim::set_input_dim(int64_t dim) { input_dim_ = dim; } + +std::string InputDim::to_string() { + return ("InputDim(" + std::to_string(input_dim_) + ")"); +} + +Singleton::Singleton() : DimTrans(DimTrans::Type::SINGLETON) { + all_dim_trans.emplace_back(this); +} + +std::string Singleton::to_string() { return "Singleton()"; } + +Flatten::Flatten() : DimTrans(DimTrans::Type::FLATTEN) { + all_dim_trans.emplace_back(this); +} + +Flatten::Flatten(const std::vector& dims) + : DimTrans(DimTrans::Type::FLATTEN) { + input_dims_ = dims; + all_dim_trans.emplace_back(this); +} + +Flatten::~Flatten() { + input_dims_.assign(input_dims_.size(), nullptr); + std::vector().swap(input_dims_); +} + +const std::vector& Flatten::inputs() const { return input_dims_; } + +void Flatten::set_inputs(const std::vector& dims) { + input_dims_.assign(dims.begin(), dims.end()); +} + +std::string Flatten::to_string() { + std::string ret_str("Flatten("); + for (int64_t i = 0, n = input_dims_.size(); i < n; ++i) { + ret_str += input_dims_[i]->to_string(); + 
if (i < n - 1) { + ret_str += ","; + } + } + return ret_str + ")"; +} + +Split::Split() : DimTrans(DimTrans::Type::SPLIT) { + input_dim_trans_ = nullptr; + all_dim_trans.emplace_back(this); +} + +Split::Split(DimTrans* dim, const std::vector& shape, int64_t id) + : DimTrans(DimTrans::Type::SPLIT) { + input_dim_trans_ = dim; + split_id_ = id; + splitted_shape_.assign(shape.begin(), shape.end()); + all_dim_trans.emplace_back(this); +} + +Split::~Split() { + input_dim_trans_ = nullptr; + std::vector().swap(splitted_shape_); +} + +DimTrans* Split::input() const { return input_dim_trans_; } + +void Split::set_input(DimTrans* dim) { input_dim_trans_ = dim; } + +int64_t Split::split_id() const { return split_id_; } + +int64_t Split::local_splitted_shape_value() { + return splitted_shape_[split_id_]; +} + +std::string Split::to_string() { + std::string ret_str("Split("); + ret_str += input_dim_trans_->to_string() + ", ("; + for (int64_t i = 0, n = splitted_shape_.size(); i < n; ++i) { + ret_str += std::to_string(splitted_shape_[i]); + if (i < n - 1) { + ret_str += ","; + } + } + return ret_str + "), " + std::to_string(split_id_) + ")"; +} + +DimTrans* make_flatten(const std::vector& dims) { + DimTrans* ptr = nullptr; + if (dims.size() == 0) { + ptr = new Singleton(); + } else if (dims.size() == 1) { + ptr = dims[0]; + } else { + ptr = new Flatten(dims); + } + return ptr; +} + +DimTrans* make_split(DimTrans* dim, + const std::vector& shape, + int64_t id) { + assert(shape.size() > 0); + DimTrans* ptr = nullptr; + if (shape.size() == 1) { + assert(id == 0); + ptr = dim; + } else if (shape[id] == 1) { + ptr = new Singleton(); + } else { + // new shape that remove 1 + std::vector new_shape; + // map between from idx in shape to new_shape + std::vector idx_map(shape.size(), -1); + for (int64_t i = 0, n = shape.size(); i < n; ++i) { + if (shape[id] != 1) { + idx_map[i] = new_shape.size(); + new_shape.emplace_back(shape[i]); + } + } + ptr = new Split(dim, new_shape, idx_map[id]); 
+ } + return ptr; +} + +void CleanUp() { + for (int64_t i = 0, n = all_dim_trans.size(); i < n; i++) { + if (all_dim_trans[i]) { + delete all_dim_trans[i]; + all_dim_trans[i] = nullptr; + } + } + std::vector().swap(all_dim_trans); +} + +// Given a `dim_trans` of an output axis, get the input axis +// whose dim mapping should be propogated to it. +// If the returned input axis is none, the output axis's +// dim mapping should be set to -1 (replicated). For an axis +// that is flattened from input axes, return the leftmost +// flattened input axis. For the split transformation, +// only the leftmost split axis in output will return its input. +DimTrans* GetDimTrans(DimTrans* dim_trans, + std::vector>* shardable, + std::set* seen_dims, + const std::vector& input_shape, + const std::vector& mesh_shape, + const std::vector& input_dims_mapping, + const std::set& sharded_input_dims) { + DimTrans::Type type = dim_trans->type(); + DimTrans* ret_dim_trans = nullptr; + + if (type == DimTrans::Type::INPUTDIM) { + InputDim* inputdim = dynamic_cast(dim_trans); + int64_t dim = inputdim->input_dim(); + seen_dims->insert(dim); + + if (sharded_input_dims.count(dim) > 0) { + ret_dim_trans = dim_trans; + } + } else if (type == DimTrans::Type::FLATTEN) { + Flatten* flatten = dynamic_cast(dim_trans); + const std::vector& inputs = flatten->inputs(); + int64_t nmesh = (*shardable)[0].size(); + for (int64_t i = 1, n = inputs.size(); i < n; i++) { + DimTrans* input = inputs[i]; + if (input->type() == DimTrans::Type::INPUTDIM) { + (*shardable)[i].assign(nmesh, false); + } + + GetDimTrans(input, + shardable, + seen_dims, + input_shape, + mesh_shape, + input_dims_mapping, + sharded_input_dims); + } + + DimTrans* dim0 = inputs[0]; + if (dim0->type() == DimTrans::Type::INPUTDIM) { + InputDim* inputdim = dynamic_cast(dim0); + if (sharded_input_dims.count(inputdim->input_dim()) > 0) { + ret_dim_trans = dim0; + } + } + } else if (type == DimTrans::Type::SPLIT) { + Split* split = 
dynamic_cast(dim_trans); + DimTrans* dim = GetDimTrans(split->input(), + shardable, + seen_dims, + input_shape, + mesh_shape, + input_dims_mapping, + sharded_input_dims); + int64_t ret_size = split->local_splitted_shape_value(); + + if (split->split_id() == 0) { + if (dim != nullptr) { + PADDLE_ENFORCE_EQ(dim->type(), + DimTrans::Type::INPUTDIM, + phi::errors::InvalidArgument( + "The returned dim_trans must be INPUTDIM.")); + InputDim* inputdim = dynamic_cast(dim); + int64_t nmesh = mesh_shape.size(); + int64_t input_axis = inputdim->input_dim(); + + // Check whether the sharded dim can be sharded on + // each mesh dimension. The dimension should be + // divisible by the mesh size that it is sharded on + for (int64_t imesh = 0; imesh < nmesh; imesh++) { + (*shardable)[input_axis][imesh] = (ret_size % mesh_shape[imesh] == 0); + } + } + ret_dim_trans = dim; + } + } else if (type == DimTrans::Type::SINGLETON) { + ret_dim_trans = nullptr; + } + return ret_dim_trans; +} + +void GetUsedInputDim(DimTrans* dim_trans, std::set* seen_dims) { + if (dim_trans->type() == DimTrans::Type::INPUTDIM) { + InputDim* input = dynamic_cast(dim_trans); + seen_dims->insert(input->input_dim()); + } else if (dim_trans->type() == DimTrans::Type::FLATTEN) { + Flatten* flatten = dynamic_cast(dim_trans); + for (DimTrans* trans : flatten->inputs()) { + GetUsedInputDim(trans, seen_dims); + } + } else if (dim_trans->type() == DimTrans::Type::SPLIT) { + Split* split = dynamic_cast(dim_trans); + GetUsedInputDim(split->input(), seen_dims); + } else { + return; + } +} + +std::vector> InferFromDimTrans( + const DistTensorSpec& input_spec, const std::vector& dim_trans) { + const std::vector& input_shape = input_spec.shape(); + const std::vector& input_dims_mapping = input_spec.dims_mapping(); + const ProcessMesh& mesh = input_spec.dist_attr().process_mesh(); + const std::vector& mesh_shape = mesh.shape(); + + std::set sharded_input_dims; + for (int64_t i = 0, n = input_dims_mapping.size(); i < n; ++i) { 
+ if (input_dims_mapping[i] > -1) { + sharded_input_dims.insert(i); + } + } + int64_t ndim = input_shape.size(); + int64_t nmesh = mesh_shape.size(); + std::vector> shardable(ndim, + std::vector(nmesh, true)); + + std::set seen_input_dims; + for (DimTrans* trans : dim_trans) { + GetUsedInputDim(trans, &seen_input_dims); + } + + for (int64_t idim = 0; idim < ndim; idim++) { + bool seen = seen_input_dims.count(idim); + if (!seen) { + shardable[idim].assign(nmesh, seen); + } + } + + // get the map from sharded input dimensions to output dimensions. + std::vector dim_map_src2tgt(ndim, -1); + for (int64_t i = 0, n = dim_trans.size(); i < n; i++) { + DimTrans* dim = GetDimTrans(dim_trans[i], + &shardable, + &seen_input_dims, + input_shape, + mesh_shape, + input_dims_mapping, + sharded_input_dims); + if (dim != nullptr && dim->type() == DimTrans::Type::INPUTDIM) { + InputDim* inputdim = dynamic_cast(dim); + dim_map_src2tgt[inputdim->input_dim()] = i; + } + } + + std::vector out_dims_mapping(dim_trans.size(), -1); + std::vector new_input_dims_mapping(input_dims_mapping); + + // set output dims mapping with corresponding input dimensions. + // if one input dimension is sharded on a unshardable mesh after + // splitting, we need to make it replicated. 
+ for (int64_t i = 0; i < ndim; i++) { + int64_t mesh_dim = input_dims_mapping[i]; + if (mesh_dim > -1 && shardable[i][mesh_dim] && dim_map_src2tgt[i] > -1) { + out_dims_mapping[dim_map_src2tgt[i]] = input_dims_mapping[i]; + } else { + new_input_dims_mapping[i] = -1; + } + } + + return {new_input_dims_mapping, out_dims_mapping}; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h new file mode 100644 index 0000000000000..f196a0266d5d4 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +// This is a base class to describe how each dimension in output tensor +// is transformed from input tensor's axes. The transformation includes +// Flatten, Split, etc. A vector whose size equals to the +// output tensor's rank can be used to describe how the output shape is +// transformed from the input shape. Each element in vector +// describes the transformation of one output axis. 
For example, when +// a reshape operator reshapes a tensor from the shape of (6, 12, 48) +// to (72, 6, 8), this transfromation can be described as: +// [Flatten(Dim(0), Dim(1)), Split(Dim(2), (6,8), 0), Split(Dim(2), (6,8), 1)] +// meaning that dim0 in output is flattened from dim0 and dim1 in input, +// dim1 and dim2 in output are obtained by splitting dim2 in input, the +// splitted shape is (6, 8), dim1 referes to the first shape value in (6, 8) +// and dim2 referes to the second shape value in (6, 8). +class DimTrans { + public: + enum class Type { INPUTDIM, SINGLETON, FLATTEN, SPLIT }; + + DimTrans() = default; + + explicit DimTrans(Type type); + + virtual ~DimTrans(); + + Type type() const; + + void set_type(Type type); + + virtual std::string to_string(); + + private: + Type type_; +}; + +// InputDim indicates that the output dimention +// is obtained directed from one input dimension. +class InputDim : public DimTrans { + public: + InputDim(); + + explicit InputDim(int64_t dim); + + virtual ~InputDim(); + + int64_t input_dim() const; + + void set_input_dim(int64_t dim); + + std::string to_string() override; + + private: + int64_t input_dim_; +}; + +// Singleton indicates that the shape of the +// corresponding output dimension is 1 +class Singleton : public DimTrans { + public: + Singleton(); + std::string to_string() override; +}; + +// Flatten indicates that the output dimension +// is obtained from flattening input dimensions. +class Flatten : public DimTrans { + public: + Flatten(); + + explicit Flatten(const std::vector& dims); + + virtual ~Flatten(); + + const std::vector& inputs() const; + + void set_inputs(const std::vector& dims); + + std::string to_string() override; + + private: + std::vector input_dims_; +}; + +// Split indicates that the output dimension +// is obtained by splitting input dimension. 
+class Split : public DimTrans { + public: + Split(); + + Split(DimTrans* dim, const std::vector& shape, int64_t id); + + virtual ~Split(); + + DimTrans* input() const; + + void set_input(DimTrans* dim); + + int64_t split_id() const; + + // get the splitted shape value of the split_id_ dimension + int64_t local_splitted_shape_value(); + + std::string to_string() override; + + private: + DimTrans* input_dim_trans_; + std::vector splitted_shape_; + int64_t split_id_; +}; + +void CleanUp(); + +DimTrans* make_flatten(const std::vector& dims = {}); + +DimTrans* make_split(DimTrans* dim, + const std::vector& shape = {}, + int64_t id = 0); + +// Infer the dims mapping of the output tensor according to the transformation +// `dim_trans`. Returns the dims mapping of the input tensor (the input dims +// mapping may be changed for resharding) and output tensor. The inferring +// follows the rules: +// 1. For Singleton, i.e., the shape of this output axis is 1, its dim mapping +// is -1, indicating that the output axis is replicated. +// 2. For InputDim, i.e., the output axis is transformed directly from an input +// axis, set its dim mapping equals to the corresponding input axis. +// 3. For Flatten, i.e., the output axis is flattened from some input axes, it +// can be sharded only if the leftmost flattened axes is sharded. +// 4. For Split, i.e., the output axes is splited from a input axis, only the +// leftmost output split axis can be sharded when its shape can be divisible +// by the mesh dimension. 
+std::vector> InferFromDimTrans( + const DistTensorSpec& input_spec, const std::vector& dim_trans); + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.cc new file mode 100644 index 0000000000000..0b64a4f00ecde --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.cc @@ -0,0 +1,206 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h" +#include +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +using phi::distributed::auto_parallel::str_join; + +// The target shape in reshape may contains a -1 dimension, +// this function is used to infer what the "-1" dimension is. 
+std::vector InferTargetShape(const std::vector& shape, + int64_t len) { + int64_t infer_idx = -1; + for (int64_t i = 0, n = shape.size(); i < n; i++) { + if (shape[i] == -1) { + PADDLE_ENFORCE_EQ( + infer_idx, + -1, + phi::errors::InvalidArgument( + "There can't be more than one -1 dimension in target shape.")); + infer_idx = i; + } + } + + int64_t product = std::accumulate( + shape.begin(), shape.end(), int64_t{1}, std::multiplies()); + if (product > 0) { + PADDLE_ENFORCE_EQ( + product, + len, + phi::errors::InvalidArgument("The total size are not matched")); + return std::vector(shape); + } else { + std::vector new_shape(shape); + product = -product; + int64_t infer_size = len / product; + PADDLE_ENFORCE_EQ(len % product, + 0, + phi::errors::InvalidArgument( + "The total is not diviable by infer_size")); + new_shape[infer_idx] = infer_size; + return new_shape; + } +} + +// Compute how each dimension in target shape +// is obtained from the input dimensions +std::vector MakeReshapeDimTrans( + const std::vector& src_shape, + const std::vector& tgt_shape) { + std::vector ret; + int64_t total_elem_num_src = std::accumulate( + src_shape.begin(), src_shape.end(), int64_t{1}, std::multiplies()); + std::vector inferred_tgt_shape = + InferTargetShape(tgt_shape, total_elem_num_src); + + int64_t src_idx = 0, tgt_idx = 0; + int64_t s, t; + int64_t src_len, tgt_len; + src_len = src_shape.size(); + tgt_len = inferred_tgt_shape.size(); + while (src_idx < src_len || tgt_idx < tgt_len) { + std::vector src_dims, tgt_splitted_shape; + if (src_idx >= src_len) { + s = 1; + } else { + s = src_shape[src_idx]; + src_dims.emplace_back(src_idx); + src_idx++; + } + if (tgt_idx >= tgt_len) { + t = 1; + } else { + t = inferred_tgt_shape[tgt_idx];  // -1 is already resolved here + tgt_splitted_shape.emplace_back(t); + tgt_idx++; + } + + // deal with the singleton case + if (s == 1 && t != 1) { + // case [1] [a] + tgt_idx--; + tgt_splitted_shape.clear(); + } else if (s != 1 && t == 1) { + src_idx--; + src_dims.clear(); + } else { + while (s != t)
{ + if (s < t) { + src_dims.emplace_back(src_idx); + s *= src_shape[src_idx]; + src_idx++; + } else { + tgt_splitted_shape.emplace_back(inferred_tgt_shape[tgt_idx]); + t *= inferred_tgt_shape[tgt_idx]; + tgt_idx++; + } + } + } + + if (tgt_splitted_shape.size() > 0) { + std::vector input_dims; + for (int64_t i = 0, n = src_dims.size(); i < n; i++) { + int64_t in_dim = src_dims[i]; + if (src_shape[in_dim] > 1) { + input_dims.emplace_back(new InputDim(in_dim)); + } + } + DimTrans* flatten = make_flatten(input_dims); + + for (int64_t i = 0, n = tgt_splitted_shape.size(); i < n; i++) { + ret.emplace_back(make_split(flatten, tgt_splitted_shape, i)); + } + } + } + return ret; +} + +std::pair, std::vector> +paddle::distributed::auto_parallel::ReshapeSPMDRule::InferForward( + const std::vector& input_specs, + const paddle::framework::AttributeMap& attrs) { + // step0: Verify Input Args Based on Reshape Logic + int64_t ninputs = input_specs.size(); + PADDLE_ENFORCE_EQ( + ninputs, + 1, + phi::errors::InvalidArgument("The size of InputSpec in reshape must " + "be equal to 1, but got [%d].", + ninputs)); + VerifySpecs(input_specs, "reshape"); + + // step1: build the transformation from + // original shape to target shape + std::vector src_shape = input_specs[0].shape(); + std::vector tgt_shape = + ExtractAttr>("shape", attrs); + + // handle the '0' values in target shape, '0' indicates + // that the target shape is equal to the source shape + for (int64_t i = 0, n = tgt_shape.size(); i < n; i++) { + if (tgt_shape[i] == 0) { + tgt_shape[i] = src_shape[i]; + } + } + + std::vector trans = MakeReshapeDimTrans(src_shape, tgt_shape); + + // step2: infer the dims mapping of input (if reshard is + // needed) and output from the dimension transformation.
+ std::vector> dims_mapping_vec = + InferFromDimTrans(input_specs[0], trans); + + // step3: update the dist attributes of input + // and output with the inferred dims mapping + TensorDistAttr new_input_dist_attr(input_specs[0].dist_attr()); + new_input_dist_attr.set_dims_mapping(dims_mapping_vec[0]); + TensorDistAttr output_dist_attr(input_specs[0].dist_attr()); + output_dist_attr.set_dims_mapping(dims_mapping_vec[1]); + + VLOG(4) << "Reshape: input_shape: [" << str_join(src_shape) + << "] output_shape: [" << str_join(tgt_shape) << "]"; + VLOG(4) << "Transformation from input to output:"; + for (int64_t i = 0, n = trans.size(); i < n; i++) { + DimTrans* t = trans[i]; + VLOG(4) << "\tOutput axis " << i << ": " << t->to_string(); + } + VLOG(4) << "input_dims_mapping: [" << str_join(dims_mapping_vec[0]) + << "] output_dims_mapping: [" << str_join(dims_mapping_vec[1]) + << "]\n\n"; + + CleanUp(); + + return {{new_input_dist_attr}, {output_dist_attr}}; +} + +std::pair, std::vector> +paddle::distributed::auto_parallel::ReshapeSPMDRule::InferBackward( + const std::vector& output_specs, + const paddle::framework::AttributeMap& attrs) { + PADDLE_THROW(phi::errors::Unimplemented( + "InferBackward of ReshapeSPMDRule is NOT implemented yet.")); + + return {}; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h new file mode 100644 index 0000000000000..63b9a5a6f038a --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +class ReshapeSPMDRule : public SPMDRuleBase { + public: + std::pair, std::vector> + InferForward(const std::vector& input_specs, + const paddle::framework::AttributeMap& attrs) override; + + std::pair, std::vector> + InferBackward(const std::vector& output_specs, + const paddle::framework::AttributeMap& attrs) override; +}; +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h index 713a52770926d..cf4046950964a 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h @@ -22,6 +22,7 @@ #include "paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h" +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h" @@ -159,6 +160,9 @@ REGISTER_SPMD_RULE(split_with_num, SplitSPMDRule); // transpose rule 
REGISTER_SPMD_RULE(transpose, TransposeSPMDRule); +// reshape rule +REGISTER_SPMD_RULE(reshape, ReshapeSPMDRule); + } // namespace auto_parallel } // namespace distributed } // namespace paddle diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt index 43afd9aed75e7..c981aee6f83e1 100644 --- a/test/auto_parallel/spmd_rules/CMakeLists.txt +++ b/test/auto_parallel/spmd_rules/CMakeLists.txt @@ -10,6 +10,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_matmul_rule MODULES test_softmax_rule) py_test_modules(test_split_rule MODULES test_split_rule) py_test_modules(test_transpose_rule MODULES test_transpose_rule) + py_test_modules(test_reshape_rule MODULES test_reshape_rule) # End of unittests WITH single card WITHOUT timeout endif() diff --git a/test/auto_parallel/spmd_rules/test_reshape_rule.py b/test/auto_parallel/spmd_rules/test_reshape_rule.py new file mode 100644 index 0000000000000..8999bc3e34c38 --- /dev/null +++ b/test/auto_parallel/spmd_rules/test_reshape_rule.py @@ -0,0 +1,219 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from paddle.distributed.auto_parallel.static.completion import get_spmd_rule +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.distributed.fleet import auto + + +class TestReshapeSPMDRule(unittest.TestCase): + def setUp(self): + self.rule = get_spmd_rule("reshape") + + x_shape = [6, 12, 48, 24] + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [-1, -1] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + self.attrs = {"shape": [1, 72, 48, 4, 6]} + + def test_reshape_infer_forward(self): + # shape: [6, 12, 48, 24] --> [1, 72, 48, 4, 6] + # dims_mapping: [0, -1, 1, -1] --> [0, -1, 1, -1] [-1, 0, 1, -1, -1] + self.x_dist_tensor_spec.set_dims_mapping([0, -1, 1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, -1, 1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 0, 1, -1, -1] + ) + + # shape: [6, 12, 48, 24] --> [1, 72, 48, 4, 6] + # dims_mapping: [-1, 0, -1, 1] --> [-1, -1, -1, -1] [-1, -1, -1, -1, -1] + self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1, -1, -1] + ) + + # shape: [6, 12, 48, 24] --> [1, 72, 48, 
4, 6] + # dims_mapping: [1, -1, -1, 0] --> [1, -1, -1, 0] [-1, 1, -1, 0, -1] + self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, 0] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 1, -1, 0, -1] + ) + + # shape: [6, 12, 48, 24] --> [3, 24, 6, 8, 24] + # dims_mapping: [0, 1, -1, -1] --> [-1, -1, -1, -1] [-1, -1, -1, -1, -1] + self.attrs["shape"] = [3, 24, 6, 8, 24] + self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) + + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1, -1, -1] + ) + + # shape: [6, 12, 48, 24] --> [3, 24, 6, 8, 24] + # dims_mapping: [1, -1, -1, 0] --> [1, -1, -1, 0] [1, -1, -1, -1, 0] + self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0]) + + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, 0] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [1, -1, -1, -1, 0] + ) + + # shape: [6, 12, 48, 24] --> [3, 24, 6, -1, 24] + # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, 0, 1], [-1, -1, 0, -1, 1] + self.attrs["shape"] = [3, 24, 6, -1, 24] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1]) + + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = 
result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, 0, 1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, 0, -1, 1] + ) + + # shape: [6, 12, 48, 24] --> [1, 72, 0, 4, 6] + # dims_mapping: [1, -1, -1, 0] --> [1, -1, -1, 0] [-1, 1, -1, 0, -1] + self.attrs["shape"] = [1, 72, 0, 4, 6] + self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, 0] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 1, -1, 0, -1] + ) + + # shape: [6, 12, 48, 24] --> [6, 12, 48, 24] + # dims_mapping: [-1, -1, 0, 1] --> [-1, -1, 0, 1], [-1, -1, 0, 1] + self.attrs["shape"] = [6, 12, 48, 24] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, 0, 1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, 0, 1] + ) + + # shape: [6, 12, 48, 24] --> [72, 3, 16, 24] + # dims_mapping: [0, -1, 1, -1] --> [0, -1, 1, -1], [0, 1, -1, -1] + self.attrs["shape"] = [72, 3, 16, 24] + self.x_dist_tensor_spec.set_dims_mapping([0, -1, 1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, -1, 1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [0, 1, -1, -1] + ) + + # shape: 
[6, 12, 48, 24] --> [72, 3, 16, 24] + # dims_mapping: [1, -1, 0, -1] --> [1, -1, -1, -1], [1, -1, -1, -1] + self.attrs["shape"] = [72, 3, 16, 24] + self.x_dist_tensor_spec.set_dims_mapping([1, -1, 0, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [1, -1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [1, -1, -1, -1] + ) + + # shape: [6, 12, 48, 24] --> [3, 24, 6, -1, -1] + # raise error + self.attrs["shape"] = [3, 24, 6, -1, -1] + with self.assertRaises(BaseException): + self.rule.infer_forward([self.x_dist_tensor_spec], self.attrs) + + +if __name__ == "__main__": + unittest.main() From 1ad502df56169407b06c94bc55708d5b2c36b707 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:37:49 +0800 Subject: [PATCH 031/246] Add ShapeDialect dict & SymbolicDimOp with UT. (#56156) * Add ShapeDialect dict & SymbolicDimOp without UT. * add unittest and fix Update_xxx_Func. * change std::string to const std::string & and remove phi dependency. 
--- paddle/ir/CMakeLists.txt | 1 + paddle/ir/dialect/CMakeLists.txt | 1 + paddle/ir/dialect/shape/CMakeLists.txt | 2 + paddle/ir/dialect/shape/shape_dialect.cc | 30 +++++ paddle/ir/dialect/shape/shape_dialect.h | 42 +++++++ paddle/ir/dialect/shape/shape_op.cc | 109 ++++++++++++++++++ paddle/ir/dialect/shape/shape_op.h | 60 ++++++++++ test/cpp/ir/CMakeLists.txt | 1 + test/cpp/ir/shape_dialect/CMakeLists.txt | 1 + .../ir/shape_dialect/assist_struct_test.cc | 52 +++++++++ 10 files changed, 299 insertions(+) create mode 100644 paddle/ir/dialect/CMakeLists.txt create mode 100644 paddle/ir/dialect/shape/CMakeLists.txt create mode 100644 paddle/ir/dialect/shape/shape_dialect.cc create mode 100644 paddle/ir/dialect/shape/shape_dialect.h create mode 100644 paddle/ir/dialect/shape/shape_op.cc create mode 100644 paddle/ir/dialect/shape/shape_op.h create mode 100644 test/cpp/ir/shape_dialect/CMakeLists.txt create mode 100644 test/cpp/ir/shape_dialect/assist_struct_test.cc diff --git a/paddle/ir/CMakeLists.txt b/paddle/ir/CMakeLists.txt index 39e5ff3fda611..581bb3f8a7c58 100644 --- a/paddle/ir/CMakeLists.txt +++ b/paddle/ir/CMakeLists.txt @@ -38,6 +38,7 @@ add_subdirectory(core) add_subdirectory(pass) add_subdirectory(pattern_rewrite) add_subdirectory(builtin_transforms) +add_subdirectory(dialect) if(WIN32) if(WITH_SHARED_IR) diff --git a/paddle/ir/dialect/CMakeLists.txt b/paddle/ir/dialect/CMakeLists.txt new file mode 100644 index 0000000000000..a87b0abfb2383 --- /dev/null +++ b/paddle/ir/dialect/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(shape) diff --git a/paddle/ir/dialect/shape/CMakeLists.txt b/paddle/ir/dialect/shape/CMakeLists.txt new file mode 100644 index 0000000000000..ab8ecdd7eda28 --- /dev/null +++ b/paddle/ir/dialect/shape/CMakeLists.txt @@ -0,0 +1,2 @@ +file(GLOB SHAPE_SRCS "*.cc") +ir_library(ir_shape SRCS ${SHAPE_SRCS} DEPS ir_core) diff --git a/paddle/ir/dialect/shape/shape_dialect.cc b/paddle/ir/dialect/shape/shape_dialect.cc new file mode 100644 index 
0000000000000..a5e3adc3ac0a5 --- /dev/null +++ b/paddle/ir/dialect/shape/shape_dialect.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/ir/dialect/shape/shape_dialect.h" +#include "paddle/ir/dialect/shape/shape_op.h" + +namespace ir { +namespace dialect { +ShapeDialect::ShapeDialect(IrContext *context) + : Dialect(name(), context, TypeId::get()) { + initialize(); +} + +void ShapeDialect::initialize() { RegisterOps(); } + +} // namespace dialect +} // namespace ir + +IR_DEFINE_EXPLICIT_TYPE_ID(ir::dialect::ShapeDialect) diff --git a/paddle/ir/dialect/shape/shape_dialect.h b/paddle/ir/dialect/shape/shape_dialect.h new file mode 100644 index 0000000000000..eb47aa1345f28 --- /dev/null +++ b/paddle/ir/dialect/shape/shape_dialect.h @@ -0,0 +1,42 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/ir/core/dialect.h" + +namespace ir { +namespace dialect { +/// +/// \brief Shape Dialect: +/// +class IR_API ShapeDialect : public ir::Dialect { + public: + explicit ShapeDialect(ir::IrContext *context); + /// + /// \brief Each Dialect needs to provide a name function to return the name of + /// the Dialect. + /// + /// \return The name of this Dialect. + /// + static const char *name() { return "shape"; } + + private: + void initialize(); +}; + +} // namespace dialect +} // namespace ir + +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::dialect::ShapeDialect) diff --git a/paddle/ir/dialect/shape/shape_op.cc b/paddle/ir/dialect/shape/shape_op.cc new file mode 100644 index 0000000000000..7befe847790bf --- /dev/null +++ b/paddle/ir/dialect/shape/shape_op.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/ir/dialect/shape/shape_op.h" +#include "paddle/ir/core/builtin_attribute.h" + +namespace ir { +namespace dialect { + +const char *SymbolicDim::attributes_name[attributes_num] = {"knownNegativeOne", + "knownNonNegative", + "knownNonSizeOne", + "knownNonSizeZero", + "sym_name", + "value"}; // NOLINT + +void SymbolicDim::Build( + Builder &builder, + OperationArgument &argument, + const std::string &sym_name, + int64_t value, // TODO(zhangbo) value = ShapedType::kDynamic + bool knownNonNegative, + bool knownNegativeOne, + bool knownNonSizeOne, + bool knownNonSizeZero) { + ir::Attribute attr_sym_name = + ir::StrAttribute::get(ir::IrContext::Instance(), sym_name); + argument.AddAttribute("sym_name", attr_sym_name); + ir::Attribute attr_value = + ir::Int64Attribute::get(ir::IrContext::Instance(), value); + argument.AddAttribute("value", attr_value); + ir::Attribute attr_knownNonNegative = + ir::BoolAttribute::get(ir::IrContext::Instance(), knownNonNegative); + argument.AddAttribute("knownNonNegative", attr_knownNonNegative); + ir::Attribute attr_knownNegativeOne = + ir::BoolAttribute::get(ir::IrContext::Instance(), knownNegativeOne); + argument.AddAttribute("knownNegativeOne", attr_knownNegativeOne); + ir::Attribute attr_knownNonSizeOne = + ir::BoolAttribute::get(ir::IrContext::Instance(), knownNonSizeOne); + argument.AddAttribute("knownNonSizeOne", attr_knownNonSizeOne); + ir::Attribute attr_knownNonSizeZero = + ir::BoolAttribute::get(ir::IrContext::Instance(), knownNonSizeZero); + argument.AddAttribute("knownNonSizeZero", attr_knownNonSizeZero); +} + +std::string SymbolicDim::getSymName() { + return attribute("sym_name").AsString(); +} +int64_t SymbolicDim::getValue() { + return attribute("value").data(); +} +bool SymbolicDim::getKnownNonNegative() { + return attribute("knownNonNegative").data(); +} +bool SymbolicDim::getKnownNegativeOne() { + return attribute("knownNegativeOne").data(); +} +bool SymbolicDim::getKnownNonSizeOne() { + return 
attribute("knownNonSizeOne").data(); +} +bool SymbolicDim::getKnownNonSizeZero() { + return attribute("knownNonSizeZero").data(); +} + +void SymbolicDim::updateSymName(std::string attrValue) { + operation()->set_attribute( + "sym_name", ir::StrAttribute::get(ir::IrContext::Instance(), attrValue)); +} +void SymbolicDim::updateValue(int64_t attrValue) { + operation()->set_attribute( + "value", ir::Int64Attribute::get(ir::IrContext::Instance(), attrValue)); +} + +void SymbolicDim::updateKnownNonNegative(bool attrValue) { + operation()->set_attribute( + "knownNonNegative", + ir::BoolAttribute::get(ir::IrContext::Instance(), attrValue)); +} +void SymbolicDim::updateKnownNegativeOne(bool attrValue) { + operation()->set_attribute( + "knownNegativeOne", + ir::BoolAttribute::get(ir::IrContext::Instance(), attrValue)); +} +void SymbolicDim::updateKnownNonSizeOne(bool attrValue) { + operation()->set_attribute( + "knownNonSizeOne", + ir::BoolAttribute::get(ir::IrContext::Instance(), attrValue)); +} +void SymbolicDim::updateKnownNonSizeZero(bool attrValue) { + operation()->set_attribute( + "knownNonSizeZero", + ir::BoolAttribute::get(ir::IrContext::Instance(), attrValue)); +} + +} // namespace dialect +} // namespace ir + +IR_DEFINE_EXPLICIT_TYPE_ID(ir::dialect::SymbolicDim) diff --git a/paddle/ir/dialect/shape/shape_op.h b/paddle/ir/dialect/shape/shape_op.h new file mode 100644 index 0000000000000..48445d4e8cb75 --- /dev/null +++ b/paddle/ir/dialect/shape/shape_op.h @@ -0,0 +1,60 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/ir/core/builder.h" +#include "paddle/ir/core/op_base.h" + +namespace ir { +namespace dialect { + +class IR_API SymbolicDim : public Op { + public: + using Op::Op; + static const char *name() { return "shape.SymbolicDim"; } + + static constexpr uint32_t attributes_num = 6; + static const char *attributes_name[attributes_num]; + + static void Build( + Builder &builder, // NOLINT + OperationArgument &argument, // NOLINT + const std::string &sym_name, + int64_t value = -100000, // TODO(zhangbo): value = ShapedType::kDynamic + bool knownNonNegative = false, + bool knownNegativeOne = false, + bool knownNonSizeOne = false, + bool knownNonSizeZero = false); + std::string getSymName(); + int64_t getValue(); + bool getKnownNonNegative(); + bool getKnownNegativeOne(); + bool getKnownNonSizeOne(); + bool getKnownNonSizeZero(); + + void updateSymName(std::string attrValue); + void updateValue(int64_t attrValue); + + void updateKnownNonNegative(bool attrValue); + void updateKnownNegativeOne(bool attrValue); + void updateKnownNonSizeOne(bool attrValue); + void updateKnownNonSizeZero(bool attrValue); + void Verify() {} +}; + +} // namespace dialect +} // namespace ir + +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::dialect::SymbolicDim); diff --git a/test/cpp/ir/CMakeLists.txt b/test/cpp/ir/CMakeLists.txt index f33f84eab3711..d2117ad5c24e2 100644 --- a/test/cpp/ir/CMakeLists.txt +++ b/test/cpp/ir/CMakeLists.txt @@ -3,3 +3,4 @@ add_subdirectory(pass) add_subdirectory(pattern_rewrite) add_subdirectory(kernel_dialect) 
add_subdirectory(cinn) +add_subdirectory(shape_dialect) diff --git a/test/cpp/ir/shape_dialect/CMakeLists.txt b/test/cpp/ir/shape_dialect/CMakeLists.txt new file mode 100644 index 0000000000000..7b959a283b20e --- /dev/null +++ b/test/cpp/ir/shape_dialect/CMakeLists.txt @@ -0,0 +1 @@ +cc_test_old(assist_struct_test SRCS assist_struct_test.cc DEPS ir gtest) diff --git a/test/cpp/ir/shape_dialect/assist_struct_test.cc b/test/cpp/ir/shape_dialect/assist_struct_test.cc new file mode 100644 index 0000000000000..05ecf3734c421 --- /dev/null +++ b/test/cpp/ir/shape_dialect/assist_struct_test.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "paddle/ir/core/block.h" +#include "paddle/ir/core/builder.h" +#include "paddle/ir/core/dialect.h" +#include "paddle/ir/core/ir_context.h" +#include "paddle/ir/core/program.h" +#include "paddle/ir/dialect/shape/shape_dialect.h" +#include "paddle/ir/dialect/shape/shape_op.h" + +TEST(assist_struct_test, symbolic_dim) { + ir::IrContext *ctx = ir::IrContext::Instance(); + ir::Program program(ctx); + ctx->GetOrRegisterDialect(); + ir::Builder builder = ir::Builder(ctx, program.block()); + ir::dialect::SymbolicDim sym_dim = builder.Build( + "S0", 10, false, false, false, false); + EXPECT_EQ(sym_dim.getValue(), 10); + EXPECT_EQ(sym_dim.getSymName(), "S0"); + EXPECT_FALSE(sym_dim.getKnownNegativeOne()); + EXPECT_FALSE(sym_dim.getKnownNonSizeOne()); + EXPECT_FALSE(sym_dim.getKnownNonSizeZero()); + EXPECT_FALSE(sym_dim.getKnownNonNegative()); + + sym_dim.updateValue(20); + sym_dim.updateSymName("S1"); + sym_dim.updateKnownNegativeOne(true); + sym_dim.updateKnownNonSizeOne(true); + sym_dim.updateKnownNonSizeZero(true); + sym_dim.updateKnownNonNegative(true); + + EXPECT_EQ(sym_dim.getValue(), 20); + EXPECT_EQ(sym_dim.getSymName(), "S1"); + EXPECT_TRUE(sym_dim.getKnownNegativeOne()); + EXPECT_TRUE(sym_dim.getKnownNonSizeOne()); + EXPECT_TRUE(sym_dim.getKnownNonSizeZero()); + EXPECT_TRUE(sym_dim.getKnownNonNegative()); +} From 2ac6a7e4d0fed274dfe650cb1a3454ef42154b3e Mon Sep 17 00:00:00 2001 From: ZZK <359521840@qq.com> Date: Mon, 14 Aug 2023 10:42:56 +0800 Subject: [PATCH 032/246] Add rmsnorm residual bias add and quant (#55965) * add rmsnorm residual bias add and quant * refine python interface * add rmsnorm unittest * Add layernorm * fix layernorm unittest * refine unittest * fix example code * fix review comment --- paddle/phi/api/yaml/ops.yaml | 16 +- paddle/phi/infermeta/binary.cc | 32 - paddle/phi/infermeta/binary.h | 7 - paddle/phi/infermeta/multiary.cc | 110 ++ paddle/phi/infermeta/multiary.h | 31 + .../fusion/gpu/fused_dropout_act_bias.h 
| 1 + .../kernels/fusion/gpu/fused_dropout_helper.h | 20 +- .../fusion/gpu/fused_layernorm_kernel.cu | 1094 +++++++++++++++++ .../fusion/gpu/fused_layernorm_kernel.h | 43 + .../fused_layernorm_residual_dropout_bias.h | 41 +- .../fusion/gpu/fused_residual_dropout_bias.h | 21 +- paddle/phi/kernels/gpu/rms_norm_kernel.cu | 349 +----- paddle/phi/kernels/rms_norm_kernel.h | 69 +- .../paddle/incubate/nn/functional/__init__.py | 6 +- .../nn/functional/fused_layer_norm.py | 125 ++ .../incubate/nn/functional/fused_rms_norm.py | 114 ++ .../paddle/incubate/nn/functional/rms_norm.py | 59 - test/legacy_test/CMakeLists.txt | 2 + test/legacy_test/test_fused_layernorm_op.py | 623 ++++++++++ test/legacy_test/test_rms_norm_op.py | 400 +++++- 20 files changed, 2655 insertions(+), 508 deletions(-) create mode 100644 paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu create mode 100644 paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.h create mode 100644 python/paddle/incubate/nn/functional/fused_layer_norm.py create mode 100644 python/paddle/incubate/nn/functional/fused_rms_norm.py delete mode 100644 python/paddle/incubate/nn/functional/rms_norm.py create mode 100644 test/legacy_test/test_fused_layernorm_op.py diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index ecc29de613dea..37a5368f8ee75 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1017,6 +1017,16 @@ data_type : dtype backend : place +- op : fused_bias_residual_layernorm + args : (Tensor x, Tensor bias, Tensor residual, Tensor norm_weight, Tensor norm_bias, float epsilon, float residual_alpha, int begin_norm_axis, float quant_scale, int quant_round_type, float quant_max_bound, float quant_min_bound) + output : Tensor(out), Tensor(residual_out), Tensor(mean), Tensor(variance) + infer_meta : + func : FusedLayerNormInferMeta + kernel : + func : fused_bias_residual_layernorm + data_type : x + optional : bias, residual, norm_weight, norm_bias, residual_out + - op : 
gather args : (Tensor x, Tensor index, Scalar axis=0) output : Tensor(out) @@ -2071,14 +2081,14 @@ backward : reverse_grad - op : rms_norm - args : (Tensor x, Tensor weight, Tensor bias, float epsilon, int begin_norm_axis) - output : Tensor(out) + args : (Tensor x, Tensor bias, Tensor residual, Tensor norm_weight, Tensor norm_bias, float epsilon, int begin_norm_axis, float quant_scale, int quant_round_type, float quant_max_bound, float quant_min_bound) + output : Tensor(out), Tensor(residual_out) infer_meta : func : RmsNormInferMeta kernel : func : rms_norm data_type : x - optional : bias + optional : bias, residual, norm_bias, residual_out - op : rmsprop_ args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, Tensor master_param, float epsilon = 1.0e-10f, float decay = 0.9f, float momentum = 0.0f, bool centered = false, bool multi_precision = false) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index cfc88c5c2d50d..fee5882787ea4 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -3239,38 +3239,6 @@ void Unpool3dInferMeta(const MetaTensor& x, } } -void RmsNormInferMeta(const MetaTensor& x, - const MetaTensor& weight, - const MetaTensor& bias, - const float epsilon, - const int begin_norm_axis, - MetaTensor* out) { - std::vector x_dims_vec = phi::vectorize(x.dims()); - auto x_dims_size = x_dims_vec.size(); - - size_t normalized_dims = 1; - for (size_t i = begin_norm_axis; i < x_dims_size; ++i) { - normalized_dims *= x_dims_vec[i]; - } - - PADDLE_ENFORCE_EQ(normalized_dims, - weight.dims()[0], - phi::errors::InvalidArgument( - "The normalized size of Input(X) must equal to be" - "the size of Weight, but received" - "normalized size of Input(X) is [%d], received size" - "of Weight is [%d]", - normalized_dims, - weight.dims()[0])); - - auto out_dims = phi::make_ddim(x_dims_vec); - - out->set_dims(out_dims); - out->set_dtype(x.dtype()); - 
out->set_layout(x.layout()); - out->share_lod(x); -} - } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 48615cc22c5e6..8aa4114e74046 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -490,11 +490,4 @@ void Unpool3dInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void RmsNormInferMeta(const MetaTensor& x, - const MetaTensor& weight, - const MetaTensor& bias, - const float epsilon, - const int begin_norm_axis, - MetaTensor* out); - } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index ee84f6d169da1..9b3ad135cf733 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1506,6 +1506,68 @@ void FusedBiasActInferMeta(const MetaTensor& x, out->set_layout(x.layout()); } +void FusedLayerNormInferMeta(const MetaTensor& x, + const MetaTensor& bias, + const MetaTensor& residual, + const MetaTensor& norm_weight, + const MetaTensor& norm_bias, + const float epsilon, + const float residual_alpha, + const int begin_norm_axis, + const float quant_scale, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + MetaTensor* out, + MetaTensor* residual_out, + MetaTensor* mean, + MetaTensor* variance) { + std::vector x_dims_vec = phi::vectorize(x.dims()); + auto x_dims_size = x_dims_vec.size(); + + size_t normalized_dims = 1; + for (size_t i = begin_norm_axis; i < x_dims_size; ++i) { + normalized_dims *= x_dims_vec[i]; + } + + int32_t rows = 1; + for (int i = 0; i < begin_norm_axis; i++) { + rows *= x.dims()[i]; + } + + PADDLE_ENFORCE_EQ(normalized_dims, + norm_weight.dims()[0], + phi::errors::InvalidArgument( + "The normalized size of Input(X) must equal to be" + "the size of Weight, but received" + "normalized size of Input(X) is [%d], received size" + "of Weight is [%d]", + 
normalized_dims, + norm_weight.dims()[0])); + + auto out_dims = phi::make_ddim(x_dims_vec); + + out->set_dims(out_dims); + if (quant_scale <= 0.0f) { + out->set_dtype(x.dtype()); + } else { + out->set_dtype(phi::DataType::INT8); + } + out->set_layout(x.layout()); + + residual_out->set_dims(out_dims); + residual_out->set_dtype(x.dtype()); + residual_out->set_layout(x.layout()); + + mean->set_dims(phi::make_ddim({rows})); + mean->set_dtype(DataType::FLOAT32); + mean->set_layout(x.layout()); + + variance->set_dims(phi::make_ddim({rows})); + variance->set_dtype(DataType::FLOAT32); + variance->set_layout(x.layout()); +} + void FusedLinearParamGradAddInferMeta(const MetaTensor& x, const MetaTensor& dout, const MetaTensor& dweight, @@ -2918,6 +2980,54 @@ void PsroiPoolInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RmsNormInferMeta(const MetaTensor& x, + const MetaTensor& bias, + const MetaTensor& residual, + const MetaTensor& norm_weight, + const MetaTensor& norm_bias, + const float epsilon, + const int begin_norm_axis, + const float quant_scale, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + MetaTensor* out, + MetaTensor* residual_out) { + std::vector x_dims_vec = phi::vectorize(x.dims()); + auto x_dims_size = x_dims_vec.size(); + + size_t normalized_dims = 1; + for (size_t i = begin_norm_axis; i < x_dims_size; ++i) { + normalized_dims *= x_dims_vec[i]; + } + + PADDLE_ENFORCE_EQ(normalized_dims, + norm_weight.dims()[0], + phi::errors::InvalidArgument( + "The normalized size of Input(X) must equal to be" + "the size of Weight, but received" + "normalized size of Input(X) is [%d], received size" + "of Weight is [%d]", + normalized_dims, + norm_weight.dims()[0])); + + auto out_dims = phi::make_ddim(x_dims_vec); + + out->set_dims(out_dims); + if (quant_scale <= 0.0f) { + out->set_dtype(x.dtype()); + } else { + out->set_dtype(phi::DataType::INT8); + } + out->set_layout(x.layout()); + out->share_lod(x); + + 
residual_out->set_dims(out_dims); + residual_out->set_dtype(x.dtype()); + residual_out->set_layout(x.layout()); + residual_out->share_lod(x); +} + void RmspropInferMeta(const MetaTensor& param, const MetaTensor& mean_square, const MetaTensor& grad, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 2d24b2252a566..f1ade56c30989 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -301,6 +301,23 @@ void FusedBiasActInferMeta(const MetaTensor& x, float quant_min_bound, MetaTensor* out); +void FusedLayerNormInferMeta(const MetaTensor& x, + const MetaTensor& bias, + const MetaTensor& residual, + const MetaTensor& norm_weight, + const MetaTensor& norm_bias, + const float epsilon, + const float residual_alpha, + const int begin_norm_axis, + const float quant_scale, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + MetaTensor* out, + MetaTensor* residual_out, + MetaTensor* mean, + MetaTensor* variance); + void FusedLinearParamGradAddInferMeta(const MetaTensor& x, const MetaTensor& dout, const MetaTensor& dweight, @@ -516,6 +533,20 @@ void PsroiPoolInferMeta(const MetaTensor& x, float spatial_scale, MetaTensor* out); +void RmsNormInferMeta(const MetaTensor& x, + const MetaTensor& bias, + const MetaTensor& residual, + const MetaTensor& norm_weight, + const MetaTensor& norm_bias, + const float epsilon, + const int begin_norm_axis, + const float quant_scale, + const int quant_round_type, + const float quant_max_bound, + const float quant_min_bound, + MetaTensor* out, + MetaTensor* residual_out); + void RmspropInferMeta(const MetaTensor& param, const MetaTensor& mean_square, const MetaTensor& grad, diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h index 8868a4435b431..e5f5c9ba50ba4 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h +++ 
b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h @@ -124,6 +124,7 @@ __global__ void FusedDropoutActBias( nullptr, nullptr, act, + 1.0, /*Since Dropout Act bias do not use residual alpha, we set 1.0*/ quant_last_in_scale, dequant_out_scale_data, quant_next_in_scale, diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h index c73a35d2265ce..681e6cdac57e8 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -123,10 +123,12 @@ class FusedDropoutHelper { FusedDropoutHelper(const phi::GPUContext& ctx, const int rows, const int cols, - const DropoutParam& dropout_param) { + const DropoutParam& dropout_param, + const float residual_alpha = 1.0) { rows_ = rows; cols_ = cols; dropout_param_ = dropout_param; + residual_alpha_ = residual_alpha; } // out = residual + dropout( src + bias ) @@ -156,7 +158,8 @@ class FusedDropoutHelper { ctx, quant_last_in_scale, dequant_out_scale_data, - quant_next_in_scale); + quant_next_in_scale, + residual_alpha_); } void ResidualDropoutBiasGrad(const phi::GPUContext& ctx, @@ -336,6 +339,7 @@ class FusedDropoutHelper { int rows_; int cols_; DropoutParam dropout_param_; + float residual_alpha_; }; template ; this->rows_ = rows; this->cols_ = cols; epsilon_ = epsilon; + this->residual_alpha_ = residual_alpha; } FusedDropoutLayerNormHelper(const phi::GPUContext& ctx, const int rows, const int cols, const DropoutParam& dropout_param, - const float epsilon) + const float epsilon, + const float residual_alpha = 1.0) : FusedDropoutHelper( - ctx, rows, cols, dropout_param) { + ctx, rows, cols, dropout_param, residual_alpha) { using U = phi::funcs::LayerNormParamType; epsilon_ = epsilon; } @@ -476,7 +483,8 @@ class FusedDropoutLayerNormHelper quant_next_in_scale, quant_round_type, quant_max_bound, - quant_min_bound); + quant_min_bound, + this->residual_alpha_); } template , diff --git 
a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu new file mode 100644 index 0000000000000..138e5583a3a0a --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu @@ -0,0 +1,1094 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// Original OneFlow copyright notice: + +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// https://github.com/Oneflow-Inc/oneflow/blob/master/oneflow/core/cuda/layer_norm.cuh +// The following code modified from OneFlow's implementation, and change to use +// single Pass algorithm. Support Int8 quant, dequant Load/Store implementation. 
+ +#include "paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.h" +#include +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#ifndef PADDLE_WITH_HIP +#include +#include "paddle/phi/kernels/fusion/gpu/attention_layer.norm.h" +#include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h" +#endif + +namespace phi { + +namespace fusion { + +namespace { + +#ifndef PADDLE_WITH_HIP + +constexpr int kWarpSize = 32; + +template +struct SumOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return a + b; + } +}; + +template +struct MaxOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return max(a, b); + } +}; + +template