Skip to content

Commit

Permalink
Merge commit '33b8b3d22034423455a493712955e419aac7b19b' into develop
Browse files Browse the repository at this point in the history
* commit '33b8b3d22034423455a493712955e419aac7b19b': (251 commits)
  Remove redundant commands in build.sh and build_doc.sh
  Add dependencies
  Move v2/api/fluid to fluid/api and Adjust doc build commands
  Plain LRN op throws an exception when is_test is set in backward pass
  fix compiler error of profiler_test in ONLY_CPU mode
  fix server shutdown
  Translation for Model Configuration (PaddlePaddle#9513)
  Fix data transform when inplace (PaddlePaddle#9450)
  refine parallel
  add FAQ (PaddlePaddle#9494)
  Fix dist error with lr decay layer (PaddlePaddle#9489)
  add prefetch_op (PaddlePaddle#9495)
  Fix some errors (PaddlePaddle#9403)
  hookup WITH_FLUID_ONLY in TeamCity build.sh (PaddlePaddle#9509)
  Fix the order of reads and write from buffered channel  (PaddlePaddle#9423)
  change WITH_FLUID to WITH_FLUID_ONLY (PaddlePaddle#9427)
  fix block num
  Revert "make append activation in place by default (PaddlePaddle#9417)"
  Speed/sequence op1 (PaddlePaddle#9217)
  fix a compile error
  ...
  • Loading branch information
mikeseven committed Mar 30, 2018
2 parents 42a0798 + 33b8b3d commit 9481af7
Show file tree
Hide file tree
Showing 250 changed files with 7,301 additions and 1,007 deletions.
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option.
option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
Expand Down Expand Up @@ -109,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON)
endif()

if (WITH_C_API)
set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
endif()

if(MOBILE_INFERENCE)
Expand Down Expand Up @@ -147,6 +146,7 @@ include(external/cares)
include(external/grpc)
include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)

include(cudnn) # set cudnn libraries, must before configure
include(cupti)
Expand Down
4 changes: 2 additions & 2 deletions cmake/external/mklml.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
Expand All @@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${MKLML_VER}\n"
"install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
" DESTINATION ${MKLML_DST_DIR})\n")

ExternalProject_Add(
Expand Down
30 changes: 30 additions & 0 deletions cmake/external/threadpool.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
INCLUDE(ExternalProject)

SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})

ExternalProject_Add(
extern_threadpool
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/progschj/ThreadPool.git"
GIT_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040
PREFIX ${THREADPOOL_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)

if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
add_library(simple_threadpool STATIC ${dummyfile})
else()
add_library(simple_threadpool INTERFACE)
endif()

add_dependencies(simple_threadpool extern_threadpool)

LIST(APPEND external_project_dependencies simple_threadpool)
6 changes: 6 additions & 0 deletions cmake/generic.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,9 @@ function(grpc_library TARGET_NAME)
get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)

#FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but
# somehow it didn't. line 602 to 604 is to patching this. Leaving this here
# for now to enable dist CI.
protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
Expand All @@ -597,6 +600,9 @@ function(grpc_library TARGET_NAME)
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
--plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
"${ABS_PROTO}"
DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)

# FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
Expand Down
6 changes: 6 additions & 0 deletions cmake/inference_lib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND)
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
)
elseif (WITH_MKLML)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
)
endif()

# paddle fluid module
Expand Down
7 changes: 7 additions & 0 deletions doc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,9 @@
add_custom_target(paddle_apis ALL
DEPENDS paddle_v2_apis paddle_fluid_apis)

add_custom_target(paddle_docs ALL
DEPENDS paddle_v2_docs paddle_v2_docs_cn
paddle_fluid_docs paddle_fluid_docs_cn)

add_subdirectory(v2)
add_subdirectory(fluid)
83 changes: 83 additions & 0 deletions doc/design/images/parallel_executor_overview.dot
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
digraph G {
subgraph cluster_init {
label="Initialization"
startup_program [label="startup", shape=box]
node_w_g0 [label="W\nGPU0"]
startup_program -> node_w_g0 [label="Initialize"]
node_w_g1 [label="W\nGPU1"]
node_w_g0 -> node_w_g1 [label="broadcast"]
}

subgraph cluster_train {
label="forward_backward"

subgraph cluster_gpu0 {
label="GPU0"
fc_0 [label="fc\nGPU0", shape=box]
hidden_0 [label="hidden\nGPU0"]
node_w_g0 -> fc_0
fc_0 -> hidden_0
loss0 [label="loss\nGPU0"]
hidden_0 -> loss0 [label="many ops omitted"]
scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
loss_g0 [label="loss_grad\nGPU0"]
scale_loss_0->loss_g0

fc_g_0 [label="w_grad\nGPU0", shape=box]
loss0 -> fc_g_0
loss_g0 -> fc_g_0
hidden_0 -> fc_g_0
}

subgraph cluster_gpu1 {
label="GPU1"
fc_1 [label="fc\nGPU1", shape=box]
hidden_1 [label="hidden\nGPU1"]
node_w_g1 -> fc_1
fc_1 -> hidden_1
loss1 [label="loss\nGPU1"]
hidden_1 -> loss1 [label="many ops omitted"]
scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
loss_g1 [label="loss_grad\nGPU1"]
scale_loss_1->loss_g1

fc_g_1 [label="w_grad\nGPU1", shape=box]
loss1 -> fc_g_1
loss_g1 -> fc_g_1
hidden_1 -> fc_g_1
}
}

all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
fc_g_0 -> all_reduce_w
fc_g_1 -> all_reduce_w

fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
all_reduce_w -> fc_g_0_merged
all_reduce_w -> fc_g_1_merged

subgraph cluster_optimization {
label="Optimization"
subgraph cluster_opt_gpu0 {
label="GPU0"
sgd_0 [label="SGD Op\nGPU0", shape=box]

fc_g_0_merged -> sgd_0
node_w_g0 -> sgd_0
optimized_w_0 [label="Optimized W\nGPU0"]
sgd_0 -> optimized_w_0
}
subgraph cluster_opt_gpu1 {
label="GPU1"
sgd_1 [label="SGD Op\nGPU1", shape=box]

fc_g_1_merged -> sgd_1
node_w_g1 -> sgd_1
optimized_w_1 [label="Optimized W\nGPU0"]
sgd_1 -> optimized_w_1
}
}


}
Binary file added doc/design/images/parallel_executor_overview.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
104 changes: 104 additions & 0 deletions doc/design/parallel_executor.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# ParallelExecutor

## Background

Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter(i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched in Python side.

The executor is a very naive interpreter. It runs operators one by one. We can use `Parallel.Do` to support data parallelism, however, lacking device information in `ProgramDesc`; it is not possible to optimize the performance of `Parallel.Do`.

We want a `ProgramDesc` can be run on different nodes. It is better not to contain device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full usage of Multi-GPUs.

ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs.


## Overview of MultiGPUs logic

The ParallelExecutor takes the startup program and main program as inputs. The parameters will be initialised on `GPU0` by startup program and will broadcast to multi-GPUs. The main program will be duplicated into multi-GPUs. The gradient will be merged during each iteration, and each device will optimize parameters independently. Since the gradients on each device will be merged before parameter optimization, the parameters will be the same on each device and it does not need to be broadcast the parameters.

![alt](images/parallel_executor_overview.png)

There are several optimizations for this logic.

1. We use an alternate representation in ParallelExecutor. It because the device information is critical for performance optimization.
2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready.
* GPU is a high-performance device; only one CPU thread cannot fulfil one GPU. So there is a thread pool to execute operators.
* Out-of-order also helps transpilers to generate `ProgramDesc`. It is no need to concern about the best order of performance when implementing a transpiler.
3. The streams of computation, merge gradients and fetch data are different.

The performance of `ResNeXt152` on `TitanX` which `batch_size=12` is shown below.

| Number of GPUs | 1 | 2 | 3 | 4|
| --- | --- | --- | --- | --- |
| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |


## Static single assignment Graph

[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form)(`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we uses an `SSA` graph as an intermedia representation of `ProgramDesc`.

The `Program` is a directed acyclic graph, since a variable can be assigned multiple times. We enforce a variable will be assigned once, by adding version number to varaibles. We parsing the `Program` into a `SSA` graph. Also, ProgramExecutor duplicate `Program` into multi-devices. We also add a device number to varaibles and insert `NCCLAllReduce` into Graph.

The data structure of `SSA` graph is:

```c++
struct VarHandleBase {
OpHandleBase* generated_op_;
vector<OpHandleBase*> pending_ops_;

string name;
Place place;
size_t version;
};

struct OpHandleBase {
vector<OpHandleBase*> inputs_;
vector<OpHnadleBase*> outputs_;
};

struct SSAGraph {
// vars on each devices.
// * the vars in each map in vector is on different device.
// * the map is mapping a variable name to variable handles
// with different versions
vector<std::unordered_map<string, vector<VarHandleBase>>> vars_;

// All ops
vector<OpHandleBase> ops_;
};
```
The variable handles are the wrapper of `Variables`. The operator handles are the wrapper of `OperatorBase`. Some `OpHandle` is not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts.
When the `ProgramDesc` converted into an `SSA` Graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem is also need to be taken care. The dummy variables, which represent the dependency between operators, will be manually inserted into SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.
## Execute SSA Graph
The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is
1. Maintaining a map of an operator and its needed input number.
2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators.
3. If there is an operator which needed input number is decreased to zero, just run this operator.
4. After run this operator, just mark the variables are generated and repeat step 2 until all variables are generated.
Running an operator can be asynchronized. There is a thread pool to execute an `SSA` graph.
## Synchronize GPU Kernels
The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm:
1. `OpHandle` will record `DeviceContext` that it is used.
2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.
The `wait` are implemented by two strategies:
1. Invoke `DeviceContext->Wait()`, It will wait all operators on this device contexts complete.
2. Uses `cudaStreamWaitEvent` to sending a event to the stream. It is a non-blocking call. The wait operators will be executed in GPU.
Generally, the `cudaStreamWaitEvent` will have a better perforamnce. However, `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed in runtime.
## What's next?
* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done.
* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU compuatation faster, too.
* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision.
* Combine multi-Nodes implementation. By the benifit of out-of-order, sending and recving operator can be an blocking operator, and the transpiler does not need to concern about the best position of operator.
6 changes: 6 additions & 0 deletions doc/fluid/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})

add_dependencies(paddle_fluid_docs gen_proto_py)

# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")

Expand All @@ -47,3 +49,7 @@ sphinx_add_target(paddle_fluid_docs_cn
${SPHINX_CACHE_DIR_CN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})

add_dependencies(paddle_fluid_docs_cn gen_proto_py)

add_subdirectory(api)
22 changes: 22 additions & 0 deletions doc/fluid/api/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")

# Sphinx cache with pickled ReST documents
set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")

# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@ONLY)

sphinx_add_target(paddle_fluid_apis
html
${BINARY_BUILD_DIR_EN}
${SPHINX_CACHE_DIR_EN}
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})

add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions doc/fluid/build_and_install/build_from_source_cn.rst
1 change: 1 addition & 0 deletions doc/fluid/build_and_install/build_from_source_en.rst
1 change: 1 addition & 0 deletions doc/fluid/build_and_install/docker_install_cn.rst
1 change: 1 addition & 0 deletions doc/fluid/build_and_install/docker_install_en.rst
2 changes: 0 additions & 2 deletions doc/fluid/build_and_install/index_cn.rst

This file was deleted.

1 change: 1 addition & 0 deletions doc/fluid/build_and_install/index_cn.rst
2 changes: 0 additions & 2 deletions doc/fluid/build_and_install/index_en.rst

This file was deleted.

1 change: 1 addition & 0 deletions doc/fluid/build_and_install/index_en.rst
1 change: 1 addition & 0 deletions doc/fluid/build_and_install/pip_install_cn.rst
1 change: 1 addition & 0 deletions doc/fluid/build_and_install/pip_install_en.rst
7 changes: 7 additions & 0 deletions doc/fluid/design/algorithm/index_cn.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
梯度更新算法
------------

.. toctree::
:maxdepth: 1

parameter_average.md
7 changes: 7 additions & 0 deletions doc/fluid/design/algorithm/index_en.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Gradient Update Algorithm
--------------------------------------

.. toctree::
:maxdepth: 1

parameter_average.md
Loading

0 comments on commit 9481af7

Please sign in to comment.