From d8984e836e1f04142912e366ca73e51a6b1c6353 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Tue, 18 Sep 2018 17:07:01 -0700 Subject: [PATCH] [MXNET-910] Multithreading inference. (#12456) * add multi-threading inference. * demo multi-threading inference. * add new capi. * make naive engine thread local. * create an executor inside each thread. * fix format. * fix format. * fix format. * Revert "make naive engine thread local." This reverts commit b9d844e46d33f11ad409feb099194e183da9bbda. * Update CAPI. * add doc. * fix lint. * update example. * update. * fix. * add check. * fix. * fix example. * update name. * update README. --- .../image-classification/predict-cpp/Makefile | 2 +- .../predict-cpp/README.md | 6 +- .../image-classification-predict.cc | 131 +++++++----- include/mxnet/c_predict_api.h | 33 +++ src/c_api/c_predict_api.cc | 197 +++++++++++++----- 5 files changed, 262 insertions(+), 107 deletions(-) diff --git a/example/image-classification/predict-cpp/Makefile b/example/image-classification/predict-cpp/Makefile index e0c0bc657297..5c084119b966 100644 --- a/example/image-classification/predict-cpp/Makefile +++ b/example/image-classification/predict-cpp/Makefile @@ -15,7 +15,7 @@ LDFLAGS+=`pkg-config --libs opencv` export MXNET_ROOT=`pwd`/../../.. CFLAGS+=-Wall -I$(MXNET_ROOT)/include -LDFLAGS+=$(MXNET_ROOT)/lib/libmxnet.so +LDFLAGS+=$(MXNET_ROOT)/lib/libmxnet.so -lpthread image-classification-predict: image-classification-predict.o g++ -O3 -o image-classification-predict image-classification-predict.o $(LDFLAGS) diff --git a/example/image-classification/predict-cpp/README.md b/example/image-classification/predict-cpp/README.md index 69f63d706006..2a5e350e4afb 100644 --- a/example/image-classification/predict-cpp/README.md +++ b/example/image-classification/predict-cpp/README.md @@ -1,5 +1,5 @@ # Image Classification Example Using the C Predict API -This is a simple predictor which shows how to use the MXNet C Predict API for image classification with a pre-trained ImageNet model. +This is a simple predictor which shows how to use the MXNet C Predict API for image classification with a pre-trained ImageNet model in a single thread and multiple threads. ## Prerequisites @@ -45,10 +45,10 @@ Run the example by passing it an image that you want to classify. If you don't h wget https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Honeycrisp.jpg/1920px-Honeycrisp.jpg ``` -Then run the `image-classification-predict` program, passing the image as the argument. +Then run the `image-classification-predict` program, passing the image as the first argument and the number of threads as the second parameter. ```bash - ./image-classification-predict 1920px-Honeycrisp.jpg + ./image-classification-predict 1920px-Honeycrisp.jpg 1 ``` ## Tips diff --git a/example/image-classification/predict-cpp/image-classification-predict.cc b/example/image-classification/predict-cpp/image-classification-predict.cc index 186107bd530f..2a605b8b2674 100644 --- a/example/image-classification/predict-cpp/image-classification-predict.cc +++ b/example/image-classification/predict-cpp/image-classification-predict.cc @@ -37,6 +37,7 @@ #include #include #include +#include #include #include // Path for c_predict_api @@ -179,14 +180,56 @@ void PrintOutputResult(const std::vector& data, const std::vector &image_data, + NDListHandle nd_hnd, const std::string &synset_file, int i) { + auto image_size = image_data.size(); + // Set Input Image + MXPredSetInput(pred_hnd, "data", image_data.data(), static_cast(image_size)); + + // Do Predict Forward + MXPredForward(pred_hnd); + + mx_uint output_index = 0; + + mx_uint* shape = nullptr; + mx_uint shape_len; + + // Get Output Result + MXPredGetOutputShape(pred_hnd, output_index, &shape, &shape_len); + + std::size_t size = 1; + for (mx_uint i = 0; i < shape_len; ++i) { size *= shape[i]; } + + std::vector data(size); + + MXPredGetOutput(pred_hnd, output_index, &(data[0]), static_cast(size)); + + // Release NDList + if (nd_hnd) { + MXNDListFree(nd_hnd); + } + + // Release Predictor + MXPredFree(pred_hnd); + + // Synset path for your model, you have to modify it + auto synset = LoadSynset(synset_file); + + // Print Output Data + PrintOutputResult(data, synset); +} + int main(int argc, char* argv[]) { if (argc < 2) { std::cout << "No test image here." << std::endl - << "Usage: ./image-classification-predict apple.jpg" << std::endl; + << "Usage: ./image-classification-predict apple.jpg [num_threads]" << std::endl; return EXIT_FAILURE; } std::string test_file(argv[1]); + int num_threads = 1; + if (argc == 3) + num_threads = std::atoi(argv[2]); // Models path for your model, you have to modify it std::string json_file = "model/Inception/Inception-BN-symbol.json"; @@ -214,25 +257,11 @@ int main(int argc, char* argv[]) { static_cast(channels), static_cast(height), static_cast(width) }; - PredictorHandle pred_hnd = nullptr; if (json_data.GetLength() == 0 || param_data.GetLength() == 0) { return EXIT_FAILURE; } - // Create Predictor - MXPredCreate(static_cast(json_data.GetBuffer()), - static_cast(param_data.GetBuffer()), - static_cast(param_data.GetLength()), - dev_type, - dev_id, - num_input_nodes, - input_keys, - input_shape_indptr, - input_shape_data, - &pred_hnd); - assert(pred_hnd); - auto image_size = static_cast(width * height * channels); // Read Mean Data @@ -259,40 +288,46 @@ int main(int argc, char* argv[]) { GetImageFile(test_file, image_data.data(), channels, cv::Size(width, height), nd_data); - // Set Input Image - MXPredSetInput(pred_hnd, "data", image_data.data(), static_cast(image_size)); - - // Do Predict Forward - MXPredForward(pred_hnd); - - mx_uint output_index = 0; - - mx_uint* shape = nullptr; - mx_uint shape_len; - - // Get Output Result - MXPredGetOutputShape(pred_hnd, output_index, &shape, &shape_len); - - std::size_t size = 1; - for (mx_uint i = 0; i < shape_len; ++i) { size *= shape[i]; } - - std::vector data(size); - - MXPredGetOutput(pred_hnd, output_index, &(data[0]), static_cast(size)); - - // Release NDList - if (nd_hnd) { - MXNDListFree(nd_hnd); + if (num_threads == 1) { + // Create Predictor + PredictorHandle pred_hnd; + MXPredCreate(static_cast(json_data.GetBuffer()), + static_cast(param_data.GetBuffer()), + static_cast(param_data.GetLength()), + dev_type, + dev_id, + num_input_nodes, + input_keys, + input_shape_indptr, + input_shape_data, + &pred_hnd); + assert(pred_hnd); + + predict(pred_hnd, image_data, nd_hnd, synset_file, 0); + } else { + // Create Predictor + std::vector pred_hnds(num_threads, nullptr); + MXPredCreateMultiThread(static_cast(json_data.GetBuffer()), + static_cast(param_data.GetBuffer()), + static_cast(param_data.GetLength()), + dev_type, + dev_id, + num_input_nodes, + input_keys, + input_shape_indptr, + input_shape_data, + pred_hnds.size(), + pred_hnds.data()); + for (auto hnd : pred_hnds) + assert(hnd); + + std::vector threads; + for (int i = 0; i < num_threads; i++) + threads.emplace_back(predict, pred_hnds[i], image_data, nd_hnd, synset_file, i); + for (int i = 0; i < num_threads; i++) + threads[i].join(); } - - // Release Predictor - MXPredFree(pred_hnd); - - // Synset path for your model, you have to modify it - auto synset = LoadSynset(synset_file); - - // Print Output Data - PrintOutputResult(data, synset); + printf("run successfully\n"); return EXIT_SUCCESS; } diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h index cc1c2966bd75..16addff6345f 100644 --- a/include/mxnet/c_predict_api.h +++ b/include/mxnet/c_predict_api.h @@ -119,6 +119,39 @@ MXNET_DLL int MXPredCreatePartialOut(const char* symbol_json_str, mx_uint num_output_nodes, const char** output_keys, PredictorHandle* out); + +/*! + * \brief create predictors for multiple threads. One predictor for a thread. + * \param symbol_json_str The JSON string of the symbol. + * \param param_bytes The in-memory raw bytes of parameter ndarray file. + * \param param_size The size of parameter ndarray file. + * \param dev_type The device type, 1: cpu, 2:gpu + * \param dev_id The device id of the predictor. + * \param num_input_nodes Number of input nodes to the net, + * For feedforward net, this is 1. + * \param input_keys The name of input argument. + * For feedforward net, this is {"data"} + * \param input_shape_indptr Index pointer of shapes of each input node. + * The length of this array = num_input_nodes + 1. + * For feedforward net that takes 4 dimensional input, this is {0, 4}. + * \param input_shape_data A flatted data of shapes of each input node. + * For feedforward net that takes 4 dimensional input, this is the shape data. + * \param num_threads The number of threads that we'll run the predictors. + * \param out An array of created predictor handles. The array has to be large + * enough to keep `num_threads` predictors. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXPredCreateMultiThread(const char* symbol_json_str, + const void* param_bytes, + int param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + int num_threads, + PredictorHandle* out); + /*! * \brief Change the input shape of an existing predictor. * \param num_input_nodes Number of input nodes to the net, diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index d84a89ab2133..c2576cc8e0af 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -67,47 +67,39 @@ struct MXAPINDList { std::vector data; }; -int MXPredCreate(const char* symbol_json_str, - const void* param_bytes, - int param_size, - int dev_type, int dev_id, - mx_uint num_input_nodes, - const char** input_keys, - const mx_uint* input_shape_indptr, - const mx_uint* input_shape_data, - PredictorHandle* out) { - return MXPredCreatePartialOut( - symbol_json_str, - param_bytes, - param_size, - dev_type, - dev_id, - num_input_nodes, - input_keys, - input_shape_indptr, - input_shape_data, - 0, - NULL, - out); +inline void _CreateExecutor(PredictorHandle pred_hnd) { + MXAPIPredictor *pred = static_cast(pred_hnd); + if (pred->exec == nullptr) { + auto sym = pred->sym; + auto ctx = pred->ctx; + auto key2arg = pred->key2arg; + auto arg_arrays = pred->arg_arrays; + auto aux_arrays = pred->aux_arrays; + std::map ctx_map; + std::vector grad_store(arg_arrays.size()); + std::vector grad_req(arg_arrays.size(), kNullOp); + pred->exec.reset(Executor::Bind(sym, ctx, ctx_map, arg_arrays, + grad_store, grad_req, aux_arrays)); + pred->out_arrays = pred->exec->outputs(); + } } -namespace mxnet { -} // namespace mxnet - -int MXPredCreatePartialOut(const char* symbol_json_str, - const void* param_bytes, - int param_size, - int dev_type, int dev_id, - mx_uint num_input_nodes, - const char** input_keys, - const mx_uint* input_shape_indptr, - const mx_uint* input_shape_data, - mx_uint num_output_nodes, - const char** output_keys, - PredictorHandle* out) { +int _CreatePartialOut(const char* symbol_json_str, + const void* param_bytes, + int param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + mx_uint num_output_nodes, + const char** output_keys, + // This is used for paralle inference. + int num_threads, + bool lazy, + PredictorHandle* out) { using nnvm::Symbol; - MXAPIPredictor* ret = new MXAPIPredictor(); API_BEGIN(); Symbol sym; // make sure symbols are registered @@ -140,7 +132,6 @@ int MXPredCreatePartialOut(const char* symbol_json_str, } sym = nnvm::Symbol::CreateGroup(out_syms); } - ret->sym = sym; // load the parameters std::unordered_map arg_params, aux_params; @@ -188,9 +179,10 @@ int MXPredCreatePartialOut(const char* symbol_json_str, std::vector out_shapes(sym.ListOutputNames().size()); std::vector aux_shapes(aux_names.size()); std::vector arg_shapes; + std::unordered_map key2arg; for (size_t i = 0; i < arg_names.size(); ++i) { std::string key = arg_names[i]; - ret->key2arg[key] = i; + key2arg[key] = i; } try { @@ -215,7 +207,6 @@ int MXPredCreatePartialOut(const char* symbol_json_str, } Context ctx = Context::Create(static_cast(dev_type), dev_id); - ret->ctx = ctx; std::vector arg_arrays, aux_arrays; for (size_t i = 0; i < arg_shapes.size(); ++i) { @@ -232,24 +223,117 @@ int MXPredCreatePartialOut(const char* symbol_json_str, } aux_arrays.push_back(nd); } - ret->arg_arrays = arg_arrays; - ret->aux_arrays = aux_arrays; // bind - { - std::map ctx_map; - std::vector grad_store(arg_arrays.size()); - std::vector grad_req(arg_arrays.size(), kNullOp); - - - ret->exec.reset(Executor::Bind(sym, ctx, ctx_map, - arg_arrays, - grad_store, grad_req, - aux_arrays)); + for (int i = 0; i < num_threads; i++) { + std::unique_ptr ret(new MXAPIPredictor()); + ret->sym = sym; + ret->ctx = ctx; + ret->key2arg = key2arg; + ret->arg_arrays = arg_arrays; + ret->aux_arrays = aux_arrays; ret->out_shapes = out_shapes; - ret->out_arrays = ret->exec->outputs(); + + if (!lazy) { + std::map ctx_map; + std::vector grad_store(arg_arrays.size()); + std::vector grad_req(arg_arrays.size(), kNullOp); + ret->exec.reset(Executor::Bind(sym, ctx, ctx_map, + arg_arrays, + grad_store, grad_req, + aux_arrays)); + ret->out_arrays = ret->exec->outputs(); + } + out[i] = ret.release(); } - *out = ret; - API_END_HANDLE_ERROR(delete ret); + API_END_HANDLE_ERROR(); +} + +int MXPredCreatePartialOut(const char* symbol_json_str, + const void* param_bytes, + int param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + mx_uint num_output_nodes, + const char** output_keys, + PredictorHandle* out) { + return _CreatePartialOut( + symbol_json_str, + param_bytes, + param_size, + dev_type, dev_id, + num_input_nodes, + input_keys, + input_shape_indptr, + input_shape_data, + num_output_nodes, + output_keys, + 1, + false, + out); +} + +int MXPredCreate(const char* symbol_json_str, + const void* param_bytes, + int param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + PredictorHandle* out) { + return _CreatePartialOut( + symbol_json_str, + param_bytes, + param_size, + dev_type, + dev_id, + num_input_nodes, + input_keys, + input_shape_indptr, + input_shape_data, + 0, + NULL, + 1, + false, + out); +} + +int MXPredCreateMultiThread(const char* symbol_json_str, + const void* param_bytes, + int param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + // This is used for paralle inference. + int num_threads, + PredictorHandle* out) { + const char *type = getenv("MXNET_ENGINE_TYPE"); + std::string stype; + if (type) + stype = type; + CHECK(stype == "NaiveEngine") << "Multithread inference only works with NaiveEngine.\n" + << "Please set MXNET_ENGINE_TYPE to NaiveEngine" + << std::endl; + return _CreatePartialOut( + symbol_json_str, + param_bytes, + param_size, + dev_type, + dev_id, + num_input_nodes, + input_keys, + input_shape_indptr, + input_shape_data, + 0, + NULL, + num_threads, + true, + out); } int MXPredReshape(mx_uint num_input_nodes, @@ -258,6 +342,7 @@ int MXPredReshape(mx_uint num_input_nodes, const mx_uint* input_shape_data, PredictorHandle handle, PredictorHandle* out) { + _CreateExecutor(handle); MXAPIPredictor* p = static_cast(handle); std::unique_ptr ret(new MXAPIPredictor()); @@ -374,6 +459,7 @@ int MXPredSetInput(PredictorHandle handle, } int MXPredForward(PredictorHandle handle) { + _CreateExecutor(handle); MXAPIPredictor* p = static_cast(handle); API_BEGIN(); p->exec->Forward(false); @@ -381,6 +467,7 @@ int MXPredForward(PredictorHandle handle) { } int MXPredPartialForward(PredictorHandle handle, int step, int* step_left) { + _CreateExecutor(handle); MXAPIPredictor* p = static_cast(handle); API_BEGIN(); p->exec->PartialForward(false, step, step_left);