Feat/optimize single prediction (#2992)

* [performance] Add Fast methods to C API for SingleRow Predictions * Add methods to C API to make single-row predictions faster: - LGBM_BoosterPredictForMatSingleRowFastInit (setup) - LGBM_BoosterPredictForMatSingleRowFast (predict) - LGBM_FastConfigFree (cleanup setup outputs) * Code style cleanup * Fix lint errors * [performance] Revert FastConfig improvement to pass data at init This reduces optimization by 5% / 30% with this branch but makes it so it can be used for higher level wrappers in MMLSpark. And outside it as well. * [performance] Introduce Fast variants for SingleRow predictors. Although this already provides performance gains by itself for any callers, two new functions were added to Java's SWIG interfaces to exploit that AND the GetPrimitiveArrayCritical data fetches. * [tests/profiling] Profile Fast predict methods Build with -DBUILD_PROFILING_TESTS=ON and copy the default model trained on the Higgs dataset from the benchmarks repo https://github.com/guolinke/boosting_tree_benchmarks.git to LightGBM repo root and run the lightgbm_profile_* binaries. The single instance used is the first row from that dataset. * Update comment on CMakeLists. * Fix doxygen-introduced issue (#threads) * Fix conflicts due to new RowFunctionFromCSR signature in master * Change FastConfig ncol to int32_t. * Removed profiling folder * fix doxygen typo include/LightGBM/c_api.h Co-authored-by: Nikita Titov <[email protected]> * fix doxygen typo include/LightGBM/c_api.h Co-authored-by: Nikita Titov <[email protected]> * fix doxygen typo include/LightGBM/c_api.h Co-authored-by: Nikita Titov <[email protected]> * Doxygen: change new docstrings to double back-quote Co-authored-by: alberto.ferreira <[email protected]> Co-authored-by: Nikita Titov <[email protected]>
microsoft · Jul 15, 2020 · fc79b36 · fc79b36
1 parent 87d4648
commit fc79b36
Show file tree

Hide file tree

Showing 3 changed files with 309 additions and 1 deletion.
diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h
@@ -22,6 +22,7 @@
 
 typedef void* DatasetHandle;  /*!< \brief Handle of dataset. */
 typedef void* BoosterHandle;  /*!< \brief Handle of booster. */
+typedef void* FastConfigHandle; /*!< \brief Handle of FastConfig. */
 
 #define C_API_DTYPE_FLOAT32 (0)  /*!< \brief float32 (single precision float). */
 #define C_API_DTYPE_FLOAT64 (1)  /*!< \brief float64 (double precision float). */
@@ -580,7 +581,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterGetEvalCounts(BoosterHandle handle,
  * \param len Number of ``char*`` pointers stored at ``out_strs``.
  *            If smaller than the max size, only this many strings are copied
  * \param[out] out_len Total number of evaluation datasets
- * \param buffer_len Size of pre-allocated strings. 
+ * \param buffer_len Size of pre-allocated strings.
  *                   Content is copied up to ``buffer_len - 1`` and null-terminated
  * \param[out] out_buffer_len String sizes required to do the full string copies
  * \param[out] out_strs Names of evaluation datasets, should pre-allocate memory
@@ -706,6 +707,14 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
                                                  int num_iteration,
                                                  int64_t* out_len);
 
+/*!
+ * \brief Release FastConfig object.
+ *
+ * \note
+ *   The object behind the handle is deleted by this call, so the handle must not be
+ *   passed to any ``*Fast()`` prediction method afterwards.
+ *
+ * \param fastConfig Handle to the FastConfig object acquired with a ``*FastInit()`` method.
+ * \return 0 when it succeeds, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_FastConfigFree(FastConfigHandle fastConfig);
+
 /*!
  * \brief Make prediction for a new dataset in CSR format.
  * \note
@@ -847,6 +856,73 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
                                                          int64_t* out_len,
                                                          double* out_result);
 
+/*!
+ * \brief Initialize and return a ``FastConfigHandle`` for use with ``LGBM_BoosterPredictForCSRSingleRowFast``.
+ *
+ * Release the ``FastConfig`` by passing its handle to ``LGBM_FastConfigFree`` when no longer needed.
+ *
+ * \note
+ *   If ``parameter`` sets a positive ``num_threads``, the OpenMP thread count is set during this call.
+ *
+ * \param handle Booster handle
+ * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
+ * \param num_col Number of columns, must be greater than zero and smaller than ``INT32_MAX``
+ * \param parameter Other parameters for prediction, e.g. early stopping for prediction
+ * \param[out] out_fastConfig FastConfig object with which you can call ``LGBM_BoosterPredictForCSRSingleRowFast``
+ * \return 0 when it succeeds, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
+                                                                 const int data_type,
+                                                                 const int64_t num_col,
+                                                                 const char* parameter,
+                                                                 FastConfigHandle *out_fastConfig);
+
+/*!
+ * \brief Faster variant of ``LGBM_BoosterPredictForCSRSingleRow``.
+ *
+ * Score single rows after setup with ``LGBM_BoosterPredictForCSRSingleRowFastInit``.
+ *
+ * By removing the setup steps from this call extra optimizations can be made like
+ * initializing the config only once, instead of once per call.
+ *
+ * \note
+ *   Setting up the number of threads is only done once at ``LGBM_BoosterPredictForCSRSingleRowFastInit``
+ *   instead of at each prediction.
+ *   If you use a different number of threads in other calls, you need to start the setup process over,
+ *   or that number of threads will be used for these calls as well.
+ *
+ * \note
+ *   You should pre-allocate memory for ``out_result``:
+ *   - for normal and raw score, its length is equal to ``num_class * num_data``;
+ *   - for leaf index, its length is equal to ``num_class * num_data * num_iteration``;
+ *   - for feature contributions, its length is equal to ``num_class * num_data * (num_feature + 1)``.
+ *
+ * \param fastConfig_handle FastConfig object handle returned by ``LGBM_BoosterPredictForCSRSingleRowFastInit``
+ * \param indptr Pointer to row headers
+ * \param indptr_type Type of ``indptr``, can be ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64``
+ * \param indices Pointer to column indices
+ * \param data Pointer to the data space, its element type is the ``data_type`` given at init
+ * \param nindptr Number of rows in the matrix + 1
+ * \param nelem Number of nonzero elements in the matrix
+ * \param predict_type What should be predicted
+ *   - ``C_API_PREDICT_NORMAL``: normal prediction, with transform (if needed);
+ *   - ``C_API_PREDICT_RAW_SCORE``: raw score;
+ *   - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
+ *   - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param num_iteration Number of iterations for prediction, <= 0 means no limit
+ * \param[out] out_len Length of output result
+ * \param[out] out_result Pointer to array with predictions
+ * \return 0 when it succeeds, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle,
+                                                             const void* indptr,
+                                                             int indptr_type,
+                                                             const int32_t* indices,
+                                                             const void* data,
+                                                             int64_t nindptr,
+                                                             int64_t nelem,
+                                                             int predict_type,
+                                                             int num_iteration,
+                                                             int64_t* out_len,
+                                                             double* out_result);
+
 /*!
  * \brief Make prediction for a new dataset in CSC format.
  * \note
@@ -960,6 +1036,57 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
                                                          int64_t* out_len,
                                                          double* out_result);
 
+/*!
+ * \brief Initialize and return a ``FastConfigHandle`` for use with ``LGBM_BoosterPredictForMatSingleRowFast``.
+ *
+ * Release the ``FastConfig`` by passing its handle to ``LGBM_FastConfigFree`` when no longer needed.
+ *
+ * \note
+ *   If ``parameter`` sets a positive ``num_threads``, the OpenMP thread count is set during this call.
+ *
+ * \param handle Booster handle
+ * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
+ * \param ncol Number of columns of each row later passed to ``LGBM_BoosterPredictForMatSingleRowFast``
+ * \param parameter Other parameters for prediction, e.g. early stopping for prediction
+ * \param[out] out_fastConfig FastConfig object with which you can call ``LGBM_BoosterPredictForMatSingleRowFast``
+ * \return 0 when it succeeds, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
+                                                                 int data_type,
+                                                                 int32_t ncol,
+                                                                 const char* parameter,
+                                                                 FastConfigHandle *out_fastConfig);
+
+/*!
+ * \brief Faster variant of ``LGBM_BoosterPredictForMatSingleRow``.
+ *
+ * Score a single row after setup with ``LGBM_BoosterPredictForMatSingleRowFastInit``.
+ *
+ * By removing the setup steps from this call extra optimizations can be made like
+ * initializing the config only once, instead of once per call.
+ *
+ * \note
+ *   Setting up the number of threads is only done once at ``LGBM_BoosterPredictForMatSingleRowFastInit``
+ *   instead of at each prediction.
+ *   If you use a different number of threads in other calls, you need to start the setup process over,
+ *   or that number of threads will be used for these calls as well.
+ *
+ * \param fastConfig_handle FastConfig object handle returned by ``LGBM_BoosterPredictForMatSingleRowFastInit``
+ * \param data Pointer to the single row to score, read in row-major form using the ``data_type``
+ *             and ``ncol`` given at init
+ * \param predict_type What should be predicted
+ *   - ``C_API_PREDICT_NORMAL``: normal prediction, with transform (if needed);
+ *   - ``C_API_PREDICT_RAW_SCORE``: raw score;
+ *   - ``C_API_PREDICT_LEAF_INDEX``: leaf index;
+ *   - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
+ * \param num_iteration Number of iterations for prediction, <= 0 means no limit
+ * \param[out] out_len Length of output result
+ * \param[out] out_result Pointer to array with predictions
+ * \return 0 when it succeeds, -1 when failure happens
+ */
+LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fastConfig_handle,
+                                                             const void* data,
+                                                             int predict_type,
+                                                             int num_iteration,
+                                                             int64_t* out_len,
+                                                             double* out_result);
+
 /*!
  * \brief Make prediction for a new dataset presented in a form of array of pointers to rows.
  * \note

diff --git a/src/c_api.cpp b/src/c_api.cpp
@@ -1739,6 +1739,36 @@ int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
   API_END();
 }
 
+/*!
+ * \brief Object to store resources meant for single-row Fast Predict methods.
+ *
+ * Meant to be used as a basic struct by the *Fast* predict methods only.
+ * It stores the configuration resources for reuse during prediction:
+ * the booster pointer, the ``Config`` parsed from the parameter string,
+ * the input data type and the number of columns.
+ *
+ * The row data and the row function are NOT stored here: the *Fast* predict
+ * methods receive the data pointer on every call and build the row accessor
+ * from it using the ``data_type`` and ``ncol`` kept in this struct.
+ */
+struct FastConfig {
+  FastConfig(Booster *const booster_ptr,
+             const char *parameter,
+             const int data_type_,
+             const int32_t num_cols) : booster(booster_ptr), data_type(data_type_), ncol(num_cols) {
+    config.Set(Config::Str2Map(parameter));
+  }
+
+  Booster* const booster;  /*!< \brief Booster used for scoring; raw pointer, not deleted with this struct. */
+  Config config;  /*!< \brief Prediction config built once from the ``parameter`` string. */
+  const int data_type;  /*!< \brief Data type of the feature values passed at prediction time. */
+  const int32_t ncol;  /*!< \brief Number of columns handed to ``PredictSingleRow``. */
+};
+
+// Deletes the FastConfig object that the given handle points to.
+int LGBM_FastConfigFree(FastConfigHandle fastConfig) {
+  API_BEGIN();
+  FastConfig* const config_to_release = reinterpret_cast<FastConfig*>(fastConfig);
+  delete config_to_release;
+  API_END();
+}
+
 int LGBM_BoosterPredictForCSR(BoosterHandle handle,
                               const void* indptr,
                               int indptr_type,
@@ -1890,6 +1920,51 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
   API_END();
 }
 
+// Checks num_col, builds the reusable FastConfig for CSR single-row prediction
+// and hands ownership of it to the caller through out_fastConfig.
+int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
+                                               const int data_type,
+                                               const int64_t num_col,
+                                               const char* parameter,
+                                               FastConfigHandle *out_fastConfig) {
+  API_BEGIN();
+  // FastConfig keeps the column count as int32_t, so only values in (0, INT32_MAX) are accepted.
+  if (num_col <= 0) {
+    Log::Fatal("The number of columns should be greater than zero.");
+  }
+  if (num_col >= INT32_MAX) {
+    Log::Fatal("The number of columns should be smaller than INT32_MAX.");
+  }
+
+  Booster* const booster = reinterpret_cast<Booster*>(handle);
+  const int32_t narrowed_ncol = static_cast<int32_t>(num_col);
+  std::unique_ptr<FastConfig> prepared(new FastConfig(booster, parameter, data_type, narrowed_ncol));
+
+  // The thread count is applied once here rather than on every prediction.
+  const auto requested_threads = prepared->config.num_threads;
+  if (requested_threads > 0) {
+    omp_set_num_threads(requested_threads);
+  }
+
+  // From here on the caller owns the object and frees it with LGBM_FastConfigFree.
+  *out_fastConfig = prepared.release();
+  API_END();
+}
+
+// Scores one CSR row using the booster, config, data type and column count
+// captured in the FastConfig behind fastConfig_handle.
+int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle,
+                                           const void* indptr,
+                                           int indptr_type,
+                                           const int32_t* indices,
+                                           const void* data,
+                                           int64_t nindptr,
+                                           int64_t nelem,
+                                           int predict_type,
+                                           int num_iteration,
+                                           int64_t* out_len,
+                                           double* out_result) {
+  API_BEGIN();
+  FastConfig* const cfg = reinterpret_cast<FastConfig*>(fastConfig_handle);
+  // The row accessor is rebuilt per call because the data pointers arrive per call.
+  auto row_reader = RowFunctionFromCSR<int>(indptr, indptr_type, indices, data,
+                                            cfg->data_type, nindptr, nelem);
+  cfg->booster->PredictSingleRow(num_iteration, predict_type, cfg->ncol,
+                                 row_reader, cfg->config, out_result, out_len);
+  API_END();
+}
+
 
 int LGBM_BoosterPredictForCSC(BoosterHandle handle,
                               const void* col_ptr,
@@ -1987,6 +2062,42 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
   API_END();
 }
 
+// Builds the reusable FastConfig for dense single-row prediction and hands
+// ownership of it to the caller through out_fastConfig.
+int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
+                                               const int data_type,
+                                               const int32_t ncol,
+                                               const char* parameter,
+                                               FastConfigHandle *out_fastConfig) {
+  API_BEGIN();
+  Booster* const booster = reinterpret_cast<Booster*>(handle);
+  std::unique_ptr<FastConfig> prepared(new FastConfig(booster, parameter, data_type, ncol));
+
+  // The thread count is applied once here rather than on every prediction.
+  const auto requested_threads = prepared->config.num_threads;
+  if (requested_threads > 0) {
+    omp_set_num_threads(requested_threads);
+  }
+
+  // From here on the caller owns the object and frees it with LGBM_FastConfigFree.
+  *out_fastConfig = prepared.release();
+  API_END();
+}
+
+// Scores one dense row using the booster, config, data type and column count
+// captured in the FastConfig behind fastConfig_handle.
+int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fastConfig_handle,
+                                           const void* data,
+                                           const int predict_type,
+                                           const int num_iteration,
+                                           int64_t* out_len,
+                                           double* out_result) {
+  API_BEGIN();
+  FastConfig* const cfg = reinterpret_cast<FastConfig*>(fastConfig_handle);
+  // Treat `data` as a 1 x ncol matrix in row-major layout.
+  auto row_reader = RowPairFunctionFromDenseMatric(data, 1, cfg->ncol, cfg->data_type, 1);
+  cfg->booster->PredictSingleRow(num_iteration, predict_type, cfg->ncol,
+                                 row_reader, cfg->config,
+                                 out_result, out_len);
+  API_END();
+}
+
 
 int LGBM_BoosterPredictForMats(BoosterHandle handle,
                                const void** data,

diff --git a/swig/lightgbmlib.i b/swig/lightgbmlib.i
@@ -97,6 +97,32 @@
     return ret;
   }
 
+  /*! \brief Even faster variant of `LGBM_BoosterPredictForMatSingle`.
+   *
+   * Uses `LGBM_BoosterPredictForMatSingleRowFast` which is faster
+   * than `LGBM_BoosterPredictForMatSingleRow` and the trick of
+   * `LGBM_BoosterPredictForMatSingle` to capture the Java data array
+   * using `GetPrimitiveArrayCritical`, which can yield faster access
+   * to the array if the JVM passes the actual address to the C++ side
+   * instead of performing a copy.
+   *
+   * \param jenv JNI environment of the calling thread
+   * \param data Java array holding the single row to score
+   * \param handle FastConfig handle passed on to `LGBM_BoosterPredictForMatSingleRowFast`
+   * \param predict_type Forwarded to `LGBM_BoosterPredictForMatSingleRowFast`
+   * \param num_iteration Forwarded to `LGBM_BoosterPredictForMatSingleRowFast`
+   * \param[out] out_len Forwarded to `LGBM_BoosterPredictForMatSingleRowFast`
+   * \param[out] out_result Forwarded to `LGBM_BoosterPredictForMatSingleRowFast`
+   * \return Result of `LGBM_BoosterPredictForMatSingleRowFast`,
+   *         or -1 when the Java array could not be accessed
+   */
+  int LGBM_BoosterPredictForMatSingleRowFastCriticalSWIG(JNIEnv *jenv,
+                                                         jdoubleArray data,
+                                                         FastConfigHandle handle,
+                                                         int predict_type,
+                                                         int num_iteration,
+                                                         int64_t* out_len,
+                                                         double* out_result) {
+    double* data0 = (double*)jenv->GetPrimitiveArrayCritical(data, 0);
+    if (!data0) {
+      // JNI signals failure with a NULL pointer; predicting on it would dereference NULL.
+      return -1;
+    }
+
+    int ret = LGBM_BoosterPredictForMatSingleRowFast(handle, data0, predict_type,
+                                                     num_iteration, out_len, out_result);
+
+    // The row is only read, so JNI_ABORT releases it without copying anything back.
+    jenv->ReleasePrimitiveArrayCritical(data, data0, JNI_ABORT);
+
+    return ret;
+  }
+
   int LGBM_BoosterPredictForCSRSingle(JNIEnv *jenv,
                                       jintArray indices,
                                       jdoubleArray values,
@@ -132,6 +158,50 @@
     return ret;
   }
 
+  /*! \brief Even faster variant of `LGBM_BoosterPredictForCSRSingle`.
+   *
+   * Uses `LGBM_BoosterPredictForCSRSingleRowFast` which is faster
+   * than `LGBM_BoosterPredictForCSRSingleRow` and the trick of
+   * `LGBM_BoosterPredictForCSRSingle` to capture the Java data array
+   * using `GetPrimitiveArrayCritical`, which can yield faster access
+   * to the array if the JVM passes the actual address to the C++ side
+   * instead of performing a copy.
+   *
+   * The data type, number of columns and prediction parameters are not passed here:
+   * they are fixed by `LGBM_BoosterPredictForCSRSingleRowFastInit` and carried by `handle`.
+   *
+   * \param jenv JNI environment of the calling thread
+   * \param indices Java array with the column indices of the row's nonzero entries
+   * \param values Java array with the row's nonzero values
+   * \param numNonZeros Number of entries of the single row, used as the end of the row pointer
+   * \param handle FastConfig handle passed on to `LGBM_BoosterPredictForCSRSingleRowFast`
+   * \param indptr_type Forwarded as the type of the internally built row pointer.
+   *                    NOTE(review): that row pointer is an `int32_t[2]`, so this presumably
+   *                    has to be `C_API_DTYPE_INT32`; confirm against callers.
+   * \param nelem Forwarded to `LGBM_BoosterPredictForCSRSingleRowFast`
+   * \param predict_type Forwarded to `LGBM_BoosterPredictForCSRSingleRowFast`
+   * \param num_iteration Forwarded to `LGBM_BoosterPredictForCSRSingleRowFast`
+   * \param[out] out_len Forwarded to `LGBM_BoosterPredictForCSRSingleRowFast`
+   * \param[out] out_result Forwarded to `LGBM_BoosterPredictForCSRSingleRowFast`
+   * \return Result of `LGBM_BoosterPredictForCSRSingleRowFast`,
+   *         or -1 when one of the Java arrays could not be accessed
+   */
+  int LGBM_BoosterPredictForCSRSingleRowFastCriticalSWIG(JNIEnv *jenv,
+                                                         jintArray indices,
+                                                         jdoubleArray values,
+                                                         int numNonZeros,
+                                                         FastConfigHandle handle,
+                                                         int indptr_type,
+                                                         int64_t nelem,
+                                                         int predict_type,
+                                                         int num_iteration,
+                                                         int64_t* out_len,
+                                                         double* out_result) {
+    // Alternatives
+    // - GetIntArrayElements: performs copy
+    // - GetDirectBufferAddress: fails on wrapped array
+    // Some words of warning for GetPrimitiveArrayCritical
+    // https://stackoverflow.com/questions/23258357/whats-the-trade-off-between-using-getprimitivearraycritical-and-getprimitivety
+
+    int* indices0 = (int*)jenv->GetPrimitiveArrayCritical(indices, 0);
+    if (!indices0) {
+      // JNI signals failure with a NULL pointer; nothing has been acquired yet.
+      return -1;
+    }
+
+    double* values0 = (double*)jenv->GetPrimitiveArrayCritical(values, 0);
+    if (!values0) {
+      // Leave the critical region opened for `indices` before reporting the failure.
+      jenv->ReleasePrimitiveArrayCritical(indices, indices0, JNI_ABORT);
+      return -1;
+    }
+
+    // Row pointer of a one-row CSR matrix: the row spans entries [0, numNonZeros).
+    int32_t ind[2] = { 0, numNonZeros };
+
+    int ret = LGBM_BoosterPredictForCSRSingleRowFast(handle, ind, indptr_type, indices0, values0, 2,
+                                                     nelem, predict_type, num_iteration, out_len, out_result);
+
+    // Both arrays are only read, so JNI_ABORT releases them without copying anything back.
+    jenv->ReleasePrimitiveArrayCritical(values, values0, JNI_ABORT);
+    jenv->ReleasePrimitiveArrayCritical(indices, indices0, JNI_ABORT);
+
+    return ret;
+  }
+
   #include <functional>
   #include <vector>