Update orca python api doc (intel-analytics#3501)
* Add orca estimators python api docs

* meet comments

* add docs

* refine tf2 comments

* change label cols

* add some updates

* fix style

Co-authored-by: cyita <[email protected]>
yangw1234 and cyita committed Sep 23, 2021
1 parent c945230 commit 57b9eda
Showing 6 changed files with 775 additions and 138 deletions.
173 changes: 169 additions & 4 deletions python/orca/src/bigdl/orca/learn/bigdl/estimator.py
@@ -33,13 +33,15 @@ def from_bigdl(*, model, loss=None, optimizer=None, metrics=None,
"""
Construct an Estimator with a BigDL model, loss function and Preprocessing for feature and
label data.
:param model: BigDL Model to be trained.
:param loss: BigDL criterion.
:param optimizer: BigDL optimizer.
:param metrics: An evaluation metric or a list of evaluation metrics.
:param feature_preprocessing: Used when data in `fit` and `predict` is a Spark DataFrame.
The param converts the data in feature column to a Tensor or to a Sample directly.
It expects a List of Int as the size of the converted Tensor, or a Preprocessing[F,
Tensor[T]].
If a List of Int is set as feature_preprocessing, it can only handle the case that
feature column contains the following data types:
@@ -55,7 +57,8 @@ def from_bigdl(*, model, loss=None, optimizer=None, metrics=None,
The feature_preprocessing will also be copied to the generated NNModel and applied
to feature column during transform.
:param label_preprocessing: Used when data in `fit` and `predict` is a Spark DataFrame.
Similar to feature_preprocessing, but applies to the label data.
:param model_dir: The path to save model. During the training, if checkpoint_trigger is
defined and triggered, the model will be saved to model_dir.
:return:
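A minimal construction sketch (assuming an initialized Orca context and the zoo.orca / BigDL Python packages this file already imports from; the layer, criterion and metric classes below are chosen purely for illustration):

# Sketch: build an Orca BigDL Estimator for a toy two-class classifier.
from bigdl.nn.layer import Sequential, Linear, LogSoftMax
from bigdl.nn.criterion import ClassNLLCriterion
from zoo.orca.learn.bigdl.estimator import Estimator
from zoo.orca.learn.metrics import Accuracy

model = Sequential()
model.add(Linear(10, 2))
model.add(LogSoftMax())

est = Estimator.from_bigdl(model=model,
                           loss=ClassNLLCriterion(),
                           metrics=[Accuracy()],
                           model_dir="/tmp/orca_bigdl_estimator")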
@@ -91,6 +94,26 @@ def __init__(self, *, model, loss, optimizer=None, metrics=None,
def fit(self, data, epochs, batch_size=32, feature_cols="features", label_cols="label",
caching_sample=True, validation_data=None, validation_trigger=None,
checkpoint_trigger=None):
"""
Train this BigDL model with training data.
:param data: train data. It can be XShards or Spark DataFrame.
If data is XShards, each partition is a dictionary of {'x': feature,
'y': label}, where feature(label) is a numpy array or a list of numpy arrays.
:param epochs: Number of epochs to train the model.
:param batch_size: Batch size used for training. Default: 32.
:param feature_cols: Feature column name(s) of data. Only used when data is a Spark
DataFrame. Default: "features".
:param label_cols: Label column name(s) of data. Only used when data is a Spark DataFrame.
Default: "label".
:param caching_sample: Whether to cache the Samples after preprocessing. Default: True.
:param validation_data: Validation data. XShards and Spark DataFrame are supported.
If data is XShards, each partition is a dictionary of {'x': feature,
'y': label}, where feature(label) is a numpy array or a list of numpy arrays.
:param validation_trigger: Orca Trigger to trigger validation computation.
:param checkpoint_trigger: Orca Trigger to set a checkpoint.
:return:
"""
from zoo.orca.learn.trigger import Trigger

assert batch_size > 0, "batch_size should be greater than 0"
@@ -166,6 +189,26 @@ def fit(self, data, epochs, batch_size=32, feature_cols="features", label_cols="
return self

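Continuing the sketch above, a hedged example of fit on an in-memory XShards built from numpy arrays (XShards.partition and EveryEpoch are assumed from the zoo.orca packages; an Orca context must already be initialized):

import numpy as np
from zoo.orca.data import XShards
from zoo.orca.learn.trigger import EveryEpoch

# Toy training set: 100 samples, 10 features, 1-based integer labels for ClassNLLCriterion.
train_shards = XShards.partition({"x": np.random.randn(100, 10).astype(np.float32),
                                  "y": np.random.randint(1, 3, size=(100, 1)).astype(np.float32)})

est.fit(data=train_shards,
        epochs=2,
        batch_size=32,
        validation_data=train_shards,     # illustrative only; use a held-out set in practice
        validation_trigger=EveryEpoch(),
        checkpoint_trigger=EveryEpoch())  # checkpoints go to the model_dir set at construction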
def predict(self, data, batch_size=4, feature_cols="features", sample_preprocessing=None):
"""
Predict on input data.
:param data: prediction input data. It can be XShards or a Spark DataFrame.
If data is XShards, each partition is a dictionary of {'x': feature}, where feature
is a numpy array or a list of numpy arrays.
:param batch_size: Batch size used for inference. Default: 4.
:param feature_cols: Feature column name(s) of data. Only used when data is a Spark
DataFrame. Default: "features".
:param sample_preprocessing: Used when data is a Spark DataFrame. If the user wants to change
the default feature_preprocessing specified in Estimator.from_bigdl, a new
sample_preprocessing method can be passed here.
:return: the predicted result.
If the input data is a Spark DataFrame, the predict result is a DataFrame which includes the
original columns plus a 'prediction' column. The 'prediction' column can be FloatType, VectorUDT
or Array of VectorUDT depending on the model output shape.
If the input data is an XShards, the predict result is an XShards, and each partition of the
XShards is a dictionary of {'prediction': result}, where result is a numpy array or a list of
numpy arrays.
"""
if isinstance(data, DataFrame):
if isinstance(feature_cols, list):
data, _, feature_cols = \
@@ -185,6 +228,18 @@ def predict(self, data, batch_size=4, feature_cols="features", sample_preprocess
data.__class__.__name__)

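A short prediction sketch following on from the fit example (collect() on the returned XShards is assumed to yield the per-partition dictionaries described in the docstring):

# Predict on the same XShards; each result partition is {'prediction': numpy array}.
pred_shards = est.predict(train_shards, batch_size=4)
first_partition = pred_shards.collect()[0]
print(first_partition["prediction"].shape)

# Spark DataFrame variant (assuming df has a 'features' vector column); predictions are
# appended as a new 'prediction' column:
# pred_df = est.predict(df, feature_cols="features")
# pred_df.select("prediction").show(5)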
def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
"""
Evaluate the model.
:param data: validation data. It can be an XShards, where each partition is a dictionary of
{'x': feature, 'y': label}, and feature (label) is a numpy array or a list of numpy arrays.
:param batch_size: Batch size used for validation. Default: 32.
:param feature_cols: (Not supported yet) Feature column name(s) of data. Only used when
data is a Spark DataFrame. Default: None.
:param label_cols: (Not supported yet) Label column name(s) of data. Only used when data
is a Spark DataFrame. Default: None.
:return:
"""
assert data is not None, "validation data shouldn't be None"
assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
" argument when creating this estimator."
@@ -204,9 +259,20 @@ def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
return bigdl_metric_results_to_dict(result)

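An evaluation sketch in the same vein; the result is a dictionary keyed by the metric names set in from_bigdl:

# Evaluate on XShards (a held-out set should be used in practice).
eval_results = est.evaluate(train_shards, batch_size=32)
print(eval_results)  # dict mapping metric name to value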
def get_model(self):
"""
Get the trained BigDL model.
:return: The trained BigDL model.
"""
return self.model

def save(self, model_path):
"""
Save the BigDL model to model_path.
:param model_path: The path to save the trained model.
:return:
"""
try:
model = self.get_model()
model.saveModel(model_path + ".bigdl", model_path + ".bin", True)
@@ -215,6 +281,39 @@ def save(self, model_path):

def load(self, checkpoint, optimizer=None, loss=None, feature_preprocessing=None,
label_preprocessing=None, model_dir=None, is_checkpoint=False):
"""
Load an existing BigDL model or checkpoint.
:param checkpoint: Path to the existing model or checkpoint.
:param optimizer: BigDL optimizer.
:param loss: BigDL criterion.
:param feature_preprocessing: Used when data in `fit` and `predict` is a Spark DataFrame.
The param converts the data in feature column to a Tensor or to a Sample directly.
It expects a List of Int as the size of the converted Tensor, or a Preprocessing[F,
Tensor[T]]
If a List of Int is set as feature_preprocessing, it can only handle the case that
feature column contains the following data types:
Float, Double, Int, Array[Float], Array[Double], Array[Int] and MLlib Vector. The
feature data are converted to Tensors with the specified sizes before
sending to the model. Internally, a SeqToTensor is generated according to the
size, and used as the feature_preprocessing.
Alternatively, user can set feature_preprocessing as Preprocessing[F, Tensor[T]]
that transforms the feature data to a Tensor[T]. Some pre-defined Preprocessing are
provided in package zoo.feature. Multiple Preprocessing can be combined as a
ChainedPreprocessing.
The feature_preprocessing will also be copied to the generated NNModel and applied
to feature column during transform.
:param label_preprocessing: Used when data in `fit` and `predict` is a Spark DataFrame.
Similar to feature_preprocessing, but applies to the label data.
:param model_dir: The path to save model. During the training, if checkpoint_trigger is
defined and triggered, the model will be saved to model_dir.
:param is_checkpoint: Whether the path is a checkpoint or a saved BigDL model.
Default: False.
:return: The loaded estimator object.
"""
if loss is not None:
self.loss = loss
if optimizer is not None:
@@ -243,6 +342,15 @@ def load(self, checkpoint, optimizer=None, loss=None, feature_preprocessing=None
return self

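A save/load round-trip sketch; the load path convention is an assumption made symmetric with the save implementation shown above (which appends .bigdl and .bin suffixes):

# Persist the trained model, then load it into a fresh estimator.
est.save("/tmp/orca_bigdl_estimator/my_model")  # writes my_model.bigdl and my_model.bin

new_est = Estimator.from_bigdl(model=model, loss=ClassNLLCriterion(), metrics=[Accuracy()])
new_est.load("/tmp/orca_bigdl_estimator/my_model", is_checkpoint=False)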
def load_orca_checkpoint(self, path, version, prefix=None):
"""
Load an existing checkpoint.
:param path: Path to the existing checkpoint.
:param version: checkpoint version, which is the suffix of the model.* file,
e.g., for the model.4 file, the version is 4.
:param prefix: optimMethod prefix, for example 'optimMethod-Sequentialf53bddcc'.
:return:
"""
from bigdl.nn.layer import Model, Container
from bigdl.optim.optimizer import OptimMethod
import os
@@ -263,25 +371,57 @@ def load_orca_checkpoint(self, path, version, prefix=None):
self.nn_model = NNModel(self.model, feature_preprocessing=self.feature_preprocessing)

def load_latest_orca_checkpoint(self, path):
"""
Load the latest Orca checkpoint under the specified directory.
:param path: Directory containing Orca checkpoint files.
"""
from zoo.orca.learn.utils import find_latest_checkpoint
path, prefix, version = find_latest_checkpoint(path, model_type="bigdl")
if path is None:
raise ValueError("Cannot find BigDL checkpoint, please check your checkpoint path.")
self.load_orca_checkpoint(path=path, version=version, prefix=prefix)

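A checkpoint-resume sketch; the directory is the model_dir used at construction, and version 4 is only an illustration of the model.* suffix described above:

# Resume from the newest Orca checkpoint written under model_dir during a previous fit.
est.load_latest_orca_checkpoint("/tmp/orca_bigdl_estimator")

# Or pin an explicit checkpoint version (the suffix of the model.* file):
est.load_orca_checkpoint("/tmp/orca_bigdl_estimator", version=4)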
def clear_gradient_clipping(self):
"""
Clear gradient clipping parameters. In this case, gradient clipping will not be applied.
In order to take effect, it needs to be called before fit.
:return:
"""
self.nn_estimator.clearGradientClipping()
self.estimator.clear_gradient_clipping()

def set_constant_gradient_clipping(self, min, max):
"""
Set constant gradient clipping during the training process.
In order to take effect, it needs to be called before fit.
:param min: The minimum value to clip by.
:param max: The maximum value to clip by.
:return:
"""
self.nn_estimator.setConstantGradientClipping(min, max)
self.estimator.set_constant_gradient_clipping(min, max)

def set_l2_norm_gradient_clipping(self, clip_norm):
"""
Clip gradient to a maximum L2-Norm during the training process.
In order to take effect, it needs to be called before fit.
:param clip_norm: Gradient L2-Norm threshold.
:return:
"""
self.nn_estimator.setGradientClippingByL2Norm(clip_norm)
self.estimator.set_l2_norm_gradient_clipping(clip_norm)

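A gradient-clipping sketch; the thresholds are arbitrary and, as the docstrings note, the settings only affect fit calls made afterwards:

# Clamp every gradient element to [-2.0, 2.0] for subsequent training.
est.set_constant_gradient_clipping(min=-2.0, max=2.0)

# Or switch to clipping by global L2 norm instead.
est.clear_gradient_clipping()
est.set_l2_norm_gradient_clipping(clip_norm=5.0)

est.fit(data=train_shards, epochs=1)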
def get_train_summary(self, tag=None):
"""
Get the scalar from the model's train summary.
Returns a list of summary data in the form [iteration_number, scalar_value, timestamp].
:param tag: The string variable representing the scalar wanted.
"""
# Exception handle
if tag != "Loss" and tag != "LearningRate" and tag != "Throughput":
raise TypeError('Only "Loss", "LearningRate", "Throughput"'
@@ -293,6 +433,31 @@ def get_train_summary(self, tag=None):
return self.estimator.get_train_summary(tag=tag)

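A training-summary sketch; it assumes summary collection has been enabled for this estimator (e.g. via TensorBoard logging, which is not part of this diff):

# Each entry is [iteration_number, scalar_value, timestamp]; only 'Loss', 'LearningRate'
# and 'Throughput' are accepted tags.
loss_summary = est.get_train_summary(tag="Loss")
for record in loss_summary[:5]:
    print(record)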
def get_validation_summary(self, tag=None):
"""
Get the scalar from the model's validation summary.
Returns a list of summary data in the form [iteration_number, scalar_value, timestamp].
Note: the metric name and the tag may not be identical. Please use the following table to
find the tag to pass: the left side is the metric used during compile and the right side is
the corresponding tag.
'Accuracy' | 'Top1Accuracy'
'BinaryAccuracy' | 'Top1Accuracy'
'CategoricalAccuracy' | 'Top1Accuracy'
'SparseCategoricalAccuracy' | 'Top1Accuracy'
'AUC' | 'AucScore'
'HitRatio' | 'HitRate@k' (k is Top-k)
'Loss' | 'Loss'
'MAE' | 'MAE'
'NDCG' | 'NDCG'
'TFValidationMethod' | '${name + " " + valMethod.toString()}'
'Top5Accuracy' | 'Top5Accuracy'
'TreeNNAccuracy' | 'TreeNNAccuracy()'
'MeanAveragePrecision' | 'MAP@k' (k is Top-k) (BigDL)
'MeanAveragePrecision' | 'PascalMeanAveragePrecision' (Zoo)
'StatelessMetric' | '${name}'
:param tag: The string variable representing the scalar wanted.
"""
if self.is_nnframe_fit:
assert tag is not None, "You should provide tag which should match the name of " \
"the ValidationMethod set into the optimizer. " \
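A validation-summary sketch using the tag mapping table above: a model compiled with an accuracy metric is queried with the 'Top1Accuracy' tag. As with the train summary, summary collection is assumed to be enabled:

# The Accuracy metric set at construction maps to the 'Top1Accuracy' tag.
val_summary = est.get_validation_summary(tag="Top1Accuracy")
for record in val_summary[:5]:
    print(record)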
