Update orca python api doc (intel-analytics#3501)
* Add orca estimators python api docs

* meet comments

* add docs

* refine tf2 comments

* change label cols

* add some updates

* fix style

Co-authored-by: cyita <[email protected]>
yangw1234 and cyita committed Sep 23, 2021
1 parent c945230 commit 57b9eda
Showing 6 changed files with 775 additions and 138 deletions.
173 changes: 169 additions & 4 deletions python/orca/src/bigdl/orca/learn/bigdl/estimator.py
@@ -33,13 +33,15 @@ def from_bigdl(*, model, loss=None, optimizer=None, metrics=None,
"""
Construct an Estimator with a BigDL model, loss function and Preprocessing for feature and
label data.
:param model: BigDL Model to be trained.
:param loss: BigDL criterion.
:param optimizer: BigDL optimizer.
:param metrics: An evaluation metric or a list of evaluation metrics.
:param feature_preprocessing: Used when data in `fit` and `predict` is a Spark DataFrame.
The param converts the data in feature column to a Tensor or to a Sample directly.
It expects a List of Int as the size of the converted Tensor, or a Preprocessing[F,
Tensor[T]].
If a List of Int is set as feature_preprocessing, it can only handle the case that
feature column contains the following data types:
@@ -55,7 +57,8 @@ def from_bigdl(*, model, loss=None, optimizer=None, metrics=None,
The feature_preprocessing will also be copied to the generated NNModel and applied
to feature column during transform.
:param label_preprocessing: Used when data in `fit` and `predict` is a Spark DataFrame.
Similar to feature_preprocessing, but applies to the label data.
:param model_dir: The path to save model. During the training, if checkpoint_trigger is
defined and triggered, the model will be saved to model_dir.
:return:
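A minimal construction sketch (assuming an initialized Orca context and the zoo.orca / BigDL Python packages this file already imports from; the layer, criterion and metric classes below are chosen purely for illustration):

# Sketch: build an Orca BigDL Estimator for a toy two-class classifier.
from bigdl.nn.layer import Sequential, Linear, LogSoftMax
from bigdl.nn.criterion import ClassNLLCriterion
from zoo.orca.learn.bigdl.estimator import Estimator
from zoo.orca.learn.metrics import Accuracy

model = Sequential()
model.add(Linear(10, 2))
model.add(LogSoftMax())

est = Estimator.from_bigdl(model=model,
                           loss=ClassNLLCriterion(),
                           metrics=[Accuracy()],
                           model_dir="/tmp/orca_bigdl_estimator")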
@@ -91,6 +94,26 @@ def __init__(self, *, model, loss, optimizer=None, metrics=None,
def fit(self, data, epochs, batch_size=32, feature_cols="features", label_cols="label",
caching_sample=True, validation_data=None, validation_trigger=None,
checkpoint_trigger=None):
"""
Train this BigDL model with training data.
:param data: train data. It can be XShards or Spark DataFrame.
If data is XShards, each partition is a dictionary of {'x': feature,
'y': label}, where feature(label) is a numpy array or a list of numpy arrays.
:param epochs: Number of epochs to train the model.
:param batch_size: Batch size used for training. Default: 32.
:param feature_cols: Feature column name(s) of data. Only used when data is a Spark
DataFrame. Default: "features".
:param label_cols: Label column name(s) of data. Only used when data is a Spark DataFrame.
Default: "label".
:param caching_sample: Whether to cache the Samples after preprocessing. Default: True.
:param validation_data: Validation data. XShards and Spark DataFrame are supported.
If data is XShards, each partition is a dictionary of {'x': feature,
'y': label}, where feature(label) is a numpy array or a list of numpy arrays.
:param validation_trigger: Orca Trigger to trigger validation computation.
:param checkpoint_trigger: Orca Trigger to set a checkpoint.
:return:
"""
from zoo.orca.learn.trigger import Trigger

assert batch_size > 0, "batch_size should be greater than 0"
@@ -166,6 +189,26 @@ def fit(self, data, epochs, batch_size=32, feature_cols="features", label_cols="
return self

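Continuing the sketch above, a hedged example of fit on an in-memory XShards built from numpy arrays (XShards.partition and EveryEpoch are assumed from the zoo.orca packages; an Orca context must already be initialized):

import numpy as np
from zoo.orca.data import XShards
from zoo.orca.learn.trigger import EveryEpoch

# Toy training set: 100 samples, 10 features, 1-based integer labels for ClassNLLCriterion.
train_shards = XShards.partition({"x": np.random.randn(100, 10).astype(np.float32),
                                  "y": np.random.randint(1, 3, size=(100, 1)).astype(np.float32)})

est.fit(data=train_shards,
        epochs=2,
        batch_size=32,
        validation_data=train_shards,     # illustrative only; use a held-out set in practice
        validation_trigger=EveryEpoch(),
        checkpoint_trigger=EveryEpoch())  # checkpoints go to the model_dir set at construction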
def predict(self, data, batch_size=4, feature_cols="features", sample_preprocessing=None):
"""
Predict on input data.
:param data: prediction input data. It can be XShards or a Spark DataFrame.
If data is XShards, each partition is a dictionary of {'x': feature}, where feature
is a numpy array or a list of numpy arrays.
:param batch_size: Batch size used for inference. Default: 4.
:param feature_cols: Feature column name(s) of data. Only used when data is a Spark
DataFrame. Default: "features".
:param sample_preprocessing: Used when data is a Spark DataFrame. If the user wants to change
the default feature_preprocessing specified in Estimator.from_bigdl, a new
sample_preprocessing method can be passed here.
:return: the predicted result.
If the input data is a Spark DataFrame, the predict result is a DataFrame which includes the
original columns plus a 'prediction' column. The 'prediction' column can be FloatType, VectorUDT
or Array of VectorUDT depending on the model output shape.
If the input data is an XShards, the predict result is an XShards, and each partition of the
XShards is a dictionary of {'prediction': result}, where result is a numpy array or a list of
numpy arrays.
"""
if isinstance(data, DataFrame):
if isinstance(feature_cols, list):
data, _, feature_cols = \
@@ -185,6 +228,18 @@ def predict(self, data, batch_size=4, feature_cols="features", sample_preprocess
data.__class__.__name__)

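A short prediction sketch following on from the fit example (collect() on the returned XShards is assumed to yield the per-partition dictionaries described in the docstring):

# Predict on the same XShards; each result partition is {'prediction': numpy array}.
pred_shards = est.predict(train_shards, batch_size=4)
first_partition = pred_shards.collect()[0]
print(first_partition["prediction"].shape)

# Spark DataFrame variant (assuming df has a 'features' vector column); predictions are
# appended as a new 'prediction' column:
# pred_df = est.predict(df, feature_cols="features")
# pred_df.select("prediction").show(5)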
def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
"""
Evaluate the model.
:param data: validation data. It can be an XShards, where each partition is a dictionary of
{'x': feature, 'y': label}, and feature (label) is a numpy array or a list of numpy arrays.
:param batch_size: Batch size used for validation. Default: 32.
:param feature_cols: (Not supported yet) Feature column name(s) of data. Only used when
data is a Spark DataFrame. Default: None.
:param label_cols: (Not supported yet) Label column name(s) of data. Only used when data
is a Spark DataFrame. Default: None.
:return:
"""
assert data is not None, "validation data shouldn't be None"
assert self.metrics is not None, "metrics shouldn't be None, please specify the metrics" \
" argument when creating this estimator."
@@ -204,9 +259,20 @@ def evaluate(self, data, batch_size=32, feature_cols=None, label_cols=None):
return bigdl_metric_results_to_dict(result)

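An evaluation sketch in the same vein; the result is a dictionary keyed by the metric names set in from_bigdl:

# Evaluate on XShards (a held-out set should be used in practice).
eval_results = est.evaluate(train_shards, batch_size=32)
print(eval_results)  # dict mapping metric name to value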
def get_model(self):
"""
Get the trained BigDL model.
:return: The trained BigDL model.
"""
return self.model

def save(self, model_path):
"""
Save the BigDL model to model_path.
:param model_path: The path to save the trained model.
:return:
"""
try:
model = self.get_model()
model.saveModel(model_path + ".bigdl", model_path + ".bin", True)
@@ -215,6 +281,39 @@ def save(self, model_path):

def load(self, checkpoint, optimizer=None, loss=None, feature_preprocessing=None,
label_preprocessing=None, model_dir=None, is_checkpoint=False):
"""
Load an existing BigDL model or checkpoint.
:param checkpoint: Path to the existing model or checkpoint.
:param optimizer: BigDL optimizer.
:param loss: BigDL criterion.
:param feature_preprocessing: Used when data in `fit` and `predict` is a Spark DataFrame.
The param converts the data in feature column to a Tensor or to a Sample directly.
It expects a List of Int as the size of the converted Tensor, or a Preprocessing[F,
Tensor[T]]
If a List of Int is set as feature_preprocessing, it can only handle the case that
feature column contains the following data types:
Float, Double, Int, Array[Float], Array[Double], Array[Int] and MLlib Vector. The
feature data are converted to Tensors with the specified sizes before
sending to the model. Internally, a SeqToTensor is generated according to the
size, and used as the feature_preprocessing.
Alternatively, user can set feature_preprocessing as Preprocessing[F, Tensor[T]]
that transforms the feature data to a Tensor[T]. Some pre-defined Preprocessing are
provided in package zoo.feature. Multiple Preprocessing can be combined as a
ChainedPreprocessing.
The feature_preprocessing will also be copied to the generated NNModel and applied
to feature column during transform.
:param label_preprocessing: Used when data in `fit` and `predict` is a Spark DataFrame.
Similar to feature_preprocessing, but applies to the label data.
:param model_dir: The path to save model. During the training, if checkpoint_trigger is
defined and triggered, the model will be saved to model_dir.
:param is_checkpoint: Whether the path is a checkpoint or a saved BigDL model.
Default: False.
:return: The loaded estimator object.
"""
if loss is not None:
self.loss = loss
if optimizer is not None:
@@ -243,6 +342,15 @@ def load(self, checkpoint, optimizer=None, loss=None, feature_preprocessing=None
return self

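A save/load round-trip sketch; the load path convention is an assumption made symmetric with the save implementation shown above (which appends .bigdl and .bin suffixes):

# Persist the trained model, then load it into a fresh estimator.
est.save("/tmp/orca_bigdl_estimator/my_model")  # writes my_model.bigdl and my_model.bin

new_est = Estimator.from_bigdl(model=model, loss=ClassNLLCriterion(), metrics=[Accuracy()])
new_est.load("/tmp/orca_bigdl_estimator/my_model", is_checkpoint=False)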
def load_orca_checkpoint(self, path, version, prefix=None):
"""
Load an existing checkpoint.
:param path: Path to the existing checkpoint.
:param version: checkpoint version, which is the suffix of the model.* file,
e.g., for the model.4 file, the version is 4.
:param prefix: optimMethod prefix, for example 'optimMethod-Sequentialf53bddcc'.
:return:
"""
from bigdl.nn.layer import Model, Container
from bigdl.optim.optimizer import OptimMethod
import os
@@ -263,25 +371,57 @@ def load_orca_checkpoint(self, path, version, prefix=None):
self.nn_model = NNModel(self.model, feature_preprocessing=self.feature_preprocessing)

def load_latest_orca_checkpoint(self, path):
"""
Load the latest Orca checkpoint under the specified directory.
:param path: Directory containing Orca checkpoint files.
"""
from zoo.orca.learn.utils import find_latest_checkpoint
path, prefix, version = find_latest_checkpoint(path, model_type="bigdl")
if path is None:
raise ValueError("Cannot find BigDL checkpoint, please check your checkpoint path.")
self.load_orca_checkpoint(path=path, version=version, prefix=prefix)

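A checkpoint-resume sketch; the directory is the model_dir used at construction, and version 4 is only an illustration of the model.* suffix described above:

# Resume from the newest Orca checkpoint written under model_dir during a previous fit.
est.load_latest_orca_checkpoint("/tmp/orca_bigdl_estimator")

# Or pin an explicit checkpoint version (the suffix of the model.* file):
est.load_orca_checkpoint("/tmp/orca_bigdl_estimator", version=4)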
def clear_gradient_clipping(self):
"""
Clear gradient clipping parameters. In this case, gradient clipping will not be applied.
In order to take effect, it needs to be called before fit.
:return:
"""
self.nn_estimator.clearGradientClipping()
self.estimator.clear_gradient_clipping()

def set_constant_gradient_clipping(self, min, max):
"""
Set constant gradient clipping during the training process.
In order to take effect, it needs to be called before fit.
:param min: The minimum value to clip by.
:param max: The maximum value to clip by.
:return:
"""
self.nn_estimator.setConstantGradientClipping(min, max)
self.estimator.set_constant_gradient_clipping(min, max)

def set_l2_norm_gradient_clipping(self, clip_norm):
"""
Clip gradient to a maximum L2-Norm during the training process.
In order to take effect, it needs to be called before fit.
:param clip_norm: Gradient L2-Norm threshold.
:return:
"""
self.nn_estimator.setGradientClippingByL2Norm(clip_norm)
self.estimator.set_l2_norm_gradient_clipping(clip_norm)

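A gradient-clipping sketch; the thresholds are arbitrary and, as the docstrings note, the settings only affect fit calls made afterwards:

# Clamp every gradient element to [-2.0, 2.0] for subsequent training.
est.set_constant_gradient_clipping(min=-2.0, max=2.0)

# Or switch to clipping by global L2 norm instead.
est.clear_gradient_clipping()
est.set_l2_norm_gradient_clipping(clip_norm=5.0)

est.fit(data=train_shards, epochs=1)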
def get_train_summary(self, tag=None):
"""
Get the scalar from the model's train summary.
Returns a list of summary data in the form [iteration_number, scalar_value, timestamp].
:param tag: The string variable representing the scalar wanted.
"""
# Exception handle
if tag != "Loss" and tag != "LearningRate" and tag != "Throughput":
raise TypeError('Only "Loss", "LearningRate", "Throughput"'
@@ -293,6 +433,31 @@ def get_train_summary(self, tag=None):
return self.estimator.get_train_summary(tag=tag)

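A training-summary sketch; it assumes summary collection has been enabled for this estimator (e.g. via TensorBoard logging, which is not part of this diff):

# Each entry is [iteration_number, scalar_value, timestamp]; only 'Loss', 'LearningRate'
# and 'Throughput' are accepted tags.
loss_summary = est.get_train_summary(tag="Loss")
for record in loss_summary[:5]:
    print(record)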
def get_validation_summary(self, tag=None):
"""
Get the scalar from the model's validation summary.
Returns a list of summary data in the form [iteration_number, scalar_value, timestamp].
Note: the metric name and the tag may not be identical. Please use the following table to
find the tag to pass: the left side is the metric used during compile and the right side is
the corresponding tag.
'Accuracy' | 'Top1Accuracy'
'BinaryAccuracy' | 'Top1Accuracy'
'CategoricalAccuracy' | 'Top1Accuracy'
'SparseCategoricalAccuracy' | 'Top1Accuracy'
'AUC' | 'AucScore'
'HitRatio' | 'HitRate@k' (k is Top-k)
'Loss' | 'Loss'
'MAE' | 'MAE'
'NDCG' | 'NDCG'
'TFValidationMethod' | '${name + " " + valMethod.toString()}'
'Top5Accuracy' | 'Top5Accuracy'
'TreeNNAccuracy' | 'TreeNNAccuracy()'
'MeanAveragePrecision' | 'MAP@k' (k is Top-k) (BigDL)
'MeanAveragePrecision' | 'PascalMeanAveragePrecision' (Zoo)
'StatelessMetric' | '${name}'
:param tag: The string variable representing the scalar wanted.
"""
if self.is_nnframe_fit:
assert tag is not None, "You should provide tag which should match the name of " \
"the ValidationMethod set into the optimizer. " \
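A validation-summary sketch using the tag mapping table above: a model compiled with an accuracy metric is queried with the 'Top1Accuracy' tag. As with the train summary, summary collection is assumed to be enabled:

# The Accuracy metric set at construction maps to the 'Top1Accuracy' tag.
val_summary = est.get_validation_summary(tag="Top1Accuracy")
for record in val_summary[:5]:
    print(record)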
