TensorFlow on CloudML

refs: https://cloud.google.com/ml/docs/quickstarts/training

CloudML Setup

refs: https://cloud.google.com/ml/docs/how-tos/getting-set-up

install miniconda: http://conda.pydata.org/miniconda.html

install CloudML SDK

conda create --name cloudml python=2.7
source activate cloudml
pip install -r requirements.txt
pip install --upgrade --ignore-installed setuptools \
  https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl
gcloud components install beta
gcloud beta auth application-default login
pip install --upgrade --force-reinstall \
  https://storage.googleapis.com/cloud-ml/sdk/cloudml.latest.tar.gz

verify your environment

curl https://storage.googleapis.com/cloud-ml/scripts/check_environment.py | python

training local

cd trainable

Clear the output from any previous local run. Train locally.

rm -rf data/
python -m trainer.task

Inspect job

tensorboard --logdir=data/ --port=8080
open http://localhost:8080

CloudML run single worker

cd trainable

setup

JOB_NAME=mnist_1
PROJECT_ID=`gcloud config list project --format "value(core.project)"`
TRAIN_BUCKET=gs://${PROJECT_ID}-ml
TRAIN_PATH=${TRAIN_BUCKET}/${JOB_NAME}
gsutil rm -rf ${TRAIN_PATH}

run

gcloud beta ml jobs submit training ${JOB_NAME} \
  --package-path=trainer \
  --module-name=trainer.task \
  --staging-bucket="${TRAIN_BUCKET}" \
  --region=us-central1 \
  -- \
  --train_dir="${TRAIN_PATH}/train"

wait for the job to finish

gcloud beta ml jobs describe --project ${PROJECT_ID} ${JOB_NAME}

CloudML run distributed multiple workers

cd distributed

training local

rm -rf output/
python -m trainer.task \
  --train_data_paths=gs://cloud-ml-data/mnist/train.tfr.gz \
  --eval_data_paths=gs://cloud-ml-data/mnist/eval.tfr.gz \
  --output_path=output

training CloudML

setup

JOB_NAME=distributed_1
PROJECT_ID=`gcloud config list project --format "value(core.project)"`
TRAIN_BUCKET=gs://${PROJECT_ID}-ml
TRAIN_PATH=${TRAIN_BUCKET}/${JOB_NAME}
gsutil rm -rf ${TRAIN_PATH}

cat << EOF > config.yaml
trainingInput:
  scaleTier: STANDARD_1
EOF

submit jobs

gcloud beta ml jobs submit training ${JOB_NAME} \
  --package-path=trainer \
  --module-name=trainer.task \
  --staging-bucket="${TRAIN_BUCKET}" \
  --region=us-central1 \
  --config=config.yaml \
  -- \
  --train_data_paths="gs://cloud-ml-data/mnist/train.tfr.gz" \
  --eval_data_paths="gs://cloud-ml-data/mnist/eval.tfr.gz" \
  --output_path="${TRAIN_PATH}/output"

Inspect job

gcloud beta ml jobs describe --project ${PROJECT_ID} ${JOB_NAME}

Hyperparameter tuning

cd hptuning

setup

JOB_NAME=mnist_hptuning_1
PROJECT_ID=`gcloud config list project --format "value(core.project)"`
TRAIN_BUCKET=gs://${PROJECT_ID}-ml
TRAIN_PATH=${TRAIN_BUCKET}/${JOB_NAME}
gsutil rm -rf ${TRAIN_PATH}

cat << EOF > config.yaml
trainingInput:
  # Use a cluster with many workers and a few parameter servers.
  scaleTier: STANDARD_1
  # Hyperparameter-tuning specification.
  hyperparameters:
    # Maximize the objective value.
    goal: MAXIMIZE
    # Run at most 10 trials with different hyperparameters.
    maxTrials: 10
    # Run two trials at a time.
    maxParallelTrials: 2
    params:
      # Allow the size of the first hidden layer to vary between 40 and 400.
      # One value in this range will be passed to each trial via the
      # --hidden1 command-line flag.
      - parameterName: hidden1
        type: INTEGER
        minValue: 40
        maxValue: 400
        scaleType: UNIT_LINEAR_SCALE
      # Allow the size of the second hidden layer to vary between 5 and 250.
      # One value in this range will be passed to each trial via the
      # --hidden2 command-line flag.
      - parameterName: hidden2
        type: INTEGER
        minValue: 5
        maxValue: 250
        scaleType: UNIT_LINEAR_SCALE
      # Allow the learning rate to vary between 0.0001 and 0.5.
      # One value in this range will be passed to each trial via the
      # --learning_rate command-line flag.
      - parameterName: learning_rate
        type: DOUBLE
        minValue: 0.0001
        maxValue: 0.5
        scaleType: UNIT_LOG_SCALE
EOF

run

gcloud beta ml jobs submit training ${JOB_NAME} \
  --package-path=trainer \
  --module-name=trainer.task \
  --staging-bucket="${TRAIN_BUCKET}" \
  --region=us-central1 \
  --config=config.yaml \
  -- \
  --train_data_paths="gs://cloud-ml-data/mnist/train.tfr.gz" \
  --eval_data_paths="gs://cloud-ml-data/mnist/eval.tfr.gz" \
  --output_path="${TRAIN_PATH}/output"

Deploy model cloud

cd deployable

Train the updated model locally, then on CloudML

rm -f data/{checkpoint,events,export}*
python -m trainer.task

JOB_NAME=mnist_deployable_1
PROJECT_ID=`gcloud config list project --format "value(core.project)"`
TRAIN_BUCKET=gs://${PROJECT_ID}-ml
TRAIN_PATH=${TRAIN_BUCKET}/${JOB_NAME}
# Clear the output from any previous cloud run.
gsutil rm -rf ${TRAIN_PATH}
gcloud beta ml jobs submit training ${JOB_NAME} \
  --package-path=trainer \
  --module-name=trainer.task \
  --staging-bucket="${TRAIN_BUCKET}" \
  --region=us-central1 \
  -- \
  --train_dir="${TRAIN_PATH}/train" \
  --model_dir="${TRAIN_PATH}/model"

Deploy the model on the cloud

MODEL_NAME=mnist_1
gcloud beta ml models create ${MODEL_NAME}
gcloud beta ml models versions create \
  --origin=${TRAIN_PATH}/model/ \
  --model=${MODEL_NAME} \
  v1
gcloud beta ml models versions set-default --model=${MODEL_NAME} v1

Use the online prediction service (alpha)

gcloud beta ml predict --model=${MODEL_NAME} \
  --instances=data/predict_sample.tensor.json

Name		Name	Last commit message	Last commit date
Latest commit History 4 Commits
deployable		deployable
distributed		distributed
hptuning		hptuning
trainable/trainer		trainable/trainer
.gitignore		.gitignore
LICENSE		LICENSE
README.md		README.md
requirements.txt		requirements.txt

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

TensorFlow on CloudML

CloudML Setup

training local

CloudML run single worker

CloudML run distributed multiple workers

training local

training CloudML

Hyperparameter tuning

Deploy model cloud

About

Releases

Packages

Languages

License

mainyaa/tensorflow_mnist_cloudml

Folders and files

Latest commit

History

Repository files navigation

TensorFlow on CloudML

CloudML Setup

training local

CloudML run single worker

CloudML run distributed multiple workers

training local

training CloudML

Hyperparameter tuning

Deploy model cloud

About

Resources

License

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages