kubeflow · k8s-ci-robot · Jun 3, 2019 · May 28, 2019 · May 28, 2019 · May 29, 2019
diff --git a/Gopkg.lock b/Gopkg.lock
diff --git a/cmd/tf-operator.v1/app/server.go b/cmd/tf-operator.v1/app/server.go
@@ -39,6 +39,8 @@ import (
 	election "k8s.io/client-go/tools/leaderelection"
 	"k8s.io/client-go/tools/leaderelection/resourcelock"
 	"k8s.io/client-go/tools/record"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
 )
 
 const (
@@ -55,6 +57,13 @@ var (
 
 const RecommendedKubeConfigPathEnv = "KUBECONFIG"
 
+var (
+	// isLeader is a Prometheus gauge reporting leader-election state: it is
+	// set to 1 by the leader-election start callback and to 0 when
+	// leadership is lost.
+	isLeader = promauto.NewGauge(prometheus.GaugeOpts{
+		Name: "tf_operator_is_leader",
+		Help: "Is this client the leader of this tf-operator client set?",
+	})
+)
+
 func Run(opt *options.ServerOption) error {
 	// Check if the -version flag was passed and, if so, print the version and exit.
 	if opt.PrintVersion {
@@ -119,6 +128,7 @@ func Run(opt *options.ServerOption) error {
 
 	// Set leader election start function.
 	run := func(<-chan struct{}) {
+		isLeader.Set(1)
 		if err := tc.Run(opt.Threadiness, stopCh); err != nil {
 			log.Errorf("Failed to run the controller: %v", err)
 		}
@@ -157,6 +167,7 @@ func Run(opt *options.ServerOption) error {
 		Callbacks: election.LeaderCallbacks{
 			OnStartedLeading: run,
 			OnStoppedLeading: func() {
+				isLeader.Set(0)
 				log.Fatalf("leader election lost")
 			},
 		},

diff --git a/cmd/tf-operator.v1/main.go b/cmd/tf-operator.v1/main.go
@@ -15,13 +15,17 @@
 package main
 
 import (
+	"os"
 	"flag"
+	"fmt"
+	"net/http"
 
 	"github.com/onrik/logrus/filename"
 	log "github.com/sirupsen/logrus"
 
 	"github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app"
 	"github.com/kubeflow/tf-operator/cmd/tf-operator.v1/app/options"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 )
 
 func init() {
@@ -31,6 +35,15 @@ func init() {
 	log.AddHook(filenameHook)
 }
 
+// startMonitoring serves the Prometheus metrics handler at /metrics from a
+// background goroutine and returns immediately. The listen port is read from
+// the MONITORING_CLIENT_PORT environment variable.
+func startMonitoring() {
+	go func() {
+		monitoringPort := os.Getenv("MONITORING_CLIENT_PORT") //TODO (krishnadurai): remove with static port
+		if monitoringPort == "" {
+			// An empty port makes the listener pick an arbitrary free port,
+			// which scrapers cannot discover; make that visible in the logs.
+			log.Warn("MONITORING_CLIENT_PORT is not set; monitoring endpoint will listen on an automatically chosen port")
+		}
+		log.Infof("Setting up client for monitoring on port: %s", monitoringPort)
+		http.Handle("/metrics", promhttp.Handler())
+		// ListenAndServe only returns on failure (e.g. invalid or already
+		// bound port); report it instead of silently losing the endpoint.
+		if err := http.ListenAndServe(fmt.Sprintf(":%s", monitoringPort), nil); err != nil {
+			log.Errorf("Failed to serve monitoring endpoint: %v", err)
+		}
+	}()
+}
+
 func main() {
 	s := options.NewServerOption()
 	s.AddFlags(flag.CommandLine)
@@ -42,6 +55,8 @@ func main() {
 		log.SetFormatter(&log.JSONFormatter{})
 	}
 
+  startMonitoring()
+
 	if err := app.Run(s); err != nil {
 		log.Fatalf("%v\n", err)
 	}

diff --git a/docs/monitoring/README.md b/docs/monitoring/README.md
@@ -0,0 +1,75 @@
+# Prometheus Monitoring for TF operator
+
+## Install Prometheus in your Kubernetes Cluster
+To install the chart with the release name `my-release`:
+
+```console
+$ helm install --name my-release stable/prometheus-operator
+```
+
+See the [prometheus-operator chart README](https://github.com/helm/charts/blob/master/stable/prometheus-operator/README.md#installing-the-chart) for detailed installation instructions.
+
+*Note*: The [prometheus-operator troubleshooting guide](https://github.com/coreos/prometheus-operator/blob/master/Documentation/troubleshooting.md) can help if you run into problems with your setup.
+
+## Available Metrics
+
+Currently available metrics to monitor are listed below.
+
+### Metrics for Each Component Container for TF operator
+
+Component Containers:
+* tf-operator
+* tf-master
+* tf-ps
+* tf-worker
+
+#### Each Container Reports on its:
+
+Run the following example queries in the Prometheus graph UI to visualize the metrics.
+
+*Note*: These metrics are derived from the [cAdvisor](https://github.com/google/cadvisor) kubelet integration, which reports to Prometheus through our prometheus-operator installation. You can see a complete list of the available metrics on the `/metrics` page of your Prometheus web UI and use them to compose your own queries.
+
+**CPU usage**
+```
+sum (rate (container_cpu_usage_seconds_total{pod_name=~"tfjob-name-.*"}[1m])) by (pod_name)
+```
+
+**GPU Usage**
+
+**Memory Usage**
+```
+sum (container_memory_usage_bytes{pod_name=~"tfjob-name-.*"}) by (pod_name)
+```
+
+**Network Usage**
+```
+sum (rate (container_network_transmit_bytes_total{pod_name=~"tfjob-name-.*"}[1m])) by (pod_name)
+```
+
+**I/O Usage**
+```
+sum (rate (container_fs_write_seconds_total{pod_name=~"tfjob-name-.*"}[1m])) by (pod_name)
+```
+
+**Keep-Alive check**  
+```
+up
+```
+Prometheus maintains the `up` metric automatically for every scrape target; see the documentation [here](https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series) for details.
+
+**Is Leader check**
+```
+tf_operator_is_leader
+```
+
+*Note*: In the example queries above, replace `tfjob-name` with the name of the TFJob you want to monitor.
+
+### Report TFJob metrics:
+
+**Job Creation**
+
+**Job Deletion**
+
+**Jobs Created per Hour**
+
+**Successful Job Completions**
diff --git a/vendor/github.com/beorn7/perks/LICENSE b/vendor/github.com/beorn7/perks/LICENSE