From e3c7dcbb482035a43fce2ab8069f4e3ad1a598f1 Mon Sep 17 00:00:00 2001 From: Kobi Samoray Date: Tue, 31 Dec 2019 17:55:17 +0200 Subject: [PATCH] Antrea Prometheus integration (#236) Integrate with Prometheus monitoring solution. Integration of the Prometheus client into Antrea controller and agent allows the exposure of various metrics to Prometheus server. In addition to Antrea's own set of metrics, Prometheus client will also expose metrics which are defined by various components which are part of the Antrea ecosystem, e.g golang, Prometheus itself etc. --- build/yamls/antrea-eks.yml | 33 ++++- build/yamls/antrea-ipsec.yml | 33 ++++- build/yamls/antrea.yml | 33 ++++- build/yamls/base/agent.yml | 4 + build/yamls/base/conf/antrea-agent.conf | 9 ++ build/yamls/base/conf/antrea-controller.conf | 8 ++ build/yamls/base/controller.yml | 4 + cmd/antrea-agent/agent.go | 8 ++ cmd/antrea-agent/config.go | 9 ++ cmd/antrea-controller/config.go | 9 ++ cmd/antrea-controller/controller.go | 50 +++++++- go.mod | 1 + pkg/agent/metrics/prometheus.go | 119 +++++++++++++++++++ 13 files changed, 306 insertions(+), 14 deletions(-) create mode 100644 pkg/agent/metrics/prometheus.go diff --git a/build/yamls/antrea-eks.yml b/build/yamls/antrea-eks.yml index a43ccb9d4f1..a3e45c65257 100644 --- a/build/yamls/antrea-eks.yml +++ b/build/yamls/antrea-eks.yml @@ -286,6 +286,15 @@ data: # networkPolicyOnly: Antrea enforces NetworkPolicy only, and utilizes CNI chaining and delegates Pod IPAM and connectivity to primary CNI. # trafficEncapMode: networkPolicyOnly + + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. + #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. 
+ #enablePrometheusProcessMetrics: false antrea-cni.conf: | { "cniVersion":"0.3.0", @@ -295,18 +304,30 @@ data: "type": "host-local" } } - antrea-controller.conf: "" + antrea-controller.conf: | + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener. + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. + #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false kind: ConfigMap metadata: annotations: {} labels: app: antrea - name: antrea-config-57hfkfg8kd + name: antrea-config-b4d69452ff namespace: kube-system --- apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/port: "443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea name: antrea @@ -394,7 +415,7 @@ spec: key: node-role.kubernetes.io/master volumes: - configMap: - name: antrea-config-57hfkfg8kd + name: antrea-config-b4d69452ff name: antrea-config --- apiVersion: apiregistration.k8s.io/v1 @@ -444,6 +465,10 @@ spec: component: antrea-agent template: metadata: + annotations: + prometheus.io/port: "10443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea component: antrea-agent @@ -578,7 +603,7 @@ spec: operator: Exists volumes: - configMap: - name: antrea-config-57hfkfg8kd + name: antrea-config-b4d69452ff name: antrea-config - hostPath: path: /etc/cni/net.d diff --git a/build/yamls/antrea-ipsec.yml b/build/yamls/antrea-ipsec.yml index 7c102b89ae1..56ca842f755 100644 --- a/build/yamls/antrea-ipsec.yml +++ b/build/yamls/antrea-ipsec.yml @@ -286,6 +286,15 @@ data: # networkPolicyOnly: Antrea enforces NetworkPolicy only, and utilizes CNI chaining and delegates Pod IPAM and connectivity to primary CNI. # #trafficEncapMode: encap + + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. 
+ #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false antrea-cni.conf: | { "cniVersion":"0.3.0", @@ -295,13 +304,21 @@ data: "type": "host-local" } } - antrea-controller.conf: "" + antrea-controller.conf: | + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener. + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. + #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false kind: ConfigMap metadata: annotations: {} labels: app: antrea - name: antrea-config-f59cfh8thg + name: antrea-config-fggd7g4h2k namespace: kube-system --- apiVersion: v1 @@ -316,6 +333,10 @@ type: Opaque apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/port: "443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea name: antrea @@ -403,7 +424,7 @@ spec: key: node-role.kubernetes.io/master volumes: - configMap: - name: antrea-config-f59cfh8thg + name: antrea-config-fggd7g4h2k name: antrea-config --- apiVersion: apiregistration.k8s.io/v1 @@ -453,6 +474,10 @@ spec: component: antrea-agent template: metadata: + annotations: + prometheus.io/port: "10443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea component: antrea-agent @@ -619,7 +644,7 @@ spec: operator: Exists volumes: - configMap: - name: antrea-config-f59cfh8thg + name: antrea-config-fggd7g4h2k name: antrea-config - hostPath: path: /etc/cni/net.d diff --git a/build/yamls/antrea.yml b/build/yamls/antrea.yml index 35f77f3a345..9d3344b6016 100644 --- a/build/yamls/antrea.yml +++ b/build/yamls/antrea.yml @@ -286,6 +286,15 @@ data: # networkPolicyOnly: Antrea enforces NetworkPolicy only, and utilizes CNI chaining and delegates Pod IPAM and connectivity to primary CNI. # #trafficEncapMode: encap + + # Enable metrics exposure via Prometheus. 
Initializes Prometheus metrics listener + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. + #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false antrea-cni.conf: | { "cniVersion":"0.3.0", @@ -295,18 +304,30 @@ data: "type": "host-local" } } - antrea-controller.conf: "" + antrea-controller.conf: | + # Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener. + #enablePrometheusMetrics: false + + # Enable golang metrics exposure via Prometheus. + #enablePrometheusGoMetrics: false + + # Enable process metrics exposure via Prometheus. + #enablePrometheusProcessMetrics: false kind: ConfigMap metadata: annotations: {} labels: app: antrea - name: antrea-config-b2b5bdkh8t + name: antrea-config-656thg244c namespace: kube-system --- apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/port: "443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea name: antrea @@ -394,7 +415,7 @@ spec: key: node-role.kubernetes.io/master volumes: - configMap: - name: antrea-config-b2b5bdkh8t + name: antrea-config-656thg244c name: antrea-config --- apiVersion: apiregistration.k8s.io/v1 @@ -444,6 +465,10 @@ spec: component: antrea-agent template: metadata: + annotations: + prometheus.io/port: "10443" + prometheus.io/scheme: https + prometheus.io/scrape: "true" labels: app: antrea component: antrea-agent @@ -578,7 +603,7 @@ spec: operator: Exists volumes: - configMap: - name: antrea-config-b2b5bdkh8t + name: antrea-config-656thg244c name: antrea-config - hostPath: path: /etc/cni/net.d diff --git a/build/yamls/base/agent.yml b/build/yamls/base/agent.yml index 9acc4a78b69..d5e35a386a2 100644 --- a/build/yamls/base/agent.yml +++ b/build/yamls/base/agent.yml @@ -13,6 +13,10 @@ spec: type: RollingUpdate template: metadata: + annotations: + prometheus.io/port: "10443" + prometheus.io/scrape: "true" + prometheus.io/scheme: 
"https" labels: component: antrea-agent spec: diff --git a/build/yamls/base/conf/antrea-agent.conf b/build/yamls/base/conf/antrea-agent.conf index 5bab23b3748..063b2669f6d 100644 --- a/build/yamls/base/conf/antrea-agent.conf +++ b/build/yamls/base/conf/antrea-agent.conf @@ -41,3 +41,12 @@ # networkPolicyOnly: Antrea enforces NetworkPolicy only, and utilizes CNI chaining and delegates Pod IPAM and connectivity to primary CNI. # #trafficEncapMode: encap + +# Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener +#enablePrometheusMetrics: false + +# Enable golang metrics exposure via Prometheus. +#enablePrometheusGoMetrics: false + +# Enable process metrics exposure via Prometheus. +#enablePrometheusProcessMetrics: false diff --git a/build/yamls/base/conf/antrea-controller.conf b/build/yamls/base/conf/antrea-controller.conf index e69de29bb2d..3b09b41be92 100644 --- a/build/yamls/base/conf/antrea-controller.conf +++ b/build/yamls/base/conf/antrea-controller.conf @@ -0,0 +1,8 @@ +# Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener. +#enablePrometheusMetrics: false + +# Enable golang metrics exposure via Prometheus. +#enablePrometheusGoMetrics: false + +# Enable process metrics exposure via Prometheus. 
+#enablePrometheusProcessMetrics: false diff --git a/build/yamls/base/controller.yml b/build/yamls/base/controller.yml index a6d6906bd04..0bd605e482a 100644 --- a/build/yamls/base/controller.yml +++ b/build/yamls/base/controller.yml @@ -2,6 +2,10 @@ apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/port: "443" + prometheus.io/scrape: "true" + prometheus.io/scheme: "https" name: antrea spec: ports: diff --git a/cmd/antrea-agent/agent.go b/cmd/antrea-agent/agent.go index 030cbe0da40..652247466a1 100644 --- a/cmd/antrea-agent/agent.go +++ b/cmd/antrea-agent/agent.go @@ -30,6 +30,7 @@ import ( "github.com/vmware-tanzu/antrea/pkg/agent/controller/networkpolicy" "github.com/vmware-tanzu/antrea/pkg/agent/controller/noderoute" "github.com/vmware-tanzu/antrea/pkg/agent/interfacestore" + "github.com/vmware-tanzu/antrea/pkg/agent/metrics" "github.com/vmware-tanzu/antrea/pkg/agent/openflow" "github.com/vmware-tanzu/antrea/pkg/agent/route" "github.com/vmware-tanzu/antrea/pkg/apis/networking/v1beta1" @@ -151,6 +152,13 @@ func run(o *Options) error { go networkPolicyController.Run(stopCh) + if o.config.EnablePrometheusMetrics { + go metrics.InitializePrometheusMetrics( + o.config.EnablePrometheusGoMetrics, + o.config.EnablePrometheusProcessMetrics, + o.config.OVSBridge, ifaceStore, ofClient) + } + agentMonitor := monitor.NewAgentMonitor( crdClient, o.config.OVSBridge, diff --git a/cmd/antrea-agent/config.go b/cmd/antrea-agent/config.go index e14b3157ddf..eb7890c5067 100644 --- a/cmd/antrea-agent/config.go +++ b/cmd/antrea-agent/config.go @@ -72,4 +72,13 @@ type AgentConfig struct { // Hybrid: noEncap if worker Nodes on same subnet, otherwise encap. // NetworkPolicyOnly: Antrea enforces NetworkPolicy only, and utilizes CNI chaining and delegates Pod IPAM and connectivity to primary CNI. TrafficEncapMode string `yaml:"trafficEncapMode,omitempty"` + // Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener + // Defaults to false. 
+ EnablePrometheusMetrics bool `yaml:"enablePrometheusMetrics,omitempty"` + // Enable golang metrics exposure via Prometheus + // Defaults to false. + EnablePrometheusGoMetrics bool `yaml:"enablePrometheusGoMetrics,omitempty"` + // Enable process metrics exposure via Prometheus + // Defaults to false. + EnablePrometheusProcessMetrics bool `yaml:"enablePrometheusProcessMetrics,omitempty"` } diff --git a/cmd/antrea-controller/config.go b/cmd/antrea-controller/config.go index aa4b57b5658..9dbe16344c9 100644 --- a/cmd/antrea-controller/config.go +++ b/cmd/antrea-controller/config.go @@ -22,4 +22,13 @@ type ControllerConfig struct { // clientConnection specifies the kubeconfig file and client connection settings for the agent // to communicate with the apiserver. ClientConnection componentbaseconfig.ClientConnectionConfiguration `yaml:"clientConnection"` + // Enable metrics exposure via Prometheus. Initializes Prometheus metrics listener + // Defaults to false. + EnablePrometheusMetrics bool `yaml:"enablePrometheusMetrics,omitempty"` + // Enable golang metrics exposure via Prometheus + // Defaults to false. + EnablePrometheusGoMetrics bool `yaml:"enablePrometheusGoMetrics,omitempty"` + // Enable process metrics exposure via Prometheus + // Defaults to false. 
+	EnablePrometheusProcessMetrics bool `yaml:"enablePrometheusProcessMetrics,omitempty"`
 }
diff --git a/cmd/antrea-controller/controller.go b/cmd/antrea-controller/controller.go
index ae9c90bfd06..464671883ac 100644
--- a/cmd/antrea-controller/controller.go
+++ b/cmd/antrea-controller/controller.go
@@ -17,8 +17,12 @@ package main
 import (
 	"fmt"
 	"net"
+	"net/http"
+	"os"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	genericopenapi "k8s.io/apiserver/pkg/endpoints/openapi"
 	genericapiserver "k8s.io/apiserver/pkg/server"
 	genericoptions "k8s.io/apiserver/pkg/server/options"
@@ -74,7 +78,8 @@ func run(o *Options) error {
 		addressGroupStore,
 		appliedToGroupStore,
 		networkPolicyStore,
-		controllerMonitor)
+		controllerMonitor,
+		o.config.EnablePrometheusMetrics)
 	if err != nil {
 		return fmt.Errorf("error creating API server config: %v", err)
 	}
@@ -96,16 +101,61 @@ func run(o *Options) error {
 
 	go apiServer.GenericAPIServer.PrepareRun().Run(stopCh)
 
+	if o.config.EnablePrometheusMetrics {
+		go initializePrometheusMetrics(
+			o.config.EnablePrometheusGoMetrics,
+			o.config.EnablePrometheusProcessMetrics)
+	}
+
 	<-stopCh
 	klog.Info("Stopping Antrea controller")
 	return nil
 }
 
+// initializePrometheusMetrics registers the controller host gauge with the
+// default Prometheus registry and installs the promhttp handler for "/metrics"
+// on http.DefaultServeMux. The Go runtime collector is unregistered unless
+// enablePrometheusGoMetrics is true, and the process collector is unregistered
+// unless enablePrometheusProcessMetrics is true.
+func initializePrometheusMetrics(
+	enablePrometheusGoMetrics bool,
+	enablePrometheusProcessMetrics bool) {
+	hostname, err := os.Hostname()
+	if err != nil {
+		// Best effort: continue and export the host gauge with an empty label.
+		klog.Errorf("Failed to retrieve controller host name, %v", err)
+	}
+
+	klog.Info("Initializing prometheus")
+	gaugeHost := prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "antrea_controller_host",
+		Help: "Antrea controller hostname (as a label), typically used in grouping/aggregating stats; " +
+			"the label defaults to the hostname of the host but can be overridden by configuration. " +
+			"The value of the gauge is always set to 1.",
+		ConstLabels: prometheus.Labels{"host": hostname},
+	})
+	gaugeHost.Set(1)
+	prometheus.MustRegister(gaugeHost)
+	http.Handle("/metrics", promhttp.Handler())
+
+	// The default registry comes with the Go runtime and process collectors
+	// already registered, so disabling them means removing them here.
+	if !enablePrometheusGoMetrics {
+		klog.Info("Golang metrics are disabled")
+		prometheus.Unregister(prometheus.NewGoCollector())
+	}
+	if !enablePrometheusProcessMetrics {
+		klog.Info("Process metrics are disabled")
+		prometheus.Unregister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}))
+	}
+}
+
 func createAPIServerConfig(kubeconfig string,
 	addressGroupStore storage.Interface,
 	appliedToGroupStore storage.Interface,
 	networkPolicyStore storage.Interface,
-	controllerQuerier monitor.ControllerQuerier) (*apiserver.Config, error) {
+	controllerQuerier monitor.ControllerQuerier,
+	enablePrometheusMetrics bool) (*apiserver.Config, error) {
 	// TODO:
 	// 1. Support user-provided certificate.
 	// 2. Support configurable https port.
@@ -113,6 +163,9 @@ func createAPIServerConfig(kubeconfig string,
 	authentication := genericoptions.NewDelegatingAuthenticationOptions()
 	authorization := genericoptions.NewDelegatingAuthorizationOptions()
 
+	if enablePrometheusMetrics {
+		authorization.WithAlwaysAllowPaths("/metrics")
+	}
 	// Set the PairName but leave certificate directory blank to generate in-memory by default
 	secureServing.ServerCert.CertDirectory = ""
 	secureServing.ServerCert.PairName = "antrea-apiserver"
diff --git a/go.mod b/go.mod
index 6e8187d64c5..06bb7bb1d11 100644
--- a/go.mod
+++ b/go.mod
@@ -25,6 +25,7 @@ require (
 	github.com/imdario/mergo v0.3.7 // indirect
 	github.com/j-keck/arping v1.0.0
 	github.com/kevinburke/ssh_config v0.0.0-20190725054713-01f96b0aa0cd
+	github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829
 	github.com/satori/go.uuid v1.2.0
 	github.com/spf13/cobra v0.0.5
 	github.com/spf13/pflag v1.0.3
diff --git a/pkg/agent/metrics/prometheus.go b/pkg/agent/metrics/prometheus.go
new file mode 100644
index
00000000000..bb0a248d0d4
--- /dev/null
+++ b/pkg/agent/metrics/prometheus.go
@@ -0,0 +1,146 @@
+// Copyright 2020 Antrea Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package metrics exposes Antrea agent statistics as Prometheus metrics.
+package metrics
+
+import (
+	"net/http"
+	"os"
+	"strconv"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	"k8s.io/klog"
+
+	"github.com/vmware-tanzu/antrea/pkg/agent/interfacestore"
+	"github.com/vmware-tanzu/antrea/pkg/agent/openflow"
+)
+
+// OVSStatManager is a prometheus.Collector that reports the number of flows
+// installed in each OVS flow table, as returned by the OpenFlow client.
+type OVSStatManager struct {
+	ofClient     openflow.Client
+	OVSBridge    string
+	OVSTableDesc *prometheus.Desc
+}
+
+// OVSGetStatistics returns the flow count of every table reported by the
+// OpenFlow client, keyed by the table ID formatted as a decimal string.
+func (c *OVSStatManager) OVSGetStatistics() map[string]float64 {
+	flowTableStatus := c.ofClient.GetFlowTableStatus()
+	ovsFlowsByTable := make(map[string]float64, len(flowTableStatus))
+	for _, tableStatus := range flowTableStatus {
+		ovsFlowsByTable[strconv.Itoa(int(tableStatus.ID))] = float64(tableStatus.FlowCount)
+	}
+	return ovsFlowsByTable
+}
+
+// Describe implements prometheus.Collector by sending the single descriptor
+// used for the per-table flow count metric.
+func (c *OVSStatManager) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.OVSTableDesc
+}
+
+// Collect implements prometheus.Collector. It emits one gauge sample per flow
+// table, with the table ID as the value of the "table_id" label.
+func (c *OVSStatManager) Collect(ch chan<- prometheus.Metric) {
+	for tableID, tableFlowCount := range c.OVSGetStatistics() {
+		ch <- prometheus.MustNewConstMetric(
+			c.OVSTableDesc,
+			prometheus.GaugeValue,
+			tableFlowCount,
+			tableID,
+		)
+	}
+}
+
+// NewOVSStatManager returns an OVSStatManager for ovsBridge that reads flow
+// table status from ofClient. The bridge name is attached to every sample as
+// the constant "bridge" label.
+func NewOVSStatManager(ovsBridge string, ofClient openflow.Client) *OVSStatManager {
+	return &OVSStatManager{
+		ofClient:  ofClient,
+		OVSBridge: ovsBridge,
+		OVSTableDesc: prometheus.NewDesc(
+			"antrea_agent_ovs_flow_table",
+			"OVS flow table flow count.",
+			[]string{"table_id"},
+			prometheus.Labels{"bridge": ovsBridge},
+		),
+	}
+}
+
+// registerCollector adds collector to the default Prometheus registry. A
+// failure is logged instead of panicking, so a metrics problem cannot take
+// down the agent.
+func registerCollector(name string, collector prometheus.Collector) {
+	if err := prometheus.Register(collector); err != nil {
+		klog.Errorf("Failed to register %s with Prometheus: %v", name, err)
+	}
+}
+
+// InitializePrometheusMetrics registers the agent's metrics with the default
+// Prometheus registry and installs the promhttp handler for "/metrics" on
+// http.DefaultServeMux.
+//
+// The Go runtime collector is unregistered unless enablePrometheusGoMetrics is
+// true, and the process collector is unregistered unless
+// enablePrometheusProcessMetrics is true. ovsBridge and ofClient feed the
+// per-table OVS flow count metric; ifaceStore feeds the local Pod count gauge.
+func InitializePrometheusMetrics(
+	enablePrometheusGoMetrics bool,
+	enablePrometheusProcessMetrics bool,
+	ovsBridge string,
+	ifaceStore interfacestore.InterfaceStore,
+	ofClient openflow.Client) {
+	hostname, err := os.Hostname()
+	if err != nil {
+		// Best effort: continue and export the host gauge with an empty label.
+		klog.Errorf("Failed to retrieve agent node name, %v", err)
+	}
+
+	klog.Info("Binding antrea_agent_local_pod_count")
+	registerCollector("antrea_agent_local_pod_count", prometheus.NewGaugeFunc(
+		prometheus.GaugeOpts{
+			Name: "antrea_agent_local_pod_count",
+			Help: "Number of pods on local node.",
+		},
+		func() float64 { return float64(ifaceStore.GetContainerInterfaceNum()) },
+	))
+
+	gaugeHost := prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "antrea_agent_host",
+		Help: "Antrea agent hostname (as a label), typically used in grouping/aggregating stats; " +
+			"the label defaults to the hostname of the host but can be overridden by configuration. " +
+			"The value of the gauge is always set to 1.",
+		ConstLabels: prometheus.Labels{"host": hostname},
+	})
+	gaugeHost.Set(1)
+	registerCollector("antrea_agent_host", gaugeHost)
+	http.Handle("/metrics", promhttp.Handler())
+
+	registerCollector("antrea_agent_ovs_flow_table", NewOVSStatManager(ovsBridge, ofClient))
+
+	// The default registry comes with the Go runtime and process collectors
+	// already registered, so disabling them means removing them here.
+	if !enablePrometheusGoMetrics {
+		klog.Info("Golang metrics are disabled")
+		prometheus.Unregister(prometheus.NewGoCollector())
+	}
+	if !enablePrometheusProcessMetrics {
+		klog.Info("Process metrics are disabled")
+		prometheus.Unregister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}))
+	}
+}