diff --git a/apis/metrics/metrics.go b/apis/metrics/metrics.go index dbd969294..cb0f28bfe 100644 --- a/apis/metrics/metrics.go +++ b/apis/metrics/metrics.go @@ -1,9 +1,9 @@ package metrics import ( - "fmt" "sync" - "time" + + util_metrics "github.com/alibaba/pouch/pkg/utils/metrics" "github.com/prometheus/client_golang/prometheus" ) @@ -14,34 +14,33 @@ func init() { } const ( - namespace = "engine" - subsystem = "daemon" + subsystemPouch = "daemon" ) var ( // ImagePullSummary records the summary of pulling image latency. - ImagePullSummary = newLabelSummary("image_pull_latency_microseconds", "Latency in microseconds to pull a image.", "image") + ImagePullSummary = util_metrics.NewLabelSummary(subsystemPouch, "image_pull_latency_microseconds", "Latency in microseconds to pull a image.", "image") // ContainerActionsCounter records the number of container operations. - ContainerActionsCounter = newLabelCounter("container_actions_counter", "The number of container operations", "action") + ContainerActionsCounter = util_metrics.NewLabelCounter(subsystemPouch, "container_actions_counter", "The number of container operations", "action") // ContainerSuccessActionsCounter records the number of container success operations. - ContainerSuccessActionsCounter = newLabelCounter("container_success_actions_counter", "The number of container success operations", "action") + ContainerSuccessActionsCounter = util_metrics.NewLabelCounter(subsystemPouch, "container_success_actions_counter", "The number of container success operations", "action") // ImageActionsCounter records the number of image operations. - ImageActionsCounter = newLabelCounter("image_actions_counter", "The number of image operations", "action") + ImageActionsCounter = util_metrics.NewLabelCounter(subsystemPouch, "image_actions_counter", "The number of image operations", "action") // ImageSuccessActionsCounter the number of image success operations. - ImageSuccessActionsCounter = newLabelCounter("image_success_actions_counter", "The number of image success operations", "action") + ImageSuccessActionsCounter = util_metrics.NewLabelCounter(subsystemPouch, "image_success_actions_counter", "The number of image success operations", "action") // ContainerActionsTimer records the time cost of each container action. - ContainerActionsTimer = newLabelTimer("container_actions", "The number of seconds it takes to process each container action", "action") + ContainerActionsTimer = util_metrics.NewLabelTimer(subsystemPouch, "container_actions", "The number of seconds it takes to process each container action", "action") // ImageActionsTimer records the time cost of each image action. - ImageActionsTimer = newLabelTimer("image_actions", "The number of seconds it takes to process each image action", "action") + ImageActionsTimer = util_metrics.NewLabelTimer(subsystemPouch, "image_actions", "The number of seconds it takes to process each image action", "action") // EngineVersion records the version and commit information of the engine process. - EngineVersion = newLabelGauge("engine", "The version and commit information of the engine process", "commit") + EngineVersion = util_metrics.NewLabelGauge(subsystemPouch, "engine", "The version and commit information of the engine process", "commit") ) var registerMetrics sync.Once @@ -60,52 +59,3 @@ func Register() { prometheus.MustRegister(ImageActionsTimer) }) } - -// SinceInMicroseconds gets the time since the specified start in microseconds. 
-func SinceInMicroseconds(start time.Time) float64 { - return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds()) -} - -func newLabelSummary(name, help string, labels ...string) *prometheus.SummaryVec { - return prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: name, - Help: help, - ConstLabels: nil, - }, labels) -} - -func newLabelCounter(name, help string, labels ...string) *prometheus.CounterVec { - return prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: fmt.Sprintf("%s_%s", name, total), - Help: help, - ConstLabels: nil, - }, labels) -} - -func newLabelGauge(name, help string, labels ...string) *prometheus.GaugeVec { - return prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: fmt.Sprintf("%s_%s", name, Unit("info")), - Help: help, - ConstLabels: nil, - }, labels) -} - -func newLabelTimer(name, help string, labels ...string) *prometheus.HistogramVec { - return prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: fmt.Sprintf("%s_%s", name, seconds), - Help: help, - ConstLabels: nil, - }, labels) -} diff --git a/apis/server/image_bridge.go b/apis/server/image_bridge.go index 17f10e799..2e4146e33 100644 --- a/apis/server/image_bridge.go +++ b/apis/server/image_bridge.go @@ -14,6 +14,7 @@ import ( "github.com/alibaba/pouch/apis/types" "github.com/alibaba/pouch/daemon/mgr" "github.com/alibaba/pouch/pkg/httputils" + util_metrics "github.com/alibaba/pouch/pkg/utils/metrics" "github.com/gorilla/mux" "github.com/opencontainers/go-digest" @@ -39,7 +40,7 @@ func (s *Server) pullImage(ctx context.Context, rw http.ResponseWriter, req *htt // record the time spent during image pull procedure. defer func(start time.Time) { - metrics.ImagePullSummary.WithLabelValues(image).Observe(metrics.SinceInMicroseconds(start)) + metrics.ImagePullSummary.WithLabelValues(image).Observe(util_metrics.SinceInMicroseconds(start)) metrics.ImageActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) }(time.Now()) diff --git a/cri/metrics/metrics.go b/cri/metrics/metrics.go new file mode 100644 index 000000000..36c8ee6bc --- /dev/null +++ b/cri/metrics/metrics.go @@ -0,0 +1,96 @@ +package metrics + +import ( + "sync" + + util_metrics "github.com/alibaba/pouch/pkg/utils/metrics" + + "github.com/grpc-ecosystem/go-grpc-prometheus" + "github.com/prometheus/client_golang/prometheus" +) + +func init() { + // Register prometheus metrics. + Register() +} + +const ( + subsystemCRI = "cri" +) + +var ( + // GRPCMetrics create some standard server metrics. + GRPCMetrics = grpc_prometheus.NewServerMetrics() + + // PodActionsCounter records the number of pod operations. + PodActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "pod_actions_counter", "The number of pod operations", "action") + + // PodSuccessActionsCounter records the number of pod success operations. + PodSuccessActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "pod_success_actions_counter", "The number of pod success operations", "action") + + // PodActionsTimer records the time cost of each pod action. + PodActionsTimer = util_metrics.NewLabelTimer(subsystemCRI, "pod_actions", "The number of seconds it takes to process each pod action", "action") + + // ContainerActionsCounter records the number of container operations. 
+ ContainerActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "container_actions_counter", "The number of container operations", "action") + + // ContainerSuccessActionsCounter records the number of container success operations. + ContainerSuccessActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "container_success_actions_counter", "The number of container success operations", "action") + + // ContainerActionsTimer records the time cost of each container action. + ContainerActionsTimer = util_metrics.NewLabelTimer(subsystemCRI, "container_actions", "The number of seconds it takes to process each container action", "action") + + // ImagePullSummary records the summary of pulling image latency. + ImagePullSummary = util_metrics.NewLabelSummary(subsystemCRI, "image_pull_latency_microseconds", "Latency in microseconds to pull a image.", "image") + + // ImageActionsCounter records the number of image operations. + ImageActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "image_actions_counter", "The number of image operations", "action") + + // ImageSuccessActionsCounter the number of image success operations. + ImageSuccessActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "image_success_actions_counter", "The number of image success operations", "action") + + // ImageActionsTimer records the time cost of each image action. + ImageActionsTimer = util_metrics.NewLabelTimer(subsystemCRI, "image_actions", "The number of seconds it takes to process each image action", "action") + + // VolumeActionsCounter records the number of volume operations. + VolumeActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "volume_actions_counter", "The number of volume operations", "action") + + // VolumeSuccessActionsCounter the number of volume success operations. + VolumeSuccessActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "volume_success_actions_counter", "The number of volume success operations", "action") + + // VolumeActionsTimer records the time cost of each volume action. + VolumeActionsTimer = util_metrics.NewLabelTimer(subsystemCRI, "volume_actions", "The number of seconds it takes to process each volume action", "action") + + // RuntimeActionsCounter records the number of runtime operations. + RuntimeActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "runtime_actions_counter", "The number of runtime operations", "action") + + // RuntimeSuccessActionsCounter the number of runtime success operations. + RuntimeSuccessActionsCounter = util_metrics.NewLabelCounter(subsystemCRI, "runtime_success_actions_counter", "The number of runtime success operations", "action") + + // RuntimeActionsTimer records the time cost of each runtime action. + RuntimeActionsTimer = util_metrics.NewLabelTimer(subsystemCRI, "runtime_actions", "The number of seconds it takes to process each runtime action", "action") +) + +var registerMetrics sync.Once + +// Register all metrics. 
+func Register() { + registerMetrics.Do(func() { + prometheus.MustRegister(PodActionsCounter) + prometheus.MustRegister(PodSuccessActionsCounter) + prometheus.MustRegister(PodActionsTimer) + prometheus.MustRegister(ContainerActionsCounter) + prometheus.MustRegister(ContainerSuccessActionsCounter) + prometheus.MustRegister(ContainerActionsTimer) + prometheus.MustRegister(ImagePullSummary) + prometheus.MustRegister(ImageActionsCounter) + prometheus.MustRegister(ImageSuccessActionsCounter) + prometheus.MustRegister(ImageActionsTimer) + prometheus.MustRegister(VolumeActionsCounter) + prometheus.MustRegister(VolumeSuccessActionsCounter) + prometheus.MustRegister(VolumeActionsTimer) + prometheus.MustRegister(RuntimeActionsCounter) + prometheus.MustRegister(RuntimeSuccessActionsCounter) + prometheus.MustRegister(RuntimeActionsTimer) + }) +} diff --git a/cri/v1alpha2/cri.go b/cri/v1alpha2/cri.go index 7c89167f5..2de59a04b 100644 --- a/cri/v1alpha2/cri.go +++ b/cri/v1alpha2/cri.go @@ -17,6 +17,7 @@ import ( apitypes "github.com/alibaba/pouch/apis/types" anno "github.com/alibaba/pouch/cri/annotations" runtime "github.com/alibaba/pouch/cri/apis/v1alpha2" + "github.com/alibaba/pouch/cri/metrics" cni "github.com/alibaba/pouch/cri/ocicni" "github.com/alibaba/pouch/cri/stream" criutils "github.com/alibaba/pouch/cri/utils" @@ -26,6 +27,7 @@ import ( "github.com/alibaba/pouch/pkg/meta" "github.com/alibaba/pouch/pkg/reference" "github.com/alibaba/pouch/pkg/utils" + util_metrics "github.com/alibaba/pouch/pkg/utils/metrics" "github.com/alibaba/pouch/version" // NOTE: "golang.org/x/net/context" is compatible with standard "context" in golang1.7+. @@ -214,6 +216,12 @@ func (c *CriManager) Version(ctx context.Context, r *runtime.VersionRequest) (*r // RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure // the sandbox is in ready state. func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) { + label := "run" + metrics.PodActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.PodActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + config := r.GetConfig() // Step 1: Prepare image for the sandbox. @@ -317,6 +325,8 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, err } + metrics.PodSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil } @@ -324,6 +334,12 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox // and we should reconfigure it with network plugin which will make sure it reacquire its original network configuration, // like IP address. func (c *CriManager) StartPodSandbox(ctx context.Context, r *runtime.StartPodSandboxRequest) (*runtime.StartPodSandboxResponse, error) { + label := "start" + metrics.PodActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.PodActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + podSandboxID := r.GetPodSandboxId() // start PodSandbox. @@ -382,12 +398,20 @@ func (c *CriManager) StartPodSandbox(ctx context.Context, r *runtime.StartPodSan return nil, err } + metrics.PodSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.StartPodSandboxResponse{}, nil } // StopPodSandbox stops the sandbox. If there are any running containers in the // sandbox, they should be forcibly terminated. 
func (c *CriManager) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandboxRequest) (*runtime.StopPodSandboxResponse, error) { + label := "stop" + metrics.PodActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.PodActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + podSandboxID := r.GetPodSandboxId() res, err := c.SandboxStore.Get(podSandboxID) if err != nil { @@ -455,12 +479,20 @@ func (c *CriManager) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb return nil, fmt.Errorf("failed to stop sandbox %q: %v", podSandboxID, err) } + metrics.PodSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.StopPodSandboxResponse{}, nil } // RemovePodSandbox removes the sandbox. If there are running containers in the // sandbox, they should be forcibly removed. func (c *CriManager) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodSandboxRequest) (*runtime.RemovePodSandboxResponse, error) { + label := "remove" + metrics.PodActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.PodActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + podSandboxID := r.GetPodSandboxId() opts := &mgr.ContainerListOption{All: true} @@ -501,11 +533,19 @@ func (c *CriManager) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodS return nil, fmt.Errorf("failed to remove meta %q: %v", sandboxRootDir, err) } + metrics.PodSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.RemovePodSandboxResponse{}, nil } // PodSandboxStatus returns the status of the PodSandbox. func (c *CriManager) PodSandboxStatus(ctx context.Context, r *runtime.PodSandboxStatusRequest) (*runtime.PodSandboxStatusResponse, error) { + label := "status" + metrics.PodActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.PodActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + podSandboxID := r.GetPodSandboxId() res, err := c.SandboxStore.Get(podSandboxID) @@ -569,11 +609,19 @@ func (c *CriManager) PodSandboxStatus(ctx context.Context, r *runtime.PodSandbox }, } + metrics.PodSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.PodSandboxStatusResponse{Status: status}, nil } // ListPodSandbox returns a list of Sandbox. func (c *CriManager) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandboxRequest) (*runtime.ListPodSandboxResponse, error) { + label := "list" + metrics.PodActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.PodActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + opts := &mgr.ContainerListOption{All: true} filter := func(c *mgr.Container) bool { return c.Config.Labels[containerTypeLabelKey] == containerTypeLabelSandbox @@ -603,11 +651,19 @@ func (c *CriManager) ListPodSandbox(ctx context.Context, r *runtime.ListPodSandb result := filterCRISandboxes(sandboxes, r.GetFilter()) + metrics.PodSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.ListPodSandboxResponse{Items: result}, nil } // CreateContainer creates a new container in the given PodSandbox. 
func (c *CriManager) CreateContainer(ctx context.Context, r *runtime.CreateContainerRequest) (*runtime.CreateContainerResponse, error) { + label := "create" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + config := r.GetConfig() sandboxConfig := r.GetSandboxConfig() podSandboxID := r.GetPodSandboxId() @@ -701,11 +757,20 @@ func (c *CriManager) CreateContainer(ctx context.Context, r *runtime.CreateConta return nil, err } } + + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.CreateContainerResponse{ContainerId: containerID}, nil } // StartContainer starts the container. func (c *CriManager) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (*runtime.StartContainerResponse, error) { + label := "start" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + containerID := r.GetContainerId() err := c.ContainerMgr.Start(ctx, containerID, &apitypes.ContainerStartOptions{}) @@ -713,11 +778,19 @@ func (c *CriManager) StartContainer(ctx context.Context, r *runtime.StartContain return nil, fmt.Errorf("failed to start container %q: %v", containerID, err) } + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.StartContainerResponse{}, nil } // StopContainer stops a running container with a grace period (i.e., timeout). func (c *CriManager) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) { + label := "stop" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + containerID := r.GetContainerId() err := c.ContainerMgr.Stop(ctx, containerID, r.GetTimeout()) @@ -725,11 +798,19 @@ func (c *CriManager) StopContainer(ctx context.Context, r *runtime.StopContainer return nil, fmt.Errorf("failed to stop container %q: %v", containerID, err) } + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.StopContainerResponse{}, nil } // RemoveContainer removes the container. func (c *CriManager) RemoveContainer(ctx context.Context, r *runtime.RemoveContainerRequest) (*runtime.RemoveContainerResponse, error) { + label := "remove" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + containerID := r.GetContainerId() err := c.ContainerMgr.Remove(ctx, containerID, &apitypes.ContainerRemoveOptions{Volumes: true, Force: true}) @@ -737,11 +818,19 @@ func (c *CriManager) RemoveContainer(ctx context.Context, r *runtime.RemoveConta return nil, fmt.Errorf("failed to remove container %q: %v", containerID, err) } + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.RemoveContainerResponse{}, nil } // ListContainers lists all containers matching the filter. 
func (c *CriManager) ListContainers(ctx context.Context, r *runtime.ListContainersRequest) (*runtime.ListContainersResponse, error) { + label := "list" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + opts := &mgr.ContainerListOption{All: true} filter := func(c *mgr.Container) bool { return c.Config.Labels[containerTypeLabelKey] == containerTypeLabelContainer @@ -766,11 +855,19 @@ func (c *CriManager) ListContainers(ctx context.Context, r *runtime.ListContaine result := filterCRIContainers(containers, r.GetFilter()) + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.ListContainersResponse{Containers: result}, nil } // ContainerStatus inspects the container and returns the status. func (c *CriManager) ContainerStatus(ctx context.Context, r *runtime.ContainerStatusRequest) (*runtime.ContainerStatusResponse, error) { + label := "status" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + id := r.GetContainerId() container, err := c.ContainerMgr.Get(ctx, id) if err != nil { @@ -899,12 +996,20 @@ func (c *CriManager) ContainerStatus(ctx context.Context, r *runtime.ContainerSt Envs: parseEnvsFromPouch(container.Config.Env), } + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.ContainerStatusResponse{Status: status}, nil } // ContainerStats returns stats of the container. If the container does not // exist, the call returns an error. func (c *CriManager) ContainerStats(ctx context.Context, r *runtime.ContainerStatsRequest) (*runtime.ContainerStatsResponse, error) { + label := "stats" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + containerID := r.GetContainerId() container, err := c.ContainerMgr.Get(ctx, containerID) @@ -917,11 +1022,19 @@ func (c *CriManager) ContainerStats(ctx context.Context, r *runtime.ContainerSta return nil, fmt.Errorf("failed to decode container metrics: %v", err) } + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.ContainerStatsResponse{Stats: cs}, nil } // ListContainerStats returns stats of all running containers. func (c *CriManager) ListContainerStats(ctx context.Context, r *runtime.ListContainerStatsRequest) (*runtime.ListContainerStatsResponse, error) { + label := "stats_list" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + opts := &mgr.ContainerListOption{All: true} filter := func(c *mgr.Container) bool { if c.Config.Labels[containerTypeLabelKey] != containerTypeLabelContainer { @@ -958,11 +1071,19 @@ func (c *CriManager) ListContainerStats(ctx context.Context, r *runtime.ListCont result.Stats = append(result.Stats, cs) } + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return result, nil } // UpdateContainerResources updates ContainerConfig of the container. 
func (c *CriManager) UpdateContainerResources(ctx context.Context, r *runtime.UpdateContainerResourcesRequest) (*runtime.UpdateContainerResourcesResponse, error) { + label := "update" + metrics.ContainerActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ContainerActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + containerID := r.GetContainerId() container, err := c.ContainerMgr.Get(ctx, containerID) if err != nil { @@ -984,6 +1105,8 @@ func (c *CriManager) UpdateContainerResources(ctx context.Context, r *runtime.Up return nil, fmt.Errorf("failed to update resource for container %q: %v", containerID, err) } + metrics.ContainerSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.UpdateContainerResourcesResponse{}, nil } @@ -1089,6 +1212,13 @@ func (c *CriManager) UpdateRuntimeConfig(ctx context.Context, r *runtime.UpdateR // Status returns the status of the runtime. func (c *CriManager) Status(ctx context.Context, r *runtime.StatusRequest) (*runtime.StatusResponse, error) { + label := "status" + metrics.RuntimeActionsCounter.WithLabelValues(label).Inc() + // record the time spent during image pull procedure. + defer func(start time.Time) { + metrics.RuntimeActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + runtimeCondition := &runtime.RuntimeCondition{ Type: runtime.RuntimeReady, Status: true, @@ -1123,11 +1253,20 @@ func (c *CriManager) Status(ctx context.Context, r *runtime.StatusRequest) (*run // TODO return more info } + metrics.RuntimeSuccessActionsCounter.WithLabelValues(label).Inc() + return resp, nil } // ListImages lists existing images. func (c *CriManager) ListImages(ctx context.Context, r *runtime.ListImagesRequest) (*runtime.ListImagesResponse, error) { + label := "list" + metrics.ImageActionsCounter.WithLabelValues(label).Inc() + // record the time spent during image pull procedure. + defer func(start time.Time) { + metrics.ImageActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + // TODO: handle image list filters. imageList, err := c.ImageMgr.ListImages(ctx, "") if err != nil { @@ -1157,12 +1296,20 @@ func (c *CriManager) ListImages(ctx context.Context, r *runtime.ListImagesReques idExist[i.ID] = true } + metrics.ImageSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.ListImagesResponse{Images: images}, nil } // ImageStatus returns the status of the image. If the image is not present, // returns a response with ImageStatusResponse.Image set to nil. func (c *CriManager) ImageStatus(ctx context.Context, r *runtime.ImageStatusRequest) (*runtime.ImageStatusResponse, error) { + label := "status" + metrics.ImageActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ImageActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + imageRef := r.GetImage().GetImage() ref, err := reference.Parse(imageRef) if err != nil { @@ -1182,6 +1329,8 @@ func (c *CriManager) ImageStatus(ctx context.Context, r *runtime.ImageStatusRequ return nil, err } + metrics.ImageSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.ImageStatusResponse{Image: image}, nil } @@ -1190,6 +1339,14 @@ func (c *CriManager) PullImage(ctx context.Context, r *runtime.PullImageRequest) // TODO: authentication. 
imageRef := r.GetImage().GetImage() + label := "pull" + metrics.ImageActionsCounter.WithLabelValues(label).Inc() + // record the time spent during image pull procedure. + defer func(start time.Time) { + metrics.ImagePullSummary.WithLabelValues(imageRef).Observe(util_metrics.SinceInMicroseconds(start)) + metrics.ImageActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + authConfig := &apitypes.AuthConfig{} if r.Auth != nil { authConfig.Auth = r.Auth.Auth @@ -1209,11 +1366,19 @@ func (c *CriManager) PullImage(ctx context.Context, r *runtime.PullImageRequest) return nil, err } + metrics.ImageSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.PullImageResponse{ImageRef: imageInfo.ID}, nil } // RemoveImage removes the image. func (c *CriManager) RemoveImage(ctx context.Context, r *runtime.RemoveImageRequest) (*runtime.RemoveImageResponse, error) { + label := "remove" + metrics.ImageActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ImageActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + imageRef := r.GetImage().GetImage() if err := c.ImageMgr.RemoveImage(ctx, imageRef, false); err != nil { @@ -1223,11 +1388,20 @@ func (c *CriManager) RemoveImage(ctx context.Context, r *runtime.RemoveImageRequ } return nil, err } + + metrics.ImageSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.RemoveImageResponse{}, nil } // ImageFsInfo returns information of the filesystem that is used to store images. func (c *CriManager) ImageFsInfo(ctx context.Context, r *runtime.ImageFsInfoRequest) (*runtime.ImageFsInfoResponse, error) { + label := "info" + metrics.ImageActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.ImageActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + snapshots := c.SnapshotStore.List() timestamp := time.Now().UnixNano() var usedBytes, inodesUsed uint64 @@ -1239,6 +1413,9 @@ func (c *CriManager) ImageFsInfo(ctx context.Context, r *runtime.ImageFsInfoRequ usedBytes += sn.Size inodesUsed += sn.Inodes } + + metrics.ImageSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.ImageFsInfoResponse{ ImageFilesystems: []*runtime.FilesystemUsage{ { @@ -1253,9 +1430,18 @@ func (c *CriManager) ImageFsInfo(ctx context.Context, r *runtime.ImageFsInfoRequ // RemoveVolume removes the volume. 
func (c *CriManager) RemoveVolume(ctx context.Context, r *runtime.RemoveVolumeRequest) (*runtime.RemoveVolumeResponse, error) { + label := "remove" + metrics.VolumeActionsCounter.WithLabelValues(label).Inc() + defer func(start time.Time) { + metrics.VolumeActionsTimer.WithLabelValues(label).Observe(time.Since(start).Seconds()) + }(time.Now()) + volumeName := r.GetVolumeName() if err := c.VolumeMgr.Remove(ctx, volumeName); err != nil { return nil, err } + + metrics.VolumeSuccessActionsCounter.WithLabelValues(label).Inc() + return &runtime.RemoveVolumeResponse{}, nil } diff --git a/cri/v1alpha2/service/cri.go b/cri/v1alpha2/service/cri.go index 1be998af9..4f978ce0e 100644 --- a/cri/v1alpha2/service/cri.go +++ b/cri/v1alpha2/service/cri.go @@ -2,6 +2,7 @@ package service import ( runtime "github.com/alibaba/pouch/cri/apis/v1alpha2" + "github.com/alibaba/pouch/cri/metrics" cri "github.com/alibaba/pouch/cri/v1alpha2" "github.com/alibaba/pouch/daemon/config" "github.com/alibaba/pouch/pkg/netutils" @@ -20,7 +21,10 @@ type Service struct { func NewService(cfg *config.Config, criMgr cri.CriMgr) (*Service, error) { s := &Service{ config: cfg, - server: grpc.NewServer(), + server: grpc.NewServer( + grpc.StreamInterceptor(metrics.GRPCMetrics.StreamServerInterceptor()), + grpc.UnaryInterceptor(metrics.GRPCMetrics.UnaryServerInterceptor()), + ), criMgr: criMgr, } @@ -28,6 +32,13 @@ func NewService(cfg *config.Config, criMgr cri.CriMgr) (*Service, error) { runtime.RegisterImageServiceServer(s.server, s.criMgr) runtime.RegisterVolumeServiceServer(s.server, s.criMgr) + // EnableHandlingTimeHistogram turns on recording of handling time + // of RPCs. Histogram metrics can be very expensive for Prometheus + // to retain and query. + metrics.GRPCMetrics.EnableHandlingTimeHistogram() + // Initialize all metrics. + metrics.GRPCMetrics.InitializeMetrics(s.server) + return s, nil } diff --git a/pkg/utils/metrics/metrics.go b/pkg/utils/metrics/metrics.go new file mode 100644 index 000000000..ef7f3cbd8 --- /dev/null +++ b/pkg/utils/metrics/metrics.go @@ -0,0 +1,65 @@ +package metrics + +import ( + "fmt" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +const ( + namespace = "engine" +) + +// SinceInMicroseconds gets the time since the specified start in microseconds. 
+func SinceInMicroseconds(start time.Time) float64 { + return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds()) +} + +// NewLabelSummary return a new SummaryVec +func NewLabelSummary(subsystem, name, help string, labels ...string) *prometheus.SummaryVec { + return prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: name, + Help: help, + ConstLabels: nil, + }, labels) +} + +// NewLabelCounter return a new CounterVec +func NewLabelCounter(subsystem, name, help string, labels ...string) *prometheus.CounterVec { + return prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: fmt.Sprintf("%s_%s", name, total), + Help: help, + ConstLabels: nil, + }, labels) +} + +// NewLabelGauge return a new GaugeVec +func NewLabelGauge(subsystem, name, help string, labels ...string) *prometheus.GaugeVec { + return prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: fmt.Sprintf("%s_%s", name, Unit("info")), + Help: help, + ConstLabels: nil, + }, labels) +} + +// NewLabelTimer return a new HistogramVec +func NewLabelTimer(subsystem, name, help string, labels ...string) *prometheus.HistogramVec { + return prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: fmt.Sprintf("%s_%s", name, seconds), + Help: help, + ConstLabels: nil, + }, labels) +} diff --git a/apis/metrics/unit.go b/pkg/utils/metrics/unit.go similarity index 100% rename from apis/metrics/unit.go rename to pkg/utils/metrics/unit.go diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/CHANGELOG.md b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/CHANGELOG.md new file mode 100644 index 000000000..19a8059e1 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/CHANGELOG.md @@ -0,0 +1,24 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) +and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [1.2.0](https://github.com/grpc-ecosystem/go-grpc-prometheus/releases/tag/v1.2.0) - 2018-06-04 + +### Added + +* Provide metrics object as `prometheus.Collector`, for conventional metric registration. +* Support non-default/global Prometheus registry. +* Allow configuring counters with `prometheus.CounterOpts`. + +### Changed + +* Remove usage of deprecated `grpc.Code()`. +* Remove usage of deprecated `grpc.Errorf` and replace with `status.Errorf`. + +--- + +This changelog was started with version `v1.2.0`, for earlier versions refer to the respective [GitHub releases](https://github.com/grpc-ecosystem/go-grpc-prometheus/releases). diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/LICENSE b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/LICENSE new file mode 100644 index 000000000..b2b065037 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/README.md b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/README.md new file mode 100644 index 000000000..499c58355 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/README.md @@ -0,0 +1,247 @@ +# Go gRPC Interceptors for Prometheus monitoring + +[![Travis Build](https://travis-ci.org/grpc-ecosystem/go-grpc-prometheus.svg)](https://travis-ci.org/grpc-ecosystem/go-grpc-prometheus) +[![Go Report Card](https://goreportcard.com/badge/github.com/grpc-ecosystem/go-grpc-prometheus)](http://goreportcard.com/report/grpc-ecosystem/go-grpc-prometheus) +[![GoDoc](http://img.shields.io/badge/GoDoc-Reference-blue.svg)](https://godoc.org/github.com/grpc-ecosystem/go-grpc-prometheus) +[![SourceGraph](https://sourcegraph.com/github.com/grpc-ecosystem/go-grpc-prometheus/-/badge.svg)](https://sourcegraph.com/github.com/grpc-ecosystem/go-grpc-prometheus/?badge) +[![codecov](https://codecov.io/gh/grpc-ecosystem/go-grpc-prometheus/branch/master/graph/badge.svg)](https://codecov.io/gh/grpc-ecosystem/go-grpc-prometheus) +[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) + +[Prometheus](https://prometheus.io/) monitoring for your [gRPC Go](https://github.com/grpc/grpc-go) servers and clients. + +A sister implementation for [gRPC Java](https://github.com/grpc/grpc-java) (same metrics, same semantics) is in [grpc-ecosystem/java-grpc-prometheus](https://github.com/grpc-ecosystem/java-grpc-prometheus). + +## Interceptors + +[gRPC Go](https://github.com/grpc/grpc-go) recently acquired support for Interceptors, i.e. middleware that is executed +by a gRPC Server before the request is passed onto the user's application logic. It is a perfect way to implement +common patterns: auth, logging and... monitoring. + +To use Interceptors in chains, please see [`go-grpc-middleware`](https://github.com/mwitkow/go-grpc-middleware). + +## Usage + +There are two types of interceptors: client-side and server-side. This package provides monitoring Interceptors for both. + +### Server-side + +```go +import "github.com/grpc-ecosystem/go-grpc-prometheus" +... + // Initialize your gRPC server's interceptor. + myServer := grpc.NewServer( + grpc.StreamInterceptor(grpc_prometheus.StreamServerInterceptor), + grpc.UnaryInterceptor(grpc_prometheus.UnaryServerInterceptor), + ) + // Register your gRPC service implementations. + myservice.RegisterMyServiceServer(s.server, &myServiceImpl{}) + // After all your registrations, make sure all of the Prometheus metrics are initialized. + grpc_prometheus.Register(myServer) + // Register Prometheus metrics handler. + http.Handle("/metrics", promhttp.Handler()) +... +``` + +### Client-side + +```go +import "github.com/grpc-ecosystem/go-grpc-prometheus" +... + clientConn, err = grpc.Dial( + address, + grpc.WithUnaryInterceptor(grpc_prometheus.UnaryClientInterceptor), + grpc.WithStreamInterceptor(grpc_prometheus.StreamClientInterceptor) + ) + client = pb_testproto.NewTestServiceClient(clientConn) + resp, err := client.PingEmpty(s.ctx, &myservice.Request{Msg: "hello"}) +... +``` + +# Metrics + +## Labels + +All server-side metrics start with `grpc_server` as Prometheus subsystem name. All client-side metrics start with `grpc_client`. Both of them have mirror-concepts. 
Similarly all methods +contain the same rich labels: + + * `grpc_service` - the [gRPC service](http://www.grpc.io/docs/#defining-a-service) name, which is the combination of protobuf `package` and + the `grpc_service` section name. E.g. for `package = mwitkow.testproto` and + `service TestService` the label will be `grpc_service="mwitkow.testproto.TestService"` + * `grpc_method` - the name of the method called on the gRPC service. E.g. + `grpc_method="Ping"` + * `grpc_type` - the gRPC [type of request](http://www.grpc.io/docs/guides/concepts.html#rpc-life-cycle). + Differentiating between the two is important especially for latency measurements. + + - `unary` is single request, single response RPC + - `client_stream` is a multi-request, single response RPC + - `server_stream` is a single request, multi-response RPC + - `bidi_stream` is a multi-request, multi-response RPC + + +Additionally for completed RPCs, the following labels are used: + + * `grpc_code` - the human-readable [gRPC status code](https://github.com/grpc/grpc-go/blob/master/codes/codes.go). + The list of all statuses is to long, but here are some common ones: + + - `OK` - means the RPC was successful + - `IllegalArgument` - RPC contained bad values + - `Internal` - server-side error not disclosed to the clients + +## Counters + +The counters and their up to date documentation is in [server_reporter.go](server_reporter.go) and [client_reporter.go](client_reporter.go) +the respective Prometheus handler (usually `/metrics`). + +For the purpose of this documentation we will only discuss `grpc_server` metrics. The `grpc_client` ones contain mirror concepts. + +For simplicity, let's assume we're tracking a single server-side RPC call of [`mwitkow.testproto.TestService`](examples/testproto/test.proto), +calling the method `PingList`. The call succeeds and returns 20 messages in the stream. + +First, immediately after the server receives the call it will increment the +`grpc_server_started_total` and start the handling time clock (if histograms are enabled). + +```jsoniq +grpc_server_started_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1 +``` + +Then the user logic gets invoked. It receives one message from the client containing the request +(it's a `server_stream`): + +```jsoniq +grpc_server_msg_received_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1 +``` + +The user logic may return an error, or send multiple messages back to the client. In this case, on +each of the 20 messages sent back, a counter will be incremented: + +```jsoniq +grpc_server_msg_sent_total{grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 20 +``` + +After the call completes, its status (`OK` or other [gRPC status code](https://github.com/grpc/grpc-go/blob/master/codes/codes.go)) +and the relevant call labels increment the `grpc_server_handled_total` counter. + +```jsoniq +grpc_server_handled_total{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1 +``` + +## Histograms + +[Prometheus histograms](https://prometheus.io/docs/concepts/metric_types/#histogram) are a great way +to measure latency distributions of your RPCs. However, since it is bad practice to have metrics +of [high cardinality](https://prometheus.io/docs/practices/instrumentation/#do-not-overuse-labels) +the latency monitoring metrics are disabled by default. 
To enable them please call the following +in your server initialization code: + +```jsoniq +grpc_prometheus.EnableHandlingTimeHistogram() +``` + +After the call completes, its handling time will be recorded in a [Prometheus histogram](https://prometheus.io/docs/concepts/metric_types/#histogram) +variable `grpc_server_handling_seconds`. The histogram variable contains three sub-metrics: + + * `grpc_server_handling_seconds_count` - the count of all completed RPCs by status and method + * `grpc_server_handling_seconds_sum` - cumulative time of RPCs by status and method, useful for + calculating average handling times + * `grpc_server_handling_seconds_bucket` - contains the counts of RPCs by status and method in respective + handling-time buckets. These buckets can be used by Prometheus to estimate SLAs (see [here](https://prometheus.io/docs/practices/histograms/)) + +The counter values will look as follows: + +```jsoniq +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.005"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.01"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.025"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.05"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.1"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.25"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="0.5"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="1"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="2.5"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="5"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="10"} 1 +grpc_server_handling_seconds_bucket{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream",le="+Inf"} 1 +grpc_server_handling_seconds_sum{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 0.0003866430000000001 +grpc_server_handling_seconds_count{grpc_code="OK",grpc_method="PingList",grpc_service="mwitkow.testproto.TestService",grpc_type="server_stream"} 1 +``` + + +## Useful query examples + +Prometheus philosophy is to provide raw metrics to the monitoring system, and +let the aggregations be handled there. The verbosity of above metrics make it possible to have that +flexibility. 
Here's a couple of useful monitoring queries: + + +### request inbound rate +```jsoniq +sum(rate(grpc_server_started_total{job="foo"}[1m])) by (grpc_service) +``` +For `job="foo"` (common label to differentiate between Prometheus monitoring targets), calculate the +rate of requests per second (1 minute window) for each gRPC `grpc_service` that the job has. Please note +how the `grpc_method` is being omitted here: all methods of a given gRPC service will be summed together. + +### unary request error rate +```jsoniq +sum(rate(grpc_server_handled_total{job="foo",grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service) +``` +For `job="foo"`, calculate the per-`grpc_service` rate of `unary` (1:1) RPCs that failed, i.e. the +ones that didn't finish with `OK` code. + +### unary request error percentage +```jsoniq +sum(rate(grpc_server_handled_total{job="foo",grpc_type="unary",grpc_code!="OK"}[1m])) by (grpc_service) + / +sum(rate(grpc_server_started_total{job="foo",grpc_type="unary"}[1m])) by (grpc_service) + * 100.0 +``` +For `job="foo"`, calculate the percentage of failed requests by service. It's easy to notice that +this is a combination of the two above examples. This is an example of a query you would like to +[alert on](https://prometheus.io/docs/alerting/rules/) in your system for SLA violations, e.g. +"no more than 1% requests should fail". + +### average response stream size +```jsoniq +sum(rate(grpc_server_msg_sent_total{job="foo",grpc_type="server_stream"}[10m])) by (grpc_service) + / +sum(rate(grpc_server_started_total{job="foo",grpc_type="server_stream"}[10m])) by (grpc_service) +``` +For `job="foo"` what is the `grpc_service`-wide `10m` average of messages returned for all ` +server_stream` RPCs. This allows you to track the stream sizes returned by your system, e.g. allows +you to track when clients started to send "wide" queries that ret +Note the divisor is the number of started RPCs, in order to account for in-flight requests. + +### 99%-tile latency of unary requests +```jsoniq +histogram_quantile(0.99, + sum(rate(grpc_server_handling_seconds_bucket{job="foo",grpc_type="unary"}[5m])) by (grpc_service,le) +) +``` +For `job="foo"`, returns an 99%-tile [quantile estimation](https://prometheus.io/docs/practices/histograms/#quantiles) +of the handling time of RPCs per service. Please note the `5m` rate, this means that the quantile +estimation will take samples in a rolling `5m` window. When combined with other quantiles +(e.g. 50%, 90%), this query gives you tremendous insight into the responsiveness of your system +(e.g. impact of caching). + +### percentage of slow unary queries (>250ms) +```jsoniq +100.0 - ( +sum(rate(grpc_server_handling_seconds_bucket{job="foo",grpc_type="unary",le="0.25"}[5m])) by (grpc_service) + / +sum(rate(grpc_server_handling_seconds_count{job="foo",grpc_type="unary"}[5m])) by (grpc_service) +) * 100.0 +``` +For `job="foo"` calculate the by-`grpc_service` fraction of slow requests that took longer than `0.25` +seconds. This query is relatively complex, since the Prometheus aggregations use `le` (less or equal) +buckets, meaning that counting "fast" requests fractions is easier. However, simple maths helps. +This is an example of a query you would like to alert on in your system for SLA violations, +e.g. "less than 1% of requests are slower than 250ms". + + +## Status + +This code has been used since August 2015 as the basis for monitoring of *production* gRPC micro services at [Improbable](https://improbable.io). 
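PouchContainer's CRI service (see the `NewService` change earlier in this diff) uses a dedicated `ServerMetrics` instance rather than the package-level interceptors shown in the Usage section above. Below is a minimal, self-contained sketch of that style of wiring; the listen addresses, the `promhttp` exposure, and registering the collector on the default registry are illustrative assumptions, not something this diff itself adds:

```go
package main

import (
	"net"
	"net/http"

	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"google.golang.org/grpc"
)

func main() {
	// A dedicated ServerMetrics instance, as the cri/metrics package in this
	// diff does with GRPCMetrics, instead of the package-level defaults.
	grpcMetrics := grpc_prometheus.NewServerMetrics()

	srv := grpc.NewServer(
		grpc.StreamInterceptor(grpcMetrics.StreamServerInterceptor()),
		grpc.UnaryInterceptor(grpcMetrics.UnaryServerInterceptor()),
	)
	// Register gRPC service implementations on srv here.

	// Turn on the grpc_server_handling_seconds histogram and pre-initialize
	// every per-method series to zero, mirroring NewService above.
	grpcMetrics.EnableHandlingTimeHistogram()
	grpcMetrics.InitializeMetrics(srv)

	// ServerMetrics is a prometheus.Collector, so it can be registered and
	// scraped alongside the label metrics on the default registry.
	prometheus.MustRegister(grpcMetrics)
	http.Handle("/metrics", promhttp.Handler())
	go http.ListenAndServe(":9090", nil) // scrape address is illustrative

	lis, err := net.Listen("tcp", ":10011") // gRPC listen address is illustrative
	if err != nil {
		panic(err)
	}
	srv.Serve(lis)
}
```

In this sketch the `grpc_server_*` series documented above end up on the same scrape endpoint as the `engine_cri_*` counters and timers that `cri/metrics` registers on the default registry.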
+ +## License + +`go-grpc-prometheus` is released under the Apache 2.0 license. See the [LICENSE](LICENSE) file for details. diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client.go new file mode 100644 index 000000000..751a4c72d --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client.go @@ -0,0 +1,39 @@ +// Copyright 2016 Michal Witkowski. All Rights Reserved. +// See LICENSE for licensing terms. + +// gRPC Prometheus monitoring interceptors for client-side gRPC. + +package grpc_prometheus + +import ( + prom "github.com/prometheus/client_golang/prometheus" +) + +var ( + // DefaultClientMetrics is the default instance of ClientMetrics. It is + // intended to be used in conjunction the default Prometheus metrics + // registry. + DefaultClientMetrics = NewClientMetrics() + + // UnaryClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Unary RPCs. + UnaryClientInterceptor = DefaultClientMetrics.UnaryClientInterceptor() + + // StreamClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Streaming RPCs. + StreamClientInterceptor = DefaultClientMetrics.StreamClientInterceptor() +) + +func init() { + prom.MustRegister(DefaultClientMetrics.clientStartedCounter) + prom.MustRegister(DefaultClientMetrics.clientHandledCounter) + prom.MustRegister(DefaultClientMetrics.clientStreamMsgReceived) + prom.MustRegister(DefaultClientMetrics.clientStreamMsgSent) +} + +// EnableClientHandlingTimeHistogram turns on recording of handling time of +// RPCs. Histogram metrics can be very expensive for Prometheus to retain and +// query. This function acts on the DefaultClientMetrics variable and the +// default Prometheus metrics registry. +func EnableClientHandlingTimeHistogram(opts ...HistogramOption) { + DefaultClientMetrics.EnableClientHandlingTimeHistogram(opts...) + prom.Register(DefaultClientMetrics.clientHandledHistogram) +} diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_metrics.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_metrics.go new file mode 100644 index 000000000..9b476f983 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_metrics.go @@ -0,0 +1,170 @@ +package grpc_prometheus + +import ( + "io" + + prom "github.com/prometheus/client_golang/prometheus" + "golang.org/x/net/context" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// ClientMetrics represents a collection of metrics to be registered on a +// Prometheus metrics registry for a gRPC client. +type ClientMetrics struct { + clientStartedCounter *prom.CounterVec + clientHandledCounter *prom.CounterVec + clientStreamMsgReceived *prom.CounterVec + clientStreamMsgSent *prom.CounterVec + clientHandledHistogramEnabled bool + clientHandledHistogramOpts prom.HistogramOpts + clientHandledHistogram *prom.HistogramVec +} + +// NewClientMetrics returns a ClientMetrics object. Use a new instance of +// ClientMetrics when not using the default Prometheus metrics registry, for +// example when wanting to control which metrics are added to a registry as +// opposed to automatically adding metrics via init functions. 
+func NewClientMetrics(counterOpts ...CounterOption) *ClientMetrics { + opts := counterOptions(counterOpts) + return &ClientMetrics{ + clientStartedCounter: prom.NewCounterVec( + opts.apply(prom.CounterOpts{ + Name: "grpc_client_started_total", + Help: "Total number of RPCs started on the client.", + }), []string{"grpc_type", "grpc_service", "grpc_method"}), + + clientHandledCounter: prom.NewCounterVec( + opts.apply(prom.CounterOpts{ + Name: "grpc_client_handled_total", + Help: "Total number of RPCs completed by the client, regardless of success or failure.", + }), []string{"grpc_type", "grpc_service", "grpc_method", "grpc_code"}), + + clientStreamMsgReceived: prom.NewCounterVec( + opts.apply(prom.CounterOpts{ + Name: "grpc_client_msg_received_total", + Help: "Total number of RPC stream messages received by the client.", + }), []string{"grpc_type", "grpc_service", "grpc_method"}), + + clientStreamMsgSent: prom.NewCounterVec( + opts.apply(prom.CounterOpts{ + Name: "grpc_client_msg_sent_total", + Help: "Total number of gRPC stream messages sent by the client.", + }), []string{"grpc_type", "grpc_service", "grpc_method"}), + + clientHandledHistogramEnabled: false, + clientHandledHistogramOpts: prom.HistogramOpts{ + Name: "grpc_client_handling_seconds", + Help: "Histogram of response latency (seconds) of the gRPC until it is finished by the application.", + Buckets: prom.DefBuckets, + }, + clientHandledHistogram: nil, + } +} + +// Describe sends the super-set of all possible descriptors of metrics +// collected by this Collector to the provided channel and returns once +// the last descriptor has been sent. +func (m *ClientMetrics) Describe(ch chan<- *prom.Desc) { + m.clientStartedCounter.Describe(ch) + m.clientHandledCounter.Describe(ch) + m.clientStreamMsgReceived.Describe(ch) + m.clientStreamMsgSent.Describe(ch) + if m.clientHandledHistogramEnabled { + m.clientHandledHistogram.Describe(ch) + } +} + +// Collect is called by the Prometheus registry when collecting +// metrics. The implementation sends each collected metric via the +// provided channel and returns once the last metric has been sent. +func (m *ClientMetrics) Collect(ch chan<- prom.Metric) { + m.clientStartedCounter.Collect(ch) + m.clientHandledCounter.Collect(ch) + m.clientStreamMsgReceived.Collect(ch) + m.clientStreamMsgSent.Collect(ch) + if m.clientHandledHistogramEnabled { + m.clientHandledHistogram.Collect(ch) + } +} + +// EnableClientHandlingTimeHistogram turns on recording of handling time of RPCs. +// Histogram metrics can be very expensive for Prometheus to retain and query. +func (m *ClientMetrics) EnableClientHandlingTimeHistogram(opts ...HistogramOption) { + for _, o := range opts { + o(&m.clientHandledHistogramOpts) + } + if !m.clientHandledHistogramEnabled { + m.clientHandledHistogram = prom.NewHistogramVec( + m.clientHandledHistogramOpts, + []string{"grpc_type", "grpc_service", "grpc_method"}, + ) + } + m.clientHandledHistogramEnabled = true +} + +// UnaryClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Unary RPCs. 
+func (m *ClientMetrics) UnaryClientInterceptor() func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
+	return func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
+		monitor := newClientReporter(m, Unary, method)
+		monitor.SentMessage()
+		err := invoker(ctx, method, req, reply, cc, opts...)
+		if err == nil {
+			monitor.ReceivedMessage()
+		}
+		st, _ := status.FromError(err)
+		monitor.Handled(st.Code())
+		return err
+	}
+}
+
+// StreamClientInterceptor is a gRPC client-side interceptor that provides Prometheus monitoring for Streaming RPCs.
+func (m *ClientMetrics) StreamClientInterceptor() func(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) {
+	return func(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) {
+		monitor := newClientReporter(m, clientStreamType(desc), method)
+		clientStream, err := streamer(ctx, desc, cc, method, opts...)
+		if err != nil {
+			st, _ := status.FromError(err)
+			monitor.Handled(st.Code())
+			return nil, err
+		}
+		return &monitoredClientStream{clientStream, monitor}, nil
+	}
+}
+
+func clientStreamType(desc *grpc.StreamDesc) grpcType {
+	if desc.ClientStreams && !desc.ServerStreams {
+		return ClientStream
+	} else if !desc.ClientStreams && desc.ServerStreams {
+		return ServerStream
+	}
+	return BidiStream
+}
+
+// monitoredClientStream wraps grpc.ClientStream allowing each Sent/Recv of message to increment counters.
+type monitoredClientStream struct {
+	grpc.ClientStream
+	monitor *clientReporter
+}
+
+func (s *monitoredClientStream) SendMsg(m interface{}) error {
+	err := s.ClientStream.SendMsg(m)
+	if err == nil {
+		s.monitor.SentMessage()
+	}
+	return err
+}
+
+func (s *monitoredClientStream) RecvMsg(m interface{}) error {
+	err := s.ClientStream.RecvMsg(m)
+	if err == nil {
+		s.monitor.ReceivedMessage()
+	} else if err == io.EOF {
+		s.monitor.Handled(codes.OK)
+	} else {
+		st, _ := status.FromError(err)
+		s.monitor.Handled(st.Code())
+	}
+	return err
+}
diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_reporter.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_reporter.go
new file mode 100644
index 000000000..cbf153229
--- /dev/null
+++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/client_reporter.go
@@ -0,0 +1,46 @@
+// Copyright 2016 Michal Witkowski. All Rights Reserved.
+// See LICENSE for licensing terms.
+ +package grpc_prometheus + +import ( + "time" + + "google.golang.org/grpc/codes" +) + +type clientReporter struct { + metrics *ClientMetrics + rpcType grpcType + serviceName string + methodName string + startTime time.Time +} + +func newClientReporter(m *ClientMetrics, rpcType grpcType, fullMethod string) *clientReporter { + r := &clientReporter{ + metrics: m, + rpcType: rpcType, + } + if r.metrics.clientHandledHistogramEnabled { + r.startTime = time.Now() + } + r.serviceName, r.methodName = splitMethodName(fullMethod) + r.metrics.clientStartedCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() + return r +} + +func (r *clientReporter) ReceivedMessage() { + r.metrics.clientStreamMsgReceived.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() +} + +func (r *clientReporter) SentMessage() { + r.metrics.clientStreamMsgSent.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() +} + +func (r *clientReporter) Handled(code codes.Code) { + r.metrics.clientHandledCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName, code.String()).Inc() + if r.metrics.clientHandledHistogramEnabled { + r.metrics.clientHandledHistogram.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Observe(time.Since(r.startTime).Seconds()) + } +} diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/makefile b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/makefile new file mode 100644 index 000000000..74c084223 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/makefile @@ -0,0 +1,16 @@ +SHELL="/bin/bash" + +GOFILES_NOVENDOR = $(shell go list ./... | grep -v /vendor/) + +all: vet fmt test + +fmt: + go fmt $(GOFILES_NOVENDOR) + +vet: + go vet $(GOFILES_NOVENDOR) + +test: vet + ./scripts/test_all.sh + +.PHONY: all vet test diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/metric_options.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/metric_options.go new file mode 100644 index 000000000..9d51aec98 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/metric_options.go @@ -0,0 +1,41 @@ +package grpc_prometheus + +import ( + prom "github.com/prometheus/client_golang/prometheus" +) + +// A CounterOption lets you add options to Counter metrics using With* funcs. +type CounterOption func(*prom.CounterOpts) + +type counterOptions []CounterOption + +func (co counterOptions) apply(o prom.CounterOpts) prom.CounterOpts { + for _, f := range co { + f(&o) + } + return o +} + +// WithConstLabels allows you to add ConstLabels to Counter metrics. +func WithConstLabels(labels prom.Labels) CounterOption { + return func(o *prom.CounterOpts) { + o.ConstLabels = labels + } +} + +// A HistogramOption lets you add options to Histogram metrics using With* +// funcs. +type HistogramOption func(*prom.HistogramOpts) + +// WithHistogramBuckets allows you to specify custom bucket ranges for histograms if EnableHandlingTimeHistogram is on. +func WithHistogramBuckets(buckets []float64) HistogramOption { + return func(o *prom.HistogramOpts) { o.Buckets = buckets } +} + +// WithHistogramConstLabels allows you to add custom ConstLabels to +// histograms metrics. 
+func WithHistogramConstLabels(labels prom.Labels) HistogramOption { + return func(o *prom.HistogramOpts) { + o.ConstLabels = labels + } +} diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server.go new file mode 100644 index 000000000..322f99046 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server.go @@ -0,0 +1,48 @@ +// Copyright 2016 Michal Witkowski. All Rights Reserved. +// See LICENSE for licensing terms. + +// gRPC Prometheus monitoring interceptors for server-side gRPC. + +package grpc_prometheus + +import ( + prom "github.com/prometheus/client_golang/prometheus" + "google.golang.org/grpc" +) + +var ( + // DefaultServerMetrics is the default instance of ServerMetrics. It is + // intended to be used in conjunction the default Prometheus metrics + // registry. + DefaultServerMetrics = NewServerMetrics() + + // UnaryServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Unary RPCs. + UnaryServerInterceptor = DefaultServerMetrics.UnaryServerInterceptor() + + // StreamServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Streaming RPCs. + StreamServerInterceptor = DefaultServerMetrics.StreamServerInterceptor() +) + +func init() { + prom.MustRegister(DefaultServerMetrics.serverStartedCounter) + prom.MustRegister(DefaultServerMetrics.serverHandledCounter) + prom.MustRegister(DefaultServerMetrics.serverStreamMsgReceived) + prom.MustRegister(DefaultServerMetrics.serverStreamMsgSent) +} + +// Register takes a gRPC server and pre-initializes all counters to 0. This +// allows for easier monitoring in Prometheus (no missing metrics), and should +// be called *after* all services have been registered with the server. This +// function acts on the DefaultServerMetrics variable. +func Register(server *grpc.Server) { + DefaultServerMetrics.InitializeMetrics(server) +} + +// EnableHandlingTimeHistogram turns on recording of handling time +// of RPCs. Histogram metrics can be very expensive for Prometheus +// to retain and query. This function acts on the DefaultServerMetrics +// variable and the default Prometheus metrics registry. +func EnableHandlingTimeHistogram(opts ...HistogramOption) { + DefaultServerMetrics.EnableHandlingTimeHistogram(opts...) + prom.Register(DefaultServerMetrics.serverHandledHistogram) +} diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_metrics.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_metrics.go new file mode 100644 index 000000000..5b1467e7a --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_metrics.go @@ -0,0 +1,185 @@ +package grpc_prometheus + +import ( + prom "github.com/prometheus/client_golang/prometheus" + "golang.org/x/net/context" + "google.golang.org/grpc" + "google.golang.org/grpc/status" +) + +// ServerMetrics represents a collection of metrics to be registered on a +// Prometheus metrics registry for a gRPC server. +type ServerMetrics struct { + serverStartedCounter *prom.CounterVec + serverHandledCounter *prom.CounterVec + serverStreamMsgReceived *prom.CounterVec + serverStreamMsgSent *prom.CounterVec + serverHandledHistogramEnabled bool + serverHandledHistogramOpts prom.HistogramOpts + serverHandledHistogram *prom.HistogramVec +} + +// NewServerMetrics returns a ServerMetrics object. 
Use a new instance of +// ServerMetrics when not using the default Prometheus metrics registry, for +// example when wanting to control which metrics are added to a registry as +// opposed to automatically adding metrics via init functions. +func NewServerMetrics(counterOpts ...CounterOption) *ServerMetrics { + opts := counterOptions(counterOpts) + return &ServerMetrics{ + serverStartedCounter: prom.NewCounterVec( + opts.apply(prom.CounterOpts{ + Name: "grpc_server_started_total", + Help: "Total number of RPCs started on the server.", + }), []string{"grpc_type", "grpc_service", "grpc_method"}), + serverHandledCounter: prom.NewCounterVec( + opts.apply(prom.CounterOpts{ + Name: "grpc_server_handled_total", + Help: "Total number of RPCs completed on the server, regardless of success or failure.", + }), []string{"grpc_type", "grpc_service", "grpc_method", "grpc_code"}), + serverStreamMsgReceived: prom.NewCounterVec( + opts.apply(prom.CounterOpts{ + Name: "grpc_server_msg_received_total", + Help: "Total number of RPC stream messages received on the server.", + }), []string{"grpc_type", "grpc_service", "grpc_method"}), + serverStreamMsgSent: prom.NewCounterVec( + opts.apply(prom.CounterOpts{ + Name: "grpc_server_msg_sent_total", + Help: "Total number of gRPC stream messages sent by the server.", + }), []string{"grpc_type", "grpc_service", "grpc_method"}), + serverHandledHistogramEnabled: false, + serverHandledHistogramOpts: prom.HistogramOpts{ + Name: "grpc_server_handling_seconds", + Help: "Histogram of response latency (seconds) of gRPC that had been application-level handled by the server.", + Buckets: prom.DefBuckets, + }, + serverHandledHistogram: nil, + } +} + +// EnableHandlingTimeHistogram enables histograms being registered when +// registering the ServerMetrics on a Prometheus registry. Histograms can be +// expensive on Prometheus servers. It takes options to configure histogram +// options such as the defined buckets. +func (m *ServerMetrics) EnableHandlingTimeHistogram(opts ...HistogramOption) { + for _, o := range opts { + o(&m.serverHandledHistogramOpts) + } + if !m.serverHandledHistogramEnabled { + m.serverHandledHistogram = prom.NewHistogramVec( + m.serverHandledHistogramOpts, + []string{"grpc_type", "grpc_service", "grpc_method"}, + ) + } + m.serverHandledHistogramEnabled = true +} + +// Describe sends the super-set of all possible descriptors of metrics +// collected by this Collector to the provided channel and returns once +// the last descriptor has been sent. +func (m *ServerMetrics) Describe(ch chan<- *prom.Desc) { + m.serverStartedCounter.Describe(ch) + m.serverHandledCounter.Describe(ch) + m.serverStreamMsgReceived.Describe(ch) + m.serverStreamMsgSent.Describe(ch) + if m.serverHandledHistogramEnabled { + m.serverHandledHistogram.Describe(ch) + } +} + +// Collect is called by the Prometheus registry when collecting +// metrics. The implementation sends each collected metric via the +// provided channel and returns once the last metric has been sent. +func (m *ServerMetrics) Collect(ch chan<- prom.Metric) { + m.serverStartedCounter.Collect(ch) + m.serverHandledCounter.Collect(ch) + m.serverStreamMsgReceived.Collect(ch) + m.serverStreamMsgSent.Collect(ch) + if m.serverHandledHistogramEnabled { + m.serverHandledHistogram.Collect(ch) + } +} + +// UnaryServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Unary RPCs. 
+func (m *ServerMetrics) UnaryServerInterceptor() func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { + return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { + monitor := newServerReporter(m, Unary, info.FullMethod) + monitor.ReceivedMessage() + resp, err := handler(ctx, req) + st, _ := status.FromError(err) + monitor.Handled(st.Code()) + if err == nil { + monitor.SentMessage() + } + return resp, err + } +} + +// StreamServerInterceptor is a gRPC server-side interceptor that provides Prometheus monitoring for Streaming RPCs. +func (m *ServerMetrics) StreamServerInterceptor() func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { + return func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { + monitor := newServerReporter(m, streamRPCType(info), info.FullMethod) + err := handler(srv, &monitoredServerStream{ss, monitor}) + st, _ := status.FromError(err) + monitor.Handled(st.Code()) + return err + } +} + +// InitializeMetrics initializes all metrics, with their appropriate null +// value, for all gRPC methods registered on a gRPC server. This is useful, to +// ensure that all metrics exist when collecting and querying. +func (m *ServerMetrics) InitializeMetrics(server *grpc.Server) { + serviceInfo := server.GetServiceInfo() + for serviceName, info := range serviceInfo { + for _, mInfo := range info.Methods { + preRegisterMethod(m, serviceName, &mInfo) + } + } +} + +func streamRPCType(info *grpc.StreamServerInfo) grpcType { + if info.IsClientStream && !info.IsServerStream { + return ClientStream + } else if !info.IsClientStream && info.IsServerStream { + return ServerStream + } + return BidiStream +} + +// monitoredStream wraps grpc.ServerStream allowing each Sent/Recv of message to increment counters. +type monitoredServerStream struct { + grpc.ServerStream + monitor *serverReporter +} + +func (s *monitoredServerStream) SendMsg(m interface{}) error { + err := s.ServerStream.SendMsg(m) + if err == nil { + s.monitor.SentMessage() + } + return err +} + +func (s *monitoredServerStream) RecvMsg(m interface{}) error { + err := s.ServerStream.RecvMsg(m) + if err == nil { + s.monitor.ReceivedMessage() + } + return err +} + +// preRegisterMethod is invoked on Register of a Server, allowing all gRPC services labels to be pre-populated. +func preRegisterMethod(metrics *ServerMetrics, serviceName string, mInfo *grpc.MethodInfo) { + methodName := mInfo.Name + methodType := string(typeFromMethodInfo(mInfo)) + // These are just references (no increments), as just referencing will create the labels but not set values. 
+ metrics.serverStartedCounter.GetMetricWithLabelValues(methodType, serviceName, methodName) + metrics.serverStreamMsgReceived.GetMetricWithLabelValues(methodType, serviceName, methodName) + metrics.serverStreamMsgSent.GetMetricWithLabelValues(methodType, serviceName, methodName) + if metrics.serverHandledHistogramEnabled { + metrics.serverHandledHistogram.GetMetricWithLabelValues(methodType, serviceName, methodName) + } + for _, code := range allCodes { + metrics.serverHandledCounter.GetMetricWithLabelValues(methodType, serviceName, methodName, code.String()) + } +} diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_reporter.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_reporter.go new file mode 100644 index 000000000..aa9db5401 --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/server_reporter.go @@ -0,0 +1,46 @@ +// Copyright 2016 Michal Witkowski. All Rights Reserved. +// See LICENSE for licensing terms. + +package grpc_prometheus + +import ( + "time" + + "google.golang.org/grpc/codes" +) + +type serverReporter struct { + metrics *ServerMetrics + rpcType grpcType + serviceName string + methodName string + startTime time.Time +} + +func newServerReporter(m *ServerMetrics, rpcType grpcType, fullMethod string) *serverReporter { + r := &serverReporter{ + metrics: m, + rpcType: rpcType, + } + if r.metrics.serverHandledHistogramEnabled { + r.startTime = time.Now() + } + r.serviceName, r.methodName = splitMethodName(fullMethod) + r.metrics.serverStartedCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() + return r +} + +func (r *serverReporter) ReceivedMessage() { + r.metrics.serverStreamMsgReceived.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() +} + +func (r *serverReporter) SentMessage() { + r.metrics.serverStreamMsgSent.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Inc() +} + +func (r *serverReporter) Handled(code codes.Code) { + r.metrics.serverHandledCounter.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName, code.String()).Inc() + if r.metrics.serverHandledHistogramEnabled { + r.metrics.serverHandledHistogram.WithLabelValues(string(r.rpcType), r.serviceName, r.methodName).Observe(time.Since(r.startTime).Seconds()) + } +} diff --git a/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/util.go b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/util.go new file mode 100644 index 000000000..7987de35f --- /dev/null +++ b/vendor/github.com/grpc-ecosystem/go-grpc-prometheus/util.go @@ -0,0 +1,50 @@ +// Copyright 2016 Michal Witkowski. All Rights Reserved. +// See LICENSE for licensing terms. 
+ +package grpc_prometheus + +import ( + "strings" + + "google.golang.org/grpc" + "google.golang.org/grpc/codes" +) + +type grpcType string + +const ( + Unary grpcType = "unary" + ClientStream grpcType = "client_stream" + ServerStream grpcType = "server_stream" + BidiStream grpcType = "bidi_stream" +) + +var ( + allCodes = []codes.Code{ + codes.OK, codes.Canceled, codes.Unknown, codes.InvalidArgument, codes.DeadlineExceeded, codes.NotFound, + codes.AlreadyExists, codes.PermissionDenied, codes.Unauthenticated, codes.ResourceExhausted, + codes.FailedPrecondition, codes.Aborted, codes.OutOfRange, codes.Unimplemented, codes.Internal, + codes.Unavailable, codes.DataLoss, + } +) + +func splitMethodName(fullMethodName string) (string, string) { + fullMethodName = strings.TrimPrefix(fullMethodName, "/") // remove leading slash + if i := strings.Index(fullMethodName, "/"); i >= 0 { + return fullMethodName[:i], fullMethodName[i+1:] + } + return "unknown", "unknown" +} + +func typeFromMethodInfo(mInfo *grpc.MethodInfo) grpcType { + if !mInfo.IsClientStream && !mInfo.IsServerStream { + return Unary + } + if mInfo.IsClientStream && !mInfo.IsServerStream { + return ClientStream + } + if !mInfo.IsClientStream && mInfo.IsServerStream { + return ServerStream + } + return BidiStream +} diff --git a/vendor/vendor.json b/vendor/vendor.json index 378c36e45..d66440b65 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -1367,6 +1367,12 @@ "revision": "b5461758462b97cff258bf388ca42110f529d747", "revisionTime": "2017-11-07T15:27:38Z" }, + { + "checksumSHA1": "3iVD2sJv4uYnA8YgkR8yzZiUF7o=", + "path": "github.com/grpc-ecosystem/go-grpc-prometheus", + "revision": "93bf4626fba73b751b0f3cdf2649be4ce0c420cd", + "revisionTime": "2018-08-20T15:04:22Z" + }, { "checksumSHA1": "cZw5u6BGWe3gjJxmXT6J0Tfn6Mw=", "path": "github.com/hashicorp/consul/api",