From e82b37cccd310765459ad31da895eed3b92ab303 Mon Sep 17 00:00:00 2001 From: Pengfei Ni Date: Mon, 14 Mar 2022 12:15:44 +0000 Subject: [PATCH 1/2] chore: add logs for operation latency --- pkg/metrics/azure_metrics.go | 25 +++++++++++++++++++------ pkg/provider/azure_routes.go | 4 ++-- pkg/provider/azure_standard.go | 4 ++-- pkg/provider/azure_vmss.go | 4 ++-- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/pkg/metrics/azure_metrics.go b/pkg/metrics/azure_metrics.go index 31855edcbb..a213d49f1d 100644 --- a/pkg/metrics/azure_metrics.go +++ b/pkg/metrics/azure_metrics.go @@ -22,6 +22,7 @@ import ( "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/klog/v2" "sigs.k8s.io/cloud-provider-azure/pkg/consts" "sigs.k8s.io/cloud-provider-azure/pkg/retry" @@ -80,23 +81,35 @@ func (mc *MetricContext) ThrottledCount() { } // Observe observes the request latency and failed requests. -func (mc *MetricContext) Observe(rerr *retry.Error) { - apiMetrics.latency.WithLabelValues(mc.attributes...).Observe( - time.Since(mc.start).Seconds()) +func (mc *MetricContext) Observe(rerr *retry.Error, labelAndValues ...interface{}) { + latency := time.Since(mc.start).Seconds() + apiMetrics.latency.WithLabelValues(mc.attributes...).Observe(latency) if rerr != nil { errorCode := rerr.ServiceErrorCode() attributes := append(mc.attributes, errorCode) apiMetrics.errors.WithLabelValues(attributes...).Inc() } + mc.logLatency(latency, append(labelAndValues, "error_code", rerr.ServiceErrorCode())...) } // ObserveOperationWithResult observes the request latency and failed requests of an operation. -func (mc *MetricContext) ObserveOperationWithResult(isOperationSucceeded bool) { - operationMetrics.operationLatency.WithLabelValues(mc.attributes...).Observe( - time.Since(mc.start).Seconds()) +func (mc *MetricContext) ObserveOperationWithResult(isOperationSucceeded bool, labelAndValues ...interface{}) { + latency := time.Since(mc.start).Seconds() + operationMetrics.operationLatency.WithLabelValues(mc.attributes...).Observe(latency) + resultCode := "succeeded" if !isOperationSucceeded { + resultCode = "failed" mc.CountFailedOperation() } + mc.logLatency(latency, append(labelAndValues, "result_code", resultCode)...) +} + +func (mc *MetricContext) logLatency(latency float64, additionalKeysAndValues ...interface{}) { + keysAndValues := []interface{}{"latency_seconds", latency} + for i, label := range metricLabels { + keysAndValues = append(keysAndValues, label, mc.attributes[i]) + } + klog.V(3).InfoS("Observed Request Latency", append(keysAndValues, additionalKeysAndValues...)...) } // CountFailedOperation increase the number of failed operations diff --git a/pkg/provider/azure_routes.go b/pkg/provider/azure_routes.go index 92a93d73c1..43a68a2b1b 100644 --- a/pkg/provider/azure_routes.go +++ b/pkg/provider/azure_routes.go @@ -366,7 +366,7 @@ func (az *Cloud) createRouteTable() error { // route.Name will be ignored, although the cloud-provider may use nameHint // to create a more user-meaningful name. func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint string, kubeRoute *cloudprovider.Route) error { - mc := metrics.NewMetricContext("routes", "create_route", az.ResourceGroup, az.SubscriptionID, "") + mc := metrics.NewMetricContext("routes", "create_route", az.ResourceGroup, az.SubscriptionID, string(kubeRoute.TargetNode)) isOperationSucceeded := false defer func() { mc.ObserveOperationWithResult(isOperationSucceeded) @@ -449,7 +449,7 @@ func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint s // DeleteRoute deletes the specified managed route // Route should be as returned by ListRoutes func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute *cloudprovider.Route) error { - mc := metrics.NewMetricContext("routes", "delete_route", az.ResourceGroup, az.SubscriptionID, "") + mc := metrics.NewMetricContext("routes", "delete_route", az.ResourceGroup, az.SubscriptionID, string(kubeRoute.TargetNode)) isOperationSucceeded := false defer func() { mc.ObserveOperationWithResult(isOperationSucceeded) diff --git a/pkg/provider/azure_standard.go b/pkg/provider/azure_standard.go index e1dcd25926..59d53746c1 100644 --- a/pkg/provider/azure_standard.go +++ b/pkg/provider/azure_standard.go @@ -991,7 +991,7 @@ func (as *availabilitySet) EnsureHostInPool(service *v1.Service, nodeName types. // EnsureHostsInPool ensures the given Node's primary IP configurations are // participating in the specified LoadBalancer Backend Pool. func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string) error { - mc := metrics.NewMetricContext("services", "vmas_ensure_hosts_in_pool", as.ResourceGroup, as.SubscriptionID, service.Name) + mc := metrics.NewMetricContext("services", "vmas_ensure_hosts_in_pool", as.ResourceGroup, as.SubscriptionID, getServiceName(service)) isOperationSucceeded := false defer func() { mc.ObserveOperationWithResult(isOperationSucceeded) @@ -1041,7 +1041,7 @@ func (as *availabilitySet) EnsureBackendPoolDeleted(service *v1.Service, backend return nil } - mc := metrics.NewMetricContext("services", "vmas_ensure_backend_pool_deleted", as.ResourceGroup, as.SubscriptionID, service.Name) + mc := metrics.NewMetricContext("services", "vmas_ensure_backend_pool_deleted", as.ResourceGroup, as.SubscriptionID, getServiceName(service)) isOperationSucceeded := false defer func() { mc.ObserveOperationWithResult(isOperationSucceeded) diff --git a/pkg/provider/azure_vmss.go b/pkg/provider/azure_vmss.go index 585bab3c24..52c95aba18 100644 --- a/pkg/provider/azure_vmss.go +++ b/pkg/provider/azure_vmss.go @@ -1269,7 +1269,7 @@ func (ss *ScaleSet) ensureVMSSInPool(service *v1.Service, nodes []*v1.Node, back // EnsureHostsInPool ensures the given Node's primary IP configurations are // participating in the specified LoadBalancer Backend Pool. func (ss *ScaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetNameOfLB string) error { - mc := metrics.NewMetricContext("services", "vmss_ensure_hosts_in_pool", ss.ResourceGroup, ss.SubscriptionID, service.Name) + mc := metrics.NewMetricContext("services", "vmss_ensure_hosts_in_pool", ss.ResourceGroup, ss.SubscriptionID, getServiceName(service)) isOperationSucceeded := false defer func() { mc.ObserveOperationWithResult(isOperationSucceeded) @@ -1529,7 +1529,7 @@ func (ss *ScaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID, return nil } - mc := metrics.NewMetricContext("services", "vmss_ensure_backend_pool_deleted", ss.ResourceGroup, ss.SubscriptionID, service.Name) + mc := metrics.NewMetricContext("services", "vmss_ensure_backend_pool_deleted", ss.ResourceGroup, ss.SubscriptionID, getServiceName(service)) isOperationSucceeded := false defer func() { mc.ObserveOperationWithResult(isOperationSucceeded) From 2898d2cd8e58c1d4077b2b15f53642b2798c3979 Mon Sep 17 00:00:00 2001 From: Pengfei Ni Date: Mon, 14 Mar 2022 14:11:46 +0000 Subject: [PATCH 2/2] chore: add debug logs for service operations --- pkg/provider/azure_loadbalancer.go | 77 +++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/pkg/provider/azure_loadbalancer.go b/pkg/provider/azure_loadbalancer.go index 5db795fd5b..0a00544c68 100644 --- a/pkg/provider/azure_loadbalancer.go +++ b/pkg/provider/azure_loadbalancer.go @@ -85,20 +85,9 @@ func getPublicIPDomainNameLabel(service *v1.Service) (string, bool) { return "", false } -// EnsureLoadBalancer creates a new load balancer 'name', or updates the existing one. Returns the status of the balancer -func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) (*v1.LoadBalancerStatus, error) { - // When a client updates the internal load balancer annotation, - // the service may be switched from an internal LB to a public one, or vise versa. - // Here we'll firstly ensure service do not lie in the opposite LB. +// reconcileService reconcile the LoadBalancer service. It returns LoadBalancerStatus on success. +func (az *Cloud) reconcileService(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) (*v1.LoadBalancerStatus, error) { serviceName := getServiceName(service) - klog.V(5).Infof("ensureloadbalancer(%s): START clusterName=%q, service: %v", serviceName, clusterName, service) - - mc := metrics.NewMetricContext("services", "ensure_loadbalancer", az.ResourceGroup, az.SubscriptionID, serviceName) - isOperationSucceeded := false - defer func() { - mc.ObserveOperationWithResult(isOperationSucceeded) - }() - lb, err := az.reconcileLoadBalancer(clusterName, service, nodes, true /* wantLb */) if err != nil { klog.Errorf("reconcileLoadBalancer(%s) failed: %v", serviceName, err) @@ -115,7 +104,7 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser if lbStatus != nil && len(lbStatus.Ingress) > 0 { serviceIP = &lbStatus.Ingress[0].IP } - klog.V(2).Infof("EnsureLoadBalancer: reconciling security group for service %q with IP %q, wantLb = true", serviceName, logSafe(serviceIP)) + klog.V(2).Infof("reconcileService: reconciling security group for service %q with IP %q, wantLb = true", serviceName, logSafe(serviceIP)) if _, err := az.reconcileSecurityGroup(clusterName, service, serviceIP, true /* wantLb */); err != nil { klog.Errorf("reconcileSecurityGroup(%s) failed: %#v", serviceName, err) return nil, err @@ -129,30 +118,70 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser } // lb is not reused here because the ETAG may be changed in above operations, hence reconcilePublicIP() would get lb again from cache. - klog.V(2).Infof("EnsureLoadBalancer: reconciling pip") + klog.V(2).Infof("reconcileService: reconciling pip") if _, err := az.reconcilePublicIP(clusterName, updateService, to.String(lb.Name), true /* wantLb */); err != nil { klog.Errorf("reconcilePublicIP(%s) failed: %#v", serviceName, err) return nil, err } + return lbStatus, nil +} + +// EnsureLoadBalancer creates a new load balancer 'name', or updates the existing one. Returns the status of the balancer +func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) (*v1.LoadBalancerStatus, error) { + // When a client updates the internal load balancer annotation, + // the service may be switched from an internal LB to a public one, or vise versa. + // Here we'll firstly ensure service do not lie in the opposite LB. + var err error + serviceName := getServiceName(service) + mc := metrics.NewMetricContext("services", "ensure_loadbalancer", az.ResourceGroup, az.SubscriptionID, serviceName) + klog.V(5).InfoS("EnsureLoadBalancer Start", "service", serviceName, "cluster", clusterName, "service_spec", service) + + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + klog.V(5).InfoS("EnsureLoadBalancer Finish", "service", serviceName, "cluster", clusterName, "service_spec", service, "error", err) + }() + + lbStatus, err := az.reconcileService(ctx, clusterName, service, nodes) + if err != nil { + return nil, err + } + isOperationSucceeded = true return lbStatus, nil } // UpdateLoadBalancer updates hosts under the specified load balancer. func (az *Cloud) UpdateLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) error { + var err error + serviceName := getServiceName(service) + mc := metrics.NewMetricContext("services", "update_loadbalancer", az.ResourceGroup, az.SubscriptionID, serviceName) + klog.V(5).InfoS("UpdateLoadBalancer Start", "service", serviceName, "cluster", clusterName, "service_spec", service) + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + klog.V(5).InfoS("UpdateLoadBalancer Finish", "service", serviceName, "cluster", clusterName, "service_spec", service, "error", err) + }() + shouldUpdateLB, err := az.shouldUpdateLoadBalancer(clusterName, service, nodes) if err != nil { return err } if !shouldUpdateLB { + isOperationSucceeded = true klog.V(2).Infof("UpdateLoadBalancer: skipping service %s because it is either being deleted or does not exist anymore", service.Name) return nil } - _, err = az.EnsureLoadBalancer(ctx, clusterName, service, nodes) - return err + _, err = az.reconcileService(ctx, clusterName, service, nodes) + if err != nil { + return err + } + + isOperationSucceeded = true + return nil } // EnsureLoadBalancerDeleted deletes the specified load balancer if it @@ -162,14 +191,15 @@ func (az *Cloud) UpdateLoadBalancer(ctx context.Context, clusterName string, ser // have multiple underlying components, meaning a Get could say that the LB // doesn't exist even if some part of it is still laying around. func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName string, service *v1.Service) error { + var err error isInternal := requiresInternalLoadBalancer(service) serviceName := getServiceName(service) - klog.V(5).Infof("Delete service (%s): START clusterName=%q", serviceName, clusterName) - mc := metrics.NewMetricContext("services", "ensure_loadbalancer_deleted", az.ResourceGroup, az.SubscriptionID, serviceName) + klog.V(5).InfoS("EnsureLoadBalancerDeleted Start", "service", serviceName, "cluster", clusterName, "service_spec", service) isOperationSucceeded := false defer func() { mc.ObserveOperationWithResult(isOperationSucceeded) + klog.V(5).InfoS("EnsureLoadBalancerDeleted Finish", "service", serviceName, "cluster", clusterName, "service_spec", service, "error", err) }() serviceIPToCleanup, err := az.findServiceIPAddress(ctx, clusterName, service, isInternal) @@ -178,15 +208,18 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri } klog.V(2).Infof("EnsureLoadBalancerDeleted: reconciling security group for service %q with IP %q, wantLb = false", serviceName, serviceIPToCleanup) - if _, err := az.reconcileSecurityGroup(clusterName, service, &serviceIPToCleanup, false /* wantLb */); err != nil { + _, err = az.reconcileSecurityGroup(clusterName, service, &serviceIPToCleanup, false /* wantLb */) + if err != nil { return err } - if _, err := az.reconcileLoadBalancer(clusterName, service, nil, false /* wantLb */); err != nil && !retry.HasStatusForbiddenOrIgnoredError(err) { + _, err = az.reconcileLoadBalancer(clusterName, service, nil, false /* wantLb */) + if err != nil && !retry.HasStatusForbiddenOrIgnoredError(err) { return err } - if _, err := az.reconcilePublicIP(clusterName, service, "", false /* wantLb */); err != nil { + _, err = az.reconcilePublicIP(clusterName, service, "", false /* wantLb */) + if err != nil { return err }