diff --git a/collector/cluster-collector.go b/collector/cluster-collector.go index 2275fba..22d3c72 100644 --- a/collector/cluster-collector.go +++ b/collector/cluster-collector.go @@ -134,11 +134,15 @@ func NewEcsClusterCollector(emcecs *ecsclient.EcsClient, namespace string) (*Ecs func (e *EcsClusterCollector) Collect(ch chan<- prometheus.Metric) { log.Debugln("ECS Cluster collect starting") if e.ecsClient == nil { - log.Errorln("ECS client not configured.") + log.Errorf("ECS client not configured.") return } - fields := e.ecsClient.RetrieveClusterState() + fields, err := e.ecsClient.RetrieveClusterState() + if err != nil { + log.Error("Cluster exporter received no info from array.") + return + } // fmt.Printf("TotalDTNum: %v, UnreadyNum: %v, UnKnownNum: %v, NodeIP: %v\n", node.TotalDTnum, node.UnreadyDTnum, node.UnknownDTnum, node.NodeIP) ch <- prometheus.MustNewConstMetric(transactionsuccess, prometheus.CounterValue, fields.TransactionSuccessTotal, fields.VdcName) diff --git a/collector/metering-collector.go b/collector/metering-collector.go index 295051b..2ecdae1 100644 --- a/collector/metering-collector.go +++ b/collector/metering-collector.go @@ -50,7 +50,7 @@ func (e *EcsMeteringCollector) Collect(ch chan<- prometheus.Metric) { start := time.Now() // create a function to go get a list of all namespaces nameSpaceReq := "https://" + e.ecsClient.ClusterAddress + ":4443/object/namespaces" - n := e.ecsClient.CallECSAPI(nameSpaceReq, 2) + n, _ := e.ecsClient.CallECSAPI(nameSpaceReq) result := gjson.Get(n, "namespace.#.name") // We need to limit the number of requests going to the API at once @@ -70,14 +70,14 @@ func (e *EcsMeteringCollector) Collect(ch chan<- prometheus.Metric) { defer func() { <-sem }() // retrieve the quota applied namespaceInfoReq := "https://" + e.ecsClient.ClusterAddress + ":4443/object/namespaces/namespace/" + ns + "/quota" - n := e.ecsClient.CallECSAPI(namespaceInfoReq, 2) + n, _ := e.ecsClient.CallECSAPI(namespaceInfoReq) if gjson.Get(n, "blockSize").Float() > 0 { ch <- prometheus.MustNewConstMetric(nameSpaceQuota, prometheus.GaugeValue, gjson.Get(n, "blockSize").Float()*gb2kb, ns, "block") ch <- prometheus.MustNewConstMetric(nameSpaceQuota, prometheus.GaugeValue, gjson.Get(n, "notificationSize").Float()*gb2kb, ns, "notification") } // retrieve the current metering info namespaceInfoReq = "https://" + e.ecsClient.ClusterAddress + ":4443/object/billing/namespace/" + ns + "/info?sizeunit=KB" - n = e.ecsClient.CallECSAPI(namespaceInfoReq, 2) + n, _ = e.ecsClient.CallECSAPI(namespaceInfoReq) ch <- prometheus.MustNewConstMetric(nameSpaceObjectTotal, prometheus.GaugeValue, gjson.Get(n, "total_objects").Float(), ns) ch <- prometheus.MustNewConstMetric(nameSpaceQuota, prometheus.GaugeValue, gjson.Get(n, "total_size").Float(), ns, "used") }() @@ -87,7 +87,7 @@ func (e *EcsMeteringCollector) Collect(ch chan<- prometheus.Metric) { sem <- true } duration := float64(time.Since(start).Seconds()) - log.Infof("Scrape of metering took %f seconds", duration) + log.Infof("Scrape of metering took %f seconds for cluster %s", duration, e.ecsClient.ClusterAddress) log.Infoln("Metering exporter finished") } diff --git a/collector/repl-collector.go b/collector/repl-collector.go index 7d2b20e..8ca4daf 100644 --- a/collector/repl-collector.go +++ b/collector/repl-collector.go @@ -64,8 +64,11 @@ func (e *EcsReplCollector) Collect(ch chan<- prometheus.Metric) { return } - replState := e.ecsClient.RetrieveReplState() - // fmt.Printf("Replication state is %v \n", replState) + replState, err := e.ecsClient.RetrieveReplState() + if err != nil { + log.Error("Replication exporter received no info from array.") + return + } ch <- prometheus.MustNewConstMetric(replingresstraffic, prometheus.GaugeValue, replState.ReplicationIngressTraffic, replState.RgName) ch <- prometheus.MustNewConstMetric(replegresstraffic, prometheus.GaugeValue, replState.ReplicationEgressTraffic, replState.RgName) diff --git a/ecs_exporter.go b/ecs_exporter.go index b841565..0448ebf 100644 --- a/ecs_exporter.go +++ b/ecs_exporter.go @@ -5,14 +5,14 @@ package main import ( - "flag" "fmt" + "net" "net/http" - "runtime" - "strconv" - "time" "os" "os/signal" + "runtime" + "strconv" + "sync" "syscall" "github.com/paychex/prometheus-emcecs-exporter/collector" @@ -29,25 +29,18 @@ const ( ) var ( - log = logrus.New() - debugLevel = flag.Bool("debug", false, "enable debug messages") - ecsURL string - config *ecsconfig.Config + log = logrus.New() + ecsURL string + config *ecsconfig.Config // Metrics about the EMC ECS exporter itself. - ecsCollectionDuration = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Name: "emcecs_collection_duration_seconds", - Help: "Duration of collections by the ECS exporter for type metering or perf", - }, - []string{"vdc", "type"}, - ) ecsCollectionRequestErrors = prometheus.NewCounter( prometheus.CounterOpts{ Name: "emcecs_request_errors_total", Help: "Total errors in requests to the ECS exporter", }, ) + ecsCollectionBuildInfo = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "emcecs_collector_build_info", @@ -55,49 +48,86 @@ var ( }, []string{"version", "commitid", "goversion"}, ) - ecsClusterInfo = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "emcecs_cluster_version", - Help: "A metric with a constant '1' value labeled by version, and nodecount", + + ecsAuthCacheCounterHit = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "emcecs_authtoken_cache_counter_hit", + Help: "count of authtoken cache hits", + }, + ) + + ecsAuthCacheCounterMiss = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "emcecs_authtoken_cache_counter_miss", + Help: "count of authtoken cache misses", }, - []string{"version", "nodecount"}, ) + + authTokenCache sync.Map ) func init() { log.Formatter = new(logrus.TextFormatter) - if *debugLevel { - log.Level = logrus.DebugLevel - log.Debug("Setting logging to debug level.") - } else { - log.Info("Logging set to standard level.") - log.Level = logrus.InfoLevel - } - // ecsCollectionBuildInfo.WithLabelValues(version.Release, version.Commit, runtime.Version()).Set(1) - prometheus.MustRegister(ecsCollectionDuration) - prometheus.MustRegister(ecsCollectionRequestErrors) prometheus.MustRegister(ecsCollectionBuildInfo) - prometheus.MustRegister(ecsClusterInfo) + prometheus.MustRegister(ecsAuthCacheCounterHit) + prometheus.MustRegister(ecsAuthCacheCounterMiss) // gather our configuration config = ecsconfig.GetConfig() + } func queryHandler(w http.ResponseWriter, r *http.Request) { + + ecsCollectionSuccess := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "emcecs_collection_success", + Help: "returns either 1 or 0 depending on success labeled by target_name", + }, + []string{"target_name"}, + ) + + ecsClusterInfo := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "emcecs_cluster_version", + Help: "A metric with a constant '1' value labeled by version, and nodecount", + }, + []string{"version", "nodecount"}, + ) + + // some initial things we need before we get going. + registry := prometheus.NewRegistry() + registry.MustRegister(ecsCollectionBuildInfo) + registry.MustRegister(ecsClusterInfo) + registry.MustRegister(ecsCollectionRequestErrors) + registry.MustRegister(ecsCollectionSuccess) + target := r.URL.Query().Get("target") if target == "" { - http.Error(w, "'target' parameter must be specified", 400) + log.Info("'target' parameter must be specified") ecsCollectionRequestErrors.Inc() + ecsCollectionSuccess.WithLabelValues("NULL").Set(0) + h := promhttp.HandlerFor(registry, promhttp.HandlerOpts{}) + h.ServeHTTP(w, r) return } - log.Debugf("Scraping target '%s'", target) + // assume success if we fail anywhere along the line, change this to 0 + ecsCollectionSuccess.WithLabelValues(target).Set(1) - start := time.Now() - registry := prometheus.NewRegistry() + // Check and make sure we have a valid dns name, if not dump and run now. + _, err := net.LookupHost(target) + if err != nil { + log.Infof("target: %s is not a valid host.\n error was: %v", target, err) + ecsCollectionRequestErrors.Inc() + ecsCollectionSuccess.WithLabelValues(target).Set(0) + h := promhttp.HandlerFor(registry, promhttp.HandlerOpts{}) + h.ServeHTTP(w, r) + return + } c := ecsclient.EcsClient{ UserName: config.ECS.UserName, @@ -105,21 +135,72 @@ func queryHandler(w http.ResponseWriter, r *http.Request) { ClusterAddress: target, } log.Info("Connecting to ECS Cluster: " + target) - log.Debug("Retrieving ECS authToken") - c.RetrieveAuthToken() - - // get our authtoken for future interactions - c.RetrieveNodeInfo() - log.Debug("ECS Cluster version is: " + c.EcsVersion) + log.Debugf("ECS Cluster version is: %v", c.EcsVersion) log.Debugf("ECS Cluster node count: %v", c.RetrieveNodeCount()) ecsClusterInfo.WithLabelValues(c.EcsVersion, strconv.Itoa(c.RetrieveNodeCount())).Set(1) + // Need to get rid of the goto cheat. + // replacing with a for loop, and ensureing it has backoff and + // a short circuit + lc := 1 + for lc < 4 { + log.Debugf("Looking for cached Auth Token for %s", target) + var ok bool + result, ok := authTokenCache.Load(target) + if !ok { + log.Debug("Authtoken not found in cache.") + log.Debugf("Retrieving ECS authToken for %s", target) + // get our authtoken for future interactions + a, err := c.RetrieveAuthToken() + if err != nil { + log.Debugf("Error getting auth token for %s", target) + + ecsCollectionRequestErrors.Inc() + ecsCollectionSuccess.WithLabelValues(target).Set(0) + h := promhttp.HandlerFor(registry, promhttp.HandlerOpts{}) + h.ServeHTTP(w, r) + return + } + authTokenCache.Store(target, a) + result, _ := authTokenCache.Load(target) + c.AuthToken = result.(string) + ecsAuthCacheCounterMiss.Inc() + } else { + log.Debugf("Authtoken pulled from cache for %s", target) + c.AuthToken = result.(string) + ecsAuthCacheCounterHit.Inc() + } + + // test to make sure that our auth token is good + // if not delete it and loop back to our login logic above + validateLoginURL := "https://" + c.ClusterAddress + ":4443/user/whoami" + _, err = c.CallECSAPI(validateLoginURL) + if err != nil { + authTokenCache.Delete(target) + log.Infof("Invalidating authToken for %s", target) + lc += 1 + } else { + // we have a valid auth token we can break out of this loop + break + } + } + if lc > 3 { + // we looped and failed multiple times, so no need to go further + log.Debugf("Error getting auth token for %s", target) + + ecsCollectionRequestErrors.Inc() + ecsCollectionSuccess.WithLabelValues(target).Set(0) + h := promhttp.HandlerFor(registry, promhttp.HandlerOpts{}) + h.ServeHTTP(w, r) + return + } + if r.URL.Query().Get("metering") == "1" { // get just metering information meterExporter, err := collector.NewEcsMeteringCollector(&c, namespace) if err != nil { - log.Fatalf("Can't create exporter : %s", err) + log.Infof("Can't create exporter : %s", err) } log.Debugln("Register Metering exporter") registry.MustRegister(meterExporter) @@ -128,58 +209,82 @@ func queryHandler(w http.ResponseWriter, r *http.Request) { // nodeexporter dtExporter, err := collector.NewEcsNodeDTCollector(&c, namespace) if err != nil { - log.Fatalf("Can't create exporter : %s", err) + log.Infof("Can't create exporter : %s", err) + } else { + log.Debugln("Register node DT exporter") + registry.MustRegister(dtExporter) } - log.Debugln("Register node DT exporter") - registry.MustRegister(dtExporter) clusterExporter, err := collector.NewEcsClusterCollector(&c, namespace) if err != nil { - log.Fatalf("Can't create exporter : %s", err) + log.Infof("Can't create exporter : %s", err) + } else { + log.Debugln("Register cluster exporter") + registry.MustRegister(clusterExporter) } - log.Debugln("Register cluster exporter") - registry.MustRegister(clusterExporter) replExporter, err := collector.NewEcsReplCollector(&c, namespace) if err != nil { - log.Fatalf("Can't create exporter : %s", err) + log.Infof("Can't create exporter : %s", err) + } else { + log.Debugln("Register Replication exporter") + registry.MustRegister(replExporter) } - log.Debugln("Register Replication exporter") - registry.MustRegister(replExporter) } // Delegate http serving to Promethues client library, which will call collector.Collect. + ecsCollectionRequestErrors.Add(c.ErrorCount) h := promhttp.HandlerFor(registry, promhttp.HandlerOpts{}) h.ServeHTTP(w, r) - c.Logout() - duration := float64(time.Since(start).Seconds()) - ecsCollectionRequestErrors.Add(c.ErrorCount) - if r.URL.Query().Get("metering") == "1" { - ecsCollectionDuration.WithLabelValues(target, "metering").Observe(duration) - } else { - ecsCollectionDuration.WithLabelValues(target, "perf").Observe(duration) - } - log.Debugf("Scrape of target '%s' took %f seconds", target, duration) +} + +func fullLogout() { + //We have been asked to shut down, lets not leave any auth tokens active + log.Info("Logging out of all arrays.") + authTokenCache.Range(func(k, v interface{}) bool { + log.Debugf("Logging out of array: %v", k) + c := ecsclient.EcsClient{ + UserName: config.ECS.UserName, + Password: config.ECS.Password, + ClusterAddress: k.(string), + AuthToken: v.(string), + } + err := c.Logout() + if err != nil { + log.Debugf("Failed to log out of array: %v", k) + } + return true + }) } func main() { - log.Info("Starting the ECS Exporter service...") - log.Infof("commit: %s, build time: %s, release: %s", - version.Commit, version.BuildTime, version.Release, - ) - // enable signal trapping + if config.Exporter.Debug { + log.Level = logrus.DebugLevel + log.Debug("Setting logging to debug level.") + } else { + log.Info("Logging set to standard level.") + log.Level = logrus.InfoLevel + } + + // enable signal trapping to ensure clean shutdown go func() { c := make(chan os.Signal, 1) signal.Notify(c, syscall.SIGINT, // Ctrl+C syscall.SIGTERM, // Termination Request - syscall.SIGSEGV, // FullDerp + syscall.SIGSEGV, // Segmentation Fault syscall.SIGABRT, // Abnormal termination syscall.SIGILL, // illegal instruction syscall.SIGFPE) // floating point sig := <-c - log.Fatalf("Signal (%v) Detected, Shutting Down", sig) + log.Infof("Signal (%v) Detected, Shutting Down", sig) + fullLogout() + os.Exit(2) }() + log.Info("Starting the ECS Exporter service...") + log.Infof("commit: %s, build time: %s, release: %s", + version.Commit, version.BuildTime, version.Release, + ) http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { w.Write([]byte(` diff --git a/ecsclient/ecsclient.go b/ecsclient/ecsclient.go index 6f59d62..e8ad6e4 100644 --- a/ecsclient/ecsclient.go +++ b/ecsclient/ecsclient.go @@ -3,6 +3,7 @@ package ecsclient import ( "crypto/tls" "encoding/xml" + "fmt" "io/ioutil" "net" "net/http" @@ -16,7 +17,7 @@ import ( type EcsClient struct { UserName string Password string - authToken string + AuthToken string ClusterAddress string nodeList []string EcsVersion string @@ -45,7 +46,7 @@ type pingList struct { } // RetrieveAuthToken and store as part of the client struct for future use -func (c *EcsClient) RetrieveAuthToken() { +func (c *EcsClient) RetrieveAuthToken() (authToken string, err error) { reqLoginURL := "https://" + c.ClusterAddress + ":4443/login" log.Debugf("Using the following info to log into the ECS, username: %v, URL: %v", c.UserName, c.ClusterAddress) @@ -63,33 +64,37 @@ func (c *EcsClient) RetrieveAuthToken() { TLSHandshakeTimeout: 10 * time.Second, ExpectContinueTimeout: 1 * time.Second, TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, - }} + }, + Timeout: 15 * time.Second} req, _ := http.NewRequest("GET", reqLoginURL, nil) req.SetBasicAuth(c.UserName, c.Password) resp, err := client.Do(req) if err != nil { - log.Fatalf("\n - Error connecting to ECS: %s", err) + log.Infof("\n - Error connecting to ECS: %s", err) + return "", err } defer resp.Body.Close() - log.Debug("Response Status Code: %v", resp.StatusCode) - log.Debug("Response Status: %v", resp.Status) - log.Debug("Response Body: %v", resp.Body) - log.Debug("AuthToken is: %v", resp.Header.Get("X-Sds-Auth-Token")) + log.Debugf("Response Status Code: %v", resp.StatusCode) + log.Debugf("Response Status: %v", resp.Status) + log.Debugf("Response Body: %v", resp.Body) + log.Debugf("AuthToken is: %v", resp.Header.Get("X-Sds-Auth-Token")) if resp.StatusCode != 200 { // we didnt get a good response code, so bailing out log.Infoln("Got a non 200 response code: ", resp.StatusCode) log.Debugln("response was: ", resp) c.ErrorCount++ + return "", fmt.Errorf("received non 200 error code: %v. the response was: %v", resp.Status, resp) } - c.authToken = resp.Header.Get("X-Sds-Auth-Token") + c.AuthToken = resp.Header.Get("X-Sds-Auth-Token") + return resp.Header.Get("X-Sds-Auth-Token"), nil } // Logout closes out the connection to ECS when we are done. // if we dont log out we use up all of the available login tokens -func (c *EcsClient) Logout() { +func (c *EcsClient) Logout() error { // there’s a maximum number of login tokens (100) per user // need to log out to throw away the token since we arent set up for caching... @@ -98,11 +103,16 @@ func (c *EcsClient) Logout() { log.Infof("Logging out of %s", c.ClusterAddress) // we dont need the reply data, so just throw it away - _ = c.CallECSAPI(reqLogoutURL, 3) - c.authToken = "" + _, err := c.CallECSAPI(reqLogoutURL) + if err != nil { + log.Infof("Error logging out of ECS: %s", c.ClusterAddress) + return err + } + c.AuthToken = "" + return nil } -func (c *EcsClient) CallECSAPI(request string, retryAttempts int) string { +func (c *EcsClient) CallECSAPI(request string) (response string, err error) { client := &http.Client{Transport: &http.Transport{ Proxy: http.ProxyFromEnvironment, DialContext: (&net.Dialer{ @@ -115,63 +125,61 @@ func (c *EcsClient) CallECSAPI(request string, retryAttempts int) string { TLSHandshakeTimeout: 10 * time.Second, ExpectContinueTimeout: 1 * time.Second, TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, - }} + }, + Timeout: 45 * time.Second} req, _ := http.NewRequest("GET", request, nil) req.Header.Add("Accept", "application/json") req.Header.Add("Content-Type", "application/json") - req.Header.Add("X-SDS-AUTH-TOKEN", c.authToken) + req.Header.Add("X-SDS-AUTH-TOKEN", c.AuthToken) resp, err := client.Do(req) if err != nil { - log.Fatalf("\n - Error connecting to ECS: %s", err) + log.Infof("\n - Error connecting to ECS: %s", err) + return "", fmt.Errorf("error connecting to : %v. the error was: %v", request, err) } defer resp.Body.Close() respText, err := ioutil.ReadAll(resp.Body) s := string(respText) - if resp.StatusCode == 200 { - log.Debugln(s) - } else { - if retryAttempts >= 1 { - log.Infof("Got unknown code: %v when accessing URL: %s\n Body text is: %s\n", resp.StatusCode, request, respText) - // to do need to re-auth to get back into system - log.Info("Attempting to re-log into the ECS and retrying command") - c.RetrieveAuthToken() - // now lets recursively call ourselves and hopefully we get in again - s = c.CallECSAPI(request, retryAttempts-1) - c.ErrorCount++ - } else { - s = "" - } + + if resp.StatusCode != 200 { + log.Infof("Got error code: %v when accessing URL: %s\n Body text is: %s\n", resp.StatusCode, request, respText) + return "", fmt.Errorf("error connecting to : %v. the error was: %v", request, resp.StatusCode) } - return s + return s, nil } // RetrieveReplState will return a struct containing the state of the ECS cluster on query -func (c *EcsClient) RetrieveReplState() EcsReplState { +func (c *EcsClient) RetrieveReplState() (EcsReplState, error) { // this will only pull the current stats, which is what we want for this application // reqStatusURL := "https://" + c.ClusterAddress + ":4443/dashboard/zones/localzone?dataType=current" reqStatusURL := "https://" + c.ClusterAddress + ":4443/dashboard/zones/localzone/replicationgroups" - s := c.CallECSAPI(reqStatusURL, 3) + s, err := c.CallECSAPI(reqStatusURL) + if err != nil { + return EcsReplState{}, err + } return EcsReplState{ - RgName: gjson.Get(s, "name").String(), + RgName: gjson.Get(s, "name").String(), ReplicationIngressTraffic: gjson.Get(s, "replicationIngressTraffic").Float(), ReplicationEgressTraffic: gjson.Get(s, "replicationEgressTraffic").Float(), ChunksRepoPendingReplicationTotalSize: gjson.Get(s, "chunksRepoPendingReplicationTotalSize").Float(), ChunksJournalPendingReplicationTotalSize: gjson.Get(s, "chunksJournalPendingReplicationTotalSize").Float(), ChunksPendingXorTotalSize: gjson.Get(s, "chunksPendingXorTotalSize").Float(), ReplicationRpoTimestamp: gjson.Get(s, "replicationRpoTimestamp").Float(), - } + }, nil } // RetrieveClusterState will return a struct containing the state of the ECS cluster on query -func (c *EcsClient) RetrieveClusterState() EcsClusterState { +func (c *EcsClient) RetrieveClusterState() (EcsClusterState, error) { // this will only pull the current stats, which is what we want for this application // reqStatusURL := "https://" + c.ClusterAddress + ":4443/dashboard/zones/localzone?dataType=current" reqStatusURL := "https://" + c.ClusterAddress + ":4443/dashboard/zones/localzone" - s := c.CallECSAPI(reqStatusURL, 3) + s, err := c.CallECSAPI(reqStatusURL) + if err != nil { + return EcsClusterState{}, err + } fields := EcsClusterState{ VdcName: gjson.Get(s, "name").String(), @@ -195,7 +203,7 @@ func (c *EcsClient) RetrieveClusterState() EcsClusterState { TransactionWriteBandwidthCurrent: gjson.Get(s, "transactionWriteBandwidthCurrent.0.Bandwidth").Float(), TransactionReadBandwidthCurrent: gjson.Get(s, "transactionReadBandwidthCurrent.0.Bandwidth").Float(), } - return fields + return fields, nil } @@ -220,8 +228,11 @@ func (c *EcsClient) RetrieveNodeInfo() { resp, err := client.Do(req) if err != nil { - log.Fatal("Error connecting to ECS Cluster at: " + reqStatusURL) + log.Info("Error connecting to ECS Cluster at: " + reqStatusURL) + c.nodeList = nil + c.EcsVersion = "" c.ErrorCount++ + return } defer resp.Body.Close()