From 4da3a94b1a363565770fc304ce4dd72118efaeed Mon Sep 17 00:00:00 2001 From: "Harper, Jason M" Date: Tue, 24 Dec 2024 16:38:09 -0800 Subject: [PATCH] support for GCP C4 instances --- cmd/metrics/event_defs.go | 29 +++++++++---- cmd/metrics/event_frame.go | 22 +++++++--- cmd/metrics/metadata.go | 70 +++++++++++++++++++++++++++++++ cmd/metrics/metric_defs.go | 86 +++++++++++++++++++------------------- cmd/metrics/metrics.go | 5 ++- cmd/metrics/summary.go | 2 +- 6 files changed, 155 insertions(+), 59 deletions(-) diff --git a/cmd/metrics/event_defs.go b/cmd/metrics/event_defs.go index ab2fbec..de47edc 100644 --- a/cmd/metrics/event_defs.go +++ b/cmd/metrics/event_defs.go @@ -41,7 +41,7 @@ func LoadEventGroups(eventDefinitionOverridePath string, metadata Metadata) (gro uarch := strings.ToLower(strings.Split(metadata.Microarchitecture, "_")[0]) // use alternate events/metrics when TMA fixed counters are not supported alternate := "" - if (uarch == "icx" || uarch == "spr" || uarch == "emr") && !metadata.SupportsFixedTMA { + if (uarch == "icx" || uarch == "spr" || uarch == "emr") && !metadata.SupportsFixedTMA { // AWS VM instances alternate = "_nofixedtma" } eventFileName := fmt.Sprintf("%s%s.txt", uarch, alternate) @@ -132,20 +132,32 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool { slog.Debug("Fixed counter TMA not supported on target", slog.String("event", event.Name)) return false } - // short-circuit for cpu events - if event.Device == "cpu" && !strings.HasPrefix(event.Name, "OCR") { + // PEBS events (not supported on GCP c4 VMs) + pebsEventNames := []string{"INT_MISC.UNKNOWN_BRANCH_CYCLES", "UOPS_RETIRED.MS"} + if !metadata.SupportsPEBS && util.StringInList(event.Name, pebsEventNames) { + slog.Debug("PEBS events not supported on target", slog.String("event", event.Name)) + return false + } + // short-circuit for cpu events that aren't off-core response events + if event.Device == "cpu" && !(strings.HasPrefix(event.Name, "OCR") || 
strings.HasPrefix(event.Name, "OFFCORE_REQUESTS_OUTSTANDING")) { return true } - // short-circuit off-core response events - if event.Device == "cpu" && - strings.HasPrefix(event.Name, "OCR") && - metadata.SupportsUncore { - if flagScope == scopeProcess || flagScope == scopeCgroup { + // off-core response events + if event.Device == "cpu" && (strings.HasPrefix(event.Name, "OCR") || strings.HasPrefix(event.Name, "OFFCORE_REQUESTS_OUTSTANDING")) { + if !metadata.SupportsOCR { + slog.Debug("Off-core response events not supported on target", slog.String("event", event.Name)) + return false + } else if flagScope == scopeProcess || flagScope == scopeCgroup { slog.Debug("Off-core response events not supported in process or cgroup scope", slog.String("event", event.Name)) return false } return true } + // uncore events + if !metadata.SupportsUncore && strings.HasPrefix(event.Name, "UNC") { + slog.Debug("Uncore events not supported on target", slog.String("event", event.Name)) + return false + } // exclude uncore events when // - their corresponding device is not found // - not in system-wide collection scope @@ -176,7 +188,6 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool { slog.Debug("ref-cycles not supported on target", slog.String("event", event.Name)) return false } - // no cstate and power events when collecting at process or cgroup scope if (flagScope == scopeProcess || flagScope == scopeCgroup) && (strings.Contains(event.Name, "cstate_") || strings.Contains(event.Name, "power/energy")) { diff --git a/cmd/metrics/event_frame.go b/cmd/metrics/event_frame.go index 5a4a2f6..e0c644d 100644 --- a/cmd/metrics/event_frame.go +++ b/cmd/metrics/event_frame.go @@ -121,8 +121,13 @@ func parseEvents(rawEvents [][]byte, eventGroupDefinitions []GroupDefinition) (e for _, rawEvent := range rawEvents { var event Event if event, err = parseEventJSON(rawEvent); err != nil { - err = fmt.Errorf("failed to parse perf event: %v", err) - return + if 
strings.Contains(err.Error(), "unrecognized event format") { + slog.Error(err.Error(), slog.String("event", string(rawEvent))) + return + } else { + slog.Warn(err.Error(), slog.String("event", string(rawEvent))) + event.Value = math.NaN() + } } if event.Event != previousEvent { eventIdx++ @@ -347,10 +352,17 @@ func parseEventJSON(rawEvent []byte) (event Event, err error) { err = fmt.Errorf("unrecognized event format: \"%s\"", rawEvent) return } + if event.CounterValue == "<not supported>" { + err = fmt.Errorf("event not supported: \"%s\"", rawEvent) + return + } + if event.CounterValue == "<not counted>" { + err = fmt.Errorf("event not counted: \"%s\"", rawEvent) + return + } if event.Value, err = strconv.ParseFloat(event.CounterValue, 64); err != nil { - event.Value = math.NaN() - err = nil - slog.Debug("failed to parse event value", slog.String("event", string(rawEvent))) + err = fmt.Errorf("failed to parse event value as float: \"%s\"", rawEvent) + return } return } diff --git a/cmd/metrics/metadata.go b/cmd/metrics/metadata.go index 30555c7..efc349e 100644 --- a/cmd/metrics/metadata.go +++ b/cmd/metrics/metadata.go @@ -39,6 +39,8 @@ type Metadata struct { SupportsFixedTMA bool SupportsRefCycles bool SupportsUncore bool + SupportsPEBS bool + SupportsOCR bool ThreadsPerCore int TSC int TSCFrequencyHz int @@ -161,6 +163,32 @@ func LoadMetadata(myTarget target.Target, noRoot bool, perfPath string, localTem } slowFuncChannel <- err }() + // PEBS + go func() { + var err error + var output string + if metadata.SupportsPEBS, output, err = getSupportsPEBS(myTarget, noRoot, perfPath, localTempDir); err != nil { + err = fmt.Errorf("failed to determine if 'PEBS' is supported: %v", err) + } else { + if !metadata.SupportsPEBS { + slog.Warn("'PEBS' events not supported", slog.String("output", output)) + } + } + slowFuncChannel <- err + }() + // Offcore response + go func() { + var err error + var output string + if metadata.SupportsOCR, output, err = getSupportsOCR(myTarget, noRoot, perfPath, 
localTempDir); err != nil { + err = fmt.Errorf("failed to determine if 'OCR' is supported: %v", err) + } else { + if !metadata.SupportsOCR { + slog.Warn("'OCR' events not supported", slog.String("output", output)) + } + } + slowFuncChannel <- err + }() defer func() { var errs []error errs = append(errs, <-slowFuncChannel) @@ -168,6 +196,8 @@ func LoadMetadata(myTarget target.Target, noRoot bool, perfPath string, localTem errs = append(errs, <-slowFuncChannel) errs = append(errs, <-slowFuncChannel) errs = append(errs, <-slowFuncChannel) + errs = append(errs, <-slowFuncChannel) + errs = append(errs, <-slowFuncChannel) for _, errInside := range errs { if errInside != nil { slog.Error("error loading metadata", slog.String("error", errInside.Error()), slog.String("target", myTarget.GetName())) @@ -218,6 +248,8 @@ func (md Metadata) String() string { "Fixed TMA slot supported: %t, "+ "ref-cycles supported: %t, "+ "Uncore supported: %t, "+ + "PEBS supported: %t, "+ + "OCR supported: %t, "+ "PMU Driver version: %s, "+ "Kernel version: %s, ", md.ModelName, @@ -234,6 +266,8 @@ func (md Metadata) String() string { md.SupportsFixedTMA, md.SupportsRefCycles, md.SupportsUncore, + md.SupportsPEBS, + md.SupportsOCR, md.PMUDriverVersion, md.KernelVersion) for deviceName, deviceIds := range md.UncoreDeviceIDs { @@ -355,6 +389,42 @@ func getSupportsRefCycles(myTarget target.Target, noRoot bool, perfPath string, return } +// getSupportsPEBS() - checks if the PEBS events are supported on the target +// On some VMs, e.g. GCP C4, PEBS events are not supported and perf returns '<not supported>' +// Events that use MSR 0x3F7 are PEBS events. We use the INT_MISC.UNKNOWN_BRANCH_CYCLES event since +// it is a PEBS event that we used in EMR metrics. 
+func getSupportsPEBS(myTarget target.Target, noRoot bool, perfPath string, localTempDir string) (supported bool, output string, err error) { + scriptDef := script.ScriptDefinition{ + Name: "perf stat pebs", + Script: perfPath + " stat -a -e cpu/event=0xad,umask=0x40,period=1000003,name='INT_MISC.UNKNOWN_BRANCH_CYCLES'/ sleep 1", + Superuser: !noRoot, + } + scriptOutput, err := script.RunScript(myTarget, scriptDef, localTempDir) + if err != nil { + err = fmt.Errorf("failed to determine if pebs is supported: %s, %d, %v", scriptOutput.Stderr, scriptOutput.Exitcode, err) + return + } + supported = !strings.Contains(scriptOutput.Stderr, "<not supported>") + return +} + +// getSupportsOCR() - checks if the offcore response events are supported on the target +// On some VMs, e.g. GCP C4, offcore response events are not supported and perf returns '<not supported>' +func getSupportsOCR(myTarget target.Target, noRoot bool, perfPath string, localTempDir string) (supported bool, output string, err error) { + scriptDef := script.ScriptDefinition{ + Name: "perf stat ocr", + Script: perfPath + " stat -a -e cpu/event=0x2a,umask=0x01,offcore_rsp=0x104004477,name='OCR.READS_TO_CORE.LOCAL_DRAM'/ sleep 1", + Superuser: !noRoot, + } + scriptOutput, err := script.RunScript(myTarget, scriptDef, localTempDir) + if err != nil { + err = fmt.Errorf("failed to determine if ocr is supported: %s, %d, %v", scriptOutput.Stderr, scriptOutput.Exitcode, err) + return + } + supported = !strings.Contains(scriptOutput.Stderr, "<not supported>") + return +} + // getSupportsFixedTMA - checks if the fixed TMA counter events are // supported by perf. // diff --git a/cmd/metrics/metric_defs.go b/cmd/metrics/metric_defs.go index 80fc034..3a76c4e 100644 --- a/cmd/metrics/metric_defs.go +++ b/cmd/metrics/metric_defs.go @@ -34,7 +34,7 @@ type MetricDefinition struct { // definition file. When the override path argument is empty, the function will load metrics from // the file associated with the platform's architecture found in the provided metadata. 
When // a list of metric names is provided, only those metric definitions will be loaded. -func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics []string, uncollectableEvents []string, metadata Metadata) (metrics []MetricDefinition, err error) { +func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics []string, metadata Metadata) (metrics []MetricDefinition, err error) { var bytes []byte if metricDefinitionOverridePath != "" { if bytes, err = os.ReadFile(metricDefinitionOverridePath); err != nil { @@ -56,20 +56,6 @@ func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics if err = json.Unmarshal(bytes, &metricsInFile); err != nil { return } - // remove "metric_" prefix from metric names - for i := range metricsInFile { - metricsInFile[i].Name = strings.TrimPrefix(metricsInFile[i].Name, "metric_") - } - // remove metrics from list that use uncollectable events - for _, uncollectableEvent := range uncollectableEvents { - for i := 0; i < len(metricsInFile); i++ { - if strings.Contains(metricsInFile[i].Expression, uncollectableEvent) { - slog.Debug("removing metric that uses uncollectable event", slog.String("metric", metricsInFile[i].Name), slog.String("event", uncollectableEvent)) - metricsInFile = append(metricsInFile[:i], metricsInFile[i+1:]...) 
- i-- - } - } - } // if a list of metric names provided, reduce list to match if len(selectedMetrics) > 0 { // confirm provided metric names are valid (included in metrics defined in file) @@ -102,7 +88,7 @@ func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics // ConfigureMetrics prepares metrics for use by the evaluator, by e.g., replacing // metric constants with known values and aligning metric variables to perf event // groups -func ConfigureMetrics(metrics []MetricDefinition, evaluatorFunctions map[string]govaluate.ExpressionFunction, metadata Metadata) (err error) { +func ConfigureMetrics(loadedMetrics []MetricDefinition, uncollectableEvents []string, evaluatorFunctions map[string]govaluate.ExpressionFunction, metadata Metadata) (metrics []MetricDefinition, err error) { // get constants as strings tscFreq := fmt.Sprintf("%f", float64(metadata.TSCFrequencyHz)) tsc := fmt.Sprintf("%f", float64(metadata.TSC)) @@ -112,54 +98,70 @@ func ConfigureMetrics(metrics []MetricDefinition, evaluatorFunctions map[string] hyperThreadingOn := fmt.Sprintf("%t", metadata.ThreadsPerCore > 1) threadsPerCore := fmt.Sprintf("%f", float64(metadata.ThreadsPerCore)) // configure each metric - for metricIdx := range metrics { + for metricIdx := range loadedMetrics { + tmpMetric := loadedMetrics[metricIdx] + // abbreviate event names in metric expressions to match abbreviations used in uncollectableEvents + tmpMetric.Expression = abbreviateEventName(tmpMetric.Expression) + tmpMetric.ExpressionTxn = abbreviateEventName(tmpMetric.ExpressionTxn) + // skip metrics that use uncollectable events + foundUncollectable := false + for _, uncollectableEvent := range uncollectableEvents { + if strings.Contains(tmpMetric.Expression, uncollectableEvent) { + slog.Warn("removing metric that uses uncollectable event", slog.String("metric", tmpMetric.Name), slog.String("event", uncollectableEvent)) + foundUncollectable = true + break + } + } + if foundUncollectable { + continue 
+ } // swap in per-txn metric definition if transaction rate is provided - if flagTransactionRate != 0 && metrics[metricIdx].ExpressionTxn != "" { - metrics[metricIdx].Expression = metrics[metricIdx].ExpressionTxn - metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate)) - metrics[metricIdx].Name = metrics[metricIdx].NameTxn + if flagTransactionRate != 0 && tmpMetric.ExpressionTxn != "" { + tmpMetric.Expression = tmpMetric.ExpressionTxn + tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate)) + tmpMetric.Name = tmpMetric.NameTxn } + // remove "metric_" prefix from metric names + tmpMetric.Name = strings.TrimPrefix(tmpMetric.Name, "metric_") // transform if/else to ?/: var transformed string - if transformed, err = transformConditional(metrics[metricIdx].Expression); err != nil { + if transformed, err = transformConditional(tmpMetric.Expression); err != nil { return } - if transformed != metrics[metricIdx].Expression { - slog.Debug("transformed metric", slog.String("original", metrics[metricIdx].Name), slog.String("transformed", transformed)) - metrics[metricIdx].Expression = transformed + if transformed != tmpMetric.Expression { + slog.Debug("transformed metric", slog.String("original", tmpMetric.Name), slog.String("transformed", transformed)) + tmpMetric.Expression = transformed } // replace constants with their values - metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[SYSTEM_TSC_FREQ]", tscFreq) - metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[TSC]", tsc) - metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CORES_PER_SOCKET]", coresPerSocket) - metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CHAS_PER_SOCKET]", chasPerSocket) - metrics[metricIdx].Expression = 
strings.ReplaceAll(metrics[metricIdx].Expression, "[SOCKET_COUNT]", socketCount) - metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[HYPERTHREADING_ON]", hyperThreadingOn) - metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CONST_THREAD_COUNT]", threadsPerCore) - // abbreviate event names - metrics[metricIdx].Expression = abbreviateEventName(metrics[metricIdx].Expression) - metrics[metricIdx].ExpressionTxn = abbreviateEventName(metrics[metricIdx].ExpressionTxn) + tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[SYSTEM_TSC_FREQ]", tscFreq) + tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TSC]", tsc) + tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CORES_PER_SOCKET]", coresPerSocket) + tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CHAS_PER_SOCKET]", chasPerSocket) + tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[SOCKET_COUNT]", socketCount) + tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[HYPERTHREADING_ON]", hyperThreadingOn) + tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CONST_THREAD_COUNT]", threadsPerCore) // get a list of the variables in the expression - metrics[metricIdx].Variables = make(map[string]int) + tmpMetric.Variables = make(map[string]int) expressionIdx := 0 for { - startVar := strings.IndexRune(metrics[metricIdx].Expression[expressionIdx:], '[') + startVar := strings.IndexRune(tmpMetric.Expression[expressionIdx:], '[') if startVar == -1 { // no more vars in this expression break } - endVar := strings.IndexRune(metrics[metricIdx].Expression[expressionIdx:], ']') + endVar := strings.IndexRune(tmpMetric.Expression[expressionIdx:], ']') if endVar == -1 { - err = fmt.Errorf("didn't find end of variable indicator (]) in expression: %s", metrics[metricIdx].Expression[expressionIdx:]) + err = fmt.Errorf("didn't find end of variable indicator (]) in expression: 
%s", tmpMetric.Expression[expressionIdx:]) return } // add the variable name to the map, set group index to -1 to indicate it has not yet been determined - metrics[metricIdx].Variables[metrics[metricIdx].Expression[expressionIdx:][startVar+1:endVar]] = -1 + tmpMetric.Variables[tmpMetric.Expression[expressionIdx:][startVar+1:endVar]] = -1 expressionIdx += endVar + 1 } - if metrics[metricIdx].Evaluable, err = govaluate.NewEvaluableExpressionWithFunctions(metrics[metricIdx].Expression, evaluatorFunctions); err != nil { - slog.Error("failed to create evaluable expression for metric", slog.String("error", err.Error()), slog.String("metric name", metrics[metricIdx].Name), slog.String("metric expression", metrics[metricIdx].Expression)) + if tmpMetric.Evaluable, err = govaluate.NewEvaluableExpressionWithFunctions(tmpMetric.Expression, evaluatorFunctions); err != nil { + slog.Error("failed to create evaluable expression for metric", slog.String("error", err.Error()), slog.String("metric name", tmpMetric.Name), slog.String("metric expression", tmpMetric.Expression)) return } + metrics = append(metrics, tmpMetric) } return } diff --git a/cmd/metrics/metrics.go b/cmd/metrics/metrics.go index 201a30e..b2b01c8 100644 --- a/cmd/metrics/metrics.go +++ b/cmd/metrics/metrics.go @@ -883,7 +883,8 @@ func prepareMetrics(targetContext *targetContext, localTempDir string, channelEr return } // load metric definitions - if targetContext.metricDefinitions, err = LoadMetricDefinitions(flagMetricFilePath, flagMetricsList, uncollectableEvents, targetContext.metadata); err != nil { + var loadedMetrics []MetricDefinition + if loadedMetrics, err = LoadMetricDefinitions(flagMetricFilePath, flagMetricsList, targetContext.metadata); err != nil { err = fmt.Errorf("failed to load metric definitions: %w", err) _ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error())) targetContext.err = err @@ -891,7 +892,7 @@ func prepareMetrics(targetContext *targetContext, localTempDir string, 
channelEr return } // configure metrics - if err = ConfigureMetrics(targetContext.metricDefinitions, GetEvaluatorFunctions(), targetContext.metadata); err != nil { + if targetContext.metricDefinitions, err = ConfigureMetrics(loadedMetrics, uncollectableEvents, GetEvaluatorFunctions(), targetContext.metadata); err != nil { err = fmt.Errorf("failed to configure metrics: %w", err) _ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error())) targetContext.err = err diff --git a/cmd/metrics/summary.go b/cmd/metrics/summary.go index 9d1e809..3de7f2c 100644 --- a/cmd/metrics/summary.go +++ b/cmd/metrics/summary.go @@ -199,7 +199,7 @@ func (m *metricsFromCSV) getStats() (stats map[string]metricStats, err error) { sum := 0.0 for _, row := range m.rows { val := row.metrics[metricName] - if math.IsNaN(val) { + if math.IsNaN(val) || math.IsInf(val, 0) { continue } if math.IsNaN(min) { // min was initialized to NaN