Skip to content

Commit

Permalink
support for GCP C4 instances
Browse files Browse the repository at this point in the history
  • Loading branch information
harp-intel committed Dec 25, 2024
1 parent fa54070 commit 4da3a94
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 59 deletions.
29 changes: 20 additions & 9 deletions cmd/metrics/event_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func LoadEventGroups(eventDefinitionOverridePath string, metadata Metadata) (gro
uarch := strings.ToLower(strings.Split(metadata.Microarchitecture, "_")[0])
// use alternate events/metrics when TMA fixed counters are not supported
alternate := ""
if (uarch == "icx" || uarch == "spr" || uarch == "emr") && !metadata.SupportsFixedTMA {
if (uarch == "icx" || uarch == "spr" || uarch == "emr") && !metadata.SupportsFixedTMA { // AWS VM instances
alternate = "_nofixedtma"
}
eventFileName := fmt.Sprintf("%s%s.txt", uarch, alternate)
Expand Down Expand Up @@ -132,20 +132,32 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool {
slog.Debug("Fixed counter TMA not supported on target", slog.String("event", event.Name))
return false
}
// short-circuit for cpu events
if event.Device == "cpu" && !strings.HasPrefix(event.Name, "OCR") {
// PEBS events (not supported on GCP c4 VMs)
pebsEventNames := []string{"INT_MISC.UNKNOWN_BRANCH_CYCLES", "UOPS_RETIRED.MS"}
if !metadata.SupportsPEBS && util.StringInList(event.Name, pebsEventNames) {
slog.Debug("PEBS events not supported on target", slog.String("event", event.Name))
return false
}
// short-circuit for cpu events that aren't off-core response events
if event.Device == "cpu" && !(strings.HasPrefix(event.Name, "OCR") || strings.HasPrefix(event.Name, "OFFCORE_REQUESTS_OUTSTANDING")) {
return true
}
// short-circuit off-core response events
if event.Device == "cpu" &&
strings.HasPrefix(event.Name, "OCR") &&
metadata.SupportsUncore {
if flagScope == scopeProcess || flagScope == scopeCgroup {
// off-core response events
if event.Device == "cpu" && (strings.HasPrefix(event.Name, "OCR") || strings.HasPrefix(event.Name, "OFFCORE_REQUESTS_OUTSTANDING")) {
if !metadata.SupportsOCR {
slog.Debug("Off-core response events not supported on target", slog.String("event", event.Name))
return false
} else if flagScope == scopeProcess || flagScope == scopeCgroup {
slog.Debug("Off-core response events not supported in process or cgroup scope", slog.String("event", event.Name))
return false
}
return true
}
// uncore events
if !metadata.SupportsUncore && strings.HasPrefix(event.Name, "UNC") {
slog.Debug("Uncore events not supported on target", slog.String("event", event.Name))
return false
}
// exclude uncore events when
// - their corresponding device is not found
// - not in system-wide collection scope
Expand Down Expand Up @@ -176,7 +188,6 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool {
slog.Debug("ref-cycles not supported on target", slog.String("event", event.Name))
return false
}

// no cstate and power events when collecting at process or cgroup scope
if (flagScope == scopeProcess || flagScope == scopeCgroup) &&
(strings.Contains(event.Name, "cstate_") || strings.Contains(event.Name, "power/energy")) {
Expand Down
22 changes: 17 additions & 5 deletions cmd/metrics/event_frame.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,13 @@ func parseEvents(rawEvents [][]byte, eventGroupDefinitions []GroupDefinition) (e
for _, rawEvent := range rawEvents {
var event Event
if event, err = parseEventJSON(rawEvent); err != nil {
err = fmt.Errorf("failed to parse perf event: %v", err)
return
if strings.Contains(err.Error(), "unrecognized event format") {
slog.Error(err.Error(), slog.String("event", string(rawEvent)))
return
} else {
slog.Warn(err.Error(), slog.String("event", string(rawEvent)))
event.Value = math.NaN()
}
}
if event.Event != previousEvent {
eventIdx++
Expand Down Expand Up @@ -347,10 +352,17 @@ func parseEventJSON(rawEvent []byte) (event Event, err error) {
err = fmt.Errorf("unrecognized event format: \"%s\"", rawEvent)
return
}
if event.CounterValue == "<not supported>" {
err = fmt.Errorf("event not supported: \"%s\"", rawEvent)
return
}
if event.CounterValue == "<not counted>" {
err = fmt.Errorf("event not counted: \"%s\"", rawEvent)
return
}
if event.Value, err = strconv.ParseFloat(event.CounterValue, 64); err != nil {
event.Value = math.NaN()
err = nil
slog.Debug("failed to parse event value", slog.String("event", string(rawEvent)))
err = fmt.Errorf("failed to parse event value as float: \"%s\"", rawEvent)
return
}
return
}
70 changes: 70 additions & 0 deletions cmd/metrics/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ type Metadata struct {
SupportsFixedTMA bool
SupportsRefCycles bool
SupportsUncore bool
SupportsPEBS bool
SupportsOCR bool
ThreadsPerCore int
TSC int
TSCFrequencyHz int
Expand Down Expand Up @@ -161,13 +163,41 @@ func LoadMetadata(myTarget target.Target, noRoot bool, perfPath string, localTem
}
slowFuncChannel <- err
}()
// PEBS
go func() {
var err error
var output string
if metadata.SupportsPEBS, output, err = getSupportsPEBS(myTarget, noRoot, perfPath, localTempDir); err != nil {
err = fmt.Errorf("failed to determine if 'PEBS' is supported: %v", err)
} else {
if !metadata.SupportsPEBS {
slog.Warn("'PEBS' events not supported", slog.String("output", output))
}
}
slowFuncChannel <- err
}()
// Offcore response
go func() {
var err error
var output string
if metadata.SupportsOCR, output, err = getSupportsOCR(myTarget, noRoot, perfPath, localTempDir); err != nil {
err = fmt.Errorf("failed to determine if 'OCR' is supported: %v", err)
} else {
if !metadata.SupportsOCR {
slog.Warn("'OCR' events not supported", slog.String("output", output))
}
}
slowFuncChannel <- err
}()
defer func() {
var errs []error
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
errs = append(errs, <-slowFuncChannel)
for _, errInside := range errs {
if errInside != nil {
slog.Error("error loading metadata", slog.String("error", errInside.Error()), slog.String("target", myTarget.GetName()))
Expand Down Expand Up @@ -218,6 +248,8 @@ func (md Metadata) String() string {
"Fixed TMA slot supported: %t, "+
"ref-cycles supported: %t, "+
"Uncore supported: %t, "+
"PEBS supported: %t, "+
"OCR supported: %t, "+
"PMU Driver version: %s, "+
"Kernel version: %s, ",
md.ModelName,
Expand All @@ -234,6 +266,8 @@ func (md Metadata) String() string {
md.SupportsFixedTMA,
md.SupportsRefCycles,
md.SupportsUncore,
md.SupportsPEBS,
md.SupportsOCR,
md.PMUDriverVersion,
md.KernelVersion)
for deviceName, deviceIds := range md.UncoreDeviceIDs {
Expand Down Expand Up @@ -355,6 +389,42 @@ func getSupportsRefCycles(myTarget target.Target, noRoot bool, perfPath string,
return
}

// getSupportsPEBS checks if PEBS events are supported on the target.
//
// On some VMs, e.g. GCP C4, PEBS events are not supported and perf reports
// '<not supported>' for them. Events that use MSR 0x3F7 are PEBS events. We
// probe with the INT_MISC.UNKNOWN_BRANCH_CYCLES event since it is a PEBS event
// that we use in EMR metrics.
//
// The probe runs a one second system-wide 'perf stat' on the target.
//
// Returns:
//   - supported: false when perf's stderr contains '<not supported>', true otherwise
//   - output: perf's stderr, so that callers can include it in diagnostics
//   - err: non-nil when the probe script could not be run; supported and output
//     are left at their zero values in that case
func getSupportsPEBS(myTarget target.Target, noRoot bool, perfPath string, localTempDir string) (supported bool, output string, err error) {
	scriptDef := script.ScriptDefinition{
		Name:      "perf stat pebs",
		Script:    perfPath + " stat -a -e cpu/event=0xad,umask=0x40,period=1000003,name='INT_MISC.UNKNOWN_BRANCH_CYCLES'/ sleep 1",
		Superuser: !noRoot,
	}
	scriptOutput, err := script.RunScript(myTarget, scriptDef, localTempDir)
	if err != nil {
		err = fmt.Errorf("failed to determine if pebs is supported: %s, %d, %w", scriptOutput.Stderr, scriptOutput.Exitcode, err)
		return
	}
	// the support verdict is derived from stderr, so that is the text handed
	// back to the caller for its "not supported" warning
	output = scriptOutput.Stderr
	supported = !strings.Contains(output, "<not supported>")
	return
}

// getSupportsOCR checks if the offcore response (OCR) events are supported on
// the target.
//
// On some VMs, e.g. GCP C4, offcore response events are not supported and perf
// reports '<not supported>' for them. We probe with the
// OCR.READS_TO_CORE.LOCAL_DRAM event.
//
// The probe runs a one second system-wide 'perf stat' on the target.
//
// Returns:
//   - supported: false when perf's stderr contains '<not supported>', true otherwise
//   - output: perf's stderr, so that callers can include it in diagnostics
//   - err: non-nil when the probe script could not be run; supported and output
//     are left at their zero values in that case
func getSupportsOCR(myTarget target.Target, noRoot bool, perfPath string, localTempDir string) (supported bool, output string, err error) {
	scriptDef := script.ScriptDefinition{
		Name:      "perf stat ocr",
		Script:    perfPath + " stat -a -e cpu/event=0x2a,umask=0x01,offcore_rsp=0x104004477,name='OCR.READS_TO_CORE.LOCAL_DRAM'/ sleep 1",
		Superuser: !noRoot,
	}
	scriptOutput, err := script.RunScript(myTarget, scriptDef, localTempDir)
	if err != nil {
		err = fmt.Errorf("failed to determine if ocr is supported: %s, %d, %w", scriptOutput.Stderr, scriptOutput.Exitcode, err)
		return
	}
	// the support verdict is derived from stderr, so that is the text handed
	// back to the caller for its "not supported" warning
	output = scriptOutput.Stderr
	supported = !strings.Contains(output, "<not supported>")
	return
}

// getSupportsFixedTMA - checks if the fixed TMA counter events are
// supported by perf.
//
Expand Down
86 changes: 44 additions & 42 deletions cmd/metrics/metric_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ type MetricDefinition struct {
// definition file. When the override path argument is empty, the function will load metrics from
// the file associated with the platform's architecture found in the provided metadata. When
// a list of metric names is provided, only those metric definitions will be loaded.
func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics []string, uncollectableEvents []string, metadata Metadata) (metrics []MetricDefinition, err error) {
func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics []string, metadata Metadata) (metrics []MetricDefinition, err error) {
var bytes []byte
if metricDefinitionOverridePath != "" {
if bytes, err = os.ReadFile(metricDefinitionOverridePath); err != nil {
Expand All @@ -56,20 +56,6 @@ func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics
if err = json.Unmarshal(bytes, &metricsInFile); err != nil {
return
}
// remove "metric_" prefix from metric names
for i := range metricsInFile {
metricsInFile[i].Name = strings.TrimPrefix(metricsInFile[i].Name, "metric_")
}
// remove metrics from list that use uncollectable events
for _, uncollectableEvent := range uncollectableEvents {
for i := 0; i < len(metricsInFile); i++ {
if strings.Contains(metricsInFile[i].Expression, uncollectableEvent) {
slog.Debug("removing metric that uses uncollectable event", slog.String("metric", metricsInFile[i].Name), slog.String("event", uncollectableEvent))
metricsInFile = append(metricsInFile[:i], metricsInFile[i+1:]...)
i--
}
}
}
// if a list of metric names provided, reduce list to match
if len(selectedMetrics) > 0 {
// confirm provided metric names are valid (included in metrics defined in file)
Expand Down Expand Up @@ -102,7 +88,7 @@ func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics
// ConfigureMetrics prepares metrics for use by the evaluator, by e.g., replacing
// metric constants with known values and aligning metric variables to perf event
// groups
func ConfigureMetrics(metrics []MetricDefinition, evaluatorFunctions map[string]govaluate.ExpressionFunction, metadata Metadata) (err error) {
func ConfigureMetrics(loadedMetrics []MetricDefinition, uncollectableEvents []string, evaluatorFunctions map[string]govaluate.ExpressionFunction, metadata Metadata) (metrics []MetricDefinition, err error) {
// get constants as strings
tscFreq := fmt.Sprintf("%f", float64(metadata.TSCFrequencyHz))
tsc := fmt.Sprintf("%f", float64(metadata.TSC))
Expand All @@ -112,54 +98,70 @@ func ConfigureMetrics(metrics []MetricDefinition, evaluatorFunctions map[string]
hyperThreadingOn := fmt.Sprintf("%t", metadata.ThreadsPerCore > 1)
threadsPerCore := fmt.Sprintf("%f", float64(metadata.ThreadsPerCore))
// configure each metric
for metricIdx := range metrics {
for metricIdx := range loadedMetrics {
tmpMetric := loadedMetrics[metricIdx]
// abbreviate event names in metric expressions to match abbreviations used in uncollectableEvents
tmpMetric.Expression = abbreviateEventName(tmpMetric.Expression)
tmpMetric.ExpressionTxn = abbreviateEventName(tmpMetric.ExpressionTxn)
// skip metrics that use uncollectable events
foundUncollectable := false
for _, uncollectableEvent := range uncollectableEvents {
if strings.Contains(tmpMetric.Expression, uncollectableEvent) {
slog.Warn("removing metric that uses uncollectable event", slog.String("metric", tmpMetric.Name), slog.String("event", uncollectableEvent))
foundUncollectable = true
break
}
}
if foundUncollectable {
continue
}
// swap in per-txn metric definition if transaction rate is provided
if flagTransactionRate != 0 && metrics[metricIdx].ExpressionTxn != "" {
metrics[metricIdx].Expression = metrics[metricIdx].ExpressionTxn
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate))
metrics[metricIdx].Name = metrics[metricIdx].NameTxn
if flagTransactionRate != 0 && tmpMetric.ExpressionTxn != "" {
tmpMetric.Expression = tmpMetric.ExpressionTxn
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate))
tmpMetric.Name = tmpMetric.NameTxn
}
// remove "metric_" prefix from metric names
tmpMetric.Name = strings.TrimPrefix(tmpMetric.Name, "metric_")
// transform if/else to ?/:
var transformed string
if transformed, err = transformConditional(metrics[metricIdx].Expression); err != nil {
if transformed, err = transformConditional(tmpMetric.Expression); err != nil {
return
}
if transformed != metrics[metricIdx].Expression {
slog.Debug("transformed metric", slog.String("original", metrics[metricIdx].Name), slog.String("transformed", transformed))
metrics[metricIdx].Expression = transformed
if transformed != tmpMetric.Expression {
slog.Debug("transformed metric", slog.String("original", tmpMetric.Name), slog.String("transformed", transformed))
tmpMetric.Expression = transformed
}
// replace constants with their values
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[SYSTEM_TSC_FREQ]", tscFreq)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[TSC]", tsc)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CORES_PER_SOCKET]", coresPerSocket)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CHAS_PER_SOCKET]", chasPerSocket)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[SOCKET_COUNT]", socketCount)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[HYPERTHREADING_ON]", hyperThreadingOn)
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CONST_THREAD_COUNT]", threadsPerCore)
// abbreviate event names
metrics[metricIdx].Expression = abbreviateEventName(metrics[metricIdx].Expression)
metrics[metricIdx].ExpressionTxn = abbreviateEventName(metrics[metricIdx].ExpressionTxn)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[SYSTEM_TSC_FREQ]", tscFreq)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TSC]", tsc)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CORES_PER_SOCKET]", coresPerSocket)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CHAS_PER_SOCKET]", chasPerSocket)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[SOCKET_COUNT]", socketCount)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[HYPERTHREADING_ON]", hyperThreadingOn)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CONST_THREAD_COUNT]", threadsPerCore)
// get a list of the variables in the expression
metrics[metricIdx].Variables = make(map[string]int)
tmpMetric.Variables = make(map[string]int)
expressionIdx := 0
for {
startVar := strings.IndexRune(metrics[metricIdx].Expression[expressionIdx:], '[')
startVar := strings.IndexRune(tmpMetric.Expression[expressionIdx:], '[')
if startVar == -1 { // no more vars in this expression
break
}
endVar := strings.IndexRune(metrics[metricIdx].Expression[expressionIdx:], ']')
endVar := strings.IndexRune(tmpMetric.Expression[expressionIdx:], ']')
if endVar == -1 {
err = fmt.Errorf("didn't find end of variable indicator (]) in expression: %s", metrics[metricIdx].Expression[expressionIdx:])
err = fmt.Errorf("didn't find end of variable indicator (]) in expression: %s", tmpMetric.Expression[expressionIdx:])
return
}
// add the variable name to the map, set group index to -1 to indicate it has not yet been determined
metrics[metricIdx].Variables[metrics[metricIdx].Expression[expressionIdx:][startVar+1:endVar]] = -1
tmpMetric.Variables[tmpMetric.Expression[expressionIdx:][startVar+1:endVar]] = -1
expressionIdx += endVar + 1
}
if metrics[metricIdx].Evaluable, err = govaluate.NewEvaluableExpressionWithFunctions(metrics[metricIdx].Expression, evaluatorFunctions); err != nil {
slog.Error("failed to create evaluable expression for metric", slog.String("error", err.Error()), slog.String("metric name", metrics[metricIdx].Name), slog.String("metric expression", metrics[metricIdx].Expression))
if tmpMetric.Evaluable, err = govaluate.NewEvaluableExpressionWithFunctions(tmpMetric.Expression, evaluatorFunctions); err != nil {
slog.Error("failed to create evaluable expression for metric", slog.String("error", err.Error()), slog.String("metric name", tmpMetric.Name), slog.String("metric expression", tmpMetric.Expression))
return
}
metrics = append(metrics, tmpMetric)
}
return
}
Expand Down
5 changes: 3 additions & 2 deletions cmd/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -883,15 +883,16 @@ func prepareMetrics(targetContext *targetContext, localTempDir string, channelEr
return
}
// load metric definitions
if targetContext.metricDefinitions, err = LoadMetricDefinitions(flagMetricFilePath, flagMetricsList, uncollectableEvents, targetContext.metadata); err != nil {
var loadedMetrics []MetricDefinition
if loadedMetrics, err = LoadMetricDefinitions(flagMetricFilePath, flagMetricsList, targetContext.metadata); err != nil {
err = fmt.Errorf("failed to load metric definitions: %w", err)
_ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error()))
targetContext.err = err
channelError <- targetError{target: myTarget, err: err}
return
}
// configure metrics
if err = ConfigureMetrics(targetContext.metricDefinitions, GetEvaluatorFunctions(), targetContext.metadata); err != nil {
if targetContext.metricDefinitions, err = ConfigureMetrics(loadedMetrics, uncollectableEvents, GetEvaluatorFunctions(), targetContext.metadata); err != nil {
err = fmt.Errorf("failed to configure metrics: %w", err)
_ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error()))
targetContext.err = err
Expand Down
2 changes: 1 addition & 1 deletion cmd/metrics/summary.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ func (m *metricsFromCSV) getStats() (stats map[string]metricStats, err error) {
sum := 0.0
for _, row := range m.rows {
val := row.metrics[metricName]
if math.IsNaN(val) {
if math.IsNaN(val) || math.IsInf(val, 0) {
continue
}
if math.IsNaN(min) { // min was initialized to NaN
Expand Down

0 comments on commit 4da3a94

Please sign in to comment.