-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
131567: roachprod: add command to download the latest pprof file r=herkolategan a=nameisbhaskar Currently, whenever we see a spike in the CPU for a specific cluster, we have to download the latest pprof file for analysis. We generally select the latest file across all nodes or a specific set of nodes. This becomes a very manual process as we need to check each cluster to find the suitable file to download. This command helps in finding the latest pprof to download across all nodes or given nodes. The command also takes a time parameter as input with which we can download the latest pprof before a specific time. Epic: none Release note: None Co-authored-by: Bhaskarjyoti Bora <[email protected]>
- Loading branch information
Showing
4 changed files
with
321 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
// Copyright 2024 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the CockroachDB Software License | ||
// included in the /LICENSE file. | ||
|
||
package roachprod | ||
|
||
import ( | ||
"fmt" | ||
"regexp" | ||
"strings" | ||
"testing" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/roachprod/install" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func Test_buildClusterPProfInfo(t *testing.T) { | ||
clusterToPProfInfo := make(map[install.Node][]*pprofInfo) | ||
buildClusterPProfInfo(strings.Split(getListClusterMockResponse(1), "\n"), install.Node(1), clusterToPProfInfo) | ||
buildClusterPProfInfo(strings.Split(getListClusterMockResponse(2), "\n"), install.Node(2), clusterToPProfInfo) | ||
require.Equal(t, 2, len(clusterToPProfInfo)) | ||
pprofRegExp := regexp.MustCompile(`cpuprof\.(.*)\.pprof`) | ||
for node, pprofInfos := range clusterToPProfInfo { | ||
nodeNumber := int(node) | ||
// fileCount represents the file index as returned by getListClusterMockResponse | ||
fileCount := 0 | ||
for _, pi := range pprofInfos { | ||
// fileDate is using the same logic as used in getListClusterMockResponse for calculating the date | ||
fileDate := nodeNumber + 20 - fileCount*2 | ||
require.Equal(t, fmt.Sprintf("cpuprof.2024-09-%dT10_05_22.234.69.pprof", fileDate), | ||
pi.filename) | ||
pprofFileMatch := pprofRegExp.FindStringSubmatch(pi.filename) | ||
require.Equal(t, 2, len(pprofFileMatch)) | ||
ct, err := time.Parse("2006-01-02T15:04:05", | ||
strings.ReplaceAll(strings.Split(pprofFileMatch[1], ".")[0], "_", ":")) | ||
require.Nil(t, err) | ||
require.Equal(t, ct, pi.createdTime) | ||
fileCount++ | ||
if fileCount == 1 { | ||
// the second last file in the list returned is invalid. Since the files get reverse sorted by time, the invalid | ||
// file entry becomes the second entry | ||
fileCount++ | ||
} | ||
} | ||
} | ||
} | ||
|
||
func Test_getLatestPProfInfoBefore(t *testing.T) { | ||
timeNow := time.Now() | ||
fileInfoMap := map[install.Node][]*pprofInfo{ | ||
1: { | ||
{createdTime: timeNow.Add(-10)}, | ||
{createdTime: timeNow.Add(-20)}, | ||
{createdTime: timeNow.Add(-30)}, | ||
}, | ||
2: { | ||
{createdTime: timeNow.Add(-15)}, | ||
{createdTime: timeNow.Add(-25)}, | ||
{createdTime: timeNow.Add(-28)}, | ||
}, | ||
3: { | ||
{createdTime: timeNow.Add(-5)}, | ||
{createdTime: timeNow.Add(-11)}, | ||
{createdTime: timeNow.Add(-15)}, | ||
}, | ||
} | ||
require.Nil(t, getLatestPProfInfo(fileInfoMap, timeNow.Add(-31))) | ||
require.Equal(t, fileInfoMap[3][1], getLatestPProfInfo(fileInfoMap, timeNow.Add(-11))) | ||
require.Equal(t, fileInfoMap[3][0], getLatestPProfInfo(fileInfoMap, time.Now())) | ||
require.Equal(t, fileInfoMap[1][0], getLatestPProfInfo(fileInfoMap, timeNow.Add(-6))) | ||
require.Equal(t, fileInfoMap[2][2], getLatestPProfInfo(fileInfoMap, timeNow.Add(-26))) | ||
} | ||
|
||
// getListClusterMockResponse returns the mocked data for a node. The node number decides the dates. | ||
// e.g. if n=1, it returns 6 records with dates: | ||
// Sept 11, 13, 15, 17, 19, 21 | ||
// Similarly for n=2, the 6 records are: | ||
// Sept 12, 14, 16, 18, 20, 22 | ||
// Records on day n+18 is invalid. | ||
func getListClusterMockResponse(n int) string { | ||
return fmt.Sprintf(`-rw-r----- 1 ubuntu ubuntu 219534 Sep %d 10:05 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof | ||
-rw-r----- 1 ubuntu ubuntu 211461 Sep %d 10:05 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof | ||
-rw-r----- 1 ubuntu ubuntu 207410 Sep %d 10:05 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof | ||
-rw-r----- 1 ubuntu ubuntu 248365 Sep %d 10:06 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof | ||
-rw-r----- 1 ubuntu ubuntu 248365 Sep %d 10:06 logs/pprof_dump/cpuprof.2024-09-Invalid_06_%d.042.82.pprof | ||
-rw-r----- 1 ubuntu ubuntu 231194 Sep %d 10:07 logs/pprof_dump/cpuprof.2024-09-%dT10_05_22.234.69.pprof`, | ||
n+10, n+10, n+12, n+12, n+14, n+14, n+16, n+16, n+18, n+18, n+20, n+20) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
// Copyright 2024 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the CockroachDB Software License | ||
// included in the /LICENSE file. | ||
|
||
package roachprod | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"regexp" | ||
"sort" | ||
"strings" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/roachprod/install" | ||
"github.com/cockroachdb/cockroach/pkg/roachprod/logger" | ||
"github.com/cockroachdb/cockroach/pkg/util/timeutil" | ||
) | ||
|
||
var ( | ||
// pprofFileListRegex extracts the ppprof file name of the "ls -ltr" console output. It also extracts the date | ||
// and time from the file name. e.g. | ||
// -rw-r----- 1 ubuntu ubuntu 221508 Sep 27 10:05 logs/pprof_dump/cpuprof.2024-09-27T10_05_19.067.67.pprof | ||
// extracts cpuprof.2024-09-27T10_05_19.067.67.pprof and 2024-09-27T10_05_19.067.67 | ||
pprofFileListRegex = regexp.MustCompile(`.+ logs/pprof_dump/(cpuprof\.(.*)\.pprof)$`) | ||
) | ||
|
||
// pprofInfo maintains the information of a single pprof file in a cluster | ||
type pprofInfo struct { | ||
node install.Node | ||
filename string | ||
createdTime time.Time | ||
} | ||
|
||
// DownloadLatestPProfFile downloads the latest pprof file before beforeTime across all cluster nodes | ||
// if the beforeTime is not set, it defaults to the current time | ||
func DownloadLatestPProfFile( | ||
ctx context.Context, l *logger.Logger, clusterName string, beforeTimeArg string, | ||
) (err error) { | ||
beforeTime := timeutil.Now() | ||
if beforeTimeArg != "" { | ||
beforeTime, err = time.Parse(time.RFC3339, beforeTimeArg) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
c, err := getClusterFromCache(l, clusterName) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
nodePProfInfo, err := getClusterToPProfInfoMap(ctx, l, c) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
latestPProf := getLatestPProfInfo(nodePProfInfo, beforeTime) | ||
|
||
if latestPProf != nil { | ||
// download the file if found | ||
err = c.Get(ctx, l, install.Nodes{latestPProf.node}, | ||
fmt.Sprintf("logs/pprof_dump/%s", latestPProf.filename), | ||
// the downloaded file name has the cluster name and node number prefixed | ||
fmt.Sprintf("%s_%d_%s", c.Name, latestPProf.node, latestPProf.filename), | ||
) | ||
if err != nil { | ||
return err | ||
} | ||
} else { | ||
fmt.Println("No file found to download.") | ||
} | ||
return nil | ||
} | ||
|
||
// getLatestPProfInfo gets the latest pprofInfo across all clusters before beforeTime | ||
// it returns nil if no pprof were found. | ||
func getLatestPProfInfo( | ||
clusterToPProfInfo map[install.Node][]*pprofInfo, beforeTime time.Time, | ||
) *pprofInfo { | ||
var latestPProf *pprofInfo | ||
allEligiblePProf := make([]*pprofInfo, 0) | ||
for _, pprofInfos := range clusterToPProfInfo { | ||
// data is sorted. so, binary search works | ||
info := getLatestPProfInfoBefore(pprofInfos, beforeTime) | ||
if info != nil { | ||
allEligiblePProf = append(allEligiblePProf, info) | ||
} | ||
} | ||
// check for the length as we may not find any file with the given criteria | ||
if len(allEligiblePProf) > 0 { | ||
latestPProf = allEligiblePProf[0] | ||
for i := 1; i < len(allEligiblePProf); i++ { | ||
if latestPProf.createdTime.Before(allEligiblePProf[i].createdTime) { | ||
latestPProf = allEligiblePProf[i] | ||
} | ||
} | ||
} | ||
return latestPProf | ||
} | ||
|
||
// getClusterToPProfInfoMap run "ls -ltr logs/pprof_dump" on the cluster and creates a map where | ||
// the key is the node number and value is a list of pprofInfo. e.g. | ||
// Consider the following console output for different nodes: | ||
// Node 1 output: | ||
// | ||
// -rw-r----- 1 ubuntu ubuntu 207410 Sep 27 10:05 logs/pprof_dump/cpuprof.2024-09-27T10_05_42.752.79.pprof | ||
// -rw-r----- 1 ubuntu ubuntu 248365 Sep 27 10:06 logs/pprof_dump/cpuprof.2024-09-27T10_06_33.042.82.pprof | ||
// -rw-r----- 1 ubuntu ubuntu 231194 Sep 27 10:07 logs/pprof_dump/cpuprof.2024-09-27T10_07_33.427.85.pprof | ||
// | ||
// Node 2 output: | ||
// | ||
// -rw-r----- 1 ubuntu ubuntu 255149 Sep 27 10:05 logs/pprof_dump/cpuprof.2024-09-27T10_05_39.811.84.pprof | ||
// -rw-r----- 1 ubuntu ubuntu 237958 Sep 27 10:07 logs/pprof_dump/cpuprof.2024-09-27T10_06_50.128.87.pprof | ||
// | ||
// The above forms the following map: | ||
// | ||
// { | ||
// 1: []*pprofInfo{ | ||
// {clusterNodeNumber: 1, filename: cpuprof.2024-09-27T10_07_33.427.85.pprof, createdTime: 2024-09-27T10:07:33Z}, | ||
// {clusterNodeNumber: 1, filename: cpuprof.2024-09-27T10_06_33.042.82.pprof, createdTime: 2024-09-27T10:06:33Z}, | ||
// }, | ||
// 2: []*pprofInfo{ | ||
// {clusterNodeNumber: 2, filename: cpuprof.2024-09-27T10_06_50.128.87.pprof, createdTime: 2024-09-27T10:06:50Z}, | ||
// {clusterNodeNumber: 2, filename: cpuprof.2024-09-27T10_05_39.811.84.pprof, createdTime: 2024-09-27T10:05:39Z}, | ||
// }, | ||
// } | ||
// | ||
// Here 1 amd 2 as key represents the node number and the values represent the files in each node | ||
// Note that the lists per node are reverse sorted. | ||
func getClusterToPProfInfoMap( | ||
ctx context.Context, l *logger.Logger, c *install.SyncedCluster, | ||
) (map[install.Node][]*pprofInfo, error) { | ||
results, err := c.RunWithDetails(ctx, l, install.WithNodes(c.Nodes), "", | ||
"ls -ltr logs/pprof_dump/cpuprof.*.pprof") | ||
if err != nil { | ||
return nil, err | ||
} | ||
clusterToPProfInfo := make(map[install.Node][]*pprofInfo) | ||
for _, r := range results { | ||
node := r.Node | ||
output := r.Stdout | ||
buildClusterPProfInfo(strings.Split(output, "\n"), node, clusterToPProfInfo) | ||
} | ||
|
||
return clusterToPProfInfo, nil | ||
} | ||
|
||
// getLatestPProfInfoBefore gets the last pprofInfo record before a certain time using binary search | ||
func getLatestPProfInfoBefore(pprofList []*pprofInfo, beforeTime time.Time) *pprofInfo { | ||
// Binary search to find the first element where createdTime is before the given time | ||
index := sort.Search(len(pprofList), func(i int) bool { | ||
// Since the list is reverse sorted, we check the opposite condition | ||
// 1 is added to select the file with exact matching timestamp | ||
return pprofList[i].createdTime.Before(beforeTime.Add(1)) | ||
}) | ||
|
||
// no files are found | ||
if index == len(pprofList) { | ||
return nil | ||
} | ||
// the pprofInfo at the index is returned | ||
return pprofList[index] | ||
} | ||
|
||
// buildClusterPProfInfo scans through each line and builds the map of cluster node to the list of files | ||
func buildClusterPProfInfo( | ||
lines []string, node install.Node, clusterToPProfInfo map[install.Node][]*pprofInfo, | ||
) { | ||
// allPProf contains the list of pprofInfo for a node. | ||
// As the lines are read in sequence, the files for a single node always come in one flow. | ||
allPProf := make([]*pprofInfo, 0) | ||
for index := 0; index < len(lines); index++ { | ||
// One line at a time is read. The lines that are of our interest looks like the following: | ||
// "-rw-r----- 1 ubuntu ubuntu 237958 Sep 27 10:07 logs/pprof_dump/<pprof file name" | ||
// any other line is ignored | ||
line := lines[index] | ||
// it checks if the line matches any ls -ltr output of ppprof file | ||
pprofFileMatch := pprofFileListRegex.FindStringSubmatch(line) | ||
if len(pprofFileMatch) > 2 { | ||
// this is a line with a matching pprof file name | ||
// extract the date from the file name. To make it parsable replace _ with : | ||
ct, err := time.Parse("2006-01-02T15:04:05", | ||
strings.ReplaceAll(strings.Split(pprofFileMatch[2], ".")[0], "_", ":")) | ||
if err != nil { | ||
fmt.Println("Unsupported file name format:", pprofFileMatch[1], err) | ||
continue | ||
} | ||
// append the new pprofInfo in the beginning | ||
allPProf = append([]*pprofInfo{{ | ||
node: node, | ||
filename: pprofFileMatch[1], | ||
createdTime: ct, | ||
}}, allPProf...) | ||
} | ||
// if the line does not match any of the above, continue to the next line | ||
} | ||
// we reach here once all the lines are read. if allPProf has any data, this is for the node | ||
if len(allPProf) > 0 { | ||
clusterToPProfInfo[node] = allPProf | ||
} | ||
} |