Merge #97465 #97505
97465: c2c: gather perf metrics from prometheus r=stevendanna a=msbutler

c2c roachtest performance metrics are now gathered by a prom/grafana instance running locally on the roachprod cluster. This change allows us to gather and process any metrics exposed on the crdb prom endpoint. Specifically, we now gather `capacity_used`, `replication_logical_bytes`, and `replication_sst_bytes` at various points during the c2c roachtest, allowing us to measure:
- Initial Scan Throughput: initial scan size / initial scan duration
- Workload Throughput: data ingested during workload / workload duration
- Cutover Throughput: (data ingested between cutover time and cutover cmd) / (cutover process duration)

where the size of these operations can be measured as either physical replicated bytes, logical ingested bytes, or physical ingested bytes on the source cluster.
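
For illustration, here is a minimal, self-contained sketch of how these throughputs could be derived from the gathered metric samples. The struct, field names, and sample numbers below are hypothetical and not part of the roachtest code itself:

```go
package main

import (
	"fmt"
	"time"
)

// metricSnapshot is a hypothetical point-in-time sample of one of the
// replication metrics scraped from the crdb prom endpoint
// (e.g. replication_logical_bytes).
type metricSnapshot struct {
	at    time.Time
	bytes float64
}

// throughputMBps returns megabytes per second ingested between two snapshots.
func throughputMBps(start, end metricSnapshot) float64 {
	elapsed := end.at.Sub(start.at).Seconds()
	if elapsed <= 0 {
		return 0
	}
	return (end.bytes - start.bytes) / (1 << 20) / elapsed
}

func main() {
	t0 := time.Now()
	// Hypothetical samples taken at the start and end of the initial scan,
	// and at the end of the workload phase.
	scanStart := metricSnapshot{at: t0, bytes: 0}
	scanEnd := metricSnapshot{at: t0.Add(10 * time.Minute), bytes: 200 << 30}
	workloadEnd := metricSnapshot{at: t0.Add(40 * time.Minute), bytes: 500 << 30}

	fmt.Printf("initial scan: %.1f MB/s\n", throughputMBps(scanStart, scanEnd))
	fmt.Printf("workload:     %.1f MB/s\n", throughputMBps(scanEnd, workloadEnd))
}
```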

This patch also fixes a recent bug which mislabeled src cluster throughput as initial scan throughput.

Informs #89176

Release note: None

97505: server, ui: remove interpreted jobs retrying status  r=xinhaoz a=xinhaoz

This commit removes the 'Retrying' status from the jobs UX.
Previously, we inferred this status from the running status,
which only added confusion and incorrectness to the job status
being displayed. The status surfaced now aligns directly with
what is shown in the `crdb_internal.jobs` table.
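
As a point of reference, the values the jobs page now reports can be inspected straight from that table. A minimal sketch follows; the connection string and driver choice are assumptions for illustration, not part of this change:

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // assumes a Postgres-compatible driver for CockroachDB
)

func main() {
	// Hypothetical connection string; point it at any CockroachDB node.
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// The jobs page now surfaces exactly these status values.
	rows, err := db.Query(`SELECT job_id, job_type, status FROM crdb_internal.jobs LIMIT 10`)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	for rows.Next() {
		var id int64
		var typ, status string
		if err := rows.Scan(&id, &typ, &status); err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%d\t%s\t%s\n", id, typ, status)
	}
}
```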

Some missing job statuses were also added as request options to
the 'Status' dropdown, including:
- Pause Requested
- Cancel Requested
- Revert Failed

Fixes: #95712

Release note (ui change): Retrying is no longer a status shown
in the jobs page.

<img width="1326" alt="image" src="https://user-images.githubusercontent.com/20136951/220738075-733b0cc8-9f77-4ace-a944-3791ff159c62.png">


Co-authored-by: Michael Butler <[email protected]>
Co-authored-by: Xin Hao Zhang <[email protected]>
3 people committed Feb 27, 2023
3 parents 46120ff + 4610bc6 + 2eca521 commit f1a4c63
Showing 9 changed files with 297 additions and 260 deletions.
301 changes: 187 additions & 114 deletions pkg/cmd/roachtest/tests/cluster_to_cluster.go

Large diffs are not rendered by default.

24 changes: 17 additions & 7 deletions pkg/cmd/roachtest/tests/multitenant_utils.go
@@ -313,6 +313,8 @@ func createInMemoryTenant(
sysSQL.Exec(t, "CREATE TENANT $1", tenantName)
sysSQL.Exec(t, "ALTER TENANT $1 START SERVICE SHARED", tenantName)

removeTenantRateLimiters(t, sysSQL, tenantName)

// Opening a SQL session to a newly created in-process tenant may require a
// few retries. Unfortunately, the c.ConnE and MakeSQLRunner APIs do not make
// it clear if they eagerly open a session with the tenant or wait until the
@@ -331,14 +333,22 @@ func createInMemoryTenant(
return nil
})

// Currently, a tenant has by default a 10m RU burst limit, which can be
// reached during these tests. To prevent RU limit throttling, add 10B RUs to
// the tenant.
var tenantID int
sysSQL.QueryRow(t, `SELECT id FROM [SHOW TENANT $1]`, tenantName).Scan(&tenantID)
sysSQL.Exec(t, `SELECT crdb_internal.update_tenant_resource_limits($1, 10000000000, 0,
10000000000, now(), 0);`, tenantID)
if secure {
createTenantAdminRole(t, tenantName, tenantSQL)
}
}

// removeTenantRateLimiters ensures the tenant is not throttled by limiters.
func removeTenantRateLimiters(t test.Test, systemSQL *sqlutils.SQLRunner, tenantName string) {
var tenantID int
systemSQL.QueryRow(t, `SELECT id FROM [SHOW TENANT $1]`, tenantName).Scan(&tenantID)
systemSQL.Exec(t, `SELECT crdb_internal.update_tenant_resource_limits($1, 10000000000, 0,
10000000000, now(), 0);`, tenantID)
systemSQL.ExecMultiple(t,
`SET CLUSTER SETTING kv.tenant_rate_limiter.burst_limit_seconds = 10000;`,
`SET CLUSTER SETTING kv.tenant_rate_limiter.rate_limit = -1000; `,
`SET CLUSTER SETTING kv.tenant_rate_limiter.read_batch_cost = 0;`,
`SET CLUSTER SETTING kv.tenant_rate_limiter.read_cost_per_mebibyte = 0;`,
`SET CLUSTER SETTING kv.tenant_rate_limiter.write_cost_per_megabyte = 0;`,
`SET CLUSTER SETTING kv.tenant_rate_limiter.write_request_cost = 0;`)
}
51 changes: 30 additions & 21 deletions pkg/server/admin.go
@@ -2245,40 +2245,49 @@ func jobsHelper(
cfg *BaseConfig,
sv *settings.Values,
) (_ *serverpb.JobsResponse, retErr error) {
retryRunningCondition := "status='running' AND next_run > now() AND num_runs > 1"
retryRevertingCondition := "status='reverting' AND next_run > now() AND num_runs > 1"

q := makeSQLQuery()
q.Append(`
SELECT job_id, job_type, description, statement, user_name, descriptor_ids,
case
when ` + retryRunningCondition + ` then 'retry-running'
when ` + retryRevertingCondition + ` then 'retry-reverting'
else status
end as status, running_status, created, started, finished, modified, fraction_completed,
high_water_timestamp, error, last_run, next_run, num_runs, execution_events::string, coordinator_id
FROM crdb_internal.jobs
WHERE true
`)
if req.Status == "retrying" {
q.Append(" AND ( ( " + retryRunningCondition + " ) OR ( " + retryRevertingCondition + " ) )")
} else if req.Status != "" {
SELECT
job_id,
job_type,
description,
statement,
user_name,
descriptor_ids,
status,
running_status,
created,
started,
finished,
modified,
fraction_completed,
high_water_timestamp,
error,
last_run,
next_run,
num_runs,
execution_events::string,
coordinator_id
FROM crdb_internal.jobs
WHERE true`) // Simplifies filter construction below.
if req.Status != "" {
q.Append(" AND status = $", req.Status)
}
if req.Type != jobspb.TypeUnspecified {
q.Append(" AND job_type = $", req.Type.String())
} else {
// Don't show automatic jobs in the overview page.
q.Append(" AND (")
q.Append(" AND ( job_type NOT IN (")
for idx, jobType := range jobspb.AutomaticJobTypes {
q.Append("job_type != $", jobType.String())
if idx < len(jobspb.AutomaticJobTypes)-1 {
q.Append(" AND ")
if idx != 0 {
q.Append(", ")
}
q.Append("$", jobType.String())
}
q.Append(" OR job_type IS NULL)")
q.Append(" ) OR job_type IS NULL)")
}
q.Append("ORDER BY created DESC")
q.Append(" ORDER BY created DESC")
if req.Limit > 0 {
q.Append(" LIMIT $", tree.DInt(req.Limit))
}
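
To make the clause assembled by the loop above concrete, here is a small sketch of the filter the builder is expected to produce for the default request (no explicit job type). `makeSQLQuery` is internal to the server package, so this uses a plain `strings.Builder` with literal values instead of `$` placeholders, and the automatic job type names are only illustrative:

```go
package main

import (
	"fmt"
	"strings"
)

// buildTypeFilter mirrors the NOT IN construction in jobsHelper: automatic job
// types are excluded from the overview, while rows with a NULL job type are
// still shown.
func buildTypeFilter(automaticTypes []string) string {
	var b strings.Builder
	b.WriteString(" AND ( job_type NOT IN (")
	for i, t := range automaticTypes {
		if i != 0 {
			b.WriteString(", ")
		}
		fmt.Fprintf(&b, "'%s'", t) // the real code appends $ placeholders instead
	}
	b.WriteString(" ) OR job_type IS NULL)")
	return b.String()
}

func main() {
	// Hypothetical automatic job type names, for illustration only.
	filter := buildTypeFilter([]string{"AUTO CREATE STATS", "AUTO SPAN CONFIG RECONCILIATION"})
	fmt.Println("SELECT ... FROM crdb_internal.jobs WHERE true" + filter + " ORDER BY created DESC")
}
```
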
36 changes: 0 additions & 36 deletions pkg/server/admin_test.go
@@ -1725,11 +1725,6 @@ func TestAdminAPIJobs(t *testing.T) {
append(append([]int64{}, revertingOnlyIds...), retryRevertingIds...),
[]int64{},
},
{
"jobs?status=retrying",
append(append([]int64{}, retryRunningIds...), retryRevertingIds...),
[]int64{},
},
{
"jobs?status=pending",
[]int64{},
@@ -1807,11 +1802,6 @@ func TestAdminAPIJobsDetails(t *testing.T) {
defer s.Stopper().Stop(context.Background())
sqlDB := sqlutils.MakeSQLRunner(conn)

runningOnlyIds := []int64{1, 3, 5}
revertingOnlyIds := []int64{2, 4, 6}
retryRunningIds := []int64{7}
retryRevertingIds := []int64{8}

now := timeutil.Now()

encodedError := func(err error) *errors.EncodedError {
@@ -1891,32 +1881,6 @@ func TestAdminAPIJobsDetails(t *testing.T) {
t.Fatal(err)
}

// test that the select statement correctly converts expected jobs to retry-____ statuses
expectedStatuses := []struct {
status string
ids []int64
}{
{"running", runningOnlyIds},
{"reverting", revertingOnlyIds},
{"retry-running", retryRunningIds},
{"retry-reverting", retryRevertingIds},
}
for _, expected := range expectedStatuses {
var jobsWithStatus []serverpb.JobResponse
for _, job := range res.Jobs {
for _, expectedID := range expected.ids {
if job.ID == expectedID {
jobsWithStatus = append(jobsWithStatus, job)
}
}
}

require.Len(t, jobsWithStatus, len(expected.ids))
for _, job := range jobsWithStatus {
assert.Equal(t, expected.status, job.Status)
}
}

// Trim down our result set to the jobs we injected.
resJobs := append([]serverpb.JobResponse(nil), res.Jobs...)
sort.Slice(resJobs, func(i, j int) bool {
38 changes: 25 additions & 13 deletions pkg/ui/workspaces/cluster-ui/src/jobs/jobsPage/jobsPage.tsx
@@ -26,7 +26,14 @@ import { Pagination, ResultsPerPageLabel } from "src/pagination";
import { isSelectedColumn } from "src/columnsSelector/utils";
import { DATE_FORMAT_24_UTC, syncHistory, TimestampToMoment } from "src/util";
import { jobsColumnLabels, JobsTable, makeJobsColumns } from "./jobsTable";
import { showOptions, statusOptions, typeOptions } from "../util";
import {
showOptions,
statusOptions,
typeOptions,
isValidJobStatus,
defaultRequestOptions,
isValidJobType,
} from "../util";

import { commonStyles } from "src/common";
import sortableTableStyles from "src/sortedtable/sortedtable.module.scss";
@@ -108,8 +115,8 @@ export class JobsPage extends React.Component<JobsPageProps, PageState> {
}

// Filter Status.
const status = searchParams.get("status") || undefined;
if (this.props.setStatus && status && status != this.props.status) {
const status = searchParams.get("status");
if (this.props.setStatus && status && status !== this.props.status) {
this.props.setStatus(status);
}

@@ -145,6 +152,17 @@ export class JobsPage extends React.Component<JobsPageProps, PageState> {
}

componentDidUpdate(prevProps: JobsPageProps): void {
// Because we removed the retrying status, we add this check
// just in case there exists an app that attempts to load a non-existent
// status.
if (!isValidJobStatus(this.props.status)) {
this.onStatusSelected(defaultRequestOptions.status);
}

if (!isValidJobType(this.props.type)) {
this.onTypeSelected(defaultRequestOptions.type.toString());
}

if (
prevProps.lastUpdated !== this.props.lastUpdated ||
prevProps.show !== this.props.show ||
@@ -273,27 +291,21 @@ export class JobsPage extends React.Component<JobsPageProps, PageState> {
<PageConfigItem>
<Dropdown items={statusOptions} onChange={this.onStatusSelected}>
Status:{" "}
{
statusOptions.find(option => option["value"] === status)[
"name"
]
}
{statusOptions.find(option => option.value === status)?.name}
</Dropdown>
</PageConfigItem>
<PageConfigItem>
<Dropdown items={typeOptions} onChange={this.onTypeSelected}>
Type:{" "}
{
typeOptions.find(
option => option["value"] === type.toString(),
)["name"]
typeOptions.find(option => option.value === type.toString())
?.name
}
</Dropdown>
</PageConfigItem>
<PageConfigItem>
<Dropdown items={showOptions} onChange={this.onShowSelected}>
Show:{" "}
{showOptions.find(option => option["value"] === show)["name"]}
Show: {showOptions.find(option => option.value === show)?.name}
</Dropdown>
</PageConfigItem>
</PageConfig>
58 changes: 27 additions & 31 deletions pkg/ui/workspaces/cluster-ui/src/jobs/util/jobOptions.tsx
@@ -33,12 +33,8 @@ export function jobToVisual(job: Job): JobStatusVisual {
return JobStatusVisual.BadgeWithErrorMessage;
case JOB_STATUS_RUNNING:
return JobStatusVisual.ProgressBarWithDuration;
case JOB_STATUS_RETRY_RUNNING:
return JobStatusVisual.ProgressBarWithDuration;
case JOB_STATUS_PENDING:
return JobStatusVisual.BadgeWithMessage;
case JOB_STATUS_RETRY_REVERTING:
return JobStatusVisual.BadgeWithRetrying;
case JOB_STATUS_CANCELED:
case JOB_STATUS_CANCEL_REQUESTED:
case JOB_STATUS_PAUSED:
@@ -59,36 +55,43 @@ export const JOB_STATUS_CANCEL_REQUESTED = "cancel-requested";
export const JOB_STATUS_PAUSED = "paused";
export const JOB_STATUS_PAUSE_REQUESTED = "paused-requested";
export const JOB_STATUS_RUNNING = "running";
export const JOB_STATUS_RETRY_RUNNING = "retry-running";
export const JOB_STATUS_PENDING = "pending";
export const JOB_STATUS_REVERTING = "reverting";
export const JOB_STATUS_REVERT_FAILED = "revert-failed";
export const JOB_STATUS_RETRY_REVERTING = "retry-reverting";

export function isRetrying(status: string): boolean {
return [JOB_STATUS_RETRY_RUNNING, JOB_STATUS_RETRY_REVERTING].includes(
status,
);
}
export function isRunning(status: string): boolean {
return [JOB_STATUS_RUNNING, JOB_STATUS_RETRY_RUNNING].includes(status);
return [JOB_STATUS_RUNNING, JOB_STATUS_REVERTING].some(s =>
status.includes(s),
);
}
export function isTerminalState(status: string): boolean {
return [JOB_STATUS_SUCCEEDED, JOB_STATUS_FAILED].includes(status);
}

export const statusOptions = [
{ value: "", name: "All" },
{ value: "succeeded", name: "Succeeded" },
{ value: "failed", name: "Failed" },
{ value: "paused", name: "Paused" },
{ value: "canceled", name: "Canceled" },
{ value: "running", name: "Running" },
{ value: "pending", name: "Pending" },
{ value: "reverting", name: "Reverting" },
{ value: "retrying", name: "Retrying" },
{ value: JOB_STATUS_SUCCEEDED, name: "Succeeded" },
{ value: JOB_STATUS_FAILED, name: "Failed" },
{ value: JOB_STATUS_PAUSED, name: "Paused" },
{ value: JOB_STATUS_PAUSE_REQUESTED, name: "Pause Requested" },
{ value: JOB_STATUS_CANCELED, name: "Canceled" },
{ value: JOB_STATUS_CANCEL_REQUESTED, name: "Cancel Requested" },
{ value: JOB_STATUS_RUNNING, name: "Running" },
{ value: JOB_STATUS_PENDING, name: "Pending" },
{ value: JOB_STATUS_REVERTING, name: "Reverting" },
{ value: JOB_STATUS_REVERT_FAILED, name: "Revert Failed" },
];

const ALL_JOB_STATUSES = new Set(statusOptions.map(option => option.value));

/**
* @param jobStatus job status - any string
* @returns Returns true if the job status string is a valid status.
*/
export function isValidJobStatus(jobStatus: string): boolean {
return ALL_JOB_STATUSES.has(jobStatus);
}

export function jobHasOneOfStatuses(job: Job, ...statuses: string[]): boolean {
return statuses.indexOf(job.status) !== -1;
}
Expand All @@ -110,21 +113,10 @@ export const jobStatusToBadgeStatus = (status: string): BadgeStatus => {
case JOB_STATUS_PAUSED:
case JOB_STATUS_PAUSE_REQUESTED:
case JOB_STATUS_REVERTING:
case JOB_STATUS_RETRY_REVERTING:
default:
return "default";
}
};
export const jobStatusToBadgeText = (status: string): string => {
switch (status) {
case JOB_STATUS_RETRY_REVERTING:
return JOB_STATUS_REVERTING;
case JOB_STATUS_RETRY_RUNNING:
return JOB_STATUS_RUNNING;
default:
return status;
}
};

const jobTypeKeys = Object.keys(JobType);

@@ -216,6 +208,10 @@ export const typeOptions = [
},
];

export function isValidJobType(jobType: number): boolean {
return jobType >= 0 && jobType < jobTypeKeys.length;
}

export const showOptions = [
{ value: "50", name: "Latest 50" },
{ value: "0", name: "All" },
4 changes: 1 addition & 3 deletions pkg/ui/workspaces/cluster-ui/src/jobs/util/jobStatus.tsx
@@ -13,7 +13,7 @@ import classNames from "classnames/bind";
import React from "react";

import { Duration } from "./duration";
import { JobStatusVisual, isRetrying, jobToVisual } from "./jobOptions";
import { JobStatusVisual, jobToVisual } from "./jobOptions";
import {
JobStatusBadge,
ProgressBar,
@@ -54,7 +54,6 @@ export const JobStatus: React.FC<JobStatusProps> = ({
</div>
);
case JobStatusVisual.ProgressBarWithDuration: {
const jobIsRetrying = isRetrying(job.status);
return (
<div>
<ProgressBar
@@ -63,7 +62,6 @@
showPercentage={true}
/>
<Duration job={job} className={cx("jobs-table__duration")} />
{jobIsRetrying && <RetryingStatusBadge />}
{job.running_status && (
<div className={cx("jobs-table__running-status")}>
{job.running_status}