Fix Graphs & Configure Infrastructure (#1023)
* Rework Inference Metrics & Add Runtime Metrics (invalid queries)

* Add stacked line chart functionality
andrewballantyne authored Mar 27, 2023
1 parent 8741023 commit d9e950c
Showing 18 changed files with 504 additions and 186 deletions.
79 changes: 54 additions & 25 deletions frontend/src/api/prometheus/serving.ts
@@ -1,42 +1,62 @@
 import * as React from 'react';
 import { ContextResourceData, PrometheusQueryRangeResultValue } from '~/types';
-import { ModelServingMetricType } from '~/pages/modelServing/screens/metrics/ModelServingMetricsContext';
-import { TimeframeTitle } from '~/pages/modelServing/screens/types';
+import {
+  InferenceMetricType,
+  RuntimeMetricType,
+} from '~/pages/modelServing/screens/metrics/ModelServingMetricsContext';
+import { MetricType, TimeframeTitle } from '~/pages/modelServing/screens/types';
 import useQueryRangeResourceData from './useQueryRangeResourceData';
 
 export const useModelServingMetrics = (
-  queries: Record<ModelServingMetricType, string>,
+  type: MetricType,
+  queries: Record<RuntimeMetricType, string> | Record<InferenceMetricType, string>,
   timeframe: TimeframeTitle,
   lastUpdateTime: number,
   setLastUpdateTime: (time: number) => void,
 ): {
-  data: Record<ModelServingMetricType, ContextResourceData<PrometheusQueryRangeResultValue>>;
+  data: Record<RuntimeMetricType, ContextResourceData<PrometheusQueryRangeResultValue>>;
   refresh: () => void;
 } => {
   const [end, setEnd] = React.useState(lastUpdateTime);
 
-  const endpointHealth = useQueryRangeResourceData(
-    queries[ModelServingMetricType.ENDPOINT_HEALTH],
+  const runtimeRequestCount = useQueryRangeResourceData(
+    type === 'runtime',
+    queries[RuntimeMetricType.REQUEST_COUNT],
     end,
     timeframe,
   );
-  const inferencePerformance = useQueryRangeResourceData(
-    queries[ModelServingMetricType.INFERENCE_PERFORMANCE],
+
+  const runtimeAverageResponseTime = useQueryRangeResourceData(
+    type === 'runtime',
+    queries[RuntimeMetricType.AVG_RESPONSE_TIME],
     end,
     timeframe,
   );
+
+  const runtimeCPUUtilization = useQueryRangeResourceData(
+    type === 'runtime',
+    queries[RuntimeMetricType.CPU_UTILIZATION],
+    end,
+    timeframe,
+  );
-  const averageResponseTime = useQueryRangeResourceData(
-    queries[ModelServingMetricType.AVG_RESPONSE_TIME],
+
+  const runtimeMemoryUtilization = useQueryRangeResourceData(
+    type === 'runtime',
+    queries[RuntimeMetricType.MEMORY_UTILIZATION],
     end,
     timeframe,
   );
-  const requestCount = useQueryRangeResourceData(
-    queries[ModelServingMetricType.REQUEST_COUNT],
+
+  const inferenceRequestSuccessCount = useQueryRangeResourceData(
+    type === 'inference',
+    queries[InferenceMetricType.REQUEST_COUNT_SUCCESS],
     end,
     timeframe,
   );
-  const failedRequestCount = useQueryRangeResourceData(
-    queries[ModelServingMetricType.FAILED_REQUEST_COUNT],
+
+  const inferenceRequestFailedCount = useQueryRangeResourceData(
+    type === 'inference',
+    queries[InferenceMetricType.REQUEST_COUNT_FAILED],
     end,
     timeframe,
   );
@@ -45,7 +65,14 @@ export const useModelServingMetrics = (
     setLastUpdateTime(Date.now());
     // re-compute lastUpdateTime when data changes
     // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [endpointHealth, inferencePerformance, averageResponseTime, requestCount, failedRequestCount]);
+  }, [
+    runtimeRequestCount,
+    runtimeAverageResponseTime,
+    runtimeCPUUtilization,
+    runtimeMemoryUtilization,
+    inferenceRequestSuccessCount,
+    inferenceRequestFailedCount,
+  ]);
 
   const refreshAllMetrics = React.useCallback(() => {
     setEnd(Date.now());
@@ -54,20 +81,22 @@ export const useModelServingMetrics = (
   return React.useMemo(
     () => ({
       data: {
-        [ModelServingMetricType.ENDPOINT_HEALTH]: endpointHealth,
-        [ModelServingMetricType.INFERENCE_PERFORMANCE]: inferencePerformance,
-        [ModelServingMetricType.AVG_RESPONSE_TIME]: averageResponseTime,
-        [ModelServingMetricType.REQUEST_COUNT]: requestCount,
-        [ModelServingMetricType.FAILED_REQUEST_COUNT]: failedRequestCount,
+        [RuntimeMetricType.REQUEST_COUNT]: runtimeRequestCount,
+        [RuntimeMetricType.AVG_RESPONSE_TIME]: runtimeAverageResponseTime,
+        [RuntimeMetricType.CPU_UTILIZATION]: runtimeCPUUtilization,
+        [RuntimeMetricType.MEMORY_UTILIZATION]: runtimeMemoryUtilization,
+        [InferenceMetricType.REQUEST_COUNT_SUCCESS]: inferenceRequestSuccessCount,
+        [InferenceMetricType.REQUEST_COUNT_FAILED]: inferenceRequestFailedCount,
       },
       refresh: refreshAllMetrics,
     }),
     [
-      endpointHealth,
-      inferencePerformance,
-      averageResponseTime,
-      requestCount,
-      failedRequestCount,
+      runtimeRequestCount,
+      runtimeAverageResponseTime,
+      runtimeCPUUtilization,
+      runtimeMemoryUtilization,
+      inferenceRequestSuccessCount,
+      inferenceRequestFailedCount,
       refreshAllMetrics,
     ],
   );
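
For orientation, here is a minimal sketch of how a caller would use the reworked hook. This is hypothetical usage, not code from this commit -- it assumes MetricType.INFERENCE maps to the 'inference' string compared against above, and that the queries record is built elsewhere:

// Hypothetical caller -- a sketch of the new signature only.
import * as React from 'react';
import { useModelServingMetrics } from '~/api/prometheus/serving';
import { InferenceMetricType } from '~/pages/modelServing/screens/metrics/ModelServingMetricsContext';
import { MetricType, TimeframeTitle } from '~/pages/modelServing/screens/types';

const useInferenceMetricsExample = (queries: Record<InferenceMetricType, string>) => {
  const [lastUpdateTime, setLastUpdateTime] = React.useState(Date.now());
  // Passing MetricType.INFERENCE leaves the four runtime-gated queries inactive.
  return useModelServingMetrics(
    MetricType.INFERENCE,
    queries,
    TimeframeTitle.ONE_DAY,
    lastUpdateTime,
    setLastUpdateTime,
  );
};
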
11 changes: 9 additions & 2 deletions frontend/src/api/prometheus/usePrometheusQueryRange.ts
@@ -3,6 +3,7 @@ import axios from 'axios';
 import { PrometheusQueryRangeResponse, PrometheusQueryRangeResultValue } from '~/types';
 
 const usePrometheusQueryRange = (
+  active: boolean,
   apiPath: string,
   queryLang: string,
   span: number,
@@ -22,9 +23,15 @@ const usePrometheusQueryRange = (
     const endInS = endInMs / 1000;
     const start = endInS - span;
 
+    if (!active) {
+      // Save us the call & data storage -- if it's not active, we don't need to fetch.
+      // If we are already loaded & have data, it's okay -- it can be stale data to quickly show
+      // if the associated graph renders.
+      return;
+    }
     axios
       .post<{ response: PrometheusQueryRangeResponse }>(apiPath, {
-        query: `${queryLang}&start=${start}&end=${endInS}&step=${step}`,
+        query: `query=${queryLang}&start=${start}&end=${endInS}&step=${step}`,
       })
       .then((response) => {
         const result = response.data?.response.data.result?.[0]?.values || [];
@@ -36,7 +43,7 @@ const usePrometheusQueryRange = (
         setError(e);
         setLoaded(true);
       });
-  }, [queryLang, apiPath, span, endInMs, step]);
+  }, [endInMs, span, active, apiPath, queryLang, step]);
 
   React.useEffect(() => {
     fetchData();
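
The one-line change to the POST body is the "invalid queries" bug called out in the commit message: the request previously serialized without the query= key, which the Prometheus query_range API rejects. With illustrative values (the PromQL below is an assumption, not a query from this repo):

// Illustrative only -- how the corrected string is assembled.
const queryLang = 'sum(rate(http_requests_total[5m]))'; // hypothetical PromQL
const endInS = 1700000000;
const span = 60 * 60; // TimeframeTimeRange[TimeframeTitle.ONE_HOUR]
const step = 12; // TimeframeStep[TimeframeTitle.ONE_HOUR]
const start = endInS - span;
// Before: 'sum(rate(...))&start=...' -- no query= key
// After:  'query=sum(rate(http_requests_total[5m]))&start=1699996400&end=1700000000&step=12'
const query = `query=${queryLang}&start=${start}&end=${endInS}&step=${step}`;
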
7 changes: 5 additions & 2 deletions frontend/src/api/prometheus/useQueryRangeResourceData.ts
@@ -1,19 +1,22 @@
-import { TimeframeStep, TimeframeTime } from '~/pages/modelServing/screens/const';
+import { TimeframeStep, TimeframeTimeRange } from '~/pages/modelServing/screens/const';
 import { TimeframeTitle } from '~/pages/modelServing/screens/types';
 import { ContextResourceData, PrometheusQueryRangeResultValue } from '~/types';
 import { useContextResourceData } from '~/utilities/useContextResourceData';
 import usePrometheusQueryRange from './usePrometheusQueryRange';
 
 const useQueryRangeResourceData = (
+  /** Is the query active -- should we be fetching? */
+  active: boolean,
   query: string,
   end: number,
   timeframe: TimeframeTitle,
 ): ContextResourceData<PrometheusQueryRangeResultValue> =>
   useContextResourceData<PrometheusQueryRangeResultValue>(
     usePrometheusQueryRange(
+      active,
       '/api/prometheus/serving',
       query,
-      TimeframeTime[timeframe],
+      TimeframeTimeRange[timeframe],
       end,
       TimeframeStep[timeframe],
     ),
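
Threading active through as a boolean, rather than skipping the call when a query does not apply, is deliberate: React's rules of hooks require every hook to run unconditionally on each render, so serving.ts cannot simply wrap its six useQueryRangeResourceData calls in if statements. A sketch of the disallowed alternative:

// Not viable -- conditional hook calls change the hook order between renders:
// const data = type === 'runtime' ? useQueryRangeResourceData(query, end, timeframe) : undefined;
// The `active` flag keeps the call unconditional and short-circuits the fetch internally instead.
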
5 changes: 3 additions & 2 deletions frontend/src/pages/modelServing/ModelServingRoutes.tsx
@@ -1,7 +1,7 @@
 import * as React from 'react';
 import { Navigate, Routes, Route } from 'react-router-dom';
 import ModelServingContextProvider from './ModelServingContext';
-import ModelServingMetricsWrapper from './screens/metrics/ModelServingMetricsWrapper';
+import GlobalInferenceMetricsWrapper from './screens/metrics/GlobalInferenceMetricsWrapper';
 import ModelServingGlobal from './screens/global/ModelServingGlobal';
 import useModelMetricsEnabled from './useModelMetricsEnabled';
 
@@ -15,9 +15,10 @@ const ModelServingRoutes: React.FC = () => {
         <Route
           path="/metrics/:project/:inferenceService"
           element={
-            modelMetricsEnabled ? <ModelServingMetricsWrapper /> : <Navigate replace to="/" />
+            modelMetricsEnabled ? <GlobalInferenceMetricsWrapper /> : <Navigate replace to="/" />
           }
         />
+        {/* TODO: Global Runtime metrics?? */}
         <Route path="*" element={<Navigate to="." />} />
       </Route>
     </Routes>
22 changes: 17 additions & 5 deletions frontend/src/pages/modelServing/screens/const.ts
@@ -122,18 +122,30 @@ export const DEFAULT_MODEL_SERVING_TEMPLATE: ServingRuntimeKind = {
   },
 };
 
-// unit: seconds
-export const TimeframeTime: TimeframeTimeType = {
-  [TimeframeTitle.FIVE_MINUTES]: 5 * 60,
+/**
+ * The desired range (x-axis) of the charts.
+ * Unit is in seconds
+ */
+export const TimeframeTimeRange: TimeframeTimeType = {
   [TimeframeTitle.ONE_HOUR]: 60 * 60,
   [TimeframeTitle.ONE_DAY]: 24 * 60 * 60,
   [TimeframeTitle.ONE_WEEK]: 7 * 24 * 60 * 60,
+  [TimeframeTitle.ONE_MONTH]: 30 * 7 * 24 * 60 * 60,
+  // [TimeframeTitle.UNLIMITED]: 0,
 };
 
-// make sure we always get ~300 data points
+/**
+ * How large a step is -- the value is how many seconds to combine to create an individual data response.
+ * Each should be getting ~300 data points (assuming data fills the gap)
+ *
+ * eg. [TimeframeTitle.ONE_DAY]: 24 * 12,
+ * 24h * 60m * 60s => 86,400 seconds of space
+ * 86,400 / (24 * 12) => 300 points of prometheus data
+ */
 export const TimeframeStep: TimeframeStepType = {
-  [TimeframeTitle.FIVE_MINUTES]: 1,
   [TimeframeTitle.ONE_HOUR]: 12,
   [TimeframeTitle.ONE_DAY]: 24 * 12,
   [TimeframeTitle.ONE_WEEK]: 7 * 24 * 12,
+  [TimeframeTitle.ONE_MONTH]: 30 * 7 * 24 * 12,
+  // [TimeframeTitle.UNLIMITED]: 30 * 7 * 24 * 12, // TODO: determine if we "zoom out" more
 };
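
Dividing each range by its step confirms the ~300-point target holds for every timeframe (a quick sanity check, not code from the commit):

// points per timeframe = TimeframeTimeRange[t] / TimeframeStep[t]
// ONE_HOUR:  (60 * 60)               / 12                 = 300
// ONE_DAY:   (24 * 60 * 60)          / (24 * 12)          = 300
// ONE_WEEK:  (7 * 24 * 60 * 60)      / (7 * 24 * 12)      = 300
// ONE_MONTH: (30 * 7 * 24 * 60 * 60) / (30 * 7 * 24 * 12) = 300
const pointsFor = (t: TimeframeTitle): number => TimeframeTimeRange[t] / TimeframeStep[t];
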
frontend/src/pages/modelServing/screens/metrics/{ModelServingMetricsWrapper.tsx → GlobalInferenceMetricsWrapper.tsx}
@@ -4,11 +4,13 @@ import { Bullseye, Spinner } from '@patternfly/react-core';
 import NotFound from '~/pages/NotFound';
 import { ModelServingContext } from '~/pages/modelServing/ModelServingContext';
 import { getInferenceServiceDisplayName } from '~/pages/modelServing/screens/global/utils';
+import InferenceGraphs from '~/pages/modelServing/screens/metrics/InferenceGraphs';
+import { MetricType } from '~/pages/modelServing/screens/types';
 import { ModelServingMetricsProvider } from './ModelServingMetricsContext';
 import MetricsPage from './MetricsPage';
 import { getInferenceServiceMetricsQueries } from './utils';
 
-const ModelServingMetricsWrapper: React.FC = () => {
+const GlobalInferenceMetricsWrapper: React.FC = () => {
   const { project: projectName, inferenceService: modelName } = useParams<{
     project: string;
     inferenceService: string;
@@ -33,7 +35,7 @@ const ModelServingMetricsWrapper: React.FC = () => {
   const modelDisplayName = getInferenceServiceDisplayName(inferenceService);
 
   return (
-    <ModelServingMetricsProvider queries={queries}>
+    <ModelServingMetricsProvider queries={queries} type={MetricType.INFERENCE}>
       <MetricsPage
         title={`${modelDisplayName} metrics`}
         breadcrumbItems={[
@@ -43,9 +45,11 @@ const ModelServingMetricsWrapper: React.FC = () => {
             isActive: true,
           },
         ]}
-      />
+      >
+        <InferenceGraphs />
+      </MetricsPage>
     </ModelServingMetricsProvider>
   );
 };
 
-export default ModelServingMetricsWrapper;
+export default GlobalInferenceMetricsWrapper;
45 changes: 45 additions & 0 deletions frontend/src/pages/modelServing/screens/metrics/InferenceGraphs.tsx
@@ -0,0 +1,45 @@
+import * as React from 'react';
+import { Stack, StackItem } from '@patternfly/react-core';
+import MetricsChart from '~/pages/modelServing/screens/metrics/MetricsChart';
+import {
+  InferenceMetricType,
+  ModelServingMetricsContext,
+} from '~/pages/modelServing/screens/metrics/ModelServingMetricsContext';
+import { TimeframeTitle } from '~/pages/modelServing/screens/types';
+import { per100 } from './utils';
+
+const InferenceGraphs: React.FC = () => {
+  const { data, currentTimeframe } = React.useContext(ModelServingMetricsContext);
+
+  const inHours =
+    currentTimeframe === TimeframeTitle.ONE_HOUR || currentTimeframe === TimeframeTitle.ONE_DAY;
+
+  return (
+    <Stack hasGutter>
+      <StackItem>
+        <MetricsChart
+          metrics={[
+            {
+              name: 'Success http requests (x100)',
+              metric: data[InferenceMetricType.REQUEST_COUNT_SUCCESS],
+              translatePoint: per100,
+            },
+            {
+              name: 'Failed http requests (x100)',
+              metric: data[InferenceMetricType.REQUEST_COUNT_FAILED],
+              translatePoint: (point) => {
+                // TODO: remove when real values are used
+                const newPoint = per100(point);
+                const y = Math.floor(newPoint.y / (Math.floor(Math.random() * 2) + 2));
+                return { ...newPoint, y };
+              },
+            },
+          ]}
+          title={`Http requests per ${inHours ? 'hour' : 'day'} (x100)`}
+        />
+      </StackItem>
+    </Stack>
+  );
+};
+
+export default InferenceGraphs;
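
per100 is imported from ./utils, which is outside the hunks shown here. Given the "(x100)" axis labels, a plausible shape -- an assumption, not the file's actual implementation -- is a point translator that rescales y into hundreds of requests:

// Assumed shape of per100 (hypothetical; the real helper lives in ./utils).
type GraphPoint = { x: number; y: number };
const per100 = (point: GraphPoint): GraphPoint => ({ ...point, y: point.y / 100 });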