Fix Graphs & Configure Infrastructure (#1023)
* Rework Inference Metrics & Add Runtime Metrics (invalid queries)

* Add stacked line chart functionality
andrewballantyne authored Mar 27, 2023
1 parent 8741023 commit d9e950c
Showing 18 changed files with 504 additions and 186 deletions.
79 changes: 54 additions & 25 deletions frontend/src/api/prometheus/serving.ts
@@ -1,42 +1,62 @@
 import * as React from 'react';
 import { ContextResourceData, PrometheusQueryRangeResultValue } from '~/types';
-import { ModelServingMetricType } from '~/pages/modelServing/screens/metrics/ModelServingMetricsContext';
-import { TimeframeTitle } from '~/pages/modelServing/screens/types';
+import {
+  InferenceMetricType,
+  RuntimeMetricType,
+} from '~/pages/modelServing/screens/metrics/ModelServingMetricsContext';
+import { MetricType, TimeframeTitle } from '~/pages/modelServing/screens/types';
 import useQueryRangeResourceData from './useQueryRangeResourceData';
 
 export const useModelServingMetrics = (
-  queries: Record<ModelServingMetricType, string>,
+  type: MetricType,
+  queries: Record<RuntimeMetricType, string> | Record<InferenceMetricType, string>,
   timeframe: TimeframeTitle,
   lastUpdateTime: number,
   setLastUpdateTime: (time: number) => void,
 ): {
-  data: Record<ModelServingMetricType, ContextResourceData<PrometheusQueryRangeResultValue>>;
+  data: Record<RuntimeMetricType, ContextResourceData<PrometheusQueryRangeResultValue>>;
   refresh: () => void;
 } => {
   const [end, setEnd] = React.useState(lastUpdateTime);
 
-  const endpointHealth = useQueryRangeResourceData(
-    queries[ModelServingMetricType.ENDPOINT_HEALTH],
+  const runtimeRequestCount = useQueryRangeResourceData(
+    type === 'runtime',
+    queries[RuntimeMetricType.REQUEST_COUNT],
     end,
     timeframe,
   );
-  const inferencePerformance = useQueryRangeResourceData(
-    queries[ModelServingMetricType.INFERENCE_PERFORMANCE],
+
+  const runtimeAverageResponseTime = useQueryRangeResourceData(
+    type === 'runtime',
+    queries[RuntimeMetricType.AVG_RESPONSE_TIME],
     end,
     timeframe,
   );
+
+  const runtimeCPUUtilization = useQueryRangeResourceData(
+    type === 'runtime',
+    queries[RuntimeMetricType.CPU_UTILIZATION],
+    end,
+    timeframe,
+  );
-  const averageResponseTime = useQueryRangeResourceData(
-    queries[ModelServingMetricType.AVG_RESPONSE_TIME],
+
+  const runtimeMemoryUtilization = useQueryRangeResourceData(
+    type === 'runtime',
+    queries[RuntimeMetricType.MEMORY_UTILIZATION],
     end,
     timeframe,
   );
-  const requestCount = useQueryRangeResourceData(
-    queries[ModelServingMetricType.REQUEST_COUNT],
+
+  const inferenceRequestSuccessCount = useQueryRangeResourceData(
+    type === 'inference',
+    queries[InferenceMetricType.REQUEST_COUNT_SUCCESS],
     end,
     timeframe,
   );
-  const failedRequestCount = useQueryRangeResourceData(
-    queries[ModelServingMetricType.FAILED_REQUEST_COUNT],
+
+  const inferenceRequestFailedCount = useQueryRangeResourceData(
+    type === 'inference',
+    queries[InferenceMetricType.REQUEST_COUNT_FAILED],
     end,
     timeframe,
   );
@@ -45,7 +65,14 @@ export const useModelServingMetrics = (
     setLastUpdateTime(Date.now());
     // re-compute lastUpdateTime when data changes
     // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [endpointHealth, inferencePerformance, averageResponseTime, requestCount, failedRequestCount]);
+  }, [
+    runtimeRequestCount,
+    runtimeAverageResponseTime,
+    runtimeCPUUtilization,
+    runtimeMemoryUtilization,
+    inferenceRequestSuccessCount,
+    inferenceRequestFailedCount,
+  ]);
 
   const refreshAllMetrics = React.useCallback(() => {
     setEnd(Date.now());
@@ -54,20 +81,22 @@ export const useModelServingMetrics = (
   return React.useMemo(
     () => ({
       data: {
-        [ModelServingMetricType.ENDPOINT_HEALTH]: endpointHealth,
-        [ModelServingMetricType.INFERENCE_PERFORMANCE]: inferencePerformance,
-        [ModelServingMetricType.AVG_RESPONSE_TIME]: averageResponseTime,
-        [ModelServingMetricType.REQUEST_COUNT]: requestCount,
-        [ModelServingMetricType.FAILED_REQUEST_COUNT]: failedRequestCount,
+        [RuntimeMetricType.REQUEST_COUNT]: runtimeRequestCount,
+        [RuntimeMetricType.AVG_RESPONSE_TIME]: runtimeAverageResponseTime,
+        [RuntimeMetricType.CPU_UTILIZATION]: runtimeCPUUtilization,
+        [RuntimeMetricType.MEMORY_UTILIZATION]: runtimeMemoryUtilization,
+        [InferenceMetricType.REQUEST_COUNT_SUCCESS]: inferenceRequestSuccessCount,
+        [InferenceMetricType.REQUEST_COUNT_FAILED]: inferenceRequestFailedCount,
       },
       refresh: refreshAllMetrics,
     }),
     [
-      endpointHealth,
-      inferencePerformance,
-      averageResponseTime,
-      requestCount,
-      failedRequestCount,
+      runtimeRequestCount,
+      runtimeAverageResponseTime,
+      runtimeCPUUtilization,
+      runtimeMemoryUtilization,
+      inferenceRequestSuccessCount,
+      inferenceRequestFailedCount,
       refreshAllMetrics,
     ],
   );
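
For orientation, here is a minimal sketch of how a caller would use the reworked hook. This is hypothetical usage, not code from this commit -- it assumes MetricType.INFERENCE maps to the 'inference' string compared against above, and that the queries record is built elsewhere:

// Hypothetical caller -- a sketch of the new signature only.
import * as React from 'react';
import { useModelServingMetrics } from '~/api/prometheus/serving';
import { InferenceMetricType } from '~/pages/modelServing/screens/metrics/ModelServingMetricsContext';
import { MetricType, TimeframeTitle } from '~/pages/modelServing/screens/types';

const useInferenceMetricsExample = (queries: Record<InferenceMetricType, string>) => {
  const [lastUpdateTime, setLastUpdateTime] = React.useState(Date.now());
  // Passing MetricType.INFERENCE leaves the four runtime-gated queries inactive.
  return useModelServingMetrics(
    MetricType.INFERENCE,
    queries,
    TimeframeTitle.ONE_DAY,
    lastUpdateTime,
    setLastUpdateTime,
  );
};
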
11 changes: 9 additions & 2 deletions frontend/src/api/prometheus/usePrometheusQueryRange.ts
@@ -3,6 +3,7 @@ import axios from 'axios';
 import { PrometheusQueryRangeResponse, PrometheusQueryRangeResultValue } from '~/types';
 
 const usePrometheusQueryRange = (
+  active: boolean,
   apiPath: string,
   queryLang: string,
   span: number,
@@ -22,9 +23,15 @@ const usePrometheusQueryRange = (
     const endInS = endInMs / 1000;
     const start = endInS - span;
 
+    if (!active) {
+      // Save us the call & data storage -- if it's not active, we don't need to fetch.
+      // If we are already loaded & have data, it's okay -- it can be stale data to quickly show
+      // if the associated graph renders.
+      return;
+    }
     axios
       .post<{ response: PrometheusQueryRangeResponse }>(apiPath, {
-        query: `${queryLang}&start=${start}&end=${endInS}&step=${step}`,
+        query: `query=${queryLang}&start=${start}&end=${endInS}&step=${step}`,
       })
       .then((response) => {
         const result = response.data?.response.data.result?.[0]?.values || [];
@@ -36,7 +43,7 @@ const usePrometheusQueryRange = (
         setError(e);
         setLoaded(true);
       });
-  }, [queryLang, apiPath, span, endInMs, step]);
+  }, [endInMs, span, active, apiPath, queryLang, step]);
 
   React.useEffect(() => {
     fetchData();
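
The one-line change to the POST body is the "invalid queries" bug called out in the commit message: the request previously serialized without the query= key, which the Prometheus query_range API rejects. With illustrative values (the PromQL below is an assumption, not a query from this repo):

// Illustrative only -- how the corrected string is assembled.
const queryLang = 'sum(rate(http_requests_total[5m]))'; // hypothetical PromQL
const endInS = 1700000000;
const span = 60 * 60; // TimeframeTimeRange[TimeframeTitle.ONE_HOUR]
const step = 12; // TimeframeStep[TimeframeTitle.ONE_HOUR]
const start = endInS - span;
// Before: 'sum(rate(...))&start=...' -- no query= key
// After:  'query=sum(rate(http_requests_total[5m]))&start=1699996400&end=1700000000&step=12'
const query = `query=${queryLang}&start=${start}&end=${endInS}&step=${step}`;
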
7 changes: 5 additions & 2 deletions frontend/src/api/prometheus/useQueryRangeResourceData.ts
@@ -1,19 +1,22 @@
-import { TimeframeStep, TimeframeTime } from '~/pages/modelServing/screens/const';
+import { TimeframeStep, TimeframeTimeRange } from '~/pages/modelServing/screens/const';
 import { TimeframeTitle } from '~/pages/modelServing/screens/types';
 import { ContextResourceData, PrometheusQueryRangeResultValue } from '~/types';
 import { useContextResourceData } from '~/utilities/useContextResourceData';
 import usePrometheusQueryRange from './usePrometheusQueryRange';
 
 const useQueryRangeResourceData = (
+  /** Is the query active -- should we be fetching? */
+  active: boolean,
   query: string,
   end: number,
   timeframe: TimeframeTitle,
 ): ContextResourceData<PrometheusQueryRangeResultValue> =>
   useContextResourceData<PrometheusQueryRangeResultValue>(
     usePrometheusQueryRange(
+      active,
       '/api/prometheus/serving',
       query,
-      TimeframeTime[timeframe],
+      TimeframeTimeRange[timeframe],
       end,
       TimeframeStep[timeframe],
     ),
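
Threading active through as a boolean, rather than skipping the call when a query does not apply, is deliberate: React's rules of hooks require every hook to run unconditionally on each render, so serving.ts cannot simply wrap its six useQueryRangeResourceData calls in if statements. A sketch of the disallowed alternative:

// Not viable -- conditional hook calls change the hook order between renders:
// const data = type === 'runtime' ? useQueryRangeResourceData(query, end, timeframe) : undefined;
// The `active` flag keeps the call unconditional and short-circuits the fetch internally instead.
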
5 changes: 3 additions & 2 deletions frontend/src/pages/modelServing/ModelServingRoutes.tsx
@@ -1,7 +1,7 @@
 import * as React from 'react';
 import { Navigate, Routes, Route } from 'react-router-dom';
 import ModelServingContextProvider from './ModelServingContext';
-import ModelServingMetricsWrapper from './screens/metrics/ModelServingMetricsWrapper';
+import GlobalInferenceMetricsWrapper from './screens/metrics/GlobalInferenceMetricsWrapper';
 import ModelServingGlobal from './screens/global/ModelServingGlobal';
 import useModelMetricsEnabled from './useModelMetricsEnabled';
 
@@ -15,9 +15,10 @@ const ModelServingRoutes: React.FC = () => {
         <Route
           path="/metrics/:project/:inferenceService"
           element={
-            modelMetricsEnabled ? <ModelServingMetricsWrapper /> : <Navigate replace to="/" />
+            modelMetricsEnabled ? <GlobalInferenceMetricsWrapper /> : <Navigate replace to="/" />
           }
         />
+        {/* TODO: Global Runtime metrics?? */}
         <Route path="*" element={<Navigate to="." />} />
       </Route>
     </Routes>
22 changes: 17 additions & 5 deletions frontend/src/pages/modelServing/screens/const.ts
@@ -122,18 +122,30 @@ export const DEFAULT_MODEL_SERVING_TEMPLATE: ServingRuntimeKind = {
   },
 };
 
-// unit: seconds
-export const TimeframeTime: TimeframeTimeType = {
-  [TimeframeTitle.FIVE_MINUTES]: 5 * 60,
+/**
+ * The desired range (x-axis) of the charts.
+ * Unit is in seconds
+ */
+export const TimeframeTimeRange: TimeframeTimeType = {
   [TimeframeTitle.ONE_HOUR]: 60 * 60,
   [TimeframeTitle.ONE_DAY]: 24 * 60 * 60,
   [TimeframeTitle.ONE_WEEK]: 7 * 24 * 60 * 60,
+  [TimeframeTitle.ONE_MONTH]: 30 * 7 * 24 * 60 * 60,
+  // [TimeframeTitle.UNLIMITED]: 0,
 };
 
-// make sure we always get ~300 data points
+/**
+ * How large a step is -- the value is how many seconds to combine to create an individual data response.
+ * Each should be getting ~300 data points (assuming data fills the gap)
+ *
+ * eg. [TimeframeTitle.ONE_DAY]: 24 * 12,
+ * 24h * 60m * 60s => 86,400 seconds of space
+ * 86,400 / (24 * 12) => 300 points of prometheus data
+ */
 export const TimeframeStep: TimeframeStepType = {
-  [TimeframeTitle.FIVE_MINUTES]: 1,
   [TimeframeTitle.ONE_HOUR]: 12,
   [TimeframeTitle.ONE_DAY]: 24 * 12,
   [TimeframeTitle.ONE_WEEK]: 7 * 24 * 12,
+  [TimeframeTitle.ONE_MONTH]: 30 * 7 * 24 * 12,
+  // [TimeframeTitle.UNLIMITED]: 30 * 7 * 24 * 12, // TODO: determine if we "zoom out" more
 };
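
Dividing each range by its step confirms the ~300-point target holds for every timeframe (a quick sanity check, not code from the commit):

// points per timeframe = TimeframeTimeRange[t] / TimeframeStep[t]
// ONE_HOUR:  (60 * 60)               / 12                 = 300
// ONE_DAY:   (24 * 60 * 60)          / (24 * 12)          = 300
// ONE_WEEK:  (7 * 24 * 60 * 60)      / (7 * 24 * 12)      = 300
// ONE_MONTH: (30 * 7 * 24 * 60 * 60) / (30 * 7 * 24 * 12) = 300
const pointsFor = (t: TimeframeTitle): number => TimeframeTimeRange[t] / TimeframeStep[t];
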
frontend/src/pages/modelServing/screens/metrics/{ModelServingMetricsWrapper.tsx → GlobalInferenceMetricsWrapper.tsx}
@@ -4,11 +4,13 @@ import { Bullseye, Spinner } from '@patternfly/react-core';
 import NotFound from '~/pages/NotFound';
 import { ModelServingContext } from '~/pages/modelServing/ModelServingContext';
 import { getInferenceServiceDisplayName } from '~/pages/modelServing/screens/global/utils';
+import InferenceGraphs from '~/pages/modelServing/screens/metrics/InferenceGraphs';
+import { MetricType } from '~/pages/modelServing/screens/types';
 import { ModelServingMetricsProvider } from './ModelServingMetricsContext';
 import MetricsPage from './MetricsPage';
 import { getInferenceServiceMetricsQueries } from './utils';
 
-const ModelServingMetricsWrapper: React.FC = () => {
+const GlobalInferenceMetricsWrapper: React.FC = () => {
   const { project: projectName, inferenceService: modelName } = useParams<{
     project: string;
     inferenceService: string;
@@ -33,7 +35,7 @@ const ModelServingMetricsWrapper: React.FC = () => {
   const modelDisplayName = getInferenceServiceDisplayName(inferenceService);
 
   return (
-    <ModelServingMetricsProvider queries={queries}>
+    <ModelServingMetricsProvider queries={queries} type={MetricType.INFERENCE}>
       <MetricsPage
         title={`${modelDisplayName} metrics`}
         breadcrumbItems={[
@@ -43,9 +45,11 @@ const ModelServingMetricsWrapper: React.FC = () => {
             isActive: true,
           },
         ]}
-      />
+      >
+        <InferenceGraphs />
+      </MetricsPage>
     </ModelServingMetricsProvider>
   );
 };
 
-export default ModelServingMetricsWrapper;
+export default GlobalInferenceMetricsWrapper;
45 changes: 45 additions & 0 deletions frontend/src/pages/modelServing/screens/metrics/InferenceGraphs.tsx
@@ -0,0 +1,45 @@
+import * as React from 'react';
+import { Stack, StackItem } from '@patternfly/react-core';
+import MetricsChart from '~/pages/modelServing/screens/metrics/MetricsChart';
+import {
+  InferenceMetricType,
+  ModelServingMetricsContext,
+} from '~/pages/modelServing/screens/metrics/ModelServingMetricsContext';
+import { TimeframeTitle } from '~/pages/modelServing/screens/types';
+import { per100 } from './utils';
+
+const InferenceGraphs: React.FC = () => {
+  const { data, currentTimeframe } = React.useContext(ModelServingMetricsContext);
+
+  const inHours =
+    currentTimeframe === TimeframeTitle.ONE_HOUR || currentTimeframe === TimeframeTitle.ONE_DAY;
+
+  return (
+    <Stack hasGutter>
+      <StackItem>
+        <MetricsChart
+          metrics={[
+            {
+              name: 'Success http requests (x100)',
+              metric: data[InferenceMetricType.REQUEST_COUNT_SUCCESS],
+              translatePoint: per100,
+            },
+            {
+              name: 'Failed http requests (x100)',
+              metric: data[InferenceMetricType.REQUEST_COUNT_FAILED],
+              translatePoint: (point) => {
+                // TODO: remove when real values are used
+                const newPoint = per100(point);
+                const y = Math.floor(newPoint.y / (Math.floor(Math.random() * 2) + 2));
+                return { ...newPoint, y };
+              },
+            },
+          ]}
+          title={`Http requests per ${inHours ? 'hour' : 'day'} (x100)`}
+        />
+      </StackItem>
+    </Stack>
+  );
+};
+
+export default InferenceGraphs;
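
per100 is imported from ./utils, which is outside the hunks shown here. Given the "(x100)" axis labels, a plausible shape -- an assumption, not the file's actual implementation -- is a point translator that rescales y into hundreds of requests:

// Assumed shape of per100 (hypothetical; the real helper lives in ./utils).
type GraphPoint = { x: number; y: number };
const per100 = (point: GraphPoint): GraphPoint => ({ ...point, y: point.y / 100 });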