diff --git a/Makefile b/Makefile
index b45b8ea3d53..204ff978360 100644
--- a/Makefile
+++ b/Makefile
@@ -88,7 +88,7 @@ SHELL = /usr/bin/env bash -o pipefail
 .SHELLFLAGS = -ec
 
 # E2E tests additional flags
-E2E_TEST_FLAGS = "--skip-deletion=false" -timeout 20m # See README.md, default go test timeout 10m
+E2E_TEST_FLAGS = "--skip-deletion=false" -timeout 25m # See README.md, default go test timeout 10m
 
 # Default image-build is to not use local odh-manifests folder
 # set to "true" to use local instead
diff --git a/components/codeflare/codeflare.go b/components/codeflare/codeflare.go
index cc95b44ce17..534e58c319e 100644
--- a/components/codeflare/codeflare.go
+++ b/components/codeflare/codeflare.go
@@ -106,16 +106,15 @@ func (c *CodeFlare) ReconcileComponent(ctx context.Context,
         return err
     }
     l.Info("apply manifests done")
-    // CloudServiceMonitoring handling
-    if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if the service is up, so prometheus won't fire alerts when it is just startup
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
+
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
         }
+    }
+    // CloudServiceMonitoring handling
+    if platform == cluster.ManagedRhods {
         // inject prometheus codeflare*.rules in to /opt/manifests/monitoring/prometheus/prometheus-configs.yaml
         if err := c.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
diff --git a/components/dashboard/dashboard.go b/components/dashboard/dashboard.go
index 9f44c79fdfe..30937c89673 100644
--- a/components/dashboard/dashboard.go
+++ b/components/dashboard/dashboard.go
@@ -141,16 +141,14 @@ func (d *Dashboard) ReconcileComponent(ctx context.Context,
     }
     l.Info("apply manifests done")
-    // CloudService Monitoring handling
-    if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if the service is up, so prometheus won't fire alerts when it is just startup
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentNameDownstream, dscispec.ApplicationsNamespace, 20, 3); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentNameDownstream, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentNameDownstream, dscispec.ApplicationsNamespace, 20, 3); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentNameDownstream, err)
         }
+    }
+    // CloudService Monitoring handling
+    if platform == cluster.ManagedRhods {
         if err := d.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentNameDownstream); err != nil {
             return err
         }
@@ -170,6 +168,12 @@ func (d *Dashboard) ReconcileComponent(ctx context.Context,
         return err
     }
     l.Info("apply manifests done")
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentNameUpstream, dscispec.ApplicationsNamespace, 20, 3); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentNameUpstream, err)
+        }
+    }
+
     return nil
     }
 }
diff --git a/components/datasciencepipelines/datasciencepipelines.go b/components/datasciencepipelines/datasciencepipelines.go
index 40456a14cac..388b54bd707 100644
--- a/components/datasciencepipelines/datasciencepipelines.go
+++ b/components/datasciencepipelines/datasciencepipelines.go
@@ -124,17 +124,15 @@ func (d *DataSciencePipelines) ReconcileComponent(ctx context.Context,
     }
     l.Info("apply manifests done")
-    // CloudService Monitoring handling
-    if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if the service is up, so prometheus won't fire alerts when it is just startup
-            // only 1 replica should be very quick
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
+    // Wait for deployment available
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
         }
+    }
+    // CloudService Monitoring handling
+    if platform == cluster.ManagedRhods {
         if err := d.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
         }
diff --git a/components/kserve/kserve.go b/components/kserve/kserve.go
index 1c3672d08af..fe2eb2a994d 100644
--- a/components/kserve/kserve.go
+++ b/components/kserve/kserve.go
@@ -157,15 +157,16 @@ func (k *Kserve) ReconcileComponent(ctx context.Context, cli client.Client,
         }
     }
     l.WithValues("Path", Path).Info("apply manifests done for odh-model-controller")
+
+    // Wait for deployment available
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 3); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
+        }
+    }
+
     // CloudService Monitoring handling
     if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if the service is up, so prometheus won't fire alerts when it is just startup
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoing rules")
-        }
         // kesrve rules
         if err := k.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
diff --git a/components/kueue/kueue.go b/components/kueue/kueue.go
index b6a357f74df..81bdbfe5638 100644
--- a/components/kueue/kueue.go
+++ b/components/kueue/kueue.go
@@ -80,15 +80,15 @@ func (k *Kueue) ReconcileComponent(ctx context.Context, cli client.Client, logge
         return fmt.Errorf("failed to apply manifetss %s: %w", Path, err)
     }
     l.Info("apply manifests done")
+
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
+        }
+    }
+
     // CloudService Monitoring handling
     if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if the service is up, so prometheus won't fire alerts when it is just startup
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
-        }
         if err := k.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
         }
diff --git a/components/modelmeshserving/modelmeshserving.go b/components/modelmeshserving/modelmeshserving.go
index f1c87adc853..b6b067221b9 100644
--- a/components/modelmeshserving/modelmeshserving.go
+++ b/components/modelmeshserving/modelmeshserving.go
@@ -145,15 +145,15 @@ func (m *ModelMeshServing) ReconcileComponent(ctx context.Context,
     }
     l.WithValues("Path", DependentPath).Info("apply manifests done for odh-model-controller")
+
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
+        }
+    }
+
     // CloudService Monitoring handling
     if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if service is up, so prometheus won't fire alerts when it is just startup
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
-        }
         // first model-mesh rules
         if err := m.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
diff --git a/components/modelregistry/modelregistry.go b/components/modelregistry/modelregistry.go
index c0e20d94021..5d0610019e5 100644
--- a/components/modelregistry/modelregistry.go
+++ b/components/modelregistry/modelregistry.go
@@ -139,14 +139,14 @@ func (m *ModelRegistry) ReconcileComponent(ctx context.Context, cli client.Clien
     }
     l.Info("apply extra manifests done")
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, m.GetComponentName(), dscispec.ApplicationsNamespace, 10, 1); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
+        }
+    }
+
     // CloudService Monitoring handling
     if platform == cluster.ManagedRhods {
-        if enabled {
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
-        }
         if err := m.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
         }
diff --git a/components/ray/ray.go b/components/ray/ray.go
index ea689e5e927..c1720cfcd6c 100644
--- a/components/ray/ray.go
+++ b/components/ray/ray.go
@@ -84,15 +84,15 @@ func (r *Ray) ReconcileComponent(ctx context.Context, cli client.Client, logger
         return fmt.Errorf("failed to apply manifets from %s : %w", RayPath, err)
     }
     l.Info("apply manifests done")
+
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
+        }
+    }
+
     // CloudService Monitoring handling
     if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if the service is up, so prometheus won't fire alerts when it is just startup
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
-        }
         if err := r.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
         }
diff --git a/components/trainingoperator/trainingoperator.go b/components/trainingoperator/trainingoperator.go
index 28f8a98c60d..6ebe12f538b 100644
--- a/components/trainingoperator/trainingoperator.go
+++ b/components/trainingoperator/trainingoperator.go
@@ -84,16 +84,15 @@ func (r *TrainingOperator) ReconcileComponent(ctx context.Context, cli client.Cl
         return err
     }
     l.Info("apply manifests done")
+
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
+        }
+    }
+
     // CloudService Monitoring handling
     if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if the service is up, so prometheus wont fire alerts when it is just startup
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            logger.Info("deployment for " + ComponentName + " is done, updating monitoring rules")
-        }
-        l.Info("deployment is done, updating monitoring rules")
         if err := r.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
         }
diff --git a/components/trustyai/trustyai.go b/components/trustyai/trustyai.go
index 8cb3ccc35e7..d7d67d237bd 100644
--- a/components/trustyai/trustyai.go
+++ b/components/trustyai/trustyai.go
@@ -84,14 +84,15 @@ func (t *TrustyAI) ReconcileComponent(ctx context.Context, cli client.Client, lo
     }
     l.Info("apply manifests done")
+    // Wait for deployment available
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 2); err != nil {
+            return fmt.Errorf("deployment for %s is not ready to serve: %w", ComponentName, err)
+        }
+    }
+
     // CloudService Monitoring handling
     if platform == cluster.ManagedRhods {
-        if enabled {
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
-                return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
-        }
         if err := t.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
         }
diff --git a/components/workbenches/workbenches.go b/components/workbenches/workbenches.go
index eb9f3c81c9d..5875b1e848f 100644
--- a/components/workbenches/workbenches.go
+++ b/components/workbenches/workbenches.go
@@ -172,16 +172,16 @@ func (w *Workbenches) ReconcileComponent(ctx context.Context, cli client.Client,
         return err
     }
     l.WithValues("Path", manifestsPath).Info("apply manifests done notebook image")
+
+    // Wait for deployment available
+    if enabled {
+        if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 2); err != nil {
+            return fmt.Errorf("deployments for %s are not ready to serve: %w", ComponentName, err)
+        }
+    }
+
     // CloudService Monitoring handling
     if platform == cluster.ManagedRhods {
-        if enabled {
-            // first check if the service is up, so prometheus wont fire alerts when it is just startup
-            // only 1 replica set timeout to 1min
-            if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 10, 1); err != nil {
-                return fmt.Errorf("deployments for %s are not ready to server: %w", ComponentName, err)
-            }
-            l.Info("deployment is done, updating monitoring rules")
-        }
         if err := w.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil {
             return err
         }
diff --git a/pkg/cluster/resources.go b/pkg/cluster/resources.go
index d77201639e0..5e41eb25bc2 100644
--- a/pkg/cluster/resources.go
+++ b/pkg/cluster/resources.go
@@ -192,7 +192,7 @@ func WaitForDeploymentAvailable(ctx context.Context, c client.Client, componentN
             return false, fmt.Errorf("error fetching list of deployments: %w", err)
         }
 
-        ctrl.Log.Info("waiting for " + strconv.Itoa(len(componentDeploymentList.Items)) + " deployment to be ready for %s" + componentName)
+        ctrl.Log.Info("waiting for " + strconv.Itoa(len(componentDeploymentList.Items)) + " deployment to be ready for " + componentName)
         for _, deployment := range componentDeploymentList.Items {
             if deployment.Status.ReadyReplicas != deployment.Status.Replicas {
                 return false, nil
diff --git a/tests/e2e/controller_setup_test.go b/tests/e2e/controller_setup_test.go
index d38154b7af3..1eea1ddf745 100644
--- a/tests/e2e/controller_setup_test.go
+++ b/tests/e2e/controller_setup_test.go
@@ -91,7 +91,7 @@ func NewTestContext() (*testContext, error) {
         customClient:            custClient,
         operatorNamespace:       opNamespace,
         applicationsNamespace:   testDSCI.Spec.ApplicationsNamespace,
-        resourceCreationTimeout: time.Minute * 2,
+        resourceCreationTimeout: time.Minute * 5, // since we introduced the check for all deployments, prolong the timeout here too to match what we set in the components
         resourceRetryInterval:   time.Second * 10,
         ctx:                     context.TODO(),
         testDsc:                 testDSC,
diff --git a/tests/e2e/dsc_creation_test.go b/tests/e2e/dsc_creation_test.go
index aff519f0690..1db9c3a6295 100644
--- a/tests/e2e/dsc_creation_test.go
+++ b/tests/e2e/dsc_creation_test.go
@@ -58,18 +58,10 @@ func creationTestSuite(t *testing.T) {
     t.Run("Creation of more than one of DataScienceCluster instance", func(t *testing.T) {
         testCtx.testDSCDuplication(t)
     })
-    t.Run("Validate all deployed components", func(t *testing.T) {
-        err = testCtx.testAllApplicationCreation(t)
-        require.NoError(t, err, "error testing deployments for DataScienceCluster: "+testCtx.testDsc.Name)
-    })
     t.Run("Validate DSCInitialization instance", func(t *testing.T) {
         err = testCtx.validateDSCI()
         require.NoError(t, err, "error validating DSCInitialization instance")
     })
-    t.Run("Validate DataScienceCluster instance", func(t *testing.T) {
-        err = testCtx.validateDSC()
-        require.NoError(t, err, "error validating DataScienceCluster instance")
-    })
     t.Run("Validate Ownerrefrences exist", func(t *testing.T) {
         err = testCtx.testOwnerrefrences()
         require.NoError(t, err, "error getting all DataScienceCluster's Ownerrefrences")
     })
@@ -78,6 +70,15 @@
         err = testCtx.testDefaultCertsAvailable()
         require.NoError(t, err, "error getting default cert secrets for Kserve")
     })
+    t.Run("Validate Knative resource", func(t *testing.T) {
+        err = testCtx.validateDSC()
+        require.NoError(t, err, "error getting Knative resource as part of DataScienceCluster validation")
+    })
+    t.Run("Validate all deployed components", func(t *testing.T) {
+        // this will take about 5-6 minutes to complete
+        err = testCtx.testAllApplicationCreation(t)
+        require.NoError(t, err, "error testing deployments for DataScienceCluster: "+testCtx.testDsc.Name)
+    })
     t.Run("Validate default model registry cert available", func(t *testing.T) {
         err = testCtx.testDefaultModelRegistryCertAvailable()
         require.NoError(t, err, "error getting default cert secret for ModelRegistry")
     })
@@ -261,11 +262,6 @@ func (tc *testContext) testAllApplicationCreation(t *testing.T) error { //nolint
     }
     tc.testDsc = createdDSC
-    // Verify DSC instance is in Ready phase
-    if tc.testDsc.Status.Phase != "Ready" {
-        return fmt.Errorf("DSC instance is not in Ready phase. Current phase: %v", tc.testDsc.Status.Phase)
-    }
-
     components, err := tc.testDsc.GetComponents()
     if err != nil {
         return err
     }
@@ -287,6 +283,10 @@ func (tc *testContext) testAllApplicationCreation(t *testing.T) error { //nolint
             }
         })
     }
+    // Verify DSC instance is in Ready phase at the end, when all components are up and running
+    if tc.testDsc.Status.Phase != "Ready" {
+        return fmt.Errorf("DSC instance is not in Ready phase. Current phase: %v", tc.testDsc.Status.Phase)
+    }
     return nil
 }
@@ -354,6 +354,7 @@ func (tc *testContext) validateDSCI() error {
     return nil
 }
 
+// test if the Knative resource has been created.
 func (tc *testContext) validateDSC() error {
     expServingSpec := infrav1.ServingSpec{
         ManagementState: operatorv1.Managed,
@@ -586,8 +587,8 @@ func (tc *testContext) testUpdateDSCComponentEnabled() error {
         return fmt.Errorf("error after retry %w", err)
     }
 
-    // Sleep for 40 seconds to allow the operator to reconcile
-    time.Sleep(4 * tc.resourceRetryInterval)
+    // Sleep for 80 seconds to allow the operator to reconcile
+    time.Sleep(8 * tc.resourceRetryInterval)
     _, err = tc.kubeClient.AppsV1().Deployments(tc.applicationsNamespace).Get(tc.ctx, dashboardDeploymentName, metav1.GetOptions{})
     if err != nil {
         if k8serr.IsNotFound(err) {