From d3c17d64d65b223a123c6f9c905e9dde8bfaefa9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radek=20Ma=C5=88=C3=A1k?=
Date: Tue, 24 Feb 2026 16:16:42 +0100
Subject: [PATCH] Suppress Degraded reporting while MAO is actively upgrading

Gate Degraded=True behind version-drift and upgrade-timeout checks, and
add unit coverage for upgrade detection, timeout handling, and DaemonSet
rollout status behavior.
---
 pkg/operator/status.go      |  53 +++++++++
 pkg/operator/status_test.go | 229 ++++++++++++++++++++++++++++++++++++
 pkg/operator/sync.go        |  51 +++++---
 pkg/operator/sync_test.go   |  90 ++++++++++++++
 4 files changed, 407 insertions(+), 16 deletions(-)

diff --git a/pkg/operator/status.go b/pkg/operator/status.go
index b683af616..6891d5280 100644
--- a/pkg/operator/status.go
+++ b/pkg/operator/status.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"reflect"
 	"strings"
+	"time"
 
 	osconfigv1 "github.com/openshift/api/config/v1"
 	"github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"
@@ -30,6 +31,12 @@ const (
 
 const (
 	clusterOperatorName = "machine-api"
+
+	// upgradeTimeout is the maximum duration an upgrade may take before
+	// the operator should report Degraded. This aligns with the CVO
+	// expectation for how long non-MCO operators may take to complete
+	// version changes.
+	upgradeTimeout = 20 * time.Minute
 )
 
 var (
@@ -357,3 +364,49 @@ func (optr *Operator) isInitializing() (bool, error) {
 
 	return availableCondition != nil && availableCondition.Reason == string(ReasonInitializing), nil
 }
+
+// isUpgrading returns true when the operator's desired operand versions
+// differ from the versions currently reported on the ClusterOperator status.
+func (optr *Operator) isUpgrading() (bool, error) {
+	currentVersions, err := optr.getCurrentVersions()
+	if err != nil {
+		return false, fmt.Errorf("could not get current versions: %w", err)
+	}
+	return !reflect.DeepEqual(optr.operandVersions, currentVersions), nil
+}
+
+// upgradeHasTimedOut returns true if the Progressing condition has been True
+// for longer than upgradeTimeout, indicating a stuck upgrade.
+func (optr *Operator) upgradeHasTimedOut() (bool, error) {
+	co, err := optr.getClusterOperator()
+	if err != nil {
+		return false, fmt.Errorf("could not get cluster operator: %w", err)
+	}
+
+	progressingCondition := v1helpers.FindStatusCondition(co.Status.Conditions, osconfigv1.OperatorProgressing)
+	if progressingCondition == nil || progressingCondition.Status != osconfigv1.ConditionTrue {
+		return false, nil
+	}
+
+	return time.Since(progressingCondition.LastTransitionTime.Time) > upgradeTimeout, nil
+}
+
+// shouldReportDegraded returns true if the operator should set Degraded=True.
+// During an upgrade (versions differ), Degraded is suppressed for up to
+// upgradeTimeout to avoid falsely reporting degradation for transient errors.
+func (optr *Operator) shouldReportDegraded() (bool, error) { + upgrading, err := optr.isUpgrading() + if err != nil { + return false, err + } + if !upgrading { + return true, nil + } + + timedOut, err := optr.upgradeHasTimedOut() + if err != nil { + return false, err + } + + return timedOut, nil +} diff --git a/pkg/operator/status_test.go b/pkg/operator/status_test.go index d20ac4a18..73d50e63f 100644 --- a/pkg/operator/status_test.go +++ b/pkg/operator/status_test.go @@ -444,3 +444,232 @@ func TestIsInitializing(t *testing.T) { }) } } + +func TestIsUpgrading(t *testing.T) { + testCases := []struct { + name string + desiredVersions []osconfigv1.OperandVersion + currentVersions []osconfigv1.OperandVersion + expectedResult bool + }{ + { + name: "versions match - not upgrading", + desiredVersions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "1.0"}, + }, + currentVersions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "1.0"}, + }, + expectedResult: false, + }, + { + name: "versions differ - upgrading", + desiredVersions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "2.0"}, + }, + currentVersions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "1.0"}, + }, + expectedResult: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + co := &osconfigv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterOperatorName, + }, + Status: osconfigv1.ClusterOperatorStatus{ + Versions: tc.currentVersions, + }, + } + optr := Operator{ + osClient: fakeconfigclientset.NewSimpleClientset(co), + operandVersions: tc.desiredVersions, + namespace: "test", + } + + upgrading, err := optr.isUpgrading() + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(upgrading).To(Equal(tc.expectedResult)) + }) + } +} + +func TestUpgradeHasTimedOut(t *testing.T) { + testCases := []struct { + name string + existingCO *osconfigv1.ClusterOperator + expectedResult bool + }{ + { + name: "Progressing=True for less than upgradeTimeout - not timed out", + existingCO: &osconfigv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterOperatorName, + }, + Status: osconfigv1.ClusterOperatorStatus{ + Conditions: []osconfigv1.ClusterOperatorStatusCondition{ + { + Type: osconfigv1.OperatorProgressing, + Status: osconfigv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Now().Add(-5 * time.Minute)), + }, + }, + }, + }, + expectedResult: false, + }, + { + name: "Progressing=True for more than upgradeTimeout - timed out", + existingCO: &osconfigv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterOperatorName, + }, + Status: osconfigv1.ClusterOperatorStatus{ + Conditions: []osconfigv1.ClusterOperatorStatusCondition{ + { + Type: osconfigv1.OperatorProgressing, + Status: osconfigv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Now().Add(-25 * time.Minute)), + }, + }, + }, + }, + expectedResult: true, + }, + { + name: "Progressing=False - not timed out", + existingCO: &osconfigv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterOperatorName, + }, + Status: osconfigv1.ClusterOperatorStatus{ + Conditions: []osconfigv1.ClusterOperatorStatusCondition{ + { + Type: osconfigv1.OperatorProgressing, + Status: osconfigv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now().Add(-25 * time.Minute)), + }, + }, + }, + }, + expectedResult: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + optr := Operator{ + osClient: 
fakeconfigclientset.NewSimpleClientset(tc.existingCO), + } + + timedOut, err := optr.upgradeHasTimedOut() + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(timedOut).To(Equal(tc.expectedResult)) + }) + } +} + +func TestShouldReportDegraded(t *testing.T) { + testCases := []struct { + name string + desiredVersions []osconfigv1.OperandVersion + existingCO *osconfigv1.ClusterOperator + expectedResult bool + }{ + { + name: "not upgrading - should report degraded", + desiredVersions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "1.0"}, + }, + existingCO: &osconfigv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterOperatorName, + }, + Status: osconfigv1.ClusterOperatorStatus{ + Versions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "1.0"}, + }, + Conditions: []osconfigv1.ClusterOperatorStatusCondition{ + { + Type: osconfigv1.OperatorProgressing, + Status: osconfigv1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now()), + }, + }, + }, + }, + expectedResult: true, + }, + { + name: "upgrading within timeout - should suppress degraded", + desiredVersions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "2.0"}, + }, + existingCO: &osconfigv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterOperatorName, + }, + Status: osconfigv1.ClusterOperatorStatus{ + Versions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "1.0"}, + }, + Conditions: []osconfigv1.ClusterOperatorStatusCondition{ + { + Type: osconfigv1.OperatorProgressing, + Status: osconfigv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Now().Add(-5 * time.Minute)), + }, + }, + }, + }, + expectedResult: false, + }, + { + name: "upgrading past timeout - should report degraded", + desiredVersions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "2.0"}, + }, + existingCO: &osconfigv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterOperatorName, + }, + Status: osconfigv1.ClusterOperatorStatus{ + Versions: []osconfigv1.OperandVersion{ + {Name: "operator", Version: "1.0"}, + }, + Conditions: []osconfigv1.ClusterOperatorStatusCondition{ + { + Type: osconfigv1.OperatorProgressing, + Status: osconfigv1.ConditionTrue, + LastTransitionTime: metav1.NewTime(time.Now().Add(-25 * time.Minute)), + }, + }, + }, + }, + expectedResult: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + optr := Operator{ + osClient: fakeconfigclientset.NewSimpleClientset(tc.existingCO), + operandVersions: tc.desiredVersions, + namespace: "test", + } + + result, err := optr.shouldReportDegraded() + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result).To(Equal(tc.expectedResult)) + }) + } +} diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index 4fbf9ac42..de0442611 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -102,22 +102,32 @@ func (optr *Operator) syncAll(config *OperatorConfig) (reconcile.Result, error) if len(errors) > 0 { err := utilerrors.NewAggregate(errors) - if err := optr.statusDegraded(err.Error()); err != nil { - // Just log the error here. We still want to - // return the outer error. 
- klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) + if shouldDegraded, checkErr := optr.shouldReportDegraded(); checkErr != nil { + klog.Errorf("Error checking upgrade status: %v", checkErr) + } else if shouldDegraded { + if err := optr.statusDegraded(err.Error()); err != nil { + klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) + } + } else { + klog.Infof("Suppressing Degraded status during upgrade: %s", err.Error()) } + klog.Errorf("Error syncing machine controller components: %v", err) return reconcile.Result{}, err } result, err := optr.checkRolloutStatus(config) if err != nil { - if err := optr.statusDegraded(err.Error()); err != nil { - // Just log the error here. We still want to - // return the outer error. - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) + if shouldDegraded, checkErr := optr.shouldReportDegraded(); checkErr != nil { + klog.Errorf("Error checking upgrade status: %v", checkErr) + } else if shouldDegraded { + if err := optr.statusDegraded(err.Error()); err != nil { + klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) + } + } else { + klog.Infof("Suppressing Degraded status during upgrade: %s", err.Error()) } + klog.Errorf("Error waiting for resource to sync: %v", err) return reconcile.Result{}, err } @@ -130,21 +140,30 @@ func (optr *Operator) syncAll(config *OperatorConfig) (reconcile.Result, error) initializing, err := optr.isInitializing() if err != nil { - if err := optr.statusDegraded(err.Error()); err != nil { - // Just log the error here. We still want to - // return the outer error. - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) + if shouldDegraded, checkErr := optr.shouldReportDegraded(); checkErr != nil { + klog.Errorf("Error checking upgrade status: %v", checkErr) + } else if shouldDegraded { + if err := optr.statusDegraded(err.Error()); err != nil { + klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) + } + } else { + klog.Infof("Suppressing Degraded status during upgrade: %s", err.Error()) } + klog.Errorf("Error determining state of operator: %v", err) return reconcile.Result{}, err } if initializing { if err := optr.checkMinimumWorkerMachines(); err != nil { - if err := optr.statusDegraded(err.Error()); err != nil { - // Just log the error here. We still want to - // return the outer error. 
- klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) + if shouldDegraded, checkErr := optr.shouldReportDegraded(); checkErr != nil { + klog.Errorf("Error checking upgrade status: %v", checkErr) + } else if shouldDegraded { + if err := optr.statusDegraded(err.Error()); err != nil { + klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) + } + } else { + klog.Infof("Suppressing Degraded status during upgrade: %s", err.Error()) } klog.Errorf("Cluster is initializing and minimum worker Machine requirements are not met: %v", err) diff --git a/pkg/operator/sync_test.go b/pkg/operator/sync_test.go index 7c28c04cb..80c5c19e1 100644 --- a/pkg/operator/sync_test.go +++ b/pkg/operator/sync_test.go @@ -504,3 +504,93 @@ func TestSyncWebhookConfiguration(t *testing.T) { }) } } + +func TestCheckDaemonSetRolloutStatus(t *testing.T) { + testCases := []struct { + name string + daemonset *appsv1.DaemonSet + expectedError error + expectedRequeueAfter time.Duration + }{ + { + name: "DaemonSet fully rolled out", + daemonset: &appsv1.DaemonSet{ + TypeMeta: metav1.TypeMeta{ + Kind: "DaemonSet", + APIVersion: "apps/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ds", + Namespace: targetNamespace, + Generation: 1, + }, + Status: appsv1.DaemonSetStatus{ + ObservedGeneration: 1, + DesiredNumberScheduled: 3, + UpdatedNumberScheduled: 3, + NumberAvailable: 3, + NumberUnavailable: 0, + }, + }, + expectedError: nil, + expectedRequeueAfter: 0, + }, + { + name: "DaemonSet with Generation > ObservedGeneration should requeue", + daemonset: &appsv1.DaemonSet{ + TypeMeta: metav1.TypeMeta{ + Kind: "DaemonSet", + APIVersion: "apps/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ds", + Namespace: targetNamespace, + Generation: 2, + }, + Status: appsv1.DaemonSetStatus{ + ObservedGeneration: 1, + DesiredNumberScheduled: 3, + UpdatedNumberScheduled: 3, + NumberAvailable: 3, + NumberUnavailable: 0, + }, + }, + expectedError: nil, + expectedRequeueAfter: 5 * time.Second, + }, + } + + imagesJSONFile, err := createImagesJSONFromManifest() + if err != nil { + t.Fatal(err) + } + defer func() { + if err := os.Remove(imagesJSONFile); err != nil { + t.Fatal(err) + } + }() + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + optr, err := newFakeOperator([]runtime.Object{tc.daemonset}, nil, nil, imagesJSONFile, nil, stopCh) + if err != nil { + t.Fatal(err) + } + + result, gotErr := optr.checkDaemonSetRolloutStatus(tc.daemonset) + if tc.expectedError != nil && gotErr != nil { + if tc.expectedError.Error() != gotErr.Error() { + t.Errorf("Got error: %v, expected: %v", gotErr, tc.expectedError) + } + } else if tc.expectedError != gotErr { + t.Errorf("Got error: %v, expected: %v", gotErr, tc.expectedError) + } + + if tc.expectedRequeueAfter != result.RequeueAfter { + t.Errorf("Got requeueAfter: %v, expected: %v", result.RequeueAfter, tc.expectedRequeueAfter) + } + }) + } +}