Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions pkg/operator/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"reflect"
"strings"
"time"

osconfigv1 "github.com/openshift/api/config/v1"
"github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"
Expand All @@ -30,6 +31,12 @@ const (

const (
clusterOperatorName = "machine-api"

// upgradeTimeout is the maximum duration an upgrade may take before
// the operator should report Degraded. This aligns with the CVO
// expectation for how long non-MCO operators may take to complete
// version changes.
upgradeTimeout = 20 * time.Minute
)

var (
Expand Down Expand Up @@ -357,3 +364,49 @@ func (optr *Operator) isInitializing() (bool, error) {

return availableCondition != nil && availableCondition.Reason == string(ReasonInitializing), nil
}

// isUpgrading reports whether a version change is in flight: it is true
// when the operand versions the operator intends to roll out differ from
// the versions currently recorded on the ClusterOperator status.
func (optr *Operator) isUpgrading() (bool, error) {
	reported, err := optr.getCurrentVersions()
	if err != nil {
		return false, fmt.Errorf("could not get current versions: %w", err)
	}

	// Any mismatch between the desired and the reported version sets
	// means an upgrade has started but not yet completed.
	return !reflect.DeepEqual(optr.operandVersions, reported), nil
}

// upgradeHasTimedOut reports whether the Progressing condition has remained
// True for longer than upgradeTimeout, which indicates a stuck upgrade.
func (optr *Operator) upgradeHasTimedOut() (bool, error) {
	co, err := optr.getClusterOperator()
	if err != nil {
		return false, fmt.Errorf("could not get cluster operator: %w", err)
	}

	progressing := v1helpers.FindStatusCondition(co.Status.Conditions, osconfigv1.OperatorProgressing)

	// With no active Progressing=True condition there is no upgrade
	// window to measure, so nothing can have timed out.
	if progressing == nil || progressing.Status != osconfigv1.ConditionTrue {
		return false, nil
	}

	elapsed := time.Since(progressing.LastTransitionTime.Time)
	return elapsed > upgradeTimeout, nil
}

// shouldReportDegraded reports whether the operator should set
// Degraded=True. Outside an upgrade every error is surfaced immediately;
// while an upgrade is in flight (desired versions differ from reported
// ones) Degraded is held back for up to upgradeTimeout so that transient
// rollout errors are not reported as degradation.
func (optr *Operator) shouldReportDegraded() (bool, error) {
	upgrading, err := optr.isUpgrading()
	if err != nil {
		return false, err
	}

	// Not upgrading: report degradation right away.
	if !upgrading {
		return true, nil
	}

	// Upgrading: only degrade once the upgrade has been stuck past the
	// allowed window. upgradeHasTimedOut already returns false alongside
	// any error, so its result can be forwarded directly.
	return optr.upgradeHasTimedOut()
}
229 changes: 229 additions & 0 deletions pkg/operator/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,3 +444,232 @@ func TestIsInitializing(t *testing.T) {
})
}
}

// TestIsUpgrading verifies that isUpgrading compares the operator's desired
// operand versions against the versions reported on the ClusterOperator
// status, including the case where only one of several operands differs.
func TestIsUpgrading(t *testing.T) {
	testCases := []struct {
		name            string
		desiredVersions []osconfigv1.OperandVersion
		currentVersions []osconfigv1.OperandVersion
		expectedResult  bool
	}{
		{
			name: "versions match - not upgrading",
			desiredVersions: []osconfigv1.OperandVersion{
				{Name: "operator", Version: "1.0"},
			},
			currentVersions: []osconfigv1.OperandVersion{
				{Name: "operator", Version: "1.0"},
			},
			expectedResult: false,
		},
		{
			name: "versions differ - upgrading",
			desiredVersions: []osconfigv1.OperandVersion{
				{Name: "operator", Version: "2.0"},
			},
			currentVersions: []osconfigv1.OperandVersion{
				{Name: "operator", Version: "1.0"},
			},
			expectedResult: true,
		},
		{
			// A partial skew (one operand updated, another not) must
			// still count as upgrading: DeepEqual compares the whole set.
			name: "one of several versions differs - upgrading",
			desiredVersions: []osconfigv1.OperandVersion{
				{Name: "operator", Version: "1.0"},
				{Name: "controller", Version: "2.0"},
			},
			currentVersions: []osconfigv1.OperandVersion{
				{Name: "operator", Version: "1.0"},
				{Name: "controller", Version: "1.0"},
			},
			expectedResult: true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := NewWithT(t)

			// Seed the fake client with a ClusterOperator reporting the
			// "current" versions; the Operator itself carries the desired ones.
			co := &osconfigv1.ClusterOperator{
				ObjectMeta: metav1.ObjectMeta{
					Name: clusterOperatorName,
				},
				Status: osconfigv1.ClusterOperatorStatus{
					Versions: tc.currentVersions,
				},
			}
			optr := Operator{
				osClient:        fakeconfigclientset.NewSimpleClientset(co),
				operandVersions: tc.desiredVersions,
				namespace:       "test",
			}

			upgrading, err := optr.isUpgrading()
			g.Expect(err).ToNot(HaveOccurred())
			g.Expect(upgrading).To(Equal(tc.expectedResult))
		})
	}
}

// TestUpgradeHasTimedOut verifies that upgradeHasTimedOut only reports a
// timeout when Progressing has been True for longer than upgradeTimeout,
// and never when the condition is False or absent entirely.
func TestUpgradeHasTimedOut(t *testing.T) {
	testCases := []struct {
		name           string
		existingCO     *osconfigv1.ClusterOperator
		expectedResult bool
	}{
		{
			name: "Progressing=True for less than upgradeTimeout - not timed out",
			existingCO: &osconfigv1.ClusterOperator{
				ObjectMeta: metav1.ObjectMeta{
					Name: clusterOperatorName,
				},
				Status: osconfigv1.ClusterOperatorStatus{
					Conditions: []osconfigv1.ClusterOperatorStatusCondition{
						{
							Type:               osconfigv1.OperatorProgressing,
							Status:             osconfigv1.ConditionTrue,
							LastTransitionTime: metav1.NewTime(time.Now().Add(-5 * time.Minute)),
						},
					},
				},
			},
			expectedResult: false,
		},
		{
			name: "Progressing=True for more than upgradeTimeout - timed out",
			existingCO: &osconfigv1.ClusterOperator{
				ObjectMeta: metav1.ObjectMeta{
					Name: clusterOperatorName,
				},
				Status: osconfigv1.ClusterOperatorStatus{
					Conditions: []osconfigv1.ClusterOperatorStatusCondition{
						{
							Type:               osconfigv1.OperatorProgressing,
							Status:             osconfigv1.ConditionTrue,
							LastTransitionTime: metav1.NewTime(time.Now().Add(-25 * time.Minute)),
						},
					},
				},
			},
			expectedResult: true,
		},
		{
			name: "Progressing=False - not timed out",
			existingCO: &osconfigv1.ClusterOperator{
				ObjectMeta: metav1.ObjectMeta{
					Name: clusterOperatorName,
				},
				Status: osconfigv1.ClusterOperatorStatus{
					Conditions: []osconfigv1.ClusterOperatorStatusCondition{
						{
							Type:               osconfigv1.OperatorProgressing,
							Status:             osconfigv1.ConditionFalse,
							LastTransitionTime: metav1.NewTime(time.Now().Add(-25 * time.Minute)),
						},
					},
				},
			},
			expectedResult: false,
		},
		{
			// Covers the progressingCondition == nil branch: a
			// ClusterOperator with no conditions at all must not time out.
			name: "no Progressing condition - not timed out",
			existingCO: &osconfigv1.ClusterOperator{
				ObjectMeta: metav1.ObjectMeta{
					Name: clusterOperatorName,
				},
			},
			expectedResult: false,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := NewWithT(t)

			optr := Operator{
				osClient: fakeconfigclientset.NewSimpleClientset(tc.existingCO),
			}

			timedOut, err := optr.upgradeHasTimedOut()
			g.Expect(err).ToNot(HaveOccurred())
			g.Expect(timedOut).To(Equal(tc.expectedResult))
		})
	}
}

// TestShouldReportDegraded verifies the Degraded gating logic: outside an
// upgrade errors surface immediately, while during an upgrade Degraded is
// suppressed until Progressing has been True past upgradeTimeout.
func TestShouldReportDegraded(t *testing.T) {
	// newCO builds a ClusterOperator reporting a single "operator" operand
	// at the given version, with one Progressing condition that last
	// transitioned at the given time.
	newCO := func(version string, progressing osconfigv1.ConditionStatus, transition time.Time) *osconfigv1.ClusterOperator {
		return &osconfigv1.ClusterOperator{
			ObjectMeta: metav1.ObjectMeta{
				Name: clusterOperatorName,
			},
			Status: osconfigv1.ClusterOperatorStatus{
				Versions: []osconfigv1.OperandVersion{
					{Name: "operator", Version: version},
				},
				Conditions: []osconfigv1.ClusterOperatorStatusCondition{
					{
						Type:               osconfigv1.OperatorProgressing,
						Status:             progressing,
						LastTransitionTime: metav1.NewTime(transition),
					},
				},
			},
		}
	}

	testCases := []struct {
		name            string
		desiredVersions []osconfigv1.OperandVersion
		existingCO      *osconfigv1.ClusterOperator
		expectedResult  bool
	}{
		{
			name:            "not upgrading - should report degraded",
			desiredVersions: []osconfigv1.OperandVersion{{Name: "operator", Version: "1.0"}},
			existingCO:      newCO("1.0", osconfigv1.ConditionFalse, time.Now()),
			expectedResult:  true,
		},
		{
			name:            "upgrading within timeout - should suppress degraded",
			desiredVersions: []osconfigv1.OperandVersion{{Name: "operator", Version: "2.0"}},
			existingCO:      newCO("1.0", osconfigv1.ConditionTrue, time.Now().Add(-5*time.Minute)),
			expectedResult:  false,
		},
		{
			name:            "upgrading past timeout - should report degraded",
			desiredVersions: []osconfigv1.OperandVersion{{Name: "operator", Version: "2.0"}},
			existingCO:      newCO("1.0", osconfigv1.ConditionTrue, time.Now().Add(-25*time.Minute)),
			expectedResult:  true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := NewWithT(t)

			optr := Operator{
				osClient:        fakeconfigclientset.NewSimpleClientset(tc.existingCO),
				operandVersions: tc.desiredVersions,
				namespace:       "test",
			}

			degraded, err := optr.shouldReportDegraded()
			g.Expect(err).ToNot(HaveOccurred())
			g.Expect(degraded).To(Equal(tc.expectedResult))
		})
	}
}
51 changes: 35 additions & 16 deletions pkg/operator/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,22 +102,32 @@ func (optr *Operator) syncAll(config *OperatorConfig) (reconcile.Result, error)

if len(errors) > 0 {
err := utilerrors.NewAggregate(errors)
if err := optr.statusDegraded(err.Error()); err != nil {
// Just log the error here. We still want to
// return the outer error.
klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
if shouldDegraded, checkErr := optr.shouldReportDegraded(); checkErr != nil {
klog.Errorf("Error checking upgrade status: %v", checkErr)
} else if shouldDegraded {
if err := optr.statusDegraded(err.Error()); err != nil {
klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
}
} else {
klog.Infof("Suppressing Degraded status during upgrade: %s", err.Error())
}

klog.Errorf("Error syncing machine controller components: %v", err)
return reconcile.Result{}, err
}

result, err := optr.checkRolloutStatus(config)
if err != nil {
if err := optr.statusDegraded(err.Error()); err != nil {
// Just log the error here. We still want to
// return the outer error.
klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
if shouldDegraded, checkErr := optr.shouldReportDegraded(); checkErr != nil {
klog.Errorf("Error checking upgrade status: %v", checkErr)
} else if shouldDegraded {
if err := optr.statusDegraded(err.Error()); err != nil {
klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
}
} else {
klog.Infof("Suppressing Degraded status during upgrade: %s", err.Error())
}

klog.Errorf("Error waiting for resource to sync: %v", err)
return reconcile.Result{}, err
}
Expand All @@ -130,21 +140,30 @@ func (optr *Operator) syncAll(config *OperatorConfig) (reconcile.Result, error)

initializing, err := optr.isInitializing()
if err != nil {
if err := optr.statusDegraded(err.Error()); err != nil {
// Just log the error here. We still want to
// return the outer error.
klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
if shouldDegraded, checkErr := optr.shouldReportDegraded(); checkErr != nil {
klog.Errorf("Error checking upgrade status: %v", checkErr)
} else if shouldDegraded {
if err := optr.statusDegraded(err.Error()); err != nil {
klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
}
} else {
klog.Infof("Suppressing Degraded status during upgrade: %s", err.Error())
}

klog.Errorf("Error determining state of operator: %v", err)
return reconcile.Result{}, err
}

if initializing {
if err := optr.checkMinimumWorkerMachines(); err != nil {
if err := optr.statusDegraded(err.Error()); err != nil {
// Just log the error here. We still want to
// return the outer error.
klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
if shouldDegraded, checkErr := optr.shouldReportDegraded(); checkErr != nil {
klog.Errorf("Error checking upgrade status: %v", checkErr)
} else if shouldDegraded {
if err := optr.statusDegraded(err.Error()); err != nil {
klog.Errorf("Error syncing ClusterOperatorStatus: %v", err)
}
} else {
klog.Infof("Suppressing Degraded status during upgrade: %s", err.Error())
}

klog.Errorf("Cluster is initializing and minimum worker Machine requirements are not met: %v", err)
Expand Down
Loading