OCPBUGS-62633: Fix Progressing/Available status conditions on node reboot #349
base: main

Changes from all commits
@@ -6,6 +6,7 @@ import (

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/utils/ptr"

	operatorv1 "github.com/openshift/api/operator/v1"
	"github.com/openshift/library-go/pkg/operator/v1helpers"
@@ -86,38 +87,39 @@ func isDeploymentStatusAvailableAndUpdated(deploy appsv1.Deployment) bool {
}

func isDeploymentStatusComplete(deploy appsv1.Deployment) bool {
-	replicas := int32(1)
-	if deploy.Spec.Replicas != nil {
-		replicas = *(deploy.Spec.Replicas)
-	}
-	return deploy.Status.UpdatedReplicas == replicas &&
-		deploy.Status.Replicas == replicas &&
-		deploy.Status.AvailableReplicas == replicas &&
-		deploy.Status.ObservedGeneration >= deploy.Generation
+	desiredReplicas := ptr.Deref(deploy.Spec.Replicas, 1)
+	return isDeploymentUpToDate(deploy) && deploy.Status.AvailableReplicas == desiredReplicas
}

+func isDeploymentUpToDate(deploy appsv1.Deployment) bool {
+	desiredReplicas := ptr.Deref(deploy.Spec.Replicas, 1)
+	return deploy.Status.ObservedGeneration >= deploy.Generation &&
+		deploy.Status.UpdatedReplicas == desiredReplicas &&
+		deploy.Status.Replicas == desiredReplicas
+}
+
-func (c *serviceCAOperator) syncStatus(operatorConfigCopy *operatorv1.ServiceCA, existingDeployments *appsv1.DeploymentList, targetDeploymentNames sets.String) {
+func (c *serviceCAOperator) syncStatus(operatorConfigCopy *operatorv1.ServiceCA, existingDeployments *appsv1.DeploymentList, targetDeploymentNames sets.Set[string]) {
	versionUpdatable := true
	versionUpdatableAndDeploymentsComplete := true
+	deploymentUnavailableButUpToDate := false
	statusMsg := ""
-	existingDeploymentNames := sets.String{}
+	existingDeploymentNames := sets.New[string]()
	for _, dep := range existingDeployments.Items {
		existingDeploymentNames.Insert(dep.Name)
		// If there isn't at least one replica from each deployment, Available=False
		reason := "ManagedDeploymentsNotReady"
		if dep.DeletionTimestamp != nil {
			statusMsg += fmt.Sprintf("\n%s deleting", dep.Name)
			setProgressingTrue(operatorConfigCopy, reason, statusMsg)
			setAvailableFalse(operatorConfigCopy, reason, statusMsg)
			versionUpdatable = false
			versionUpdatableAndDeploymentsComplete = false
			continue
		}
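As an aside on the `ptr.Deref` change: it folds the old nil-check-plus-default into a single call, so a nil `Spec.Replicas` still counts as a desired size of 1. Below is a minimal sketch of how the new helpers evaluate during a node reboot, assuming only `k8s.io/api` and `k8s.io/utils`; the Deployment literal is illustrative, not taken from the operator's manifests.

```go
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/utils/ptr"
)

func main() {
	// Single-replica deployment whose only pod was lost to a node reboot:
	// the spec generation is fully observed, a replacement pod exists,
	// but no replica is available yet.
	var dep appsv1.Deployment
	dep.Generation = 2
	dep.Spec.Replicas = nil // defaulting not applied; Deref falls back to 1
	dep.Status.ObservedGeneration = 2
	dep.Status.Replicas = 1
	dep.Status.UpdatedReplicas = 1
	dep.Status.AvailableReplicas = 0

	// Equivalent of the old "replicas := int32(1); if Spec.Replicas != nil {...}" dance.
	desired := ptr.Deref(dep.Spec.Replicas, 1)

	upToDate := dep.Status.ObservedGeneration >= dep.Generation &&
		dep.Status.UpdatedReplicas == desired &&
		dep.Status.Replicas == desired
	available := dep.Status.AvailableReplicas == desired

	// Prints "desired=1 upToDate=true available=false":
	// the case this PR maps to Available=True / Progressing=False.
	fmt.Printf("desired=%d upToDate=%t available=%t\n", desired, upToDate, available)
}
```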
Member: The status check also seems to expect a very specific deployment with 1 replica.
		if !isDeploymentStatusAvailable(dep) {
-			statusMsg += fmt.Sprintf("\n%s does not have available replicas", dep.Name)
			setProgressingTrue(operatorConfigCopy, reason, statusMsg)
			setAvailableFalse(operatorConfigCopy, reason, statusMsg)
-			versionUpdatable = false
+			if isDeploymentUpToDate(dep) {
Member: This could be a false positive. During a workload rollout, we should report progressing until all the replicas become available. Why can't we depend on the Deployment…
+				statusMsg += fmt.Sprintf("\n%s: all replicas are up-to-date but not yet available", dep.Name)
+				deploymentUnavailableButUpToDate = true
+			} else {
+				statusMsg += fmt.Sprintf("\n%s does not have available replicas", dep.Name)
+				versionUpdatable = false
+			}
tchap marked this conversation as resolved.
			versionUpdatableAndDeploymentsComplete = false
			continue
		}
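On the question of leaning on the Deployment's own status instead of replica counting: the Deployment controller already publishes `Available` and `Progressing` conditions that could be consulted. A rough sketch of that alternative, with hypothetical helper names that are not part of this PR:

```go
// Package statusutil is a hypothetical package name for this sketch.
package statusutil

import (
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
)

// findDeploymentCondition returns the condition of the given type, or nil if unset.
func findDeploymentCondition(deploy appsv1.Deployment, condType appsv1.DeploymentConditionType) *appsv1.DeploymentCondition {
	for i := range deploy.Status.Conditions {
		if deploy.Status.Conditions[i].Type == condType {
			return &deploy.Status.Conditions[i]
		}
	}
	return nil
}

// isDeploymentAvailableByCondition trusts the Deployment controller's Available
// condition rather than comparing replica counts ourselves.
func isDeploymentAvailableByCondition(deploy appsv1.Deployment) bool {
	cond := findDeploymentCondition(deploy, appsv1.DeploymentAvailable)
	return cond != nil && cond.Status == corev1.ConditionTrue
}
```

Worth noting, though: for a single-replica deployment the controller's Available condition also flips to False while the pod is rescheduled, so switching signals would not by itself remove the transient-unavailability window this PR is targeting.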
@@ -149,6 +151,17 @@ func (c *serviceCAOperator) syncStatus(operatorConfigCopy *operatorv1.ServiceCA, | |
| c.setVersion() | ||
| return | ||
| } | ||
| // Deployment is up-to-date but temporarily unavailable (e.g. node reboot, | ||
| // pod eviction). The versionUpdatable guard ensures this only fires when no | ||
| // other deployment is mid-rollout or deleting. Keep Available=True to avoid | ||
| // tripping CVO invariant monitors during transient pod rescheduling. | ||
|
Member: This is incorrect. The deployment is unavailable and since it is a single replica nothing can be done during rescheduling. We either need a monitor exception or increase the number of replicas.
+	if deploymentUnavailableButUpToDate && versionUpdatable {
+		reason := "ManagedDeploymentsUpToDateButUnavailable"
+		setAvailableTrue(operatorConfigCopy, reason)
+		setProgressingFalse(operatorConfigCopy, reason, statusMsg)
+		c.setVersion()
+		return
+	}
	// No instances of previous deployments,
	// some replicas are missing, but each has at least 1 available; set Progressing=true
	if versionUpdatable {
Why do we track multiple deployments? I can only see one. This is unnecessarily confusing.
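To pin down the behavior the PR cares about, a table-driven test along these lines could cover the new split between "up to date" and "complete". The package name and the `newDeploy` helper are assumptions; it exercises only the two helpers shown in the diff.

```go
package operator

import (
	"testing"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/utils/ptr"
)

func TestDeploymentStatusHelpers(t *testing.T) {
	// newDeploy builds a single-replica Deployment with the given status counters.
	newDeploy := func(gen, observed int64, replicas, updated, available int32) appsv1.Deployment {
		var d appsv1.Deployment
		d.Generation = gen
		d.Spec.Replicas = ptr.To(int32(1))
		d.Status.ObservedGeneration = observed
		d.Status.Replicas = replicas
		d.Status.UpdatedReplicas = updated
		d.Status.AvailableReplicas = available
		return d
	}

	tests := []struct {
		name         string
		deploy       appsv1.Deployment
		wantUpToDate bool
		wantComplete bool
	}{
		{"steady state", newDeploy(1, 1, 1, 1, 1), true, true},
		// Node reboot / pod eviction: spec fully observed, replacement pod not yet ready.
		{"up-to-date but unavailable", newDeploy(1, 1, 1, 1, 0), true, false},
		// Rollout in progress: controller has not observed the new generation yet.
		{"rollout in progress", newDeploy(2, 1, 1, 0, 1), false, false},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			if got := isDeploymentUpToDate(tc.deploy); got != tc.wantUpToDate {
				t.Errorf("isDeploymentUpToDate() = %v, want %v", got, tc.wantUpToDate)
			}
			if got := isDeploymentStatusComplete(tc.deploy); got != tc.wantComplete {
				t.Errorf("isDeploymentStatusComplete() = %v, want %v", got, tc.wantComplete)
			}
		})
	}
}
```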