Skip to content

Commit 019cea5

Browse files
tianfeng92 and claude authored
feat(logstorage): validate replicas < node count to prevent ILM stall (tigera#4529)
* fix(logstorage): validate replicas < node count to prevent ILM stall On single-node ES clusters with replicas: 1, replica shards can never be allocated. This causes the ILM warm phase migrate action to wait indefinitely for shard copies to become active, blocking progression to the delete phase and causing indices to accumulate beyond retention. Add validation in the LogStorage initializer that rejects configurations where indices.replicas >= nodes.count, with a clear error message guiding users to set replicas to 0 for single-node deployments. * fix(logstorage): warn when node count only exceeds replicas by 1 * fmt * fix(logstorage): return error as last argument per Go convention Fixes ST1008 staticcheck violation by swapping return order from (error, string) to (string, error) in validateLogStorage and validateReplicasForNodeCount. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c9cc828 commit 019cea5

2 files changed

Lines changed: 160 additions & 1 deletion

File tree

pkg/controller/logstorage/initializer/initializing_controller.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,35 @@ func FillDefaults(opr *operatorv1.LogStorage) {
168168
}
169169
}
170170

171+
func validateLogStorage(spec *operatorv1.LogStorageSpec) (string, error) {
172+
warning, err := validateReplicasForNodeCount(spec)
173+
if err != nil {
174+
return "", err
175+
}
176+
if err := validateComponentResources(spec); err != nil {
177+
return "", err
178+
}
179+
return warning, nil
180+
}
181+
182+
func validateReplicasForNodeCount(spec *operatorv1.LogStorageSpec) (string, error) {
183+
if spec.Nodes == nil || spec.Indices == nil || spec.Indices.Replicas == nil {
184+
return "", nil
185+
}
186+
187+
replicas := int(*spec.Indices.Replicas)
188+
nodeCount := int(spec.Nodes.Count)
189+
if replicas > 0 && nodeCount <= replicas {
190+
return "", fmt.Errorf("LogStorage spec.indices.replicas (%d) must be less than spec.nodes.count (%d); replica shards cannot be allocated when there are not enough nodes. For a single-node Elasticsearch cluster, set spec.indices.replicas to 0", replicas, nodeCount)
191+
}
192+
193+
if replicas > 0 && nodeCount == replicas+1 {
194+
return fmt.Sprintf("LogStorage spec.nodes.count (%d) is only 1 more than spec.indices.replicas (%d); this may prevent voluntary pod evictions (e.g., node repaving) due to PodDisruptionBudget constraints. If this is expected for your environment, no action is needed. Otherwise, consider setting spec.nodes.count to at least %d", nodeCount, replicas, replicas+2), nil
195+
}
196+
197+
return "", nil
198+
}
199+
171200
func validateComponentResources(spec *operatorv1.LogStorageSpec) error {
172201
if spec.ComponentResources == nil {
173202
return fmt.Errorf("LogStorage spec.ComponentResources is nil %+v", spec)
@@ -232,13 +261,16 @@ func (r *LogStorageInitializer) Reconcile(ctx context.Context, request reconcile
232261

233262
// Default and validate the object.
234263
FillDefaults(ls)
235-
err = validateComponentResources(&ls.Spec)
264+
warning, err := validateLogStorage(&ls.Spec)
236265
if err != nil {
237266
// Invalid - mark it as such and return.
238267
r.setConditionDegraded(ctx, ls, reqLogger)
239268
r.status.SetDegraded(operatorv1.ResourceValidationError, "An error occurred while validating LogStorage", err, reqLogger)
240269
return reconcile.Result{}, err
241270
}
271+
if warning != "" {
272+
reqLogger.Info(warning)
273+
}
242274

243275
pullSecrets, err := utils.GetNetworkingPullSecrets(install, r.client)
244276
if err != nil {

pkg/controller/logstorage/initializer/initializing_controller_test.go

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,46 @@ var _ = Describe("LogStorage Initializing controller", func() {
184184
Expect(ls.Status.State).Should(Equal(operatorv1.TigeraStatusReady))
185185
})
186186

187+
It("sets a degraded status when replicas >= node count", func() {
188+
var replicas int32 = 1
189+
ls := &operatorv1.LogStorage{}
190+
ls.Name = "tigera-secure"
191+
FillDefaults(ls)
192+
ls.Spec.Indices.Replicas = &replicas
193+
ls.Spec.Nodes.Count = 1
194+
Expect(cli.Create(ctx, ls)).ShouldNot(HaveOccurred())
195+
196+
r, err := NewTestInitializer(cli, scheme, mockStatus, operatorv1.ProviderNone, dns.DefaultClusterDomain)
197+
Expect(err).ShouldNot(HaveOccurred())
198+
_, err = r.Reconcile(ctx, reconcile.Request{})
199+
Expect(err).Should(HaveOccurred())
200+
Expect(mockStatus.AssertNumberOfCalls(GinkgoT(), "SetDegraded", 1)).Should(BeTrue())
201+
202+
ls = &operatorv1.LogStorage{}
203+
Expect(cli.Get(ctx, client.ObjectKey{Name: "tigera-secure"}, ls)).ShouldNot(HaveOccurred())
204+
Expect(ls.Status.State).Should(Equal(operatorv1.TigeraStatusDegraded))
205+
})
206+
207+
It("logs a warning but does not degrade when node count only exceeds replicas by 1", func() {
208+
var replicas int32 = 1
209+
ls := &operatorv1.LogStorage{}
210+
ls.Name = "tigera-secure"
211+
FillDefaults(ls)
212+
ls.Spec.Indices.Replicas = &replicas
213+
ls.Spec.Nodes.Count = 2
214+
Expect(cli.Create(ctx, ls)).ShouldNot(HaveOccurred())
215+
216+
r, err := NewTestInitializer(cli, scheme, mockStatus, operatorv1.ProviderNone, dns.DefaultClusterDomain)
217+
Expect(err).ShouldNot(HaveOccurred())
218+
_, err = r.Reconcile(ctx, reconcile.Request{})
219+
Expect(err).ShouldNot(HaveOccurred())
220+
Expect(mockStatus.AssertNumberOfCalls(GinkgoT(), "SetDegraded", 0)).Should(BeTrue())
221+
222+
ls = &operatorv1.LogStorage{}
223+
Expect(cli.Get(ctx, client.ObjectKey{Name: "tigera-secure"}, ls)).ShouldNot(HaveOccurred())
224+
Expect(ls.Status.State).Should(Equal(operatorv1.TigeraStatusReady))
225+
})
226+
187227
It("handles LogStorage deletion", func() {
188228
// Create a LogStorage instance.
189229
ls := &operatorv1.LogStorage{}
@@ -352,6 +392,93 @@ var _ = Describe("LogStorage Initializing controller", func() {
352392
})
353393
})
354394

395+
Context("validateReplicasForNodeCount", func() {
396+
It("should return an error when replicas is 1 and node count is 1", func() {
397+
var replicas int32 = 1
398+
spec := &operatorv1.LogStorageSpec{
399+
Nodes: &operatorv1.Nodes{Count: 1},
400+
Indices: &operatorv1.Indices{Replicas: &replicas},
401+
}
402+
warning, err := validateReplicasForNodeCount(spec)
403+
Expect(err).NotTo(BeNil())
404+
Expect(warning).To(BeEmpty())
405+
})
406+
407+
It("should return an error when replicas equals node count", func() {
408+
var replicas int32 = 2
409+
spec := &operatorv1.LogStorageSpec{
410+
Nodes: &operatorv1.Nodes{Count: 2},
411+
Indices: &operatorv1.Indices{Replicas: &replicas},
412+
}
413+
warning, err := validateReplicasForNodeCount(spec)
414+
Expect(err).NotTo(BeNil())
415+
Expect(warning).To(BeEmpty())
416+
})
417+
418+
It("should return a warning when node count is only 1 more than replicas", func() {
419+
var replicas int32 = 1
420+
spec := &operatorv1.LogStorageSpec{
421+
Nodes: &operatorv1.Nodes{Count: 2},
422+
Indices: &operatorv1.Indices{Replicas: &replicas},
423+
}
424+
warning, err := validateReplicasForNodeCount(spec)
425+
Expect(err).To(BeNil())
426+
Expect(warning).To(ContainSubstring("only 1 more than"))
427+
})
428+
429+
It("should return a warning when node count is 3 and replicas is 2", func() {
430+
var replicas int32 = 2
431+
spec := &operatorv1.LogStorageSpec{
432+
Nodes: &operatorv1.Nodes{Count: 3},
433+
Indices: &operatorv1.Indices{Replicas: &replicas},
434+
}
435+
warning, err := validateReplicasForNodeCount(spec)
436+
Expect(err).To(BeNil())
437+
Expect(warning).To(ContainSubstring("only 1 more than"))
438+
})
439+
440+
It("should return no error or warning when node count exceeds replicas by 2 or more", func() {
441+
var replicas int32 = 1
442+
spec := &operatorv1.LogStorageSpec{
443+
Nodes: &operatorv1.Nodes{Count: 3},
444+
Indices: &operatorv1.Indices{Replicas: &replicas},
445+
}
446+
warning, err := validateReplicasForNodeCount(spec)
447+
Expect(err).To(BeNil())
448+
Expect(warning).To(BeEmpty())
449+
})
450+
451+
It("should return no error or warning when replicas is 0 and node count is 1", func() {
452+
var replicas int32 = 0
453+
spec := &operatorv1.LogStorageSpec{
454+
Nodes: &operatorv1.Nodes{Count: 1},
455+
Indices: &operatorv1.Indices{Replicas: &replicas},
456+
}
457+
warning, err := validateReplicasForNodeCount(spec)
458+
Expect(err).To(BeNil())
459+
Expect(warning).To(BeEmpty())
460+
})
461+
462+
It("should return no error or warning when indices is nil", func() {
463+
spec := &operatorv1.LogStorageSpec{
464+
Nodes: &operatorv1.Nodes{Count: 1},
465+
}
466+
warning, err := validateReplicasForNodeCount(spec)
467+
Expect(err).To(BeNil())
468+
Expect(warning).To(BeEmpty())
469+
})
470+
471+
It("should return no error or warning when nodes is nil", func() {
472+
var replicas int32 = 1
473+
spec := &operatorv1.LogStorageSpec{
474+
Indices: &operatorv1.Indices{Replicas: &replicas},
475+
}
476+
warning, err := validateReplicasForNodeCount(spec)
477+
Expect(err).To(BeNil())
478+
Expect(warning).To(BeEmpty())
479+
})
480+
})
481+
355482
Context("validateComponentResources", func() {
356483
ls := operatorv1.LogStorage{Spec: operatorv1.LogStorageSpec{}}
357484

0 commit comments

Comments (0)