Skip to content

Commit 019cea5

Browse files
tianfeng92 and claude authored
feat(logstorage): validate replicas < node count to prevent ILM stall (tigera#4529)
* fix(logstorage): validate replicas < node count to prevent ILM stall On single-node ES clusters with replicas: 1, replica shards can never be allocated. This causes the ILM warm phase migrate action to wait indefinitely for shard copies to become active, blocking progression to the delete phase and causing indices to accumulate beyond retention. Add validation in the LogStorage initializer that rejects configurations where indices.replicas >= nodes.count, with a clear error message guiding users to set replicas to 0 for single-node deployments. * fix(logstorage): warn when node count only exceeds replicas by 1 * fmt * fix(logstorage): return error as last argument per Go convention Fixes ST1008 staticcheck violation by swapping return order from (error, string) to (string, error) in validateLogStorage and validateReplicasForNodeCount. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c9cc828 commit 019cea5

2 files changed

Lines changed: 160 additions & 1 deletion

File tree

pkg/controller/logstorage/initializer/initializing_controller.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,35 @@ func FillDefaults(opr *operatorv1.LogStorage) {
168168
}
169169
}
170170

171+
func validateLogStorage(spec *operatorv1.LogStorageSpec) (string, error) {
172+
warning, err := validateReplicasForNodeCount(spec)
173+
if err != nil {
174+
return "", err
175+
}
176+
if err := validateComponentResources(spec); err != nil {
177+
return "", err
178+
}
179+
return warning, nil
180+
}
181+
182+
func validateReplicasForNodeCount(spec *operatorv1.LogStorageSpec) (string, error) {
183+
if spec.Nodes == nil || spec.Indices == nil || spec.Indices.Replicas == nil {
184+
return "", nil
185+
}
186+
187+
replicas := int(*spec.Indices.Replicas)
188+
nodeCount := int(spec.Nodes.Count)
189+
if replicas > 0 && nodeCount <= replicas {
190+
return "", fmt.Errorf("LogStorage spec.indices.replicas (%d) must be less than spec.nodes.count (%d); replica shards cannot be allocated when there are not enough nodes. For a single-node Elasticsearch cluster, set spec.indices.replicas to 0", replicas, nodeCount)
191+
}
192+
193+
if replicas > 0 && nodeCount == replicas+1 {
194+
return fmt.Sprintf("LogStorage spec.nodes.count (%d) is only 1 more than spec.indices.replicas (%d); this may prevent voluntary pod evictions (e.g., node repaving) due to PodDisruptionBudget constraints. If this is expected for your environment, no action is needed. Otherwise, consider setting spec.nodes.count to at least %d", nodeCount, replicas, replicas+2), nil
195+
}
196+
197+
return "", nil
198+
}
199+
171200
func validateComponentResources(spec *operatorv1.LogStorageSpec) error {
172201
if spec.ComponentResources == nil {
173202
return fmt.Errorf("LogStorage spec.ComponentResources is nil %+v", spec)
@@ -232,13 +261,16 @@ func (r *LogStorageInitializer) Reconcile(ctx context.Context, request reconcile
232261

233262
// Default and validate the object.
234263
FillDefaults(ls)
235-
err = validateComponentResources(&ls.Spec)
264+
warning, err := validateLogStorage(&ls.Spec)
236265
if err != nil {
237266
// Invalid - mark it as such and return.
238267
r.setConditionDegraded(ctx, ls, reqLogger)
239268
r.status.SetDegraded(operatorv1.ResourceValidationError, "An error occurred while validating LogStorage", err, reqLogger)
240269
return reconcile.Result{}, err
241270
}
271+
if warning != "" {
272+
reqLogger.Info(warning)
273+
}
242274

243275
pullSecrets, err := utils.GetNetworkingPullSecrets(install, r.client)
244276
if err != nil {

pkg/controller/logstorage/initializer/initializing_controller_test.go

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,46 @@ var _ = Describe("LogStorage Initializing controller", func() {
184184
Expect(ls.Status.State).Should(Equal(operatorv1.TigeraStatusReady))
185185
})
186186

187+
It("sets a degraded status when replicas >= node count", func() {
188+
var replicas int32 = 1
189+
ls := &operatorv1.LogStorage{}
190+
ls.Name = "tigera-secure"
191+
FillDefaults(ls)
192+
ls.Spec.Indices.Replicas = &replicas
193+
ls.Spec.Nodes.Count = 1
194+
Expect(cli.Create(ctx, ls)).ShouldNot(HaveOccurred())
195+
196+
r, err := NewTestInitializer(cli, scheme, mockStatus, operatorv1.ProviderNone, dns.DefaultClusterDomain)
197+
Expect(err).ShouldNot(HaveOccurred())
198+
_, err = r.Reconcile(ctx, reconcile.Request{})
199+
Expect(err).Should(HaveOccurred())
200+
Expect(mockStatus.AssertNumberOfCalls(GinkgoT(), "SetDegraded", 1)).Should(BeTrue())
201+
202+
ls = &operatorv1.LogStorage{}
203+
Expect(cli.Get(ctx, client.ObjectKey{Name: "tigera-secure"}, ls)).ShouldNot(HaveOccurred())
204+
Expect(ls.Status.State).Should(Equal(operatorv1.TigeraStatusDegraded))
205+
})
206+
207+
It("logs a warning but does not degrade when node count only exceeds replicas by 1", func() {
208+
var replicas int32 = 1
209+
ls := &operatorv1.LogStorage{}
210+
ls.Name = "tigera-secure"
211+
FillDefaults(ls)
212+
ls.Spec.Indices.Replicas = &replicas
213+
ls.Spec.Nodes.Count = 2
214+
Expect(cli.Create(ctx, ls)).ShouldNot(HaveOccurred())
215+
216+
r, err := NewTestInitializer(cli, scheme, mockStatus, operatorv1.ProviderNone, dns.DefaultClusterDomain)
217+
Expect(err).ShouldNot(HaveOccurred())
218+
_, err = r.Reconcile(ctx, reconcile.Request{})
219+
Expect(err).ShouldNot(HaveOccurred())
220+
Expect(mockStatus.AssertNumberOfCalls(GinkgoT(), "SetDegraded", 0)).Should(BeTrue())
221+
222+
ls = &operatorv1.LogStorage{}
223+
Expect(cli.Get(ctx, client.ObjectKey{Name: "tigera-secure"}, ls)).ShouldNot(HaveOccurred())
224+
Expect(ls.Status.State).Should(Equal(operatorv1.TigeraStatusReady))
225+
})
226+
187227
It("handles LogStorage deletion", func() {
188228
// Create a LogStorage instance.
189229
ls := &operatorv1.LogStorage{}
@@ -352,6 +392,93 @@ var _ = Describe("LogStorage Initializing controller", func() {
352392
})
353393
})
354394

395+
Context("validateReplicasForNodeCount", func() {
396+
It("should return an error when replicas is 1 and node count is 1", func() {
397+
var replicas int32 = 1
398+
spec := &operatorv1.LogStorageSpec{
399+
Nodes: &operatorv1.Nodes{Count: 1},
400+
Indices: &operatorv1.Indices{Replicas: &replicas},
401+
}
402+
warning, err := validateReplicasForNodeCount(spec)
403+
Expect(err).NotTo(BeNil())
404+
Expect(warning).To(BeEmpty())
405+
})
406+
407+
It("should return an error when replicas equals node count", func() {
408+
var replicas int32 = 2
409+
spec := &operatorv1.LogStorageSpec{
410+
Nodes: &operatorv1.Nodes{Count: 2},
411+
Indices: &operatorv1.Indices{Replicas: &replicas},
412+
}
413+
warning, err := validateReplicasForNodeCount(spec)
414+
Expect(err).NotTo(BeNil())
415+
Expect(warning).To(BeEmpty())
416+
})
417+
418+
It("should return a warning when node count is only 1 more than replicas", func() {
419+
var replicas int32 = 1
420+
spec := &operatorv1.LogStorageSpec{
421+
Nodes: &operatorv1.Nodes{Count: 2},
422+
Indices: &operatorv1.Indices{Replicas: &replicas},
423+
}
424+
warning, err := validateReplicasForNodeCount(spec)
425+
Expect(err).To(BeNil())
426+
Expect(warning).To(ContainSubstring("only 1 more than"))
427+
})
428+
429+
It("should return a warning when node count is 3 and replicas is 2", func() {
430+
var replicas int32 = 2
431+
spec := &operatorv1.LogStorageSpec{
432+
Nodes: &operatorv1.Nodes{Count: 3},
433+
Indices: &operatorv1.Indices{Replicas: &replicas},
434+
}
435+
warning, err := validateReplicasForNodeCount(spec)
436+
Expect(err).To(BeNil())
437+
Expect(warning).To(ContainSubstring("only 1 more than"))
438+
})
439+
440+
It("should return no error or warning when node count exceeds replicas by 2 or more", func() {
441+
var replicas int32 = 1
442+
spec := &operatorv1.LogStorageSpec{
443+
Nodes: &operatorv1.Nodes{Count: 3},
444+
Indices: &operatorv1.Indices{Replicas: &replicas},
445+
}
446+
warning, err := validateReplicasForNodeCount(spec)
447+
Expect(err).To(BeNil())
448+
Expect(warning).To(BeEmpty())
449+
})
450+
451+
It("should return no error or warning when replicas is 0 and node count is 1", func() {
452+
var replicas int32 = 0
453+
spec := &operatorv1.LogStorageSpec{
454+
Nodes: &operatorv1.Nodes{Count: 1},
455+
Indices: &operatorv1.Indices{Replicas: &replicas},
456+
}
457+
warning, err := validateReplicasForNodeCount(spec)
458+
Expect(err).To(BeNil())
459+
Expect(warning).To(BeEmpty())
460+
})
461+
462+
It("should return no error or warning when indices is nil", func() {
463+
spec := &operatorv1.LogStorageSpec{
464+
Nodes: &operatorv1.Nodes{Count: 1},
465+
}
466+
warning, err := validateReplicasForNodeCount(spec)
467+
Expect(err).To(BeNil())
468+
Expect(warning).To(BeEmpty())
469+
})
470+
471+
It("should return no error or warning when nodes is nil", func() {
472+
var replicas int32 = 1
473+
spec := &operatorv1.LogStorageSpec{
474+
Indices: &operatorv1.Indices{Replicas: &replicas},
475+
}
476+
warning, err := validateReplicasForNodeCount(spec)
477+
Expect(err).To(BeNil())
478+
Expect(warning).To(BeEmpty())
479+
})
480+
})
481+
355482
Context("validateComponentResources", func() {
356483
ls := operatorv1.LogStorage{Spec: operatorv1.LogStorageSpec{}}
357484

0 commit comments

Comments (0)