
Commit 5897ef3

auricom and claude committed
fix(raft): wait for block-store sync before abdicating on leader election
When all nodes restart simultaneously their block stores can lag behind the raft FSM height (block data arrives via p2p, not raft). With the previous code every elected node saw diff < -1 and immediately called leadershipTransfer(), creating an infinite hot-potato: no node ever stabilised as leader and block production stalled.

Instead of abdicating immediately, the new waitForBlockStoreSync helper polls IsSynced for up to ShutdownTimeout (default ~1s). The fastest-syncing peer proceeds as leader; nodes that cannot catch up in time still abdicate and yield to a better candidate. Leadership is also checked mid-wait, so a lost-leadership event aborts the wait early.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 84ec0d0 commit 5897ef3

2 files changed

Lines changed: 134 additions & 20 deletions
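The threshold the election loop applies is the same one waitForBlockStoreSync polls for: the block store may be at most one block behind the raft FSM, because RecoverFromRaft can only replay the single latest block. A minimal sketch of that criterion, assuming IsSynced reports store height minus raft height (the sign convention implied by the -diff "store_lag_blocks" log field in the diff below); storeCloseEnough is a hypothetical helper name used only for illustration:

package main

import "fmt"

// Hedged sketch of the sync criterion the fix relies on. Assumes IsSynced
// reports store height minus raft FSM height, so negative values mean the
// block store is behind.
func storeCloseEnough(storeHeight, raftHeight int) bool {
	diff := storeHeight - raftHeight
	// Within one block is acceptable: RecoverFromRaft can apply the single
	// latest block from the raft snapshot, so leadership can proceed.
	return diff >= -1
}

func main() {
	fmt.Println(storeCloseEnough(5, 10))  // false: 5 blocks behind, wait or abdicate
	fmt.Println(storeCloseEnough(9, 10))  // true: one block behind, recoverable
	fmt.Println(storeCloseEnough(10, 10)) // true: fully synced
}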


pkg/raft/election.go

Lines changed: 60 additions & 9 deletions
@@ -136,19 +136,36 @@ func (d *DynamicLeaderElection) Run(ctx context.Context) error {
 			// Store is more than 1 block behind raft state.
 			// RecoverFromRaft can only apply the single latest block
 			// from the raft snapshot; it cannot replay a larger gap.
-			// Starting leader operations in this state would stall block
-			// production until catch-up completes (potentially minutes or
-			// hours). Abdicate immediately so a better-synced peer can
-			// take leadership.
+			//
+			// Before abdicating, wait for p2p block-store sync to close
+			// the gap. If all nodes restart simultaneously with lagging
+			// block stores, immediate abdication causes a leadership
+			// hot-potato: every elected node abdicates at once and the
+			// cluster never stabilises. Waiting gives the fastest-syncing
+			// peer a chance to stay as leader.
 			d.logger.Warn().
 				Int("store_lag_blocks", -diff).
 				Uint64("raft_height", raftState.Height).
-				Msg("became leader but store is significantly behind raft state; abdicating to prevent stalled block production")
-			if tErr := d.node.leadershipTransfer(); tErr != nil {
-				d.logger.Error().Err(tErr).Msg("leadership transfer failed after store-lag abdication")
-				return fmt.Errorf("leadership transfer failed after store-lag abdication: %w", tErr)
+				Msg("became leader but store is significantly behind raft state; waiting for block-store sync")
+			if !d.waitForBlockStoreSync(ctx, runnable) {
+				d.logger.Warn().
+					Int("store_lag_blocks", -diff).
+					Uint64("raft_height", raftState.Height).
+					Msg("store still significantly behind raft state after wait; abdicating to prevent stalled block production")
+				if tErr := d.node.leadershipTransfer(); tErr != nil {
+					d.logger.Error().Err(tErr).Msg("leadership transfer failed after store-lag abdication")
+					return fmt.Errorf("leadership transfer failed after store-lag abdication: %w", tErr)
+				}
+				continue
+			}
+			// Block store caught up — refresh state so the recovery
+			// check below works with the latest values.
+			d.logger.Info().Msg("block store caught up after wait; proceeding as leader")
+			raftState = d.node.GetState()
+			diff, err = runnable.IsSynced(raftState)
+			if err != nil {
+				return err
 			}
-			continue
 		}
 		if diff != 0 {
 			d.logger.Info().Msg("became leader but not synced, attempting recovery")
@@ -271,3 +288,37 @@ func (d *DynamicLeaderElection) verifyState(ctx context.Context, runnable Runnable
 func (d *DynamicLeaderElection) IsRunning() bool {
 	return d.running.Load()
 }
+
+// waitForBlockStoreSync polls IsSynced until the block store is within 1 block
+// of the current raft FSM height, leadership is lost, or the context expires.
+// Returns true if sync was achieved in time.
+func (d *DynamicLeaderElection) waitForBlockStoreSync(ctx context.Context, r Runnable) bool {
+	cfg := d.node.Config()
+	timeout := cfg.ShutdownTimeout
+	if timeout <= 0 {
+		timeout = 5 * cfg.SendTimeout
+	}
+	deadline := time.NewTimer(timeout)
+	defer deadline.Stop()
+	pollInterval := min(100*time.Millisecond, timeout/10)
+	ticker := time.NewTicker(pollInterval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return false
+		case <-deadline.C:
+			// Final check before giving up.
+			diff, err := r.IsSynced(d.node.GetState())
+			return err == nil && diff >= -1
+		case <-ticker.C:
+			if d.node.leaderID() != d.node.NodeID() {
+				return false // lost leadership during wait
+			}
+			diff, err := r.IsSynced(d.node.GetState())
+			if err == nil && diff >= -1 {
+				return true
+			}
+		}
+	}
+}
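To get a feel for how long a newly elected node waits before abdicating: the ~1s default ShutdownTimeout noted in the commit message, combined with the pollInterval computation above, works out to roughly ten sync checks plus a final one at the deadline. A minimal standalone sketch of that arithmetic; the SendTimeout value here is an assumption for illustration only (it just feeds the fallback branch) and is not the repo's actual config:

package main

import (
	"fmt"
	"time"
)

func main() {
	shutdownTimeout := 1 * time.Second    // ~1s default, per the commit message
	sendTimeout := 200 * time.Millisecond // assumed value; only used by the fallback below

	// Mirrors the timeout fallback and poll cadence in waitForBlockStoreSync.
	timeout := shutdownTimeout
	if timeout <= 0 {
		timeout = 5 * sendTimeout
	}
	pollInterval := min(100*time.Millisecond, timeout/10)

	fmt.Printf("deadline=%v pollInterval=%v polls≈%d (plus one final check at the deadline)\n",
		timeout, pollInterval, int(timeout/pollInterval))
	// With these numbers: deadline=1s pollInterval=100ms polls≈10
}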

pkg/raft/election_test.go

Lines changed: 74 additions & 11 deletions
@@ -221,19 +221,21 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
 			assert.ErrorIs(t, err, context.Canceled)
 		},
 	},
-	"abdicate when store significantly behind raft": {
+	"abdicate when store significantly behind raft and never catches up": {
 		setup: func(t *testing.T) (*DynamicLeaderElection, context.Context, context.CancelFunc) {
 			m := newMocksourceNode(t)
 			leaderCh := make(chan bool, 2)
 			m.EXPECT().leaderCh().Return((<-chan bool)(leaderCh))
-			// GetState called in verifyState (follower start) and in leader sync check
-			m.EXPECT().GetState().Return(&RaftBlockState{Height: 10})
-			m.EXPECT().GetState().Return(&RaftBlockState{Height: 10})
-			m.EXPECT().Config().Return(testCfg()).Times(2)
+			// GetState is called in verifyState, leader sync check, and the wait loop.
+			m.EXPECT().GetState().Return(&RaftBlockState{Height: 10}).Maybe()
+			// Config: once for follower waitForMsgsLanded, once for leader waitForMsgsLanded,
+			// once inside waitForBlockStoreSync.
+			m.EXPECT().Config().Return(testCfg()).Times(3)
 			m.EXPECT().waitForMsgsLanded(2 * time.Millisecond).Return(nil)
-			m.EXPECT().NodeID().Return("self")
-			m.EXPECT().leaderID().Return("self")
-			// Abdication must transfer leadership
+			// NodeID + leaderID: called in leader-sync check and polled inside the wait loop.
+			m.EXPECT().NodeID().Return("self").Maybe()
+			m.EXPECT().leaderID().Return("self").Maybe()
+			// Abdication must transfer leadership after wait times out.
 			m.EXPECT().leadershipTransfer().Return(nil)
 
 			fStarted := make(chan struct{})
@@ -262,12 +264,12 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
 				leaderCh <- false
 				<-fStarted
 				leaderCh <- true
-				// Wait for abdication to complete (transfer + continue) then verify
-				// the leader was never started before cancelling.
+				// Wait long enough for the sync wait to time out and abdication to
+				// complete, then verify the leader was never started.
 				select {
 				case <-leaderStarted:
 					t.Error("leader should not start when store is significantly behind raft")
-				case <-time.After(50 * time.Millisecond):
+				case <-time.After(200 * time.Millisecond):
 					// leadership transferred without starting leader — expected
 				}
 				cancel()
@@ -279,6 +281,67 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
 			assert.ErrorIs(t, err, context.Canceled)
 		},
 	},
+	"proceed as leader when store catches up during wait": {
+		// Simulates the hot-potato scenario: all nodes behind on election,
+		// but the winner's block store syncs up before the wait times out.
+		setup: func(t *testing.T) (*DynamicLeaderElection, context.Context, context.CancelFunc) {
+			m := newMocksourceNode(t)
+			leaderCh := make(chan bool, 2)
+			m.EXPECT().leaderCh().Return((<-chan bool)(leaderCh))
+			m.EXPECT().GetState().Return(&RaftBlockState{Height: 10}).Maybe()
+			m.EXPECT().Config().Return(testCfg()).Maybe()
+			m.EXPECT().waitForMsgsLanded(2 * time.Millisecond).Return(nil)
+			m.EXPECT().NodeID().Return("self").Maybe()
+			m.EXPECT().leaderID().Return("self").Maybe()
+			// No leadershipTransfer: the node should stay as leader.
+
+			fStarted := make(chan struct{})
+			var syncedCalls int
+			follower := &testRunnable{
+				startedCh: fStarted,
+				isSyncedFn: func(*RaftBlockState) (int, error) {
+					syncedCalls++
+					if syncedCalls < 3 {
+						return -5, nil // still catching up
+					}
+					return 0, nil // caught up
+				},
+			}
+			leaderStarted := make(chan struct{})
+			leader := &testRunnable{
+				startedCh: leaderStarted,
+				runFn: func(ctx context.Context) error {
+					<-ctx.Done()
+					return ctx.Err()
+				},
+			}
+
+			logger := zerolog.Nop()
+			d := &DynamicLeaderElection{logger: logger, node: m,
+				leaderFactory: func() (Runnable, error) { return leader, nil },
+				followerFactory: func() (Runnable, error) { return follower, nil },
+			}
+			ctx, cancel := context.WithCancel(t.Context())
+			go func() {
+				leaderCh <- false
+				<-fStarted
+				leaderCh <- true
+				// The leader must start once the store catches up.
+				select {
+				case <-leaderStarted:
+					// expected: leader started after store synced
+				case <-time.After(200 * time.Millisecond):
+					t.Error("leader should have started once store caught up")
+				}
+				cancel()
+			}()
+			return d, ctx, cancel
+		},
+		assertF: func(t *testing.T, err error) {
+			require.Error(t, err)
+			assert.ErrorIs(t, err, context.Canceled)
+		},
+	},
 	"lost leadership during sync wait": {
 		setup: func(t *testing.T) (*DynamicLeaderElection, context.Context, context.CancelFunc) {
 			m := newMocksourceNode(t)
