@@ -221,19 +221,21 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
221221 assert .ErrorIs (t , err , context .Canceled )
222222 },
223223 },
224- "abdicate when store significantly behind raft" : {
224+ "abdicate when store significantly behind raft and never catches up " : {
225225 setup : func (t * testing.T ) (* DynamicLeaderElection , context.Context , context.CancelFunc ) {
226226 m := newMocksourceNode (t )
227227 leaderCh := make (chan bool , 2 )
228228 m .EXPECT ().leaderCh ().Return ((<- chan bool )(leaderCh ))
229- // GetState called in verifyState (follower start) and in leader sync check
230- m .EXPECT ().GetState ().Return (& RaftBlockState {Height : 10 })
231- m .EXPECT ().GetState ().Return (& RaftBlockState {Height : 10 })
232- m .EXPECT ().Config ().Return (testCfg ()).Times (2 )
229+ // GetState is called in verifyState, leader sync check, and the wait loop.
230+ m .EXPECT ().GetState ().Return (& RaftBlockState {Height : 10 }).Maybe ()
231+ // Config: once for follower waitForMsgsLanded, once for leader waitForMsgsLanded,
232+ // once inside waitForBlockStoreSync.
233+ m .EXPECT ().Config ().Return (testCfg ()).Times (3 )
233234 m .EXPECT ().waitForMsgsLanded (2 * time .Millisecond ).Return (nil )
234- m .EXPECT ().NodeID ().Return ("self" )
235- m .EXPECT ().leaderID ().Return ("self" )
236- // Abdication must transfer leadership
235+ // NodeID + leaderID: called in leader-sync check and polled inside the wait loop.
236+ m .EXPECT ().NodeID ().Return ("self" ).Maybe ()
237+ m .EXPECT ().leaderID ().Return ("self" ).Maybe ()
238+ // Abdication must transfer leadership after wait times out.
237239 m .EXPECT ().leadershipTransfer ().Return (nil )
238240
239241 fStarted := make (chan struct {})
@@ -262,12 +264,12 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
262264 leaderCh <- false
263265 <- fStarted
264266 leaderCh <- true
265- // Wait for abdication to complete (transfer + continue) then verify
266- // the leader was never started before cancelling .
267+ // Wait long enough for the sync wait to time out and abdication to
268+ // complete, then verify the leader was never started.
267269 select {
268270 case <- leaderStarted :
269271 t .Error ("leader should not start when store is significantly behind raft" )
270- case <- time .After (50 * time .Millisecond ):
272+ case <- time .After (200 * time .Millisecond ):
271273 // leadership transferred without starting leader — expected
272274 }
273275 cancel ()
@@ -279,6 +281,67 @@ func TestDynamicLeaderElectionRun(t *testing.T) {
279281 assert .ErrorIs (t , err , context .Canceled )
280282 },
281283 },
284+ "proceed as leader when store catches up during wait" : {
285+ // Simulates the hot-potato scenario: all nodes behind on election,
286+ // but the winner's block store syncs up before the wait times out.
287+ setup : func (t * testing.T ) (* DynamicLeaderElection , context.Context , context.CancelFunc ) {
288+ m := newMocksourceNode (t )
289+ leaderCh := make (chan bool , 2 )
290+ m .EXPECT ().leaderCh ().Return ((<- chan bool )(leaderCh ))
291+ m .EXPECT ().GetState ().Return (& RaftBlockState {Height : 10 }).Maybe ()
292+ m .EXPECT ().Config ().Return (testCfg ()).Maybe ()
293+ m .EXPECT ().waitForMsgsLanded (2 * time .Millisecond ).Return (nil )
294+ m .EXPECT ().NodeID ().Return ("self" ).Maybe ()
295+ m .EXPECT ().leaderID ().Return ("self" ).Maybe ()
296+ // No leadershipTransfer: the node should stay as leader.
297+
298+ fStarted := make (chan struct {})
299+ var syncedCalls int
300+ follower := & testRunnable {
301+ startedCh : fStarted ,
302+ isSyncedFn : func (* RaftBlockState ) (int , error ) {
303+ syncedCalls ++
304+ if syncedCalls < 3 {
305+ return - 5 , nil // still catching up
306+ }
307+ return 0 , nil // caught up
308+ },
309+ }
310+ leaderStarted := make (chan struct {})
311+ leader := & testRunnable {
312+ startedCh : leaderStarted ,
313+ runFn : func (ctx context.Context ) error {
314+ <- ctx .Done ()
315+ return ctx .Err ()
316+ },
317+ }
318+
319+ logger := zerolog .Nop ()
320+ d := & DynamicLeaderElection {logger : logger , node : m ,
321+ leaderFactory : func () (Runnable , error ) { return leader , nil },
322+ followerFactory : func () (Runnable , error ) { return follower , nil },
323+ }
324+ ctx , cancel := context .WithCancel (t .Context ())
325+ go func () {
326+ leaderCh <- false
327+ <- fStarted
328+ leaderCh <- true
329+ // The leader must start once the store catches up.
330+ select {
331+ case <- leaderStarted :
332+ // expected: leader started after store synced
333+ case <- time .After (200 * time .Millisecond ):
334+ t .Error ("leader should have started once store caught up" )
335+ }
336+ cancel ()
337+ }()
338+ return d , ctx , cancel
339+ },
340+ assertF : func (t * testing.T , err error ) {
341+ require .Error (t , err )
342+ assert .ErrorIs (t , err , context .Canceled )
343+ },
344+ },
282345 "lost leadership during sync wait" : {
283346 setup : func (t * testing.T ) (* DynamicLeaderElection , context.Context , context.CancelFunc ) {
284347 m := newMocksourceNode (t )
0 commit comments