@@ -16,6 +16,9 @@ def add_options(p):
1616 p .add_argument ('--disable-evicts' , action = 'store_true' , help = 'Disable VDisk evicts' )
1717 p .add_argument ('--disable-restarts' , action = 'store_true' , help = 'Disable node restarts' )
1818 p .add_argument ('--enable-pdisk-encryption-keys-changes' , action = 'store_true' , help = 'Enable changes of PDisk encryption keys' )
19+ p .add_argument ('--enable-kill-tablets' , action = 'store_true' , help = 'Enable tablet killer' )
20+ p .add_argument ('--enable-kill-blob-depot' , action = 'store_true' , help = 'Enable BlobDepot killer' )
21+ p .add_argument ('--kill-signal' , type = str , default = 'KILL' , help = 'Kill signal to send to restart node' )
1922
2023
2124def fetch_start_time_map (base_config ):
@@ -84,6 +87,8 @@ def do(args):
8487 config_retries -= 1
8588 continue
8689
90+ tablets = common .fetch_json_info ('tabletinfo' ) if args .enable_kill_tablets or args .enable_kill_blob_depot else {}
91+
8792 config_retries = None
8893
8994 for vslot in base_config .VSlot :
@@ -135,7 +140,7 @@ def do_restart(node_id):
135140 host = node_fqdn_map [node_id ]
136141 if args .enable_pdisk_encryption_keys_changes :
137142 update_pdisk_key_config (node_fqdn_map , pdisk_keys , node_id )
138- subprocess .call (['ssh' , host , 'sudo' , 'killall' , '-9' , 'kikimr' ])
143+ subprocess .call (['ssh' , host , 'sudo' , 'killall' , '-%s' % args . kill_signal , 'kikimr' ])
139144 if args .enable_pdisk_encryption_keys_changes :
140145 remove_old_pdisk_keys (pdisk_keys , pdisk_key_versions , node_id )
141146
@@ -185,6 +190,29 @@ def do_add_pdisk_key(node_id):
185190 "version" : v ,
186191 "file" : "keynumber" + str (v )})
187192
def do_kill_tablet():
    """Pick one random active leader tablet and request its restart.

    Candidates come from the enclosing ``tablets`` mapping: every entry whose
    ``State`` is ``'Active'`` and whose ``Leader`` flag is truthy. When there
    are no candidates the function does nothing, matching the behaviour of
    ``do_kill_blob_depot``.
    """
    candidates = [
        info
        for info in tablets.values()
        if info['State'] == 'Active' and info['Leader']
    ]
    if not candidates:
        # random.choice() raises IndexError on an empty sequence; an empty
        # candidate list simply means there is nothing to kill this round.
        return
    item = random.choice(candidates)
    tablet_id = int(item['TabletId'])
    print('Killing tablet %d of type %s' % (tablet_id, item['Type']))
    # NOTE(review): presumably the 'tablets' endpoint restarts the tablet named
    # by RestartTabletID; cache=False so the request is always issued.
    common.fetch('tablets', dict(RestartTabletID=tablet_id), fmt='raw', cache=False)
203+
def do_kill_blob_depot():
    """Pick one random active leader BlobDepot tablet and request its restart.

    Entries of the enclosing ``tablets`` mapping qualify when ``State`` is
    ``'Active'``, ``Leader`` is truthy and ``Type`` is ``'BlobDepot'``.
    Nothing happens when no entry qualifies.
    """
    depots = []
    for info in tablets.values():
        is_live_leader = info['State'] == 'Active' and info['Leader']
        if is_live_leader and info['Type'] == 'BlobDepot':
            depots.append(info)

    if not depots:
        return

    victim = random.choice(depots)
    victim_id = int(victim['TabletId'])
    print('Killing tablet %d of type %s' % (victim_id, victim['Type']))
    # NOTE(review): presumably the 'tablets' endpoint restarts the tablet named
    # by RestartTabletID; confirm against common.fetch.
    common.fetch('tablets', dict(RestartTabletID=victim_id), fmt='raw', cache=False)
215+
188216 ################################################################################################################
189217
190218 now = datetime .utcnow ()
@@ -193,19 +221,45 @@ def do_add_pdisk_key(node_id):
193221
194222 possible_actions = []
195223
224+ if args .enable_kill_tablets :
225+ possible_actions .append (('kill tablet' , (do_kill_tablet ,)))
226+ if args .enable_kill_blob_depot :
227+ possible_actions .append (('kill blob depot' , (do_kill_blob_depot ,)))
228+
229+ evicts = []
230+ wipes = []
231+ readonlies = []
232+ unreadonlies = []
233+
# Sort the per-vslot candidate actions into one bucket per action kind
# (un-readonly / evict / wipe / readonly). Only vslots of dynamic groups are
# considered.
for vslot in base_config.VSlot:
    if common.is_dynamic_group(vslot.GroupId):
        vslot_id = common.get_vslot_id(vslot.VSlotId)
        vdisk_id = '[%08x:%d:%d:%d]' % (vslot.GroupId, vslot.FailRealmIdx, vslot.FailDomainIdx, vslot.VDiskIdx)
        if vslot_id in vslot_readonly and not args.disable_readonly:
            unreadonlies.append(('un-readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, False)))
        if can_act_on_vslot(*vslot_id) and (recent_restarts or args.disable_restarts):
            if not args.disable_evicts:
                evicts.append(('evict vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_evict, vslot_id)))
            if not args.disable_wipes:
                wipes.append(('wipe vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_wipe, vslot)))
            if not args.disable_readonly:
                # Must be `readonlies` (the list declared above and consumed
                # below); the misspelled `readolies` raised NameError.
                readonlies.append(('readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, True)))
247+
def pick(v):
    """Run one randomly selected action from ``v``.

    ``v`` is a non-empty sequence of ``(name, (callable, *args))`` pairs. The
    chosen action's name is printed before ``callable(*args)`` is invoked.
    """
    label, call = random.choice(v)
    print(label)
    handler = call[0]
    handler(*call[1:])
252+
253+ if evicts :
254+ possible_actions .append (('evict' , (pick , evicts )))
255+ if wipes :
256+ possible_actions .append (('wipe' , (pick , wipes )))
257+ if readonlies :
258+ possible_actions .append (('readonly' , (pick , readonlies )))
259+ if unreadonlies :
260+ possible_actions .append (('un-readonly' , (pick , unreadonlies )))
261+
262+ restarts = []
209263
210264 if start_time_map and len (recent_restarts ) < 3 :
211265 # sort so that the latest restarts come first
@@ -216,7 +270,10 @@ def do_add_pdisk_key(node_id):
216270 if args .enable_pdisk_encryption_keys_changes :
217271 possible_actions .append (('add new pdisk key to node with id: %d' % node_id , (do_add_pdisk_key , node_id )))
218272 if not args .disable_restarts :
219- possible_actions .append (('restart node with id: %d' % node_id , (do_restart , node_id )))
273+ restarts .append (('restart node with id: %d' % node_id , (do_restart , node_id )))
274+
275+ if restarts :
276+ possible_actions .append (('restart' , (pick , restarts )))
220277
221278 if not possible_actions :
222279 common .print_if_not_quiet (args , 'Waiting for the next round...' , file = sys .stdout )
@@ -226,7 +283,7 @@ def do_add_pdisk_key(node_id):
226283 ################################################################################################################
227284
228285 action_name , action = random .choice (possible_actions )
229- common . print_if_not_quiet ( args , '%s' % action_name , file = sys . stdout )
286+ print ( '%s %s ' % ( action_name , datetime . utcnow (). strftime ( '%Y-%m-%dT%H:%M:%S' )) )
230287
231288 try :
232289 action [0 ](* action [1 :])
0 commit comments