Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import javax.management.JMException;
import javax.management.ObjectName;

Expand Down Expand Up @@ -105,9 +102,6 @@ public String metricName() {
// A map of dynamic capacity Gauges. The map's keys could change.
private final Map<String, SimpleDynamicMetric<Long>> _dynamicCapacityMetricsMap;

// Background executor for resetting gauges
private final ScheduledExecutorService _resetExecutor;

/**
* Initialize the bean
* @param clusterName the cluster to monitor
Expand All @@ -123,11 +117,6 @@ public InstanceMonitor(String clusterName, String participantName, ObjectName ob
// Initialize to 0 so that if we haven't received operation info yet, duration will be large
// and will be corrected when we receive the actual operation start time from InstanceConfig
_currentOperationStartTime = 0L;
_resetExecutor = Executors.newSingleThreadScheduledExecutor(r -> {
Thread thread = new Thread(r, "InstanceMonitor-ResetGauges-" + participantName);
thread.setDaemon(true);
return thread;
});

createMetrics();
}
Expand Down Expand Up @@ -381,34 +370,17 @@ public synchronized void updateInstanceOperation(InstanceConstants.InstanceOpera

// Check if operation changed
if (_currentOperation != newOperation) {
// Only update final duration if we had a valid start time
// On first call after controller restart, _currentOperationStartTime may be 0
if (_currentOperationStartTime > 0L) {
long finalDuration = currentTime - _currentOperationStartTime;
updateOperationDurationGauge(_currentOperation, finalDuration);
}

// Now switch to the new operation
// Switch to the new operation
_currentOperation = newOperation;
_currentOperationStartTime = operationStartTime;

// Reset all gauges except the current (new) operation
resetAllExceptOperations(Collections.singleton(_currentOperation));
}

// Update the duration gauge for the current operation
// Update the duration gauge for the current operation (called on every update)
long currentDuration = currentTime - _currentOperationStartTime;
updateOperationDurationGauge(_currentOperation, currentDuration);

// Capture the current operation at scheduling time to avoid race conditions
// If we transition from ENABLE -> EVACUATE -> ENABLE within 2 minutes,
// we want to reset based on what the operation was when we scheduled the task,
// not what it becomes later.
final InstanceConstants.InstanceOperation operationToExclude = _currentOperation;

// Schedule a background task to reset all gauges except the captured operation after 2 minutes
_resetExecutor.schedule(() -> {
synchronized (InstanceMonitor.this) {
resetAllExceptOperations(Collections.singleton(operationToExclude));
}
}, 2, TimeUnit.MINUTES);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,9 @@ public void testInstanceOperationDurationMetrics() throws JMException, Interrupt
Assert.assertTrue(evacuateDuration >= 100L,
"EVACUATE duration should be >= 100ms, but was " + evacuateDuration);

// The previous operation (ENABLE) should still show its final duration
// until the background thread resets it after 2 minutes
// So we just verify it's >= 0 (it will have some duration from before the switch)
Assert.assertTrue(monitor.getInstanceOperationDurationEnable() >= 0L,
"ENABLE duration should retain its final value until background reset");
// The previous operation (ENABLE) should be reset to 0 immediately
Assert.assertEquals(monitor.getInstanceOperationDurationEnable(), 0L,
"ENABLE duration should be reset to 0 when switching to EVACUATE");

// All other operations should be 0
Assert.assertEquals(monitor.getInstanceOperationDurationDisable(), 0L);
Expand All @@ -137,23 +135,23 @@ public void testInstanceOperationDurationMetrics() throws JMException, Interrupt
long disableStartTime = System.currentTimeMillis();
monitor.updateInstanceOperation(InstanceConstants.InstanceOperation.DISABLE, disableStartTime);

// EVACUATE should retain its final duration (until background reset after 2 mins)
// DISABLE should start counting from 0
long finalEvacuateDuration = monitor.getInstanceOperationDurationEvacuate();
Assert.assertTrue(finalEvacuateDuration > 0L,
"EVACUATE duration should retain its final value after switching to DISABLE");
// All gauges except DISABLE should be reset to 0
Assert.assertEquals(monitor.getInstanceOperationDurationEvacuate(), 0L,
"EVACUATE duration should be reset to 0 when switching to DISABLE");
Assert.assertEquals(monitor.getInstanceOperationDurationDisable(), 0L,
"DISABLE duration should start at 0");
Assert.assertEquals(monitor.getInstanceOperationDurationEnable(), 0L,
"ENABLE duration should be reset to 0");

// Wait and verify DISABLE duration increases
Thread.sleep(100);
monitor.updateInstanceOperation(InstanceConstants.InstanceOperation.DISABLE, disableStartTime);
long disableDuration = monitor.getInstanceOperationDurationDisable();
Assert.assertTrue(disableDuration >= 100L,
"DISABLE duration should be >= 100ms, but was " + disableDuration);
// EVACUATE retains its final duration
Assert.assertTrue(monitor.getInstanceOperationDurationEvacuate() > 0L,
"EVACUATE should still show its final duration");
// EVACUATE should remain reset at 0
Assert.assertEquals(monitor.getInstanceOperationDurationEvacuate(), 0L,
"EVACUATE should remain at 0");

// Test SWAP_IN operation
long swapInStartTime = System.currentTimeMillis();
Expand All @@ -164,11 +162,13 @@ public void testInstanceOperationDurationMetrics() throws JMException, Interrupt
long swapInDuration = monitor.getInstanceOperationDurationSwapIn();
Assert.assertTrue(swapInDuration >= 50L,
"SWAP_IN duration should be >= 50ms, but was " + swapInDuration);
// Previous operations retain their final durations
Assert.assertTrue(monitor.getInstanceOperationDurationDisable() > 0L,
"DISABLE should still show its final duration");
Assert.assertTrue(monitor.getInstanceOperationDurationEvacuate() > 0L,
"EVACUATE should still show its final duration");
// All others (DISABLE, EVACUATE, ENABLE) should be reset to 0
Assert.assertEquals(monitor.getInstanceOperationDurationDisable(), 0L,
"DISABLE should be reset to 0");
Assert.assertEquals(monitor.getInstanceOperationDurationEvacuate(), 0L,
"EVACUATE should be reset to 0");
Assert.assertEquals(monitor.getInstanceOperationDurationEnable(), 0L,
"ENABLE should be reset to 0");

// Test UNKNOWN operation
long unknownStartTime = System.currentTimeMillis();
Expand All @@ -179,25 +179,31 @@ public void testInstanceOperationDurationMetrics() throws JMException, Interrupt
long unknownDuration = monitor.getInstanceOperationDurationUnknown();
Assert.assertTrue(unknownDuration >= 50L,
"UNKNOWN duration should be >= 50ms, but was " + unknownDuration);
// SWAP_IN retains its final duration
Assert.assertTrue(monitor.getInstanceOperationDurationSwapIn() > 0L,
"SWAP_IN should still show its final duration");
// All others (SWAP_IN, DISABLE, EVACUATE, ENABLE) should be reset to 0
Assert.assertEquals(monitor.getInstanceOperationDurationSwapIn(), 0L,
"SWAP_IN should be reset to 0");
Assert.assertEquals(monitor.getInstanceOperationDurationDisable(), 0L,
"DISABLE should be reset to 0");
Assert.assertEquals(monitor.getInstanceOperationDurationEvacuate(), 0L,
"EVACUATE should be reset to 0");
Assert.assertEquals(monitor.getInstanceOperationDurationEnable(), 0L,
"ENABLE should be reset to 0");

// Test going back to ENABLE - previous operations retain their final values
// Test going back to ENABLE - all others reset to 0
long enableStartTime = System.currentTimeMillis();
monitor.updateInstanceOperation(InstanceConstants.InstanceOperation.ENABLE, enableStartTime);
Thread.sleep(50);
monitor.updateInstanceOperation(InstanceConstants.InstanceOperation.ENABLE, enableStartTime);

// Previous operations retain their final durations until background reset
Assert.assertTrue(monitor.getInstanceOperationDurationDisable() > 0L,
"DISABLE should retain its final duration");
Assert.assertTrue(monitor.getInstanceOperationDurationEvacuate() > 0L,
"EVACUATE should retain its final duration");
Assert.assertTrue(monitor.getInstanceOperationDurationSwapIn() > 0L,
"SWAP_IN should retain its final duration");
Assert.assertTrue(monitor.getInstanceOperationDurationUnknown() > 0L,
"UNKNOWN should retain its final duration");
// All gauges except ENABLE should be reset to 0
Assert.assertEquals(monitor.getInstanceOperationDurationUnknown(), 0L,
"UNKNOWN should be reset to 0");
Assert.assertEquals(monitor.getInstanceOperationDurationDisable(), 0L,
"DISABLE should be reset to 0");
Assert.assertEquals(monitor.getInstanceOperationDurationEvacuate(), 0L,
"EVACUATE should be reset to 0");
Assert.assertEquals(monitor.getInstanceOperationDurationSwapIn(), 0L,
"SWAP_IN should be reset to 0");

// ENABLE duration should be > 0
long enableDuration = monitor.getInstanceOperationDurationEnable();
Expand Down