Browse Source

hold incident resolution timer while services are still failing

The resolution countdown now starts only once every failing service has
recovered. While any service is still firing, the timer is cancelled, so
the incident cannot auto-resolve and spawn a new incident for the same
ongoing failure between alerts.

High latency is a warning signal (no explicit recovery event) and
therefore does not block resolution — only FAILING services do.
pull/15456/head
Oleksii Kuripko 1 month ago
parent
commit
bec05fab53
  1. 16
      monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java
  2. 21
      monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java

16
monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java

@ -94,7 +94,14 @@ public class IncidentManager {
} finally {
if (activeIncidentThreadId != null) {
lastAlertTime = Instant.now();
resetResolutionTimer();
// High latency is a warning only — it has no explicit recovery signal
// (HighLatencyNotification fires only when something is above threshold),
// so resolution hinges on failing services alone.
if (failingServices.isEmpty()) {
resetResolutionTimer();
} else {
cancelResolutionTimer();
}
}
}
}
@ -172,10 +179,15 @@ public class IncidentManager {
}
/**
 * Restarts the incident resolution countdown.
 *
 * Calls {@code cancelResolutionTimer()} first, then asks the scheduler to run
 * {@code resolveIncident()} after {@code resolutionTimeoutSeconds} seconds and
 * stores the returned handle in {@code resolutionTask}.
 */
private void resetResolutionTimer() {
cancelResolutionTimer();
resolutionTask = scheduler.schedule(this::resolveIncident, resolutionTimeoutSeconds, TimeUnit.SECONDS);
}
/**
 * Cancels the pending resolution task, if there is one, and clears the handle.
 *
 * This method must not schedule anything itself: callers that want a fresh
 * countdown schedule it after calling this method, and callers that want the
 * countdown held (services still failing) rely on nothing being scheduled here.
 */
private void cancelResolutionTimer() {
    if (resolutionTask != null) {
        // NOTE(review): resolutionTask is presumably a ScheduledFuture; cancel(false)
        // would then leave an already-running resolution uninterrupted — confirm.
        resolutionTask.cancel(false);
        resolutionTask = null;
    }
}
private void startDurationUpdater() {

21
monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java

@ -135,6 +135,27 @@ class IncidentManagerTest {
assertThat(transport.updates).isEmpty();
}
/**
 * Checks that no update containing ":white_check_mark:" is recorded while a
 * service is still reported as failing, and that one is recorded after the
 * service is reported as recovered.
 */
@Test
void doesNotAutoResolveWhileServicesAreStillFailing() throws Exception {
// Replace the shared manager with a fresh one on a new recording transport.
// NOTE(review): the 1L argument is presumably the resolution timeout in seconds — confirm against the constructor.
manager.shutdown();
transport = new RecordingTransport();
manager = new IncidentManager(transport, 1L, "tbqa", false);
// Report CoAP as failing, then wait 1.5 s before checking the recorded updates.
manager.sendAlert("CoAP failure", List.of(AffectedService.failing("CoAP", 1)));
Thread.sleep(1500);
// No recorded update may contain the check-mark text while CoAP is still failing.
// NOTE(review): ":white_check_mark:" is presumably the marker of a resolution message — confirm.
assertThat(transport.updates)
.extracting(RecordingTransport.Message::text)
.noneMatch(t -> t.contains(":white_check_mark:"));
// Report CoAP as recovered, wait another 1.5 s, and expect an update with the check-mark text.
manager.sendAlert("CoAP is OK", List.of(AffectedService.recovered("CoAP")));
Thread.sleep(1500);
assertThat(transport.updates)
.extracting(RecordingTransport.Message::text)
.anyMatch(t -> t.contains(":white_check_mark:"));
}
private static class RecordingTransport implements IncidentTransport {
private final AtomicInteger threadCounter = new AtomicInteger();
final java.util.List<String> incidents = new java.util.ArrayList<>();

Loading…
Cancel
Save