From bec05fab535229e712ee92d13a2c7c6ddd8fe650 Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Wed, 22 Apr 2026 09:51:35 +0200 Subject: [PATCH] hold incident resolution timer while services are still failing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The resolution countdown now only starts once every failing service has recovered. While any service is still failing, the timer is cancelled, so the incident cannot auto-resolve and spawn a new incident for the same ongoing failure between alerts. High latency is a warning signal (no explicit recovery event) and therefore does not block resolution — only FAILING services do. --- .../incident/IncidentManager.java | 16 ++++++++++++-- .../incident/IncidentManagerTest.java | 21 +++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java index 1e52c37634..c25abfc081 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java @@ -94,7 +94,14 @@ public class IncidentManager { } finally { if (activeIncidentThreadId != null) { lastAlertTime = Instant.now(); - resetResolutionTimer(); + // High latency is a warning only — it has no explicit recovery signal + // (HighLatencyNotification fires only when something is above threshold), + // so resolution hinges on failing services alone. 
+ if (failingServices.isEmpty()) { + resetResolutionTimer(); + } else { + cancelResolutionTimer(); + } } } } @@ -172,10 +179,15 @@ public class IncidentManager { } private void resetResolutionTimer() { + cancelResolutionTimer(); + resolutionTask = scheduler.schedule(this::resolveIncident, resolutionTimeoutSeconds, TimeUnit.SECONDS); + } + + private void cancelResolutionTimer() { if (resolutionTask != null) { resolutionTask.cancel(false); + resolutionTask = null; } - resolutionTask = scheduler.schedule(this::resolveIncident, resolutionTimeoutSeconds, TimeUnit.SECONDS); } private void startDurationUpdater() { diff --git a/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java b/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java index afcd20adf3..b7051db534 100644 --- a/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java +++ b/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java @@ -135,6 +135,27 @@ class IncidentManagerTest { assertThat(transport.updates).isEmpty(); } + @Test + void doesNotAutoResolveWhileServicesAreStillFailing() throws Exception { + manager.shutdown(); + transport = new RecordingTransport(); + manager = new IncidentManager(transport, 1L, "tbqa", false); + + manager.sendAlert("CoAP failure", List.of(AffectedService.failing("CoAP", 1))); + Thread.sleep(1500); + + assertThat(transport.updates) + .extracting(RecordingTransport.Message::text) + .noneMatch(t -> t.contains(":white_check_mark:")); + + manager.sendAlert("CoAP is OK", List.of(AffectedService.recovered("CoAP"))); + Thread.sleep(1500); + + assertThat(transport.updates) + .extracting(RecordingTransport.Message::text) + .anyMatch(t -> t.contains(":white_check_mark:")); + } + private static class RecordingTransport implements IncidentTransport { private final AtomicInteger threadCounter = new AtomicInteger(); final 
java.util.List incidents = new java.util.ArrayList<>();