From df4dc250829e13b0442bc7b2ffd58a23e3573f47 Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Thu, 23 Apr 2026 10:20:20 +0200 Subject: [PATCH] emit recovery signal for MonitoredServiceKey.GENERAL and route EDQS I/O errors correctly Two related fixes for a stuck-open 'Monitoring' entry in the incident header: 1. serviceIsOk(GENERAL) is now called when a monitoring cycle completes successfully. Previously GENERAL could only accumulate failures (via the outer Throwable catch), with no complementary recovery, so once the catch-all fired the service stayed red forever. 2. checkEdqs() is now wrapped in its own try/catch that reports any non-ServiceFailureException failures under EDQS rather than GENERAL. Connection/read timeouts hitting /api/entitiesQuery/find previously propagated unwrapped and were bucketed as GENERAL, which hid the fact that EDQS was the failing component. --- .../service/BaseMonitoringService.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java b/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java index 3f1c624200..9c8c8fc533 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java @@ -155,13 +155,22 @@ public abstract class BaseMonitoringService, T ext } if (checkEdqs) { - stopWatch.start(); - checkEdqs(); - reporter.reportLatency(Latencies.EDQS_QUERY, stopWatch.getTime()); - reporter.serviceIsOk(MonitoredServiceKey.EDQS); + try { + stopWatch.start(); + checkEdqs(); + reporter.reportLatency(Latencies.EDQS_QUERY, stopWatch.getTime()); + reporter.serviceIsOk(MonitoredServiceKey.EDQS); + } catch (ServiceFailureException e) { + reporter.serviceFailure(e.getServiceKey(), e); + return; + } catch (Exception e) { + 
reporter.serviceFailure(MonitoredServiceKey.EDQS, e); + return; + } } reporter.reportLatencies(); + reporter.serviceIsOk(MonitoredServiceKey.GENERAL); log.debug("Finished {}", getName()); } catch (ServiceFailureException e) { reporter.serviceFailure(e.getServiceKey(), e);