From c66da7e18d5fdb2710d93fa15ec148e8de88c73a Mon Sep 17 00:00:00 2001 From: dashevchenko Date: Mon, 20 Apr 2026 10:24:43 +0300 Subject: [PATCH 01/13] edqs: fixed entity filtering by string data points --- .../server/edqs/data/dp/StringDataPoint.java | 5 ++ .../edqs/repo/DeviceTypeFilterTest.java | 57 ++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/common/edqs/src/main/java/org/thingsboard/server/edqs/data/dp/StringDataPoint.java b/common/edqs/src/main/java/org/thingsboard/server/edqs/data/dp/StringDataPoint.java index 8ae3499d63..2d9e77633d 100644 --- a/common/edqs/src/main/java/org/thingsboard/server/edqs/data/dp/StringDataPoint.java +++ b/common/edqs/src/main/java/org/thingsboard/server/edqs/data/dp/StringDataPoint.java @@ -33,6 +33,11 @@ public class StringDataPoint extends AbstractDataPoint { this.value = deduplicate ? TbStringPool.intern(value) : value; } + @Override + public boolean getBool() { + return Boolean.parseBoolean(value); + } + @Override public double getDouble() { return Double.parseDouble(value); diff --git a/edqs/src/test/java/org/thingsboard/server/edqs/repo/DeviceTypeFilterTest.java b/edqs/src/test/java/org/thingsboard/server/edqs/repo/DeviceTypeFilterTest.java index 5d4905f0ef..11423d7484 100644 --- a/edqs/src/test/java/org/thingsboard/server/edqs/repo/DeviceTypeFilterTest.java +++ b/edqs/src/test/java/org/thingsboard/server/edqs/repo/DeviceTypeFilterTest.java @@ -19,15 +19,20 @@ import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.thingsboard.server.common.data.AttributeScope; import org.thingsboard.server.common.data.Device; import org.thingsboard.server.common.data.DeviceProfile; import org.thingsboard.server.common.data.DeviceProfileType; import org.thingsboard.server.common.data.EntityType; +import org.thingsboard.server.common.data.edqs.AttributeKv; import org.thingsboard.server.common.data.edqs.LatestTsKv; import org.thingsboard.server.common.data.id.DeviceId; import org.thingsboard.server.common.data.id.DeviceProfileId; +import org.thingsboard.server.common.data.kv.BaseAttributeKvEntry; import org.thingsboard.server.common.data.kv.BasicTsKvEntry; +import org.thingsboard.server.common.data.kv.BooleanDataEntry; import org.thingsboard.server.common.data.kv.StringDataEntry; +import org.thingsboard.server.common.data.query.BooleanFilterPredicate; import org.thingsboard.server.common.data.query.DeviceTypeFilter; import org.thingsboard.server.common.data.query.EntityDataPageLink; import org.thingsboard.server.common.data.query.EntityDataQuery; @@ -39,8 +44,10 @@ import org.thingsboard.server.common.data.query.FilterPredicateValue; import org.thingsboard.server.common.data.query.KeyFilter; import org.thingsboard.server.common.data.query.StringFilterPredicate; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.List; import java.util.UUID; public class DeviceTypeFilterTest extends AbstractEDQTest { @@ -119,7 +126,50 @@ public class DeviceTypeFilterTest extends AbstractEDQTest { Assert.assertEquals("42", first.getLatest().get(EntityKeyType.ENTITY_FIELD).get("createdTime").getValue()); } + @Test + public void testFindDeviceByBooleanAttributeWithMixedTypes() { + DeviceId device1Id = createLoraDevice("LoRa-1"); + DeviceId device2Id = createLoraDevice("LoRa-2"); + DeviceId device3Id = createLoraDevice("LoRa-3"); + + long ts = System.currentTimeMillis(); + addOrUpdate(new AttributeKv(device1Id, AttributeScope.SERVER_SCOPE, + new BaseAttributeKvEntry(new BooleanDataEntry("active", true), ts), 1L)); + addOrUpdate(new AttributeKv(device2Id, AttributeScope.SERVER_SCOPE, + new BaseAttributeKvEntry(new BooleanDataEntry("active", false), ts), 1L)); + addOrUpdate(new AttributeKv(device3Id, AttributeScope.SERVER_SCOPE, + new BaseAttributeKvEntry(new StringDataEntry("active", "true"), ts), 1L)); + + KeyFilter activeFilter = new KeyFilter(); + activeFilter.setKey(new EntityKey(EntityKeyType.SERVER_ATTRIBUTE, "active")); + activeFilter.setValueType(EntityKeyValueType.BOOLEAN); + BooleanFilterPredicate predicate = new BooleanFilterPredicate(); + predicate.setOperation(BooleanFilterPredicate.BooleanOperation.EQUAL); + predicate.setValue(FilterPredicateValue.fromBoolean(true)); + activeFilter.setPredicate(predicate); + + var result = repository.countEntitiesByQuery(tenantId, null, + getDeviceTypeQuery("LoRa", List.of(activeFilter)), false); + Assert.assertEquals(2, result); + } + + private DeviceId createLoraDevice(String name) { + DeviceId deviceId = new DeviceId(UUID.randomUUID()); + Device device = new Device(); + device.setId(deviceId); + device.setTenantId(tenantId); + device.setDeviceProfileId(loraProfileId); + device.setName(name); + device.setCreatedTime(42L); + addOrUpdate(EntityType.DEVICE, device); + return deviceId; + } + private static EntityDataQuery getDeviceTypeQuery(String deviceType) { + return getDeviceTypeQuery(deviceType, null); + } + + private static EntityDataQuery getDeviceTypeQuery(String deviceType, List extraFilters) { DeviceTypeFilter filter = new DeviceTypeFilter(); filter.setDeviceTypes(Collections.singletonList(deviceType)); var pageLink = new EntityDataPageLink(20, 0, null, new EntityDataSortOrder(new EntityKey(EntityKeyType.TIME_SERIES, "state"), EntityDataSortOrder.Direction.DESC), false); @@ -135,7 +185,12 @@ public class DeviceTypeFilterTest extends AbstractEDQTest { nameFilter.setPredicate(predicate); nameFilter.setValueType(EntityKeyValueType.STRING); - return new EntityDataQuery(filter, pageLink, entityFields, latestValues, Arrays.asList(nameFilter)); + List keyFilters = new ArrayList<>(); + keyFilters.add(nameFilter); + if (extraFilters != null) { + keyFilters.addAll(extraFilters); + } + return new EntityDataQuery(filter, pageLink, entityFields, latestValues, keyFilters); } } From 28de53425b7d83b7b26d5397837524836c173987 Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Tue, 21 Apr 2026 09:28:19 +0200 Subject: [PATCH 02/13] Incident feature for tb-monitoring Adds Slack-API-based incident grouping for the monitoring microservice: alerts that fire within resolution_timeout_s are threaded under a single "Incident" message whose header tracks affected services in real time, and the incident auto-resolves after a quiet period with a final summary. Highlights - Dual Slack modes: when bot_token + channel_id are set, alerts go through chat.postMessage / chat.update with threaded replies; otherwise the existing webhook path is used unchanged. - Incident header shows :red_circle: failing / :large_yellow_circle: high latency / :large_green_circle: recovered services with live failure counts and elapsed duration (updated every minute). - Notifications carry structured affected-service data (AffectedService { name, status, failureCount }) so the incident layer no longer parses formatted alert text with regex. - IncidentManager is decoupled from Slack via a small IncidentTransport interface; SlackIncidentTransport adapts SlackApiClient. - PE/other service-key types can plug in a friendly name via the ShortNameProvider interface; TransportInfo implements it. - Config lives under monitoring.notifications.incident.* (enabled, resolution_timeout_s, tag_channel). Slack bot config stays under monitoring.notifications.slack.{bot_token,channel_id}. YAML defaults are authoritative; Spring @Value no longer carries a conflicting fallback. - Concurrency: state and transport I/O run under the manager's monitor. Slack client has explicit 5s call timeouts (set on SlackConfig) so the hold time is bounded. Slack client is closed on PreDestroy. - HTTP failure text is sanitised: HTML response bodies are stripped so Nginx-style error pages don't flood alerts. - BaseMonitoringService splits login / WS connect / WS subscribe into distinct MonitoredServiceKey entries, uses catch(Exception) instead of catch(Throwable), and wraps WsClient in try-with-resources. - Unit tests cover incident lifecycle, status transitions, duration formatting, HTML body stripping, and the ShortNameProvider dispatch. --- monitoring/pom.xml | 9 + .../config/transport/TransportInfo.java | 10 +- .../monitoring/data/MonitoredServiceKey.java | 3 + .../data/notification/AffectedService.java | 34 +++ .../notification/HighLatencyNotification.java | 9 + .../data/notification/InfoNotification.java | 5 + .../data/notification/Notification.java | 10 + .../ServiceFailureNotification.java | 38 +++ .../ServiceRecoveryNotification.java | 7 + .../data/notification/ShortNameProvider.java | 22 ++ .../notification/NotificationService.java | 2 +- .../channels/NotificationChannel.java | 4 +- .../channels/impl/SlackApiClient.java | 125 ++++++++ .../channels/impl/SlackIncidentTransport.java | 45 +++ .../impl/SlackNotificationChannel.java | 69 ++++- .../incident/IncidentManager.java | 279 ++++++++++++++++++ .../incident/IncidentTransport.java | 26 ++ .../monitoring/service/BaseHealthChecker.java | 3 +- .../service/BaseMonitoringService.java | 37 ++- .../service/MonitoringReporter.java | 5 +- .../src/main/resources/tb-monitoring.yml | 13 + .../ServiceFailureNotificationTest.java | 70 +++++ .../incident/IncidentManagerTest.java | 162 ++++++++++ 23 files changed, 965 insertions(+), 22 deletions(-) create mode 100644 monitoring/src/main/java/org/thingsboard/monitoring/data/notification/AffectedService.java create mode 100644 monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ShortNameProvider.java create mode 100644 monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackApiClient.java create mode 100644 monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackIncidentTransport.java create mode 100644 monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java create mode 100644 monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentTransport.java create mode 100644 monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java create mode 100644 monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java diff --git a/monitoring/pom.xml b/monitoring/pom.xml index 5bc1fb5baa..571586b8c3 100644 --- a/monitoring/pom.xml +++ b/monitoring/pom.xml @@ -79,6 +79,10 @@ org.apache.httpcomponents httpclient + + com.slack.api + slack-api-client + org.eclipse.leshan leshan-client-cf @@ -118,6 +122,11 @@ ch.qos.logback logback-classic + + org.springframework.boot + spring-boot-starter-test + test + diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/config/transport/TransportInfo.java b/monitoring/src/main/java/org/thingsboard/monitoring/config/transport/TransportInfo.java index 251b5e1d85..cc9801dcad 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/config/transport/TransportInfo.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/config/transport/TransportInfo.java @@ -16,13 +16,21 @@ package org.thingsboard.monitoring.config.transport; import lombok.Data; +import org.thingsboard.monitoring.data.notification.ShortNameProvider; @Data -public class TransportInfo { +public class TransportInfo implements ShortNameProvider { private final TransportType type; private final TransportMonitoringTarget target; + public String getShortName() { + if (target.getQueue().equals("Main")) { + return type.getName(); + } + return type.getName() + " " + target.getQueue(); + } + @Override public String toString() { if (target.getQueue().equals("Main")) { diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/MonitoredServiceKey.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/MonitoredServiceKey.java index 28fa6a9e18..c566e8f896 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/data/MonitoredServiceKey.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/MonitoredServiceKey.java @@ -18,6 +18,9 @@ package org.thingsboard.monitoring.data; public class MonitoredServiceKey { public static final String GENERAL = "Monitoring"; + public static final String LOGIN = "Login"; + public static final String WS_CONNECT = "WS Connect"; + public static final String WS_SUBSCRIBE = "WS Subscribe"; public static final String EDQS = "*EDQS*"; } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/AffectedService.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/AffectedService.java new file mode 100644 index 0000000000..b30a398223 --- /dev/null +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/AffectedService.java @@ -0,0 +1,34 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.monitoring.data.notification; + +public record AffectedService(String name, Status status, int failureCount) { + + public enum Status { FAILING, RECOVERED, HIGH_LATENCY } + + public static AffectedService failing(String name, int failureCount) { + return new AffectedService(name, Status.FAILING, failureCount); + } + + public static AffectedService recovered(String name) { + return new AffectedService(name, Status.RECOVERED, 0); + } + + public static AffectedService highLatency(String name) { + return new AffectedService(name, Status.HIGH_LATENCY, 0); + } + +} diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/HighLatencyNotification.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/HighLatencyNotification.java index 9a319eb6c9..3e739ec4dc 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/HighLatencyNotification.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/HighLatencyNotification.java @@ -18,6 +18,8 @@ package org.thingsboard.monitoring.data.notification; import org.thingsboard.monitoring.data.Latency; import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; public class HighLatencyNotification implements Notification { @@ -39,4 +41,11 @@ public class HighLatencyNotification implements Notification { return text.toString(); } + @Override + public List getAffectedServices() { + return highLatencies.stream() + .map(latency -> AffectedService.highLatency(latency.getKey())) + .collect(Collectors.toList()); + } + } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/InfoNotification.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/InfoNotification.java index b2b73414de..6906e8c732 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/InfoNotification.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/InfoNotification.java @@ -24,4 +24,9 @@ public class InfoNotification implements Notification { public String getText() { return message; } + + @Override + public boolean isIncident() { + return false; + } } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/Notification.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/Notification.java index 33ed7f8328..7734fc62ee 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/Notification.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/Notification.java @@ -15,8 +15,18 @@ */ package org.thingsboard.monitoring.data.notification; +import java.util.List; + public interface Notification { String getText(); + default boolean isIncident() { + return true; + } + + default List getAffectedServices() { + return List.of(); + } + } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java index dd93f232a6..47daf043ab 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java @@ -18,6 +18,8 @@ package org.thingsboard.monitoring.data.notification; import lombok.Getter; import org.apache.commons.lang3.exception.ExceptionUtils; +import java.util.List; + @Getter public class ServiceFailureNotification implements Notification { @@ -43,7 +45,43 @@ public class ServiceFailureNotification implements Notification { if (errorMsg == null) { errorMsg = error.getClass().getSimpleName(); } + errorMsg = stripResponseBody(errorMsg); return String.format("%s - Failure: %s (number of subsequent failures: %s)", serviceKey, errorMsg, failuresCount); } + static String stripResponseBody(String msg) { + if (msg == null) { + return null; + } + int htmlIdx = -1; + for (String marker : new String[]{"= 0 && (htmlIdx < 0 || idx < htmlIdx)) { + htmlIdx = idx; + } + } + if (htmlIdx > 0) { + msg = msg.substring(0, htmlIdx).stripTrailing(); + if (msg.endsWith("\"")) { + msg = msg.substring(0, msg.length() - 1).stripTrailing(); + } + if (msg.endsWith(":")) { + msg = msg.substring(0, msg.length() - 1).stripTrailing(); + } + } + return msg; + } + + @Override + public List getAffectedServices() { + return List.of(AffectedService.failing(shortName(serviceKey), failuresCount)); + } + + static String shortName(Object serviceKey) { + if (serviceKey instanceof ShortNameProvider provider) { + return provider.getShortName(); + } + return serviceKey.toString(); + } + } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceRecoveryNotification.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceRecoveryNotification.java index b6e0c7c695..31b32422a1 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceRecoveryNotification.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceRecoveryNotification.java @@ -15,6 +15,8 @@ */ package org.thingsboard.monitoring.data.notification; +import java.util.List; + public class ServiceRecoveryNotification implements Notification { private final Object serviceKey; @@ -28,4 +30,9 @@ public class ServiceRecoveryNotification implements Notification { return String.format("%s is OK", serviceKey); } + @Override + public List getAffectedServices() { + return List.of(AffectedService.recovered(ServiceFailureNotification.shortName(serviceKey))); + } + } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ShortNameProvider.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ShortNameProvider.java new file mode 100644 index 0000000000..96c63023af --- /dev/null +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ShortNameProvider.java @@ -0,0 +1,22 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.monitoring.data.notification; + +public interface ShortNameProvider { + + String getShortName(); + +} diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/NotificationService.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/NotificationService.java index e813b3205d..091a331438 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/notification/NotificationService.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/NotificationService.java @@ -51,7 +51,7 @@ public class NotificationService { return notificationChannels.stream().map(notificationChannel -> notificationExecutor.submit(() -> { try { - notificationChannel.sendNotification(message); + notificationChannel.sendNotification(message, notification); } catch (Exception e) { log.error("Failed to send notification to {}", notificationChannel.getClass().getSimpleName(), e); } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/NotificationChannel.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/NotificationChannel.java index 126942cb3f..614fddda31 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/NotificationChannel.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/NotificationChannel.java @@ -15,8 +15,10 @@ */ package org.thingsboard.monitoring.notification.channels; +import org.thingsboard.monitoring.data.notification.Notification; + public interface NotificationChannel { - void sendNotification(String message); + void sendNotification(String message, Notification notification); } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackApiClient.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackApiClient.java new file mode 100644 index 0000000000..4ad74d7811 --- /dev/null +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackApiClient.java @@ -0,0 +1,125 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.monitoring.notification.channels.impl; + +import com.slack.api.Slack; +import com.slack.api.SlackConfig; +import com.slack.api.methods.MethodsClient; +import com.slack.api.methods.SlackApiTextResponse; +import com.slack.api.methods.request.chat.ChatPostMessageRequest; +import com.slack.api.methods.request.chat.ChatUpdateRequest; +import com.slack.api.methods.response.chat.ChatPostMessageResponse; +import com.slack.api.methods.response.chat.ChatUpdateResponse; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class SlackApiClient { + + private static final int DEFAULT_CALL_TIMEOUT_MS = 5000; + + private final Slack slack; + private final String botToken; + + public SlackApiClient(String botToken) { + this(botToken, DEFAULT_CALL_TIMEOUT_MS); + } + + public SlackApiClient(String botToken, int callTimeoutMs) { + this.botToken = botToken; + SlackConfig config = new SlackConfig(); + config.setHttpClientCallTimeoutMillis(callTimeoutMs); + config.setHttpClientReadTimeoutMillis(callTimeoutMs); + config.setHttpClientWriteTimeoutMillis(callTimeoutMs); + this.slack = Slack.getInstance(config); + } + + public String postMessage(String channelId, String text) { + ChatPostMessageRequest request = ChatPostMessageRequest.builder() + .channel(channelId) + .text(text) + .build(); + ChatPostMessageResponse response = sendRequest(request); + return response.getTs(); + } + + public String postThreadReply(String channelId, String threadTs, String text) { + ChatPostMessageRequest request = ChatPostMessageRequest.builder() + .channel(channelId) + .text(text) + .threadTs(threadTs) + .build(); + ChatPostMessageResponse response = sendRequest(request); + return response.getTs(); + } + + public void close() { + try { + slack.close(); + } catch (Exception e) { + log.warn("Failed to close Slack client", e); + } + } + + public void updateMessage(String channelId, String ts, String text) { + ChatUpdateRequest request = ChatUpdateRequest.builder() + .channel(channelId) + .ts(ts) + .text(text) + .build(); + MethodsClient client = slack.methods(botToken); + ChatUpdateResponse response; + try { + response = client.chatUpdate(request); + } catch (Exception e) { + throw new RuntimeException("Failed to update Slack message: " + e.getMessage(), e); + } + checkResponse(response); + } + + private ChatPostMessageResponse sendRequest(ChatPostMessageRequest request) { + MethodsClient client = slack.methods(botToken); + ChatPostMessageResponse response; + try { + response = client.chatPostMessage(request); + } catch (Exception e) { + throw new RuntimeException("Failed to send Slack message: " + e.getMessage(), e); + } + checkResponse(response); + return response; + } + + private void checkResponse(SlackApiTextResponse response) { + if (response.isOk()) { + return; + } + String error = response.getError(); + if (error != null) { + switch (error) { + case "missing_scope" -> { + String neededScope = response.getNeeded(); + error = "bot token scope '" + neededScope + "' is needed"; + } + case "not_in_channel" -> error = "app needs to be added to the channel"; + } + } else if (response.getWarning() != null) { + error = "warning: " + response.getWarning(); + } else { + error = "unknown error"; + } + throw new RuntimeException("Slack API error: " + error); + } + +} diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackIncidentTransport.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackIncidentTransport.java new file mode 100644 index 0000000000..8812714d62 --- /dev/null +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackIncidentTransport.java @@ -0,0 +1,45 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.monitoring.notification.channels.impl; + +import org.thingsboard.monitoring.notification.incident.IncidentTransport; + +public class SlackIncidentTransport implements IncidentTransport { + + private final SlackApiClient slackApiClient; + private final String channelId; + + public SlackIncidentTransport(SlackApiClient slackApiClient, String channelId) { + this.slackApiClient = slackApiClient; + this.channelId = channelId; + } + + @Override + public String postIncident(String text) { + return slackApiClient.postMessage(channelId, text); + } + + @Override + public void postThreadReply(String threadId, String text) { + slackApiClient.postThreadReply(channelId, threadId, text); + } + + @Override + public void updateIncident(String threadId, String text) { + slackApiClient.updateMessage(channelId, threadId, text); + } + +} diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackNotificationChannel.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackNotificationChannel.java index c1d25c43c0..06990c5a51 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackNotificationChannel.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/channels/impl/SlackNotificationChannel.java @@ -16,13 +16,16 @@ package org.thingsboard.monitoring.notification.channels.impl; import jakarta.annotation.PostConstruct; +import jakarta.annotation.PreDestroy; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.boot.web.client.RestTemplateBuilder; import org.springframework.stereotype.Component; import org.springframework.web.client.RestTemplate; +import org.thingsboard.monitoring.data.notification.Notification; import org.thingsboard.monitoring.notification.channels.NotificationChannel; +import org.thingsboard.monitoring.notification.incident.IncidentManager; import java.time.Duration; import java.util.Map; @@ -35,19 +38,73 @@ public class SlackNotificationChannel implements NotificationChannel { @Value("${monitoring.notifications.slack.webhook_url}") private String webhookUrl; + @Value("${monitoring.notifications.slack.bot_token:}") + private String botToken; + + @Value("${monitoring.notifications.slack.channel_id:}") + private String channelId; + + @Value("${monitoring.notifications.incident.enabled:}") + private boolean incidentEnabled; + + @Value("${monitoring.notifications.incident.resolution_timeout_s:}") + private long resolutionTimeoutSeconds; + + @Value("${monitoring.notifications.incident.tag_channel:}") + private boolean tagChannel; + + @Value("${monitoring.notifications.message_prefix:}") + private String messagePrefix; + private RestTemplate restTemplate; + private SlackApiClient slackApiClient; + private IncidentManager incidentManager; @PostConstruct private void init() { - restTemplate = new RestTemplateBuilder() - .setConnectTimeout(Duration.ofSeconds(5)) - .setReadTimeout(Duration.ofSeconds(2)) - .build(); + boolean hasBotConfig = botToken != null && !botToken.isEmpty() && channelId != null && !channelId.isEmpty(); + if (hasBotConfig) { + slackApiClient = new SlackApiClient(botToken); + log.info("Slack API mode enabled (channel: {})", channelId); + if (incidentEnabled) { + incidentManager = new IncidentManager(new SlackIncidentTransport(slackApiClient, channelId), + resolutionTimeoutSeconds, messagePrefix, tagChannel); + log.info("Incident grouping enabled via Slack (resolution timeout: {}s)", resolutionTimeoutSeconds); + } + } else { + if (incidentEnabled) { + log.warn("Incident grouping is enabled but Slack bot_token/channel_id are not set; " + + "falling back to plain webhook mode without incident support"); + } + restTemplate = new RestTemplateBuilder() + .setConnectTimeout(Duration.ofSeconds(5)) + .setReadTimeout(Duration.ofSeconds(2)) + .build(); + log.info("Slack webhook mode enabled"); + } } @Override - public void sendNotification(String message) { - restTemplate.postForObject(webhookUrl, Map.of("text", message), String.class); + public void sendNotification(String message, Notification notification) { + if (incidentManager != null && notification.isIncident()) { + // Pass the raw notification text: IncidentManager already puts the prefix into the + // incident header, so pre-prefixing the thread reply would double it up. + incidentManager.sendAlert(notification.getText(), notification.getAffectedServices()); + } else if (slackApiClient != null) { + slackApiClient.postMessage(channelId, message); + } else { + restTemplate.postForObject(webhookUrl, Map.of("text", message), String.class); + } + } + + @PreDestroy + private void destroy() { + if (incidentManager != null) { + incidentManager.shutdown(); + } + if (slackApiClient != null) { + slackApiClient.close(); + } } } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java new file mode 100644 index 0000000000..133d665f4c --- /dev/null +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java @@ -0,0 +1,279 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.monitoring.notification.incident; + +import lombok.extern.slf4j.Slf4j; +import org.thingsboard.monitoring.data.notification.AffectedService; + +import java.time.Duration; +import java.time.Instant; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; + +/** + * Thread-safety: all public entry points and scheduled callbacks are {@code synchronized} on the + * manager instance. Transport I/O is performed while the monitor is held. This is safe under the + * assumptions that (a) the transport enforces a short per-call timeout, and (b) notification + * producers are single-threaded; see the Slack client (default 5s) for the Slack-based transport. + */ +@Slf4j +public class IncidentManager { + + private final IncidentTransport transport; + private final long resolutionTimeoutSeconds; + private final String messagePrefix; + private final boolean tagChannel; + private final ScheduledExecutorService scheduler; + + private String activeIncidentThreadId; + private ScheduledFuture resolutionTask; + private ScheduledFuture durationUpdateTask; + private Instant incidentStartTime; + private Instant lastAlertTime; + private final Map failingServices = new LinkedHashMap<>(); + private final Set recoveredServices = new LinkedHashSet<>(); + private final Set highLatencyServices = new LinkedHashSet<>(); + + public IncidentManager(IncidentTransport transport, long resolutionTimeoutSeconds, + String messagePrefix, boolean tagChannel) { + this.transport = transport; + this.resolutionTimeoutSeconds = resolutionTimeoutSeconds; + this.messagePrefix = messagePrefix; + this.tagChannel = tagChannel; + this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> { + Thread t = new Thread(r, "incident-manager"); + t.setDaemon(true); + return t; + }); + } + + public synchronized void sendAlert(String message, List affectedServices) { + try { + if (activeIncidentThreadId == null) { + if (affectedServices.stream().allMatch(s -> s.status() == AffectedService.Status.RECOVERED)) { + return; + } + incidentStartTime = Instant.now(); + failingServices.clear(); + recoveredServices.clear(); + highLatencyServices.clear(); + applyAffectedServices(affectedServices); + activeIncidentThreadId = transport.postIncident(buildOngoingMessageText()); + startDurationUpdater(); + log.info("New incident created, thread id: {}", activeIncidentThreadId); + } else if (applyAffectedServices(affectedServices)) { + safeUpdateHeader(); + } + + try { + transport.postThreadReply(activeIncidentThreadId, message); + log.debug("Alert added to incident thread {}", activeIncidentThreadId); + } catch (Exception e) { + log.error("Failed to post alert to incident thread {}", activeIncidentThreadId, e); + } + } finally { + if (activeIncidentThreadId != null) { + lastAlertTime = Instant.now(); + resetResolutionTimer(); + } + } + } + + private boolean applyAffectedServices(List affectedServices) { + boolean changed = false; + Set latencySnapshot = null; + for (AffectedService service : affectedServices) { + String name = service.name(); + switch (service.status()) { + case FAILING -> { + Integer prev = failingServices.put(name, service.failureCount()); + if (prev == null || prev.intValue() != service.failureCount()) { + changed = true; + } + if (recoveredServices.remove(name)) { + changed = true; + } + } + case RECOVERED -> { + if (failingServices.remove(name) != null) { + recoveredServices.add(name); + changed = true; + } + } + case HIGH_LATENCY -> { + if (latencySnapshot == null) { + latencySnapshot = new LinkedHashSet<>(); + } + latencySnapshot.add(name); + } + } + } + // HighLatencyNotification carries the full current set of high latencies, so treat it as a + // snapshot: replace highLatencyServices entirely. Without this, a brief spike would stay + // yellow in the header until the incident resolves. + if (latencySnapshot != null && !latencySnapshot.equals(highLatencyServices)) { + highLatencyServices.clear(); + highLatencyServices.addAll(latencySnapshot); + changed = true; + } + return changed; + } + + private String buildOngoingMessageText() { + StringBuilder sb = new StringBuilder(); + if (tagChannel) { + sb.append(" "); + } + if (messagePrefix != null && !messagePrefix.isEmpty()) { + sb.append("*").append(messagePrefix).append("*"); + } + sb.append(" :rotating_light:"); + Duration elapsed = Duration.between(incidentStartTime, Instant.now()); + if (elapsed.toMinutes() >= 1) { + sb.append(" (").append(formatDuration(elapsed)).append(")"); + } + if (hasAffected()) { + sb.append(" | ").append(formatAffectedServices()); + } + return sb.toString(); + } + + private boolean hasAffected() { + return !failingServices.isEmpty() || !recoveredServices.isEmpty() || !highLatencyServices.isEmpty(); + } + + private void safeUpdateHeader() { + try { + transport.updateIncident(activeIncidentThreadId, buildOngoingMessageText()); + } catch (Exception e) { + log.error("Failed to update incident message", e); + } + } + + private void resetResolutionTimer() { + if (resolutionTask != null) { + resolutionTask.cancel(false); + } + resolutionTask = scheduler.schedule(this::resolveIncident, resolutionTimeoutSeconds, TimeUnit.SECONDS); + } + + private void startDurationUpdater() { + if (durationUpdateTask != null) { + durationUpdateTask.cancel(false); + } + durationUpdateTask = scheduler.scheduleAtFixedRate(this::updateDuration, 60, 60, TimeUnit.SECONDS); + } + + private synchronized void updateDuration() { + if (activeIncidentThreadId == null) { + return; + } + safeUpdateHeader(); + } + + private void stopDurationUpdater() { + if (durationUpdateTask != null) { + durationUpdateTask.cancel(false); + durationUpdateTask = null; + } + } + + static String formatDuration(Duration duration) { + long totalMinutes = duration.toMinutes(); + if (totalMinutes < 60) { + return totalMinutes + "m"; + } + long hours = totalMinutes / 60; + long minutes = totalMinutes % 60; + return minutes > 0 ? hours + "h" + minutes + "m" : hours + "h"; + } + + synchronized void resolveIncident() { + if (activeIncidentThreadId == null) { + return; + } + String threadId = activeIncidentThreadId; + stopDurationUpdater(); + String resolutionMessage = buildResolutionMessage(); + activeIncidentThreadId = null; + resolutionTask = null; + failingServices.clear(); + recoveredServices.clear(); + highLatencyServices.clear(); + try { + transport.updateIncident(threadId, resolutionMessage); + log.info("Incident resolved (thread was {})", threadId); + } catch (Exception e) { + log.error("Failed to send incident resolution message", e); + } + } + + private String buildResolutionMessage() { + Duration totalDuration = lastAlertTime != null + ? Duration.between(incidentStartTime, lastAlertTime) + : Duration.between(incidentStartTime, Instant.now()); + StringBuilder sb = new StringBuilder(); + if (messagePrefix != null && !messagePrefix.isEmpty()) { + sb.append("*").append(messagePrefix).append("*"); + } + sb.append(" :white_check_mark:"); + sb.append(" (").append(formatDuration(totalDuration)).append(")"); + if (hasAffected()) { + sb.append(" | ").append(formatAffectedServices()).append("\n"); + } + return sb.toString(); + } + + private String formatAffectedServices() { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Map.Entry entry : failingServices.entrySet()) { + if (!first) sb.append(", "); + sb.append(":red_circle: ").append(entry.getKey()).append(" (").append(entry.getValue()).append(")"); + first = false; + } + for (String name : highLatencyServices) { + if (!first) sb.append(", "); + sb.append(":large_yellow_circle: ").append(name); + first = false; + } + for (String name : recoveredServices) { + if (!first) sb.append(", "); + sb.append(":large_green_circle: ").append(name); + first = false; + } + return sb.toString(); + } + + public void shutdown() { + scheduler.shutdownNow(); + try { + if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) { + log.warn("Incident scheduler did not terminate in time"); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + +} diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentTransport.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentTransport.java new file mode 100644 index 0000000000..0d6b4cac2a --- /dev/null +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentTransport.java @@ -0,0 +1,26 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.monitoring.notification.incident; + +public interface IncidentTransport { + + String postIncident(String text); + + void postThreadReply(String threadId, String text); + + void updateIncident(String threadId, String text); + +} diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseHealthChecker.java b/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseHealthChecker.java index b73194469f..c0e85da1ac 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseHealthChecker.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseHealthChecker.java @@ -89,11 +89,10 @@ public abstract class BaseHealthChecker { diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java b/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java index ea154e49c0..3f1c624200 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java @@ -117,17 +117,40 @@ public abstract class BaseMonitoringService, T ext } try { log.info("Starting {}", getName()); - stopWatch.start(); - String accessToken = tbClient.logIn(); - reporter.reportLatency(Latencies.LOG_IN, stopWatch.getTime()); - try (WsClient wsClient = wsClientFactory.createClient(accessToken)) { + String accessToken; + try { stopWatch.start(); - wsClient.subscribeForTelemetry(devices, getTestTelemetryKeys()).waitForReply(); - reporter.reportLatency(Latencies.WS_SUBSCRIBE, stopWatch.getTime()); + accessToken = tbClient.logIn(); + reporter.reportLatency(Latencies.LOG_IN, stopWatch.getTime()); + reporter.serviceIsOk(MonitoredServiceKey.LOGIN); + } catch (Exception e) { + reporter.serviceFailure(MonitoredServiceKey.LOGIN, e); + return; + } + + WsClient wsClient; + try { + wsClient = wsClientFactory.createClient(accessToken); + reporter.serviceIsOk(MonitoredServiceKey.WS_CONNECT); + } catch (Exception e) { + reporter.serviceFailure(MonitoredServiceKey.WS_CONNECT, e); + return; + } + + try (WsClient ws = wsClient) { + try { + stopWatch.start(); + ws.subscribeForTelemetry(devices, getTestTelemetryKeys()).waitForReply(); + reporter.reportLatency(Latencies.WS_SUBSCRIBE, stopWatch.getTime()); + reporter.serviceIsOk(MonitoredServiceKey.WS_SUBSCRIBE); + } catch (Exception e) { + reporter.serviceFailure(MonitoredServiceKey.WS_SUBSCRIBE, e); + return; + } for (BaseHealthChecker healthChecker : healthCheckers) { - check(healthChecker, wsClient); + check(healthChecker, ws); } } diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/service/MonitoringReporter.java b/monitoring/src/main/java/org/thingsboard/monitoring/service/MonitoringReporter.java index b13394dbdc..75a8a819c0 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/service/MonitoringReporter.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/service/MonitoringReporter.java @@ -25,7 +25,6 @@ import org.springframework.stereotype.Component; import org.thingsboard.common.util.JacksonUtil; import org.thingsboard.monitoring.client.TbClient; import org.thingsboard.monitoring.data.Latency; -import org.thingsboard.monitoring.data.MonitoredServiceKey; import org.thingsboard.monitoring.data.notification.HighLatencyNotification; import org.thingsboard.monitoring.data.notification.ServiceFailureNotification; import org.thingsboard.monitoring.data.notification.ServiceRecoveryNotification; @@ -119,9 +118,7 @@ public class MonitoringReporter { public void serviceIsOk(Object serviceKey) { ServiceRecoveryNotification notification = new ServiceRecoveryNotification(serviceKey); - if (!serviceKey.equals(MonitoredServiceKey.GENERAL)) { - log.info(notification.getText()); - } + log.info(notification.getText()); AtomicInteger failuresCounter = failuresCounters.get(serviceKey); if (failuresCounter != null) { if (failuresCounter.get() >= failuresThreshold) { diff --git a/monitoring/src/main/resources/tb-monitoring.yml b/monitoring/src/main/resources/tb-monitoring.yml index 053e42bb9d..768fd2e4c7 100644 --- a/monitoring/src/main/resources/tb-monitoring.yml +++ b/monitoring/src/main/resources/tb-monitoring.yml @@ -121,11 +121,24 @@ monitoring: notifications: message_prefix: '${NOTIFICATION_MESSAGE_PREFIX:}' + # Incident grouping (threads alerts into incidents, auto-resolves after timeout). + # Requires a channel that supports it — currently only Slack API (bot_token + channel_id). + incident: + # Enable incident grouping + enabled: '${INCIDENT_ENABLED:false}' + # Incident resolution timeout in seconds + resolution_timeout_s: '${INCIDENT_RESOLUTION_TIMEOUT_S:90}' + # Tag @channel in incident messages + tag_channel: '${INCIDENT_TAG_CHANNEL:false}' slack: # Enable notifying via Slack enabled: '${SLACK_NOTIFICATION_CHANNEL_ENABLED:false}' # Slack webhook url webhook_url: '${SLACK_WEBHOOK_URL:}' + # Slack Bot OAuth token (xoxb-...) for API-based messaging with incident support, requires chat:write:bot scope + bot_token: '${SLACK_BOT_TOKEN:}' + # Slack channel ID (e.g. C01234ABCDE) - required when incident feature is enabled + channel_id: '${SLACK_CHANNEL_ID:}' latency: # Enable latencies reporting diff --git a/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java b/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java new file mode 100644 index 0000000000..92c6996d1d --- /dev/null +++ b/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java @@ -0,0 +1,70 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.monitoring.data.notification; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +class ServiceFailureNotificationTest { + + @Test + void stripResponseBodyRemovesNginxErrorHtml() { + String msg = "503 Service Temporarily Unavailable on POST request for \"https://domain/api/auth/login\": \"" + + "503 Service Temporarily Unavailable" + + "

503 Service Temporarily Unavailable


nginx
\""; + + String sanitized = ServiceFailureNotification.stripResponseBody(msg); + + assertThat(sanitized) + .isEqualTo("503 Service Temporarily Unavailable on POST request for \"https://domain/api/auth/login\""); + } + + @Test + void stripResponseBodyRemovesDoctypeHtml() { + String msg = "500 Internal Server Error: \"...\""; + + String sanitized = ServiceFailureNotification.stripResponseBody(msg); + + assertThat(sanitized).isEqualTo("500 Internal Server Error"); + } + + @Test + void stripResponseBodyLeavesPlainMessagesUntouched() { + String msg = "Connection refused"; + assertThat(ServiceFailureNotification.stripResponseBody(msg)).isEqualTo(msg); + } + + @Test + void stripResponseBodyHandlesNull() { + assertThat(ServiceFailureNotification.stripResponseBody(null)).isNull(); + } + + @Test + void shortNameUsesShortNameProviderWhenAvailable() { + ShortNameProvider provider = () -> "MQTT"; + assertThat(ServiceFailureNotification.shortName(provider)).isEqualTo("MQTT"); + } + + @Test + void shortNameFallsBackToToStringForOtherKeys() { + Object key = new Object() { + @Override public String toString() { return "LOGIN"; } + }; + assertThat(ServiceFailureNotification.shortName(key)).isEqualTo("LOGIN"); + } + +} diff --git a/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java b/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java new file mode 100644 index 0000000000..81202efe25 --- /dev/null +++ b/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java @@ -0,0 +1,162 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.monitoring.notification.incident; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.thingsboard.monitoring.data.notification.AffectedService; + +import java.time.Duration; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.assertj.core.api.Assertions.assertThat; + +class IncidentManagerTest { + + private RecordingTransport transport; + private IncidentManager manager; + + @BeforeEach + void setUp() { + transport = new RecordingTransport(); + manager = new IncidentManager(transport, 3600L, "tbqa", false); + } + + @AfterEach + void tearDown() { + manager.shutdown(); + } + + @Test + void formatDurationRendersMinutesAndHours() { + assertThat(IncidentManager.formatDuration(Duration.ofSeconds(30))).isEqualTo("0m"); + assertThat(IncidentManager.formatDuration(Duration.ofMinutes(5))).isEqualTo("5m"); + assertThat(IncidentManager.formatDuration(Duration.ofMinutes(59))).isEqualTo("59m"); + assertThat(IncidentManager.formatDuration(Duration.ofMinutes(60))).isEqualTo("1h"); + assertThat(IncidentManager.formatDuration(Duration.ofMinutes(75))).isEqualTo("1h15m"); + assertThat(IncidentManager.formatDuration(Duration.ofMinutes(120))).isEqualTo("2h"); + } + + @Test + void firstFailureOpensIncidentAndPostsHeaderAndReply() { + manager.sendAlert("CoAP failure message", + List.of(AffectedService.failing("CoAP", 1))); + + assertThat(transport.incidents).hasSize(1); + assertThat(transport.incidents.get(0)).contains(":rotating_light:").contains(":red_circle: CoAP (1)"); + assertThat(transport.replies).hasSize(1); + assertThat(transport.replies.get(0).text()).isEqualTo("CoAP failure message"); + } + + @Test + void isolatedRecoveryWithoutActiveIncidentIsIgnored() { + manager.sendAlert("Login is OK", + List.of(AffectedService.recovered("Login"))); + + assertThat(transport.incidents).isEmpty(); + assertThat(transport.replies).isEmpty(); + assertThat(transport.updates).isEmpty(); + } + + @Test + void subsequentFailureUpdatesHeaderAndPostsReply() { + manager.sendAlert("CoAP failure", List.of(AffectedService.failing("CoAP", 1))); + manager.sendAlert("CoAP repeat", List.of(AffectedService.failing("CoAP", 3))); + + assertThat(transport.incidents).hasSize(1); + assertThat(transport.replies).hasSize(2); + assertThat(transport.updates).hasSize(1); + assertThat(transport.updates.get(0).text()).contains(":red_circle: CoAP (3)"); + } + + @Test + void recoveryAfterFailureMovesServiceToGreenAndUpdatesHeader() { + manager.sendAlert("CoAP failure", List.of(AffectedService.failing("CoAP", 1))); + manager.sendAlert("CoAP is OK", List.of(AffectedService.recovered("CoAP"))); + + assertThat(transport.updates).hasSize(1); + String updated = transport.updates.get(0).text(); + assertThat(updated).contains(":large_green_circle: CoAP").doesNotContain(":red_circle:"); + } + + @Test + void highLatencyIsTrackedAsYellow() { + manager.sendAlert("high latency", + List.of(AffectedService.highLatency("logInLatency"))); + + assertThat(transport.incidents.get(0)).contains(":large_yellow_circle: logInLatency"); + } + + @Test + void repeatingSameFailureCountDoesNotTriggerRedundantUpdate() { + manager.sendAlert("CoAP failure", List.of(AffectedService.failing("CoAP", 3))); + manager.sendAlert("CoAP still failing", List.of(AffectedService.failing("CoAP", 3))); + + assertThat(transport.updates).isEmpty(); + assertThat(transport.replies).hasSize(2); + } + + @Test + void fullLifecycleStartFailRecoverResolve() { + manager.sendAlert("Login failure", List.of(AffectedService.failing("Login", 1))); + manager.sendAlert("WS failure", List.of(AffectedService.failing("WS Connect", 1))); + manager.sendAlert("Login is OK", List.of(AffectedService.recovered("Login"))); + + assertThat(transport.incidents).hasSize(1); + + manager.resolveIncident(); + + assertThat(transport.updates).last() + .extracting(RecordingTransport.Message::text) + .asString() + .contains(":white_check_mark:") + .contains(":red_circle: WS Connect") + .contains(":large_green_circle: Login"); + } + + @Test + void resolveWithoutActiveIncidentIsNoOp() { + manager.resolveIncident(); + assertThat(transport.updates).isEmpty(); + } + + private static class RecordingTransport implements IncidentTransport { + private final AtomicInteger threadCounter = new AtomicInteger(); + final java.util.List incidents = new java.util.ArrayList<>(); + final java.util.List replies = new java.util.ArrayList<>(); + final java.util.List updates = new java.util.ArrayList<>(); + + @Override + public String postIncident(String text) { + incidents.add(text); + return "thread-" + threadCounter.incrementAndGet(); + } + + @Override + public void postThreadReply(String threadId, String text) { + replies.add(new Message(threadId, text)); + } + + @Override + public void updateIncident(String threadId, String text) { + updates.add(new Message(threadId, text)); + } + + record Message(String threadId, String text) {} + } +} From 7033f60881deafb3d8cbe7ff1a5391a7e0f9dc41 Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Tue, 21 Apr 2026 10:43:14 +0200 Subject: [PATCH 03/13] bump slack-api-client from 1.39.0 to 1.48.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changelog review (v1.40..v1.48): no breaking changes to the slack-api-client APIs used in this codebase. The only breaking changes in v1.45.0 were in slack-app-backend (servlet classes) and slack-api-bolt-aws-lambda-s3-storage (AWS SDK v1→v2), neither of which we depend on. Transitive deps (okhttp, gson, okio) are unchanged. Verified by compiling monitoring + application against 1.48.0 and running the monitoring unit tests plus the application Notification/Slack test suites (15 + 54 tests, all pass). --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a4aa43688d..107e707bcf 100755 --- a/pom.xml +++ b/pom.xml @@ -164,7 +164,7 @@ 0.4.8 1.0.0 - 1.39.0 + 1.48.0 6.6.0 1.35.0 1.6.1 From b0030087e3f0eaaaa7f6eee5d91fb83f3ed13f2c Mon Sep 17 00:00:00 2001 From: Sergey Matvienko Date: Tue, 21 Apr 2026 08:39:51 +0200 Subject: [PATCH 04/13] Bump PostgreSQL test images to 18 - postgres: 16.6 -> 18 (dao sql-test.properties / nosql-test.properties) - timescaledb: latest-pg12 -> latest-pg18 (dao timescale-test.properties) TimescaleDB pg15+ images crash on cgroup v2 CI hosts because /docker-entrypoint-initdb.d/001_timescaledb_tune.sh evaluates [ ${TS_TUNE_MEMORY} -gt ${FREE_BYTES} ] with an empty left operand after the kernel reports the 64-bit max for /sys/fs/cgroup/memory.max. Work around the upstream bug by setting NO_TS_TUNE=true. The Testcontainers JDBC URL (jdbc:tc:timescaledb:...) does not support docker env vars, so register a custom JdbcDatabaseContainerProvider (TbTimescaleDBContainerProvider, activated via jdbc:tc:tbtimescaledb:...) that starts a PostgreSQLContainer backed by timescale/timescaledb with NO_TS_TUNE=true. Production docker-compose files and tb-postgres image are untouched. --- .../dao/TbTimescaleDBContainerProvider.java | 47 +++++++++++++++++++ ...s.containers.JdbcDatabaseContainerProvider | 1 + dao/src/test/resources/nosql-test.properties | 2 +- dao/src/test/resources/sql-test.properties | 2 +- .../test/resources/timescale-test.properties | 2 +- 5 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 dao/src/test/java/org/thingsboard/server/dao/TbTimescaleDBContainerProvider.java create mode 100644 dao/src/test/resources/META-INF/services/org.testcontainers.containers.JdbcDatabaseContainerProvider diff --git a/dao/src/test/java/org/thingsboard/server/dao/TbTimescaleDBContainerProvider.java b/dao/src/test/java/org/thingsboard/server/dao/TbTimescaleDBContainerProvider.java new file mode 100644 index 0000000000..8afcaa1c22 --- /dev/null +++ b/dao/src/test/java/org/thingsboard/server/dao/TbTimescaleDBContainerProvider.java @@ -0,0 +1,47 @@ +/** + * Copyright © 2016-2026 The Thingsboard Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.thingsboard.server.dao; + +import org.testcontainers.containers.JdbcDatabaseContainer; +import org.testcontainers.containers.TimescaleDBContainerProvider; + +/** + * Extends the upstream {@link TimescaleDBContainerProvider} to disable the + * timescaledb-tune entrypoint script via NO_TS_TUNE=true. + * + * Works around a shell bug in /docker-entrypoint-initdb.d/001_timescaledb_tune.sh + * that crashes the container entrypoint on cgroup v2 hosts (including CI agents) + * when the kernel reports the 64-bit max for memory.max. + * + * Activated by the jdbc:tc:tbtimescaledb:<tag>:///... URL prefix + * registered via META-INF/services. + */ +public class TbTimescaleDBContainerProvider extends TimescaleDBContainerProvider { + + private static final String NAME = "tbtimescaledb"; + + @Override + public boolean supports(String databaseType) { + return NAME.equals(databaseType); + } + + @Override + public JdbcDatabaseContainer newInstance(String tag) { + JdbcDatabaseContainer container = super.newInstance(tag); + container.withEnv("NO_TS_TUNE", "true"); + return container; + } +} diff --git a/dao/src/test/resources/META-INF/services/org.testcontainers.containers.JdbcDatabaseContainerProvider b/dao/src/test/resources/META-INF/services/org.testcontainers.containers.JdbcDatabaseContainerProvider new file mode 100644 index 0000000000..ab36744aa9 --- /dev/null +++ b/dao/src/test/resources/META-INF/services/org.testcontainers.containers.JdbcDatabaseContainerProvider @@ -0,0 +1 @@ +org.thingsboard.server.dao.TbTimescaleDBContainerProvider diff --git a/dao/src/test/resources/nosql-test.properties b/dao/src/test/resources/nosql-test.properties index b688c3c40f..921aebd5fe 100644 --- a/dao/src/test/resources/nosql-test.properties +++ b/dao/src/test/resources/nosql-test.properties @@ -13,6 +13,6 @@ spring.jpa.show-sql=false spring.jpa.hibernate.ddl-auto=none spring.datasource.username=postgres spring.datasource.password=postgres -spring.datasource.url=jdbc:tc:postgresql:16.6:///thingsboard?TC_DAEMON=true&TC_TMPFS=/testtmpfs:rw&?TC_INITFUNCTION=org.thingsboard.server.dao.PostgreSqlInitializer::initDb +spring.datasource.url=jdbc:tc:postgresql:18:///thingsboard?TC_DAEMON=true&TC_TMPFS=/testtmpfs:rw&?TC_INITFUNCTION=org.thingsboard.server.dao.PostgreSqlInitializer::initDb spring.datasource.driverClassName=org.testcontainers.jdbc.ContainerDatabaseDriver spring.datasource.hikari.maximumPoolSize=16 diff --git a/dao/src/test/resources/sql-test.properties b/dao/src/test/resources/sql-test.properties index e3f4861aa9..0639c461a3 100644 --- a/dao/src/test/resources/sql-test.properties +++ b/dao/src/test/resources/sql-test.properties @@ -14,7 +14,7 @@ spring.jpa.show-sql=false spring.jpa.hibernate.ddl-auto=none spring.datasource.username=postgres spring.datasource.password=postgres -spring.datasource.url=jdbc:tc:postgresql:16.6:///thingsboard?TC_DAEMON=true&TC_TMPFS=/testtmpfs:rw&?TC_INITFUNCTION=org.thingsboard.server.dao.PostgreSqlInitializer::initDb +spring.datasource.url=jdbc:tc:postgresql:18:///thingsboard?TC_DAEMON=true&TC_TMPFS=/testtmpfs:rw&?TC_INITFUNCTION=org.thingsboard.server.dao.PostgreSqlInitializer::initDb spring.datasource.driverClassName=org.testcontainers.jdbc.ContainerDatabaseDriver spring.datasource.hikari.maximumPoolSize=16 diff --git a/dao/src/test/resources/timescale-test.properties b/dao/src/test/resources/timescale-test.properties index 2c5552cb75..e0c0bef25e 100644 --- a/dao/src/test/resources/timescale-test.properties +++ b/dao/src/test/resources/timescale-test.properties @@ -13,6 +13,6 @@ spring.jpa.show-sql=false spring.jpa.hibernate.ddl-auto=none spring.datasource.username=postgres spring.datasource.password=postgres -spring.datasource.url=jdbc:tc:timescaledb:latest-pg12:///thingsboard?TC_DAEMON=true&TC_TMPFS=/testtmpfs:rw&?TC_INITFUNCTION=org.thingsboard.server.dao.TimescaleSqlInitializer::initDb +spring.datasource.url=jdbc:tc:tbtimescaledb:latest-pg18:///thingsboard?TC_DAEMON=true&TC_TMPFS=/testtmpfs:rw&?TC_INITFUNCTION=org.thingsboard.server.dao.TimescaleSqlInitializer::initDb spring.datasource.driverClassName=org.testcontainers.jdbc.ContainerDatabaseDriver spring.datasource.hikari.maximumPoolSize = 50 From c87b20e0c269d6f0c94b396ea371b045e17fa378 Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Tue, 21 Apr 2026 18:41:46 +0200 Subject: [PATCH 05/13] linkify the URL in HTTP failure messages as a Slack hyperlink Transforms 'request for ""' into Slack mrkdwn , so the failure line renders the word 'request' as a clickable link instead of showing the raw URL in quotes. --- .../ServiceFailureNotification.java | 17 +++++++++++++++++ .../ServiceFailureNotificationTest.java | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java index 47daf043ab..5fcc953cce 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java @@ -19,6 +19,8 @@ import lombok.Getter; import org.apache.commons.lang3.exception.ExceptionUtils; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; @Getter public class ServiceFailureNotification implements Notification { @@ -46,9 +48,24 @@ public class ServiceFailureNotification implements Notification { errorMsg = error.getClass().getSimpleName(); } errorMsg = stripResponseBody(errorMsg); + errorMsg = linkifyRequestUrl(errorMsg); return String.format("%s - Failure: %s (number of subsequent failures: %s)", serviceKey, errorMsg, failuresCount); } + private static final Pattern REQUEST_URL_PATTERN = Pattern.compile("request for \"(https?://[^\"\\s]+)\""); + + static String linkifyRequestUrl(String msg) { + if (msg == null) { + return null; + } + Matcher m = REQUEST_URL_PATTERN.matcher(msg); + if (!m.find()) { + return msg; + } + // Slack mrkdwn link: + return m.replaceAll("<$1|request>"); + } + static String stripResponseBody(String msg) { if (msg == null) { return null; diff --git a/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java b/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java index 92c6996d1d..7f66a456fe 100644 --- a/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java +++ b/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java @@ -53,6 +53,25 @@ class ServiceFailureNotificationTest { assertThat(ServiceFailureNotification.stripResponseBody(null)).isNull(); } + @Test + void linkifyReplacesRequestForUrlWithSlackMrkdwnLink() { + String msg = "503 Service Temporarily Unavailable on POST request for \"https://qa-tb-pe-lts43.iot-private.cloud/api/auth/login\""; + + assertThat(ServiceFailureNotification.linkifyRequestUrl(msg)) + .isEqualTo("503 Service Temporarily Unavailable on POST "); + } + + @Test + void linkifyLeavesMessagesWithoutRequestUrlUntouched() { + String msg = "Connection refused"; + assertThat(ServiceFailureNotification.linkifyRequestUrl(msg)).isEqualTo(msg); + } + + @Test + void linkifyHandlesNull() { + assertThat(ServiceFailureNotification.linkifyRequestUrl(null)).isNull(); + } + @Test void shortNameUsesShortNameProviderWhenAvailable() { ShortNameProvider provider = () -> "MQTT"; From 43e87d458887b85c670ddb490900c823fd5015fc Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Tue, 21 Apr 2026 18:45:44 +0200 Subject: [PATCH 06/13] preserve failure count on service recovery in incident header Recovered services are now shown as ':large_green_circle: ()' with the last failure count that was observed before the service recovered, matching the red-circle format. --- .../notification/incident/IncidentManager.java | 13 +++++++------ .../notification/incident/IncidentManagerTest.java | 8 ++++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java index 133d665f4c..1e52c37634 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java @@ -51,7 +51,7 @@ public class IncidentManager { private Instant incidentStartTime; private Instant lastAlertTime; private final Map failingServices = new LinkedHashMap<>(); - private final Set recoveredServices = new LinkedHashSet<>(); + private final Map recoveredServices = new LinkedHashMap<>(); private final Set highLatencyServices = new LinkedHashSet<>(); public IncidentManager(IncidentTransport transport, long resolutionTimeoutSeconds, @@ -110,13 +110,14 @@ public class IncidentManager { if (prev == null || prev.intValue() != service.failureCount()) { changed = true; } - if (recoveredServices.remove(name)) { + if (recoveredServices.remove(name) != null) { changed = true; } } case RECOVERED -> { - if (failingServices.remove(name) != null) { - recoveredServices.add(name); + Integer lastFailureCount = failingServices.remove(name); + if (lastFailureCount != null) { + recoveredServices.put(name, lastFailureCount); changed = true; } } @@ -257,9 +258,9 @@ public class IncidentManager { sb.append(":large_yellow_circle: ").append(name); first = false; } - for (String name : recoveredServices) { + for (Map.Entry entry : recoveredServices.entrySet()) { if (!first) sb.append(", "); - sb.append(":large_green_circle: ").append(name); + sb.append(":large_green_circle: ").append(entry.getKey()).append(" (").append(entry.getValue()).append(")"); first = false; } return sb.toString(); diff --git a/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java b/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java index 81202efe25..afcd20adf3 100644 --- a/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java +++ b/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java @@ -85,13 +85,13 @@ class IncidentManagerTest { } @Test - void recoveryAfterFailureMovesServiceToGreenAndUpdatesHeader() { - manager.sendAlert("CoAP failure", List.of(AffectedService.failing("CoAP", 1))); + void recoveryAfterFailureMovesServiceToGreenAndKeepsFailureCount() { + manager.sendAlert("CoAP failure", List.of(AffectedService.failing("CoAP", 4))); manager.sendAlert("CoAP is OK", List.of(AffectedService.recovered("CoAP"))); assertThat(transport.updates).hasSize(1); String updated = transport.updates.get(0).text(); - assertThat(updated).contains(":large_green_circle: CoAP").doesNotContain(":red_circle:"); + assertThat(updated).contains(":large_green_circle: CoAP (4)").doesNotContain(":red_circle:"); } @Test @@ -126,7 +126,7 @@ class IncidentManagerTest { .asString() .contains(":white_check_mark:") .contains(":red_circle: WS Connect") - .contains(":large_green_circle: Login"); + .contains(":large_green_circle: Login (1)"); } @Test From 27c2844fecb382a2d741572a100baf7d08cba043 Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Tue, 21 Apr 2026 18:53:06 +0200 Subject: [PATCH 07/13] linkify URL for 'request: Connect to failed' error variant Spring wraps Apache HttpClient connect failures as 'I/O error on POST request: Connect to failed: '. Same treatment as the RestClient variant: the URL is embedded in the 'request' word and the 'Connect to ... failed' restatement is dropped from the visible text. --- .../ServiceFailureNotification.java | 19 +++++++++++++------ .../ServiceFailureNotificationTest.java | 12 ++++++++++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java index 5fcc953cce..69e336f334 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotification.java @@ -52,18 +52,25 @@ public class ServiceFailureNotification implements Notification { return String.format("%s - Failure: %s (number of subsequent failures: %s)", serviceKey, errorMsg, failuresCount); } - private static final Pattern REQUEST_URL_PATTERN = Pattern.compile("request for \"(https?://[^\"\\s]+)\""); + // Spring RestClient: '... request for ""' + private static final Pattern REQUEST_FOR_URL_PATTERN = Pattern.compile("request for \"(https?://[^\"\\s]+)\""); + // Apache HttpClient wrapped by Spring: 'I/O error on POST request: Connect to failed: ' + private static final Pattern REQUEST_CONNECT_PATTERN = Pattern.compile("request: Connect to (https?://\\S+?) failed:"); static String linkifyRequestUrl(String msg) { if (msg == null) { return null; } - Matcher m = REQUEST_URL_PATTERN.matcher(msg); - if (!m.find()) { - return msg; - } // Slack mrkdwn link: - return m.replaceAll("<$1|request>"); + Matcher m = REQUEST_FOR_URL_PATTERN.matcher(msg); + if (m.find()) { + return m.replaceAll("<$1|request>"); + } + Matcher m2 = REQUEST_CONNECT_PATTERN.matcher(msg); + if (m2.find()) { + return m2.replaceAll("<$1|request>:"); + } + return msg; } static String stripResponseBody(String msg) { diff --git a/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java b/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java index 7f66a456fe..500a713f7b 100644 --- a/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java +++ b/monitoring/src/test/java/org/thingsboard/monitoring/data/notification/ServiceFailureNotificationTest.java @@ -55,10 +55,18 @@ class ServiceFailureNotificationTest { @Test void linkifyReplacesRequestForUrlWithSlackMrkdwnLink() { - String msg = "503 Service Temporarily Unavailable on POST request for \"https://qa-tb-pe-lts43.iot-private.cloud/api/auth/login\""; + String msg = "503 Service Temporarily Unavailable on POST request for \"https://example.com/api/auth/login\""; assertThat(ServiceFailureNotification.linkifyRequestUrl(msg)) - .isEqualTo("503 Service Temporarily Unavailable on POST "); + .isEqualTo("503 Service Temporarily Unavailable on POST "); + } + + @Test + void linkifyReplacesRequestConnectToUrlFailed() { + String msg = "I/O error on POST request: Connect to https://example.com:443 failed: Connect timed out"; + + assertThat(ServiceFailureNotification.linkifyRequestUrl(msg)) + .isEqualTo("I/O error on POST : Connect timed out"); } @Test From 600c8bf280b32f1a4bbafd0bfb537a2c601cad50 Mon Sep 17 00:00:00 2001 From: dashevchenko Date: Fri, 20 Feb 2026 16:27:19 +0200 Subject: [PATCH 08/13] Fix flaky DefaultResourceDataCacheTest.testGetCachedResourceData Await cached resource data to become available after save eviction before asserting, and await null after deletion. Prevents Mockito verifyNoMoreInteractions(resourceService) failure caused by racing background cache-load invocations. Backport of 99334ba7fe0 from master. --- .../service/resource/DefaultResourceDataCacheTest.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/application/src/test/java/org/thingsboard/server/service/resource/DefaultResourceDataCacheTest.java b/application/src/test/java/org/thingsboard/server/service/resource/DefaultResourceDataCacheTest.java index c227ec76aa..d33f1755d5 100644 --- a/application/src/test/java/org/thingsboard/server/service/resource/DefaultResourceDataCacheTest.java +++ b/application/src/test/java/org/thingsboard/server/service/resource/DefaultResourceDataCacheTest.java @@ -15,6 +15,7 @@ */ package org.thingsboard.server.service.resource; +import org.awaitility.Awaitility; import org.junit.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.test.context.bean.override.mockito.MockitoSpyBean; @@ -29,6 +30,8 @@ import org.thingsboard.server.dao.resource.ResourceService; import org.thingsboard.server.dao.resource.TbResourceDataCache; import org.thingsboard.server.dao.service.DaoSqlTest; +import java.util.concurrent.TimeUnit; + import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.clearInvocations; import static org.mockito.Mockito.timeout; @@ -61,6 +64,8 @@ public class DefaultResourceDataCacheTest extends AbstractControllerTest { TbResourceInfo savedResource = tbResourceService.save(resource); verify(resourceDataCache, timeout(2000).times(1)).evictResourceData(tenantId, savedResource.getId()); + Awaitility.await().atMost(2, TimeUnit.SECONDS).untilAsserted(() -> + assertThat(resourceDataCache.getResourceDataInfoAsync(tenantId, savedResource.getId()).get()).isNotNull()); TbResourceDataInfo cachedData = resourceDataCache.getResourceDataInfoAsync(tenantId, savedResource.getId()).get(); assertThat(cachedData.getData()).isEqualTo(data); assertThat(JacksonUtil.treeToValue(cachedData.getDescriptor(), GeneralFileDescriptor.class)).isEqualTo(descriptor); @@ -76,8 +81,8 @@ public class DefaultResourceDataCacheTest extends AbstractControllerTest { TbResource resourceById = resourceService.findResourceById(tenantId, savedResource.getId()); tbResourceService.delete(resourceById, true, null); verify(resourceDataCache, timeout(2000).times(2)).evictResourceData(tenantId, savedResource.getId()); - TbResourceDataInfo cachedDataAfterDeletion = resourceDataCache.getResourceDataInfoAsync(tenantId, savedResource.getId()).get(); - assertThat(cachedDataAfterDeletion).isEqualTo(null); + Awaitility.await().atMost(2, TimeUnit.SECONDS).untilAsserted(() -> + assertThat(resourceDataCache.getResourceDataInfoAsync(tenantId, savedResource.getId()).get()).isNull()); } } From 1b749ebadfe51de8837fbcaab3cc2d21b7a4e6e3 Mon Sep 17 00:00:00 2001 From: Sergey Matvienko Date: Tue, 21 Apr 2026 20:01:31 +0200 Subject: [PATCH 09/13] Fix flaky UserEdgeTest.testCreateUpdateDeleteTenantUser The test asserts exactly 2 UserCredentialsUpdateMsg after creating a new tenant-admin user, but the user activation flow can emit either 2 or 3 depending on timing: - activateUserCredentials publishes CREDENTIALS_UPDATED (msg #1) - setUserCredentialsEnabled publishes CREDENTIALS_UPDATED (msg #2) - the initial USER ADDED edge event is processed asynchronously in UserEdgeProcessor and bundles an extra UserCredentialsUpdateMsg when it finds userCredentials.isEnabled() == true (i.e. activation already raced past the ADDED event) When the race goes the second way we end up with 1 UserUpdateMsg plus 3 UserCredentialsUpdateMsg, which currently fails the hard-coded assertEquals(2, ...) assertion. Accept both 2 and 3 UserCredentialsUpdateMsg instead of asserting an exact count, matching the reality of the asynchronous edge event pipeline. --- .../java/org/thingsboard/server/edge/UserEdgeTest.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/application/src/test/java/org/thingsboard/server/edge/UserEdgeTest.java b/application/src/test/java/org/thingsboard/server/edge/UserEdgeTest.java index 6b3bffc883..2ec67a7886 100644 --- a/application/src/test/java/org/thingsboard/server/edge/UserEdgeTest.java +++ b/application/src/test/java/org/thingsboard/server/edge/UserEdgeTest.java @@ -57,7 +57,12 @@ public class UserEdgeTest extends AbstractEdgeTest { User savedTenantAdmin = createUser(newTenantAdmin, "tenant"); Assert.assertTrue(edgeImitator.waitForMessages()); // wait 3 messages - x1 user update msg and x2 user credentials update msgs (create + authenticate user) Assert.assertEquals(1, edgeImitator.findAllMessagesByType(UserUpdateMsg.class).size()); - Assert.assertEquals(2, edgeImitator.findAllMessagesByType(UserCredentialsUpdateMsg.class).size()); + // The initial USER ADDED edge event may bundle a UserCredentialsUpdateMsg when + // user activation completes before the event is processed, in addition to the 2 + // messages from the CREDENTIALS_UPDATED events fired during activation. Accept 2 or 3. + int credMsgCount = edgeImitator.findAllMessagesByType(UserCredentialsUpdateMsg.class).size(); + Assert.assertTrue("Expected 2 or 3 UserCredentialsUpdateMsg (ADDED/activation race), got " + credMsgCount, + credMsgCount == 2 || credMsgCount == 3); Optional userUpdateMsgOpt = edgeImitator.findMessageByType(UserUpdateMsg.class); Assert.assertTrue(userUpdateMsgOpt.isPresent()); UserUpdateMsg userUpdateMsg = userUpdateMsgOpt.get(); From 310e53fb23ab539d8c244149629e5ab4ed621a33 Mon Sep 17 00:00:00 2001 From: Sergey Matvienko Date: Tue, 21 Apr 2026 20:13:49 +0200 Subject: [PATCH 10/13] Fix flaky DeviceProfileControllerTest invalid-RPC-schema tests Several testSaveProtoDeviceProfileWithInvalidRpcRequestSchema* tests intermittently fail with: org.thingsboard.server.dao.exception.TenantNotFoundException: Tenant with id not found when the tenant created in @Before has not yet been populated in the tenant profile cache by the time the request hits the partition-lookup path (DefaultTenantRoutingInfoService -> TbTenantProfileCache -> TenantService#findTenantById). The underlying request is idempotent (the schema is invalid so it is rejected with 400 regardless of retries), so wrap the doPost + status assertion in Awaitility with Mockito.reset inside the retry block: only the last attempt's invocations are visible to the subsequent verify* assertions. Applies to all testSaveDeviceProfileWithInvalidRpcRequestProtoSchema callers, including the currently-muted testSaveProtoDeviceProfileWithInvalidRpcRequestSchemaRequestIdDateType. --- .../DeviceProfileControllerTest.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/application/src/test/java/org/thingsboard/server/controller/DeviceProfileControllerTest.java b/application/src/test/java/org/thingsboard/server/controller/DeviceProfileControllerTest.java index 225757e451..7d88edd73f 100644 --- a/application/src/test/java/org/thingsboard/server/controller/DeviceProfileControllerTest.java +++ b/application/src/test/java/org/thingsboard/server/controller/DeviceProfileControllerTest.java @@ -54,11 +54,13 @@ import org.thingsboard.server.common.data.security.Authority; import org.thingsboard.server.dao.device.DeviceProfileDao; import org.thingsboard.server.dao.exception.DataValidationException; import org.thingsboard.server.dao.service.DaoSqlTest; +import org.awaitility.Awaitility; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import static org.assertj.core.api.Assertions.assertThat; @@ -1028,11 +1030,17 @@ public class DeviceProfileControllerTest extends AbstractControllerTest { MqttDeviceProfileTransportConfiguration mqttDeviceProfileTransportConfiguration = this.createMqttDeviceProfileTransportConfiguration(protoTransportPayloadConfiguration, false); DeviceProfile deviceProfile = this.createDeviceProfile("Device Profile", mqttDeviceProfileTransportConfiguration); - Mockito.reset(tbClusterService, auditLogService); - - doPost("/api/deviceProfile", deviceProfile) - .andExpect(status().isBadRequest()) - .andExpect(statusReason(containsString(errorMsg))); + // The request may hit a transient TenantNotFoundException right after the @Before tenant creation + // if the tenant profile cache is not yet warmed up for the newly created tenant. Retry until the + // request returns the expected 400 Bad Request for the invalid schema. Mockito.reset is inside the + // retry loop so the subsequent verify* assertions see only the invocations from the last attempt. + Awaitility.await().atMost(10, TimeUnit.SECONDS).pollInterval(500, TimeUnit.MILLISECONDS) + .ignoreExceptions().untilAsserted(() -> { + Mockito.reset(tbClusterService, auditLogService); + doPost("/api/deviceProfile", deviceProfile) + .andExpect(status().isBadRequest()) + .andExpect(statusReason(containsString(errorMsg))); + }); testNotifyEntityEqualsOneTimeServiceNeverError(deviceProfile, savedTenant.getId(), tenantAdmin.getId(), tenantAdmin.getEmail(), ActionType.ADDED, new DataValidationException(errorMsg)); From 83c4a9325545b800a69f343f5db490596bf0d57b Mon Sep 17 00:00:00 2001 From: Sergey Matvienko Date: Wed, 22 Apr 2026 00:11:47 +0200 Subject: [PATCH 11/13] Fix maven-assembly-plugin skipAssembly leaking into every execution Root pom.xml wired ${pkg.skip.zip} at the plugin-level of maven-assembly-plugin inside the always-active `packaging` profile's . Maven merges plugin-level into every execution of that plugin, so -Dpkg.skip.zip=true (and the -Dpkg.skip=true alias that activates it) suppressed any maven-assembly-plugin execution across the reactor - not only the intended Windows ZIP execution. In CE lts-4.2/4.3 this is latent (no CE module declares a non-ZIP assembly execution), but it breaks downstream forks that do. PE's rule-node-twilio-sms, for instance, declares a custom make-assembly execution producing the classified -rule-node.jar consumed by application's copy-pe-rule-nodes step; under -Dpkg.skip.zip=true that assembly silently became a no-op and the downstream build failed to resolve the classified artifact. tools/pom.xml already sidesteps this via `combine.self="override"` on its own - earlier evidence that the placement was fragile. Move into the `assembly` execution's own so it scopes only to the Windows ZIP execution. Verified via mvn help:effective-pom on application/: with the fix, true no longer appears at plugin-level , only inside the `assembly` . --- pom.xml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 409e16cdc9..d4f5d0f9df 100755 --- a/pom.xml +++ b/pom.xml @@ -618,7 +618,6 @@ org.apache.maven.plugins maven-assembly-plugin - ${pkg.skip.zip} ${pkg.name} ${main.dir}/packaging/${pkg.type}/assembly/windows.xml @@ -631,6 +630,9 @@ single + + ${pkg.skip.zip} + From bec05fab535229e712ee92d13a2c7c6ddd8fe650 Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Wed, 22 Apr 2026 09:51:35 +0200 Subject: [PATCH 12/13] hold incident resolution timer while services are still failing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The resolution countdown now only starts once every failing service has recovered. While any service is still firing the timer is cancelled, so the incident cannot auto-resolve and spawn a new incident for the same ongoing failure between alerts. High latency is a warning signal (no explicit recovery event) and therefore does not block resolution — only FAILING services do. --- .../incident/IncidentManager.java | 16 ++++++++++++-- .../incident/IncidentManagerTest.java | 21 +++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java index 1e52c37634..c25abfc081 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/notification/incident/IncidentManager.java @@ -94,7 +94,14 @@ public class IncidentManager { } finally { if (activeIncidentThreadId != null) { lastAlertTime = Instant.now(); - resetResolutionTimer(); + // High latency is a warning only — it has no explicit recovery signal + // (HighLatencyNotification fires only when something is above threshold), + // so resolution hinges on failing services alone. + if (failingServices.isEmpty()) { + resetResolutionTimer(); + } else { + cancelResolutionTimer(); + } } } } @@ -172,10 +179,15 @@ public class IncidentManager { } private void resetResolutionTimer() { + cancelResolutionTimer(); + resolutionTask = scheduler.schedule(this::resolveIncident, resolutionTimeoutSeconds, TimeUnit.SECONDS); + } + + private void cancelResolutionTimer() { if (resolutionTask != null) { resolutionTask.cancel(false); + resolutionTask = null; } - resolutionTask = scheduler.schedule(this::resolveIncident, resolutionTimeoutSeconds, TimeUnit.SECONDS); } private void startDurationUpdater() { diff --git a/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java b/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java index afcd20adf3..b7051db534 100644 --- a/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java +++ b/monitoring/src/test/java/org/thingsboard/monitoring/notification/incident/IncidentManagerTest.java @@ -135,6 +135,27 @@ class IncidentManagerTest { assertThat(transport.updates).isEmpty(); } + @Test + void doesNotAutoResolveWhileServicesAreStillFailing() throws Exception { + manager.shutdown(); + transport = new RecordingTransport(); + manager = new IncidentManager(transport, 1L, "tbqa", false); + + manager.sendAlert("CoAP failure", List.of(AffectedService.failing("CoAP", 1))); + Thread.sleep(1500); + + assertThat(transport.updates) + .extracting(RecordingTransport.Message::text) + .noneMatch(t -> t.contains(":white_check_mark:")); + + manager.sendAlert("CoAP is OK", List.of(AffectedService.recovered("CoAP"))); + Thread.sleep(1500); + + assertThat(transport.updates) + .extracting(RecordingTransport.Message::text) + .anyMatch(t -> t.contains(":white_check_mark:")); + } + private static class RecordingTransport implements IncidentTransport { private final AtomicInteger threadCounter = new AtomicInteger(); final java.util.List incidents = new java.util.ArrayList<>(); From df4dc250829e13b0442bc7b2ffd58a23e3573f47 Mon Sep 17 00:00:00 2001 From: Oleksii Kuripko Date: Thu, 23 Apr 2026 10:20:20 +0200 Subject: [PATCH 13/13] emit recovery signal for MonitoringServiceKey.GENERAL and route EDQS I/O errors correctly Two related fixes for a stuck-open 'Monitoring' entry in the incident header: 1. serviceIsOk(GENERAL) is now called when a monitoring cycle completes successfully. Previously GENERAL could only accumulate failures (via the outer Throwable catch), with no complementary recovery, so once the catch-all fired the service stayed red forever. 2. checkEdqs() is now wrapped in its own try/catch that reports any non-ServiceFailureException failures under EDQS rather than GENERAL. Connection/read timeouts hitting /api/entitiesQuery/find previously propagated unwrapped and were bucketed as GENERAL, which hid the fact that EDQS was the failing component. --- .../service/BaseMonitoringService.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java b/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java index 3f1c624200..9c8c8fc533 100644 --- a/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java +++ b/monitoring/src/main/java/org/thingsboard/monitoring/service/BaseMonitoringService.java @@ -155,13 +155,22 @@ public abstract class BaseMonitoringService, T ext } if (checkEdqs) { - stopWatch.start(); - checkEdqs(); - reporter.reportLatency(Latencies.EDQS_QUERY, stopWatch.getTime()); - reporter.serviceIsOk(MonitoredServiceKey.EDQS); + try { + stopWatch.start(); + checkEdqs(); + reporter.reportLatency(Latencies.EDQS_QUERY, stopWatch.getTime()); + reporter.serviceIsOk(MonitoredServiceKey.EDQS); + } catch (ServiceFailureException e) { + reporter.serviceFailure(e.getServiceKey(), e); + return; + } catch (Exception e) { + reporter.serviceFailure(MonitoredServiceKey.EDQS, e); + return; + } } reporter.reportLatencies(); + reporter.serviceIsOk(MonitoredServiceKey.GENERAL); log.debug("Finished {}", getName()); } catch (ServiceFailureException e) { reporter.serviceFailure(e.getServiceKey(), e);