Add configurable timeout and retry logic for Qlik Sense API calls

- Add timeout, maxRetries, and retryDelayMilliseconds config options for both health check and proxy session APIs
- Implement exponential backoff retry logic with configurable parameters
- Change healthmetrics.js to use async/await pattern for better error handling
- Process health checks concurrently with Promise.allSettled for better performance
- Default timeout increased from 5s to 30s to handle slower/Wi-Fi connections
- Default retry count set to 3 attempts with 1s initial delay
- Add comprehensive logging for retry attempts and final failures

Co-authored-by: mountaindude <1029262+mountaindude@users.noreply.github.com>
2025-12-19 09:47:53 -05:00 · 2025-12-18 06:13:05 +00:00
parent f7679cb8c2
commit c444156871
3 changed files with 158 additions and 56 deletions
--- a/src/config/production_template.yaml
+++ b/src/config/production_template.yaml
@@ -549,6 +549,9 @@ Butler-SOS:
        enableSessionExtract: true # Query unique user IDs of what users have sessions open (true/false)?
        # Items below are mandatory if enableSessionExtract=true
        pollingInterval: 30000 # How often (milliseconds) should session data be polled?
+        timeoutMilliseconds: 30000 # HTTP request timeout (milliseconds) for proxy session API calls. Default: 30000
+        maxRetries: 3 # Number of times to retry failed API calls. Default: 3
+        retryDelayMilliseconds: 1000 # Initial delay between retries (milliseconds). Doubles for each retry. Default: 1000
        excludeUser: # Optional blacklist of users that should be disregarded when it comes to session monitoring.
            # Blacklist is only applied to data in InfluxDB. All session data will be sent to MQTT.
            # - directory: LAB
@@ -558,6 +561,9 @@ Butler-SOS:

    serversToMonitor:
        pollingInterval: 30000 # How often (milliseconds) should the healthcheck API be polled?
+        timeoutMilliseconds: 30000 # HTTP request timeout (milliseconds) for health check API calls. Default: 30000
+        maxRetries: 3 # Number of times to retry failed API calls. Default: 3
+        retryDelayMilliseconds: 1000 # Initial delay between retries (milliseconds). Doubles for each retry. Default: 1000

        # If false, Butler SOS will accept TLS certificates on the server without verifying them with the CA.
        # If true, data will only be retrieved from the Sense server if that server's TLS cert verifies
--- a/src/lib/healthmetrics.js
+++ b/src/lib/healthmetrics.js
@@ -20,14 +20,16 @@ import { getCertificates, createCertificateOptions } from './cert-utils.js';
 *
 * This function makes an HTTPS request to the Sense engine healthcheck API and
 * distributes the data to configured destinations (MQTT, InfluxDB, New Relic, Prometheus).
+ * Implements retry logic with exponential backoff for transient network failures.
 *
 * @param {string} serverName - The name of the server as defined in the config.
 * @param {string} host - The hostname or IP address of the Sense server.
 * @param {object} tags - Tags/metadata to associate with the server metrics.
 * @param {object|null} headers - Additional headers to include in the request.
- * @returns {void}
+ * @param {number} retryCount - Current retry attempt number (used internally for recursion). Defaults to 0.
+ * @returns {Promise<void>}
 */
-export function getHealthStatsFromSense(serverName, host, tags, headers) {
+export async function getHealthStatsFromSense(serverName, host, tags, headers, retryCount = 0) {
    globals.logger.debug(`HEALTH: URL=https://${host}/engine/healthcheck`);

    // Get certificate configuration options
@@ -49,6 +51,17 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
        rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
    });

+    // Get timeout and retry settings from config with fallback defaults
+    const timeout = globals.config.has('Butler-SOS.serversToMonitor.timeoutMilliseconds')
+        ? globals.config.get('Butler-SOS.serversToMonitor.timeoutMilliseconds')
+        : 30000;
+    const maxRetries = globals.config.has('Butler-SOS.serversToMonitor.maxRetries')
+        ? globals.config.get('Butler-SOS.serversToMonitor.maxRetries')
+        : 3;
+    const retryDelay = globals.config.has('Butler-SOS.serversToMonitor.retryDelayMilliseconds')
+        ? globals.config.get('Butler-SOS.serversToMonitor.retryDelayMilliseconds')
+        : 1000;
+
    const requestSettings = {
        url: `https://${host}/engine/healthcheck`,
        method: 'get',
@@ -57,7 +70,7 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
            'Content-Type': 'application/json',
        },
        httpsAgent,
-        timeout: 5000,
+        timeout,
        maxRedirects: 5,
    };

@@ -71,43 +84,71 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
        });
    }

-    axios
-        .request(requestSettings)
-        .then((response) => {
-            if (response.status === 200) {
-                globals.logger.verbose(`HEALTH: Received ok response from ${tags.host}`);
-                globals.logger.debug(`HEALTH: ${JSON.stringify(response.data)}`);
+    try {
+        const response = await axios.request(requestSettings);

-                // Post to MQTT
-                if (globals.config.get('Butler-SOS.mqttConfig.enable') === true) {
-                    globals.logger.debug('HEALTH: Calling HEALTH metrics MQTT posting method');
-                    postHealthToMQTT(host, tags.host, response.data);
-                }
+        if (response.status === 200) {
+            globals.logger.verbose(`HEALTH: Received ok response from ${tags.host}`);
+            globals.logger.debug(`HEALTH: ${JSON.stringify(response.data)}`);

-                // Post to Influxdb
-                if (globals.config.get('Butler-SOS.influxdbConfig.enable') === true) {
-                    globals.logger.debug('HEALTH: Calling HEALTH metrics Influxdb posting method');
-                    postHealthMetricsToInfluxdb(serverName, host, response.data, tags);
-                }
-
-                // Post to New Relic
-                if (globals.config.get('Butler-SOS.newRelic.enable') === true) {
-                    globals.logger.debug('HEALTH: Calling HEALTH metrics New Relic posting method');
-                    postHealthMetricsToNewRelic(host, response.data, tags);
-                }
-
-                // Save latest available data for Prometheus
-                if (globals.config.get('Butler-SOS.prometheus.enable') === true) {
-                    globals.logger.debug('HEALTH: Calling HEALTH metrics Prometheus method');
-                    saveHealthMetricsToPrometheus(host, response.data, tags);
-                }
+            // Post to MQTT
+            if (globals.config.get('Butler-SOS.mqttConfig.enable') === true) {
+                globals.logger.debug('HEALTH: Calling HEALTH metrics MQTT posting method');
+                postHealthToMQTT(host, tags.host, response.data);
            }
-        })
-        .catch((err) => {
-            globals.logger.error(
-                `HEALTH: Error when calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}`
+
+            // Post to Influxdb
+            if (globals.config.get('Butler-SOS.influxdbConfig.enable') === true) {
+                globals.logger.debug('HEALTH: Calling HEALTH metrics Influxdb posting method');
+                postHealthMetricsToInfluxdb(serverName, host, response.data, tags);
+            }
+
+            // Post to New Relic
+            if (globals.config.get('Butler-SOS.newRelic.enable') === true) {
+                globals.logger.debug('HEALTH: Calling HEALTH metrics New Relic posting method');
+                postHealthMetricsToNewRelic(host, response.data, tags);
+            }
+
+            // Save latest available data for Prometheus
+            if (globals.config.get('Butler-SOS.prometheus.enable') === true) {
+                globals.logger.debug('HEALTH: Calling HEALTH metrics Prometheus method');
+                saveHealthMetricsToPrometheus(host, response.data, tags);
+            }
+        }
+    } catch (err) {
+        // Check if we should retry based on error type and retry count
+        const shouldRetry =
+            retryCount < maxRetries &&
+            (err.code === 'ECONNABORTED' || // Timeout
+                err.code === 'ECONNRESET' || // Connection reset
+                err.code === 'ETIMEDOUT' || // Network timeout
+                err.code === 'ENOTFOUND' || // DNS lookup failed
+                err.code === 'ENETUNREACH'); // Network unreachable
+
+        if (shouldRetry) {
+            // Calculate exponential backoff delay
+            const delay = retryDelay * Math.pow(2, retryCount);
+            globals.logger.warn(
+                `HEALTH: Error calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}. Retrying in ${delay}ms (attempt ${retryCount + 1}/${maxRetries})...`
            );
-        });
+
+            // Wait before retrying
+            await new Promise((resolve) => setTimeout(resolve, delay));
+
+            // Recursive retry
+            return getHealthStatsFromSense(serverName, host, tags, headers, retryCount + 1);
+        }
+
+        // Final error after all retries exhausted or non-retryable error
+        globals.logger.error(
+            `HEALTH: Error when calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}`
+        );
+        if (retryCount > 0) {
+            globals.logger.error(
+                `HEALTH: Failed after ${retryCount} ${retryCount === 1 ? 'retry' : 'retries'}`
+            );
+        }
+    }
 }

 /**
@@ -120,29 +161,30 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
 */
 export function setupHealthMetricsTimer() {
    // Configure timer for getting healthcheck data
-    setInterval(() => {
+    setInterval(async () => {
        globals.logger.verbose('HEALTH: Event started: Statistics collection');

-        globals.serverList.forEach((server) => {
-            globals.logger.verbose(`HEALTH: Getting stats for server: ${server.serverName}`);
-            globals.logger.debug(`HEALTH: Server details: ${JSON.stringify(server)}`);
+        // Process all servers concurrently with error handling
+        const healthCheckPromises = globals.serverList.map(async (server) => {
+            try {
+                globals.logger.verbose(`HEALTH: Getting stats for server: ${server.serverName}`);
+                globals.logger.debug(`HEALTH: Server details: ${JSON.stringify(server)}`);

-            // Get per-server tags
-            const tags = getServerTags(globals.logger, server);
+                // Get per-server tags
+                const tags = getServerTags(globals.logger, server);

-            // Save tags to global variable.
-            // Add a new object to the array, with properties host andd tags.
-            // The tags property is an array with all the tags for the server.
-            // Each tag object has a name and a value.
-            // globals.serverTags.push({
-            //     host: server.host,
-            //     tags,
-            // });
+                // Get per-server headers
+                const headers = getServerHeaders(server);

-            // Get per-server headers
-            const headers = getServerHeaders(server);
-
-            getHealthStatsFromSense(server.serverName, server.host, tags, headers);
+                await getHealthStatsFromSense(server.serverName, server.host, tags, headers);
+            } catch (err) {
+                globals.logger.error(
+                    `HEALTH: Unexpected error processing health stats for server '${server.serverName}': ${globals.getErrorMessage(err)}`
+                );
+            }
        });
+
+        // Wait for all health checks to complete
+        await Promise.allSettled(healthCheckPromises);
    }, globals.config.get('Butler-SOS.serversToMonitor.pollingInterval'));
 }
--- a/src/lib/proxysessionmetrics.js
+++ b/src/lib/proxysessionmetrics.js
@@ -206,14 +206,22 @@ function prepUserSessionMetrics(serverName, host, virtualProxy, body, tags) {
 * This function makes an API call to the Qlik Sense Proxy API to get information about
 * active user sessions. It then processes this data and sends it to configured destinations
 * (MQTT, InfluxDB, New Relic, Prometheus).
+ * Implements retry logic with exponential backoff for transient network failures.
 *
 * @param {string} serverName - Name of the Qlik Sense server
 * @param {string} host - Host name or IP of the Qlik Sense server
 * @param {string} virtualProxy - Virtual proxy prefix
 * @param {object} influxTags - Tags to associate with metrics in InfluxDB
+ * @param {number} retryCount - Current retry attempt number (used internally for recursion). Defaults to 0.
 * @returns {Promise<void>} Promise that resolves when the operation is complete
 */
-export async function getProxySessionStatsFromSense(serverName, host, virtualProxy, influxTags) {
+export async function getProxySessionStatsFromSense(
+    serverName,
+    host,
+    virtualProxy,
+    influxTags,
+    retryCount = 0
+) {
    // Current user sessions are retrived using this API:
    // https://help.qlik.com/en-US/sense-developer/February2021/Subsystems/ProxyServiceAPI/Content/Sense_ProxyServiceAPI/ProxyServiceAPI-Proxy-API.htm

@@ -236,6 +244,17 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
        rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
    });

+    // Get timeout and retry settings from config with fallback defaults
+    const timeout = globals.config.has('Butler-SOS.userSessions.timeoutMilliseconds')
+        ? globals.config.get('Butler-SOS.userSessions.timeoutMilliseconds')
+        : 30000;
+    const maxRetries = globals.config.has('Butler-SOS.userSessions.maxRetries')
+        ? globals.config.get('Butler-SOS.userSessions.maxRetries')
+        : 3;
+    const retryDelay = globals.config.has('Butler-SOS.userSessions.retryDelayMilliseconds')
+        ? globals.config.get('Butler-SOS.userSessions.retryDelayMilliseconds')
+        : 1000;
+
    const vP = virtualProxy === '/' ? '' : `${virtualProxy}`;
    const requestSettings = {
        url: `https://${host}/qps${vP}/session?Xrfkey=abcdefghij987654`,
@@ -247,7 +266,7 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
            XVirtualProxy: virtualProxy,
        },
        httpsAgent,
-        timeout: 5000,
+        timeout,
        maxRedirects: 5,
    };

@@ -316,9 +335,44 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
            }
        }
    } catch (err) {
+        // Check if we should retry based on error type and retry count
+        const shouldRetry =
+            retryCount < maxRetries &&
+            (err.code === 'ECONNABORTED' || // Timeout
+                err.code === 'ECONNRESET' || // Connection reset
+                err.code === 'ETIMEDOUT' || // Network timeout
+                err.code === 'ENOTFOUND' || // DNS lookup failed
+                err.code === 'ENETUNREACH'); // Network unreachable
+
+        if (shouldRetry) {
+            // Calculate exponential backoff delay
+            const delay = retryDelay * Math.pow(2, retryCount);
+            globals.logger.warn(
+                `PROXY SESSIONS: Error calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}. Retrying in ${delay}ms (attempt ${retryCount + 1}/${maxRetries})...`
+            );
+
+            // Wait before retrying
+            await new Promise((resolve) => setTimeout(resolve, delay));
+
+            // Recursive retry
+            return getProxySessionStatsFromSense(
+                serverName,
+                host,
+                virtualProxy,
+                influxTags,
+                retryCount + 1
+            );
+        }
+
+        // Final error after all retries exhausted or non-retryable error
        globals.logger.error(
            `PROXY SESSIONS: Error when calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}`
        );
+        if (retryCount > 0) {
+            globals.logger.error(
+                `PROXY SESSIONS: Failed after ${retryCount} ${retryCount === 1 ? 'retry' : 'retries'}`
+            );
+        }
    }
 }