Add configurable timeout and retry logic for Qlik Sense API calls

- Add timeoutMilliseconds, maxRetries, and retryDelayMilliseconds config options for both health check and proxy session APIs
- Implement exponential backoff retry logic with configurable parameters
- Change healthmetrics.js to use async/await pattern for better error handling
- Process health checks concurrently with Promise.allSettled for better performance
- Default timeout increased from 5s to 30s to handle slower connections (e.g. Wi-Fi)
- Default retry count set to 3 retries (after the initial attempt) with a 1s initial delay that doubles on each retry
- Add comprehensive logging for retry attempts and final failures

Co-authored-by: mountaindude <1029262+mountaindude@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2025-12-18 06:13:05 +00:00
parent f7679cb8c2
commit c444156871
3 changed files with 158 additions and 56 deletions

View File

@@ -549,6 +549,9 @@ Butler-SOS:
enableSessionExtract: true # Query unique user IDs of what users have sessions open (true/false)?
# Items below are mandatory if enableSessionExtract=true
pollingInterval: 30000 # How often (milliseconds) should session data be polled?
timeoutMilliseconds: 30000 # HTTP request timeout (milliseconds) for proxy session API calls. Default: 30000
maxRetries: 3 # Number of times to retry failed API calls. Default: 3
retryDelayMilliseconds: 1000 # Initial delay between retries (milliseconds). Doubles for each retry. Default: 1000
excludeUser: # Optional blacklist of users that should be disregarded when it comes to session monitoring.
# Blacklist is only applied to data in InfluxDB. All session data will be sent to MQTT.
# - directory: LAB
@@ -558,6 +561,9 @@ Butler-SOS:
serversToMonitor:
pollingInterval: 30000 # How often (milliseconds) should the healthcheck API be polled?
timeoutMilliseconds: 30000 # HTTP request timeout (milliseconds) for health check API calls. Default: 30000
maxRetries: 3 # Number of times to retry failed API calls. Default: 3
retryDelayMilliseconds: 1000 # Initial delay between retries (milliseconds). Doubles for each retry. Default: 1000
# If false, Butler SOS will accept TLS certificates on the server without verifying them with the CA.
# If true, data will only be retrieved from the Sense server if that server's TLS cert verifies

View File

@@ -20,14 +20,16 @@ import { getCertificates, createCertificateOptions } from './cert-utils.js';
*
* This function makes an HTTPS request to the Sense engine healthcheck API and
* distributes the data to configured destinations (MQTT, InfluxDB, New Relic, Prometheus).
* Implements retry logic with exponential backoff for transient network failures.
*
* @param {string} serverName - The name of the server as defined in the config.
* @param {string} host - The hostname or IP address of the Sense server.
* @param {object} tags - Tags/metadata to associate with the server metrics.
* @param {object|null} headers - Additional headers to include in the request.
* @returns {void}
* @param {number} retryCount - Current retry attempt number (used internally for recursion). Defaults to 0.
* @returns {Promise<void>}
*/
export function getHealthStatsFromSense(serverName, host, tags, headers) {
export async function getHealthStatsFromSense(serverName, host, tags, headers, retryCount = 0) {
globals.logger.debug(`HEALTH: URL=https://${host}/engine/healthcheck`);
// Get certificate configuration options
@@ -49,6 +51,17 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
});
// Get timeout and retry settings from config with fallback defaults
const timeout = globals.config.has('Butler-SOS.serversToMonitor.timeoutMilliseconds')
? globals.config.get('Butler-SOS.serversToMonitor.timeoutMilliseconds')
: 30000;
const maxRetries = globals.config.has('Butler-SOS.serversToMonitor.maxRetries')
? globals.config.get('Butler-SOS.serversToMonitor.maxRetries')
: 3;
const retryDelay = globals.config.has('Butler-SOS.serversToMonitor.retryDelayMilliseconds')
? globals.config.get('Butler-SOS.serversToMonitor.retryDelayMilliseconds')
: 1000;
const requestSettings = {
url: `https://${host}/engine/healthcheck`,
method: 'get',
@@ -57,7 +70,7 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
'Content-Type': 'application/json',
},
httpsAgent,
timeout: 5000,
timeout,
maxRedirects: 5,
};
@@ -71,43 +84,71 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
});
}
axios
.request(requestSettings)
.then((response) => {
if (response.status === 200) {
globals.logger.verbose(`HEALTH: Received ok response from ${tags.host}`);
globals.logger.debug(`HEALTH: ${JSON.stringify(response.data)}`);
try {
const response = await axios.request(requestSettings);
// Post to MQTT
if (globals.config.get('Butler-SOS.mqttConfig.enable') === true) {
globals.logger.debug('HEALTH: Calling HEALTH metrics MQTT posting method');
postHealthToMQTT(host, tags.host, response.data);
}
if (response.status === 200) {
globals.logger.verbose(`HEALTH: Received ok response from ${tags.host}`);
globals.logger.debug(`HEALTH: ${JSON.stringify(response.data)}`);
// Post to Influxdb
if (globals.config.get('Butler-SOS.influxdbConfig.enable') === true) {
globals.logger.debug('HEALTH: Calling HEALTH metrics Influxdb posting method');
postHealthMetricsToInfluxdb(serverName, host, response.data, tags);
}
// Post to New Relic
if (globals.config.get('Butler-SOS.newRelic.enable') === true) {
globals.logger.debug('HEALTH: Calling HEALTH metrics New Relic posting method');
postHealthMetricsToNewRelic(host, response.data, tags);
}
// Save latest available data for Prometheus
if (globals.config.get('Butler-SOS.prometheus.enable') === true) {
globals.logger.debug('HEALTH: Calling HEALTH metrics Prometheus method');
saveHealthMetricsToPrometheus(host, response.data, tags);
}
// Post to MQTT
if (globals.config.get('Butler-SOS.mqttConfig.enable') === true) {
globals.logger.debug('HEALTH: Calling HEALTH metrics MQTT posting method');
postHealthToMQTT(host, tags.host, response.data);
}
})
.catch((err) => {
globals.logger.error(
`HEALTH: Error when calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}`
// Post to Influxdb
if (globals.config.get('Butler-SOS.influxdbConfig.enable') === true) {
globals.logger.debug('HEALTH: Calling HEALTH metrics Influxdb posting method');
postHealthMetricsToInfluxdb(serverName, host, response.data, tags);
}
// Post to New Relic
if (globals.config.get('Butler-SOS.newRelic.enable') === true) {
globals.logger.debug('HEALTH: Calling HEALTH metrics New Relic posting method');
postHealthMetricsToNewRelic(host, response.data, tags);
}
// Save latest available data for Prometheus
if (globals.config.get('Butler-SOS.prometheus.enable') === true) {
globals.logger.debug('HEALTH: Calling HEALTH metrics Prometheus method');
saveHealthMetricsToPrometheus(host, response.data, tags);
}
}
} catch (err) {
// Check if we should retry based on error type and retry count
const shouldRetry =
retryCount < maxRetries &&
(err.code === 'ECONNABORTED' || // Timeout
err.code === 'ECONNRESET' || // Connection reset
err.code === 'ETIMEDOUT' || // Network timeout
err.code === 'ENOTFOUND' || // DNS lookup failed
err.code === 'ENETUNREACH'); // Network unreachable
if (shouldRetry) {
// Calculate exponential backoff delay
const delay = retryDelay * Math.pow(2, retryCount);
globals.logger.warn(
`HEALTH: Error calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}. Retrying in ${delay}ms (attempt ${retryCount + 1}/${maxRetries})...`
);
});
// Wait before retrying
await new Promise((resolve) => setTimeout(resolve, delay));
// Recursive retry
return getHealthStatsFromSense(serverName, host, tags, headers, retryCount + 1);
}
// Final error after all retries exhausted or non-retryable error
globals.logger.error(
`HEALTH: Error when calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}`
);
if (retryCount > 0) {
globals.logger.error(
`HEALTH: Failed after ${retryCount} ${retryCount === 1 ? 'retry' : 'retries'}`
);
}
}
}
/**
@@ -120,29 +161,30 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
*/
export function setupHealthMetricsTimer() {
// Configure timer for getting healthcheck data
setInterval(() => {
setInterval(async () => {
globals.logger.verbose('HEALTH: Event started: Statistics collection');
globals.serverList.forEach((server) => {
globals.logger.verbose(`HEALTH: Getting stats for server: ${server.serverName}`);
globals.logger.debug(`HEALTH: Server details: ${JSON.stringify(server)}`);
// Process all servers concurrently with error handling
const healthCheckPromises = globals.serverList.map(async (server) => {
try {
globals.logger.verbose(`HEALTH: Getting stats for server: ${server.serverName}`);
globals.logger.debug(`HEALTH: Server details: ${JSON.stringify(server)}`);
// Get per-server tags
const tags = getServerTags(globals.logger, server);
// Get per-server tags
const tags = getServerTags(globals.logger, server);
// Save tags to global variable.
// Add a new object to the array, with properties host and tags.
// The tags property is an array with all the tags for the server.
// Each tag object has a name and a value.
// globals.serverTags.push({
// host: server.host,
// tags,
// });
// Get per-server headers
const headers = getServerHeaders(server);
// Get per-server headers
const headers = getServerHeaders(server);
getHealthStatsFromSense(server.serverName, server.host, tags, headers);
await getHealthStatsFromSense(server.serverName, server.host, tags, headers);
} catch (err) {
globals.logger.error(
`HEALTH: Unexpected error processing health stats for server '${server.serverName}': ${globals.getErrorMessage(err)}`
);
}
});
// Wait for all health checks to complete
await Promise.allSettled(healthCheckPromises);
}, globals.config.get('Butler-SOS.serversToMonitor.pollingInterval'));
}

View File

@@ -206,14 +206,22 @@ function prepUserSessionMetrics(serverName, host, virtualProxy, body, tags) {
* This function makes an API call to the Qlik Sense Proxy API to get information about
* active user sessions. It then processes this data and sends it to configured destinations
* (MQTT, InfluxDB, New Relic, Prometheus).
* Implements retry logic with exponential backoff for transient network failures.
*
* @param {string} serverName - Name of the Qlik Sense server
* @param {string} host - Host name or IP of the Qlik Sense server
* @param {string} virtualProxy - Virtual proxy prefix
* @param {object} influxTags - Tags to associate with metrics in InfluxDB
* @param {number} retryCount - Current retry attempt number (used internally for recursion). Defaults to 0.
* @returns {Promise<void>} Promise that resolves when the operation is complete
*/
export async function getProxySessionStatsFromSense(serverName, host, virtualProxy, influxTags) {
export async function getProxySessionStatsFromSense(
serverName,
host,
virtualProxy,
influxTags,
retryCount = 0
) {
// Current user sessions are retrieved using this API:
// https://help.qlik.com/en-US/sense-developer/February2021/Subsystems/ProxyServiceAPI/Content/Sense_ProxyServiceAPI/ProxyServiceAPI-Proxy-API.htm
@@ -236,6 +244,17 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
});
// Get timeout and retry settings from config with fallback defaults
const timeout = globals.config.has('Butler-SOS.userSessions.timeoutMilliseconds')
? globals.config.get('Butler-SOS.userSessions.timeoutMilliseconds')
: 30000;
const maxRetries = globals.config.has('Butler-SOS.userSessions.maxRetries')
? globals.config.get('Butler-SOS.userSessions.maxRetries')
: 3;
const retryDelay = globals.config.has('Butler-SOS.userSessions.retryDelayMilliseconds')
? globals.config.get('Butler-SOS.userSessions.retryDelayMilliseconds')
: 1000;
const vP = virtualProxy === '/' ? '' : `${virtualProxy}`;
const requestSettings = {
url: `https://${host}/qps${vP}/session?Xrfkey=abcdefghij987654`,
@@ -247,7 +266,7 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
XVirtualProxy: virtualProxy,
},
httpsAgent,
timeout: 5000,
timeout,
maxRedirects: 5,
};
@@ -316,9 +335,44 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
}
}
} catch (err) {
// Check if we should retry based on error type and retry count
const shouldRetry =
retryCount < maxRetries &&
(err.code === 'ECONNABORTED' || // Timeout
err.code === 'ECONNRESET' || // Connection reset
err.code === 'ETIMEDOUT' || // Network timeout
err.code === 'ENOTFOUND' || // DNS lookup failed
err.code === 'ENETUNREACH'); // Network unreachable
if (shouldRetry) {
// Calculate exponential backoff delay
const delay = retryDelay * Math.pow(2, retryCount);
globals.logger.warn(
`PROXY SESSIONS: Error calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}. Retrying in ${delay}ms (attempt ${retryCount + 1}/${maxRetries})...`
);
// Wait before retrying
await new Promise((resolve) => setTimeout(resolve, delay));
// Recursive retry
return getProxySessionStatsFromSense(
serverName,
host,
virtualProxy,
influxTags,
retryCount + 1
);
}
// Final error after all retries exhausted or non-retryable error
globals.logger.error(
`PROXY SESSIONS: Error when calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}`
);
if (retryCount > 0) {
globals.logger.error(
`PROXY SESSIONS: Failed after ${retryCount} ${retryCount === 1 ? 'retry' : 'retries'}`
);
}
}
}