mirror of
https://github.com/ptarmiganlabs/butler-sos.git
synced 2025-12-19 09:47:53 -05:00
Add configurable timeout and retry logic for Qlik Sense API calls
- Add timeout, maxRetries, and retryDelayMilliseconds config options for both health check and proxy session APIs
- Implement exponential backoff retry logic with configurable parameters
- Change healthmetrics.js to use async/await pattern for better error handling
- Process health checks concurrently with Promise.allSettled for better performance
- Default timeout increased from 5s to 30s to handle slower/wifi connections
- Default retry count set to 3 attempts with 1s initial delay
- Add comprehensive logging for retry attempts and final failures

Co-authored-by: mountaindude <1029262+mountaindude@users.noreply.github.com>
This commit is contained in:
@@ -549,6 +549,9 @@ Butler-SOS:
|
||||
enableSessionExtract: true # Query unique user IDs of what users have sessions open (true/false)?
|
||||
# Items below are mandatory if enableSessionExtract=true
|
||||
pollingInterval: 30000 # How often (milliseconds) should session data be polled?
|
||||
timeoutMilliseconds: 30000 # HTTP request timeout (milliseconds) for proxy session API calls. Default: 30000
|
||||
maxRetries: 3 # Number of times to retry failed API calls. Default: 3
|
||||
retryDelayMilliseconds: 1000 # Initial delay between retries (milliseconds). Doubles for each retry. Default: 1000
|
||||
excludeUser: # Optional blacklist of users that should be disregarded when it comes to session monitoring.
|
||||
# Blacklist is only applied to data in InfluxDB. All session data will be sent to MQTT.
|
||||
# - directory: LAB
|
||||
@@ -558,6 +561,9 @@ Butler-SOS:
|
||||
|
||||
serversToMonitor:
|
||||
pollingInterval: 30000 # How often (milliseconds) should the healthcheck API be polled?
|
||||
timeoutMilliseconds: 30000 # HTTP request timeout (milliseconds) for health check API calls. Default: 30000
|
||||
maxRetries: 3 # Number of times to retry failed API calls. Default: 3
|
||||
retryDelayMilliseconds: 1000 # Initial delay between retries (milliseconds). Doubles for each retry. Default: 1000
|
||||
|
||||
# If false, Butler SOS will accept TLS certificates on the server without verifying them with the CA.
|
||||
# If true, data will only be retrieved from the Sense server if that server's TLS cert verifies
|
||||
|
||||
@@ -20,14 +20,16 @@ import { getCertificates, createCertificateOptions } from './cert-utils.js';
|
||||
*
|
||||
* This function makes an HTTPS request to the Sense engine healthcheck API and
|
||||
* distributes the data to configured destinations (MQTT, InfluxDB, New Relic, Prometheus).
|
||||
* Implements retry logic with exponential backoff for transient network failures.
|
||||
*
|
||||
* @param {string} serverName - The name of the server as defined in the config.
|
||||
* @param {string} host - The hostname or IP address of the Sense server.
|
||||
* @param {object} tags - Tags/metadata to associate with the server metrics.
|
||||
* @param {object|null} headers - Additional headers to include in the request.
|
||||
* @returns {void}
|
||||
* @param {number} retryCount - Current retry attempt number (used internally for recursion). Defaults to 0.
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
||||
export async function getHealthStatsFromSense(serverName, host, tags, headers, retryCount = 0) {
|
||||
globals.logger.debug(`HEALTH: URL=https://${host}/engine/healthcheck`);
|
||||
|
||||
// Get certificate configuration options
|
||||
@@ -49,6 +51,17 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
||||
rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
|
||||
});
|
||||
|
||||
// Get timeout and retry settings from config with fallback defaults
|
||||
const timeout = globals.config.has('Butler-SOS.serversToMonitor.timeoutMilliseconds')
|
||||
? globals.config.get('Butler-SOS.serversToMonitor.timeoutMilliseconds')
|
||||
: 30000;
|
||||
const maxRetries = globals.config.has('Butler-SOS.serversToMonitor.maxRetries')
|
||||
? globals.config.get('Butler-SOS.serversToMonitor.maxRetries')
|
||||
: 3;
|
||||
const retryDelay = globals.config.has('Butler-SOS.serversToMonitor.retryDelayMilliseconds')
|
||||
? globals.config.get('Butler-SOS.serversToMonitor.retryDelayMilliseconds')
|
||||
: 1000;
|
||||
|
||||
const requestSettings = {
|
||||
url: `https://${host}/engine/healthcheck`,
|
||||
method: 'get',
|
||||
@@ -57,7 +70,7 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
httpsAgent,
|
||||
timeout: 5000,
|
||||
timeout,
|
||||
maxRedirects: 5,
|
||||
};
|
||||
|
||||
@@ -71,43 +84,71 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
||||
});
|
||||
}
|
||||
|
||||
axios
|
||||
.request(requestSettings)
|
||||
.then((response) => {
|
||||
if (response.status === 200) {
|
||||
globals.logger.verbose(`HEALTH: Received ok response from ${tags.host}`);
|
||||
globals.logger.debug(`HEALTH: ${JSON.stringify(response.data)}`);
|
||||
try {
|
||||
const response = await axios.request(requestSettings);
|
||||
|
||||
// Post to MQTT
|
||||
if (globals.config.get('Butler-SOS.mqttConfig.enable') === true) {
|
||||
globals.logger.debug('HEALTH: Calling HEALTH metrics MQTT posting method');
|
||||
postHealthToMQTT(host, tags.host, response.data);
|
||||
}
|
||||
if (response.status === 200) {
|
||||
globals.logger.verbose(`HEALTH: Received ok response from ${tags.host}`);
|
||||
globals.logger.debug(`HEALTH: ${JSON.stringify(response.data)}`);
|
||||
|
||||
// Post to Influxdb
|
||||
if (globals.config.get('Butler-SOS.influxdbConfig.enable') === true) {
|
||||
globals.logger.debug('HEALTH: Calling HEALTH metrics Influxdb posting method');
|
||||
postHealthMetricsToInfluxdb(serverName, host, response.data, tags);
|
||||
}
|
||||
|
||||
// Post to New Relic
|
||||
if (globals.config.get('Butler-SOS.newRelic.enable') === true) {
|
||||
globals.logger.debug('HEALTH: Calling HEALTH metrics New Relic posting method');
|
||||
postHealthMetricsToNewRelic(host, response.data, tags);
|
||||
}
|
||||
|
||||
// Save latest available data for Prometheus
|
||||
if (globals.config.get('Butler-SOS.prometheus.enable') === true) {
|
||||
globals.logger.debug('HEALTH: Calling HEALTH metrics Prometheus method');
|
||||
saveHealthMetricsToPrometheus(host, response.data, tags);
|
||||
}
|
||||
// Post to MQTT
|
||||
if (globals.config.get('Butler-SOS.mqttConfig.enable') === true) {
|
||||
globals.logger.debug('HEALTH: Calling HEALTH metrics MQTT posting method');
|
||||
postHealthToMQTT(host, tags.host, response.data);
|
||||
}
|
||||
})
|
||||
.catch((err) => {
|
||||
globals.logger.error(
|
||||
`HEALTH: Error when calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}`
|
||||
|
||||
// Post to Influxdb
|
||||
if (globals.config.get('Butler-SOS.influxdbConfig.enable') === true) {
|
||||
globals.logger.debug('HEALTH: Calling HEALTH metrics Influxdb posting method');
|
||||
postHealthMetricsToInfluxdb(serverName, host, response.data, tags);
|
||||
}
|
||||
|
||||
// Post to New Relic
|
||||
if (globals.config.get('Butler-SOS.newRelic.enable') === true) {
|
||||
globals.logger.debug('HEALTH: Calling HEALTH metrics New Relic posting method');
|
||||
postHealthMetricsToNewRelic(host, response.data, tags);
|
||||
}
|
||||
|
||||
// Save latest available data for Prometheus
|
||||
if (globals.config.get('Butler-SOS.prometheus.enable') === true) {
|
||||
globals.logger.debug('HEALTH: Calling HEALTH metrics Prometheus method');
|
||||
saveHealthMetricsToPrometheus(host, response.data, tags);
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
// Check if we should retry based on error type and retry count
|
||||
const shouldRetry =
|
||||
retryCount < maxRetries &&
|
||||
(err.code === 'ECONNABORTED' || // Timeout
|
||||
err.code === 'ECONNRESET' || // Connection reset
|
||||
err.code === 'ETIMEDOUT' || // Network timeout
|
||||
err.code === 'ENOTFOUND' || // DNS lookup failed
|
||||
err.code === 'ENETUNREACH'); // Network unreachable
|
||||
|
||||
if (shouldRetry) {
|
||||
// Calculate exponential backoff delay
|
||||
const delay = retryDelay * Math.pow(2, retryCount);
|
||||
globals.logger.warn(
|
||||
`HEALTH: Error calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}. Retrying in ${delay}ms (attempt ${retryCount + 1}/${maxRetries})...`
|
||||
);
|
||||
});
|
||||
|
||||
// Wait before retrying
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
|
||||
// Recursive retry
|
||||
return getHealthStatsFromSense(serverName, host, tags, headers, retryCount + 1);
|
||||
}
|
||||
|
||||
// Final error after all retries exhausted or non-retryable error
|
||||
globals.logger.error(
|
||||
`HEALTH: Error when calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}`
|
||||
);
|
||||
if (retryCount > 0) {
|
||||
globals.logger.error(
|
||||
`HEALTH: Failed after ${retryCount} ${retryCount === 1 ? 'retry' : 'retries'}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -120,29 +161,30 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
||||
*/
|
||||
export function setupHealthMetricsTimer() {
|
||||
// Configure timer for getting healthcheck data
|
||||
setInterval(() => {
|
||||
setInterval(async () => {
|
||||
globals.logger.verbose('HEALTH: Event started: Statistics collection');
|
||||
|
||||
globals.serverList.forEach((server) => {
|
||||
globals.logger.verbose(`HEALTH: Getting stats for server: ${server.serverName}`);
|
||||
globals.logger.debug(`HEALTH: Server details: ${JSON.stringify(server)}`);
|
||||
// Process all servers concurrently with error handling
|
||||
const healthCheckPromises = globals.serverList.map(async (server) => {
|
||||
try {
|
||||
globals.logger.verbose(`HEALTH: Getting stats for server: ${server.serverName}`);
|
||||
globals.logger.debug(`HEALTH: Server details: ${JSON.stringify(server)}`);
|
||||
|
||||
// Get per-server tags
|
||||
const tags = getServerTags(globals.logger, server);
|
||||
// Get per-server tags
|
||||
const tags = getServerTags(globals.logger, server);
|
||||
|
||||
// Save tags to global variable.
|
||||
// Add a new object to the array, with properties host and tags.
|
||||
// The tags property is an array with all the tags for the server.
|
||||
// Each tag object has a name and a value.
|
||||
// globals.serverTags.push({
|
||||
// host: server.host,
|
||||
// tags,
|
||||
// });
|
||||
// Get per-server headers
|
||||
const headers = getServerHeaders(server);
|
||||
|
||||
// Get per-server headers
|
||||
const headers = getServerHeaders(server);
|
||||
|
||||
getHealthStatsFromSense(server.serverName, server.host, tags, headers);
|
||||
await getHealthStatsFromSense(server.serverName, server.host, tags, headers);
|
||||
} catch (err) {
|
||||
globals.logger.error(
|
||||
`HEALTH: Unexpected error processing health stats for server '${server.serverName}': ${globals.getErrorMessage(err)}`
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
// Wait for all health checks to complete
|
||||
await Promise.allSettled(healthCheckPromises);
|
||||
}, globals.config.get('Butler-SOS.serversToMonitor.pollingInterval'));
|
||||
}
|
||||
|
||||
@@ -206,14 +206,22 @@ function prepUserSessionMetrics(serverName, host, virtualProxy, body, tags) {
|
||||
* This function makes an API call to the Qlik Sense Proxy API to get information about
|
||||
* active user sessions. It then processes this data and sends it to configured destinations
|
||||
* (MQTT, InfluxDB, New Relic, Prometheus).
|
||||
* Implements retry logic with exponential backoff for transient network failures.
|
||||
*
|
||||
* @param {string} serverName - Name of the Qlik Sense server
|
||||
* @param {string} host - Host name or IP of the Qlik Sense server
|
||||
* @param {string} virtualProxy - Virtual proxy prefix
|
||||
* @param {object} influxTags - Tags to associate with metrics in InfluxDB
|
||||
* @param {number} retryCount - Current retry attempt number (used internally for recursion). Defaults to 0.
|
||||
* @returns {Promise<void>} Promise that resolves when the operation is complete
|
||||
*/
|
||||
export async function getProxySessionStatsFromSense(serverName, host, virtualProxy, influxTags) {
|
||||
export async function getProxySessionStatsFromSense(
|
||||
serverName,
|
||||
host,
|
||||
virtualProxy,
|
||||
influxTags,
|
||||
retryCount = 0
|
||||
) {
|
||||
// Current user sessions are retrieved using this API:
|
||||
// https://help.qlik.com/en-US/sense-developer/February2021/Subsystems/ProxyServiceAPI/Content/Sense_ProxyServiceAPI/ProxyServiceAPI-Proxy-API.htm
|
||||
|
||||
@@ -236,6 +244,17 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
|
||||
rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
|
||||
});
|
||||
|
||||
// Get timeout and retry settings from config with fallback defaults
|
||||
const timeout = globals.config.has('Butler-SOS.userSessions.timeoutMilliseconds')
|
||||
? globals.config.get('Butler-SOS.userSessions.timeoutMilliseconds')
|
||||
: 30000;
|
||||
const maxRetries = globals.config.has('Butler-SOS.userSessions.maxRetries')
|
||||
? globals.config.get('Butler-SOS.userSessions.maxRetries')
|
||||
: 3;
|
||||
const retryDelay = globals.config.has('Butler-SOS.userSessions.retryDelayMilliseconds')
|
||||
? globals.config.get('Butler-SOS.userSessions.retryDelayMilliseconds')
|
||||
: 1000;
|
||||
|
||||
const vP = virtualProxy === '/' ? '' : `${virtualProxy}`;
|
||||
const requestSettings = {
|
||||
url: `https://${host}/qps${vP}/session?Xrfkey=abcdefghij987654`,
|
||||
@@ -247,7 +266,7 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
|
||||
XVirtualProxy: virtualProxy,
|
||||
},
|
||||
httpsAgent,
|
||||
timeout: 5000,
|
||||
timeout,
|
||||
maxRedirects: 5,
|
||||
};
|
||||
|
||||
@@ -316,9 +335,44 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
// Check if we should retry based on error type and retry count
|
||||
const shouldRetry =
|
||||
retryCount < maxRetries &&
|
||||
(err.code === 'ECONNABORTED' || // Timeout
|
||||
err.code === 'ECONNRESET' || // Connection reset
|
||||
err.code === 'ETIMEDOUT' || // Network timeout
|
||||
err.code === 'ENOTFOUND' || // DNS lookup failed
|
||||
err.code === 'ENETUNREACH'); // Network unreachable
|
||||
|
||||
if (shouldRetry) {
|
||||
// Calculate exponential backoff delay
|
||||
const delay = retryDelay * Math.pow(2, retryCount);
|
||||
globals.logger.warn(
|
||||
`PROXY SESSIONS: Error calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}. Retrying in ${delay}ms (attempt ${retryCount + 1}/${maxRetries})...`
|
||||
);
|
||||
|
||||
// Wait before retrying
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
|
||||
// Recursive retry
|
||||
return getProxySessionStatsFromSense(
|
||||
serverName,
|
||||
host,
|
||||
virtualProxy,
|
||||
influxTags,
|
||||
retryCount + 1
|
||||
);
|
||||
}
|
||||
|
||||
// Final error after all retries exhausted or non-retryable error
|
||||
globals.logger.error(
|
||||
`PROXY SESSIONS: Error when calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}`
|
||||
);
|
||||
if (retryCount > 0) {
|
||||
globals.logger.error(
|
||||
`PROXY SESSIONS: Failed after ${retryCount} ${retryCount === 1 ? 'retry' : 'retries'}`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user