mirror of
https://github.com/ptarmiganlabs/butler-sos.git
synced 2025-12-19 17:58:18 -05:00
Add configurable timeout and retry logic for Qlik Sense API calls
- Add timeout, maxRetries, and retryDelayMilliseconds config options for both health check and proxy session APIs
- Implement exponential backoff retry logic with configurable parameters
- Change healthmetrics.js to use async/await pattern for better error handling
- Process health checks concurrently with Promise.allSettled for better performance
- Default timeout increased from 5s to 30s to handle slower/wifi connections
- Default retry count set to 3 attempts with 1s initial delay
- Add comprehensive logging for retry attempts and final failures

Co-authored-by: mountaindude <1029262+mountaindude@users.noreply.github.com>
This commit is contained in:
@@ -549,6 +549,9 @@ Butler-SOS:
|
|||||||
enableSessionExtract: true # Query unique user IDs of what users have sessions open (true/false)?
|
enableSessionExtract: true # Query unique user IDs of what users have sessions open (true/false)?
|
||||||
# Items below are mandatory if enableSessionExtract=true
|
# Items below are mandatory if enableSessionExtract=true
|
||||||
pollingInterval: 30000 # How often (milliseconds) should session data be polled?
|
pollingInterval: 30000 # How often (milliseconds) should session data be polled?
|
||||||
|
timeoutMilliseconds: 30000 # HTTP request timeout (milliseconds) for proxy session API calls. Default: 30000
|
||||||
|
maxRetries: 3 # Number of times to retry failed API calls. Default: 3
|
||||||
|
retryDelayMilliseconds: 1000 # Initial delay between retries (milliseconds). Doubles for each retry. Default: 1000
|
||||||
excludeUser: # Optional blacklist of users that should be disregarded when it comes to session monitoring.
|
excludeUser: # Optional blacklist of users that should be disregarded when it comes to session monitoring.
|
||||||
# Blacklist is only applied to data in InfluxDB. All session data will be sent to MQTT.
|
# Blacklist is only applied to data in InfluxDB. All session data will be sent to MQTT.
|
||||||
# - directory: LAB
|
# - directory: LAB
|
||||||
@@ -558,6 +561,9 @@ Butler-SOS:
|
|||||||
|
|
||||||
serversToMonitor:
|
serversToMonitor:
|
||||||
pollingInterval: 30000 # How often (milliseconds) should the healthcheck API be polled?
|
pollingInterval: 30000 # How often (milliseconds) should the healthcheck API be polled?
|
||||||
|
timeoutMilliseconds: 30000 # HTTP request timeout (milliseconds) for health check API calls. Default: 30000
|
||||||
|
maxRetries: 3 # Number of times to retry failed API calls. Default: 3
|
||||||
|
retryDelayMilliseconds: 1000 # Initial delay between retries (milliseconds). Doubles for each retry. Default: 1000
|
||||||
|
|
||||||
# If false, Butler SOS will accept TLS certificates on the server without verifying them with the CA.
|
# If false, Butler SOS will accept TLS certificates on the server without verifying them with the CA.
|
||||||
# If true, data will only be retrieved from the Sense server if that server's TLS cert verifies
|
# If true, data will only be retrieved from the Sense server if that server's TLS cert verifies
|
||||||
|
|||||||
@@ -20,14 +20,16 @@ import { getCertificates, createCertificateOptions } from './cert-utils.js';
|
|||||||
*
|
*
|
||||||
* This function makes an HTTPS request to the Sense engine healthcheck API and
|
* This function makes an HTTPS request to the Sense engine healthcheck API and
|
||||||
* distributes the data to configured destinations (MQTT, InfluxDB, New Relic, Prometheus).
|
* distributes the data to configured destinations (MQTT, InfluxDB, New Relic, Prometheus).
|
||||||
|
* Implements retry logic with exponential backoff for transient network failures.
|
||||||
*
|
*
|
||||||
* @param {string} serverName - The name of the server as defined in the config.
|
* @param {string} serverName - The name of the server as defined in the config.
|
||||||
* @param {string} host - The hostname or IP address of the Sense server.
|
* @param {string} host - The hostname or IP address of the Sense server.
|
||||||
* @param {object} tags - Tags/metadata to associate with the server metrics.
|
* @param {object} tags - Tags/metadata to associate with the server metrics.
|
||||||
* @param {object|null} headers - Additional headers to include in the request.
|
* @param {object|null} headers - Additional headers to include in the request.
|
||||||
* @returns {void}
|
* @param {number} retryCount - Current retry attempt number (used internally for recursion). Defaults to 0.
|
||||||
|
* @returns {Promise<void>}
|
||||||
*/
|
*/
|
||||||
export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
export async function getHealthStatsFromSense(serverName, host, tags, headers, retryCount = 0) {
|
||||||
globals.logger.debug(`HEALTH: URL=https://${host}/engine/healthcheck`);
|
globals.logger.debug(`HEALTH: URL=https://${host}/engine/healthcheck`);
|
||||||
|
|
||||||
// Get certificate configuration options
|
// Get certificate configuration options
|
||||||
@@ -49,6 +51,17 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
|||||||
rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
|
rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Get timeout and retry settings from config with fallback defaults
|
||||||
|
const timeout = globals.config.has('Butler-SOS.serversToMonitor.timeoutMilliseconds')
|
||||||
|
? globals.config.get('Butler-SOS.serversToMonitor.timeoutMilliseconds')
|
||||||
|
: 30000;
|
||||||
|
const maxRetries = globals.config.has('Butler-SOS.serversToMonitor.maxRetries')
|
||||||
|
? globals.config.get('Butler-SOS.serversToMonitor.maxRetries')
|
||||||
|
: 3;
|
||||||
|
const retryDelay = globals.config.has('Butler-SOS.serversToMonitor.retryDelayMilliseconds')
|
||||||
|
? globals.config.get('Butler-SOS.serversToMonitor.retryDelayMilliseconds')
|
||||||
|
: 1000;
|
||||||
|
|
||||||
const requestSettings = {
|
const requestSettings = {
|
||||||
url: `https://${host}/engine/healthcheck`,
|
url: `https://${host}/engine/healthcheck`,
|
||||||
method: 'get',
|
method: 'get',
|
||||||
@@ -57,7 +70,7 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
|||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
},
|
},
|
||||||
httpsAgent,
|
httpsAgent,
|
||||||
timeout: 5000,
|
timeout,
|
||||||
maxRedirects: 5,
|
maxRedirects: 5,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -71,43 +84,71 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
axios
|
try {
|
||||||
.request(requestSettings)
|
const response = await axios.request(requestSettings);
|
||||||
.then((response) => {
|
|
||||||
if (response.status === 200) {
|
|
||||||
globals.logger.verbose(`HEALTH: Received ok response from ${tags.host}`);
|
|
||||||
globals.logger.debug(`HEALTH: ${JSON.stringify(response.data)}`);
|
|
||||||
|
|
||||||
// Post to MQTT
|
if (response.status === 200) {
|
||||||
if (globals.config.get('Butler-SOS.mqttConfig.enable') === true) {
|
globals.logger.verbose(`HEALTH: Received ok response from ${tags.host}`);
|
||||||
globals.logger.debug('HEALTH: Calling HEALTH metrics MQTT posting method');
|
globals.logger.debug(`HEALTH: ${JSON.stringify(response.data)}`);
|
||||||
postHealthToMQTT(host, tags.host, response.data);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Post to Influxdb
|
// Post to MQTT
|
||||||
if (globals.config.get('Butler-SOS.influxdbConfig.enable') === true) {
|
if (globals.config.get('Butler-SOS.mqttConfig.enable') === true) {
|
||||||
globals.logger.debug('HEALTH: Calling HEALTH metrics Influxdb posting method');
|
globals.logger.debug('HEALTH: Calling HEALTH metrics MQTT posting method');
|
||||||
postHealthMetricsToInfluxdb(serverName, host, response.data, tags);
|
postHealthToMQTT(host, tags.host, response.data);
|
||||||
}
|
|
||||||
|
|
||||||
// Post to New Relic
|
|
||||||
if (globals.config.get('Butler-SOS.newRelic.enable') === true) {
|
|
||||||
globals.logger.debug('HEALTH: Calling HEALTH metrics New Relic posting method');
|
|
||||||
postHealthMetricsToNewRelic(host, response.data, tags);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Save latest available data for Prometheus
|
|
||||||
if (globals.config.get('Butler-SOS.prometheus.enable') === true) {
|
|
||||||
globals.logger.debug('HEALTH: Calling HEALTH metrics Prometheus method');
|
|
||||||
saveHealthMetricsToPrometheus(host, response.data, tags);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
})
|
|
||||||
.catch((err) => {
|
// Post to Influxdb
|
||||||
globals.logger.error(
|
if (globals.config.get('Butler-SOS.influxdbConfig.enable') === true) {
|
||||||
`HEALTH: Error when calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}`
|
globals.logger.debug('HEALTH: Calling HEALTH metrics Influxdb posting method');
|
||||||
|
postHealthMetricsToInfluxdb(serverName, host, response.data, tags);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Post to New Relic
|
||||||
|
if (globals.config.get('Butler-SOS.newRelic.enable') === true) {
|
||||||
|
globals.logger.debug('HEALTH: Calling HEALTH metrics New Relic posting method');
|
||||||
|
postHealthMetricsToNewRelic(host, response.data, tags);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save latest available data for Prometheus
|
||||||
|
if (globals.config.get('Butler-SOS.prometheus.enable') === true) {
|
||||||
|
globals.logger.debug('HEALTH: Calling HEALTH metrics Prometheus method');
|
||||||
|
saveHealthMetricsToPrometheus(host, response.data, tags);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
// Check if we should retry based on error type and retry count
|
||||||
|
const shouldRetry =
|
||||||
|
retryCount < maxRetries &&
|
||||||
|
(err.code === 'ECONNABORTED' || // Timeout
|
||||||
|
err.code === 'ECONNRESET' || // Connection reset
|
||||||
|
err.code === 'ETIMEDOUT' || // Network timeout
|
||||||
|
err.code === 'ENOTFOUND' || // DNS lookup failed
|
||||||
|
err.code === 'ENETUNREACH'); // Network unreachable
|
||||||
|
|
||||||
|
if (shouldRetry) {
|
||||||
|
// Calculate exponential backoff delay
|
||||||
|
const delay = retryDelay * Math.pow(2, retryCount);
|
||||||
|
globals.logger.warn(
|
||||||
|
`HEALTH: Error calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}. Retrying in ${delay}ms (attempt ${retryCount + 1}/${maxRetries})...`
|
||||||
);
|
);
|
||||||
});
|
|
||||||
|
// Wait before retrying
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||||
|
|
||||||
|
// Recursive retry
|
||||||
|
return getHealthStatsFromSense(serverName, host, tags, headers, retryCount + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final error after all retries exhausted or non-retryable error
|
||||||
|
globals.logger.error(
|
||||||
|
`HEALTH: Error when calling health check API for server '${serverName}' (${host}): ${globals.getErrorMessage(err)}`
|
||||||
|
);
|
||||||
|
if (retryCount > 0) {
|
||||||
|
globals.logger.error(
|
||||||
|
`HEALTH: Failed after ${retryCount} ${retryCount === 1 ? 'retry' : 'retries'}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -120,29 +161,30 @@ export function getHealthStatsFromSense(serverName, host, tags, headers) {
|
|||||||
*/
|
*/
|
||||||
export function setupHealthMetricsTimer() {
|
export function setupHealthMetricsTimer() {
|
||||||
// Configure timer for getting healthcheck data
|
// Configure timer for getting healthcheck data
|
||||||
setInterval(() => {
|
setInterval(async () => {
|
||||||
globals.logger.verbose('HEALTH: Event started: Statistics collection');
|
globals.logger.verbose('HEALTH: Event started: Statistics collection');
|
||||||
|
|
||||||
globals.serverList.forEach((server) => {
|
// Process all servers concurrently with error handling
|
||||||
globals.logger.verbose(`HEALTH: Getting stats for server: ${server.serverName}`);
|
const healthCheckPromises = globals.serverList.map(async (server) => {
|
||||||
globals.logger.debug(`HEALTH: Server details: ${JSON.stringify(server)}`);
|
try {
|
||||||
|
globals.logger.verbose(`HEALTH: Getting stats for server: ${server.serverName}`);
|
||||||
|
globals.logger.debug(`HEALTH: Server details: ${JSON.stringify(server)}`);
|
||||||
|
|
||||||
// Get per-server tags
|
// Get per-server tags
|
||||||
const tags = getServerTags(globals.logger, server);
|
const tags = getServerTags(globals.logger, server);
|
||||||
|
|
||||||
// Save tags to global variable.
|
// Get per-server headers
|
||||||
// Add a new object to the array, with properties host and tags.
|
const headers = getServerHeaders(server);
|
||||||
// The tags property is an array with all the tags for the server.
|
|
||||||
// Each tag object has a name and a value.
|
|
||||||
// globals.serverTags.push({
|
|
||||||
// host: server.host,
|
|
||||||
// tags,
|
|
||||||
// });
|
|
||||||
|
|
||||||
// Get per-server headers
|
await getHealthStatsFromSense(server.serverName, server.host, tags, headers);
|
||||||
const headers = getServerHeaders(server);
|
} catch (err) {
|
||||||
|
globals.logger.error(
|
||||||
getHealthStatsFromSense(server.serverName, server.host, tags, headers);
|
`HEALTH: Unexpected error processing health stats for server '${server.serverName}': ${globals.getErrorMessage(err)}`
|
||||||
|
);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Wait for all health checks to complete
|
||||||
|
await Promise.allSettled(healthCheckPromises);
|
||||||
}, globals.config.get('Butler-SOS.serversToMonitor.pollingInterval'));
|
}, globals.config.get('Butler-SOS.serversToMonitor.pollingInterval'));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -206,14 +206,22 @@ function prepUserSessionMetrics(serverName, host, virtualProxy, body, tags) {
|
|||||||
* This function makes an API call to the Qlik Sense Proxy API to get information about
|
* This function makes an API call to the Qlik Sense Proxy API to get information about
|
||||||
* active user sessions. It then processes this data and sends it to configured destinations
|
* active user sessions. It then processes this data and sends it to configured destinations
|
||||||
* (MQTT, InfluxDB, New Relic, Prometheus).
|
* (MQTT, InfluxDB, New Relic, Prometheus).
|
||||||
|
* Implements retry logic with exponential backoff for transient network failures.
|
||||||
*
|
*
|
||||||
* @param {string} serverName - Name of the Qlik Sense server
|
* @param {string} serverName - Name of the Qlik Sense server
|
||||||
* @param {string} host - Host name or IP of the Qlik Sense server
|
* @param {string} host - Host name or IP of the Qlik Sense server
|
||||||
* @param {string} virtualProxy - Virtual proxy prefix
|
* @param {string} virtualProxy - Virtual proxy prefix
|
||||||
* @param {object} influxTags - Tags to associate with metrics in InfluxDB
|
* @param {object} influxTags - Tags to associate with metrics in InfluxDB
|
||||||
|
* @param {number} retryCount - Current retry attempt number (used internally for recursion). Defaults to 0.
|
||||||
* @returns {Promise<void>} Promise that resolves when the operation is complete
|
* @returns {Promise<void>} Promise that resolves when the operation is complete
|
||||||
*/
|
*/
|
||||||
export async function getProxySessionStatsFromSense(serverName, host, virtualProxy, influxTags) {
|
export async function getProxySessionStatsFromSense(
|
||||||
|
serverName,
|
||||||
|
host,
|
||||||
|
virtualProxy,
|
||||||
|
influxTags,
|
||||||
|
retryCount = 0
|
||||||
|
) {
|
||||||
// Current user sessions are retrieved using this API:
|
// Current user sessions are retrieved using this API:
|
||||||
// https://help.qlik.com/en-US/sense-developer/February2021/Subsystems/ProxyServiceAPI/Content/Sense_ProxyServiceAPI/ProxyServiceAPI-Proxy-API.htm
|
// https://help.qlik.com/en-US/sense-developer/February2021/Subsystems/ProxyServiceAPI/Content/Sense_ProxyServiceAPI/ProxyServiceAPI-Proxy-API.htm
|
||||||
|
|
||||||
@@ -236,6 +244,17 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
|
|||||||
rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
|
rejectUnauthorized: globals.config.get('Butler-SOS.serversToMonitor.rejectUnauthorized'),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Get timeout and retry settings from config with fallback defaults
|
||||||
|
const timeout = globals.config.has('Butler-SOS.userSessions.timeoutMilliseconds')
|
||||||
|
? globals.config.get('Butler-SOS.userSessions.timeoutMilliseconds')
|
||||||
|
: 30000;
|
||||||
|
const maxRetries = globals.config.has('Butler-SOS.userSessions.maxRetries')
|
||||||
|
? globals.config.get('Butler-SOS.userSessions.maxRetries')
|
||||||
|
: 3;
|
||||||
|
const retryDelay = globals.config.has('Butler-SOS.userSessions.retryDelayMilliseconds')
|
||||||
|
? globals.config.get('Butler-SOS.userSessions.retryDelayMilliseconds')
|
||||||
|
: 1000;
|
||||||
|
|
||||||
const vP = virtualProxy === '/' ? '' : `${virtualProxy}`;
|
const vP = virtualProxy === '/' ? '' : `${virtualProxy}`;
|
||||||
const requestSettings = {
|
const requestSettings = {
|
||||||
url: `https://${host}/qps${vP}/session?Xrfkey=abcdefghij987654`,
|
url: `https://${host}/qps${vP}/session?Xrfkey=abcdefghij987654`,
|
||||||
@@ -247,7 +266,7 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
|
|||||||
XVirtualProxy: virtualProxy,
|
XVirtualProxy: virtualProxy,
|
||||||
},
|
},
|
||||||
httpsAgent,
|
httpsAgent,
|
||||||
timeout: 5000,
|
timeout,
|
||||||
maxRedirects: 5,
|
maxRedirects: 5,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -316,9 +335,44 @@ export async function getProxySessionStatsFromSense(serverName, host, virtualPro
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
|
// Check if we should retry based on error type and retry count
|
||||||
|
const shouldRetry =
|
||||||
|
retryCount < maxRetries &&
|
||||||
|
(err.code === 'ECONNABORTED' || // Timeout
|
||||||
|
err.code === 'ECONNRESET' || // Connection reset
|
||||||
|
err.code === 'ETIMEDOUT' || // Network timeout
|
||||||
|
err.code === 'ENOTFOUND' || // DNS lookup failed
|
||||||
|
err.code === 'ENETUNREACH'); // Network unreachable
|
||||||
|
|
||||||
|
if (shouldRetry) {
|
||||||
|
// Calculate exponential backoff delay
|
||||||
|
const delay = retryDelay * Math.pow(2, retryCount);
|
||||||
|
globals.logger.warn(
|
||||||
|
`PROXY SESSIONS: Error calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}. Retrying in ${delay}ms (attempt ${retryCount + 1}/${maxRetries})...`
|
||||||
|
);
|
||||||
|
|
||||||
|
// Wait before retrying
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||||
|
|
||||||
|
// Recursive retry
|
||||||
|
return getProxySessionStatsFromSense(
|
||||||
|
serverName,
|
||||||
|
host,
|
||||||
|
virtualProxy,
|
||||||
|
influxTags,
|
||||||
|
retryCount + 1
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final error after all retries exhausted or non-retryable error
|
||||||
globals.logger.error(
|
globals.logger.error(
|
||||||
`PROXY SESSIONS: Error when calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}`
|
`PROXY SESSIONS: Error when calling proxy session API for server '${serverName}' (${host}), virtual proxy '${virtualProxy}': ${globals.getErrorMessage(err)}`
|
||||||
);
|
);
|
||||||
|
if (retryCount > 0) {
|
||||||
|
globals.logger.error(
|
||||||
|
`PROXY SESSIONS: Failed after ${retryCount} ${retryCount === 1 ? 'retry' : 'retries'}`
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user