mirror of
https://github.com/apache/impala.git
synced 2025-12-30 21:02:41 -05:00
This adds initial changes for the Impala failure testing library. It also refactors run workload into its own module so it can be used in other tests. The failure testing has two main components - the first is an object model on top of Impala services in a cluster. This allows for enumerating the services in the cluster and executing commands on remote machines. This initial cut is built on top of the CM service to help with starting/stopping services. The long term goal is to let this run on both a CM cluster and non-CM cluster as well as locally. The other part of the failure injection change is the failure_injector module that uses the Impala service abstraction to select and inject failures into random impala services. This failure testing framework hasn't been completely validated because the product code is not yet ready, but it is important to get this checked in so all new changes to run-workload are based off this refactor. Change-Id: I73bf44f0ac881ec17bea7cb05d850b45e2ea5be5
92 lines
3.7 KiB
Python
Executable File
92 lines
3.7 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# Failure injection module for the Impala service. There are two main ways this module
|
|
# can be used - the first is to initialize the failure injector and then call start()
|
|
# which will kick off a timer that chooses a random impalad/state store process
|
|
# to fail each time the timer fires.
|
|
# The second way this module can be used is to initialize it and call the actions
|
|
# directly (ex. kill_random_impalad()). This provides a bit more control over exactly
|
|
# when a failure will happen and is useful for targeted test scenarios.
|
|
import logging
|
|
import os
|
|
import sys
|
|
import time
|
|
from tests.common.impala_cluster import *
|
|
from random import choice
|
|
from threading import Timer
|
|
|
|
# Configure root logging at import time: INFO level, with every message prefixed by the
# name of the thread that emitted it.
logging.basicConfig(level=logging.INFO, format='%(threadName)s: %(message)s')
|
|
# Module-level logger, registered under the name 'failure-injector'.
LOG = logging.getLogger('failure-injector')
|
|
|
|
# This class is used for injecting failures for the Impala service.
|
|
class FailureInjector(object):
  """Injects process failures into the services of an Impala cluster.

  There are two ways to drive the injector:
    * call start() to arm a timer that runs one randomly chosen failure action every
      `failure_frequency` seconds until cancel() is called, or
    * call the individual actions (ex. kill_random_impalad()) directly when a test
      needs precise control over when a failure happens.
  """

  def __init__(self, impala_cluster, failure_frequency, impalad_exclude_list=None):
    """
    Initializes the FailureInjector object.

    impala_cluster - An ImpalaCluster object (see the impala_cluster module)
    failure_frequency - Interval to fire timer (in seconds)
    impalad_exclude_list - A list of impalad host:be_port names to not inject failures
                           on. Useful to filter out the coordinator. When omitted, no
                           impalad is excluded.
    """
    # Timer state must exist before start()/cancel() can be called; both read it.
    self.__timer = None
    self.__stopped = True

    self.cluster = impala_cluster
    self.cluster.get_impala_service().set_process_auto_restart_config(value=True)
    # TODO: Do we need to restart the impala service to apply this?
    # self.cluster.get_impala_service().restart()
    self.failure_frequency = failure_frequency
    num_impalad_procs = len(self.cluster.get_impala_service().get_all_impalad_processes())

    # Normalize a missing exclude list to an empty one so membership tests never see
    # None. A caller-supplied list is kept by reference, as before.
    if impalad_exclude_list is None:
      impalad_exclude_list = []
    self.impalad_exclude_list = impalad_exclude_list

    # Build a weighted list of possible actions. This is done using a trivial approach
    # where we just add the item multiple times (weight value) into the action list.
    # TODO: Provide a way to dynamically configure the weights
    actions_with_weights = [(self.kill_random_impalad, num_impalad_procs * 2),
                            (self.kill_state_store, 1)]

    self.possible_actions = list()
    for action, weight in actions_with_weights:
      self.possible_actions.extend([action] * weight)

  def start(self):
    """ Starts the timer, triggering failures for the specified interval """
    self.__stopped = False
    self.__start_timer()

  def cancel(self):
    """ Stops the timer, canceling any additional failures from occurring """
    # Record the stop request first so that an action currently running on the timer
    # thread does not re-arm the timer once it finishes.
    self.__stopped = True
    self.__cancel_timer()

  def kill_random_impalad(self):
    """
    Kills a randomly selected impalad instance not in the exclude list.

    If every impalad is excluded (or the service reports none) nothing is killed and a
    warning is logged instead.
    """
    excluded = self.impalad_exclude_list or ()
    all_impalads = self.cluster.get_impala_service().get_all_impalad_processes()
    # Build a real list: random.choice() needs a sequence, and filter() only returns
    # one on Python 2.
    candidates = [impalad for impalad in all_impalads
                  if '%s:%d' % (impalad.hostname, impalad.be_port) not in excluded]
    if not candidates:
      LOG.warning('No impalad is eligible to be killed; skipping this action')
      return
    self.kill_impalad(choice(candidates))

  def kill_impalad(self, impalad):
    """ Kills the specified impalad instance """
    LOG.info('Chose impalad on "%s" to kill', impalad.hostname)
    impalad.kill()

  def kill_state_store(self):
    """ Kills the statestore process """
    state_store = self.cluster.get_impala_service().get_state_store_process()
    LOG.info('Chose statestore on "%s" to kill', state_store.hostname)
    state_store.kill()

  def __cancel_timer(self):
    """ Cancels the pending timer, if there is one """
    if self.__timer is not None:
      self.__timer.cancel()

  def __start_timer(self):
    """ Starts a new timer, cancelling the previous timer if it is running """
    self.__cancel_timer()
    self.__timer = Timer(self.failure_frequency, self.__choose_action)
    self.__timer.start()

  def __choose_action(self):
    """ Chooses a failure action to perform, then schedules the next one """
    action = choice(self.possible_actions)
    LOG.info('Executing action: %s', action)
    action()
    # Only schedule the next failure if cancel() was not called in the meantime.
    if not self.__stopped:
      self.__start_timer()
|