Files
impala/tests/common/failure_injector.py
Casey Ching 074e5b4349 Remove hashbang from non-script python files
Many python files had a hashbang and the executable bit set though
they were not intended to be run as a standalone script. That makes
determining which python files are actually scripts very difficult.
A future patch will update the hashbang in real python scripts so they
use $IMPALA_HOME/bin/impala-python.

Change-Id: I04eafdc73201feefe65b85817a00474e182ec2ba
Reviewed-on: http://gerrit.cloudera.org:8080/599
Reviewed-by: Casey Ching <casey@cloudera.com>
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: Internal Jenkins
2015-08-04 05:26:07 +00:00

91 lines
3.7 KiB
Python

# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Failure injection module for the Impala service. There are two main ways this module
# can be used - the first is to initialize the failure injector and then call start()
# which will kick off a timer that chooses a random impalad/state store process
# to fail each time timer fires.
# The second way this module can be used is to initialize it and call the actions
# directly (ex. kill_random_impalad()). This provides a bit more control over exactly
# when a failure will happen and is useful for targeted test scenarios.
import logging
import os
import sys
import time
from tests.common.impala_cluster import *
from random import choice
from threading import Timer
# Configure root logging at import time so every message is prefixed with the name of
# the thread that emitted it, then create the logger used throughout this module.
logging.basicConfig(
    format='%(threadName)s: %(message)s',
    level=logging.INFO)
LOG = logging.getLogger('failure-injector')
# This class is used for injecting failures for the Impala service.
class FailureInjector(object):
    """Injects process failures into an Impala cluster.

    Two usage modes are supported:
      * Timer driven: call start() to fire a randomly chosen failure action every
        `failure_frequency` seconds until cancel() is called.
      * Direct: call kill_random_impalad(), kill_impalad() or kill_state_store()
        at the exact point a failure is wanted.
    """

    def __init__(self, impala_cluster, failure_frequency, impalad_exclude_list=None):
        """
        Initializes the FailureInjector object.

        impala_cluster - An ImpalaCluster object (see the impala_cluster module)
        failure_frequency - Interval to fire timer (in seconds)
        impalad_exclude_list - A list of impalad host:port name to not inject failures
        on. Useful to filter out the coordinator. Defaults to no exclusions.
        """
        self.cluster = impala_cluster
        self.cluster.get_impala_service().set_process_auto_restart_config(value=True)
        # TODO: Do we need to restart the impala service to apply this?
        # self.cluster.get_impala_service().restart()
        self.failure_frequency = failure_frequency
        num_impalad_procs = len(
            self.cluster.get_impala_service().get_all_impalad_processes())
        # Normalise a missing exclude list to an empty one so membership tests in
        # kill_random_impalad() never run against None. A caller-supplied list is
        # stored by reference, as before.
        if impalad_exclude_list is None:
            impalad_exclude_list = []
        self.impalad_exclude_list = impalad_exclude_list
        # No timer exists until start() is called. cancel() relies on this attribute
        # being defined, so it must be initialised here.
        self.__timer = None

        # Build a weighted list of possible actions. This is done using a trivial
        # approach where we just add the item multiple times (weight value) into the
        # action list.
        # TODO: Provide a way to dynamically configure the weights
        actions_with_weights = [
            (self.kill_random_impalad, num_impalad_procs * 2),
            (self.kill_state_store, 1),
        ]
        self.possible_actions = []
        for action, weight in actions_with_weights:
            self.possible_actions.extend([action] * weight)

    def start(self):
        """ Starts the timer, triggering failures for the specified interval """
        self.__start_timer()

    def cancel(self):
        """ Stops the timer, canceling any additional failures from occurring.

        Safe to call before start(); in that case it does nothing.
        """
        timer = self.__timer
        if timer is not None:
            timer.cancel()

    def kill_random_impalad(self):
        """ Kills a randomly selected impalad instance not in the exclude list.

        Raises IndexError (from random.choice) if every impalad is excluded.
        """
        excluded = self.impalad_exclude_list or ()
        all_impalads = self.cluster.get_impala_service().get_all_impalad_processes()
        # A real list (rather than filter()) is required because choice() needs a
        # sequence; filter() only returns one on Python 2.
        candidates = [impalad for impalad in all_impalads
                      if '%s:%d' % (impalad.hostname, impalad.be_port) not in excluded]
        self.kill_impalad(choice(candidates))

    def kill_impalad(self, impalad):
        """ Kills the specified impalad instance """
        LOG.info('Chose impalad on "%s" to kill', impalad.hostname)
        impalad.kill()

    def kill_state_store(self):
        """ Kills the statestore process """
        state_store = self.cluster.get_impala_service().get_state_store_process()
        LOG.info('Chose statestore on "%s" to kill', state_store.hostname)
        state_store.kill()

    def __start_timer(self):
        """ Starts a new timer, cancelling the previous timer if it is running """
        self.cancel()
        self.__timer = Timer(self.failure_frequency, self.__choose_action)
        self.__timer.start()

    def __choose_action(self):
        """ Chooses a failure action to perform, runs it, then re-arms the timer.

        If the action raises, the timer is not re-armed.
        """
        action = choice(self.possible_actions)
        LOG.info('Executing action: %s', action)
        action()
        self.__start_timer()