#!/usr/bin/env impala-python

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

'''This module will run random queries against existing databases and compare the
results.

'''
# TODO: IMPALA-4600: refactor this module
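
# Example invocation (a sketch; the exact logging, connection, and db-name flags are
# defined in tests/comparison/cli_options.py, so check that module for their spelling):
#
#   impala-python <this module> --test-db-type=IMPALA --ref-db-type=POSTGRESQL \
#       --query-count=100 --profile=default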

from copy import deepcopy
from decimal import Decimal
from itertools import izip
from logging import getLogger
from math import isinf, isnan
from os import getenv, symlink, unlink
from os.path import join as join_path
from random import choice, randint
from string import ascii_lowercase, digits
from subprocess import call
from tempfile import gettempdir
from threading import current_thread, Thread
from time import time

from tests.comparison.db_types import BigInt
from tests.comparison.db_connection import (
    DbCursor,
    IMPALA,
    HIVE,
    MYSQL,
    ORACLE,
    POSTGRESQL)
from tests.comparison.model_translator import SqlWriter
from tests.comparison.query import (
    FromClause,
    InsertClause,
    InsertStatement,
    Query,
    StatementExecutionMode,
    SelectClause,
    SelectItem)
from tests.comparison.query_flattener import QueryFlattener
from tests.comparison.statement_generator import get_generator
from tests.comparison import db_connection

LOG = getLogger(__name__)


class QueryResultComparator(object):
  '''Used for comparing the results of a Query across two databases'''

  # Used when comparing FLOAT values
  EPSILON = 0.1

  # The DECIMAL values will be rounded before comparison
  DECIMAL_PLACES = 2

  def __init__(self, query_profile, ref_conn,
               test_conn, query_timeout_seconds, flatten_dialect=None):
    '''The test_conn/ref_conn arguments should be instances of DbConnection.'''
    ref_cursor = ref_conn.cursor()
    test_cursor = test_conn.cursor()

    self.ref_conn = ref_conn
    self.ref_sql_writer = SqlWriter.create(
        dialect=ref_conn.db_type, nulls_order_asc=query_profile.nulls_order_asc())
    self.test_conn = test_conn
    self.test_sql_writer = SqlWriter.create(dialect=test_conn.db_type)

    self.query_executor = QueryExecutor(
        [ref_cursor, test_cursor],
        [self.ref_sql_writer, self.test_sql_writer],
        query_timeout_seconds=query_timeout_seconds,
        flatten_dialect=flatten_dialect)

  @property
  def test_db_type(self):
    return self.test_conn.db_type

  @property
  def ref_db_type(self):
    return self.ref_conn.db_type

  def compare_query_results(self, query):
    '''Execute the query, compare the data, and return a ComparisonResult, which
    summarizes the outcome.
    '''
    comparison_result = ComparisonResult(query, self.test_db_type, self.ref_db_type)
    (ref_sql, ref_exception, ref_data_set, ref_cursor_description), (test_sql,
        test_exception, test_data_set, test_cursor_description) = \
        self.query_executor.fetch_query_results(query)

    comparison_result.ref_sql = ref_sql
    comparison_result.test_sql = test_sql

    if ref_exception:
      comparison_result.exception = ref_exception
      error_message = str(ref_exception)
      if 'Year is out of valid range: 1400..9999' in error_message:
        # This comes from Postgresql. Overflow errors will be ignored.
        comparison_result.exception = TypeOverflow(error_message)
      LOG.debug('%s encountered an error running query: %s',
                self.ref_conn.db_type, ref_exception, exc_info=True)
      return comparison_result

    if test_exception:
      # "known errors" will be ignored
      error_message = str(test_exception)
      known_error = None

      if self.test_db_type is db_connection.IMPALA:
        if 'Expressions in the ORDER BY clause must not be constant' in error_message \
            or 'Expressions in the PARTITION BY clause must not be constant' \
            in error_message:
          # It's too much work to avoid this bug. Just ignore it if it comes up.
          known_error = KnownError('IMPALA-1354')
        elif 'GROUP BY expression must not contain aggregate functions' in error_message \
            or 'select list expression not produced by aggregation output' in error_message:
          known_error = KnownError('IMPALA-1423')
        elif ('max(' in error_message or 'min(' in error_message) \
            and 'only supported with an UNBOUNDED PRECEDING start bound' in error_message:
          # This analytic isn't supported and ignoring this here is much easier than not
          # generating the query...
          known_error = KnownError('MAX UNBOUNDED PRECISION')
        elif 'IN and/or EXISTS subquery predicates are not supported in binary predicates' \
            in error_message:
          known_error = KnownError('IMPALA-1418')
        elif 'Unsupported predicate with subquery' in error_message:
          known_error = KnownError('IMPALA-1950')
        elif 'RIGHT OUTER JOIN type with no equi-join' in error_message:
          known_error = KnownError('IMPALA-3063')
        elif 'Operation is in ERROR_STATE' in error_message:
          known_error = KnownError('Mem limit exceeded')
      elif self.test_db_type is db_connection.HIVE:
        if 'ParseException line' in error_message and 'missing ) at' in \
            error_message and query.select_clause and \
            query.select_clause.analytic_items:
          known_error = KnownError("HIVE-14871")

      if known_error:
        comparison_result.exception = known_error
      else:
        comparison_result.exception = test_exception
        LOG.debug('%s encountered an error running query: %s',
                  self.test_conn.db_type, test_exception, exc_info=True)
      return comparison_result

    comparison_result.ref_row_count = len(ref_data_set)
    comparison_result.test_row_count = len(test_data_set)
    comparison_result.query_resulted_in_data = (comparison_result.test_row_count > 0 or
                                                comparison_result.ref_row_count > 0)
    if comparison_result.ref_row_count != comparison_result.test_row_count:
      return comparison_result

    # Standardize data (round FLOATs) in each column, and sort the data set
    for data_set in (ref_data_set, test_data_set):
      for row_idx, row in enumerate(data_set):
        data_set[row_idx] = []
        for col_idx, col in enumerate(row):
          data_set[row_idx].append(self.standardize_data(col,
              ref_cursor_description[col_idx], test_cursor_description[col_idx]))
      # TODO: If the query has an ORDER BY clause, sorting should only be done within
      #       subsets of rows that have the same order by values.
      data_set.sort(cmp=self.row_sort_cmp)

    found_data = False  # Will be set to True if the result contains non-zero/NULL data
    for row_idx, (ref_row, test_row) in enumerate(izip(ref_data_set, test_data_set)):
      for col_idx, (ref_val, test_val) in enumerate(izip(ref_row, test_row)):
        if ref_val or test_val:  # Ignores zeros, ex "SELECT COUNT(*) ... WHERE FALSE"
          found_data = True
        if self.vals_are_equal(ref_val, test_val):
          continue
        if isinstance(test_val, int) \
            and isinstance(ref_val, (int, float, Decimal)) \
            and abs(ref_val) > BigInt.MAX:
          # Impala will return incorrect results if the val is greater than max BigInt
          comparison_result.exception = KnownError('IMPALA-865')
        elif isinstance(test_val, float) \
            and (isinf(test_val) or isnan(test_val)):
          # In some cases, Impala gives NaNs and Infs instead of NULLs
          comparison_result.exception = KnownError('IMPALA-724')
        comparison_result.ref_row = ref_row
        comparison_result.test_row = test_row
        comparison_result.mismatch_at_row_number = row_idx + 1
        comparison_result.mismatch_at_col_number = col_idx + 1
        return comparison_result
    comparison_result.query_resulted_in_data = found_data

    # If we're here, it means ref and test data sets are equal for a DML statement.
    if isinstance(query, (InsertStatement,)):
      comparison_result.modified_rows_count = test_data_set[0][0]

    return comparison_result

  def standardize_data(self, data, ref_col_description, test_col_description):
    '''Return a val that is suitable for comparison.'''
    # For float data we need to round, otherwise differences in precision will cause
    # errors.
    if isinstance(data, float):
      return round(data, self.DECIMAL_PLACES)
    if isinstance(data, Decimal):
      if ref_col_description[5] is not None and test_col_description[5] is not None:
        return round(data, min(ref_col_description[5], test_col_description[5]))
    return data
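
  # Note: index 5 of a PEP 249 cursor.description entry
  # (name, type_code, display_size, internal_size, precision, scale, null_ok) is the
  # column's scale, so the Decimal branch above rounds both sides to the smaller of
  # the two declared scales, e.g. DECIMAL(9, 4) vs DECIMAL(9, 2) compares at 2 places.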

  def row_sort_cmp(self, ref_row, test_row):
    '''Comparison used for sorting. Despite the argument names, both arguments are
    rows from the same data set; this is only used as a sort comparator.'''
    for ref_val, test_val in izip(ref_row, test_row):
      if ref_val is None and test_val is not None:
        return -1
      if ref_val is not None and test_val is None:
        return 1
      result = cmp(ref_val, test_val)
      if result:
        return result
    return 0
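
  # For example, sorting [(1, 2), (None, 9), (1, None)] with this comparator yields
  # [(None, 9), (1, None), (1, 2)]: a NULL (None) value sorts before any other value.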

  def vals_are_equal(self, ref, test):
    '''Return True if the two values are considered equal. Floats are considered
    equal if the difference between them is very small.'''
    if ref == test:
      return True
    # For some reason Postgresql will return Decimals when using some aggregate
    # functions such as AVG().
    if isinstance(ref, (float, Decimal)) and isinstance(test, float):
      return self.floats_are_equal(ref, test)
    LOG.debug("Values differ, reference: %s (%s), test: %s (%s)",
              ref, type(ref),
              test, type(test))
    return False

  def floats_are_equal(self, ref, test):
    '''Compare two floats, treating them as equal if their relative difference is
    below EPSILON.'''
    ref = round(ref, self.DECIMAL_PLACES)
    test = round(test, self.DECIMAL_PLACES)
    diff = abs(ref - test)
    if ref * test == 0:
      return diff < self.EPSILON
    result = diff / (abs(ref) + abs(test)) < self.EPSILON
    if not result:
      LOG.debug("Floats differ, diff: %s, |reference|: %s, |test|: %s",
                diff, abs(ref), abs(test))
    return result
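
  # Worked example: ref=100.004 and test=100.006 round to 100.0 and 100.01, giving
  # diff=0.01 and a relative difference of 0.01 / (100.0 + 100.01) ~= 5e-5, which is
  # below EPSILON, so the values compare as equal. When either rounded value is 0 the
  # relative check degenerates, so an absolute check against EPSILON is used instead.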


class QueryExecutor(object):
  '''Concurrently executes queries'''

  # TODO: Set to false while IMPALA-3336 is a problem. Disabling random query options
  #       seems to reduce IMPALA-3336 occurrences.
  ENABLE_RANDOM_QUERY_OPTIONS = False

  # If the number of rows * cols is greater than this val, then the comparison will
  # be aborted. Raising this value also raises the risk of python being OOM killed. At
  # 10M python would get OOM killed occasionally even on a physical machine with 32GB
  # RAM.
  TOO_MUCH_DATA = 1000 * 1000

  def __init__(self, cursors, sql_writers, query_timeout_seconds, flatten_dialect=None):
    '''cursors should be a list of db_connector.Cursors.

    sql_writers should be a list of model_translator.SqlWriters, with translators in
    the same order as cursors in "cursors".
    '''
    self.query_timeout_seconds = query_timeout_seconds
    self.cursors = cursors
    self.sql_writers = sql_writers
    self.query_logs = list()
    # SQL dialect for which the queries should be flattened
    self.flatten_dialect = flatten_dialect

    for cursor in cursors:
      # A list of all queries attempted
      query_log_path = gettempdir() + '/test_query_log_%s_%s.sql' \
          % (cursor.db_type.lower(), time())
      self.query_logs.append(open(query_log_path, 'w'))
      link = gettempdir() + '/test_query_log_%s.sql' % cursor.db_type.lower()
      try:
        unlink(link)
      except OSError as e:
        if 'No such file' not in str(e):
          raise e
      try:
        symlink(query_log_path, link)
      except OSError as e:
        # TODO: Figure out what the error message is when there is a race condition
        #       and ignore it.
        raise e

    # In case the query will be executed as a "CREATE TABLE <name> AS ..." or
    # "CREATE VIEW <name> AS ...", this will be the value of "<name>".
    self._table_or_view_name = None

  def set_impala_query_options(self, cursor):
    opts = """
        SET MEM_LIMIT={mem_limit};
        SET BATCH_SIZE={batch_size};
        SET DISABLE_CODEGEN={disable_codegen};
        SET DISABLE_OUTERMOST_TOPN={disable_outermost_topn};
        SET DISABLE_ROW_RUNTIME_FILTERING={disable_row_runtime_filtering};
        SET DISABLE_STREAMING_PREAGGREGATIONS={disable_streaming_preaggregations};
        SET DISABLE_UNSAFE_SPILLS={disable_unsafe_spills};
        SET EXEC_SINGLE_NODE_ROWS_THRESHOLD={exec_single_node_rows_threshold};
        SET BUFFER_POOL_LIMIT={buffer_pool_limit};
        SET MAX_IO_BUFFERS={max_io_buffers};
        SET MAX_SCAN_RANGE_LENGTH={max_scan_range_length};
        SET NUM_NODES={num_nodes};
        SET NUM_SCANNER_THREADS={num_scanner_threads};
        SET OPTIMIZE_PARTITION_KEY_SCANS={optimize_partition_key_scans};
        SET RANDOM_REPLICA={random_replica};
        SET REPLICA_PREFERENCE={replica_preference};
        SET RUNTIME_BLOOM_FILTER_SIZE={runtime_bloom_filter_size};
        SET RUNTIME_FILTER_MODE={runtime_filter_mode};
        SET RUNTIME_FILTER_WAIT_TIME_MS={runtime_filter_wait_time_ms};
        SET SCAN_NODE_CODEGEN_THRESHOLD={scan_node_codegen_threshold}""".format(
        mem_limit=randint(1024 ** 3, 10 * 1024 ** 3),
        batch_size=randint(1, 4096),
        disable_codegen=choice((0, 1)),
        disable_outermost_topn=choice((0, 1)),
        disable_row_runtime_filtering=choice((0, 1)),
        disable_streaming_preaggregations=choice((0, 1)),
        disable_unsafe_spills=choice((0, 1)),
        exec_single_node_rows_threshold=randint(1, 100000000),
        buffer_pool_limit=randint(1, 100000000),
        max_io_buffers=randint(1, 100000000),
        max_scan_range_length=randint(1, 100000000),
        num_nodes=randint(3, 3),
        num_scanner_threads=randint(1, 100),
        optimize_partition_key_scans=choice((0, 1)),
        random_replica=choice((0, 1)),
        replica_preference=choice(("CACHE_LOCAL", "DISK_LOCAL", "REMOTE")),
        runtime_bloom_filter_size=randint(4096, 16777216),
        runtime_filter_mode=choice(("OFF", "LOCAL", "GLOBAL")),
        runtime_filter_wait_time_ms=randint(1, 100000000),
        scan_node_codegen_threshold=randint(1, 100000000))
    LOG.debug(opts)
    for opt in opts.strip().split(";"):
      cursor.execute(opt)
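
  # Splitting the formatted blob on ';' executes one statement at a time, e.g.
  # "SET MEM_LIMIT=2147483648" followed by "SET BATCH_SIZE=512" (illustrative values;
  # the actual numbers are drawn from the randint/choice ranges above).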

  def fetch_query_results(self, query):
    '''Concurrently execute the query using each cursor and return a list of tuples
    containing the result information for each cursor. The tuple format is
    (<sql>, <exception or None>, <data set or None>, <cursor description or None>).

    If query_timeout_seconds is reached and the connection is killable then the
    query will be cancelled and the connection reset. Otherwise the query will
    continue to run in the background.

    "query" should be an instance of query.Query.
    '''
    if query.execution in (StatementExecutionMode.CREATE_TABLE_AS,
                           StatementExecutionMode.CREATE_VIEW_AS):
      self._table_or_view_name = self._create_random_table_name()
    elif isinstance(query, (InsertStatement,)):
      self._table_or_view_name = query.dml_table.name

    query_threads = list()
    for sql_writer, cursor, log_file in izip(
        self.sql_writers, self.cursors, self.query_logs
    ):
      if self.ENABLE_RANDOM_QUERY_OPTIONS and cursor.db_type == IMPALA:
        self.set_impala_query_options(cursor)
      query_thread = Thread(
          target=self._fetch_sql_results,
          args=[query, cursor, sql_writer, log_file],
          name='{db_type}-exec-{id_}'.format(
              db_type=cursor.db_type, id_=id(query)))
      query_thread.daemon = True
      query_thread.sql = ''
      query_thread.data_set = None
      query_thread.cursor_description = None
      query_thread.exception = None
      query_thread.start()
      query_threads.append(query_thread)

    end_time = time() + self.query_timeout_seconds
    for query_thread, cursor in izip(query_threads, self.cursors):
      join_time = end_time - time()
      if join_time > 0:
        query_thread.join(join_time)
      if query_thread.is_alive():
        # Kill connection and reconnect to return cursor to initial state.
        if cursor.conn.supports_kill:
          LOG.debug('Attempting to kill connection')
          cursor.conn.kill()
          LOG.debug('Killed connection')
        try:
          # TODO: Sometimes this takes a very long time causing the program to appear to
          #       hang. Maybe this should be done in another thread so a timeout can be
          #       applied?
          cursor.close()
        except Exception as e:
          LOG.info('Error closing cursor: %s', e)
        cursor.reconnect()
        query_thread.exception = QueryTimeout(
            'Query timed out after %s seconds' % self.query_timeout_seconds)

      if (query.execution in (StatementExecutionMode.CREATE_TABLE_AS,
                              StatementExecutionMode.DML_TEST)):
        cursor.drop_table(self._table_or_view_name)
      elif query.execution == StatementExecutionMode.CREATE_VIEW_AS:
        cursor.drop_view(self._table_or_view_name)

    return [(query_thread.sql, query_thread.exception, query_thread.data_set,
             query_thread.cursor_description) for query_thread in query_threads]

  def _fetch_sql_results(self, query, cursor, sql_writer, log_file):
    '''Execute the query using the cursor and set the result or exception on the local
    thread.
    '''
    try:
      log_file.write('/***** Start Query *****/\n')
      if sql_writer.DIALECT == self.flatten_dialect:
        # Converts the query model for the flattened version of the data. This is for
        # testing of Impala nested types support.
        query = deepcopy(query)
        QueryFlattener().flatten(query)
      if query.execution == StatementExecutionMode.CREATE_TABLE_AS:
        setup_sql = sql_writer.write_create_table_as(query, self._table_or_view_name)
        query_sql = 'SELECT * FROM ' + self._table_or_view_name
      elif query.execution == StatementExecutionMode.CREATE_VIEW_AS:
        setup_sql = sql_writer.write_create_view(query, self._table_or_view_name)
        query_sql = 'SELECT * FROM ' + self._table_or_view_name
      elif isinstance(query, (InsertStatement,)):
        setup_sql = sql_writer.write_query(query)
        # TODO: improve validation (IMPALA-4599). This is good enough for looking for
        #       crashes on DML statements
        query_sql = 'SELECT COUNT(*) FROM ' + self._table_or_view_name
      else:
        setup_sql = None
        query_sql = sql_writer.write_query(query)
      if setup_sql:
        LOG.debug("Executing on %s:\n%s", cursor.db_type, setup_sql)
        current_thread().sql = setup_sql + ';\n'
        log_file.write(setup_sql + ';\n')
        log_file.flush()
        cursor.execute(setup_sql)
      LOG.debug("Executing on %s:\n%s", cursor.db_type, query_sql)
      current_thread().sql += query_sql
      log_file.write(query_sql + ';\n')
      log_file.write('/***** End Query *****/\n')
      log_file.flush()
      cursor.execute(query_sql)
      col_count = len(cursor.description)
      batch_size = max(10000 / col_count, 1)
      row_limit = self.TOO_MUCH_DATA / col_count
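      # For example, a 5-column result fetches in batches of 2,000 rows and aborts
      # once more than 200,000 rows (1M cells / 5 columns) have been accumulated.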
      data_set = list()
      current_thread().data_set = data_set
      current_thread().cursor_description = cursor.description
      LOG.debug("Fetching results from %s", cursor.db_type)
      while True:
        batch = cursor.fetchmany(batch_size)
        data_set.extend(batch)
        if len(batch) < batch_size:
          if cursor.db_type == IMPALA:
            impala_log = cursor.get_log()
            if 'Expression overflowed, returning NULL' in impala_log:
              raise TypeOverflow('Numeric overflow; data may not match')
          break
        if len(data_set) > row_limit:
          raise DataLimitExceeded('Too much data')
      if isinstance(query, (InsertStatement,)):
        LOG.debug('Total row count for {0}: {1}'.format(
            cursor.db_type, str(data_set)))
    except Exception as e:
      current_thread().exception = e

  def _create_random_table_name(self):
    char_choices = ascii_lowercase
    chars = list()
    for idx in xrange(4):  # will result in ~1M combinations
      if idx == 1:
        char_choices += '_' + digits
      chars.append(choice(char_choices))
    return 'qgen_' + ''.join(chars)
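
  # The first character is drawn from 26 lowercase letters and the remaining three
  # from 37 characters (lowercase, underscore, digits): 26 * 37**3 ~= 1.3M names,
  # e.g. 'qgen_k7_q' or 'qgen_abc1'.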


class ComparisonResult(object):
  '''Represents the outcome of running one query on the test and reference databases
  and comparing the results.'''

  def __init__(self, query, test_db_type, ref_db_type):
    self.query = query
    self.test_db_type = test_db_type
    self.ref_db_type = ref_db_type
    self.ref_sql = None
    self.test_sql = None
    self.query_resulted_in_data = False
    self.ref_row_count = None
    self.test_row_count = None
    self.mismatch_at_row_number = None
    self.mismatch_at_col_number = None
    self.ref_row = None  # The reference row where the mismatch happened
    self.test_row = None  # The test row where the mismatch happened
    self.exception = None
    self.modified_rows_count = None
    self._error_message = None

  @property
  def error(self):
    if not self._error_message:
      if self.exception:
        self._error_message = str(self.exception)
      elif (self.ref_row_count or self.test_row_count) and \
          self.ref_row_count != self.test_row_count:
        self._error_message = 'Row counts do not match: %s %s rows vs %s %s rows' \
            % (self.test_row_count,
               self.test_db_type,
               self.ref_row_count,
               self.ref_db_type)
      elif self.mismatch_at_row_number is not None:
        # Write a row like "[a, b, <<c>>, d]" where c is a bad value
        test_row = '[' + ', '.join(
            '<<' + str(val) + '>>' if idx == self.mismatch_at_col_number - 1 else str(val)
            for idx, val in enumerate(self.test_row)
        ) + ']'
        ref_row = '[' + ', '.join(
            '<<' + str(val) + '>>' if idx == self.mismatch_at_col_number - 1 else str(val)
            for idx, val in enumerate(self.ref_row)
        ) + ']'
        self._error_message = \
            'Column %s in row %s does not match: %s %s row vs %s %s row' \
            % (self.mismatch_at_col_number,
               self.mismatch_at_row_number,
               test_row,
               self.test_db_type,
               ref_row,
               self.ref_db_type)
    return self._error_message

  @property
  def is_known_error(self):
    return isinstance(self.exception, KnownError)

  @property
  def query_timed_out(self):
    return isinstance(self.exception, QueryTimeout)


QueryTimeout = type('QueryTimeout', (Exception, ), {})
TypeOverflow = type('TypeOverflow', (Exception, ), {})
DataLimitExceeded = type('DataLimitExceeded', (Exception, ), {})
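
# The three type() calls above are a compact way to declare empty Exception
# subclasses; each is equivalent to, e.g.:
#
#   class QueryTimeout(Exception):
#     pass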


class KnownError(Exception):

  def __init__(self, jira_url):
    Exception.__init__(self, 'Known issue: ' + jira_url)
    self.jira_url = jira_url


class FrontendExceptionSearcher(object):

  def __init__(self, query_profile, ref_conn, test_conn):
    '''query_profile should be an instance of one of the profiles in query_profile.py'''
    self.query_profile = query_profile
    self.ref_conn = ref_conn
    self.test_conn = test_conn
    self.ref_sql_writer = SqlWriter.create(dialect=ref_conn.db_type)
    self.test_sql_writer = SqlWriter.create(dialect=test_conn.db_type)
    with ref_conn.cursor() as ref_cursor:
      with test_conn.cursor() as test_cursor:
        self.common_tables = DbCursor.describe_common_tables([ref_cursor, test_cursor])
        if not self.common_tables:
          raise Exception("Unable to find a common set of tables in both databases")

  def search(self, number_of_test_queries):

    def on_ref_db_error(e, sql):
      LOG.warn("Error generating explain plan for reference db:\n%s\n%s" % (e, sql))

    def on_test_db_error(e, sql):
      LOG.error("Error generating explain plan for test db:\n%s" % sql)
      raise e

    for idx in xrange(number_of_test_queries):
      LOG.info("Explaining query #%s" % (idx + 1))
      statement_type = self.query_profile.choose_statement()
      statement_generator = get_generator(statement_type)(self.query_profile)
      if issubclass(statement_type, (InsertStatement,)):
        dml_table = self.query_profile.choose_table(self.common_tables)
      else:
        dml_table = None
      query = statement_generator.generate_statement(
          self.common_tables, dml_table=dml_table)
      if not self._explain_query(self.ref_conn, self.ref_sql_writer, query,
                                 on_ref_db_error):
        continue
      self._explain_query(self.test_conn, self.test_sql_writer, query,
                          on_test_db_error)

  def _explain_query(self, conn, writer, query, exception_handler):
    sql = writer.write_query(query)
    try:
      with conn.cursor() as cursor:
        cursor.execute("EXPLAIN %s" % sql)
      return True
    except Exception as e:
      exception_handler(e, sql)
      return False


class QueryResultDiffSearcher(object):
  '''This class uses the query generator (query_generator.py) along with the
  query profile (query_profile.py) to randomly generate queries, execute them
  on the reference and test databases, and compare the results.
  '''

  # Sometimes things get into a bad state and the same error loops forever
  ABORT_ON_REPEAT_ERROR_COUNT = 2

  COPY_TABLE_SUFFIX = '__qgen_copy'

  def __init__(self, query_profile, ref_conn, test_conn):
    '''query_profile should be an instance of one of the profiles in query_profile.py'''
    self.query_profile = query_profile
    self.ref_conn = ref_conn
    self.test_conn = test_conn
    with ref_conn.cursor() as ref_cursor:
      with test_conn.cursor() as test_cursor:
        self.common_tables = DbCursor.describe_common_tables([ref_cursor, test_cursor])
        if not self.common_tables:
          raise Exception("Unable to find a common set of tables in both databases")

  def _concurrently_copy_table(self, src_table):
    """
    Given a Table object, create another Table with the same schema and return the new
    Table object. The schema will be created in both the test and reference databases.

    The data is then copied in both the ref and test databases using threads.
    """
    with self.test_conn.cursor() as test_cursor:
      test_cursor.execute('SHOW CREATE TABLE {0}'.format(src_table.name))
      (create_table_sql,) = test_cursor.fetchall()[0]
      new_table_name = src_table.name + self.COPY_TABLE_SUFFIX
      create_table_sql = create_table_sql.replace(src_table.name, new_table_name, 1)
      test_cursor.drop_table(new_table_name)
      test_cursor.execute(create_table_sql)
      new_table = test_cursor.describe_table(new_table_name)
    with self.ref_conn.cursor() as ref_cursor:
      ref_cursor.drop_table(new_table_name)
      ref_cursor.create_table(new_table)

    copy_select_query = Query()
    copy_select_query.select_clause = SelectClause(
        [SelectItem(col) for col in src_table.cols])
    copy_select_query.from_clause = FromClause(src_table)

    if new_table.primary_keys:
      conflict_action = InsertClause.CONFLICT_ACTION_IGNORE
    else:
      conflict_action = InsertClause.CONFLICT_ACTION_DEFAULT

    table_copy_statement = InsertStatement(
        insert_clause=InsertClause(new_table, conflict_action=conflict_action),
        select_query=copy_select_query, execution=StatementExecutionMode.DML_SETUP)

    result = self.query_result_comparator.compare_query_results(table_copy_statement)
    if result.error:
      raise Exception('setup SQL to copy table failed: {0}'.format(result.error))
    self._dml_table_size = result.modified_rows_count

    return new_table

  def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash,
             query_timeout_seconds):
    '''Returns an instance of SearchResults, which is a summary report. This method
    oversees the generation, execution, and comparison of queries.

    number_of_test_queries should be an integer indicating the maximum number of
    queries to generate and execute.
    '''
    start_time = time()
    self.query_result_comparator = QueryResultComparator(
        self.query_profile, self.ref_conn, self.test_conn, query_timeout_seconds)
    query_count = 0
    queries_resulted_in_data_count = 0
    mismatch_count = 0
    query_timeout_count = 0
    known_error_count = 0
    test_crash_count = 0
    last_error = None
    repeat_error_count = 0
    count_effective_dml_statements = 0
    count_rows_affected_by_dml = 0

    while number_of_test_queries > query_count:
      statement_type = self.query_profile.choose_statement()
      statement_generator = get_generator(statement_type)(self.query_profile)
      dml_table = None
      if issubclass(statement_type, (InsertStatement,)):
        dml_choice_src_table = self.query_profile.choose_table(self.common_tables)
        # Copy the table we want to INSERT/UPSERT INTO. Do this for the following
        # reasons:
        #
        # 1. If we don't copy, the tables will get larger and larger.
        # 2. If we want to avoid tables getting larger and larger, we have to come up
        #    with some threshold about when to cut and start over.
        # 3. If we keep INSERT/UPSERTing into tables and finally find a crash, we have
        #    to replay all previous INSERT/UPSERTs again. Those INSERTs may not produce
        #    the same rows as before. To maximize the chance of bug reproduction, run
        #    every INSERT/UPSERT on a pristine table.
        dml_table = self._concurrently_copy_table(dml_choice_src_table)
      statement = statement_generator.generate_statement(
          self.common_tables, dml_table=dml_table)
      if isinstance(statement, Query):
        # We can re-write statement execution here to possibly be a CREATE TABLE AS
        # SELECT or CREATE VIEW AS SELECT.
        statement.execution = self.query_profile.get_query_execution()
      query_count += 1
      LOG.info('Running query #%s', query_count)
      result = self.query_result_comparator.compare_query_results(statement)
      if result.query_resulted_in_data:
        queries_resulted_in_data_count += 1
      if result.modified_rows_count:
        count_effective_dml_statements += 1
        count_rows_affected_by_dml += abs(
            result.modified_rows_count - self._dml_table_size)
      if isinstance(result.exception, DataLimitExceeded) \
          or isinstance(result.exception, TypeOverflow):
        continue
      if result.error:
        # TODO: These first two come from psycopg2, the postgres driver. Maybe we should
        #       try a different driver? Or maybe the usage of the driver isn't correct.
        #       Anyhow ignore these failures.
        if 'division by zero' in result.error \
            or 'out of range' in result.error:
          LOG.debug('Ignoring error: %s', result.error)
          query_count -= 1
          continue

        if result.is_known_error:
          known_error_count += 1
        elif result.query_timed_out:
          query_timeout_count += 1
        else:
          mismatch_count += 1

        print('---Test Query---\n')
        print(result.test_sql + '\n')
        print('---Reference Query---\n')
        print(result.ref_sql + '\n')
        print('---Error---\n')
        print(result.error + '\n')
        print('------\n')

        if 'Could not connect' in result.error \
            or "Couldn't open transport for" in result.error:
          if stop_on_crash:
            break
          # Assume Impala crashed and try restarting
          test_crash_count += 1
          LOG.info('Restarting Impala')
          impalad_args = [
              '-convert_legacy_hive_parquet_utc_timestamps=true',
          ]
          impala_restart_cmd = [
              join_path(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'),
              '--log_dir={0}'.format(getenv('LOG_DIR', "/tmp/")),
              '--impalad_args="{0}"'.format(' '.join(impalad_args)),
          ]
          call(impala_restart_cmd)
          self.test_conn.reconnect()
          self.query_result_comparator.test_cursor = self.test_conn.cursor()
          result = self.query_result_comparator.compare_query_results(statement)
          if result.error:
            LOG.info('Restarting Impala')
            call(impala_restart_cmd)
            self.test_conn.reconnect()
            self.query_result_comparator.test_cursor = self.test_conn.cursor()
          else:
            break

        if stop_on_result_mismatch and \
            not (result.is_known_error or result.query_timed_out):
          break

        if last_error == result.error \
            and not (result.is_known_error or result.query_timed_out):
          repeat_error_count += 1
          if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT:
            break
        else:
          last_error = result.error
          repeat_error_count = 0
      else:
        if result.query_resulted_in_data:
          LOG.info('Results matched (%s rows)', result.test_row_count)
        else:
          LOG.info('Query did not produce meaningful data')
        last_error = None
        repeat_error_count = 0

    return SearchResults(
        query_count,
        queries_resulted_in_data_count,
        mismatch_count,
        query_timeout_count,
        known_error_count,
        test_crash_count,
        time() - start_time,
        count_effective_dml_statements,
        count_rows_affected_by_dml)


class SearchResults(object):
  '''This class holds information about the outcome of a search run.'''

  def __init__(self,
               query_count,
               queries_resulted_in_data_count,
               mismatch_count,
               query_timeout_count,
               known_error_count,
               test_crash_count,
               run_time_in_seconds,
               count_effective_dml_statements,
               count_rows_affected_by_dml
               ):
    # Approx number of queries run, some queries may have been ignored
    self.query_count = query_count
    self.queries_resulted_in_data_count = queries_resulted_in_data_count
    # Number of queries that had an error or result mismatch
    self.mismatch_count = mismatch_count
    self.query_timeout_count = query_timeout_count
    self.known_error_count = known_error_count
    self.test_crash_count = test_crash_count
    self.run_time_in_seconds = run_time_in_seconds
    # Number of DML statements that actually modified tables
    self.count_effective_dml_statements = count_effective_dml_statements
    # Total number of rows modified by DML statements
    self.count_rows_affected_by_dml = count_rows_affected_by_dml

  def __str__(self):
    '''Returns the string representation of the results.'''
    mins, secs = divmod(self.run_time_in_seconds, 60)
    hours, mins = divmod(mins, 60)
    hours = int(hours)
    mins = int(mins)
    if hours:
      run_time = '%s hours and %s minutes' % (hours, mins)
    else:
      secs = int(secs)
      run_time = '%s seconds' % secs
      if mins:
        run_time = '%s mins and ' % mins + run_time
    summary_params = self.__dict__
    summary_params['run_time'] = run_time
    return (
        '%(mismatch_count)s mismatches found after running %(query_count)s queries in '
        '%(run_time)s.\n'
        '%(queries_resulted_in_data_count)s of %(query_count)s queries produced results.'
        '\n'
        '%(count_effective_dml_statements)s of %(query_count)s statements modified a '
        'total of %(count_rows_affected_by_dml)s rows\n'
        '%(test_crash_count)s crashes occurred.\n'
        '%(known_error_count)s queries were excluded from the mismatch count because '
        'they are known errors.\n'
        '%(query_timeout_count)s queries timed out and were excluded from all counts.') \
        % summary_params


if __name__ == '__main__':
  import sys
  from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

  from tests.comparison import cli_options
  from tests.comparison.query_profile import PROFILES

  parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
  cli_options.add_logging_options(parser)
  cli_options.add_db_name_option(parser)
  cli_options.add_cluster_options(parser)
  cli_options.add_connection_option_groups(parser)
  cli_options.add_timeout_option(parser)

  parser.add_argument('--test-db-type', default=IMPALA,
      choices=(HIVE, IMPALA, MYSQL, ORACLE, POSTGRESQL),
      help='The type of the test database to use. Ex: IMPALA.')
  parser.add_argument('--ref-db-type', default=POSTGRESQL,
      choices=(MYSQL, ORACLE, POSTGRESQL),
      help='The type of the ref database to use. Ex: POSTGRESQL.')
  parser.add_argument('--stop-on-mismatch', default=False, action='store_true',
      help='Exit immediately upon finding a discrepancy in a query result.')
  parser.add_argument('--stop-on-crash', default=False, action='store_true',
      help='Exit immediately if Impala crashes.')
  parser.add_argument('--query-count', default=1000000, type=int,
      help='Exit after running the given number of queries.')
  parser.add_argument('--exclude-types', default='',
      help='A comma separated list of data types to exclude while generating queries.')
  parser.add_argument('--explain-only', action='store_true',
      help="Don't run the queries, only explain them, to see if there was an error "
      "in planning.")
  profiles = dict()
  for profile in PROFILES:
    profile_name = profile.__name__
    if profile_name.endswith('Profile'):
      profile_name = profile_name[:-1 * len('Profile')]
    profiles[profile_name.lower()] = profile
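  # For example, a profile class named DefaultProfile registers under the choice
  # name 'default', which is why '--profile' below defaults to 'default'.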
  parser.add_argument('--profile', default='default',
      choices=(sorted(profiles.keys())),
      help='Determines the mix of SQL features to use during query generation.')
  # TODO: Seed the random query generator for repeatable queries?

  args = parser.parse_args()
  cli_options.configure_logging(
      args.log_level, debug_log_file=args.debug_log_file, log_thread_name=True)
  cluster = cli_options.create_cluster(args)

  ref_conn = cli_options.create_connection(args, args.ref_db_type, db_name=args.db_name)
  if args.test_db_type == IMPALA:
    test_conn = cluster.impala.connect(db_name=args.db_name)
  elif args.test_db_type == HIVE:
    test_conn = cluster.hive.connect(db_name=args.db_name)
  else:
    test_conn = cli_options.create_connection(
        args, args.test_db_type, db_name=args.db_name)
  # Create an instance of the chosen profile class (e.g. DefaultProfile)
  query_profile = profiles[args.profile]()
  if args.explain_only:
    searcher = FrontendExceptionSearcher(query_profile, ref_conn, test_conn)
    searcher.search(args.query_count)
  else:
    diff_searcher = QueryResultDiffSearcher(query_profile, ref_conn, test_conn)
    query_timeout_seconds = args.timeout
    search_results = diff_searcher.search(
        args.query_count, args.stop_on_mismatch, args.stop_on_crash,
        query_timeout_seconds)
    print(search_results)
    sys.exit(search_results.mismatch_count)