impala/tests/query_test/test_tablesample.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Tests the TABLESAMPLE clause.

import pytest
import subprocess

from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_vector import ImpalaTestDimension

class TestTableSample(ImpalaTestSuite):
  """Tests the TABLESAMPLE clause by comparing sampled row counts of 'alltypes' against
  the unsampled row count, with and without REPEATABLE and a WHERE filter."""

  # Seconds to wait for a sampled query to finish before failing the test.
  QUERY_TIMEOUT_S = 500
  # Sampling percentages to exercise, in increasing order.
  SAMPLE_PERCENTAGES = (5, 20, 50, 100)

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload this suite runs against."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Adds the 'repeatable' and 'filtered' boolean dimensions and restricts the
    file formats the test matrix covers."""
    super(TestTableSample, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('repeatable', True, False))
    cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('filtered', True, False))
    # Tablesample is only supported on HDFS tables.
    cls.ImpalaTestMatrix.add_constraint(lambda v:
      v.get_value('table_format').file_format not in ('kudu', 'hbase'))
    if cls.exploration_strategy() != 'exhaustive':
      # Cut down on core testing time by limiting the file formats.
      cls.ImpalaTestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format in ('parquet', 'text'))

  @staticmethod
  def _impalad_backtrace(index):
    """Returns the gdb backtrace of all threads of the index-th (1-based) process
    listed by 'pgrep impalad'. If the command fails, a placeholder describing the
    failure is returned instead of raising, so that the caller's own failure message
    is not masked by a secondary error."""
    try:
      return subprocess.check_output(
          "gdb -ex \"set pagination 0\" -ex \"thread apply all bt\"  "
          "--batch -p $(pgrep impalad | sed -n %dp)" % index, shell=True)
    except (subprocess.CalledProcessError, OSError) as e:
      return "<failed to collect backtrace: %s>" % e

  @classmethod
  def _timeout_failure_message(cls):
    """Builds the assertion message for a timed out query, including the thread
    backtraces of the first three impalad processes."""
    backtraces = tuple(cls._impalad_backtrace(i) for i in (1, 2, 3))
    return ('Query Timed out. Dumping backtrace of all threads in '
            'impalads:\nthreads in the impalad1: %s \nthreads in the '
            'impalad2: %s \nthreads in the impalad3: %s' % backtraces)

  def test_tablesample(self, vector):
    """Runs count(*) over 'alltypes' with increasing sampling percentages and checks
    that samples below 100% return fewer rows than the unsampled query, that a 100%
    sample returns exactly the unsampled count, and that repeatable samples grow
    with the percentage."""
    # Do not use a .test to avoid making this test flaky.
    # 1. Queries without the repeatable clause are non-deterministic.
    # 2. The results of queries without a repeatable clause could change due to
    # changes in data loading that affect the number or size of files.
    repeatable = vector.get_value('repeatable')
    filtered = vector.get_value('filtered')

    where_clause = "where month between 1 and 6" if filtered else ""
    rep_sql = " repeatable(1)" if repeatable else ""

    ImpalaTestSuite.change_database(self.client, vector.get_value('table_format'))
    result = self.client.execute("select count(*) from alltypes %s" % where_clause)
    baseline_count = int(result.data[0])
    prev_count = None
    for perc in self.SAMPLE_PERCENTAGES:
      sql_stmt = "select count(*) from alltypes tablesample system(%s)%s %s" \
                 % (perc, rep_sql, where_clause)
      handle = self.client.execute_async(sql_stmt)
      try:
        # IMPALA-6352: flaky test, possibly due to a hung thread. Wait for 500 sec
        # before failing and logging the backtraces of all impalads.
        is_finished = self.client.wait_for_finished_timeout(
            handle, self.QUERY_TIMEOUT_S)
        # The message (and therefore the gdb invocations) is only evaluated when the
        # assertion fails.
        assert is_finished, self._timeout_failure_message()
        result = self.client.fetch(sql_stmt, handle)
      finally:
        # Always release the query, even if it timed out or the fetch failed.
        self.client.close_query(handle)
      count = int(result.data[0])
      if perc < 100:
        assert count < baseline_count
      else:
        assert count == baseline_count
      # Compare against None explicitly so that a previous count of 0 is still checked.
      if prev_count is not None and repeatable:
        # May not necessarily be true for non-repeatable samples
        assert count > prev_count
      prev_count = count