Files
impala/tests/query_test/test_scanners.py
Skye Wanderman-Milne 9b51b2b6e6 IMPALA-2835: introduce PARQUET_FALLBACK_SCHEMA_RESOLUTION query option
This patch introduces a new query option,
PARQUET_FALLBACK_SCHEMA_RESOLUTION, which allows the schemas of Parquet
files to be resolved either by name or by position. It's "fallback"
because field IDs will eventually become the primary schema resolution
scheme, and we don't want to create an option whose name we would have
to change later. The default is still resolution by position. I chose a
query option because it makes testing easier and also makes it easier
to diagnose resolution problems quickly in the field. Users who want
the default behavior to be resolution by name (like Hive) can set it
via the --default_query_options flag.
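
As a rough illustration (not part of this change; the value names
"position"/"name" are assumptions based on the description above), the
option could be toggled per session in impala-shell or made the
cluster-wide default via the flag mentioned above:

  -- per-session, in impala-shell (assumed value names)
  set parquet_fallback_schema_resolution=name;
  select * from some_parquet_table;

  # cluster-wide default via the impalad startup flag
  impalad --default_query_options=parquet_fallback_schema_resolution=name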

This patch also introduces a new test section, SHELL, which can be
used to execute shell commands in a .test file. This is useful for
copying files into test tables.
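
As a sketch of what such a section might look like (the ---- SHELL
marker and the paths/results below are illustrative assumptions modeled
on the existing ---- QUERY/---- RESULTS conventions, not taken verbatim
from this change):

  ====
  ---- SHELL
  hadoop fs -cp /test-warehouse/src_tbl/data.parq /test-warehouse/dst_tbl/
  ---- QUERY
  select count(*) from dst_tbl
  ---- RESULTS
  1
  ---- TYPES
  bigint
  ====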

Change-Id: Id0c715ea23792b2a6872610839a40532aabbb5a6
Reviewed-on: http://gerrit.cloudera.org:8080/2384
Reviewed-by: Skye Wanderman-Milne <skye@cloudera.com>
Tested-by: Internal Jenkins
2016-04-02 04:04:25 +00:00


# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This test suite validates the scanners by running queries against ALL file formats and
# their permutations (e.g. compression codec/compression type). This works by exhaustively
# generating the table format test vectors for this specific test suite. This way, other
# tests can run with the normal exploration strategy and the overall test runtime doesn't
# explode.

import logging
import os
import pytest
import random
import re
from copy import deepcopy
from subprocess import call, check_call

from testdata.common import widetable
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from tests.util.test_file_parser import *
from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
from tests.util.get_parquet_metadata import get_parquet_metadata
from tests.common.test_dimensions import create_single_exec_option_dimension
from tests.common.skip import SkipIfS3, SkipIfIsilon, SkipIfOldAggsJoins, SkipIfLocal

from parquet.ttypes import ConvertedType

class TestScannersAllTableFormats(ImpalaTestSuite):
  BATCH_SIZES = [0, 1, 16]

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScannersAllTableFormats, cls).add_test_dimensions()
    if cls.exploration_strategy() == 'core':
      # The purpose of this test is to get some base coverage of all the file formats.
      # Even in 'core', we'll test each format by using the pairwise strategy.
      cls.TestMatrix.add_dimension(cls.create_table_info_dimension('pairwise'))
    cls.TestMatrix.add_dimension(
        TestDimension('batch_size', *TestScannersAllTableFormats.BATCH_SIZES))

  def test_scanners(self, vector):
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
    self.run_test_case('QueryTest/scanners', new_vector)

# Test all the scanners with a simple limit clause. The limit clause triggers
# cancellation in the scanner code paths.
class TestScannersAllTableFormatsWithLimit(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScannersAllTableFormatsWithLimit, cls).add_test_dimensions()

  def test_limit(self, vector):
    # Use a small batch size so changing the limit affects the timing of cancellation
    vector.get_value('exec_option')['batch_size'] = 100
    iterations = 50
    query_template = "select * from alltypes limit %s"
    for i in range(1, iterations):
      # Vary the limit to vary the timing of cancellation
      query = query_template % ((i * 100) % 1000 + 1)
      self.execute_query(query, vector.get_value('exec_option'),
          table_format=vector.get_value('table_format'))

# Test case to verify the scanners work properly when the table metadata (specifically
# the number of columns in the table) does not match the number of columns in the data
# file.
class TestUnmatchedSchema(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestUnmatchedSchema, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
    # Avro has a more advanced schema evolution process which is covered in more depth
    # in the test_avro_schema_evolution test suite.
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format != 'avro')

  def _get_table_location(self, table_name, vector):
    result = self.execute_query_using_client(self.client,
        "describe formatted %s" % table_name, vector)
    for row in result.data:
      if 'Location:' in row:
        return row.split('\t')[1]
    # This should never happen.
    assert 0, 'Unable to get location for table: ' + table_name

  def _create_test_table(self, vector):
    """
    Creates the test table
    Cannot be done in a setup method because we need access to the current test vector
    """
    self._drop_test_table(vector)
    self.execute_query_using_client(self.client,
        "create external table jointbl_test like jointbl", vector)
    # Update the location of the new table to point to the same location as the old table
    location = self._get_table_location('jointbl', vector)
    self.execute_query_using_client(self.client,
        "alter table jointbl_test set location '%s'" % location, vector)

  def _drop_test_table(self, vector):
    self.execute_query_using_client(self.client,
        "drop table if exists jointbl_test", vector)

  def test_unmatched_schema(self, vector):
    table_format = vector.get_value('table_format')
    # jointbl has no columns with unique values. When loaded in hbase, the table looks
    # different, as hbase collapses duplicates.
    if table_format.file_format == 'hbase':
      pytest.skip()
    self._create_test_table(vector)
    self.run_test_case('QueryTest/test-unmatched-schema', vector)
    self._drop_test_table(vector)

# Tests that scanners can read a single-column, single-row, 10MB table
class TestWideRow(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestWideRow, cls).add_test_dimensions()
    # I can't figure out how to load a huge row into hbase
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format != 'hbase')

  def test_wide_row(self, vector):
    new_vector = deepcopy(vector)
    # Use a 5MB scan range, so we will have to perform 5MB of sync reads
    new_vector.get_value('exec_option')['max_scan_range_length'] = 5 * 1024 * 1024
    # We need > 10 MB of memory because we're creating extra buffers:
    # - 10 MB table / 5 MB scan range = 2 scan ranges, each of which may allocate ~20MB
    # - Sync reads will allocate ~5MB of space
    # The 100MB value used here was determined empirically by raising the limit until the
    # query succeeded for all file formats -- I don't know exactly why we need this much.
    # TODO: figure out exact breakdown of memory usage (IMPALA-681)
    new_vector.get_value('exec_option')['mem_limit'] = 100 * 1024 * 1024
    self.run_test_case('QueryTest/wide-row', new_vector)

class TestWideTable(ImpalaTestSuite):
  # TODO: expand this to more rows when we have the capability
  NUM_COLS = [250, 500, 1000]

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestWideTable, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(TestDimension("num_cols", *cls.NUM_COLS))
    # To cut down on test execution time, only run in exhaustive.
    if cls.exploration_strategy() != 'exhaustive':
      cls.TestMatrix.add_constraint(lambda v: False)

  def test_wide_table(self, vector):
    NUM_COLS = vector.get_value('num_cols')
    # Due to the way HBase handles duplicate row keys, we have a different number of
    # rows in HBase tables compared to HDFS tables.
    NUM_ROWS = 10 if vector.get_value('table_format').file_format != 'hbase' else 2
    DB_NAME = QueryTestSectionReader.get_db_name(vector.get_value('table_format'))
    TABLE_NAME = "%s.widetable_%s_cols" % (DB_NAME, NUM_COLS)

    result = self.client.execute("select count(*) from %s " % TABLE_NAME)
    assert result.data == [str(NUM_ROWS)]

    expected_result = widetable.get_data(NUM_COLS, NUM_ROWS, quote_strings=True)
    result = self.client.execute("select * from %s" % TABLE_NAME)

    if vector.get_value('table_format').file_format == 'hbase':
      assert len(result.data) == NUM_ROWS
      return

    types = parse_column_types(result.schema)
    labels = parse_column_labels(result.schema)
    expected = QueryTestResult(expected_result, types, labels, order_matters=False)
    actual = QueryTestResult(parse_result_rows(result), types, labels,
        order_matters=False)
    assert expected == actual

class TestParquet(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestParquet, cls).add_test_dimensions()
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format == 'parquet')

  def test_parquet(self, vector):
    self.run_test_case('QueryTest/parquet', vector)

  @SkipIfOldAggsJoins.nested_types
  def test_continue_on_error(self, vector):
    vector.get_value('exec_option')['abort_on_error'] = 0
    self.run_test_case('QueryTest/parquet-continue-on-error', vector)

  @SkipIfS3.hdfs_block_size
  @SkipIfIsilon.hdfs_block_size
  @SkipIfLocal.multiple_impalad
  @pytest.mark.execute_serially
  def test_multiple_blocks(self, vector):
    # For IMPALA-1881. The table functional_parquet.lineitem_multiblock has 3 blocks, so
    # each impalad should read 1 scan range.
    # This needs to execute serially because if more than one query is being scheduled
    # at a time, the simple scheduler round robins colocated impalads across all running
    # queries. See IMPALA-2479 for more details.
    table_name = 'functional_parquet.lineitem_multiblock'
    self._multiple_blocks_helper(table_name, 20000, ranges_per_node=1)
    table_name = 'functional_parquet.lineitem_sixblocks'
    # 2 scan ranges per node should be created to read 'lineitem_sixblocks' because
    # there are 6 blocks and 3 scan nodes.
    self._multiple_blocks_helper(table_name, 40000, ranges_per_node=2)

  @SkipIfS3.hdfs_block_size
  @SkipIfIsilon.hdfs_block_size
  @SkipIfLocal.multiple_impalad
  @pytest.mark.execute_serially
  def test_multiple_blocks_one_row_group(self, vector):
    # For IMPALA-1881. The table functional_parquet.lineitem_multiblock_one_row_group
    # has 3 blocks but only one row group across these blocks. We test to see that only
    # one scan range reads everything from this row group.
    table_name = 'functional_parquet.lineitem_multiblock_one_row_group'
    self._multiple_blocks_helper(
        table_name, 40000, one_row_group=True, ranges_per_node=1)

  def _multiple_blocks_helper(
      self, table_name, rows_in_table, one_row_group=False, ranges_per_node=1):
    """ This function executes a simple SELECT query on a multiblock parquet table and
    verifies the number of ranges issued per node and verifies that at least one row
    group was read. If 'one_row_group' is True, then one scan range is expected to read
    the data from the entire table regardless of the number of blocks. 'ranges_per_node'
    indicates how many scan ranges we expect to be issued per node. """
    query = 'select count(l_orderkey) from %s' % table_name
    result = self.client.execute(query)
    assert len(result.data) == 1
    assert result.data[0] == str(rows_in_table)

    runtime_profile = str(result.runtime_profile)
    num_row_groups_list = re.findall('NumRowGroups: ([0-9]*)', runtime_profile)
    scan_ranges_complete_list = re.findall(
        'ScanRangesComplete: ([0-9]*)', runtime_profile)
    num_rows_read_list = re.findall('RowsRead: [0-9.K]* \(([0-9]*)\)', runtime_profile)

    # This will fail if the number of impalads != 3
    # The fourth fragment is the "Averaged Fragment"
    assert len(num_row_groups_list) == 4
    assert len(scan_ranges_complete_list) == 4
    assert len(num_rows_read_list) == 4

    total_num_row_groups = 0
    # Skip the Averaged Fragment; it comes first in the runtime profile.
    for num_row_groups in num_row_groups_list[1:]:
      total_num_row_groups += int(num_row_groups)
      if not one_row_group: assert int(num_row_groups) > 0

    if one_row_group:
      # If it's the one row group test, only one scan range should read all the data
      # from that row group.
      assert total_num_row_groups == 1

    for rows_read in num_rows_read_list[1:]:
      if rows_read != '0': assert rows_read == str(rows_in_table)

    for scan_ranges_complete in scan_ranges_complete_list:
      assert int(scan_ranges_complete) == ranges_per_node

  @SkipIfS3.insert
  def test_annotate_utf8_option(self, vector, unique_database):
    if self.exploration_strategy() != 'exhaustive': pytest.skip("Only run in exhaustive")

    # Create table
    TABLE_NAME = "parquet_annotate_utf8_test"
    qualified_table_name = "%s.%s" % (unique_database, TABLE_NAME)
    query = 'create table %s (a string, b char(10), c varchar(10), d string) ' \
        'stored as parquet' % qualified_table_name
    self.client.execute(query)

    # Insert data that should have UTF8 annotation
    query = 'insert overwrite table %s '\
        'values("a", cast("b" as char(10)), cast("c" as varchar(10)), "d")' \
        % qualified_table_name
    self.execute_query(query, {'parquet_annotate_strings_utf8': True})

    def get_schema_elements():
      # Copy the created file to the local filesystem and parse metadata
      local_file = '/tmp/utf8_test_%s.parq' % random.randint(0, 10000)
      LOG.info("test_annotate_utf8_option local file name: " + local_file)
      hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq'
          % (unique_database, TABLE_NAME))
      check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
      metadata = get_parquet_metadata(local_file)

      # Extract SchemaElements corresponding to the table columns
      a_schema_element = metadata.schema[1]
      assert a_schema_element.name == 'a'
      b_schema_element = metadata.schema[2]
      assert b_schema_element.name == 'b'
      c_schema_element = metadata.schema[3]
      assert c_schema_element.name == 'c'
      d_schema_element = metadata.schema[4]
      assert d_schema_element.name == 'd'

      os.remove(local_file)
      return a_schema_element, b_schema_element, c_schema_element, d_schema_element

    # Check that the schema uses the UTF8 annotation
    a_schema_elt, b_schema_elt, c_schema_elt, d_schema_elt = get_schema_elements()
    assert a_schema_elt.converted_type == ConvertedType.UTF8
    assert b_schema_elt.converted_type == ConvertedType.UTF8
    assert c_schema_elt.converted_type == ConvertedType.UTF8
    assert d_schema_elt.converted_type == ConvertedType.UTF8

    # Re-insert the data with the option disabled; plain strings should not get the
    # UTF8 annotation
    self.execute_query(query, {'parquet_annotate_strings_utf8': False})

    # Check that the schema does not use the UTF8 annotation except for CHAR and VARCHAR
    # columns
    a_schema_elt, b_schema_elt, c_schema_elt, d_schema_elt = get_schema_elements()
    assert a_schema_elt.converted_type == None
    assert b_schema_elt.converted_type == ConvertedType.UTF8
    assert c_schema_elt.converted_type == ConvertedType.UTF8
    assert d_schema_elt.converted_type == None

  @SkipIfS3.insert
  def test_resolution_by_name(self, unique_database, vector):
    self.run_test_case('QueryTest/parquet-resolution-by-name', vector,
        use_db=unique_database)

# We use various scan range lengths to exercise corner cases in the HDFS scanner more
# thoroughly. In particular, it will exercise:
# 1. default scan range
# 2. scan range with no tuple
# 3. tuple that spans across multiple scan ranges
# 4. scan range length = 16 for the ParseSse() execution path
MAX_SCAN_RANGE_LENGTHS = [0, 1, 2, 5, 16, 17, 32]

class TestScanRangeLengths(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScanRangeLengths, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(
        TestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS))

  def test_scan_ranges(self, vector):
    vector.get_value('exec_option')['max_scan_range_length'] =\
        vector.get_value('max_scan_range_length')
    self.run_test_case('QueryTest/hdfs-tiny-scan', vector)

# More tests for text scanner
# 1. Test file that ends w/o tuple delimiter
# 2. Test file with escape character
class TestTextScanRangeLengths(ImpalaTestSuite):
  ESCAPE_TABLE_LIST = ["testescape_16_lf", "testescape_16_crlf",
      "testescape_17_lf", "testescape_17_crlf",
      "testescape_32_lf", "testescape_32_crlf"]

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestTextScanRangeLengths, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(
        TestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS))
    cls.TestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'text' and
        v.get_value('table_format').compression_codec == 'none')

  def test_text_scanner(self, vector):
    vector.get_value('exec_option')['max_scan_range_length'] =\
        vector.get_value('max_scan_range_length')
    self.execute_query_expect_success(self.client, "drop stats "
        "functional.table_no_newline_part")
    self.execute_query_expect_success(self.client, "compute stats "
        "functional.table_no_newline_part")
    self.run_test_case('QueryTest/hdfs-text-scan', vector)

    # Test various escape char cases. We have to check the count(*) result against
    # the count(col) result because if the scan range is split right after the escape
    # char, the escape char has no effect because we cannot scan backwards to the
    # previous scan range.
    for t in self.ESCAPE_TABLE_LIST:
      expected_result = self.client.execute("select count(col) from " + t)
      result = self.client.execute("select count(*) from " + t)
      assert result.data == expected_result.data

# Missing Coverage: No coverage for truncated file errors or scans.
@SkipIfS3.hive
@SkipIfIsilon.hive
@SkipIfLocal.hive
@pytest.mark.execute_serially
class TestScanTruncatedFiles(ImpalaTestSuite):
  TEST_DB = 'test_truncated_file'

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScanTruncatedFiles, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())

    # This test takes about a minute to complete due to the Hive commands that are
    # executed. To cut down on runtime, limit the test to exhaustive exploration
    # strategy.
    # TODO: Test other file formats
    if cls.exploration_strategy() == 'exhaustive':
      cls.TestMatrix.add_constraint(lambda v:
          v.get_value('table_format').file_format == 'text' and
          v.get_value('table_format').compression_codec == 'none')
    else:
      cls.TestMatrix.add_constraint(lambda v: False)

  def setup_method(self, method):
    self.cleanup_db(TestScanTruncatedFiles.TEST_DB)
    self.client.execute("create database %s location '%s/%s.db'" %
        (TestScanTruncatedFiles.TEST_DB, WAREHOUSE,
         TestScanTruncatedFiles.TEST_DB))

  def teardown_method(self, method):
    self.cleanup_db(TestScanTruncatedFiles.TEST_DB)

  def test_scan_truncated_file_empty(self, vector):
    self.scan_truncated_file(0)

  def test_scan_truncated_file(self, vector):
    self.scan_truncated_file(10)

  def scan_truncated_file(self, num_rows):
    db_name = TestScanTruncatedFiles.TEST_DB
    tbl_name = "tbl"
    self.execute_query("use %s" % db_name)
    self.execute_query("create table %s (s string)" % tbl_name)
    call(["hive", "-e", "INSERT OVERWRITE TABLE %s.%s SELECT string_col from "
        "functional.alltypes" % (db_name, tbl_name)])

    # Update the Impala metadata
    self.execute_query("refresh %s" % tbl_name)

    # Insert overwrite with a truncated file
    call(["hive", "-e", "INSERT OVERWRITE TABLE %s.%s SELECT string_col from "
        "functional.alltypes limit %s" % (db_name, tbl_name, num_rows)])

    result = self.execute_query("select count(*) from %s" % tbl_name)
    assert(len(result.data) == 1)
    assert(result.data[0] == str(num_rows))