impala/tests/query_test/test_compressed_formats.py

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.

import pytest
from os.path import join
from subprocess import call
from tests.common.test_vector import *
from tests.common.impala_test_suite import ImpalaTestSuite

# Each entry is a (file extension, suffix) pair: the extension is appended to
# the copied data file, the suffix identifies the codec in table names.
compression_formats = [
  ('.bz2', 'bzip'),
  ('.deflate', 'def'),
  ('.gz', 'gzip'),
  ('.snappy', 'snap'),
]

class TestCompressedFormats(ImpalaTestSuite):
  """
  Tests that we support compressed RC and sequence files (see IMPALA-14: Files
  with .gz extension reported as 'not supported') and that unsupported formats
  fail gracefully.
  """
  @classmethod
  def get_workload(cls):
    """Return the name of the workload this suite runs against."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """
    Replace the inherited test matrix with the cross product of file formats
    and compression formats. Under the 'core' exploration strategy every
    vector is filtered out, so the test does not run at all.
    """
    super(TestCompressedFormats, cls).add_test_dimensions()
    cls.TestMatrix.clear()
    cls.TestMatrix.add_dimension(
        TestDimension('file_format', 'rc', 'seq', 'text'))
    cls.TestMatrix.add_dimension(
        TestDimension('compression_format', *compression_formats))
    if cls.exploration_strategy() == 'core':
      # Don't run on core.  This test is very slow and we are unlikely
      # to regress here.
      cls.TestMatrix.add_constraint(lambda v: False)

  @pytest.mark.execute_serially
  def test_compressed_formats(self, vector):
    """
    Copy a data file of the vector's file format to a name carrying the
    vector's compression extension and query it. RC and sequence files are
    expected to be readable; text files are expected to be rejected.
    """
    file_format = vector.get_value('file_format')
    extension, suffix = vector.get_value('compression_format')
    if file_format in ('rc', 'seq'):
      # Test that compressed RC/sequence files are supported
      db_suffix = '_%s_%s' % (file_format, suffix)
      self.__copy_and_query_compressed_file(
        'tinytable', db_suffix, suffix, '000000_0', extension)

    # Compare by value: identity ('is') on strings only works by accident of
    # interning.
    elif file_format == 'text':
      # Test that compressed text files (or at least text files with a
      # compressed extension) fail.
      db_suffix = ""
      self.__copy_and_query_compressed_file(
        'tinytable', db_suffix, suffix, 'data.csv', extension,
        'Compressed text files are not supported')

    else:
      assert False, "Unknown file_format: %s" % file_format


  # TODO: switch to using hive metastore API rather than hive shell.
  def __copy_and_query_compressed_file(self, table_name, db_suffix, compression_codec,
                                     file_name, extension, expected_error=None):
    """
    Create a copy of a table whose data file carries a compression extension
    and run 'select count(*)' against it.

    table_name:        name of the source table inside the functional database.
    db_suffix:         suffix appended to 'functional' to form the source
                       database name and to table_name to form the source
                       directory under /test-warehouse.
    compression_codec: codec suffix used to build a distinct destination
                       table name.
    file_name:         name of the data file inside the source table directory.
    extension:         file extension appended to the copied file's name.
    expected_error:    if None the query must succeed; otherwise the query must
                       fail with an error message containing this text.

    The destination table is dropped again whether or not the check passes.
    """
    # We want to create a test table with a compressed file that has a file
    # extension. We'll do this by making a copy of an existing table with hive.
    base_dir = '/test-warehouse'
    src_table = 'functional%s.%s' % (db_suffix, table_name)
    src_table_dir = join(base_dir, "%s%s" % (table_name, db_suffix))
    src_file = join(src_table_dir, file_name)

    # Always put the codec in the destination table name so that a unique
    # table is created for each compression format.
    dest_table = '%s_%s_copy' % (table_name, compression_codec)
    # NOTE(review): assumes hive places the new table's data directly under
    # base_dir in a directory named after the table - confirm.
    dest_table_dir = join(base_dir, dest_table)
    dest_file = join(dest_table_dir, file_name + extension)

    drop_cmd = 'DROP TABLE IF EXISTS %s;' % (dest_table)
    hive_cmd = drop_cmd + 'CREATE TABLE %s LIKE %s;' % (dest_table, src_table)

    # Create the table and copy the data file in. The exit codes of these
    # commands are not checked.
    call(["hive", "-e", hive_cmd])
    call(["hadoop", "fs", "-cp", src_file, dest_file])
    # Try to read the compressed file with extension
    query = 'select count(*) from %s' % dest_table
    try:
      # Need to invalidate the metadata because the table was created external to Impala.
      self.client.execute("invalidate metadata %s" % dest_table)
      self.execute_scalar(query)
    except Exception as e:
      error_msg = str(e)
      print(error_msg)
      if expected_error is None or expected_error not in error_msg:
        print("Unexpected error:\n%s" % error_msg)
        raise
    else:
      # Fail iff we expected an error. This lives outside the try body so the
      # handler above can never swallow the assertion.
      assert expected_error is None, 'Query is expected to fail'
    finally:
      call(["hive", "-e", drop_cmd])