Files
impala/tests/query_test/test_views_compatibility.py
Lenni Kuff b4f5c1edcf Enable lazy loading of table metadata for the CatalogService/Impalad
This change adds support for lazy loading of table metadata to the
CatalogService/Impalad. The way this works is that the CatalogService initially
sends out an update with only the databases and table names (wrapped as
IncompleteTables). When an Impalad encounters one of these tables, it will contact
the catalog service to get the metadata, possibly triggering a metadata load if the
catalog server has not yet loaded this table.

With these changes the catalog server starts up in just seconds, even for large
metastores since it only needs to call into the metastore to get the list of tables
and databases. The performance of "invalidate metadata" also improves for the same reason.

I also picked up the catalog cleanup patch I had to make the APIs a bit more consistent and
remove the need for using a LoadingCache for databases.

This also fixes up the FE tests to run in a more realistic fashion. The FE tests now run
against catalog objects received from the catalog server. This actually turned up some bugs
in our previous test configuration where we were not running with the correct column stats
(we were always running with avgSerializedSize = slotSize).  This changed some plans so the
planner tests needed to be updated.

Still TODO:
This does not include the changes to perform background metadata loading. I will send
that out as a separate patch on top of this.

Change-Id: Ied16f8a7f3a3393e89d6bfea78f0ba708d0ddd0e

Saving changes

Change-Id: I48c34408826b7396004177f5fc61a9523e664acc
Reviewed-on: http://gerrit.ent.cloudera.com:8080/1328
Tested-by: jenkins
Reviewed-by: Lenni Kuff <lskuff@cloudera.com>
Reviewed-on: http://gerrit.ent.cloudera.com:8080/1338
Tested-by: Lenni Kuff <lskuff@cloudera.com>
2014-01-21 21:43:29 -08:00

290 lines
12 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shlex
from subprocess import call
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
# The purpose of view compatibility testing is to check whether views created in Hive
# can be queried in Impala and vice versa. A test typically consists of
# the following actions specified in different test sections.
# 1. create a view with a certain definition using Hive and Impala
# 2. explain a "select *" query on the view created by Hive using Hive and Impala
# 3. explain a "select *" query on the view created by Impala using Hive and Impala
# For each of the steps above its corresponding test section specifies our expectations
# on whether Impala and Hive will succeed or fail.
#
# Impala and Hive's SQL dialects are not fully compatible. We intentionally rely
# on the view creation mechanism instead of just testing various SQL statements in
# Impala and Hive, because view creation transforms the original view definition into
# a so-called "extended view definition". As this process of transformation could
# potentially change in Impala and/or Hive, simply testing various SQL statements
# in Impala and Hive would be insufficient.
class TestViewCompatibility(ImpalaTestSuite):
  """Verifies that views created in Hive can be queried in Impala and vice versa.

  For every section of the test file the same view definition is created through
  Hive and through Impala, a "select *" on each resulting view is explained in both
  engines, and the observed success/failure is compared with the expectations
  recorded in the test file.
  """
  TEST_DB_NAME = "view_compat_test_db"
  VALID_SECTION_NAMES = ["CREATE_VIEW", "CREATE_VIEW_RESULTS",
                         "QUERY_HIVE_VIEW_RESULTS", "QUERY_IMPALA_VIEW_RESULTS"]

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload whose test files this suite loads."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Restricts the test matrix to uncompressed text tables without exec options."""
    super(TestViewCompatibility, cls).add_test_dimensions()
    # don't use any exec options, running exactly once is fine
    cls.TestMatrix.clear_dimension('exec_option')
    # There is no reason to run these tests using all dimensions.
    cls.TestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'text' and
        v.get_value('table_format').compression_codec == 'none')

  def setup_method(self, method):
    """Recreates the test database before each test method."""
    # cleanup and create a fresh test database
    self.cleanup_db(self.TEST_DB_NAME)
    # NOTE(review): presumably reloads Impala's view of the metastore - confirm.
    self.client.refresh()
    self.execute_query("create database %s" % (self.TEST_DB_NAME))

  def teardown_method(self, method):
    """Cleans up the test database after each test method."""
    self.cleanup_db(self.TEST_DB_NAME)

  @pytest.mark.execute_serially
  def test_view_compatibility(self, vector):
    """Runs every test case in the QueryTest/views-compatibility test file."""
    self.__run_view_compat_test_case('QueryTest/views-compatibility', vector)

  def __run_view_compat_test_case(self, test_file_name, vector):
    """
    Runs a view-compatibility test file, containing the following sections:
    ---- CREATE_VIEW
    contains a view creation statement to be executed in Impala and Hive
    ---- CREATE_VIEW_RESULTS
    whether we expect the view creation in Impala/Hive to succeed/fail
    ---- QUERY_HIVE_VIEW_RESULTS
    whether we expect to be able to query the view created by Hive in Hive/Impala
    ---- QUERY_IMPALA_VIEW_RESULTS
    whether we expect to be able to query the view created by Impala in Hive/Impala

    The vector argument is accepted but not used by this method.
    """
    sections = self.load_query_test_file(self.get_workload(), test_file_name,
                                         self.VALID_SECTION_NAMES)
    for test_section in sections:
      self.client.refresh()
      # validate the test
      test_case = ViewCompatTestCase(test_section, test_file_name, self.TEST_DB_NAME)
      hive_create_sql = test_case.get_create_view_sql('HIVE')
      impala_create_sql = test_case.get_create_view_sql('IMPALA')

      # create views in Hive and Impala checking against the expected results
      create_exp_res = test_case.get_create_exp_res()
      self.__exec_in_hive(hive_create_sql, hive_create_sql, create_exp_res)
      self.__exec_in_impala(impala_create_sql, impala_create_sql, create_exp_res)
      self.client.refresh()

      # explain a simple query on the view created by Hive, then on the view created
      # by Impala; each one is run in every engine that has an expectation recorded
      views_to_query = (('HIVE', test_case.has_query_hive_section()),
                        ('IMPALA', test_case.has_query_impala_section()))
      for view_creator, has_section in views_to_query:
        if not has_section:
          continue
        exp_res = test_case.get_query_exp_res(view_creator)
        query_sql = test_case.get_query_view_sql(view_creator)
        create_sql = test_case.get_create_view_sql(view_creator)
        if 'HIVE' in exp_res:
          self.__exec_in_hive(query_sql, create_sql, exp_res)
        if 'IMPALA' in exp_res:
          self.__exec_in_impala(query_sql, create_sql, exp_res)

      # drop the views without checking success or failure
      self.__exec_in_hive(test_case.get_drop_view_sql('HIVE'), hive_create_sql, None)
      self.__exec_in_impala(test_case.get_drop_view_sql('IMPALA'), impala_create_sql,
                            None)
      self.client.refresh()

  def __exec_in_hive(self, sql_str, create_view_sql, exp_res):
    """Runs sql_str via 'hive -e' and compares the outcome with exp_res.

    A zero exit status of the hive command counts as success. create_view_sql is
    only used for the failure message. exp_res may be None to skip the comparison.
    """
    hive_ret = call(['hive', '-e', sql_str])
    self.__cmp_expected(sql_str, create_view_sql, exp_res, "HIVE", hive_ret == 0)

  def __exec_in_impala(self, sql_str, create_view_sql, exp_res):
    """Runs sql_str through execute_query() and compares the outcome with exp_res.

    create_view_sql is only used for the failure message. exp_res may be None to
    skip the comparison.
    """
    try:
      success = self.execute_query(sql_str).success
    except Exception:
      # Any error raised by the query counts as a failed statement. Exception (not a
      # bare except) so that Ctrl-C and interpreter shutdown still propagate.
      success = False
    self.__cmp_expected(sql_str, create_view_sql, exp_res, "IMPALA", success)

  def __cmp_expected(self, sql_str, create_view_sql, exp_res, engine, success):
    """Raises AssertionError if success contradicts the expectation for engine.

    exp_res maps engine names to True (expected to succeed) or False (expected to
    fail); None disables the check. sql_str is the statement that was executed and
    create_view_sql the statement that created the view under test.
    """
    if exp_res is None:
      return
    expected_success = exp_res[engine]
    if expected_success and not success:
      raise AssertionError(
          '%s failed to execute\n%s\nwhile testing a view created as\n%s'
          % (engine, sql_str, create_view_sql))
    if not expected_success and success:
      # Report the executed statement first and the view definition second, matching
      # the wording of the message.
      raise AssertionError(
          '%s unexpectedly succeeded in executing\n%s\nwhile testing '
          'a view created as\n%s' % (engine, sql_str, create_view_sql))
# Represents one view-compatibility test case. Performs validation of the test sections
# and provides SQL to execute for each section.
class ViewCompatTestCase(object):
  """One view-compatibility test case built from a single test-file section.

  Validates that the required sections are present, parses the expected per-engine
  results and derives the create, explain-query and drop statements for the view
  created by Hive and the view created by Impala.
  """
  RESULT_KEYS = ["IMPALA", "HIVE"]

  def __init__(self, test_section, test_file_name, test_db_name):
    """Validates test_section and prepares the SQL for both engines.

    test_section: mapping from section name to section text. CREATE_VIEW and
        CREATE_VIEW_RESULTS are required, as is at least one of
        QUERY_HIVE_VIEW_RESULTS and QUERY_IMPALA_VIEW_RESULTS.
    test_file_name: name of the test file, used in error messages only.
    test_db_name: database the generated view names are qualified with.

    Raises AssertionError if the section is incomplete or malformed.
    """
    if 'CREATE_VIEW' not in test_section:
      raise self.__missing_section(test_file_name, 'CREATE_VIEW', test_section)
    if 'CREATE_VIEW_RESULTS' not in test_section:
      raise self.__missing_section(test_file_name, 'CREATE_VIEW_RESULTS', test_section)

    # get maps of expected results from the test sections
    self.create_exp_res =\
        self.__get_expected_results(test_section['CREATE_VIEW_RESULTS'])
    self.query_hive_exp_res = None
    if 'QUERY_HIVE_VIEW_RESULTS' in test_section:
      self.query_hive_exp_res =\
          self.__get_expected_results(test_section['QUERY_HIVE_VIEW_RESULTS'])
    self.query_impala_exp_res = None
    if 'QUERY_IMPALA_VIEW_RESULTS' in test_section:
      self.query_impala_exp_res =\
          self.__get_expected_results(test_section['QUERY_IMPALA_VIEW_RESULTS'])
    if self.query_hive_exp_res is None and self.query_impala_exp_res is None:
      raise self.__missing_section(
          test_file_name, 'QUERY_HIVE_VIEW_RESULTS or QUERY_IMPALA_VIEW_RESULTS',
          test_section)

    # clean test section, remove comments etc.
    self.create_view_sql = QueryTestSectionReader.build_query(test_section['CREATE_VIEW'])
    view_name, name_offset = self.__locate_view_name(self.create_view_sql)
    if '.' in view_name:
      raise AssertionError(
          'Error in test file %s. Found unexpected view name %s that is '
          'qualified with a database' % (test_file_name, view_name))

    # add db prefix and suffixes to indicate which engine created the view
    self.hive_view_name = test_db_name + '.' + view_name + '_hive'
    self.impala_view_name = test_db_name + '.' + view_name + '_impala'
    # Splice the new name in at the exact position of the view-name token. A plain
    # str.replace() would rewrite the first substring match, which for a view called
    # e.g. "v" or "t" lies inside the CREATE VIEW keywords.
    before_name = self.create_view_sql[:name_offset]
    after_name = self.create_view_sql[name_offset + len(view_name):]
    self.hive_create_view_sql = before_name + self.hive_view_name + after_name
    self.impala_create_view_sql = before_name + self.impala_view_name + after_name

    # SQL to explain a simple query on the view created by Hive in Hive and Impala;
    # stays None when the test has no QUERY_HIVE_VIEW_RESULTS section
    self.query_hive_view_sql = None
    if self.query_hive_exp_res is not None:
      self.query_hive_view_sql = 'explain select * from %s' % (self.hive_view_name)
    # SQL to explain a simple query on the view created by Impala in Hive and Impala;
    # stays None when the test has no QUERY_IMPALA_VIEW_RESULTS section
    self.query_impala_view_sql = None
    if self.query_impala_exp_res is not None:
      self.query_impala_view_sql = 'explain select * from %s' % (self.impala_view_name)

    self.drop_hive_view_sql = "drop view %s" % (self.hive_view_name)
    self.drop_impala_view_sql = "drop view %s" % (self.impala_view_name)

  def __missing_section(self, test_file_name, required, test_section):
    """Builds the AssertionError reporting that a required section is absent."""
    return AssertionError(
        'Error in test file %s. Test cases require a %s section.\n%s'
        % (test_file_name, required, pprint.pformat(test_section)))

  def __locate_view_name(self, create_view_sql):
    """Returns (view_name, offset) for the view named in a CREATE VIEW statement.

    offset is the index in create_view_sql at which the name token starts. A name
    written as db.view is returned with its qualifier ("db.view") so that callers
    can reject it. Raises AssertionError if the statement does not start with
    CREATE VIEW [IF NOT EXISTS] followed by a name.
    """
    tokens = list(shlex.shlex(create_view_sql))
    invalid_msg = 'Error in test. Invalid CREATE VIEW statement: %s' % (create_view_sql)
    # sanity check the create view statement
    if len(tokens) < 3:
      raise AssertionError(invalid_msg)
    if tokens[0].lower() != "create" or tokens[1].lower() != "view":
      raise AssertionError(invalid_msg)
    name_idx = 2
    if tokens[2].lower() == "if":
      # expect an "if not exists" clause followed by the view name
      if len(tokens) < 6 or [t.lower() for t in tokens[3:5]] != ["not", "exists"]:
        raise AssertionError(invalid_msg)
      name_idx = 5

    # Walk over the leading keywords to find where the name token starts.
    offset = 0
    for keyword in tokens[:name_idx]:
      offset = create_view_sql.index(keyword, offset) + len(keyword)
    name_offset = create_view_sql.index(tokens[name_idx], offset)

    # shlex emits '.' as a separate token, so re-assemble "db.view" style names.
    view_name = tokens[name_idx]
    next_idx = name_idx + 1
    while next_idx + 1 < len(tokens) and tokens[next_idx] == '.':
      view_name += '.' + tokens[next_idx + 1]
      next_idx += 2
    return view_name, name_offset

  def __get_view_name(self, create_view_sql):
    """Returns the view name used in the given CREATE VIEW statement."""
    return self.__locate_view_name(create_view_sql)[0]

  def __get_expected_results(self, section_text):
    """Parses "<ENGINE>=<RESULT>" lines into a dict mapping engine to a bool.

    The value is True when RESULT equals "success" (case-insensitive) and False
    otherwise. Surrounding whitespace is ignored and blank lines are skipped.
    Raises AssertionError if neither an IMPALA nor a HIVE entry is present.
    """
    exp_res = dict()
    for line in section_text.splitlines():
      engine, _, outcome = line.partition("=")
      engine = engine.strip()
      if not engine:
        continue
      exp_res[engine] = outcome.strip().lower() == 'success'
    # check that the results section contains at least one entry
    if not any(key in exp_res for key in self.RESULT_KEYS):
      raise AssertionError('No valid entry in expected-results section. '
                           'Expected an IMPALA or HIVE entry.')
    return exp_res

  def __select(self, engine, hive_value, impala_value):
    """Returns the value belonging to engine ('HIVE' or 'IMPALA', any case)."""
    engine = engine.upper()
    if engine == "HIVE":
      return hive_value
    if engine == "IMPALA":
      return impala_value
    raise AssertionError("Unknown execution engine %s" % (engine))

  def get_create_view_sql(self, engine):
    """Returns the CREATE VIEW statement that creates the view in engine."""
    return self.__select(engine, self.hive_create_view_sql, self.impala_create_view_sql)

  def get_create_exp_res(self):
    """Returns the expected per-engine results of the view creation."""
    return self.create_exp_res

  def get_drop_view_sql(self, engine):
    """Returns the DROP VIEW statement for the view created by engine."""
    return self.__select(engine, self.drop_hive_view_sql, self.drop_impala_view_sql)

  def get_query_exp_res(self, engine):
    """Returns the expected results of querying the view created by engine.

    The result is None if the test has no corresponding results section.
    """
    return self.__select(engine, self.query_hive_exp_res, self.query_impala_exp_res)

  def get_query_view_sql(self, engine):
    """Returns the explain statement on the view created by engine.

    The result is None if the test has no corresponding results section.
    """
    return self.__select(engine, self.query_hive_view_sql, self.query_impala_view_sql)

  def has_query_hive_section(self):
    """Returns True if the view created by Hive should be queried."""
    return self.query_hive_view_sql is not None

  def has_query_impala_section(self):
    """Returns True if the view created by Impala should be queried."""
    return self.query_impala_view_sql is not None