mirror of
https://github.com/apache/impala.git
synced 2026-01-04 09:00:56 -05:00
The Impala CatalogService manages the caching and dissemination of cluster-wide metadata. The CatalogService combines the metadata from the Hive Metastore, the NameNode, and potentially additional sources in the future. The CatalogService uses the StateStore to broadcast metadata updates across the cluster. The CatalogService also directly handles executing metadata update requests from impalad servers (DDL requests). It exposes a Thrift interface to allow impalads to connect directly and execute their DDL operations. The CatalogService has two main components - a C++ server that implements StateStore integration, Thrift service implementation, and exporting of the debug webpage/metrics. The other main component is the Java Catalog that manages caching and updating of all the metadata. For each StateStore heartbeat, a delta of all metadata updates is broadcast to the rest of the cluster. Some Notes On the Changes --- * The metadata is all sent as thrift structs. To do this all catalog objects (Tables/Views, Databases, UDFs) have a thrift struct to represent them. These are sent with each statestore delta update. * The existing Catalog class has been separated into two separate sub-classes. An ImpaladCatalog and a CatalogServiceCatalog. See the comments on those classes for more details. What is working: * New CatalogService created * Working with statestore delta updates and latest UDF changes * DDL performed on Node 1 is now visible on all other nodes without a "refresh". * Each DDL operation against the Catalog Service will return the catalog version that contains the change. An impalad will wait for the statestore heartbeat that contains this version before returning from the DDL command. 
* All table types (Hbase, Hdfs, Views) getting their metadata propagated properly * Block location information included in CS updates and used by Impalads * Column and table stats included in CS updates and used by Impalads * Query tests are all passing Still TODO: * Directly return catalog object metadata from DDL requests * Poll the Hive Metastore to detect new/dropped/modified tables * Reorganize the FE code for the Catalog Service. I don't think we want everything in the same JAR. Change-Id: I8c61296dac28fb98bcfdc17361f4f141d3977eda Reviewed-on: http://gerrit.ent.cloudera.com:8080/601 Reviewed-by: Lenni Kuff <lskuff@cloudera.com> Tested-by: Lenni Kuff <lskuff@cloudera.com>
295 lines
12 KiB
Python
295 lines
12 KiB
Python
#!/usr/bin/env python
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import shlex
|
|
from subprocess import call
|
|
from tests.common.test_vector import *
|
|
from tests.common.impala_test_suite import *
|
|
|
|
# The purpose of view compatibility testing is to check whether views created in Hive
|
|
# can be queried in Impala and vice versa. A test typically consists of
|
|
# the following actions specified in different test sections.
|
|
# 1. create a view with a certain definition using Hive and Impala
|
|
# 2. explain a "select *" query on the view created by Hive using Hive and Impala
|
|
# 3. explain a "select *" query on the view created by Impala using Hive and Impala
|
|
# For each of the steps above its corresponding test section specifies our expectations
|
|
# on whether Impala and Hive will succeed or fail.
|
|
#
|
|
# Impala and Hive's SQL dialects are not fully compatible. We intentionally rely
|
|
# on the view creation mechanism instead of just testing various SQL statements in
|
|
# Impala and Hive, because view creation transforms the original view definition into
|
|
# a so-called "extended view definition". As this process of transformation could
|
|
# potentially change in Impala and/or Hive simply testing various SQL statements
|
|
# in Impala and Hive would be insufficient.
|
|
class TestViewCompatibility(ImpalaTestSuite):
    """Checks whether views created in Hive can be queried in Impala and vice versa.

    For every test case the same view is created once through Hive and once
    through Impala. A "select *" on each view is then explained with both
    engines, and the observed success/failure is compared against the
    expectations given in the test file.
    """

    TEST_DB_NAME = "view_compat_test_db"
    VALID_SECTION_NAMES = ["CREATE_VIEW", "CREATE_VIEW_RESULTS",
                           "QUERY_HIVE_VIEW_RESULTS", "QUERY_IMPALA_VIEW_RESULTS"]

    @classmethod
    def get_workload(cls):
        """Return the name of the workload whose test files are loaded."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Restrict the test matrix to a single uncompressed-text run."""
        super(TestViewCompatibility, cls).add_test_dimensions()
        # don't use any exec options, running exactly once is fine
        cls.TestMatrix.clear_dimension('exec_option')
        # There is no reason to run these tests using all dimensions.
        cls.TestMatrix.add_constraint(
            lambda v: v.get_value('table_format').file_format == 'text' and
            v.get_value('table_format').compression_codec == 'none')

        if cls.exploration_strategy() == 'core':
            # Don't run on core. This test is very slow and we are unlikely
            # to regress here.
            cls.TestMatrix.add_constraint(lambda v: False)

    def setup_method(self, method):
        """Clean up and create a fresh test database before each test."""
        self.cleanup_db(self.TEST_DB_NAME)
        self.client.refresh()
        self.execute_query("create database %s" % (self.TEST_DB_NAME))

    def teardown_method(self, method):
        """Clean up the test database after each test."""
        self.cleanup_db(self.TEST_DB_NAME)

    @pytest.mark.execute_serially
    def test_view_compatibility(self, vector):
        self.__run_view_compat_test_case('QueryTest/views-compatibility', vector)

    def __run_view_compat_test_case(self, test_file_name, vector):
        """
        Runs a view-compatibility test file, containing the following sections:

        ---- CREATE_VIEW
        contains a view creation statement to be executed in Impala and Hive
        ---- CREATE_VIEW_RESULTS
        whether we expect the view creation in Impala/Hive to succeed/fail
        ---- QUERY_HIVE_VIEW_RESULTS
        whether we expect to be able to query the view created by Hive in Hive/Impala
        ---- QUERY_IMPALA_VIEW_RESULTS
        whether we expect to be able to query the view created by Impala in Hive/Impala
        """
        sections = self.load_query_test_file(self.get_workload(), test_file_name,
                                             self.VALID_SECTION_NAMES)

        for test_section in sections:
            self.client.refresh()

            # validate the test
            test_case = ViewCompatTestCase(test_section, test_file_name,
                                           self.TEST_DB_NAME)
            hive_create_sql = test_case.get_create_view_sql('HIVE')
            impala_create_sql = test_case.get_create_view_sql('IMPALA')

            # create views in Hive and Impala checking against the expected results
            create_exp_res = test_case.get_create_exp_res()
            self.__exec_in_hive(hive_create_sql, hive_create_sql, create_exp_res)
            self.__exec_in_impala(impala_create_sql, impala_create_sql,
                                  create_exp_res)
            self.client.refresh()

            # explain a simple query on the view created by Hive in Hive and Impala
            if test_case.has_query_hive_section():
                self.__query_view(test_case, 'HIVE')

            # explain a simple query on the view created by Impala in Hive and Impala
            if test_case.has_query_impala_section():
                self.__query_view(test_case, 'IMPALA')

            # drop the views without checking success or failure
            self.__exec_in_hive(test_case.get_drop_view_sql('HIVE'),
                                hive_create_sql, None)
            self.__exec_in_impala(test_case.get_drop_view_sql('IMPALA'),
                                  impala_create_sql, None)
            self.client.refresh()

    def __query_view(self, test_case, creator):
        """Explain a query on the view created by `creator` in each listed engine.

        Only engines that have an entry in the expected-results map of the
        corresponding query section are exercised.
        """
        exp_res = test_case.get_query_exp_res(creator)
        query_sql = test_case.get_query_view_sql(creator)
        create_sql = test_case.get_create_view_sql(creator)
        if 'HIVE' in exp_res:
            self.__exec_in_hive(query_sql, create_sql, exp_res)
        if 'IMPALA' in exp_res:
            self.__exec_in_impala(query_sql, create_sql, exp_res)

    def __exec_in_hive(self, sql_str, create_view_sql, exp_res):
        """Run sql_str through the `hive -e` command line and check the outcome.

        A zero exit status of the hive process counts as success.
        """
        hive_ret = call(['hive', '-e', sql_str])
        self.__cmp_expected(sql_str, create_view_sql, exp_res, "HIVE", hive_ret == 0)

    def __exec_in_impala(self, sql_str, create_view_sql, exp_res):
        """Run sql_str through self.execute_query and check the outcome."""
        try:
            success = self.execute_query(sql_str).success
        except Exception:
            # Any query error counts as a failed execution. Exception (rather
            # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
            success = False
        self.__cmp_expected(sql_str, create_view_sql, exp_res, "IMPALA", success)

    def __cmp_expected(self, sql_str, create_view_sql, exp_res, engine, success):
        """Fail the test if `success` contradicts the expectation for `engine`.

        Nothing is checked when exp_res is None or has no entry for `engine`.
        Raises AssertionError on a mismatch.
        """
        if exp_res is None or engine not in exp_res:
            return
        expected = exp_res[engine]
        if expected and not success:
            raise AssertionError(
                '%s failed to execute\n%s\nwhile testing a view created as\n%s'
                % (engine, sql_str, create_view_sql))
        if not expected and success:
            raise AssertionError(
                '%s unexpectedly succeeded in executing\n%s\nwhile testing '
                'a view created as\n%s' % (engine, sql_str, create_view_sql))
|
|
|
|
# Represents one view-compatibility test case. Performs validation of the test sections
|
|
# and provides SQL to execute for each section.
|
|
class ViewCompatTestCase(object):
    """Represents one view-compatibility test case.

    Validates the test sections and provides the SQL to execute for each
    section. Validation problems are reported by raising AssertionError.
    """

    RESULT_KEYS = ["IMPALA", "HIVE"]

    def __init__(self, test_section, test_file_name, test_db_name):
        """Validate test_section and derive all SQL statements of the test case.

        test_section maps section names to their text, test_file_name is only
        used in error messages, and test_db_name is prepended to the view names.
        Raises AssertionError if a required section is missing or malformed.
        """
        if 'CREATE_VIEW' not in test_section:
            raise self.__missing_section(test_file_name, test_section,
                                         'CREATE_VIEW')

        # get map of expected results from test sections
        if 'CREATE_VIEW_RESULTS' not in test_section:
            raise self.__missing_section(test_file_name, test_section,
                                         'CREATE_VIEW_RESULTS')
        self.create_exp_res = \
            self.__get_expected_results(test_section['CREATE_VIEW_RESULTS'])

        self.query_hive_exp_res = None
        if 'QUERY_HIVE_VIEW_RESULTS' in test_section:
            self.query_hive_exp_res = \
                self.__get_expected_results(test_section['QUERY_HIVE_VIEW_RESULTS'])

        self.query_impala_exp_res = None
        if 'QUERY_IMPALA_VIEW_RESULTS' in test_section:
            self.query_impala_exp_res = \
                self.__get_expected_results(test_section['QUERY_IMPALA_VIEW_RESULTS'])

        if self.query_hive_exp_res is None and self.query_impala_exp_res is None:
            raise self.__missing_section(
                test_file_name, test_section,
                'QUERY_HIVE_VIEW_RESULTS or QUERY_IMPALA_VIEW_RESULTS')

        # clean test section, remove comments etc.
        self.create_view_sql = \
            QueryTestSectionReader.build_query(test_section['CREATE_VIEW'])

        view_name = self.__get_view_name(self.create_view_sql)
        if '.' in view_name:
            raise AssertionError(
                'Error in test file %s. Found unexpected view name %s that is '
                'qualified with a database' % (test_file_name, view_name))

        # add db prefix and suffixes to indicate which engine created the view
        self.hive_view_name = test_db_name + '.' + view_name + '_hive'
        self.impala_view_name = test_db_name + '.' + view_name + '_impala'

        self.hive_create_view_sql = self.__qualify_view_name(
            self.create_view_sql, view_name, self.hive_view_name)
        self.impala_create_view_sql = self.__qualify_view_name(
            self.create_view_sql, view_name, self.impala_view_name)

        # SQL to explain a simple query on the view created by Hive in Hive and
        # Impala; stays None when the test case has no such section.
        self.query_hive_view_sql = None
        if self.query_hive_exp_res is not None:
            self.query_hive_view_sql = \
                'explain select * from %s' % (self.hive_view_name)

        # SQL to explain a simple query on the view created by Impala in Hive and
        # Impala; stays None when the test case has no such section.
        self.query_impala_view_sql = None
        if self.query_impala_exp_res is not None:
            self.query_impala_view_sql = \
                'explain select * from %s' % (self.impala_view_name)

        self.drop_hive_view_sql = "drop view %s" % (self.hive_view_name)
        self.drop_impala_view_sql = "drop view %s" % (self.impala_view_name)

    @staticmethod
    def __missing_section(test_file_name, test_section, requirement):
        """Build the AssertionError reported for a missing required section."""
        return AssertionError(
            'Error in test file %s. Test cases require a %s section.\n%s'
            % (test_file_name, requirement, pprint.pformat(test_section)))

    @staticmethod
    def __qualify_view_name(create_view_sql, view_name, qualified_name):
        """Return create_view_sql with the declared view name replaced.

        The name is replaced where it follows "create view [if not exists]", so
        a short name such as "v" cannot hit an earlier substring (e.g. the "v"
        of the keyword "view"). If that preamble is not found, the first
        occurrence of view_name is replaced.
        """
        import re
        pattern = (r'(\bcreate\s+view\s+(?:if\s+not\s+exists\s+)?)'
                   + re.escape(view_name))
        match = re.search(pattern, create_view_sql, re.IGNORECASE)
        if match is None:
            return create_view_sql.replace(view_name, qualified_name, 1)
        return (create_view_sql[:match.end(1)] + qualified_name
                + create_view_sql[match.end():])

    def __get_view_name(self, create_view_sql):
        """Extract the view name from a CREATE VIEW [IF NOT EXISTS] statement.

        Raises AssertionError if the statement does not have that shape.
        """
        lexer = shlex.shlex(create_view_sql)
        # Keep "db.view" as a single token so that callers can detect (and
        # reject) database-qualified names instead of seeing only "db".
        lexer.wordchars += '.'
        tokens = list(lexer)
        invalid_msg = \
            'Error in test. Invalid CREATE VIEW statement: %s' % (create_view_sql)

        # sanity check the create view statement
        if len(tokens) < 3:
            raise AssertionError(invalid_msg)
        if tokens[0].lower() != "create" or tokens[1].lower() != "view":
            raise AssertionError(invalid_msg)

        if tokens[2].lower() != "if":
            # expect a create view view_name ...
            return tokens[2]

        # expect an "if not exists" clause followed by the view name
        if len(tokens) < 6 or \
                [tok.lower() for tok in tokens[3:5]] != ["not", "exists"]:
            raise AssertionError(invalid_msg)
        return tokens[5]

    def __get_expected_results(self, section_text):
        """Parse "<ENGINE>=<SUCCESS|FAILURE>" lines into a dict of booleans.

        A value equal to "success" (case-insensitive) maps to True, anything
        else to False. Blank lines are ignored and whitespace around keys and
        values is stripped. Raises AssertionError if the section contains
        neither an IMPALA nor a HIVE entry.
        """
        exp_res = dict()
        for line in section_text.splitlines():
            line = line.strip()
            if not line:
                continue
            key, _, value = line.partition("=")
            exp_res[key.strip()] = value.strip().lower() == 'success'

        # check that the results section contains at least one entry
        if not any(key in exp_res for key in self.RESULT_KEYS):
            raise AssertionError('No valid entry in expected-results section. '
                                 'Expected an IMPALA or HIVE entry.')
        return exp_res

    @staticmethod
    def __for_engine(engine, hive_value, impala_value):
        """Return the value belonging to `engine` ("HIVE"/"IMPALA", any case).

        Raises AssertionError for any other engine name.
        """
        engine = engine.upper()
        if engine == "HIVE":
            return hive_value
        if engine == "IMPALA":
            return impala_value
        raise AssertionError("Unknown execution engine %s" % (engine))

    def get_create_view_sql(self, engine):
        """Return the CREATE VIEW statement for the view created by `engine`."""
        return self.__for_engine(engine, self.hive_create_view_sql,
                                 self.impala_create_view_sql)

    def get_create_exp_res(self):
        """Return the expected-results map of the CREATE_VIEW_RESULTS section."""
        return self.create_exp_res

    def get_drop_view_sql(self, engine):
        """Return the DROP VIEW statement for the view created by `engine`."""
        return self.__for_engine(engine, self.drop_hive_view_sql,
                                 self.drop_impala_view_sql)

    def get_query_exp_res(self, engine):
        """Return the expected-results map for querying the view created by
        `engine`, or None if the test case has no such section."""
        return self.__for_engine(engine, self.query_hive_exp_res,
                                 self.query_impala_exp_res)

    def get_query_view_sql(self, engine):
        """Return the explain statement for the view created by `engine`, or
        None if the test case has no such section."""
        return self.__for_engine(engine, self.query_hive_view_sql,
                                 self.query_impala_view_sql)

    def has_query_hive_section(self):
        """Return True if the test case has a QUERY_HIVE_VIEW_RESULTS section."""
        return getattr(self, 'query_hive_view_sql', None) is not None

    def has_query_impala_section(self):
        """Return True if the test case has a QUERY_IMPALA_VIEW_RESULTS section."""
        return getattr(self, 'query_impala_view_sql', None) is not None
|