mirror of
https://github.com/apache/impala.git
synced 2026-01-08 12:02:54 -05:00
This change adds support for faster DDL via the CatalogServer by directly returning the TCatalogObject from each catalog operation and using this result to update the local impalad's catalog cache directly, rather than waiting for a state store heartbeat that contains the change. Because the Impalad's catalog can now be updated in two ways, we need to be careful when applying updates to ensure no work gets "undone". For example, consider the following sequence of events: t1: [Direct Update] - Add item A - (Catalog Version 9) t2: [Direct Update] - Drop item A - (Catalog Version 10) t3: [StateStore Update] - (From Catalog Version 9) In this case, we need to ensure that the state store update in t3 does not undo the drop in t2, even though that update will contain the change to "add item A". To support this, we now check the catalog versions before adding any item to ensure that an existing item does not overwrite an item with a newer catalog version. To handle the case of removals, a new CatalogUpdateLog is introduced. This log tracks the catalog version at which each item was removed from the catalog. When adding a new catalog object, it is checked to see if this object was removed in a catalog version greater than the version of the current object. If so, the update is ignored. This covers most updates, but there is still one concurrency issue that is not covered with this change. If someone issues an "invalidate metadata" concurrently with a direct catalog operation, it may briefly set the catalog back in time. This seems like okay behavior to me (the command is invalidating the catalog metadata). If we want to address this, the CatalogUpdateLog could be extended to track additions to the catalog and we could replay the log after invalidating the metadata (as one possible solution). Change-Id: Icc9bdecc3c32436708bf9e9e7974f91d40e514f2 Reviewed-on: http://gerrit.ent.cloudera.com:8080/864 Tested-by: jenkins Reviewed-by: Lenni Kuff <lskuff@cloudera.com>
91 lines
3.5 KiB
Python
91 lines
3.5 KiB
Python
#!/usr/bin/env python
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
# Impala tests for column statistics
|
|
|
|
import logging
|
|
import pytest
|
|
import shlex
|
|
from tests.common.test_result_verifier import *
|
|
from subprocess import call
|
|
from tests.common.test_vector import *
|
|
from tests.common.impala_test_suite import *
|
|
|
|
TEST_DB = 'colstats_test_db'
|
|
|
|
# End-to-end validation of Impala column stats usage.
|
|
class TestColStats(ImpalaTestSuite):
    """End-to-end tests for how Impala uses column statistics.

    Each test runs against the scratch database named by TEST_DB, which is
    cleaned up both before and after every test method.
    """

    @classmethod
    def get_workload(cls):
        """Return the name of the workload these tests run under."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Constrain the test matrix to text-format, uncompressed tables."""
        super(TestColStats, cls).add_test_dimensions()
        # There is no reason to run these tests using all dimensions.
        cls.TestMatrix.add_constraint(
            lambda v: (v.get_value('table_format').file_format == 'text' and
                       v.get_value('table_format').compression_codec == 'none'))

    def setup_method(self, method):
        """Clean up the test database before each test method runs."""
        self.__cleanup()

    def teardown_method(self, method):
        """Clean up the test database after each test method runs."""
        self.__cleanup()

    def __cleanup(self):
        """Hand TEST_DB to the suite's cleanup_db helper."""
        self.cleanup_db(TEST_DB)

    def test_incompatible_col_stats(self, vector):
        """Check a table stays queryable when its column stats mismatch its type.

        Stats are computed for a string column, the column is then altered to
        int, and the table is queried both before and after the stats are
        recomputed. Regression test for IMPALA-588.
        """
        # Create a test database.
        self.client.execute("create database " + TEST_DB)
        self.client.execute("use " + TEST_DB)

        # Create a table with a string column and populate it with some data.
        self.client.execute("create table badstats(s string)")
        self.client.execute(
            "insert into table badstats select cast(int_col as string) "
            "from functional.alltypes limit 10")

        # Compute stats for this table, they will be for the string column type.
        self.__compute_table_stats(TEST_DB, 'badstats')
        self.client.execute("refresh badstats")

        # Change the column type to int which will cause a mismatch between the
        # column stats data and the column type metadata.
        self.client.execute("alter table badstats change s s int")
        # Should still be able to query the table.
        result = self.client.execute("select s from badstats")
        assert len(result.data) == 10

        # Recompute stats with the new column type. Impala should now have stats
        # for this column and should be able to access the table.
        # TODO: Currently this just verifies Impala can query the table, it does
        # not verify the stats are there or correct. Expand the verification once
        # Impala has a mechanism to expose this metadata.
        self.__compute_table_stats(TEST_DB, 'badstats')
        self.client.execute("refresh badstats")
        result = self.client.execute("select s from badstats")
        assert len(result.data) == 10

    def __compute_table_stats(self, db_name, table_name):
        """Run the compute_table_stats.py utility script for a single table.

        The script is looked up under $IMPALA_HOME/tests/util and invoked with
        --db_names and --table_names set to the given values.

        Args:
            db_name: Value passed to the script's --db_names option.
            table_name: Value passed to the script's --table_names option.

        Raises:
            KeyError: If the IMPALA_HOME environment variable is not set.
            AssertionError: If the script exits with a non-zero status.
        """
        # Imported locally so this helper does not depend on 'os' leaking into
        # the module namespace through one of the file's wildcard imports.
        import os

        compute_stats_script = os.path.join(
            os.environ['IMPALA_HOME'], 'tests/util/compute_table_stats.py')
        rval = call([compute_stats_script,
                     '--db_names=' + db_name, '--table_names=' + table_name])
        assert rval == 0, 'Compute table stats failed on: %s.%s' % (db_name, table_name)
|