impala/tests/query_test/test_col_stats.py
Lenni Kuff 35817f6a17 Support faster DDL operations via the CatalogServer
This change adds support for faster DDL via the CatalogServer by directly
returning the TCatalogObject from each catalog operation and using that result
to update the local impalad's catalog cache, rather than waiting
for a state store heartbeat that contains the change.
Because the impalad's catalog can now be updated in two ways, we need to be
careful when applying updates to ensure no work gets "undone".

For example, consider the following sequence of events:
t1: [Direct Update] - Add item A - (Catalog Version 9)
t2: [Direct Update] - Drop item A - (Catalog Version 10)
t3: [StateStore Update] - (From Catalog Version 9)

In this case, we need to ensure that the state store update in t3 does not undo the
drop in t2, even though that update will contain the change to "add item A".

To support this, we now check catalog versions before adding any item, to ensure
that an incoming item does not overwrite an existing item with a newer catalog version.
To handle removals, a new CatalogUpdateLog is introduced. This log tracks
the catalog version at which each item was removed from the catalog. When adding a new
catalog object, we check whether that object was removed at a catalog version newer
than the version of the object being added. If so, the update is ignored.
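
To make the rule concrete, here is a minimal Python sketch of the check described
above. It is illustrative only: the names (CatalogCache, deletion_log, add, drop) are
invented for this example and do not reflect the actual Java catalog implementation.

# Illustrative sketch of the version check; names are invented for this example.
class CatalogCache(object):
  def __init__(self):
    self.objects = {}       # object key -> (catalog version, catalog object)
    self.deletion_log = {}  # object key -> catalog version it was dropped at

  def add(self, key, version, obj):
    # Ignore the add if we already hold this object at a newer (or equal) version.
    existing = self.objects.get(key)
    if existing is not None and existing[0] >= version:
      return False
    # Ignore the add if the object was dropped at a newer catalog version, so a
    # stale state store update cannot resurrect it.
    if self.deletion_log.get(key, -1) > version:
      return False
    self.objects[key] = (version, obj)
    return True

  def drop(self, key, version):
    self.objects.pop(key, None)
    self.deletion_log[key] = version

# Replaying the example sequence above:
cache = CatalogCache()
cache.add('item_A', 9, 'A')   # t1: direct update adds A at version 9
cache.drop('item_A', 10)      # t2: direct update drops A at version 10
cache.add('item_A', 9, 'A')   # t3: state store update from version 9 is ignored
assert 'item_A' not in cache.objects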

This covers most updates, but there is still one concurrency issue that is not covered
by this change. If someone issues an "invalidate metadata" concurrently with a
direct catalog operation, it may briefly set the catalog back in time. This seems like
okay behavior to me (the command is invalidating the catalog metadata). If we want
to address it, the CatalogUpdateLog could be extended to also track additions to the
catalog, and we could replay that log after invalidating the metadata (as one possible solution).
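
The commit leaves that as future work; continuing the toy model above, one hypothetical
shape of such a replay (all names are assumptions, not existing code) might be:

# Hypothetical extension of the CatalogCache sketch: also log additions so they
# can be replayed after an "invalidate metadata" resets the cache.
class ReplayableCatalogCache(CatalogCache):
  def __init__(self):
    CatalogCache.__init__(self)
    self.addition_log = []   # list of (catalog version, key, catalog object)

  def add(self, key, version, obj):
    applied = CatalogCache.add(self, key, version, obj)
    if applied:
      self.addition_log.append((version, key, obj))
    return applied

  def invalidate_metadata(self, snapshot, snapshot_version):
    # Reset to the (possibly older) snapshot, then replay any direct additions
    # that are newer than the snapshot so concurrent DDL is not lost.
    self.objects = dict(snapshot)
    for version, key, obj in self.addition_log:
      if version > snapshot_version:
        CatalogCache.add(self, key, version, obj)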

Change-Id: Icc9bdecc3c32436708bf9e9e7974f91d40e514f2
Reviewed-on: http://gerrit.ent.cloudera.com:8080/864
Tested-by: jenkins
Reviewed-by: Lenni Kuff <lskuff@cloudera.com>
2014-01-08 10:53:58 -08:00


#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Impala tests for column statistics
import logging
import os
import pytest
import shlex
from subprocess import call
from tests.common.test_result_verifier import *
from tests.common.test_vector import *
from tests.common.impala_test_suite import *

TEST_DB = 'colstats_test_db'

# End-to-end validation of Impala column stats usage.
class TestColStats(ImpalaTestSuite):
  @classmethod
  def get_workload(self):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestColStats, cls).add_test_dimensions()
    # There is no reason to run these tests using all dimensions.
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('table_format').file_format == 'text' and\
        v.get_value('table_format').compression_codec == 'none')

  def setup_method(self, method):
    self.__cleanup()

  def teardown_method(self, method):
    self.__cleanup()

  def __cleanup(self):
    self.cleanup_db(TEST_DB)

  def test_incompatible_col_stats(self, vector):
    """Tests Impala is able to use tables when the column stats data is not compatible
    with the column type. Regression test for IMPALA-588."""
    # Create a test database.
    self.client.execute("create database " + TEST_DB)
    self.client.execute("use " + TEST_DB)
    # Create a table with a string column and populate it with some data.
    self.client.execute("create table badstats(s string)")
    self.client.execute("insert into table badstats select cast(int_col as string) "\
        "from functional.alltypes limit 10")
    # Compute stats for this table, they will be for the string column type.
    self.__compute_table_stats(TEST_DB, 'badstats')
    self.client.execute("refresh badstats")
    # Change the column type to int which will cause a mismatch between the column
    # stats data and the column type metadata.
    self.client.execute("alter table badstats change s s int")
    # Should still be able to query the table
    result = self.client.execute("select s from badstats")
    assert len(result.data) == 10
    # Recompute stats with the new column type. Impala should now have stats for this
    # column and should be able to access the table.
    # TODO: Currently this just verifies Impala can query the table, it does not
    # verify the stats are there or correct. Expand the verification once Impala has a
    # mechanism to expose this metadata.
    self.__compute_table_stats(TEST_DB, 'badstats')
    self.client.execute("refresh badstats")
    result = self.client.execute("select s from badstats")
    assert len(result.data) == 10

  def __compute_table_stats(self, db_name, table_name):
    compute_stats_script =\
        os.path.join(os.environ['IMPALA_HOME'], 'tests/util/compute_table_stats.py')
    rval = call([compute_stats_script,
        '--db_names=' + db_name, '--table_names=' + table_name])
    assert rval == 0, 'Compute table stats failed on: %s.%s' % (db_name, table_name)