mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
IMPALA-8215, IMPALA-8458. Fix setting stats without setting NDVs in local-catalog mode
This removes an old workaround in which columns with numDVs=-1 were skipped when sending stats from catalogd to impalad. This workaround was trying to fix some weird behavior with boolean stats but broke other things, as noted in the JIRA. Additionally, it was used as a way to avoid sending stats on HDFS clustering columns (which don't have computed stats). This change changes the workaround to be a bit more careful: we explicitly ignore HDFS clustering columns, and for booleans, we use the numNulls instead of numDVs to determine whether we have enough information to infer numDVs. NOTE: HBase table keys are also considered "clustering columns", but those _do_ need computed stats, because HBase is range-partitioned, not value-partitioned. While I was in this section of the code, I stumbled upon IMPALA-8215, a bug in which the 'numTrues' of boolean columns was getting set to '1' for no apparent reason. I fixed that while I was in the area. Change-Id: Ic0b95de22954c7ad6715143fc42a1506289c095f Reviewed-on: http://gerrit.cloudera.org:8080/13382 Tested-by: Todd Lipcon <todd@apache.org> Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
This commit is contained in:
@@ -186,7 +186,14 @@ public class ColumnStats {
|
||||
if (isCompatible) {
|
||||
BooleanColumnStatsData boolStats = statsData.getBooleanStats();
|
||||
numNulls_ = boolStats.getNumNulls();
|
||||
numDistinctValues_ = (numNulls_ > 0) ? 3 : 2;
|
||||
// If we have numNulls, we can infer NDV from that.
|
||||
if (numNulls_ > 0) {
|
||||
numDistinctValues_ = 3;
|
||||
} else if (numNulls_ == 0) {
|
||||
numDistinctValues_ = 2;
|
||||
} else {
|
||||
numDistinctValues_ = -1;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case TINYINT:
|
||||
@@ -274,7 +281,10 @@ public class ColumnStats {
|
||||
long numNulls = colStats.getNum_nulls();
|
||||
switch(colType.getPrimitiveType()) {
|
||||
case BOOLEAN:
|
||||
colStatsData.setBooleanStats(new BooleanColumnStatsData(1, -1, numNulls));
|
||||
// TODO(IMPALA-8205): actually compute the count of true/false
|
||||
// values.
|
||||
colStatsData.setBooleanStats(new BooleanColumnStatsData(
|
||||
/*numTrues=*/-1, /*numFalse=*/-1, numNulls));
|
||||
break;
|
||||
case TINYINT:
|
||||
ndv = Math.min(ndv, LongMath.pow(2, Byte.SIZE));
|
||||
|
||||
@@ -486,10 +486,14 @@ public abstract class Table extends CatalogObjectImpl implements FeTable {
|
||||
for (String colName: selector.want_stats_for_column_names) {
|
||||
Column col = getColumn(colName);
|
||||
if (col == null) continue;
|
||||
// Ugly hack: if the catalogd has never gotten any stats from HMS, numDVs will
|
||||
// be -1, and we'll have to send no stats to the impalad.
|
||||
// TODO(todd): this breaks test_ddl.test_alter_set_column_stats.
|
||||
if (!col.getStats().hasNumDistinctValues()) continue;
|
||||
|
||||
// Don't return stats for HDFS partitioning columns, since these are computed
|
||||
// by the coordinator based on the partition map itself. This makes the
|
||||
// behavior consistent with the HMS stats-fetching APIs.
|
||||
//
|
||||
// NOTE: we _do_ have to return stats for HBase clustering columns, because
|
||||
// those aren't simple value partitions.
|
||||
if (this instanceof FeFsTable && isClusteringColumn(col)) continue;
|
||||
|
||||
ColumnStatisticsData tstats = col.getStats().toHmsCompatibleThrift(col.getType());
|
||||
if (tstats == null) continue;
|
||||
|
||||
@@ -103,6 +103,9 @@ public interface MetaProvider {
|
||||
|
||||
/**
|
||||
* Load statistics for the given columns from the given table.
|
||||
*
|
||||
* NOTE: Stats should not be returned for the partition columns of FS-backed
|
||||
* tables, since it's assumed that these will be computed by the coordinator.
|
||||
*/
|
||||
List<ColumnStatisticsObj> loadTableColumnStatistics(TableMetaRef table,
|
||||
List<String> colNames) throws TException;
|
||||
|
||||
@@ -226,13 +226,6 @@ class SkipIfCatalogV2:
|
||||
IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
|
||||
reason="IMPALA-8486: LibCache isn't invalidated by function DDL.")
|
||||
|
||||
# TODO: IMPALA-8458: fix bug or update tests to reflect expected behaviour.
|
||||
@classmethod
|
||||
def alter_column_stats_broken(self):
|
||||
return pytest.mark.skipif(
|
||||
IMPALA_TEST_CLUSTER_PROPERTIES.is_catalog_v2_cluster(),
|
||||
reason="IMPALA-8458: setting column stats without setting NDV is no-op.")
|
||||
|
||||
# TODO: IMPALA-7131: add support or update tests to reflect expected behaviour.
|
||||
@classmethod
|
||||
def data_sources_unsupported(self):
|
||||
|
||||
@@ -430,7 +430,6 @@ class TestDdlStatements(TestDdlBase):
|
||||
self.run_test_case('QueryTest/alter-table-hdfs-caching', vector,
|
||||
use_db=unique_database, multiple_impalad=self._use_multiple_impalad(vector))
|
||||
|
||||
@SkipIfCatalogV2.alter_column_stats_broken()
|
||||
@UniqueDatabase.parametrize(sync_ddl=True)
|
||||
def test_alter_set_column_stats(self, vector, unique_database):
|
||||
self.run_test_case('QueryTest/alter-table-set-column-stats', vector,
|
||||
|
||||
Reference in New Issue
Block a user