IMPALA-14142: Fix TestIcebergV2Table.test_compute_stats_table_sampling

TestIcebergV2Table.test_compute_stats_table_sampling was failing in
ARM release builds. However, COMPUTE STATS with TABLESAMPLE is
inherently non-deterministic due to its use of SAMPLED_NDV().

This patch completely rewrites the tests and moves them to
test_stats_extrapolation.py to test Iceberg tables similarly to
legacy tables.

The 'diff_perc' argument of the appx_equals() method was also updated
in the tests, because with the previous value (1.0) it only reported
errors for negative estimates.

Change-Id: I98b07b156aad300827c9e1b7970b8dfacfc6d251
Reviewed-on: http://gerrit.cloudera.org:8080/23044
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Zoltan Borok-Nagy
2025-06-18 15:30:32 +02:00
committed by Impala Public Jenkins
parent 0604325958
commit 1a381b6aea
3 changed files with 55 additions and 253 deletions

View File

@@ -1,234 +0,0 @@
====
---- QUERY
DROP STATS iceberg_non_partitioned;
COMPUTE STATS iceberg_non_partitioned tablesample system(10) repeatable(1234);
---- RESULTS
'Updated 1 partition(s) and 4 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_non_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
20,20,'22.90KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_non_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_non_partitioned;
---- RESULTS
'id','INT',3,0,4,4,-1,-1
'user','STRING',2,0,4,4,-1,-1
'action','STRING',2,0,5,4.333333492279053,-1,-1
'event_time','TIMESTAMP',2,0,16,16,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_non_partitioned;
COMPUTE STATS iceberg_non_partitioned tablesample system(10) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 4 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_non_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
20,20,'22.90KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_non_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_non_partitioned;
---- RESULTS
'id','INT',2,0,4,4,-1,-1
'user','STRING',2,0,4,4,-1,-1
'action','STRING',2,0,8,6.5,-1,-1
'event_time','TIMESTAMP',2,0,16,16,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_partitioned;
COMPUTE STATS iceberg_partitioned tablesample system(10) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 4 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
20,20,'22.90KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_partitioned;
---- RESULTS
'id','INT',3,0,4,4,-1,-1
'user','STRING',3,0,4,4,-1,-1
'action','STRING',3,0,8,5.666666507720947,-1,-1
'event_time','TIMESTAMP',3,0,16,16,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_delete_equality_partitioned;
COMPUTE STATS iceberg_v2_delete_equality_partitioned tablesample system(10) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 3 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_delete_equality_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
3,6,'4.81KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_delete_equality_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_delete_equality_partitioned;
---- RESULTS
'i','INT',2,0,4,4,-1,-1
's','STRING',2,0,4,4,-1,-1
'd','DATE',1,0,4,4,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_delete_equality_partitioned;
COMPUTE STATS iceberg_v2_delete_equality_partitioned tablesample system(10) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 3 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_delete_equality_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
3,6,'4.81KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_delete_equality_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_delete_equality_partitioned;
---- RESULTS
'i','INT',2,0,4,4,-1,-1
's','STRING',2,0,4,4,-1,-1
'd','DATE',1,0,4,4,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files tablesample system(30) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 2 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
1,6,'7.77KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
---- RESULTS
'i','INT',1,0,4,4,-1,-1
's','STRING',1,0,1,1,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc tablesample system(30) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 2 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
1,6,'3.97KB','NOT CACHED','NOT CACHED','ORC','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files_orc','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
---- RESULTS
'i','INT',1,0,4,4,-1,-1
's','STRING',1,0,1,1,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files tablesample system(50) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 2 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
4,6,'7.77KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
---- RESULTS
'i','INT',4,0,4,4,-1,-1
's','STRING',4,0,1,1,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc tablesample system(50) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 2 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
4,6,'3.97KB','NOT CACHED','NOT CACHED','ORC','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files_orc','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
---- RESULTS
'i','INT',4,0,4,4,-1,-1
's','STRING',4,0,1,1,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====

View File

@@ -55,7 +55,7 @@ class TestStatsExtrapolation(ImpalaTestSuite):
# Since our test tables are small, set the minimum sample size to 0 to make sure
# we exercise the sampling code paths.
self.client.execute("set compute_stats_min_sample_size=0")
self.client.set_configuration_option('compute_stats_min_sample_size', '0')
# Test partitioned table.
part_test_tbl = unique_database + ".alltypes"
@@ -120,24 +120,57 @@ class TestStatsExtrapolation(ImpalaTestSuite):
self.client.execute(
"compute stats {0} tablesample system(10)".format(wide_test_tbl))
def test_compute_stats_tablesample_iceberg(self, unique_database):
    """Runs COMPUTE STATS TABLESAMPLE on cloned Iceberg tables and checks the
    sampled stats against the stats of an identical, fully-computed clone."""
    self.client.set_configuration_option('compute_stats_min_sample_size', '0')
    table_names = [
        'iceberg_non_partitioned',
        'iceberg_partitioned',
        'iceberg_v2_delete_equality_partitioned',
        'iceberg_v2_positional_not_all_data_files_have_delete_files',
        'iceberg_v2_positional_not_all_data_files_have_delete_files_orc']
    # Sampling parameters as (percentage, seed) pairs.
    sampling_params = [(1, 3), (10, 7), (20, 13), (100, 99)]
    for table_name in table_names:
        source_tbl = 'functional_parquet.' + table_name
        base_tbl = '{0}.{1}_base'.format(unique_database, table_name)
        sample_tbl = '{0}.{1}_sample'.format(unique_database, table_name)
        # Two clones: one gets exact stats, the other gets sampled stats.
        self.clone_iceberg_table(source_tbl, base_tbl)
        self.clone_iceberg_table(source_tbl, sample_tbl)
        self.__set_extrapolation_tblprop(sample_tbl)
        self.client.execute("compute stats {0}".format(base_tbl))
        for percentage, seed in sampling_params:
            self.__run_sampling_test(
                sample_tbl, "", base_tbl, percentage, seed,
                is_precise_table_stats=True)
def clone_iceberg_table(self, src_tbl, dst_tbl):
    """Creates 'dst_tbl' with the same schema as 'src_tbl'.

    We are cloning external, non-HiveCatalog tables here.
    """
    stmt = "create table {0} like {1}".format(dst_tbl, src_tbl)
    self.client.execute(stmt)
def __set_extrapolation_tblprop(self, tbl):
    """Alters the given table to enable stats extrapolation via tblproperty."""
    stmt = ("alter table {0} set "
            "tblproperties('impala.enable.stats.extrapolation'='true')").format(tbl)
    self.client.execute(stmt)
def __run_sampling_test(self, tbl, cols, expected_tbl, perc, seed):
def __run_sampling_test(self, tbl, cols, expected_tbl, perc, seed,
is_precise_table_stats=False):
"""Drops stats on 'tbl' and then runs COMPUTE STATS TABLESAMPLE on 'tbl' with the
given column restriction clause, sampling percent and random seed. Checks that
the resulting table and column stats are reasonably close to those of
'expected_tbl'."""
'expected_tbl'. For table formats like ICEBERG the table-level stats are always
precise; this can be indicated by the parameter 'is_precise_table_stats'."""
self.client.execute("drop stats {0}".format(tbl))
self.client.execute(
"compute stats {0}{1} tablesample system ({2}) repeatable ({3})".format(
tbl, cols, perc, seed))
self.__check_table_stats(tbl, expected_tbl)
self.__check_column_stats(cols, tbl, expected_tbl)
self.__check_table_stats(tbl, expected_tbl, perc, is_precise_table_stats)
self.__check_column_stats(cols, tbl, perc, expected_tbl)
def __check_table_stats(self, tbl, expected_tbl):
def __check_table_stats(self, tbl, expected_tbl, perc, is_precise=False):
"""Checks that the row counts reported in SHOW TABLE STATS on 'tbl' are within 2x
of those reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
@@ -147,25 +180,27 @@ class TestStatsExtrapolation(ImpalaTestSuite):
assert len(actual.column_labels) == len(expected.column_labels)
col_names = actual.column_labels
rows_col_idx = col_names.index("#ROWS")
extrap_rows_col_idx = col_names.index("EXTRAP #ROWS")
extrap_rows_label = "#ROWS" if is_precise else "EXTRAP #ROWS"
extrap_rows_col_idx = col_names.index(extrap_rows_label)
for i in range(0, len(actual.data)):
act_cols = actual.data[i].split("\t")
exp_cols = expected.data[i].split("\t")
assert int(exp_cols[rows_col_idx]) >= 0
# The expected_tbl is expected to have valid extrapolated #rows for every partition.
assert int(act_cols[extrap_rows_col_idx]) >= 0
diff_perc = 0.0 if is_precise else self.get_diff_perc(perc)
self.appx_equals(
int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0)
int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), diff_perc)
# Only the table-level row count is stored. The partition row counts
# are extrapolated.
if act_cols[0] == "Total":
self.appx_equals(
int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0)
int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), diff_perc)
elif len(actual.data) > 1:
# Partition row count is expected to not be set.
assert int(act_cols[rows_col_idx]) == -1
def __check_column_stats(self, cols, tbl, expected_tbl):
def __check_column_stats(self, cols, tbl, perc, expected_tbl):
"""Checks that the NDVs in SHOW COLUMNS STATS on 'tbl' are within 2x of those
reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
@@ -184,4 +219,13 @@ class TestStatsExtrapolation(ImpalaTestSuite):
# caller drops the stats before calling COMPUTE STATS.
if cols == "" or act_cols[0] in cols:
assert int(act_cols[ndv_col_idx]) >= 0
self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]), 1.0)
self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]),
self.get_diff_perc(perc))
def get_diff_perc(self, percentage):
    """Calculates the 'diff_perc' tolerance we pass to appx_equals().

    The tolerance shrinks as the sample percentage grows: the bigger the
    sample, the more accurate the estimates, so the tighter the check.
    We scale the percentage by 0.9 (instead of using it in full) to leave
    enough margin for misestimation.
    """
    # Fix of the original comment: the tolerance is *inversely* related to
    # the sample percentage, not proportional to it.
    assert 0 <= percentage <= 100
    fraction = percentage / 100.0
    return 1.0 - fraction * 0.9

View File

@@ -1545,14 +1545,6 @@ class TestIcebergV2Table(IcebergTestSuite):
self.run_test_case('QueryTest/iceberg-v2-read-position-deletes-stats', vector)
self.run_test_case('QueryTest/iceberg-v2-read-position-deletes-orc-stats', vector)
@SkipIfDockerizedCluster.internal_hostname
@SkipIf.hardcoded_uris
@pytest.mark.execute_serially
def test_compute_stats_table_sampling(self, vector):
"""Tests COMPUTE STATS with table sampling."""
vector.get_value('exec_option')['COMPUTE_STATS_MIN_SAMPLE_SIZE'] = 0
self.run_test_case('QueryTest/iceberg-v2-compute-stats-table-sampling', vector)
@SkipIfFS.hive
def test_read_mixed_format_position_deletes(self, vector, unique_database):
self.run_test_case('QueryTest/iceberg-mixed-format-position-deletes',