IMPALA-14142: Fix TestIcebergV2Table.test_compute_stats_table_sampling

TestIcebergV2Table.test_compute_stats_table_sampling was failing in
ARM release builds. However, COMPUTE STATS with TABLESAMPLE is
inherently non-deterministic due to its use of SAMPLED_NDV().

This patch completely rewrites the tests and moves them to
test_stats_extrapolation.py to test Iceberg tables similarly to
legacy tables.

The 'diff_perc' argument of the appx_equals() method was also updated
in the tests, because with the previous value (1.0) it only reported
errors for negative estimates.

Change-Id: I98b07b156aad300827c9e1b7970b8dfacfc6d251
Reviewed-on: http://gerrit.cloudera.org:8080/23044
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Zoltan Borok-Nagy
2025-06-18 15:30:32 +02:00
committed by Impala Public Jenkins
parent 0604325958
commit 1a381b6aea
3 changed files with 55 additions and 253 deletions

View File

@@ -1,234 +0,0 @@
====
---- QUERY
DROP STATS iceberg_non_partitioned;
COMPUTE STATS iceberg_non_partitioned tablesample system(10) repeatable(1234);
---- RESULTS
'Updated 1 partition(s) and 4 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_non_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
20,20,'22.90KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_non_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_non_partitioned;
---- RESULTS
'id','INT',3,0,4,4,-1,-1
'user','STRING',2,0,4,4,-1,-1
'action','STRING',2,0,5,4.333333492279053,-1,-1
'event_time','TIMESTAMP',2,0,16,16,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_non_partitioned;
COMPUTE STATS iceberg_non_partitioned tablesample system(10) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 4 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_non_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
20,20,'22.90KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_non_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_non_partitioned;
---- RESULTS
'id','INT',2,0,4,4,-1,-1
'user','STRING',2,0,4,4,-1,-1
'action','STRING',2,0,8,6.5,-1,-1
'event_time','TIMESTAMP',2,0,16,16,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_partitioned;
COMPUTE STATS iceberg_partitioned tablesample system(10) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 4 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
20,20,'22.90KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/iceberg_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_partitioned;
---- RESULTS
'id','INT',3,0,4,4,-1,-1
'user','STRING',3,0,4,4,-1,-1
'action','STRING',3,0,8,5.666666507720947,-1,-1
'event_time','TIMESTAMP',3,0,16,16,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_delete_equality_partitioned;
COMPUTE STATS iceberg_v2_delete_equality_partitioned tablesample system(10) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 3 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_delete_equality_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
3,6,'4.81KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_delete_equality_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_delete_equality_partitioned;
---- RESULTS
'i','INT',2,0,4,4,-1,-1
's','STRING',2,0,4,4,-1,-1
'd','DATE',1,0,4,4,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_delete_equality_partitioned;
COMPUTE STATS iceberg_v2_delete_equality_partitioned tablesample system(10) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 3 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_delete_equality_partitioned;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
3,6,'4.81KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_delete_equality_partitioned','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_delete_equality_partitioned;
---- RESULTS
'i','INT',2,0,4,4,-1,-1
's','STRING',2,0,4,4,-1,-1
'd','DATE',1,0,4,4,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files tablesample system(30) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 2 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
1,6,'7.77KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
---- RESULTS
'i','INT',1,0,4,4,-1,-1
's','STRING',1,0,1,1,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc tablesample system(30) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 2 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
1,6,'3.97KB','NOT CACHED','NOT CACHED','ORC','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files_orc','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
---- RESULTS
'i','INT',1,0,4,4,-1,-1
's','STRING',1,0,1,1,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files tablesample system(50) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 2 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
4,6,'7.77KB','NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files;
---- RESULTS
'i','INT',4,0,4,4,-1,-1
's','STRING',4,0,1,1,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====
---- QUERY
DROP STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
COMPUTE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc tablesample system(50) repeatable(1111);
---- RESULTS
'Updated 1 partition(s) and 2 column(s).'
---- TYPES
STRING
====
---- QUERY
SHOW TABLE STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
---- LABELS
#ROWS, #Files, Size, Bytes Cached, Cache Replication, Format, Incremental stats, Location, EC Policy
---- RESULTS: VERIFY_IS_EQUAL
4,6,'3.97KB','NOT CACHED','NOT CACHED','ORC','false','$NAMENODE/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_v2_positional_not_all_data_files_have_delete_files_orc','$ERASURECODE_POLICY'
---- TYPES
BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
SHOW COLUMN STATS iceberg_v2_positional_not_all_data_files_have_delete_files_orc;
---- RESULTS
'i','INT',4,0,4,4,-1,-1
's','STRING',4,0,1,1,-1,-1
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE, BIGINT, BIGINT
====

View File

@@ -55,7 +55,7 @@ class TestStatsExtrapolation(ImpalaTestSuite):
# Since our test tables are small, set the minimum sample size to 0 to make sure
# we exercise the sampling code paths.
self.client.execute("set compute_stats_min_sample_size=0")
self.client.set_configuration_option('compute_stats_min_sample_size', '0')
# Test partitioned table.
part_test_tbl = unique_database + ".alltypes"
@@ -120,24 +120,57 @@ class TestStatsExtrapolation(ImpalaTestSuite):
self.client.execute(
"compute stats {0} tablesample system(10)".format(wide_test_tbl))
def test_compute_stats_tablesample_iceberg(self, unique_database):
    """Runs COMPUTE STATS TABLESAMPLE on cloned Iceberg tables and checks the
    sampled stats against the stats of an identical, fully-computed clone."""
    self.client.set_configuration_option('compute_stats_min_sample_size', '0')
    table_names = [
        'iceberg_non_partitioned',
        'iceberg_partitioned',
        'iceberg_v2_delete_equality_partitioned',
        'iceberg_v2_positional_not_all_data_files_have_delete_files',
        'iceberg_v2_positional_not_all_data_files_have_delete_files_orc']
    # Sampling parameters as (percentage, seed) pairs.
    sampling_params = [(1, 3), (10, 7), (20, 13), (100, 99)]
    for table_name in table_names:
        source_tbl = 'functional_parquet.' + table_name
        base_tbl = '{0}.{1}_base'.format(unique_database, table_name)
        sample_tbl = '{0}.{1}_sample'.format(unique_database, table_name)
        # Two clones: one gets exact stats, the other gets sampled stats.
        self.clone_iceberg_table(source_tbl, base_tbl)
        self.clone_iceberg_table(source_tbl, sample_tbl)
        self.__set_extrapolation_tblprop(sample_tbl)
        self.client.execute("compute stats {0}".format(base_tbl))
        for percentage, seed in sampling_params:
            self.__run_sampling_test(
                sample_tbl, "", base_tbl, percentage, seed,
                is_precise_table_stats=True)
def clone_iceberg_table(self, src_tbl, dst_tbl):
    """Creates 'dst_tbl' with the same schema as 'src_tbl'.

    We are cloning external, non-HiveCatalog tables here.
    """
    stmt = "create table {0} like {1}".format(dst_tbl, src_tbl)
    self.client.execute(stmt)
def __set_extrapolation_tblprop(self, tbl):
    """Alters the given table to enable stats extrapolation via tblproperty."""
    stmt = ("alter table {0} set "
            "tblproperties('impala.enable.stats.extrapolation'='true')").format(tbl)
    self.client.execute(stmt)
def __run_sampling_test(self, tbl, cols, expected_tbl, perc, seed):
def __run_sampling_test(self, tbl, cols, expected_tbl, perc, seed,
is_precise_table_stats=False):
"""Drops stats on 'tbl' and then runs COMPUTE STATS TABLESAMPLE on 'tbl' with the
given column restriction clause, sampling percent and random seed. Checks that
the resulting table and column stats are reasonably close to those of
'expected_tbl'."""
'expected_tbl'. For table formats like ICEBERG the table-level stats are always
precise; this can be indicated by the parameter 'is_precise_table_stats'."""
self.client.execute("drop stats {0}".format(tbl))
self.client.execute(
"compute stats {0}{1} tablesample system ({2}) repeatable ({3})".format(
tbl, cols, perc, seed))
self.__check_table_stats(tbl, expected_tbl)
self.__check_column_stats(cols, tbl, expected_tbl)
self.__check_table_stats(tbl, expected_tbl, perc, is_precise_table_stats)
self.__check_column_stats(cols, tbl, perc, expected_tbl)
def __check_table_stats(self, tbl, expected_tbl):
def __check_table_stats(self, tbl, expected_tbl, perc, is_precise=False):
"""Checks that the row counts reported in SHOW TABLE STATS on 'tbl' are within 2x
of those reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
@@ -147,25 +180,27 @@ class TestStatsExtrapolation(ImpalaTestSuite):
assert len(actual.column_labels) == len(expected.column_labels)
col_names = actual.column_labels
rows_col_idx = col_names.index("#ROWS")
extrap_rows_col_idx = col_names.index("EXTRAP #ROWS")
extrap_rows_label = "#ROWS" if is_precise else "EXTRAP #ROWS"
extrap_rows_col_idx = col_names.index(extrap_rows_label)
for i in range(0, len(actual.data)):
act_cols = actual.data[i].split("\t")
exp_cols = expected.data[i].split("\t")
assert int(exp_cols[rows_col_idx]) >= 0
# The expected_tbl is expected to have valid extrapolated #rows for every partition.
assert int(act_cols[extrap_rows_col_idx]) >= 0
diff_perc = 0.0 if is_precise else self.get_diff_perc(perc)
self.appx_equals(
int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0)
int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), diff_perc)
# Only the table-level row count is stored. The partition row counts
# are extrapolated.
if act_cols[0] == "Total":
self.appx_equals(
int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0)
int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), diff_perc)
elif len(actual.data) > 1:
# Partition row count is expected to not be set.
assert int(act_cols[rows_col_idx]) == -1
def __check_column_stats(self, cols, tbl, expected_tbl):
def __check_column_stats(self, cols, tbl, perc, expected_tbl):
"""Checks that the NDVs in SHOW COLUMNS STATS on 'tbl' are within 2x of those
reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
@@ -184,4 +219,13 @@ class TestStatsExtrapolation(ImpalaTestSuite):
# caller drops the stats before calling COMPUTE STATS.
if cols == "" or act_cols[0] in cols:
assert int(act_cols[ndv_col_idx]) >= 0
self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]), 1.0)
self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]),
self.get_diff_perc(perc))
def get_diff_perc(self, percentage):
    """Calculates the 'diff_perc' tolerance we pass to appx_equals().

    The tolerance shrinks as the sample percentage grows: the bigger the
    sample, the more accurate the estimates, so the tighter the check.
    We scale the percentage by 0.9 (instead of using it in full) to leave
    enough margin for misestimation.
    """
    # Fix of the original comment: the tolerance is *inversely* related to
    # the sample percentage, not proportional to it.
    assert 0 <= percentage <= 100
    fraction = percentage / 100.0
    return 1.0 - fraction * 0.9

View File

@@ -1545,14 +1545,6 @@ class TestIcebergV2Table(IcebergTestSuite):
self.run_test_case('QueryTest/iceberg-v2-read-position-deletes-stats', vector)
self.run_test_case('QueryTest/iceberg-v2-read-position-deletes-orc-stats', vector)
@SkipIfDockerizedCluster.internal_hostname
@SkipIf.hardcoded_uris
@pytest.mark.execute_serially
def test_compute_stats_table_sampling(self, vector):
"""Tests COMPUTE STATS with table sampling."""
vector.get_value('exec_option')['COMPUTE_STATS_MIN_SAMPLE_SIZE'] = 0
self.run_test_case('QueryTest/iceberg-v2-compute-stats-table-sampling', vector)
@SkipIfFS.hive
def test_read_mixed_format_position_deletes(self, vector, unique_database):
self.run_test_case('QueryTest/iceberg-mixed-format-position-deletes',