Addendum: IMPALA-12584: Enable strict data file access by default

This change sets the default value to 'true' for
'iceberg_restrict_data_file_location' and changes the flag name to
'iceberg_allow_datafiles_in_table_location_only'. Tests related to
multiple storage locations in Iceberg tables are moved out to custom
cluster tests. During test data loading, the flag is set to 'false'
to make the creation of 'iceberg_multiple_storage_locations' table
possible.

Change-Id: Ifec84c86132a8a44d7e161006dcf51be2e7c7e57
Reviewed-on: http://gerrit.cloudera.org:8080/20874
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Peter Rozsa
2024-01-09 16:37:22 +01:00
committed by Impala Public Jenkins
parent 3af3b2c8ab
commit 7e5bb385e1
9 changed files with 32 additions and 24 deletions

View File

@@ -421,8 +421,9 @@ DEFINE_int32(iceberg_reload_new_files_threshold, 100, "(Advanced) If during a ta
"reload all file metadata. If number of new files are less or equal to this, "
"catalogd will only load the metadata of the newly added files.");
DEFINE_bool(iceberg_restrict_data_file_location, false, "If true, Impala does not allow "
"Iceberg data file locations outside of the table directory during reads");
DEFINE_bool(iceberg_allow_datafiles_in_table_location_only, true, "If true, Impala "
"does not allow Iceberg data file locations outside of the table directory during "
"reads");
// Host and port of Statestore Service
DEFINE_string(state_store_host, "localhost",

View File

@@ -111,7 +111,7 @@ DECLARE_string(java_weigher);
DECLARE_int32(iceberg_reload_new_files_threshold);
DECLARE_bool(enable_skipping_older_events);
DECLARE_bool(enable_json_scanner);
DECLARE_bool(iceberg_restrict_data_file_location);
DECLARE_bool(iceberg_allow_datafiles_in_table_location_only);
DECLARE_int32(catalog_operation_log_size);
DECLARE_string(hostname);
DECLARE_bool(allow_catalog_cache_op_from_masked_users);
@@ -443,8 +443,8 @@ Status PopulateThriftBackendGflags(TBackendGflags& cfg) {
cfg.__set_iceberg_reload_new_files_threshold(FLAGS_iceberg_reload_new_files_threshold);
cfg.__set_enable_skipping_older_events(FLAGS_enable_skipping_older_events);
cfg.__set_enable_json_scanner(FLAGS_enable_json_scanner);
cfg.__set_iceberg_restrict_data_file_location(
FLAGS_iceberg_restrict_data_file_location);
cfg.__set_iceberg_allow_datafiles_in_table_location_only(
FLAGS_iceberg_allow_datafiles_in_table_location_only);
cfg.__set_max_filter_error_rate_from_full_scan(
FLAGS_max_filter_error_rate_from_full_scan);
cfg.__set_catalog_operation_log_size(FLAGS_catalog_operation_log_size);

View File

@@ -275,5 +275,5 @@ struct TBackendGflags {
122: required bool allow_catalog_cache_op_from_masked_users
123: required bool iceberg_restrict_data_file_location
123: required bool iceberg_allow_datafiles_in_table_location_only
}

View File

@@ -1027,7 +1027,7 @@ public interface FeIcebergTable extends FeFsTable {
Table icebergApiTable = icebergTable.getIcebergApiTable();
Preconditions.checkNotNull(icebergApiTable);
Map<String, String> properties = icebergApiTable.properties();
if (BackendConfig.INSTANCE.icebergRestrictDataFileLocation()) {
if (BackendConfig.INSTANCE.icebergAllowDatafileInTableLocationOnly()) {
return true;
}
return !(PropertyUtil.propertyAsBoolean(properties,

View File

@@ -429,8 +429,12 @@ public class BackendConfig {
return backendCfg_.iceberg_reload_new_files_threshold;
}
public boolean icebergRestrictDataFileLocation() {
return backendCfg_.iceberg_restrict_data_file_location;
public boolean icebergAllowDatafileInTableLocationOnly() {
return backendCfg_.iceberg_allow_datafiles_in_table_location_only;
}
public void setIcebergAllowDatafileInTableLocationOnly(boolean flag) {
backendCfg_.iceberg_allow_datafiles_in_table_location_only = flag;
}
public boolean isJsonScannerEnabled() {

View File

@@ -34,6 +34,7 @@ import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.impala.catalog.HdfsPartition.FileDescriptor;
import org.apache.impala.catalog.iceberg.GroupedContentFiles;
import org.apache.impala.compat.MetastoreShim;
import org.apache.impala.service.BackendConfig;
import org.apache.impala.testutil.CatalogServiceTestCatalog;
import org.apache.impala.thrift.TNetworkAddress;
import org.apache.impala.util.IcebergUtil;
@@ -291,6 +292,7 @@ public class FileMetadataLoaderTest {
@Test
public void testIcebergMultipleStorageLocations() throws IOException, CatalogException {
CatalogServiceCatalog catalog = CatalogServiceTestCatalog.create();
BackendConfig.INSTANCE.setIcebergAllowDatafileInTableLocationOnly(false);
IcebergFileMetadataLoader fml1 = getLoaderForIcebergTable(catalog,
"functional_parquet", "iceberg_multiple_storage_locations",
/* oldFds = */ Collections.emptyList(),

View File

@@ -165,6 +165,9 @@ function start-impala {
: ${START_CLUSTER_ARGS=""}
# Use a fast statestore update so that DDL operations run faster.
START_CLUSTER_ARGS_INT="--state_store_args=--statestore_update_frequency_ms=50"
# Disable strict datafile location checks for Iceberg tables
DATAFILE_LOCATION_CHECK="-iceberg_allow_datafiles_in_table_location_only=false"
START_CLUSTER_ARGS_INT+=("--catalogd_args=$DATAFILE_LOCATION_CHECK")
if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then
START_CLUSTER_ARGS_INT+=("--impalad_args=--abort_on_config_error=false -s 1")
else

View File

@@ -20,33 +20,35 @@ import pytest
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
"functional_parquet.iceberg_multiple_storage_locations"
EXCEPTION = "IcebergTableLoadingException: " \
"Error loading metadata for Iceberg table"
class TestIcebergStrictDataFileLocation(CustomClusterTestSuite):
"""Tests for checking the behaviour of startup flag
'iceberg_restrict_data_file_location'."""
'iceberg_allow_datafiles_in_table_location_only'."""
SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
"functional_parquet.iceberg_multiple_storage_locations"
EXCEPTION = "IcebergTableLoadingException: " \
"Error loading metadata for Iceberg table"
@classmethod
def get_workload(self):
return 'functional-query'
@CustomClusterTestSuite.with_args(
catalogd_args='--iceberg_restrict_data_file_location=true')
catalogd_args='--iceberg_allow_datafiles_in_table_location_only=true')
@pytest.mark.execute_serially
def test_restricted_location(self, vector):
"""If the flag is enabled, tables with multiple storage locations will fail
to load their datafiles."""
result = self.execute_query_expect_failure(self.client, SELECT_STATEMENT)
assert EXCEPTION in str(result)
result = self.execute_query_expect_failure(self.client, self.SELECT_STATEMENT)
assert self.EXCEPTION in str(result)
@CustomClusterTestSuite.with_args(
catalogd_args='--iceberg_restrict_data_file_location=false')
catalogd_args='--iceberg_allow_datafiles_in_table_location_only=false')
@pytest.mark.execute_serially
def test_disabled(self, vector):
"""If the flag is disabled, and tables with multiple storage locations
are configured properly, the tables load successfully."""
result = self.execute_query_expect_success(self.client, SELECT_STATEMENT)
result = self.execute_query_expect_success(self.client, self.SELECT_STATEMENT)
assert '9' in result.data
self.run_test_case('QueryTest/iceberg-multiple-storage-locations-table', vector)

View File

@@ -1092,10 +1092,6 @@ class TestIcebergTable(IcebergTestSuite):
args = ['-q', 'DROP TABLE {0}.{1}'.format(db_name, tbl_name)]
results = run_impala_shell_cmd(vector, args)
def test_multiple_storage_locations(self, vector, unique_database):
self.run_test_case('QueryTest/iceberg-multiple-storage-locations-table',
vector, unique_database)
def test_mixed_file_format(self, vector, unique_database):
self.run_test_case('QueryTest/iceberg-mixed-file-format', vector,
unique_database)