mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
Addendum: IMPALA-12584: Enable strict data file access by default
This change sets the default value to 'true' for 'iceberg_restrict_data_file_location' and changes the flag name to 'iceberg_allow_datafiles_in_table_location_only'. Tests related to multiple storage locations in Iceberg tables are moved out to custom cluster tests. During test data loading, the flag is set to 'false' to make the creation of 'iceberg_multiple_storage_locations' table possible. Change-Id: Ifec84c86132a8a44d7e161006dcf51be2e7c7e57 Reviewed-on: http://gerrit.cloudera.org:8080/20874 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
3af3b2c8ab
commit
7e5bb385e1
@@ -421,8 +421,9 @@ DEFINE_int32(iceberg_reload_new_files_threshold, 100, "(Advanced) If during a ta
|
||||
"reload all file metadata. If number of new files are less or equal to this, "
|
||||
"catalogd will only load the metadata of the newly added files.");
|
||||
|
||||
DEFINE_bool(iceberg_restrict_data_file_location, false, "If true, Impala does not allow "
|
||||
"Iceberg data file locations outside of the table directory during reads");
|
||||
DEFINE_bool(iceberg_allow_datafiles_in_table_location_only, true, "If true, Impala "
|
||||
"does not allow Iceberg data file locations outside of the table directory during "
|
||||
"reads");
|
||||
|
||||
// Host and port of Statestore Service
|
||||
DEFINE_string(state_store_host, "localhost",
|
||||
|
||||
@@ -111,7 +111,7 @@ DECLARE_string(java_weigher);
|
||||
DECLARE_int32(iceberg_reload_new_files_threshold);
|
||||
DECLARE_bool(enable_skipping_older_events);
|
||||
DECLARE_bool(enable_json_scanner);
|
||||
DECLARE_bool(iceberg_restrict_data_file_location);
|
||||
DECLARE_bool(iceberg_allow_datafiles_in_table_location_only);
|
||||
DECLARE_int32(catalog_operation_log_size);
|
||||
DECLARE_string(hostname);
|
||||
DECLARE_bool(allow_catalog_cache_op_from_masked_users);
|
||||
@@ -443,8 +443,8 @@ Status PopulateThriftBackendGflags(TBackendGflags& cfg) {
|
||||
cfg.__set_iceberg_reload_new_files_threshold(FLAGS_iceberg_reload_new_files_threshold);
|
||||
cfg.__set_enable_skipping_older_events(FLAGS_enable_skipping_older_events);
|
||||
cfg.__set_enable_json_scanner(FLAGS_enable_json_scanner);
|
||||
cfg.__set_iceberg_restrict_data_file_location(
|
||||
FLAGS_iceberg_restrict_data_file_location);
|
||||
cfg.__set_iceberg_allow_datafiles_in_table_location_only(
|
||||
FLAGS_iceberg_allow_datafiles_in_table_location_only);
|
||||
cfg.__set_max_filter_error_rate_from_full_scan(
|
||||
FLAGS_max_filter_error_rate_from_full_scan);
|
||||
cfg.__set_catalog_operation_log_size(FLAGS_catalog_operation_log_size);
|
||||
|
||||
@@ -275,5 +275,5 @@ struct TBackendGflags {
|
||||
|
||||
122: required bool allow_catalog_cache_op_from_masked_users
|
||||
|
||||
123: required bool iceberg_restrict_data_file_location
|
||||
123: required bool iceberg_allow_datafiles_in_table_location_only
|
||||
}
|
||||
|
||||
@@ -1027,7 +1027,7 @@ public interface FeIcebergTable extends FeFsTable {
|
||||
Table icebergApiTable = icebergTable.getIcebergApiTable();
|
||||
Preconditions.checkNotNull(icebergApiTable);
|
||||
Map<String, String> properties = icebergApiTable.properties();
|
||||
if (BackendConfig.INSTANCE.icebergRestrictDataFileLocation()) {
|
||||
if (BackendConfig.INSTANCE.icebergAllowDatafileInTableLocationOnly()) {
|
||||
return true;
|
||||
}
|
||||
return !(PropertyUtil.propertyAsBoolean(properties,
|
||||
|
||||
@@ -429,8 +429,12 @@ public class BackendConfig {
|
||||
return backendCfg_.iceberg_reload_new_files_threshold;
|
||||
}
|
||||
|
||||
public boolean icebergRestrictDataFileLocation() {
|
||||
return backendCfg_.iceberg_restrict_data_file_location;
|
||||
public boolean icebergAllowDatafileInTableLocationOnly() {
|
||||
return backendCfg_.iceberg_allow_datafiles_in_table_location_only;
|
||||
}
|
||||
|
||||
public void setIcebergAllowDatafileInTableLocationOnly(boolean flag) {
|
||||
backendCfg_.iceberg_allow_datafiles_in_table_location_only = flag;
|
||||
}
|
||||
|
||||
public boolean isJsonScannerEnabled() {
|
||||
|
||||
@@ -34,6 +34,7 @@ import org.apache.hadoop.hive.metastore.api.MetaException;
|
||||
import org.apache.impala.catalog.HdfsPartition.FileDescriptor;
|
||||
import org.apache.impala.catalog.iceberg.GroupedContentFiles;
|
||||
import org.apache.impala.compat.MetastoreShim;
|
||||
import org.apache.impala.service.BackendConfig;
|
||||
import org.apache.impala.testutil.CatalogServiceTestCatalog;
|
||||
import org.apache.impala.thrift.TNetworkAddress;
|
||||
import org.apache.impala.util.IcebergUtil;
|
||||
@@ -291,6 +292,7 @@ public class FileMetadataLoaderTest {
|
||||
@Test
|
||||
public void testIcebergMultipleStorageLocations() throws IOException, CatalogException {
|
||||
CatalogServiceCatalog catalog = CatalogServiceTestCatalog.create();
|
||||
BackendConfig.INSTANCE.setIcebergAllowDatafileInTableLocationOnly(false);
|
||||
IcebergFileMetadataLoader fml1 = getLoaderForIcebergTable(catalog,
|
||||
"functional_parquet", "iceberg_multiple_storage_locations",
|
||||
/* oldFds = */ Collections.emptyList(),
|
||||
|
||||
3
testdata/bin/create-load-data.sh
vendored
3
testdata/bin/create-load-data.sh
vendored
@@ -165,6 +165,9 @@ function start-impala {
|
||||
: ${START_CLUSTER_ARGS=""}
|
||||
# Use a fast statestore update so that DDL operations run faster.
|
||||
START_CLUSTER_ARGS_INT="--state_store_args=--statestore_update_frequency_ms=50"
|
||||
# Disable strict datafile location checks for Iceberg tables
|
||||
DATAFILE_LOCATION_CHECK="-iceberg_allow_datafiles_in_table_location_only=false"
|
||||
START_CLUSTER_ARGS_INT+=("--catalogd_args=$DATAFILE_LOCATION_CHECK")
|
||||
if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then
|
||||
START_CLUSTER_ARGS_INT+=("--impalad_args=--abort_on_config_error=false -s 1")
|
||||
else
|
||||
|
||||
@@ -20,33 +20,35 @@ import pytest
|
||||
|
||||
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
|
||||
|
||||
SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
|
||||
"functional_parquet.iceberg_multiple_storage_locations"
|
||||
EXCEPTION = "IcebergTableLoadingException: " \
|
||||
"Error loading metadata for Iceberg table"
|
||||
|
||||
|
||||
class TestIcebergStrictDataFileLocation(CustomClusterTestSuite):
|
||||
"""Tests for checking the behaviour of startup flag
|
||||
'iceberg_restrict_data_file_location'."""
|
||||
'iceberg_allow_datafiles_in_table_location_only'."""
|
||||
|
||||
SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
|
||||
"functional_parquet.iceberg_multiple_storage_locations"
|
||||
EXCEPTION = "IcebergTableLoadingException: " \
|
||||
"Error loading metadata for Iceberg table"
|
||||
|
||||
@classmethod
|
||||
def get_workload(self):
|
||||
return 'functional-query'
|
||||
|
||||
@CustomClusterTestSuite.with_args(
|
||||
catalogd_args='--iceberg_restrict_data_file_location=true')
|
||||
catalogd_args='--iceberg_allow_datafiles_in_table_location_only=true')
|
||||
@pytest.mark.execute_serially
|
||||
def test_restricted_location(self, vector):
|
||||
"""If the flag is enabled, tables with multiple storage locations will fail
|
||||
to load their datafiles."""
|
||||
result = self.execute_query_expect_failure(self.client, SELECT_STATEMENT)
|
||||
assert EXCEPTION in str(result)
|
||||
result = self.execute_query_expect_failure(self.client, self.SELECT_STATEMENT)
|
||||
assert self.EXCEPTION in str(result)
|
||||
|
||||
@CustomClusterTestSuite.with_args(
|
||||
catalogd_args='--iceberg_restrict_data_file_location=false')
|
||||
catalogd_args='--iceberg_allow_datafiles_in_table_location_only=false')
|
||||
@pytest.mark.execute_serially
|
||||
def test_disabled(self, vector):
|
||||
"""If the flag is disabled, and tables with multiple storage locations
|
||||
are configured properly, the tables load successfully."""
|
||||
result = self.execute_query_expect_success(self.client, SELECT_STATEMENT)
|
||||
result = self.execute_query_expect_success(self.client, self.SELECT_STATEMENT)
|
||||
assert '9' in result.data
|
||||
self.run_test_case('QueryTest/iceberg-multiple-storage-locations-table', vector)
|
||||
|
||||
@@ -1092,10 +1092,6 @@ class TestIcebergTable(IcebergTestSuite):
|
||||
args = ['-q', 'DROP TABLE {0}.{1}'.format(db_name, tbl_name)]
|
||||
results = run_impala_shell_cmd(vector, args)
|
||||
|
||||
def test_multiple_storage_locations(self, vector, unique_database):
|
||||
self.run_test_case('QueryTest/iceberg-multiple-storage-locations-table',
|
||||
vector, unique_database)
|
||||
|
||||
def test_mixed_file_format(self, vector, unique_database):
|
||||
self.run_test_case('QueryTest/iceberg-mixed-file-format', vector,
|
||||
unique_database)
|
||||
|
||||
Reference in New Issue
Block a user