mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
This change adds support for reading NDV statistics from Puffin files
when they are available for the current snapshot. Puffin files or blobs
that were written for snapshots other than the current one are ignored.
Because this behaviour is different from what we have for HMS stats and
may therefore be unintuitive for users, reading Puffin stats is disabled
by default; set the "--disable_reading_puffin_stats" startup flag to
false to enable it.
When Puffin stats reading is enabled, the NDV values read from Puffin
files take precedence over NDV values stored in the HMS. This is because
we only read Puffin stats for the current snapshot, so these values are
always up-to-date, while the values in the HMS may be stale.
Note that it is currently not possible to drop Puffin stats from Impala.
For this reason, this patch also introduces two ways of disabling the
reading of Puffin stats:
- globally, with the aforementioned "--disable_reading_puffin_stats"
startup flag: when it is set to true, Impala will never read Puffin
stats
- for specific tables, by setting the
"impala.iceberg_disable_reading_puffin_stats" table property to
true.
Note that this change is only about reading Puffin files; Impala does
not yet support writing them.
Testing:
- created the PuffinDataGenerator tool which can generate Puffin files
and metadata.json files for different scenarios (e.g. all stats are
in the same Puffin file; stats for different columns are in different
Puffin files; some Puffin files are corrupt etc.). The generated
files are under the "testdata/ice_puffin/generated" directory.
- The new custom cluster test class
'test_iceberg_with_puffin.py::TestIcebergTableWithPuffinStats' uses
the generated data to test various scenarios.
- Added custom cluster tests that test the
'disable_reading_puffin_stats' startup flag.
Change-Id: I50c1228988960a686d08a9b2942e01e366678866
Reviewed-on: http://gerrit.cloudera.org:8080/21605
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
78 lines
3.2 KiB
Python
78 lines
3.2 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from __future__ import absolute_import, division, print_function
|
|
|
|
import re
|
|
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.test_dimensions import (create_uncompressed_text_dimension,
|
|
create_exec_option_dimension)
|
|
|
|
|
|
class TestExtDataSources(ImpalaTestSuite):
  """Impala query tests for external data sources."""

  @classmethod
  def get_workload(cls):
    """Return the name of the workload this suite runs against."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Register the test-matrix dimensions used by this suite.

    Delegates to the parent class first, then adds an exec-option dimension
    built with exec_single_node_option values 0 and 100, and the
    uncompressed-text dimension for this suite's workload.
    """
    super(TestExtDataSources, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(
        create_exec_option_dimension(exec_single_node_option=[0, 100]))
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))

  def _get_tbl_properties(self, table_name):
    """Extracts the table properties mapping from the output of DESCRIBE FORMATTED"""
    # _get_properties is not defined in this class; presumably it is inherited
    # from ImpalaTestSuite -- confirm against the base class.
    return self._get_properties('Table Parameters:', table_name)

  def test_verify_jdbc_table_properties(self, vector):
    """Check the table properties of the external JDBC table, and that
    dbcp.password is masked in both DESCRIBE FORMATTED and SHOW CREATE TABLE.
    """
    # NOTE(review): 'vector' is not used in the body; presumably the test
    # framework's parametrization requires it -- confirm before removing.
    jdbc_tbl_name = "functional.alltypes_jdbc_datasource"
    properties = self._get_tbl_properties(jdbc_tbl_name)
    # Verify table properties specific for external JDBC table
    assert properties['__IMPALA_DATA_SOURCE_NAME'] == 'impalajdbcdatasource'
    assert properties['database.type'] == 'POSTGRES'
    assert properties['jdbc.driver'] == 'org.postgresql.Driver'
    assert properties['dbcp.username'] == 'hiveuser'
    assert properties['table'] == 'alltypes'
    # Verify dbcp.password is masked in the output of DESCRIBE FORMATTED command
    assert properties['dbcp.password'] == '******'

    # Verify dbcp.password is masked in the output of SHOW CREATE TABLE command
    result = self.client.execute("SHOW CREATE TABLE {0}".format(jdbc_tbl_name))
    # At least one output row must contain the masked password property; the
    # full output is attached to the assertion to ease debugging on failure.
    assert any("'dbcp.password'='******'" in row for row in result.data), result.data

  def test_data_source_tables(self, vector, unique_database, unique_name):
    """Run the 'QueryTest/data-source-tables' test file in the unique database,
    substituting $UNIQUE_DATASOURCE with the unique name.
    """
    self.run_test_case('QueryTest/data-source-tables', vector, use_db=unique_database,
        test_file_vars={'$UNIQUE_DATASOURCE': unique_name})

  def test_jdbc_data_source(self, vector, unique_database):
    """Run the 'QueryTest/jdbc-data-source' test file in the unique database."""
    self.run_test_case('QueryTest/jdbc-data-source', vector, use_db=unique_database)

  def test_jdbc_data_source_with_keystore(self, vector, unique_database):
    """Run the 'QueryTest/jdbc-data-source-with-keystore' test file in the
    unique database.
    """
    # Impala query tests for external data sources with keystore.
    self.run_test_case('QueryTest/jdbc-data-source-with-keystore', vector,
        use_db=unique_database)
|