mirror of
https://github.com/apache/impala.git
synced 2026-01-07 00:02:28 -05:00
This change ensures that the planner computes parquet conjuncts
only for scans containing parquet files. Additionally, it
also handles PARQUET_DICTIONARY_FILTERING and
PARQUET_READ_STATISTICS query options in the planner.
Testing was carried out independently on parquet and non-parquet
scans:
1. Parquet scans were tested via the existing parquet-filtering
planner test. Additionally, a new test
[parquet-filtering-disabled] was added to ensure that the
explain plan generated skips parquet predicates based on the
query options.
2. Non-parquet scans were tested manually to ensure that the
functions to compute parquet conjuncts were not invoked.
Additional test cases were added to the parquet-filtering
planner test to scan non parquet tables and ensure that the
plans do not contain conjuncts based on parquet statistics.
3. A parquet partition was added to the alltypesmixedformat
table in the functional database. Planner tests were added
to ensure that Parquet conjuncts are constructed only when
the Parquet partition is included in the query.
Change-Id: I9d6c26d42db090c8a15c602f6419ad6399c329e7
Reviewed-on: http://gerrit.cloudera.org:8080/10704
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
129 lines
6.0 KiB
Python
129 lines
6.0 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import pytest
|
|
import re
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.test_vector import ImpalaTestDimension
|
|
|
|
class TestRowsAvailability(ImpalaTestSuite):
  """Tests that the 'Rows available' timeline event is marked only after rows are
  truly available. We mark the 'Rows available' event once we advance the query
  status to a 'ready' state; this signals to the client that rows can be fetched.
  Long fetch times can trigger client timeouts at various levels (socket, app, etc.).
  This is a regression test against IMPALA-924."""

  # These queries are chosen to have different plan roots in the coordinator's fragment.
  # The WHERE clause is carefully crafted to control when result rows become available at
  # the coordinator. The selected partition of 'functional.alltypestiny' has exactly
  # two rows stored in a single file. In the scan node we sleep one second for each
  # result row. Therefore, result rows can become available no earlier than after 2s.
  TABLE = 'functional.alltypestiny'
  WHERE_CLAUSE = 'where month = 1 and bool_col = sleep(1000)'
  QUERIES = ['select * from %s %s' % (TABLE, WHERE_CLAUSE),
             'select * from %s %s order by id limit 1' % (TABLE, WHERE_CLAUSE),
             'select * from %s %s order by id' % (TABLE, WHERE_CLAUSE),
             'select count(*) from %s %s' % (TABLE, WHERE_CLAUSE),
             'select 1 union all select count(*) from %s %s' % (TABLE, WHERE_CLAUSE),
             'select count(*) over () from %s %s' % (TABLE, WHERE_CLAUSE)]
  # Minimum allowed gap between the 'Ready to start' and 'Rows available' events:
  # two rows at one second of sleep each (see the comment above).
  ROWS_AVAIL_LOWER_BOUND_MS = 2000

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload this suite belongs to."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Adds one 'query' dimension value per entry in QUERIES and restricts the test
    matrix to the vectors accepted by __is_valid_test_vector()."""
    super(TestRowsAvailability, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('query', *cls.QUERIES))
    cls.ImpalaTestMatrix.add_constraint(lambda v: cls.__is_valid_test_vector(v))

  @classmethod
  def __is_valid_test_vector(cls, vector):
    """Returns True only for uncompressed text tables executed with batch_size == 0,
    disable_codegen == False and num_nodes == 0."""
    table_format = vector.get_value('table_format')
    exec_option = vector.get_value('exec_option')
    # '== False' is kept on purpose: it preserves the original comparison semantics
    # for whatever value type the exec option carries.
    return table_format.file_format == 'text' and \
        table_format.compression_codec == 'none' and \
        exec_option['batch_size'] == 0 and \
        exec_option['disable_codegen'] == False and \
        exec_option['num_nodes'] == 0  # noqa: E712

  @pytest.mark.execute_serially
  def test_rows_availability(self, vector):
    """Runs the vector's query and asserts that the 'Rows available' timeline event is
    marked at least ROWS_AVAIL_LOWER_BOUND_MS after the 'Ready to start' event."""
    # This test is run serially because it requires the query to come back within
    # some amount of time. Running this with other tests makes it hard to bound
    # that time.
    query = vector.get_value('query')
    # Execute async to get a handle. Wait until the query has completed.
    handle = self.execute_query_async(query, vector.get_value('exec_option'))
    try:
      self.impalad_test_service.wait_for_query_state(self.client, handle,
          self.client.QUERY_STATES['FINISHED'], timeout=20)

      profile = self.client.get_runtime_profile(handle)
      start_time_ms = None
      rows_avail_time_ms = None
      # If an event shows up on several lines, the last occurrence wins.
      for line in profile.split("\n"):
        if "Ready to start on" in line:
          start_time_ms = self.__parse_time_ms(self.__find_time(line))
        elif "Rows available:" in line:
          rows_avail_time_ms = self.__parse_time_ms(self.__find_time(line))

      assert start_time_ms is not None, \
          "Failed to find the 'Ready to start' timeline event in the " \
          "query profile:\n%s" % profile
      assert rows_avail_time_ms is not None, \
          "Failed to find the 'Rows available' timeline event in the " \
          "query profile:\n%s" % profile
      time_diff = rows_avail_time_ms - start_time_ms
      assert time_diff >= self.ROWS_AVAIL_LOWER_BOUND_MS, \
          "The 'Rows available' timeline event was marked prematurely %sms after the " \
          "'Ready to start' event.\nExpected the event to be marked no earlier than " \
          "%sms after the 'Ready to start' event.\nQuery: %s" \
          % (time_diff, self.ROWS_AVAIL_LOWER_BOUND_MS, query)
    finally:
      # Close the query even when the wait or one of the assertions above fails, so a
      # failing test does not leave its query handle open.
      self.close_query(handle)

  @staticmethod
  def __find_time(line):
    """Find event time point in a line from the runtime profile timeline.

    Returns the text between the first ': ' and the last ' (' of 'line'. Raises an
    AssertionError if the line does not have that shape."""
    # Given line "- Rows available: 3s311ms (2s300ms)", this function returns "3s311ms"
    match = re.search(r': (.*) \(', line)
    assert match is not None, "Failed to find time in runtime profile"
    return match.group(1)

  @staticmethod
  def __parse_time_ms(duration):
    """Parses a duration string of the form 1h2m3s4.5ms6.7us8.9ns into milliseconds.

    Every component is optional, but at least one must be present and the whole string
    must be consumed. Returns an int: fractional milliseconds are truncated and the
    'us'/'ns' components are accepted but contribute nothing to the result. Raises an
    AssertionError if 'duration' cannot be parsed."""
    # The pattern is anchored at both ends so that e.g. "311ms" is read as 311
    # milliseconds: an unanchored match would stop after "311m" and treat it as minutes.
    match = re.match(r'(?:([0-9]+)h)?(?:([0-9]+)m)?(?:([0-9]+)s)?'
                     r'(?:([0-9]+(?:\.[0-9]+)?)ms)?'
                     r'(?:[0-9]+(?:\.[0-9]+)?us)?'
                     r'(?:[0-9]+(?:\.[0-9]+)?ns)?\Z',
                     duration)
    # All components are optional, so the pattern also matches the empty string;
    # reject that explicitly.
    assert match is not None and duration, \
        'Failed to parse duration string %s' % duration
    hours, minutes, seconds, milliseconds = match.groups()
    total_ms = 0
    if hours:
      total_ms += int(hours) * 60 * 60 * 1000
    if minutes:
      total_ms += int(minutes) * 60 * 1000
    if seconds:
      total_ms += int(seconds) * 1000
    if milliseconds:
      # Truncate fractional milliseconds.
      total_ms += int(float(milliseconds))
    return total_ms
|