mirror of
https://github.com/apache/impala.git
synced 2026-01-03 15:00:52 -05:00
For files that have a Cloudera copyright (and no other copyright notice), make changes to follow the ASF source file header policy here: http://www.apache.org/legal/src-headers.html#headers Specifically: 1) Remove the Cloudera copyright. 2) Modify NOTICE.txt according to http://www.apache.org/legal/src-headers.html#notice to follow that format and add a line for Cloudera. 3) Replace or add the existing ASF license text with the one given on the website. Much of this change was automatically generated via: git grep -li 'Copyright.*Cloudera' > modified_files.txt cat modified_files.txt | xargs perl -n -i -e 'print unless m#Copyright.*Cloudera#i;' cat modified_files_txt | xargs fix_apache_license.py [1] Some manual fixups were performed following those steps, especially when license text was completely missing from the file. [1] https://gist.github.com/anonymous/ff71292094362fc5c594 with minor modification to ORIG_LICENSE to match Impala's license text. Change-Id: I2e0bd8420945b953e1b806041bea4d72a3943d86 Reviewed-on: http://gerrit.cloudera.org:8080/3779 Reviewed-by: Dan Hecht <dhecht@cloudera.com> Tested-by: Internal Jenkins
157 lines
6.7 KiB
Python
157 lines
6.7 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
|
|
def transpose_results(result):
|
|
"""Given a query result (list of strings, each string represents a row), return a list
|
|
of columns, where each column is a list of strings."""
|
|
split_result = [row.split('\t') for row in result]
|
|
return [list(l) for l in zip(*split_result)]
|
|
|
|
class TestQueryFullSort(ImpalaTestSuite):
|
|
"""Test class to do functional validation of sorting when data is spilled to disk."""
|
|
|
|
@classmethod
|
|
def get_workload(self):
|
|
return 'tpch'
|
|
|
|
@classmethod
|
|
def add_test_dimensions(cls):
|
|
super(TestQueryFullSort, cls).add_test_dimensions()
|
|
|
|
if cls.exploration_strategy() == 'core':
|
|
cls.TestMatrix.add_constraint(lambda v:\
|
|
v.get_value('table_format').file_format == 'parquet')
|
|
|
|
def test_multiple_mem_limits(self, vector):
|
|
"""Exercise the dynamic memory scaling functionality."""
|
|
|
|
"""Using lineitem table forces the multi-phase sort with low mem_limit. This test
|
|
takes about a minute"""
|
|
query = """select l_comment, l_partkey, l_orderkey, l_suppkey, l_commitdate
|
|
from lineitem order by l_comment limit 100000"""
|
|
exec_option = vector.get_value('exec_option')
|
|
exec_option['disable_outermost_topn'] = 1
|
|
table_format = vector.get_value('table_format')
|
|
|
|
"""The first run should fit in memory, the 300m run is a 2-phase disk sort,
|
|
the 150m run is a multi-phase sort (i.e. with an intermediate merge)."""
|
|
for mem_limit in ['-1', '300m', '150m']:
|
|
exec_option['mem_limit'] = mem_limit
|
|
result = transpose_results(self.execute_query(
|
|
query, exec_option, table_format=table_format).data)
|
|
assert(result[0] == sorted(result[0]))
|
|
|
|
def test_multiple_mem_limits_full_output(self, vector):
|
|
""" Exercise a range of memory limits, returning the full sorted input. """
|
|
query = """select o_orderdate, o_custkey, o_comment
|
|
from orders
|
|
order by o_orderdate"""
|
|
exec_option = vector.get_value('exec_option')
|
|
table_format = vector.get_value('table_format')
|
|
|
|
max_block_mgr_memory_values = ['-1', '48M'] # Unlimited and minimum memory.
|
|
if self.exploration_strategy() == 'exhaustive' and \
|
|
table_format.file_format == 'parquet':
|
|
# Test some intermediate values for parquet on exhaustive.
|
|
max_block_mgr_memory_values += ['64M', '128M', '256M']
|
|
for max_block_mgr_memory in max_block_mgr_memory_values:
|
|
exec_option['max_block_mgr_memory'] = max_block_mgr_memory
|
|
result = transpose_results(self.execute_query(
|
|
query, exec_option, table_format=table_format).data)
|
|
assert(result[0] == sorted(result[0]))
|
|
|
|
def test_sort_join(self, vector):
|
|
"""With 200m memory limit this should be a 2-phase sort"""
|
|
|
|
query = """select o1.o_orderdate, o2.o_custkey, o1.o_comment from orders o1 join
|
|
orders o2 on (o1.o_orderkey = o2.o_orderkey) order by o1.o_orderdate limit 100000"""
|
|
|
|
exec_option = vector.get_value('exec_option')
|
|
exec_option['disable_outermost_topn'] = 1
|
|
exec_option['mem_limit'] = "1200m"
|
|
table_format = vector.get_value('table_format')
|
|
|
|
result = transpose_results(self.execute_query(
|
|
query, exec_option, table_format=table_format).data)
|
|
assert(result[0] == sorted(result[0]))
|
|
|
|
def test_sort_union(self, vector):
|
|
query = """select o_orderdate, o_custkey, o_comment from (select * from orders union
|
|
select * from orders union all select * from orders) as i
|
|
order by o_orderdate limit 100000"""
|
|
|
|
exec_option = vector.get_value('exec_option')
|
|
exec_option['disable_outermost_topn'] = 1
|
|
exec_option['mem_limit'] = "3000m"
|
|
table_format = vector.get_value('table_format')
|
|
|
|
result = transpose_results(self.execute_query(
|
|
query, exec_option, table_format=table_format).data)
|
|
assert(result[0] == sorted(result[0]))
|
|
|
|
def test_pathological_input(self, vector):
|
|
""" Regression test for stack overflow and poor performance on certain inputs where
|
|
always selecting the middle element as a quicksort pivot caused poor performance. The
|
|
trick is to concatenate two equal-size sorted inputs. If the middle element is always
|
|
selected as the pivot (the old method), the sorter tends to get stuck selecting the
|
|
minimum element as the pivot, which results in almost all of the tuples ending up
|
|
in the right partition.
|
|
"""
|
|
query = """select l_orderkey from (
|
|
select * from lineitem limit 300000
|
|
union all
|
|
select * from lineitem limit 300000) t
|
|
order by l_orderkey"""
|
|
|
|
exec_option = vector.get_value('exec_option')
|
|
exec_option['disable_outermost_topn'] = 1
|
|
# Run with a single scanner thread so that the input doesn't get reordered.
|
|
exec_option['num_nodes'] = "1"
|
|
exec_option['num_scanner_threads'] = "1"
|
|
table_format = vector.get_value('table_format')
|
|
|
|
result = transpose_results(self.execute_query(
|
|
query, exec_option, table_format=table_format).data)
|
|
numeric_results = [int(val) for val in result[0]]
|
|
assert(numeric_results == sorted(numeric_results))
|
|
|
|
def test_spill_empty_strings(self, vector):
|
|
"""Test corner case of spilling sort with only empty strings. Spilling with var len
|
|
slots typically means the sort must reorder blocks and convert pointers, but this case
|
|
has to be handled differently because there are no var len blocks to point into."""
|
|
|
|
query = """
|
|
select empty, l_orderkey, l_partkey, l_suppkey,
|
|
l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax
|
|
from (select substr(l_comment, 1000, 0) empty, * from lineitem) t
|
|
order by empty, l_orderkey, l_partkey, l_suppkey, l_linenumber
|
|
limit 100000
|
|
"""
|
|
|
|
exec_option = vector.get_value('exec_option')
|
|
exec_option['disable_outermost_topn'] = 1
|
|
exec_option['max_block_mgr_memory'] = "256m"
|
|
exec_option['num_nodes'] = "1"
|
|
table_format = vector.get_value('table_format')
|
|
|
|
result = transpose_results(self.execute_query(
|
|
query, exec_option, table_format=table_format).data)
|
|
assert(result[0] == sorted(result[0]))
|
|
|