Files
impala/tests/query_test/test_sort.py
Nong Li 7dc57aaa9e Change buffered block mgr to support multiple clients.
This patch does a few things:
1. Moves the buffered block mgr from the sorter to the runtime state. There is now
   a single instance shared across the query fragment. The partitioned hash join and
   agg will use it as well.
2. Adds a Client interface to the block mgr. Each exec node is a different client
   and can reserve a minimum number of buffers. This avoids starvation.
3. Updates the BufferedBlockMgr interface for getting pinned blocks, collapsing
   two existing APIs into one.

Change-Id: Ibb31fbe480f3726048457f26e24a9e33f7201d86
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3504
Reviewed-by: Nong Li <nong@cloudera.com>
Tested-by: Nong Li <nong@cloudera.com>
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3574
2014-07-22 12:45:37 -07:00

93 lines
3.6 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import sys
import re
import random
from copy import copy
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
def transpose_results(result):
  """Turn a row-oriented query result into a list of columns.

  'result' is a list of strings, one per row, with the column values of each row
  separated by tab characters. The return value is a list of columns, where each
  column is a list of strings holding that column's value from every row, in row
  order.

  An empty 'result' yields an empty list. If rows have differing numbers of fields,
  zip() stops at the shortest row, so the extra trailing fields are dropped."""
  cells = [row.split('\t') for row in result]
  # zip(*cells) regroups the cells by column position; each tuple it produces is
  # converted to a list so that callers get plain lists back.
  return [list(column) for column in zip(*cells)]
class TestQueryFullSort(ImpalaTestSuite):
  """Test class to do functional validation of sorting when data is spilled to disk."""

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests run against."""
    return 'tpch'

  @classmethod
  def add_test_dimensions(cls):
    """Add the inherited test dimensions and, for the 'core' exploration strategy,
    constrain the test matrix to the parquet file format."""
    super(TestQueryFullSort, cls).add_test_dimensions()
    if cls.exploration_strategy() == 'core':
      cls.TestMatrix.add_constraint(
          lambda v: v.get_value('table_format').file_format == 'parquet')

  def _check_sorted_on_first_column(self, vector, query, mem_limits):
    """Run 'query' once for every entry of 'mem_limits' and assert each time that the
    first column of the result is in ascending order.

    'vector' supplies the 'exec_option' and 'table_format' values. 'mem_limits' is an
    iterable of values for the 'mem_limit' query option."""
    # Work on a copy so that the option changes made here do not leak into the dict
    # owned by the test vector. A shallow copy is enough because only top-level keys
    # are assigned.
    exec_option = copy(vector.get_value('exec_option'))
    # NOTE(review): presumably this stops the ORDER BY ... LIMIT from being planned as
    # a top-n, so that the full sort is exercised -- confirm against the option docs.
    exec_option['disable_outermost_topn'] = 1
    table_format = vector.get_value('table_format')
    for mem_limit in mem_limits:
      exec_option['mem_limit'] = mem_limit
      rows = self.execute_query(query, exec_option, table_format=table_format).data
      columns = transpose_results(rows)
      # Fail with a readable message instead of an IndexError on columns[0].
      assert columns, "Query returned no rows with mem_limit=%s" % mem_limit
      assert columns[0] == sorted(columns[0]), \
          "First result column is not sorted with mem_limit=%s" % mem_limit

  def test_multiple_mem_limits(self, vector):
    """Exercise the dynamic memory scaling functionality.

    Using lineitem table forces the multi-phase sort with low mem_limit. This test
    takes about a minute."""
    query = """select l_comment, l_partkey, l_orderkey, l_suppkey, l_commitdate
      from lineitem order by l_comment limit 100000"""
    # The first run should fit in memory, the 300m run is a 2-phase disk sort,
    # the 150m run is a multi-phase sort (i.e. with an intermediate merge).
    self._check_sorted_on_first_column(vector, query, ['-1', '300m', '150m'])

  def test_sort_join(self, vector):
    """Sort the output of a self-join of orders under a 280m memory limit.

    NOTE(review): the earlier docstring read "With 200m memory limit this should be
    a 2-phase sort" while the code sets 280m -- confirm that the 2-phase expectation
    still holds at 280m."""
    query = """select o1.o_orderdate, o2.o_custkey, o1.o_comment from orders o1 join
      orders o2 on (o1.o_orderkey = o2.o_orderkey) order by o1.o_orderdate limit 100000"""
    self._check_sorted_on_first_column(vector, query, ["280m"])

  def test_sort_union(self, vector):
    """Sort the output of a union / union all over orders under a 1000m memory
    limit."""
    query = """select o_orderdate, o_custkey, o_comment from (select * from orders union
      select * from orders union all select * from orders) as i
      order by o_orderdate limit 100000"""
    self._check_sorted_on_first_column(vector, query, ["1000m"])