impala/tests/query_test/test_sort.py

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import sys
import re
import random
from copy import copy
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.test_vector import *
from tests.common.impala_test_suite import *

def transpose_results(result):
  """Turn row-oriented query output into column-oriented lists.

  'result' is a list of strings, one per row, with the fields of each row separated by
  tabs. Returns a list with one entry per column, where each entry is the list of that
  column's string values in row order. If rows have differing field counts, the output
  is truncated to the shortest row (zip() semantics)."""
  fields_per_row = [line.split('\t') for line in result]
  columns = []
  for column in zip(*fields_per_row):
    columns.append(list(column))
  return columns

class TestQueryFullSort(ImpalaTestSuite):
  """Test class to do functional validation of sorting when data is spilled to disk.

  Every test sets the 'disable_outermost_topn' and 'mem_limit' query options on a
  private copy of the vector's exec options, runs an ORDER BY ... LIMIT query against
  the tpch workload and checks that the first result column comes back sorted."""

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests run against."""
    return 'tpch'

  @classmethod
  def add_test_dimensions(cls):
    """Add the default test dimensions; restrict 'core' exploration to parquet."""
    super(TestQueryFullSort, cls).add_test_dimensions()

    if cls.exploration_strategy() == 'core':
      cls.TestMatrix.add_constraint(
          lambda v: v.get_value('table_format').file_format == 'parquet')

  def _assert_first_column_sorted(self, query, exec_option, table_format):
    """Run 'query' with the given options and assert that the first column of the
    result is in the order produced by Python's sorted()."""
    rows = self.execute_query(query, exec_option, table_format=table_format).data
    columns = transpose_results(rows)
    # Fail with a readable message instead of an IndexError on an empty result.
    assert columns, "query returned no rows: %s" % query
    assert columns[0] == sorted(columns[0])

  def test_multiple_mem_limits(self, vector):
    """Exercise the dynamic memory scaling functionality."""

    # Using lineitem table forces the multi-phase sort with low mem_limit. This test
    # takes about a minute.
    query = """select l_comment, l_partkey, l_orderkey, l_suppkey, l_commitdate
            from lineitem order by l_comment limit 100000"""
    # Work on a copy: the exec_option object belongs to the test vector, and setting
    # options on it in place would leave them set for any other user of that object.
    exec_option = copy(vector.get_value('exec_option'))
    exec_option['disable_outermost_topn'] = 1
    table_format = vector.get_value('table_format')

    # The first run should fit in memory, the 300m run is a 2-phase disk sort,
    # the 150m run is a multi-phase sort (i.e. with an intermediate merge).
    for mem_limit in ['-1', '300m', '150m']:
      exec_option['mem_limit'] = mem_limit
      self._assert_first_column_sorted(query, exec_option, table_format)

  def test_sort_join(self, vector):
    """Sort the output of a self-join of orders under a 280m memory limit.

    NOTE(review): this was originally described as a 2-phase sort at 200m; confirm
    that 280m still triggers a 2-phase sort."""

    query = """select o1.o_orderdate, o2.o_custkey, o1.o_comment from orders o1 join
    orders o2 on (o1.o_orderkey = o2.o_orderkey) order by o1.o_orderdate limit 100000"""

    # Copy so the vector's own exec_option object is left untouched.
    exec_option = copy(vector.get_value('exec_option'))
    exec_option['disable_outermost_topn'] = 1
    exec_option['mem_limit'] = "280m"
    table_format = vector.get_value('table_format')

    self._assert_first_column_sorted(query, exec_option, table_format)

  def test_sort_union(self, vector):
    """Sort the output of a union / union all over orders under a 1000m memory limit."""
    query = """select o_orderdate, o_custkey, o_comment from (select * from orders union
    select * from orders union all select * from orders) as i
    order by o_orderdate limit 100000"""

    # Copy so the vector's own exec_option object is left untouched.
    exec_option = copy(vector.get_value('exec_option'))
    exec_option['disable_outermost_topn'] = 1
    exec_option['mem_limit'] = "1000m"
    table_format = vector.get_value('table_format')

    self._assert_first_column_sorted(query, exec_option, table_format)