impala/tests/custom_cluster/test_wide_table_operations.py
Csaba Ringhofer f98b697c7b IMPALA-13929: Make 'functional-query' the default workload in tests
This change adds get_workload() to ImpalaTestSuite and removes it
from all test suites that already returned 'functional-query'.
get_workload() is also removed from CustomClusterTestSuite, which
used to return 'tpch'.

All other changes besides impala_test_suite.py and
custom_cluster_test_suite.py are just mass removals of
get_workload() functions.
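
As an illustration only (class names simplified, not the exact diff), the
default now lives once in the base class, and suites that need a non-default
workload keep their own override:

  class BaseTestSuite(object):
    # Stand-in for the real base class; only here to keep the sketch self-contained.
    pass

  class ImpalaTestSuite(BaseTestSuite):
    @classmethod
    def get_workload(cls):
      # New default inherited by every suite, including CustomClusterTestSuite.
      return 'functional-query'

  class SomeTpchSuite(ImpalaTestSuite):
    # Hypothetical example of a suite that still overrides the default.
    @classmethod
    def get_workload(cls):
      return 'tpch'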

Behavior only changes for custom cluster tests that didn't override
get_workload(). By returning 'functional-query' instead of 'tpch',
exploration_strategy() will no longer return 'core' in 'exhaustive'
test runs. See IMPALA-3947 for why the workload affects
exploration_strategy(). An example of an affected test is
TestCatalogHMSFailures, which was skipped in both core and exhaustive
runs before this change.
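
As a rough sketch of that interaction (hypothetical names, not the actual
impala_test_suite.py code), exhaustive runs typically request 'exhaustive'
only for specific workloads, so a suite reporting any other workload falls
back to the default strategy:

  def resolve_exploration_strategy(workload, default_strategy, per_workload_strategy):
    # per_workload_strategy: e.g. {'functional-query': 'exhaustive'} on an
    # exhaustive run. A suite whose workload is 'tpch' is not in the map, so it
    # gets the default 'core' and its exhaustive-only tests stay skipped.
    return per_workload_strategy.get(workload, default_strategy)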

get_workload() functions that return a workload other than
'functional-query' are left unchanged; it is possible that some of
these also don't handle exploration_strategy() as expected, but
individually checking those tests is out of scope for this patch.

Change-Id: I9ec6c41ffb3a30e1ea2de773626d1485c69fe115
Reviewed-on: http://gerrit.cloudera.org:8080/22726
Reviewed-by: Riza Suminto <riza.suminto@cloudera.com>
Reviewed-by: Daniel Becker <daniel.becker@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-04-08 07:12:55 +00:00


# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import absolute_import, division, print_function
from builtins import range
import os
import pytest
from subprocess import call

from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.skip import SkipIf

TBL_NAME = "widetable_2000_cols_partitioned"
NUM_PARTS = 50000


@SkipIf.not_hdfs
class TestWideTableOperations(CustomClusterTestSuite):

  @classmethod
  def setup_class(cls):
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive since it takes more than 20 mins')
    super(TestWideTableOperations, cls).setup_class()

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
    jvm_args="-Xmx2g -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath="
             + os.getenv("LOG_DIR", "/tmp"))
  def test_wide_table_operations(self, vector, unique_database):
    """Regression test for IMPALA-11812. Tests DDL/DML operations on a wide table.
    Uses a small heap size (2GB) to make sure memory consumption is optimized.
    Each FieldSchema instance takes 24 bytes in a small heap (<32GB). Without the fix,
    catalogd will hold at least 50,000 (parts) * 2,000 (cols) = 100,000,000 FieldSchema
    instances in memory for execDdl or table loading, which already takes more than 2GB
    and will result in OOM failures."""
    # Create partition dirs and files locally
    tmp_dir = "/tmp/" + TBL_NAME
    os.mkdir(tmp_dir)
    for i in range(NUM_PARTS):
      part_dir = tmp_dir + "/p=" + str(i)
      data_file = part_dir + "/data.txt"
      os.mkdir(part_dir)
      with open(data_file, 'w') as local_file:
        local_file.write("true")

    # Upload files to HDFS
    hdfs_dir = self._get_table_location("functional." + TBL_NAME, vector)
    call(["hdfs", "dfs", "-rm", "-r", "-skipTrash", hdfs_dir])
    # Use 1 replica to save space, 8 threads to speed up
    call(["hdfs", "dfs", "-Ddfs.replication=1", "-put", "-t", "8", tmp_dir, hdfs_dir])

    # Create a new table so we don't need to drop partitions at the end.
    # It will be dropped when 'unique_database' is dropped.
    create_tbl_ddl = \
        "create external table {db}.{tbl} like functional.{tbl} " \
        "location '{location}'".format(
            db=unique_database, tbl=TBL_NAME, location=hdfs_dir)
    self.execute_query_expect_success(self.client, create_tbl_ddl)

    # Recover partitions first. This takes 10 mins for 50k partitions.
    recover_stmt = "alter table {db}.{tbl} recover partitions"
    # Invalidate the table to test initial metadata loading
    invalidate_stmt = "invalidate metadata {db}.{tbl}"
    # Test initial table loading and get all partitions
    show_parts_stmt = "show partitions {db}.{tbl}"
    try:
      self.execute_query_expect_success(
          self.client, recover_stmt.format(db=unique_database, tbl=TBL_NAME))
      self.execute_query_expect_success(
          self.client, invalidate_stmt.format(db=unique_database, tbl=TBL_NAME))
      res = self.execute_query_expect_success(
          self.client, show_parts_stmt.format(db=unique_database, tbl=TBL_NAME))
      # Last line is 'Total'
      assert len(res.data) == NUM_PARTS + 1
    finally:
      call(["rm", "-rf", tmp_dir])
      call(["hdfs", "dfs", "-rm", "-r", "-skipTrash", hdfs_dir])