impala/tests/custom_cluster/test_wide_table_operations.py
Joe McDonnell eb66d00f9f IMPALA-11974: Fix lazy list operators for Python 3 compatibility
Python 3 changes builtins such as range, map, and filter to be lazy:
they return iterators (or views) rather than lists. Code that expects
these calls to produce a list immediately will fail, e.g.

Python 2:
range(0,5) == [0,1,2,3,4]
True

Python 3:
range(0,5) == [0,1,2,3,4]
False

The fix is to wrap those call sites with list(), i.e.

Python 3:
list(range(0,5)) == [0,1,2,3,4]
True

Since the base builtins are now lazy, Python 3 also removes the old
lazy variants (xrange, ifilter, izip, etc.). This change uses the
future library's builtins package to port such code to the Python 3
behavior (i.e. xrange -> future's builtins.range).
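
For illustration, converted code roughly follows this pattern (the
snippet is illustrative, not taken from the patch):

from builtins import range, map  # future's Py3-style lazy builtins

ids = list(range(5))         # list() where an actual list is needed
names = list(map(str, ids))  # ['0', '1', '2', '3', '4']
total = 0
for i in range(1000000):     # plain iteration stays lazy
  total += i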

Most of the changes were done via these futurize fixes:
 - libfuturize.fixes.fix_xrange_with_import
 - lib2to3.fixes.fix_map
 - lib2to3.fixes.fix_filter
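
Roughly, fix_xrange_with_import rewrites

for i in xrange(n):
  total += i

into

from builtins import range
for i in range(n):
  total += i

while fix_map and fix_filter wrap call sites whose results are used
as lists, e.g. map(int, strs) -> list(map(int, strs)).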

This eliminates the pylint warnings:
 - xrange-builtin
 - range-builtin-not-iterating
 - map-builtin-not-iterating
 - zip-builtin-not-iterating
 - filter-builtin-not-iterating
 - reduce-builtin
 - deprecated-itertools-function
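
For example, deprecated-itertools-function fired on itertools.izip,
which Python 3 removes entirely; the ported form uses the lazy
builtin, wrapped in list() where an actual list is needed
(illustrative snippet, not from the patch):

from builtins import zip

pairs = list(zip([1, 2], ['a', 'b']))  # [(1, 'a'), (2, 'b')]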

Testing:
 - Ran core job

Change-Id: Ic7c082711f8eff451a1b5c085e97461c327edb5f
Reviewed-on: http://gerrit.cloudera.org:8080/19589
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
2023-03-09 17:17:57 +00:00


# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import, division, print_function
from builtins import range
import os
import pytest
from subprocess import call

from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.skip import SkipIf

TBL_NAME = "widetable_2000_cols_partitioned"
NUM_PARTS = 50000


@SkipIf.not_hdfs
class TestWideTableOperations(CustomClusterTestSuite):

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive since it takes more than 20 mins')
    super(TestWideTableOperations, cls).setup_class()

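  # Run the cluster JVMs with a small 2GB heap, and dump the heap on OOM so
  # catalogd memory regressions are easy to diagnose from LOG_DIR (or /tmp).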
  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      jvm_args="-Xmx2g -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath="
               + os.getenv("LOG_DIR", "/tmp"))
  def test_wide_table_operations(self, vector, unique_database):
    """Regression test for IMPALA-11812. Tests DDL/DML operations on a wide table.
    Uses a small heap size (2GB) to make sure memory consumption is optimized.
    Each FieldSchema instance takes 24 bytes in a small heap (<32GB). Without the fix,
    catalogd will hold at least 50,000 (parts) * 2,000 (cols) = 100,000,000 FieldSchema
    instances in memory for execDdl or table loading, i.e. about 2.4GB, which already
    exceeds the 2GB heap and will result in OOM failures."""
    # Create partition dirs and files locally
    tmp_dir = "/tmp/" + TBL_NAME
    os.mkdir(tmp_dir)
    for i in range(NUM_PARTS):
      part_dir = tmp_dir + "/p=" + str(i)
      data_file = part_dir + "/data.txt"
      os.mkdir(part_dir)
      with open(data_file, 'w') as local_file:
        local_file.write("true")
    # Upload files to HDFS
    hdfs_dir = self._get_table_location("functional." + TBL_NAME, vector)
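    # Remove any leftover table directory so the subsequent "-put" creates it fresh.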
call(["hdfs", "dfs", "-rm", "-r", "-skipTrash", hdfs_dir])
# Use 1 replica to save space, 8 threads to speed up
call(["hdfs", "dfs", "-Ddfs.replication=1", "-put", "-t", "8", tmp_dir, hdfs_dir])
# Create a new table so we don't need to drop partitions at the end.
# It will be dropped when 'unique_database' is dropped.
    create_tbl_ddl = \
        "create external table {db}.{tbl} like functional.{tbl} " \
        "location '{location}'".format(
            db=unique_database, tbl=TBL_NAME, location=hdfs_dir)
    self.execute_query_expect_success(self.client, create_tbl_ddl)
    # Recover partitions first. This takes 10 mins for 50k partitions.
    recover_stmt = "alter table {db}.{tbl} recover partitions"
    # Invalidate the table to test initial metadata loading
    invalidate_stmt = "invalidate metadata {db}.{tbl}"
    # Test initial table loading and get all partitions
    show_parts_stmt = "show partitions {db}.{tbl}"
    try:
      self.execute_query_expect_success(
          self.client, recover_stmt.format(db=unique_database, tbl=TBL_NAME))
      self.execute_query_expect_success(
          self.client, invalidate_stmt.format(db=unique_database, tbl=TBL_NAME))
      res = self.execute_query_expect_success(
          self.client, show_parts_stmt.format(db=unique_database, tbl=TBL_NAME))
      # Last line is 'Total'
      assert len(res.data) == NUM_PARTS + 1
    finally:
      call(["rm", "-rf", tmp_dir])
      call(["hdfs", "dfs", "-rm", "-r", "-skipTrash", hdfs_dir])