IMPALA-11976: Fix use of deprecated functions/fields removed in Python 3

Python 3 moved several things around or removed deprecated functions / fields:
 - sys.maxint was removed, but sys.maxsize provides similar functionality
 - long was removed, but int provides the same range
 - file() was removed, but open() already provided the same functionality
 - Exception.message was removed, but str(exception) is equivalent
 - Some encodings (like hex) were moved to codecs.encode()
 - string.letters -> string.ascii_letters
 - string.lowercase -> string.ascii_lowercase
 - string.strip was removed

This fixes all of those locations. Python 3 also has slightly different rounding behavior from round(), so this changes round() to use future's builtins.round() to get the Python 3 behavior.

This fixes the following pylint warnings:
 - file-builtin
 - long-builtin
 - invalid-str-codec
 - round-builtin
 - deprecated-string-function
 - sys-max-int
 - exception-message-attribute

Testing:
 - Ran core tests

Change-Id: I094cd7fd06b0d417fc875add401d18c90d7a792f
Reviewed-on: http://gerrit.cloudera.org:8080/19591
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
2025-12-19 18:12:08 -05:00 · 2023-03-04 10:21:59 -08:00
parent c233634d74
commit aa4050b4d9
27 changed files with 64 additions and 53 deletions
--- a/bin/banned_py3k_warnings.txt
+++ b/bin/banned_py3k_warnings.txt
@@ -10,3 +10,10 @@ deprecated-itertools-function
 dict-iter-method
 dict-keys-not-iterating
 dict-values-not-iterating
+file-builtin
+long-builtin
+invalid-str-codec
+round-builtin
+deprecated-string-function
+sys-max-int
+exception-message-attribute
--- a/bin/parse-thrift-profile.py
+++ b/bin/parse-thrift-profile.py
@@ -46,7 +46,7 @@ import sys
 if len(sys.argv) == 1 or sys.argv[1] == "-":
  input_data = sys.stdin
 elif len(sys.argv) == 2:
-  input_data = file(sys.argv[1])
+  input_data = open(sys.argv[1])
 else:
  print("Usage: %s [file]" % (sys.argv[0],), file=sys.stderr)
  sys.exit(1)
--- a/docker/setup_build_context.py
+++ b/docker/setup_build_context.py
@@ -194,7 +194,7 @@ if args.utility_context:
 else:
  # Impala Coordinator dependencies.
  num_jars_on_classpath = 0
-  dep_classpath = file(os.path.join(IMPALA_HOME, "fe/target/build-classpath.txt")).read()
+  dep_classpath = open(os.path.join(IMPALA_HOME, "fe/target/build-classpath.txt")).read()
  for jar in dep_classpath.split(":"):
    num_jars_on_classpath += 1
    assert os.path.exists(jar), "missing jar from classpath: {0}".format(jar)
@@ -216,7 +216,7 @@ else:
  assert num_frontend_jars == 1

  # Impala Executor dependencies.
-  dep_classpath = file(os.path.join(IMPALA_HOME,
+  dep_classpath = open(os.path.join(IMPALA_HOME,
      "java/executor-deps/target/build-executor-deps-classpath.txt")).read()
  for jar in dep_classpath.split(":"):
    assert os.path.exists(jar), "missing jar from classpath: {0}".format(jar)
--- a/lib/python/impala_py_lib/gdb/impala-gdb.py
+++ b/lib/python/impala_py_lib/gdb/impala-gdb.py
@@ -49,8 +49,8 @@ def get_fragment_instances():
                # No valid thread_debug_info
                if not tdi:
                    break
-                hi = long(tdi['instance_id_']['hi'])
-                lo = long(tdi['instance_id_']['lo'])
+                hi = int(tdi['instance_id_']['hi'])
+                lo = int(tdi['instance_id_']['lo'])
                fi = "%lx:%lx" % (hi, lo)
                if fi != "0:0":
                    fragment_instances[fi.strip('"')].append(thread.num)
--- a/testdata/bin/wait-for-hiveserver2.py
+++ b/testdata/bin/wait-for-hiveserver2.py
@@ -75,10 +75,10 @@ while time.time() - now < TIMEOUT_SECONDS:
      print("HiveServer2 service is up at %s." % options.hs2_hostport)
      exit(0)
  except Exception as e:
-    if "SASL" in e.message:  # Bail out on SASL failures
+    if "SASL" in str(e):  # Bail out on SASL failures
      print("SASL failure when attempting connection:")
      raise
-    if "GSS" in e.message:   # Other GSSAPI failures
+    if "GSS" in str(e):   # Other GSSAPI failures
      print("GSS failure when attempting connection:")
      raise
    print("Waiting for HiveServer2 at %s..." % options.hs2_hostport)
--- a/testdata/bin/wait-for-metastore.py
+++ b/testdata/bin/wait-for-metastore.py
@@ -60,10 +60,10 @@ while time.time() - now < TIMEOUT_SECONDS:
      print("Metastore service is up at %s." % options.metastore_hostport)
      exit(0)
  except Exception as e:
-    if "SASL" in e.message:  # Bail out on SASL failures
+    if "SASL" in str(e):  # Bail out on SASL failures
      print("SASL failure when attempting connection:")
      raise
-    if "GSS" in e.message:   # Other GSSAPI failures
+    if "GSS" in str(e):   # Other GSSAPI failures
      print("GSS failure when attempting connection:")
      raise
    print("Waiting for the Metastore at %s..." % options.metastore_hostport)
--- a/tests/common/impala_connection.py
+++ b/tests/common/impala_connection.py
@@ -21,6 +21,7 @@

 from __future__ import absolute_import, division, print_function
 import abc
+import codecs
 import logging
 import re

@@ -386,8 +387,8 @@ class ImpylaHS2Connection(ImpalaConnection):
    """Return the string representation of the query id."""
    guid_bytes = \
        operation_handle.get_handle()._last_operation.handle.operationId.guid
-    return "{0}:{1}".format(guid_bytes[7::-1].encode('hex_codec'),
-                            guid_bytes[16:7:-1].encode('hex_codec'))
+    return "{0}:{1}".format(codecs.encode(guid_bytes[7::-1], 'hex_codec'),
+                            codecs.encode(guid_bytes[16:7:-1], 'hex_codec'))

  def get_state(self, operation_handle):
    LOG.info("-- getting state for operation: {0}".format(operation_handle))
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -18,7 +18,7 @@
 # The base class that should be used for almost all Impala tests

 from __future__ import absolute_import, division, print_function
-from builtins import range
+from builtins import range, round
 import glob
 import grp
 import json
@@ -1041,7 +1041,7 @@ class ImpalaTestSuite(BaseTestSuite):
        # is specified; explicitly make sure there's nothing to
        # read to avoid hanging, especially when running interactively
        # with py.test.
-        stdin=file("/dev/null"),
+        stdin=open("/dev/null"),
        env=env)
    (stdout, stderr) = call.communicate()
    call.wait()
--- a/tests/common/kudu_test_suite.py
+++ b/tests/common/kudu_test_suite.py
@@ -111,7 +111,7 @@ class KuduTestSuite(ImpalaTestSuite):

  @classmethod
  def random_table_name(cls):
-    return "".join(choice(string.lowercase) for _ in range(10))
+    return "".join(choice(string.ascii_lowercase) for _ in range(10))

  @classmethod
  def to_kudu_table_name(cls, db_name, tbl_name):
--- a/tests/comparison/cluster.py
+++ b/tests/comparison/cluster.py
@@ -21,7 +21,7 @@
 # module depends on db_connection which use some query generator classes.

 from __future__ import absolute_import, division, print_function
-from builtins import range, zip
+from builtins import int, range, zip
 import hdfs
 import logging
 import os
@@ -37,7 +37,7 @@ from getpass import getuser
 from multiprocessing.pool import ThreadPool
 from random import choice
 from StringIO import StringIO
-from sys import maxint
+from sys import maxsize
 from tempfile import mkdtemp
 from threading import Lock
 from time import mktime, strptime
@@ -629,7 +629,7 @@ class Impala(Service):
      impalads = self.impalads
    promise = self._thread_pool.map_async(func, impalads)
    # Python doesn't handle ctrl-c well unless a timeout is provided.
-    results = promise.get(maxint)
+    results = promise.get(maxsize)
    if as_dict:
      results = dict(zip(impalads, results))
    return results
@@ -874,7 +874,7 @@ class MiniClusterImpalad(Impalad):
      return int(pid)

  def find_process_mem_mb_limit(self):
-    return long(self.get_metric("mem-tracker.process.limit")["value"]) // 1024 ** 2
+    return int(self.get_metric("mem-tracker.process.limit")["value"]) // 1024 ** 2

  def find_core_dump_dir(self):
    raise NotImplementedError()
--- a/tests/comparison/discrepancy_searcher.py
+++ b/tests/comparison/discrepancy_searcher.py
@@ -25,7 +25,7 @@

 # TODO: IMPALA-4600: refactor this module
 from __future__ import absolute_import, division, print_function
-from builtins import range, zip
+from builtins import range, round, zip
 from copy import deepcopy
 from decimal import Decimal
 from logging import getLogger
--- a/tests/custom_cluster/test_admission_controller.py
+++ b/tests/custom_cluster/test_admission_controller.py
@@ -18,7 +18,7 @@
 # Tests admission control

 from __future__ import absolute_import, division, print_function
-from builtins import range
+from builtins import int, range, round
 import itertools
 import logging
 import os
@@ -504,7 +504,7 @@ class TestAdmissionController(TestAdmissionControllerBase, HS2TestSuite):
      self.execute_query_expect_success(self.client, query, exec_options)

      # A bit too much memory to run on coordinator.
-      exec_options['mem_limit'] = long(self.PROC_MEM_TEST_LIMIT * 1.1)
+      exec_options['mem_limit'] = int(self.PROC_MEM_TEST_LIMIT * 1.1)
      ex = self.execute_query_expect_failure(self.client, query, exec_options)
      assert ("Rejected query from pool default-pool: request memory needed "
              "1.10 GB is greater than memory available for admission 1.00 GB" in
@@ -2218,7 +2218,7 @@ class TestAdmissionControllerStress(TestAdmissionControllerBase):
    # should be fine. This exercises the code that does the per-pool memory
    # accounting (see MemTracker::GetPoolMemReserved()) without actually being throttled.
    self.run_admission_test(vector, {'request_pool': self.pool_name,
-      'mem_limit': sys.maxint})
+      'mem_limit': sys.maxsize})

  @pytest.mark.execute_serially
  @SkipIfOS.redhat6
--- a/tests/custom_cluster/test_frontend_connection_limit.py
+++ b/tests/custom_cluster/test_frontend_connection_limit.py
@@ -47,7 +47,7 @@ class TestFrontendConnectionLimit(CustomClusterTestSuite):
      client.execute(query)
    except Exception as e:
      client.close()
-      raise ImpalaBeeswaxException(e.message)
+      raise ImpalaBeeswaxException(str(e))
    client.close()

  @pytest.mark.execute_serially
--- a/tests/custom_cluster/test_hs2_fault_injection.py
+++ b/tests/custom_cluster/test_hs2_fault_injection.py
@@ -16,6 +16,7 @@
 # under the License.

 from __future__ import absolute_import, division, print_function
+from builtins import round
 import pytest
 import requests

--- a/tests/custom_cluster/test_parquet_max_page_header.py
+++ b/tests/custom_cluster/test_parquet_max_page_header.py
@@ -91,9 +91,9 @@ class TestParquetMaxPageHeader(CustomClusterTestSuite):
    """Creates a file in HDFS containing two MAX_STRING_LENGTH lines."""
    file_name = os.path.join(dir, file)
    # Create two 10MB long strings.
-    random_text1 = "".join([random.choice(string.letters)
+    random_text1 = "".join([random.choice(string.ascii_letters)
        for i in range(self.MAX_STRING_LENGTH)])
-    random_text2 = "".join([random.choice(string.letters)
+    random_text2 = "".join([random.choice(string.ascii_letters)
        for i in range(self.MAX_STRING_LENGTH)])
    put = subprocess.Popen(["hdfs", "dfs", "-put", "-d", "-f", "-", file_name],
        stdin=subprocess.PIPE, bufsize=-1)
--- a/tests/custom_cluster/test_restart_services.py
+++ b/tests/custom_cluster/test_restart_services.py
@@ -65,8 +65,8 @@ class TestRestart(CustomClusterTestSuite):
        cursor.execute("describe database functional")
        return
      except HiveServer2Error as e:
-        assert "AnalysisException: Database does not exist: functional" in e.message,\
-               "Unexpected exception: " + e.message
+        assert "AnalysisException: Database does not exist: functional" in str(e),\
+               "Unexpected exception: " + str(e)
        sleep(1)
    assert False, "Coordinator never received non-empty metadata from the restarted " \
           "statestore after {0} seconds".format(wait_time_s)
--- a/tests/metadata/test_hms_integration.py
+++ b/tests/metadata/test_hms_integration.py
@@ -236,7 +236,7 @@ class TestHmsIntegration(ImpalaTestSuite):
    dictionary that holds the parsed attributes."""
    result = {}
    output_lines = output.split('\n')
-    stat_names = list(map(string.strip, output_lines[0].split(',')))
+    stat_names = [s.strip() for s in output_lines[0].split(',')]
    stat_values = output_lines[3].split(',')
    assert len(stat_names) == len(stat_values)
    for i in range(0, len(stat_names)):
@@ -248,7 +248,7 @@ class TestHmsIntegration(ImpalaTestSuite):
    dictionary that holds the parsed attributes."""
    result = {}
    for line in output.split('\n'):
-      line_elements = list(map(string.strip, line.split(',')))
+      line_elements = [s.strip() for s in line.split(',')]
      if len(line_elements) >= 2:
        result[line_elements[0]] = line_elements[1]
    return result
--- a/tests/metadata/test_last_ddl_time_update.py
+++ b/tests/metadata/test_last_ddl_time_update.py
@@ -17,6 +17,7 @@

 # Impala tests for DDL statements
 from __future__ import absolute_import, division, print_function
+from builtins import int
 import time

 from tests.common.impala_test_suite import ImpalaTestSuite
@@ -110,15 +111,15 @@ class TestLastDdlTimeUpdate(ImpalaTestSuite):

      if expect_changed_ddl_time:
        # check that the new ddlTime is strictly greater than the old one.
-        assert long(afterDdlTime) > long(beforeDdlTime)
+        assert int(afterDdlTime) > int(beforeDdlTime)
      else:
-        assert long(afterDdlTime) == long(beforeDdlTime)
+        assert int(afterDdlTime) == int(beforeDdlTime)

      if expect_changed_stats_time:
        # check that the new statsTime is strictly greater than the old one.
-        assert long(afterStatsTime) > long(beforeStatsTime)
+        assert int(afterStatsTime) > int(beforeStatsTime)
      else:
-        assert long(afterStatsTime) == long(beforeStatsTime)
+        assert int(afterStatsTime) == int(beforeStatsTime)

    def _update_name(self, new_tbl_name):
      """"
--- a/tests/performance/query_exec_functions.py
+++ b/tests/performance/query_exec_functions.py
@@ -22,7 +22,7 @@ import re
 from datetime import datetime
 from impala.dbapi import connect
 from tests.beeswax.impala_beeswax import ImpalaBeeswaxClient, ImpalaBeeswaxResult
-from sys import maxint
+from sys import maxsize
 from tests.performance.query import HiveQueryResult, ImpalaQueryResult
 from tests.util.shell_util import exec_process
 from time import time
@@ -44,7 +44,7 @@ def get_hs2_hive_cursor(hiveserver, user=None, use_kerberos=False,
        user=user,
        database=database,
        auth_mechanism="GSSAPI" if use_kerberos else "PLAIN",
-        timeout=maxint)
+        timeout=maxsize)

    cursor = conn.cursor(configuration=execOptions)
    LOG.info("Connected to {0}:{1}".format(host, port))
--- a/tests/query_test/test_insert_parquet.py
+++ b/tests/query_test/test_insert_parquet.py
@@ -18,7 +18,7 @@
 # Targeted Impala insert tests

 from __future__ import absolute_import, division, print_function
-from builtins import map, range
+from builtins import map, range, round
 import os

 from collections import namedtuple
--- a/tests/query_test/test_query_mem_limit.py
+++ b/tests/query_test/test_query_mem_limit.py
@@ -46,9 +46,9 @@ class TestQueryMemLimit(ImpalaTestSuite):
  # dynamically, even if it is a rough approximation.
  # A mem_limit is expressed in bytes, with values <= 0 signifying no cap.
  # These values are either really small, unlimited, or have a really large cap.
-  MAXINT_BYTES = str(sys.maxint)
-  MAXINT_MB = str(sys.maxint // (1024 * 1024))
-  MAXINT_GB = str(sys.maxint // (1024 * 1024 * 1024))
+  MAXINT_BYTES = str(sys.maxsize)
+  MAXINT_MB = str(sys.maxsize // (1024 * 1024))
+  MAXINT_GB = str(sys.maxsize // (1024 * 1024 * 1024))
  # We expect the tests with MAXINT_* using valid units [bmg] to succeed.
  PASS_REGEX = re.compile("(%s|%s|%s)[bmg]?$" % (MAXINT_BYTES, MAXINT_MB, MAXINT_GB),
                          re.I)
--- a/tests/query_test/test_scanners_fuzz.py
+++ b/tests/query_test/test_scanners_fuzz.py
@@ -16,7 +16,7 @@
 # under the License.

 from __future__ import absolute_import, division, print_function
-from builtins import range
+from builtins import range, int
 from copy import copy
 import itertools
 import logging
@@ -175,7 +175,7 @@ class TestScannersFuzzing(ImpalaTestSuite):
    rng = random.Random()
    random_seed = os.environ.get("SCANNER_FUZZ_SEED") or time.time()
    LOG.info("Using random seed %d", random_seed)
-    rng.seed(long(random_seed))
+    rng.seed(int(random_seed))

    tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % fuzz_table,
        dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
--- a/tests/shell/test_shell_interactive.py
+++ b/tests/shell/test_shell_interactive.py
@@ -626,7 +626,7 @@ class TestImpalaShellInteractive(ImpalaTestSuite):
      self._expect_with_cmd(child_proc, "select 'hi'", vector, ('hi'))
      child_proc.sendline('exit;')
      child_proc.expect(pexpect.EOF)
-      history_contents = file(new_hist.name).read()
+      history_contents = open(new_hist.name).read()
      assert "select 'hi'" in history_contents

  def test_rerun(self, vector, tmp_history_file):
--- a/tests/stress/concurrent_select.py
+++ b/tests/stress/concurrent_select.py
@@ -71,7 +71,7 @@ from copy import copy
 from datetime import datetime
 from multiprocessing import Lock, Process, Queue, Value
 from random import choice, random, randrange, shuffle
-from sys import exit, maxint
+from sys import exit, maxsize
 from tempfile import gettempdir
 from textwrap import dedent
 from threading import current_thread
@@ -596,7 +596,7 @@ class StressRunner(object):
        else:
          # Let the query run as long as necessary - it is nearly impossible to pick a
          # good value that won't have false positives under load - see IMPALA-8222.
-          timeout = maxint
+          timeout = maxsize
        report = query_runner.run_query(query, mem_limit, timeout_secs=timeout,
            cancel_mech=cancel_mech)
        LOG.debug("Got execution report for query")
@@ -858,7 +858,7 @@ def populate_runtime_info_for_random_queries(impala, candidate_queries, converte
  return queries


-def populate_runtime_info(query, impala, converted_args, timeout_secs=maxint):
+def populate_runtime_info(query, impala, converted_args, timeout_secs=maxsize):
  """Runs the given query by itself repeatedly until the minimum memory is determined
  with and without spilling. Potentially all fields in the Query class (except
  'sql') will be populated by this method. 'required_mem_mb_without_spilling' and
@@ -997,7 +997,7 @@ def populate_runtime_info(query, impala, converted_args, timeout_secs=maxint):

  LOG.info("Finding minimum memory required to avoid spilling")
  lower_bound = max(limit_exceeded_mem, spill_mem)
-  upper_bound = min(non_spill_mem or maxint, impala.min_impalad_mem_mb)
+  upper_bound = min(non_spill_mem or maxsize, impala.min_impalad_mem_mb)
  while True:
    if old_required_mem_mb_without_spilling:
      mem_limit = old_required_mem_mb_without_spilling
@@ -1034,7 +1034,7 @@ def populate_runtime_info(query, impala, converted_args, timeout_secs=maxint):
  LOG.info("Finding absolute minimum memory required")
  lower_bound = limit_exceeded_mem
  upper_bound = min(
-      spill_mem or maxint, non_spill_mem or maxint, impala.min_impalad_mem_mb)
+      spill_mem or maxsize, non_spill_mem or maxsize, impala.min_impalad_mem_mb)
  while True:
    if old_required_mem_mb_with_spilling:
      mem_limit = old_required_mem_mb_with_spilling
--- a/tests/stress/query_runner.py
+++ b/tests/stress/query_runner.py
@@ -18,13 +18,14 @@
 # under the License.

 from __future__ import absolute_import, division, print_function
+from builtins import round
 import logging
 from multiprocessing import Value
 import os
 import re
 from textwrap import dedent
 from time import sleep, time
-from sys import maxint
+from sys import maxsize

 from tests.stress.queries import QueryType
 from tests.stress.util import create_and_start_daemon_thread, increment
@@ -103,7 +104,7 @@ class QueryRunner(object):
    self.impalad_conn = self.impalad.impala.connect(impalad=self.impalad)

  def run_query(self, query, mem_limit_mb, run_set_up=False,
-                timeout_secs=maxint, cancel_mech=None, retain_profile=False):
+                timeout_secs=maxsize, cancel_mech=None, retain_profile=False):
    """Run a query and return an execution report. If 'run_set_up' is True, set up sql
    will be executed before the main query. This should be the case during the binary
    search phase of the stress test. 'cancel_mech' is optionally a CancelMechanism
@@ -472,7 +473,7 @@ def _add_row_to_hash(row, curr_hash):
    curr_hash += _hash_val(idx, val)
    # Modulo the result to keep it "small" otherwise the math ops can be slow
    # since python does infinite precision math.
-    curr_hash %= maxint
+    curr_hash %= maxsize
  return curr_hash


--- a/tests/unittests/test_file_parser.py
+++ b/tests/unittests/test_file_parser.py
@@ -88,7 +88,7 @@ class TestTestFileParser(BaseTestSuite):
                                     skip_unknown_sections=False)
      assert 0, 'Expected error due to invalid section'
    except RuntimeError as re:
-      assert re.message == "Unknown subsection: TYPES"
+      assert str(re) == "Unknown subsection: TYPES"

  def test_parse_query_name(self):
    results = parse_test_file_text(test_text, VALID_SECTIONS, False)
--- a/tests/unittests/test_result_verifier.py
+++ b/tests/unittests/test_result_verifier.py
@@ -50,13 +50,13 @@ class TestResultVerifier(ImpalaTestSuite):
      res.rows[0]['does_not_exist']
      assert False, 'Expected error due to column alias not existing'
    except IndexError as e:
-      assert "No column with label: does_not_exist" in e.message
+      assert "No column with label: does_not_exist" in str(e)

    try:
      res.rows[0][2]
      assert False, 'Expected error due to column position not existing'
    except IndexError as e:
-      assert 'list index out of range' in e.message
+      assert 'list index out of range' in str(e)

  def test_compute_aggregation(self, vector):
    profile = '''