From 3ce0004c12f587dbc6ac6d7451b5f362d148d033 Mon Sep 17 00:00:00 2001 From: Joe McDonnell Date: Sat, 25 Oct 2025 09:39:03 -0700 Subject: [PATCH] IMPALA-14512: Remove dependency on sh python package This modifies bin/single_node_perf_run.py to stop using the sh python package. It replaces sh with calls to subprocess. It stops installing sh for both the Python 2 and 3 virtualenvs. Testing: - Ran perf-AB-test job with it and examined the logs Change-Id: Ic5f9316a5d83c5c0dc37d4a94c55b6a655765fe3 Reviewed-on: http://gerrit.cloudera.org:8080/23600 Reviewed-by: Riza Suminto Reviewed-by: Jason Fehr Tested-by: Impala Public Jenkins --- bin/single_node_perf_run.py | 75 ++++++++++++++++++------------ infra/python/deps/requirements.txt | 1 - 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/bin/single_node_perf_run.py b/bin/single_node_perf_run.py index d141f8694..c54c71a49 100755 --- a/bin/single_node_perf_run.py +++ b/bin/single_node_perf_run.py @@ -79,7 +79,6 @@ from tempfile import mkdtemp import json import os import pipes -import sh import shutil import subprocess import sys @@ -99,6 +98,16 @@ def configured_call(cmd): return subprocess.check_call(["bash", "-c", cmd]) +def run_git(args): + """Runs git without capturing output (stdout passes through to stdout)""" + subprocess.check_call(["git"] + args, text=True) + + +def get_git_output(args): + """Runs git, capturing the output and returning it""" + return subprocess.check_output(["git"] + args, text=True) + + def load_data(db_to_load, table_formats, scale): """Loads a database with a particular scale factor.""" all_formats = ("text/none," + table_formats if "text/none" not in table_formats @@ -115,12 +124,12 @@ def load_data(db_to_load, table_formats, scale): def get_git_hash_for_name(name): - return sh.git("rev-parse", name).strip() + return get_git_output(["rev-parse", name]).strip() def build(git_hash, options): """Builds Impala in release mode; doesn't build tests.""" - sh.git.checkout(git_hash) + run_git(["checkout", git_hash]) buildall = ["{0}/buildall.sh".format(IMPALA_HOME), "-notests", "-release", "-noclean"] if options.ninja: buildall += ["-ninja"] @@ -168,15 +177,20 @@ def run_workload(base_dir, workloads, options): def report_benchmark_results(file_a, file_b, description): """Wrapper around report_benchmark_result.py.""" + performance_result = subprocess.check_output( + ["{0}/tests/benchmark/report_benchmark_results.py".format(IMPALA_HOME), + "--reference_result_file={0}".format(file_a), + "--input_result_file={0}".format(file_b), + '--report_description="{0}"'.format(description)], + text=True) + + # Output the performance result to stdout for convenience + print(performance_result) + + # Dump the performance result to a file to preserve result = os.path.join(IMPALA_PERF_RESULTS, "latest", "performance_result.txt") with open(result, "w") as f: - subprocess.check_call( - ["{0}/tests/benchmark/report_benchmark_results.py".format(IMPALA_HOME), - "--reference_result_file={0}".format(file_a), - "--input_result_file={0}".format(file_b), - '--report_description="{0}"'.format(description)], - stdout=f) - sh.cat(result, _out=sys.stdout) + f.write(performance_result) def compare(base_dir, hash_a, hash_b, options): @@ -190,19 +204,17 @@ def compare(base_dir, hash_a, hash_b, options): if options.split_profiles: generate_profile_files(file_a, hash_a, base_dir) generate_profile_files(file_b, hash_b, base_dir) - sh.diff("-u", - os.path.join(base_dir, hash_a + "_profiles"), - os.path.join(base_dir, hash_b + "_profiles"), - _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), - _ok_code=[0, 1]) + with open(os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), "w") as f: + # This does not check that the diff command succeeds + subprocess.run(["diff", "-u", os.path.join(base_dir, hash_a + "_profiles"), + os.path.join(base_dir, hash_b + "_profiles")], stdout=f, text=True) else: generate_profile_file(file_a, hash_a, base_dir) generate_profile_file(file_b, hash_b, base_dir) - sh.diff("-u", - os.path.join(base_dir, hash_a + "_profile.txt"), - os.path.join(base_dir, hash_b + "_profile.txt"), - _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), - _ok_code=[0, 1]) + with open(os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), "w") as f: + # This does not check that the diff command succeeds + subprocess.run(["diff", "-u", os.path.join(base_dir, hash_a + "_profile.txt"), + os.path.join(base_dir, hash_b + "_profile.txt")], stdout=f, text=True) def generate_profile_file(name, hash, base_dir): @@ -253,16 +265,17 @@ def backup_workloads(): Used to keep workloads from being clobbered by git checkout. """ temp_dir = mkdtemp() - sh.cp(os.path.join(IMPALA_HOME, "testdata", "workloads"), - temp_dir, R=True, _out=sys.stdout, _err=sys.stderr) + shutil.copytree(os.path.join(IMPALA_HOME, "testdata", "workloads"), + os.path.join(temp_dir, "workloads")) print("Backed up workloads to {0}".format(temp_dir)) return temp_dir def restore_workloads(source): """Restores the workload directory from source into the Impala tree.""" - sh.cp(os.path.join(source, "workloads"), os.path.join(IMPALA_HOME, "testdata"), - R=True, _out=sys.stdout, _err=sys.stderr) + # dirs_exist_ok=True allows this to overwrite the existing files + shutil.copytree(os.path.join(source, "workloads"), + os.path.join(IMPALA_HOME, "testdata", "workloads"), dirs_exist_ok=True) def perf_ab_test(options, args): @@ -314,7 +327,7 @@ def perf_ab_test(options, args): hash_b = get_git_hash_for_name(args[1]) # discard any changes created by the previous restore_workloads() shutil.rmtree("testdata/workloads") - sh.git.checkout("--", "testdata/workloads") + run_git(["checkout", "--", "testdata/workloads"]) build(hash_b, options) restore_workloads(workload_dir) start_impala(options.num_impalads, options) @@ -399,17 +412,17 @@ def main(): os.chdir(IMPALA_HOME) - if sh.git("status", "--porcelain", "--untracked-files=no", _out=None).strip(): - sh.git("status", "--porcelain", "--untracked-files=no", _out=sys.stdout) + if get_git_output(["status", "--porcelain", "--untracked-files=no"]).strip(): + run_git(["status", "--porcelain", "--untracked-files=no"]) # Something went wrong, let's dump the actual diff to make it easier to # track down print("#### Working copy is dirty, dumping the diff #####") - sh.git("--no-pager", "diff", _out=sys.stdout) + run_git(["--no-pager", "diff"]) print("#### End of diff #####") raise Exception("Working copy is dirty. Consider 'git stash' and try again.") # Save the current hash to be able to return to this place in the tree when done - current_hash = sh.git("rev-parse", "--abbrev-ref", "HEAD").strip() + current_hash = get_git_output(["rev-parse", "--abbrev-ref", "HEAD"]).strip() if current_hash == "HEAD": current_hash = get_git_hash_for_name("HEAD") @@ -419,8 +432,8 @@ def main(): finally: # discard any changes created by the previous restore_workloads() shutil.rmtree("testdata/workloads") - sh.git.checkout("--", "testdata/workloads") - sh.git.checkout(current_hash) + run_git(["checkout", "--", "testdata/workloads"]) + run_git(["checkout", current_hash]) restore_workloads(workloads) diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt index ae27ff169..ad4f43e11 100644 --- a/infra/python/deps/requirements.txt +++ b/infra/python/deps/requirements.txt @@ -54,7 +54,6 @@ requests == 2.21.0 urllib3 == 1.24.2 certifi == 2020.12.5 sasl == 0.2.1 -sh == 1.11 six == 1.14.0 sqlparse == 0.3.1 texttable == 0.8.3