From 667403b2cb0c05fffd33dc6f8b53dba7f9957a0a Mon Sep 17 00:00:00 2001 From: Riza Suminto Date: Thu, 20 Apr 2023 17:31:59 -0700 Subject: [PATCH] IMPALA-12090: Split runtime profiles made by single_node_perf_run.py single_node_perf_run.py produces a single text file containing all runtime profiles from a perf run for one git hash. This is handy, but the resulting text file can be very long and makes it difficult to analyze an individual profile. This patch adds --split_profiles and --no_split_profiles options to single_node_perf_run.py. If --split_profiles is specified, it will extract runtime profiles into individual files instead of a single long text file. Specifying --no_split_profiles will retain the old behavior of putting runtime profiles into a single combined text file. Defaults to split profiles if neither is specified. Files in the profile directory will look like this with --split_profiles: $ ls -1 perf_results/latest/2267d9d104cc3fb0740cba09acb369b4d7ae4f52_profiles/ TPCDS-Q14-1_iter001.txt TPCDS-Q14-1_iter002.txt TPCDS-Q14-1_iter003.txt TPCDS-Q14-2_iter001.txt TPCDS-Q14-2_iter002.txt TPCDS-Q14-2_iter003.txt TPCDS-Q23-1_iter001.txt TPCDS-Q23-1_iter002.txt TPCDS-Q23-1_iter003.txt TPCDS-Q23-2_iter001.txt TPCDS-Q23-2_iter002.txt TPCDS-Q23-2_iter003.txt Testing: - Manually ran the script with selected queries from the tpcds workload with either --split_profiles or --no_split_profiles. 
Change-Id: Ibc2d3cefd7ad61b76cbef74c734543ef9ca51795 Reviewed-on: http://gerrit.cloudera.org:8080/19796 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- bin/single_node_perf_run.py | 64 ++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/bin/single_node_perf_run.py b/bin/single_node_perf_run.py index 1452c9fee..2c0418175 100755 --- a/bin/single_node_perf_run.py +++ b/bin/single_node_perf_run.py @@ -107,6 +107,7 @@ def load_data(db_to_load, table_formats, scale): configured_call(["{0}/tests/util/compute_table_stats.py".format(IMPALA_HOME), "--stop_on_error", "--db_names", db_name]) + def get_git_hash_for_name(name): return sh.git("rev-parse", name).strip() @@ -169,7 +170,7 @@ def report_benchmark_results(file_a, file_b, description): sh.cat(result, _out=sys.stdout) -def compare(base_dir, hash_a, hash_b): +def compare(base_dir, hash_a, hash_b, options): """Take the results of two performance runs and compare them.""" file_a = os.path.join(base_dir, hash_a + ".json") file_b = os.path.join(base_dir, hash_b + ".json") @@ -177,14 +178,22 @@ def compare(base_dir, hash_a, hash_b): report_benchmark_results(file_a, file_b, description) # From the two json files extract the profiles and diff them - generate_profile_file(file_a, hash_a, base_dir) - generate_profile_file(file_b, hash_b, base_dir) - - sh.diff("-u", - os.path.join(base_dir, hash_a + "_profile.txt"), - os.path.join(base_dir, hash_b + "_profile.txt"), - _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), - _ok_code=[0, 1]) + if options.split_profiles: + generate_profile_files(file_a, hash_a, base_dir) + generate_profile_files(file_b, hash_b, base_dir) + sh.diff("-u", + os.path.join(base_dir, hash_a + "_profiles"), + os.path.join(base_dir, hash_b + "_profiles"), + _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), + _ok_code=[0, 1]) + else: + generate_profile_file(file_a, hash_a, base_dir) + 
generate_profile_file(file_b, hash_b, base_dir) + sh.diff("-u", + os.path.join(base_dir, hash_a + "_profile.txt"), + os.path.join(base_dir, hash_b + "_profile.txt"), + _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), + _ok_code=[0, 1]) def generate_profile_file(name, hash, base_dir): @@ -202,6 +211,33 @@ def generate_profile_file(name, hash, base_dir): out.write("\n\n") +def generate_profile_files(name, hash, base_dir): + """Extracts runtime profiles from the JSON file 'name'. + + Writes the runtime profiles back as separated simple text file in '[hash]_profiles' dir + in base_dir. + """ + profile_dir = os.path.join(base_dir, hash + "_profiles") + if not os.path.exists(profile_dir): + os.makedirs(profile_dir) + with open(name) as fid: + data = json.loads(fid.read().decode("utf-8", "ignore")) + iter_num = {} + # For each query + for key in data: + for iteration in data[key]: + query_name = iteration["query"]["name"] + if query_name in iter_num: + iter_num[query_name] += 1 + else: + iter_num[query_name] = 1 + curr_iter = iter_num[query_name] + + file_name = "{}_iter{:03d}.txt".format(query_name, curr_iter) + with open(os.path.join(profile_dir, file_name), "w") as out: + out.write(iteration["runtime_profile"]) + + def backup_workloads(): """Copy the workload folder to a temporary directory and returns its name. 
@@ -266,7 +302,7 @@ def perf_ab_test(options, args): restore_workloads(workload_dir) start_impala(options.num_impalads, options) run_workload(temp_dir, workloads, options) - compare(temp_dir, hash_a, hash_b) + compare(temp_dir, hash_a, hash_b, options) def parse_options(): @@ -289,10 +325,16 @@ def parse_options(): parser.add_option("--start_minicluster", action="store_true", help="start a new Hadoop minicluster") parser.add_option("--ninja", action="store_true", - help = "use ninja, rather than Make, as the build tool") + help="use ninja, rather than Make, as the build tool") parser.add_option("--impalad_args", dest="impalad_args", action="append", type="string", default=[], help="Additional arguments to pass to each Impalad during startup") + parser.add_option("--split_profiles", action="store_true", dest="split_profiles", + default=True, help=("If specified, query profiles will be generated " + "as separate files")) + parser.add_option("--no_split_profiles", action="store_false", dest="split_profiles", + help=("If specified, query profiles will be generated as a " + "single-combined file")) parser.set_usage(textwrap.dedent(""" single_node_perf_run.py [options] git_hash_A [git_hash_B]