From 667403b2cb0c05fffd33dc6f8b53dba7f9957a0a Mon Sep 17 00:00:00 2001 From: Riza Suminto Date: Thu, 20 Apr 2023 17:31:59 -0700 Subject: [PATCH] IMPALA-12090: Split runtime profiles made by single_node_perf_run.py single_node_perf_run.py produces a single text file containing all runtime profiles from a perf run for one git hash. This is handy, but the resulting text file can be very long and makes it difficult to analyze an individual profile. This patch adds --split_profiles and --no_split_profiles options to single_node_perf_run.py. If --split_profiles is specified, it will extract runtime profiles into individual files instead of a single long text file. Specifying --no_split_profiles will retain the old behavior of putting runtime profiles into a single combined text file. Defaults to split profiles if neither is specified. Files in the profile directory will look like this with --split_profiles: $ ls -1 perf_results/latest/2267d9d104cc3fb0740cba09acb369b4d7ae4f52_profiles/ TPCDS-Q14-1_iter001.txt TPCDS-Q14-1_iter002.txt TPCDS-Q14-1_iter003.txt TPCDS-Q14-2_iter001.txt TPCDS-Q14-2_iter002.txt TPCDS-Q14-2_iter003.txt TPCDS-Q23-1_iter001.txt TPCDS-Q23-1_iter002.txt TPCDS-Q23-1_iter003.txt TPCDS-Q23-2_iter001.txt TPCDS-Q23-2_iter002.txt TPCDS-Q23-2_iter003.txt Testing: - Manually ran the script with selected queries from the tpcds workload with either --split_profiles or --no_split_profiles. 
Change-Id: Ibc2d3cefd7ad61b76cbef74c734543ef9ca51795 Reviewed-on: http://gerrit.cloudera.org:8080/19796 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- bin/single_node_perf_run.py | 64 ++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/bin/single_node_perf_run.py b/bin/single_node_perf_run.py index 1452c9fee..2c0418175 100755 --- a/bin/single_node_perf_run.py +++ b/bin/single_node_perf_run.py @@ -107,6 +107,7 @@ def load_data(db_to_load, table_formats, scale): configured_call(["{0}/tests/util/compute_table_stats.py".format(IMPALA_HOME), "--stop_on_error", "--db_names", db_name]) + def get_git_hash_for_name(name): return sh.git("rev-parse", name).strip() @@ -169,7 +170,7 @@ def report_benchmark_results(file_a, file_b, description): sh.cat(result, _out=sys.stdout) -def compare(base_dir, hash_a, hash_b): +def compare(base_dir, hash_a, hash_b, options): """Take the results of two performance runs and compare them.""" file_a = os.path.join(base_dir, hash_a + ".json") file_b = os.path.join(base_dir, hash_b + ".json") @@ -177,14 +178,22 @@ def compare(base_dir, hash_a, hash_b): report_benchmark_results(file_a, file_b, description) # From the two json files extract the profiles and diff them - generate_profile_file(file_a, hash_a, base_dir) - generate_profile_file(file_b, hash_b, base_dir) - - sh.diff("-u", - os.path.join(base_dir, hash_a + "_profile.txt"), - os.path.join(base_dir, hash_b + "_profile.txt"), - _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), - _ok_code=[0, 1]) + if options.split_profiles: + generate_profile_files(file_a, hash_a, base_dir) + generate_profile_files(file_b, hash_b, base_dir) + sh.diff("-u", + os.path.join(base_dir, hash_a + "_profiles"), + os.path.join(base_dir, hash_b + "_profiles"), + _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), + _ok_code=[0, 1]) + else: + generate_profile_file(file_a, hash_a, base_dir) + 
generate_profile_file(file_b, hash_b, base_dir) + sh.diff("-u", + os.path.join(base_dir, hash_a + "_profile.txt"), + os.path.join(base_dir, hash_b + "_profile.txt"), + _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"), + _ok_code=[0, 1]) def generate_profile_file(name, hash, base_dir): @@ -202,6 +211,33 @@ def generate_profile_file(name, hash, base_dir): out.write("\n\n") +def generate_profile_files(name, hash, base_dir): + """Extracts runtime profiles from the JSON file 'name'. + + Writes the runtime profiles back as separated simple text file in '[hash]_profiles' dir + in base_dir. + """ + profile_dir = os.path.join(base_dir, hash + "_profiles") + if not os.path.exists(profile_dir): + os.makedirs(profile_dir) + with open(name) as fid: + data = json.loads(fid.read().decode("utf-8", "ignore")) + iter_num = {} + # For each query + for key in data: + for iteration in data[key]: + query_name = iteration["query"]["name"] + if query_name in iter_num: + iter_num[query_name] += 1 + else: + iter_num[query_name] = 1 + curr_iter = iter_num[query_name] + + file_name = "{}_iter{:03d}.txt".format(query_name, curr_iter) + with open(os.path.join(profile_dir, file_name), "w") as out: + out.write(iteration["runtime_profile"]) + + def backup_workloads(): """Copy the workload folder to a temporary directory and returns its name. 
@@ -266,7 +302,7 @@ def perf_ab_test(options, args): restore_workloads(workload_dir) start_impala(options.num_impalads, options) run_workload(temp_dir, workloads, options) - compare(temp_dir, hash_a, hash_b) + compare(temp_dir, hash_a, hash_b, options) def parse_options(): @@ -289,10 +325,16 @@ def parse_options(): parser.add_option("--start_minicluster", action="store_true", help="start a new Hadoop minicluster") parser.add_option("--ninja", action="store_true", - help = "use ninja, rather than Make, as the build tool") + help="use ninja, rather than Make, as the build tool") parser.add_option("--impalad_args", dest="impalad_args", action="append", type="string", default=[], help="Additional arguments to pass to each Impalad during startup") + parser.add_option("--split_profiles", action="store_true", dest="split_profiles", + default=True, help=("If specified, query profiles will be generated " + "as separate files")) + parser.add_option("--no_split_profiles", action="store_false", dest="split_profiles", + help=("If specified, query profiles will be generated as a " + "single-combined file")) parser.set_usage(textwrap.dedent(""" single_node_perf_run.py [options] git_hash_A [git_hash_B]