[CDH5] Improved benchmark report formatting

- Results are displayed more compactly in the report
- Only nodes with significant performance changes are displayed in execution summary comparisons
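A rough sketch of the filtering rule behind the second bullet, written as a standalone helper (the 1% threshold on a node's time change relative to total query time and the --output_all_summary_nodes override both come from the diff below; the helper itself and its lowercase dictionary key are only illustrative stand-ins for the script's is_significant check and AVG_TIME_CHANGE_TOTAL constant):

def row_is_significant(row, output_all_summary_nodes=False, threshold=0.01):
    # Keep an exec summary node only when its average-time change, measured
    # against the total query time, exceeds the threshold (1% by default),
    # unless the caller explicitly asked for all nodes.
    return output_all_summary_nodes or abs(row['avg_time_change_total']) > threshold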

Change-Id: I51fe1f71760ba451b23d3d0ec31358c848e54976
Reviewed-on: http://gerrit.sjc.cloudera.com:8080/3979
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: jenkins
(cherry picked from commit feabfbc35c2f45c07a32d4925477ae52552b0b39)
Reviewed-on: http://gerrit.sjc.cloudera.com:8080/4201
Author: Taras Bobrovytsky
Date: 2014-08-21 16:13:40 -07:00
Committed by: jenkins
Parent: 3acee6b2b6
Commit: dae2efc16d


@@ -78,6 +78,9 @@ parser.add_option("--cluster_name", dest="cluster_name", default='UNKNOWN',
help="Name of the cluster the results are from (ex. Bolt)")
parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
default= False, help='Outputs to console with increased verbosity')
parser.add_option("--output_all_summary_nodes", dest="output_all_summary_nodes",
action="store_true", default= False,
help='Print all execution summary nodes')
parser.add_option("--build_version", dest="build_version", default='UNKNOWN',
help="Build/version info about the Impalad instance results are from.")
parser.add_option("--lab_run_info", dest="lab_run_info", default='UNKNOWN',
@@ -161,12 +164,12 @@ def get_dict_from_json(filename):
level = list()
# In the outer layer, we group by workload name and scale factor
level.append([('query', 'workload_name'), ('query', 'scale_factor')])
# In the middle layer, we group by query name
level.append([('query', 'name')])
# In the inner layer, we group by file format and compression type
# In the middle layer, we group by file format and compression type
level.append([('query', 'test_vector', 'file_format'),
('query', 'test_vector', 'compression_codec'),
('query', 'test_vector', 'compression_type')])
# In the bottom layer, we group by query name
level.append([('query', 'name')])
key = []
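After this reordering, results are grouped first by workload and scale factor, then by file format and compression, and finally by query name. The traversal below mirrors the loops used in calculate_time_stats later in this change (grouped and RESULT_LIST are the script's own names; the comments show example values only):

for workload_scale, workload in grouped.items():      # e.g. tpch / 300gb
    for file_format, queries in workload.items():     # e.g. parquet / zip / block
        for query_name, results in queries.items():   # e.g. a single TPC-H query
            result_list = results[RESULT_LIST]         # raw per-iteration QueryResult entries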
@@ -202,6 +205,8 @@ def get_dict_from_json(filename):
for workload_name, workload in data.items():
for query_result in workload:
add_result(query_result)
# Calculate average runtime and stddev for each query type
calculate_time_stats(grouped)
return grouped
def calculate_time_stats(grouped):
@@ -209,10 +214,10 @@ def calculate_time_stats(grouped):
and Standard Deviation for each query type.
"""
for workload_scale in grouped:
for query_name in grouped[workload_scale]:
for file_format in grouped[workload_scale][query_name]:
result_list = grouped[workload_scale][query_name][file_format][RESULT_LIST]
for workload_scale, workload in grouped.items():
for file_format, queries in workload.items():
for query_name, results in queries.items():
result_list = results[RESULT_LIST]
avg = calculate_avg(
[query_results[TIME_TAKEN] for query_results in result_list])
dev = calculate_stddev(
@@ -220,49 +225,15 @@ def calculate_time_stats(grouped):
num_clients = max(
int(query_results[CLIENT_NAME]) for query_results in result_list)
iterations = len(result_list)
results[AVG] = avg
results[STDDEV] = dev
results[NUM_CLIENTS] = num_clients
results[ITERATIONS] = iterations
grouped[workload_scale][query_name][file_format][AVG] = avg
grouped[workload_scale][query_name][file_format][STDDEV] = dev
grouped[workload_scale][query_name][file_format][NUM_CLIENTS] = num_clients
grouped[workload_scale][query_name][file_format][ITERATIONS] = iterations
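calculate_avg and calculate_stddev are helpers defined elsewhere in the script and are not part of this diff; the sketch below is only an assumption about what they compute, not the actual implementation:

import math

def calculate_avg(values):
    # Arithmetic mean of the sampled runtimes (assumed behaviour).
    return sum(values) / float(len(values))

def calculate_stddev(values):
    # Population standard deviation of the sampled runtimes (assumed behaviour).
    avg = calculate_avg(values)
    return math.sqrt(sum((v - avg) ** 2 for v in values) / float(len(values)))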
def calculate_workload_file_format_runtimes(grouped):
"""Calculate average time for each workload and scale factor, for each file format and
compression.
This returns a new dictionary with average times.
Here's an example of how this dictionary is structured:
dictionary->
(('workload', 'tpch'), ('scale', '300gb'))->
(('file_format','parquet'), ('compression_codec','zip'), ('compression_type','block'))->
'avg'
We also have access to the list of QueryResult associated with each file_format
The difference between this dictionary and grouped_queries is that query name is missing
after workload.
"""
new_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
# First populate the dictionary with query results
for workload_scale, workload in grouped.items():
for query_name, file_formats in workload.items():
for file_format, results in file_formats.items():
new_dict[workload_scale][file_format][RESULT_LIST].extend(results[RESULT_LIST])
# Do the average calculation. Standard deviation could also be calculated here
for workload_scale in new_dict:
for file_format in new_dict[workload_scale]:
avg = calculate_avg([query_results[TIME_TAKEN]
for query_results in new_dict[workload_scale][file_format][RESULT_LIST]])
new_dict[workload_scale][file_format][AVG] = avg
return new_dict
def build_perf_change_str(result, ref_result, regression):
def build_perf_change_str(result, ref_result, is_regression):
"""Build a performance change string"""
perf_change_type = "regression" if regression else "improvement"
perf_change_type = "regression" if is_regression else "improvement"
query = result[RESULT_LIST][0][QUERY]
query_name = query[NAME]
@@ -270,7 +241,7 @@ def build_perf_change_str(result, ref_result, regression):
compression_codec = query[TEST_VECTOR][COMPRESSION_CODEC]
compression_type = query[TEST_VECTOR][COMPRESSION_TYPE]
template = ("\nSignificant perf {perf_change_type} detected: "
template = ("Significant perf {perf_change_type}: "
"{query_name} [{file_format}/{compression_codec}/{compression_type}] "
"({ref_avg:.3f}s -> {avg:.3f}s)")
return template.format(
@@ -521,8 +492,8 @@ class ExecSummaryComparison(object):
ref_row = self.ref_combined_summary.rows[i]
comparison_row = {}
for key in [PREFIX, OPERATOR, NUM_HOSTS, AVG_TIME, STDDEV_TIME, MAX_TIME, PEAK_MEM,
NUM_ROWS, EST_NUM_ROWS, EST_PEAK_MEM, DETAIL]:
for key in [PREFIX, OPERATOR, NUM_HOSTS, AVG_TIME, STDDEV_TIME,
MAX_TIME, PEAK_MEM, NUM_ROWS, EST_NUM_ROWS, EST_PEAK_MEM, DETAIL]:
comparison_row[key] = row[key]
comparison_row[AVG_TIME_CHANGE] = self.__calculate_change(
@@ -565,19 +536,24 @@ class ExecSummaryComparison(object):
"Est #Rows"])
table.align = 'l'
for row in self.rows:
table_row = [ row[PREFIX] + row[OPERATOR],
prettyprint_values(row[NUM_HOSTS]),
prettyprint_time(row[AVG_TIME]),
prettyprint_time(row[STDDEV_TIME]),
prettyprint_percent(row[AVG_TIME_CHANGE]),
prettyprint_percent(row[AVG_TIME_CHANGE_TOTAL]),
prettyprint_time(row[MAX_TIME]),
prettyprint_percent(row[MAX_TIME_CHANGE]),
prettyprint_values(row[NUM_ROWS]),
prettyprint_values(row[EST_NUM_ROWS]) ]
def is_significant(row):
"""Check if the performance change in this row was significant"""
return options.output_all_summary_nodes or abs(row[AVG_TIME_CHANGE_TOTAL]) > 0.01
table.add_row(table_row)
for row in self.rows:
if is_significant(row):
table_row = [row[OPERATOR],
prettyprint_values(row[NUM_HOSTS]),
prettyprint_time(row[AVG_TIME]),
prettyprint_time(row[STDDEV_TIME]),
prettyprint_percent(row[AVG_TIME_CHANGE]),
prettyprint_percent(row[AVG_TIME_CHANGE_TOTAL]),
prettyprint_time(row[MAX_TIME]),
prettyprint_percent(row[MAX_TIME_CHANGE]),
prettyprint_values(row[NUM_ROWS]),
prettyprint_values(row[EST_NUM_ROWS]) ]
table.add_row(table_row)
return str(table)
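The prettyprint_* helpers used above are also defined outside this diff; the sketch below is a guess at their behaviour, given only to make the table code easier to read (assumed, not the real implementations):

def prettyprint_percent(fraction):
    # Render a fractional change such as 0.034 as '+3.40%' (assumed formatting).
    return 'N/A' if fraction is None else '{0:+.2%}'.format(fraction)

def prettyprint_time(seconds):
    # Render a duration in seconds with millisecond precision (assumed formatting).
    return 'N/A' if seconds is None else '{0:.3f}s'.format(seconds)

def prettyprint_values(value):
    # Render a count with thousands separators (assumed formatting).
    return 'N/A' if value is None else '{0:,}'.format(value)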
@@ -653,6 +629,21 @@ def build_exec_summary_str(results, ref_results):
return str(comparison) + '\n'
def build_perf_change_row(result, ref_result, is_regression):
"""Build a performance change table row"""
query = result[RESULT_LIST][0][QUERY]
query_name = query[NAME]
file_format = query[TEST_VECTOR][FILE_FORMAT]
compression_codec = query[TEST_VECTOR][COMPRESSION_CODEC]
compression_type = query[TEST_VECTOR][COMPRESSION_TYPE]
format_str = '{0}/{1}/{2}'.format(file_format, compression_codec, compression_type)
ref_avg = ref_result[AVG]
avg = result[AVG]
return [query_name, format_str, ref_avg, avg]
def compare_time_stats(grouped, ref_grouped):
"""Given two nested dictionaries generated by get_dict_from_json, after running
calculate_time_stats on both, compare the performance of the given run to a reference
@@ -661,8 +652,9 @@ def compare_time_stats(grouped, ref_grouped):
A string will be returned with instances where there is a significant performance
difference
"""
out_str = str()
all_exec_summaries = str()
regression_table_data = list()
improvement_table_data = list()
full_comparison_str = str()
for workload_scale_key, workload in grouped.items():
for query_name, file_formats in workload.items():
for file_format, results in file_formats.items():
@@ -671,15 +663,23 @@ def compare_time_stats(grouped, ref_grouped):
results, ref_results)
if change_significant:
out_str += build_perf_change_str(results, ref_results, is_regression) + '\n'
out_str += build_exec_summary_str(results, ref_results)
full_comparison_str += build_perf_change_str(
results, ref_results, is_regression) + '\n'
full_comparison_str += build_exec_summary_str(results, ref_results) + '\n'
change_row = build_perf_change_row(results, ref_results, is_regression)
if is_regression:
regression_table_data.append(change_row)
else:
improvement_table_data.append(change_row)
try:
save_runtime_diffs(results, ref_results, change_significant, is_regression)
except Exception as e:
print 'Could not generate an html diff: %s' % e
return out_str
return full_comparison_str, regression_table_data, improvement_table_data
def is_result_group_comparable(grouped, ref_grouped):
"""Given two nested dictionaries generated by get_dict_from_json, return true if they
@@ -726,46 +726,49 @@ def build_summary_header():
summary += 'Lab Run Info: {0}\n'.format(options.lab_run_info)
return summary
def get_summary_str(workload_ff):
"""This prints a table containing the average run time per file format"""
def get_summary_str(grouped):
summary_str = str()
summary_str += build_summary_header() + '\n'
for workload_scale in workload_ff:
for workload_scale, workload in grouped.items():
summary_str += "{0} / {1} \n".format(workload_scale[0][1], workload_scale[1][1])
table = prettytable.PrettyTable(["File Format", "Compression", "Avg (s)"])
table.align = 'l'
table.float_format = '.2'
for file_format in workload_ff[workload_scale]:
for file_format, queries in workload.items():
# Calculate the average time for each file format and compression
ff = file_format[0][1]
compression = file_format[1][1] + " / " + file_format[2][1]
avg = workload_ff[workload_scale][file_format][AVG]
avg = calculate_avg([query_results[TIME_TAKEN] for results in queries.values() for
query_results in results[RESULT_LIST]])
table.add_row([ff, compression, avg])
summary_str += str(table) + '\n'
return summary_str
def get_stats_str(grouped):
stats_str = str()
for workload_scale_key, workload in grouped.items():
stats_str += "Workload / Scale Factor: {0} / {1}".format(workload_scale_key[0][1],
workload_scale_key[1][1])
for query_name, file_formats in workload.items():
stats_str += "\n\nQuery: {0} \n".format(query_name[0][1])
table = prettytable.PrettyTable(
["File Format", "Compression", "Avg(s)", "StdDev(s)", "Num Clients", "Iters"])
table.align = 'l'
table.float_format = '.2'
for file_format, results in file_formats.items():
table_row = []
# File Format
table_row.append(file_format[0][1])
# Compression
table_row.append(file_format[1][1] + " / " + file_format[2][1])
table_row.append(results[AVG])
table_row.append(results[STDDEV])
table_row.append(results[NUM_CLIENTS])
table_row.append(results[ITERATIONS])
table.add_row(table_row)
for workload_scale, workload in grouped.items():
stats_str += "Workload / Scale Factor: {0} / {1}\n".format(
workload_scale[0][1], workload_scale[1][1])
table = prettytable.PrettyTable(["Query", "File Format", "Compression", "Avg(s)",
"StdDev(s)", "Rel StdDev", "Num Clients", "Iters"])
table.align = 'l'
table.float_format = '.2'
for file_format, queries in workload.items():
for query_name, results in queries.items():
relative_stddev = results[STDDEV] / results[AVG] if results[AVG] > 0 else 0.0
relative_stddev_str = '{0:.2%}'.format(relative_stddev)
if relative_stddev > 0.1:
relative_stddev_str = '* ' + relative_stddev_str + ' *'
else:
relative_stddev_str = ' ' + relative_stddev_str
table.add_row([query_name[0][1],
file_format[0][1],
file_format[1][1] + ' / ' + file_format[2][1],
results[AVG],
results[STDDEV],
relative_stddev_str,
results[NUM_CLIENTS],
results[ITERATIONS]])
stats_str += str(table) + '\n'
return stats_str
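Worked example of the relative-stddev flag added above, with made-up numbers: a query averaging 10s with a 1.5s standard deviation has a relative stddev of 15%, which exceeds the 10% threshold and is wrapped in asterisks so noisy results stand out.

avg, stddev = 10.0, 1.5
relative_stddev = stddev / avg if avg > 0 else 0.0                           # 0.15
if relative_stddev > 0.1:
    relative_stddev_str = '* ' + '{0:.2%}'.format(relative_stddev) + ' *'    # '* 15.00% *'
else:
    relative_stddev_str = '  ' + '{0:.2%}'.format(relative_stddev)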
@@ -775,7 +778,6 @@ def all_query_results(grouped):
for file_format, results in file_formats.items():
yield(results)
def write_results_to_datastore(grouped):
""" Saves results to a database """
from perf_result_datastore import PerfResultDataStore
@@ -832,6 +834,19 @@ def write_results_to_datastore(grouped):
runtime_profile = runtime_profile,
is_official = options.is_official)
def build_perf_summary_table(table_data):
table = prettytable.PrettyTable(
['Query',
'Format',
'Original Time (s)',
'Current Time (s)'])
table.align = 'l'
table.float_format = '.2'
for row in table_data:
table.add_row(row)
return str(table)
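Example of the data that flows into build_perf_summary_table: the row layout comes from build_perf_change_row above, while the query name and timings are made up for illustration (the format string follows the zip/block example from the removed docstring):

regression_table_data = [
    # [query, file format/codec/type, reference avg (s), current avg (s)]
    ['TPCH-Q1', 'parquet/zip/block', 4.10, 5.25],
]
print build_perf_summary_table(regression_table_data)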
if __name__ == "__main__":
"""Workflow:
1. Build a nested dictionary for the current result JSON and reference result JSON.
@@ -852,22 +867,34 @@ if __name__ == "__main__":
print 'Could not read reference result file: %s' % e
ref_grouped = None
# Calculate average runtime and stddev for each query type
calculate_time_stats(grouped)
if ref_grouped is not None:
calculate_time_stats(ref_grouped)
if options.save_to_db: write_results_to_datastore(grouped)
summary_str = get_summary_str(calculate_workload_file_format_runtimes(grouped))
summary_str = get_summary_str(grouped)
stats_str = get_stats_str(grouped)
if is_result_group_comparable(grouped, ref_grouped):
comparison_str = compare_time_stats(grouped, ref_grouped)
else:
comparison_str = ("Comparison could not be generated because reference results do "
"not contain all queries\nthat in results (or reference results are "
"missing)")
comparison_str = ("Comparison could not be generated because reference results do "
"not contain all queries\nin results (or reference results are "
"missing)")
regression_table_data = []
improvement_table_data = []
if is_result_group_comparable(grouped, ref_grouped):
comparison_str, regression_table_data, improvement_table_data = compare_time_stats(
grouped, ref_grouped)
regression_table_str = str()
improvement_table_str = str()
if len(regression_table_data) > 0:
regression_table_str += 'Performance Regressions:\n'
regression_table_str += build_perf_summary_table(regression_table_data) + '\n'
if len(improvement_table_data) > 0:
improvement_table_str += 'Performance Improvements:\n'
improvement_table_str += build_perf_summary_table(improvement_table_data) + '\n'
print build_summary_header()
print summary_str
print stats_str
print regression_table_str
print improvement_table_str
print comparison_str