mirror of https://github.com/google/benchmark.git
compare.py: compute and print 'OVERALL GEOMEAN' aggregate (#1289)
Despite the wide variety of features we provide, some people still have the audacity to complain and demand more. Concretely, I *very* often would like to see the overall result of the benchmark: is the 'new' better or worse, overall, across all the non-aggregate time/cpu measurements? This comes up for me most often when I want to quickly see what effect some LLVM optimization change has on the benchmark. The idea is straightforward: just produce four lists — wall times for the LHS benchmark, CPU times for the LHS benchmark, wall times for the RHS benchmark, CPU times for the RHS benchmark — then compute the geomean of each of those four lists, and compute the two percentage changes between * the geomean wall time for the LHS benchmark and the geomean wall time for the RHS benchmark * the geomean CPU time for the LHS benchmark and the geomean CPU time for the RHS benchmark — and voila! It is complicated by the fact that it needs to gracefully handle different time units, so a pandas.Timedelta dependency is introduced. That is the only library that does not barf upon floating-point times; I have tried numpy.timedelta64 (only takes integers) and Python's datetime.timedelta (does not take nanoseconds), and they won't do. Fixes https://github.com/google/benchmark/issues/1147
This commit is contained in:
parent
ce92bbfb90
commit
d6ba952fc1
|
@ -1,2 +1,3 @@
|
|||
numpy == 1.19.4
|
||||
scipy == 1.5.4
|
||||
pandas == 1.1.5
|
||||
|
|
|
@ -7,7 +7,9 @@ import re
|
|||
import copy
|
||||
import random
|
||||
|
||||
from scipy.stats import mannwhitneyu
|
||||
from scipy.stats import mannwhitneyu, gmean
|
||||
from numpy import array
|
||||
from pandas import Timedelta
|
||||
|
||||
|
||||
class BenchmarkColor(object):
|
||||
|
@ -150,6 +152,30 @@ def partition_benchmarks(json1, json2):
|
|||
return partitions
|
||||
|
||||
|
||||
def get_timedelta_field_as_seconds(benchmark, field_name):
    """
    Return the value of the `field_name` field of `benchmark`, interpreted
    as a time in the benchmark's `time_unit`, converted to seconds.

    The benchmark's `time_unit` defaults to seconds when absent.  pandas
    Timedelta is used because it accepts floating-point values in any unit
    (numpy.timedelta64 only takes integers; datetime.timedelta has no
    nanosecond support).
    """
    # Idiomatic default lookup instead of an explicit 'in' check.
    time_unit = benchmark.get('time_unit', 's')
    dt = Timedelta(benchmark[field_name], time_unit)
    # Dividing two Timedeltas yields a plain float ratio (here: seconds).
    return dt / Timedelta(1, 's')
|
||||
|
||||
|
||||
def calculate_geomean(json):
    """
    Collect every non-aggregate benchmark's real/cpu time (normalized to
    seconds) and return their geometric mean as a numpy array of the form
    [geomean_real_time, geomean_cpu_time], or an empty array when there
    are no raw measurements.
    """
    # Aggregate rows (mean/median/stddev, marked run_type == 'aggregate')
    # must not contribute to the overall geomean — only raw runs count.
    times = [
        [get_timedelta_field_as_seconds(entry, 'real_time'),
         get_timedelta_field_as_seconds(entry, 'cpu_time')]
        for entry in json['benchmarks']
        if entry.get('run_type') != 'aggregate'
    ]
    return gmean(times) if times else array([])
|
||||
|
||||
|
||||
def extract_field(partition, field_name):
|
||||
# The count of elements may be different. We want *all* of them.
|
||||
lhs = [x[field_name] for x in partition[0]]
|
||||
|
@ -174,6 +200,7 @@ def calc_utest(timings_cpu, timings_time):
|
|||
|
||||
return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
|
||||
|
||||
|
||||
def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
|
||||
def get_utest_color(pval):
|
||||
return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
|
||||
|
@ -242,7 +269,8 @@ def get_difference_report(
|
|||
if utest:
|
||||
timings_cpu = extract_field(partition, 'cpu_time')
|
||||
timings_time = extract_field(partition, 'real_time')
|
||||
have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time)
|
||||
have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
|
||||
timings_cpu, timings_time)
|
||||
if cpu_pvalue and time_pvalue:
|
||||
utest_results = {
|
||||
'have_optimal_repetitions': have_optimal_repetitions,
|
||||
|
@ -268,6 +296,25 @@ def get_difference_report(
|
|||
'utest': utest_results
|
||||
})
|
||||
|
||||
lhs_gmean = calculate_geomean(json1)
|
||||
rhs_gmean = calculate_geomean(json2)
|
||||
if lhs_gmean.any() and rhs_gmean.any():
|
||||
diff_report.append({
|
||||
'name': 'OVERALL_GEOMEAN',
|
||||
'measurements': [{
|
||||
'real_time': lhs_gmean[0],
|
||||
'cpu_time': lhs_gmean[1],
|
||||
'real_time_other': rhs_gmean[0],
|
||||
'cpu_time_other': rhs_gmean[1],
|
||||
'time': calculate_change(lhs_gmean[0], rhs_gmean[0]),
|
||||
'cpu': calculate_change(lhs_gmean[1], rhs_gmean[1])
|
||||
}],
|
||||
'time_unit': 's',
|
||||
'run_type': 'aggregate',
|
||||
'aggregate_name': 'geomean',
|
||||
'utest': {}
|
||||
})
|
||||
|
||||
return diff_report
|
||||
|
||||
|
||||
|
@ -307,19 +354,19 @@ def print_difference_report(
|
|||
if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate':
|
||||
for measurement in benchmark['measurements']:
|
||||
output_strs += [color_format(use_color,
|
||||
fmt_str,
|
||||
BC_HEADER,
|
||||
benchmark['name'],
|
||||
first_col_width,
|
||||
get_color(measurement['time']),
|
||||
measurement['time'],
|
||||
get_color(measurement['cpu']),
|
||||
measurement['cpu'],
|
||||
measurement['real_time'],
|
||||
measurement['real_time_other'],
|
||||
measurement['cpu_time'],
|
||||
measurement['cpu_time_other'],
|
||||
endc=BC_ENDC)]
|
||||
fmt_str,
|
||||
BC_HEADER,
|
||||
benchmark['name'],
|
||||
first_col_width,
|
||||
get_color(measurement['time']),
|
||||
measurement['time'],
|
||||
get_color(measurement['cpu']),
|
||||
measurement['cpu'],
|
||||
measurement['real_time'],
|
||||
measurement['real_time_other'],
|
||||
measurement['cpu_time'],
|
||||
measurement['cpu_time_other'],
|
||||
endc=BC_ENDC)]
|
||||
|
||||
# After processing the measurements, if requested and
|
||||
# if applicable (e.g. u-test exists for given benchmark),
|
||||
|
@ -403,6 +450,7 @@ class TestReportDifference(unittest.TestCase):
|
|||
'-0.1000', '100', '110', '100', '90'],
|
||||
['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
|
||||
['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
|
||||
['OVERALL_GEOMEAN', '-0.8344', '-0.8026', '0', '0', '0', '0']
|
||||
]
|
||||
output_lines_with_header = print_difference_report(
|
||||
self.json_diff_report, use_color=False)
|
||||
|
@ -489,6 +537,15 @@ class TestReportDifference(unittest.TestCase):
|
|||
'time_unit': 's',
|
||||
'utest': {}
|
||||
},
|
||||
{
|
||||
'name': 'OVERALL_GEOMEAN',
|
||||
'measurements': [{'real_time': 1.193776641714438e-06, 'cpu_time': 1.2144445585302297e-06,
|
||||
'real_time_other': 1.9768988699420897e-07, 'cpu_time_other': 2.397447755209533e-07,
|
||||
'time': -0.834399601997324, 'cpu': -0.8025889499549471}],
|
||||
'time_unit': 's',
|
||||
'run_type': 'aggregate',
|
||||
'aggregate_name': 'geomean', 'utest': {}
|
||||
},
|
||||
]
|
||||
self.assertEqual(len(self.json_diff_report), len(expected_output))
|
||||
for out, expected in zip(
|
||||
|
@ -524,6 +581,7 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
|
|||
['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
|
||||
['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
|
||||
['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
|
||||
['OVERALL_GEOMEAN', '-0.5000', '-0.5000', '0', '0', '0', '0']
|
||||
]
|
||||
output_lines_with_header = print_difference_report(
|
||||
self.json_diff_report, use_color=False)
|
||||
|
@ -561,6 +619,16 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
|
|||
'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
|
||||
'time_unit': 'ns',
|
||||
'utest': {}
|
||||
},
|
||||
{
|
||||
'name': 'OVERALL_GEOMEAN',
|
||||
'measurements': [{'real_time': 2.213363839400641e-08, 'cpu_time': 2.213363839400641e-08,
|
||||
'real_time_other': 1.1066819197003185e-08, 'cpu_time_other': 1.1066819197003185e-08,
|
||||
'time': -0.5000000000000009, 'cpu': -0.5000000000000009}],
|
||||
'time_unit': 's',
|
||||
'run_type': 'aggregate',
|
||||
'aggregate_name': 'geomean',
|
||||
'utest': {}
|
||||
}
|
||||
]
|
||||
self.assertEqual(len(self.json_diff_report), len(expected_output))
|
||||
|
@ -631,6 +699,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
|
|||
'repetitions',
|
||||
'recommended.'],
|
||||
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
|
||||
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
|
||||
]
|
||||
output_lines_with_header = print_difference_report(
|
||||
self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
|
||||
|
@ -677,6 +746,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
|
|||
'9+',
|
||||
'repetitions',
|
||||
'recommended.'],
|
||||
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
|
||||
]
|
||||
output_lines_with_header = print_difference_report(
|
||||
self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
|
||||
|
@ -753,6 +823,16 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
|
|||
],
|
||||
'time_unit': 'ns',
|
||||
'utest': {}
|
||||
},
|
||||
{
|
||||
'name': 'OVERALL_GEOMEAN',
|
||||
'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
|
||||
'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
|
||||
'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
|
||||
'time_unit': 's',
|
||||
'run_type': 'aggregate',
|
||||
'aggregate_name': 'geomean',
|
||||
'utest': {}
|
||||
}
|
||||
]
|
||||
self.assertEqual(len(self.json_diff_report), len(expected_output))
|
||||
|
@ -823,7 +903,8 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
|
|||
'9+',
|
||||
'repetitions',
|
||||
'recommended.'],
|
||||
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53']
|
||||
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
|
||||
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
|
||||
]
|
||||
output_lines_with_header = print_difference_report(
|
||||
self.json_diff_report,
|
||||
|
@ -898,11 +979,21 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
|
|||
'real_time': 8,
|
||||
'cpu_time_other': 53,
|
||||
'cpu': -0.3375
|
||||
}
|
||||
}
|
||||
],
|
||||
'utest': {},
|
||||
'time_unit': u'ns',
|
||||
'aggregate_name': ''
|
||||
},
|
||||
{
|
||||
'name': 'OVERALL_GEOMEAN',
|
||||
'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
|
||||
'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
|
||||
'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
|
||||
'time_unit': 's',
|
||||
'run_type': 'aggregate',
|
||||
'aggregate_name': 'geomean',
|
||||
'utest': {}
|
||||
}
|
||||
]
|
||||
self.assertEqual(len(self.json_diff_report), len(expected_output))
|
||||
|
@ -914,7 +1005,6 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
|
|||
assert_measurements(self, out, expected)
|
||||
|
||||
|
||||
|
||||
class TestReportDifferenceForPercentageAggregates(
|
||||
unittest.TestCase):
|
||||
@classmethod
|
||||
|
|
Loading…
Reference in New Issue