compare.py: compute and print 'OVERALL GEOMEAN' aggregate (#1289)

Despite the wide variety of the features we provide,
some people still have the audacity to complain and demand more.

Concretely, I *very* often would like to see the overall result
of the benchmark. Is the 'new' better or worse, overall,
over all the non-aggregate time/cpu measurements.

This comes up for me most often when I want to quickly see
what effect some LLVM optimization change has on the benchmark.

The idea is straight-forward, just produce four lists:
wall times for LHS benchmark, CPU times for LHS benchmark,
wall times for RHS benchmark, CPU times for RHS benchmark;
then compute geomean for each one of those four lists,
and compute the two percentage changes between
* geomean wall time for LHS benchmark and geomean wall time for RHS benchmark
* geomean CPU time for LHS benchmark and geomean CPU time for RHS benchmark
and voila!

It is complicated by the fact that it needs to graciously handle
different time units, so pandas.Timedelta dependency is introduced.
That is the only library that does not barf upon floating times,
I have tried numpy.timedelta64 (only takes integers)
and Python's datetime.timedelta (does not take nanoseconds),
and they won't do.

Fixes https://github.com/google/benchmark/issues/1147
This commit is contained in:
Roman Lebedev 2021-11-24 13:47:08 +03:00 committed by GitHub
parent ce92bbfb90
commit d6ba952fc1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 109 additions and 18 deletions

View File

@ -1,2 +1,3 @@
numpy == 1.19.4 numpy == 1.19.4
scipy == 1.5.4 scipy == 1.5.4
pandas == 1.1.5

View File

@ -7,7 +7,9 @@ import re
import copy import copy
import random import random
from scipy.stats import mannwhitneyu from scipy.stats import mannwhitneyu, gmean
from numpy import array
from pandas import Timedelta
class BenchmarkColor(object): class BenchmarkColor(object):
@ -150,6 +152,30 @@ def partition_benchmarks(json1, json2):
return partitions return partitions
def get_timedelta_field_as_seconds(benchmark, field_name):
    """
    Get value of field_name field of benchmark, which is time with time unit
    time_unit, as time in seconds.

    Args:
        benchmark: one benchmark dict from the JSON report; may carry a
            'time_unit' key ('ns', 'us', 'ms', 's'); defaults to seconds.
        field_name: name of the time field to read (e.g. 'real_time').

    Returns:
        The field's value converted to seconds, as a plain float.
    """
    # dict.get with a default replaces the manual 'in'-check + lookup.
    time_unit = benchmark.get('time_unit', 's')
    dt = Timedelta(benchmark[field_name], time_unit)
    # Dividing two Timedeltas yields a unitless float ratio, i.e. seconds.
    return dt / Timedelta(1, 's')
def calculate_geomean(json):
    """
    Extract all real/cpu times from all the benchmarks as seconds,
    and calculate their geomean.

    Aggregate entries (run_type == 'aggregate') are excluded; returns an
    empty array when no non-aggregate benchmarks are present.
    """
    # Build [real_time_s, cpu_time_s] pairs for every non-aggregate run.
    times = [
        [get_timedelta_field_as_seconds(bm, 'real_time'),
         get_timedelta_field_as_seconds(bm, 'cpu_time')]
        for bm in json['benchmarks']
        if bm.get('run_type') != 'aggregate'
    ]
    return gmean(times) if times else array([])
def extract_field(partition, field_name): def extract_field(partition, field_name):
# The count of elements may be different. We want *all* of them. # The count of elements may be different. We want *all* of them.
lhs = [x[field_name] for x in partition[0]] lhs = [x[field_name] for x in partition[0]]
@ -174,6 +200,7 @@ def calc_utest(timings_cpu, timings_time):
return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True): def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
def get_utest_color(pval): def get_utest_color(pval):
return BC_FAIL if pval >= utest_alpha else BC_OKGREEN return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
@ -242,7 +269,8 @@ def get_difference_report(
if utest: if utest:
timings_cpu = extract_field(partition, 'cpu_time') timings_cpu = extract_field(partition, 'cpu_time')
timings_time = extract_field(partition, 'real_time') timings_time = extract_field(partition, 'real_time')
have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time) have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
timings_cpu, timings_time)
if cpu_pvalue and time_pvalue: if cpu_pvalue and time_pvalue:
utest_results = { utest_results = {
'have_optimal_repetitions': have_optimal_repetitions, 'have_optimal_repetitions': have_optimal_repetitions,
@ -268,6 +296,25 @@ def get_difference_report(
'utest': utest_results 'utest': utest_results
}) })
lhs_gmean = calculate_geomean(json1)
rhs_gmean = calculate_geomean(json2)
if lhs_gmean.any() and rhs_gmean.any():
diff_report.append({
'name': 'OVERALL_GEOMEAN',
'measurements': [{
'real_time': lhs_gmean[0],
'cpu_time': lhs_gmean[1],
'real_time_other': rhs_gmean[0],
'cpu_time_other': rhs_gmean[1],
'time': calculate_change(lhs_gmean[0], rhs_gmean[0]),
'cpu': calculate_change(lhs_gmean[1], rhs_gmean[1])
}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean',
'utest': {}
})
return diff_report return diff_report
@ -403,6 +450,7 @@ class TestReportDifference(unittest.TestCase):
'-0.1000', '100', '110', '100', '90'], '-0.1000', '100', '110', '100', '90'],
['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'], ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'], ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
['OVERALL_GEOMEAN', '-0.8344', '-0.8026', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, use_color=False) self.json_diff_report, use_color=False)
@ -489,6 +537,15 @@ class TestReportDifference(unittest.TestCase):
'time_unit': 's', 'time_unit': 's',
'utest': {} 'utest': {}
}, },
{
'name': 'OVERALL_GEOMEAN',
'measurements': [{'real_time': 1.193776641714438e-06, 'cpu_time': 1.2144445585302297e-06,
'real_time_other': 1.9768988699420897e-07, 'cpu_time_other': 2.397447755209533e-07,
'time': -0.834399601997324, 'cpu': -0.8025889499549471}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean', 'utest': {}
},
] ]
self.assertEqual(len(self.json_diff_report), len(expected_output)) self.assertEqual(len(self.json_diff_report), len(expected_output))
for out, expected in zip( for out, expected in zip(
@ -524,6 +581,7 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'], ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'], ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'], ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
['OVERALL_GEOMEAN', '-0.5000', '-0.5000', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, use_color=False) self.json_diff_report, use_color=False)
@ -561,6 +619,16 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}], 'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
'time_unit': 'ns', 'time_unit': 'ns',
'utest': {} 'utest': {}
},
{
'name': 'OVERALL_GEOMEAN',
'measurements': [{'real_time': 2.213363839400641e-08, 'cpu_time': 2.213363839400641e-08,
'real_time_other': 1.1066819197003185e-08, 'cpu_time_other': 1.1066819197003185e-08,
'time': -0.5000000000000009, 'cpu': -0.5000000000000009}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean',
'utest': {}
} }
] ]
self.assertEqual(len(self.json_diff_report), len(expected_output)) self.assertEqual(len(self.json_diff_report), len(expected_output))
@ -631,6 +699,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
'repetitions', 'repetitions',
'recommended.'], 'recommended.'],
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False) self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
@ -677,6 +746,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
'9+', '9+',
'repetitions', 'repetitions',
'recommended.'], 'recommended.'],
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False) self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
@ -753,6 +823,16 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
], ],
'time_unit': 'ns', 'time_unit': 'ns',
'utest': {} 'utest': {}
},
{
'name': 'OVERALL_GEOMEAN',
'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean',
'utest': {}
} }
] ]
self.assertEqual(len(self.json_diff_report), len(expected_output)) self.assertEqual(len(self.json_diff_report), len(expected_output))
@ -823,7 +903,8 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
'9+', '9+',
'repetitions', 'repetitions',
'recommended.'], 'recommended.'],
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'] ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, self.json_diff_report,
@ -903,6 +984,16 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
'utest': {}, 'utest': {},
'time_unit': u'ns', 'time_unit': u'ns',
'aggregate_name': '' 'aggregate_name': ''
},
{
'name': 'OVERALL_GEOMEAN',
'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean',
'utest': {}
} }
] ]
self.assertEqual(len(self.json_diff_report), len(expected_output)) self.assertEqual(len(self.json_diff_report), len(expected_output))
@ -914,7 +1005,6 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
assert_measurements(self, out, expected) assert_measurements(self, out, expected)
class TestReportDifferenceForPercentageAggregates( class TestReportDifferenceForPercentageAggregates(
unittest.TestCase): unittest.TestCase):
@classmethod @classmethod