compare.py: compute and print 'OVERALL GEOMEAN' aggregate (#1289)

Despite the wide variety of the features we provide,
some people still have the audacity to complain and demand more.

Concretely, I *very* often would like to see the overall result
of the benchmark: is the 'new' better or worse, overall,
over all the non-aggregate time/cpu measurements?

This comes up for me most often when I want to quickly see
what effect some LLVM optimization change has on the benchmark.

The idea is straightforward, just produce four lists:
wall times for LHS benchmark, CPU times for LHS benchmark,
wall times for RHS benchmark, CPU times for RHS benchmark;
then compute the geomean for each one of those four lists,
and compute the two percentage changes between
* geomean wall time for LHS benchmark and geomean wall time for RHS benchmark
* geomean CPU time for LHS benchmark and geomean CPU time for RHS benchmark
and voila!

It is complicated by the fact that it needs to gracefully handle
different time units, so a pandas.Timedelta dependency is introduced.
That is the only library that does not barf upon floating-point times;
I have tried numpy.timedelta64 (only takes integers)
and Python's datetime.timedelta (does not take nanoseconds),
and they won't do.

Fixes https://github.com/google/benchmark/issues/1147
This commit is contained in:
Roman Lebedev 2021-11-24 13:47:08 +03:00 committed by GitHub
parent ce92bbfb90
commit d6ba952fc1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 109 additions and 18 deletions

View File

@ -1,2 +1,3 @@
numpy == 1.19.4 numpy == 1.19.4
scipy == 1.5.4 scipy == 1.5.4
pandas == 1.1.5

View File

@ -7,7 +7,9 @@ import re
import copy import copy
import random import random
from scipy.stats import mannwhitneyu from scipy.stats import mannwhitneyu, gmean
from numpy import array
from pandas import Timedelta
class BenchmarkColor(object): class BenchmarkColor(object):
@ -150,6 +152,30 @@ def partition_benchmarks(json1, json2):
return partitions return partitions
def get_timedelta_field_as_seconds(benchmark, field_name):
    """
    Get value of field_name field of benchmark, which is time with time unit
    time_unit, as time in seconds.

    benchmark is a mapping holding the requested field and, optionally,
    a 'time_unit' entry; when 'time_unit' is absent, seconds are assumed.

    Returns the time as a float number of seconds.
    Raises KeyError if field_name is not present in benchmark.
    """
    # The common units are converted with plain float arithmetic.
    # pandas.Timedelta keeps whole nanoseconds only, so routing e.g. 0.5 ns
    # through it would lose the fractional part and yield 0 seconds.
    unit_to_seconds = {'s': 1.0, 'ms': 1e-3, 'us': 1e-6, 'ns': 1e-9}

    time_unit = benchmark.get('time_unit', 's')
    value = benchmark[field_name]

    multiplier = unit_to_seconds.get(time_unit)
    if multiplier is not None:
        return value * multiplier

    # Any other unit spelling is still delegated to pandas, as before.
    return Timedelta(value, time_unit) / Timedelta(1, 's')
def calculate_geomean(json):
    """
    Gather the real/cpu times, in seconds, of every non-aggregate benchmark
    listed under json['benchmarks'], and return their geomean.

    The result of gmean() over the collected [real_time, cpu_time] pairs is
    returned; when there are no such benchmarks an empty array is returned.
    """
    rows = [
        [get_timedelta_field_as_seconds(entry, 'real_time'),
         get_timedelta_field_as_seconds(entry, 'cpu_time')]
        for entry in json['benchmarks']
        # Aggregates (mean/median/...) are derived values; leave them out.
        if entry.get('run_type') != 'aggregate'
    ]
    if not rows:
        return array([])
    return gmean(rows)
def extract_field(partition, field_name): def extract_field(partition, field_name):
# The count of elements may be different. We want *all* of them. # The count of elements may be different. We want *all* of them.
lhs = [x[field_name] for x in partition[0]] lhs = [x[field_name] for x in partition[0]]
@ -174,6 +200,7 @@ def calc_utest(timings_cpu, timings_time):
return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True): def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
def get_utest_color(pval): def get_utest_color(pval):
return BC_FAIL if pval >= utest_alpha else BC_OKGREEN return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
@ -242,7 +269,8 @@ def get_difference_report(
if utest: if utest:
timings_cpu = extract_field(partition, 'cpu_time') timings_cpu = extract_field(partition, 'cpu_time')
timings_time = extract_field(partition, 'real_time') timings_time = extract_field(partition, 'real_time')
have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time) have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
timings_cpu, timings_time)
if cpu_pvalue and time_pvalue: if cpu_pvalue and time_pvalue:
utest_results = { utest_results = {
'have_optimal_repetitions': have_optimal_repetitions, 'have_optimal_repetitions': have_optimal_repetitions,
@ -268,6 +296,25 @@ def get_difference_report(
'utest': utest_results 'utest': utest_results
}) })
lhs_gmean = calculate_geomean(json1)
rhs_gmean = calculate_geomean(json2)
if lhs_gmean.any() and rhs_gmean.any():
diff_report.append({
'name': 'OVERALL_GEOMEAN',
'measurements': [{
'real_time': lhs_gmean[0],
'cpu_time': lhs_gmean[1],
'real_time_other': rhs_gmean[0],
'cpu_time_other': rhs_gmean[1],
'time': calculate_change(lhs_gmean[0], rhs_gmean[0]),
'cpu': calculate_change(lhs_gmean[1], rhs_gmean[1])
}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean',
'utest': {}
})
return diff_report return diff_report
@ -307,19 +354,19 @@ def print_difference_report(
if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate': if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate':
for measurement in benchmark['measurements']: for measurement in benchmark['measurements']:
output_strs += [color_format(use_color, output_strs += [color_format(use_color,
fmt_str, fmt_str,
BC_HEADER, BC_HEADER,
benchmark['name'], benchmark['name'],
first_col_width, first_col_width,
get_color(measurement['time']), get_color(measurement['time']),
measurement['time'], measurement['time'],
get_color(measurement['cpu']), get_color(measurement['cpu']),
measurement['cpu'], measurement['cpu'],
measurement['real_time'], measurement['real_time'],
measurement['real_time_other'], measurement['real_time_other'],
measurement['cpu_time'], measurement['cpu_time'],
measurement['cpu_time_other'], measurement['cpu_time_other'],
endc=BC_ENDC)] endc=BC_ENDC)]
# After processing the measurements, if requested and # After processing the measurements, if requested and
# if applicable (e.g. u-test exists for given benchmark), # if applicable (e.g. u-test exists for given benchmark),
@ -403,6 +450,7 @@ class TestReportDifference(unittest.TestCase):
'-0.1000', '100', '110', '100', '90'], '-0.1000', '100', '110', '100', '90'],
['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'], ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'], ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
['OVERALL_GEOMEAN', '-0.8344', '-0.8026', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, use_color=False) self.json_diff_report, use_color=False)
@ -489,6 +537,15 @@ class TestReportDifference(unittest.TestCase):
'time_unit': 's', 'time_unit': 's',
'utest': {} 'utest': {}
}, },
{
'name': 'OVERALL_GEOMEAN',
'measurements': [{'real_time': 1.193776641714438e-06, 'cpu_time': 1.2144445585302297e-06,
'real_time_other': 1.9768988699420897e-07, 'cpu_time_other': 2.397447755209533e-07,
'time': -0.834399601997324, 'cpu': -0.8025889499549471}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean', 'utest': {}
},
] ]
self.assertEqual(len(self.json_diff_report), len(expected_output)) self.assertEqual(len(self.json_diff_report), len(expected_output))
for out, expected in zip( for out, expected in zip(
@ -524,6 +581,7 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'], ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'], ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'], ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
['OVERALL_GEOMEAN', '-0.5000', '-0.5000', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, use_color=False) self.json_diff_report, use_color=False)
@ -561,6 +619,16 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}], 'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
'time_unit': 'ns', 'time_unit': 'ns',
'utest': {} 'utest': {}
},
{
'name': 'OVERALL_GEOMEAN',
'measurements': [{'real_time': 2.213363839400641e-08, 'cpu_time': 2.213363839400641e-08,
'real_time_other': 1.1066819197003185e-08, 'cpu_time_other': 1.1066819197003185e-08,
'time': -0.5000000000000009, 'cpu': -0.5000000000000009}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean',
'utest': {}
} }
] ]
self.assertEqual(len(self.json_diff_report), len(expected_output)) self.assertEqual(len(self.json_diff_report), len(expected_output))
@ -631,6 +699,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
'repetitions', 'repetitions',
'recommended.'], 'recommended.'],
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False) self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
@ -677,6 +746,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
'9+', '9+',
'repetitions', 'repetitions',
'recommended.'], 'recommended.'],
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False) self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
@ -753,6 +823,16 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
], ],
'time_unit': 'ns', 'time_unit': 'ns',
'utest': {} 'utest': {}
},
{
'name': 'OVERALL_GEOMEAN',
'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean',
'utest': {}
} }
] ]
self.assertEqual(len(self.json_diff_report), len(expected_output)) self.assertEqual(len(self.json_diff_report), len(expected_output))
@ -823,7 +903,8 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
'9+', '9+',
'repetitions', 'repetitions',
'recommended.'], 'recommended.'],
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'] ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
] ]
output_lines_with_header = print_difference_report( output_lines_with_header = print_difference_report(
self.json_diff_report, self.json_diff_report,
@ -898,11 +979,21 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
'real_time': 8, 'real_time': 8,
'cpu_time_other': 53, 'cpu_time_other': 53,
'cpu': -0.3375 'cpu': -0.3375
} }
], ],
'utest': {}, 'utest': {},
'time_unit': u'ns', 'time_unit': u'ns',
'aggregate_name': '' 'aggregate_name': ''
},
{
'name': 'OVERALL_GEOMEAN',
'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
'time_unit': 's',
'run_type': 'aggregate',
'aggregate_name': 'geomean',
'utest': {}
} }
] ]
self.assertEqual(len(self.json_diff_report), len(expected_output)) self.assertEqual(len(self.json_diff_report), len(expected_output))
@ -914,7 +1005,6 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
assert_measurements(self, out, expected) assert_measurements(self, out, expected)
class TestReportDifferenceForPercentageAggregates( class TestReportDifferenceForPercentageAggregates(
unittest.TestCase): unittest.TestCase):
@classmethod @classmethod