[Tooling] Enable U Test by default, add tooltip about repetition count. (#617)

As previously discussed, let's flip the switch ^^.

This exposes a problem: the test will now be run
for everyone, even for people who did not read the
help about the recommended repetition count.

This is not good. So I think we can do the smart thing:
```
$ ./compare.py benchmarks gbench/Inputs/test3_run{0,1}.json
Comparing gbench/Inputs/test3_run0.json to gbench/Inputs/test3_run1.json
Benchmark                   Time             CPU      Time Old      Time New       CPU Old       CPU New
--------------------------------------------------------------------------------------------------------
BM_One                   -0.1000         +0.1000            10             9           100           110
BM_Two                   +0.1111         -0.0111             9            10            90            89
BM_Two                   +0.2500         +0.1125             8            10            80            89
BM_Two_pvalue             0.2207          0.6831      U Test, Repetitions: 2. WARNING: Results unreliable! 9+ repetitions recommended.
BM_Two_stat              +0.0000         +0.0000             8             8            80            80
```
(old screenshot)
![image](https://user-images.githubusercontent.com/88600/41502182-ea25d872-71bc-11e8-9842-8aa049509b14.png)

Or, in the good case (noise omitted):
```
$ ./compare.py benchmarks /tmp/run{0,1}.json
Comparing /tmp/run0.json to /tmp/run1.json
Benchmark                                            Time             CPU      Time Old      Time New       CPU Old       CPU New
---------------------------------------------------------------------------------------------------------------------------------
<99 more rows like this>
./_T012014.RW2/threads:8/real_time                +0.0160         +0.0596            46            47            10            10
./_T012014.RW2/threads:8/real_time_pvalue          0.0000          0.0000      U Test, Repetitions: 100
./_T012014.RW2/threads:8/real_time_mean           +0.0094         +0.0609            46            47            10            10
./_T012014.RW2/threads:8/real_time_median         +0.0104         +0.0613            46            46            10            10
./_T012014.RW2/threads:8/real_time_stddev         -0.1160         -0.1807             1             1             0             0
```
(old screenshot)
![image](https://user-images.githubusercontent.com/88600/41502185-fb8193f4-71bc-11e8-85fa-cbba83e39db4.png)
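For context (this is the benchmark library's own mechanism, not part of this patch): the repetition counts in these reports come from running the benchmark binary with `--benchmark_repetitions`. A sketch of producing two comparable runs, assuming a hypothetical binary `./mybench`:

```
$ ./mybench --benchmark_repetitions=9 --benchmark_out=/tmp/run0.json --benchmark_out_format=json
$ ./mybench --benchmark_repetitions=9 --benchmark_out=/tmp/run1.json --benchmark_out_format=json
$ ./compare.py benchmarks /tmp/run0.json /tmp/run1.json
```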
Commit 7d03f2df49, parent 151ead6242.
Authored by Roman Lebedev, 2018-06-18 14:58:16 +03:00; committed by Dominic Hamon.
4 changed files with 78 additions and 49 deletions.

tools/compare.py:
```diff
@@ -38,10 +38,11 @@ def create_parser():
     utest = parser.add_argument_group()
     utest.add_argument(
-        '-u',
-        '--utest',
-        action="store_true",
-        help="Do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than 9) number of repetitions to be meaningful!")
+        '--no-utest',
+        dest='utest',
+        default=True,
+        action="store_false",
+        help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS))
     alpha_default = 0.05
     utest.add_argument(
         "--alpha",
```
```diff
@@ -245,26 +246,16 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_basic(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1])
-        self.assertFalse(parsed.utest)
-        self.assertEqual(parsed.mode, 'benchmarks')
-        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
-        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
-        self.assertFalse(parsed.benchmark_options)
-
-    def test_benchmarks_basic_with_utest(self):
-        parsed = self.parser.parse_args(
-            ['-u', 'benchmarks', self.testInput0, self.testInput1])
         self.assertTrue(parsed.utest)
-        self.assertEqual(parsed.utest_alpha, 0.05)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
         self.assertFalse(parsed.benchmark_options)
 
-    def test_benchmarks_basic_with_utest(self):
+    def test_benchmarks_basic_without_utest(self):
         parsed = self.parser.parse_args(
-            ['--utest', 'benchmarks', self.testInput0, self.testInput1])
-        self.assertTrue(parsed.utest)
+            ['--no-utest', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.utest_alpha, 0.05)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
@@ -273,7 +264,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_basic_with_utest_alpha(self):
         parsed = self.parser.parse_args(
-            ['--utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+            ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.utest_alpha, 0.314)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
@@ -281,10 +272,20 @@ class TestParser(unittest.TestCase):
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
         self.assertFalse(parsed.benchmark_options)
 
+    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
+        parsed = self.parser.parse_args(
+            ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.314)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
     def test_benchmarks_with_remainder(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1, 'd'])
-        self.assertFalse(parsed.utest)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
@@ -293,7 +294,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
-        self.assertFalse(parsed.utest)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
@@ -302,7 +303,7 @@ class TestParser(unittest.TestCase):
     def test_filters_basic(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd'])
-        self.assertFalse(parsed.utest)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -312,7 +313,7 @@ class TestParser(unittest.TestCase):
     def test_filters_with_remainder(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd', 'e'])
-        self.assertFalse(parsed.utest)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -322,7 +323,7 @@ class TestParser(unittest.TestCase):
     def test_filters_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd', '--', 'f'])
-        self.assertFalse(parsed.utest)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -332,7 +333,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_basic(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
-        self.assertFalse(parsed.utest)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -343,7 +344,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_with_remainder(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f'])
-        self.assertFalse(parsed.utest)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -354,7 +355,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g'])
-        self.assertFalse(parsed.utest)
+        self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
```
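Note why every mode's test now asserts `parsed.utest` is True: the utest argument group hangs off the top-level parser, ahead of the subcommands, so all modes inherit the default. A rough sketch of that layout (simplified, not the real `create_parser()` body):

```python
import argparse

parser = argparse.ArgumentParser()
# The group lives on the root parser, so it applies to every subcommand.
utest = parser.add_argument_group()
utest.add_argument('--no-utest', dest='utest',
                   default=True, action='store_false')

subparsers = parser.add_subparsers(dest='mode')
subparsers.add_parser('benchmarks')
subparsers.add_parser('filters')

args = parser.parse_args(['--no-utest', 'filters'])
print(args.mode, args.utest)  # filters False
print(parser.parse_args(['filters']).utest)  # True (the new default)
```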

tools/gbench/Inputs/test3_run0.json:

```diff
@@ -29,7 +29,7 @@
         "time_unit": "ns"
     },
     {
-        "name": "BM_Two_stat",
+        "name": "short",
         "iterations": 1000,
         "real_time": 8,
         "cpu_time": 80,
```

tools/gbench/Inputs/test3_run1.json:

```diff
@@ -29,7 +29,7 @@
         "time_unit": "ns"
     },
     {
-        "name": "BM_Two_stat",
+        "name": "short",
         "iterations": 1000,
         "real_time": 8,
         "cpu_time": 80,
```

tools/gbench/report.py:

```diff
@@ -34,6 +34,9 @@ BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
 BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
 BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
 
+UTEST_MIN_REPETITIONS = 2
+UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number, More is better.
+
 def color_format(use_color, fmt_str, *args, **kwargs):
     """
```
```diff
@@ -109,11 +112,11 @@ def generate_difference_report(
                 return b
         return None
 
-    utest_col_name = "U-test (p-value)"
+    utest_col_name = "_pvalue"
     first_col_width = max(
         first_col_width,
-        len('Benchmark'),
-        len(utest_col_name))
+        len('Benchmark'))
+    first_col_width += len(utest_col_name)
     first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
         'Benchmark', 12 + first_col_width)
     output_strs = [first_line, '-' * len(first_line)]
```
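The width change exists because the p-value row's first column is now the benchmark name with the `_pvalue` suffix appended, so the column must fit the longest name plus the suffix. A tiny illustration (made-up names):

```python
utest_col_name = "_pvalue"
names = ["BM_One", "BM_Two"]

# Before: the width had to fit max(names) or "U-test (p-value)".
# After: it must fit the longest name *plus* the appended suffix.
first_col_width = max(max(len(n) for n in names), len('Benchmark'))
first_col_width += len(utest_col_name)
print(first_col_width)  # 9 + 7 = 16
```

The per-row formatting and the U Test description follow: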
```diff
@@ -126,16 +129,15 @@
             if 'real_time' in bn and 'cpu_time' in bn)
         for bn in gen:
             fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
-            special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}"
+            special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
             if last_name is None:
                 last_name = bn['name']
             if last_name != bn['name']:
-                MIN_REPETITIONS = 2
-                if ((len(timings_time[0]) >= MIN_REPETITIONS) and
-                        (len(timings_time[1]) >= MIN_REPETITIONS) and
-                        (len(timings_cpu[0]) >= MIN_REPETITIONS) and
-                        (len(timings_cpu[1]) >= MIN_REPETITIONS)):
+                if ((len(timings_time[0]) >= UTEST_MIN_REPETITIONS) and
+                        (len(timings_time[1]) >= UTEST_MIN_REPETITIONS) and
+                        (len(timings_cpu[0]) >= UTEST_MIN_REPETITIONS) and
+                        (len(timings_cpu[1]) >= UTEST_MIN_REPETITIONS)):
                     if utest:
                         def get_utest_color(pval):
                             if pval >= utest_alpha:
@@ -146,15 +148,24 @@
                         time_pvalue = mannwhitneyu(
                             timings_time[0], timings_time[1], alternative='two-sided').pvalue
                         cpu_pvalue = mannwhitneyu(
                             timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
+                        dsc = "U Test, Repetitions: {}".format(len(timings_cpu[0]))
+                        dsc_color = BC_OKGREEN
+                        if len(timings_cpu[0]) < UTEST_OPTIMAL_REPETITIONS:
+                            dsc_color = BC_WARNING
+                            dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
+                                UTEST_OPTIMAL_REPETITIONS)
+
                         output_strs += [color_format(use_color,
                                                      special_str,
                                                      BC_HEADER,
-                                                     utest_col_name,
+                                                     "{}{}".format(last_name,
+                                                                   utest_col_name),
                                                      first_col_width,
                                                      get_utest_color(time_pvalue),
                                                      time_pvalue,
                                                      get_utest_color(cpu_pvalue),
                                                      cpu_pvalue,
+                                                     dsc_color,
+                                                     dsc,
                                                      endc=BC_ENDC)]
                 last_name = bn['name']
                 timings_time = [[], []]
```
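The statistical core is unchanged — SciPy's two-sided Mann-Whitney U test; what is new is the attached description string. A self-contained sketch of the combination (standalone function, not the actual report.py code path):

```python
from scipy.stats import mannwhitneyu

UTEST_MIN_REPETITIONS = 2
UTEST_OPTIMAL_REPETITIONS = 9

def utest_description(old_times, new_times):
    """Sketch: p-value plus the reliability note added by this commit."""
    if min(len(old_times), len(new_times)) < UTEST_MIN_REPETITIONS:
        return None  # too few repetitions: no p-value row at all
    pvalue = mannwhitneyu(old_times, new_times,
                          alternative='two-sided').pvalue
    dsc = "U Test, Repetitions: {}".format(len(old_times))
    if len(old_times) < UTEST_OPTIMAL_REPETITIONS:
        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
            UTEST_OPTIMAL_REPETITIONS)
    return pvalue, dsc

print(utest_description([10, 9, 11], [12, 13, 12]))
```

Finally, the report tests are updated to expect the new output: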
```diff
@@ -229,9 +240,12 @@ class TestReportDifference(unittest.TestCase):
             ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
             ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
             ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
-            ['BM_100xSlower', '+99.0000', '+99.0000', '100', '10000', '100', '10000'],
-            ['BM_100xFaster', '-0.9900', '-0.9900', '10000', '100', '10000', '100'],
-            ['BM_10PercentCPUToTime', '+0.1000', '-0.1000', '100', '110', '100', '90'],
+            ['BM_100xSlower', '+99.0000', '+99.0000',
+                '100', '10000', '100', '10000'],
+            ['BM_100xFaster', '-0.9900', '-0.9900',
+                '10000', '100', '10000', '100'],
+            ['BM_10PercentCPUToTime', '+0.1000',
+                '-0.1000', '100', '110', '100', '90'],
             ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
             ['BM_BadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
         ]
@@ -239,6 +253,7 @@
         output_lines_with_header = generate_difference_report(
             json1, json2, use_color=False)
         output_lines = output_lines_with_header[2:]
+        print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
         for i in range(0, len(output_lines)):
@@ -302,13 +317,26 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
             ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
             ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
             ['BM_Two', '+0.2500', '+0.1125', '8', '10', '80', '89'],
-            ['U-test', '(p-value)', '0.2207', '0.6831'],
-            ['BM_Two_stat', '+0.0000', '+0.0000', '8', '8', '80', '80'],
+            ['BM_Two_pvalue',
+             '0.2207',
+             '0.6831',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '+0.0000', '+0.0000', '8', '8', '80', '80'],
         ]
         json1, json2 = self.load_results()
         output_lines_with_header = generate_difference_report(
             json1, json2, True, 0.05, use_color=False)
         output_lines = output_lines_with_header[2:]
+        print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
         for i in range(0, len(output_lines)):
```
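The oddly word-split expected row is an artifact of how these tests compare output: each report line is split on whitespace before comparison, so the description sentence arrives one token at a time. Roughly (simplified from the test harness):

```python
line = ("BM_Two_pvalue             0.2207          0.6831      "
        "U Test, Repetitions: 2. WARNING: Results unreliable! "
        "9+ repetitions recommended.")

# The harness compares token lists, not raw strings, which is why the
# expected row above lists every word of the description separately.
parts = [x for x in line.split(' ') if x]
print(parts[:4])  # ['BM_Two_pvalue', '0.2207', '0.6831', 'U']
```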