benchmark/tools/gbench/report.py

"""report.py - Utilities for reporting statistics about benchmark results
"""
import os

class BenchmarkColor(object):
    """
    A labelled color code for decorating report output.

    Formatting an instance (for example through 'str.format') produces the
    stored code string, regardless of the format spec that was supplied.
    """

    def __init__(self, name, code):
        # 'name' is the label shown by repr(); 'code' is the text emitted
        # whenever the instance is formatted.
        self.name, self.code = name, code

    def __repr__(self):
        # Renders as ClassName('<name>', '<code>').
        class_name = self.__class__.__name__
        fields = (self.name, self.code)
        return '{}{!r}'.format(class_name, fields)

    def __format__(self, format):
        # The format spec is intentionally ignored.
        return self.code

# Benchmark Colors Enumeration
# Each constant pairs a label with the ANSI SGR escape sequence that is
# emitted when the constant is formatted into a string.
BC_NONE = BenchmarkColor(name='NONE', code='')  # emits nothing at all
BC_MAGENTA = BenchmarkColor(name='MAGENTA', code='\033[95m')
BC_CYAN = BenchmarkColor(name='CYAN', code='\033[96m')
BC_OKBLUE = BenchmarkColor(name='OKBLUE', code='\033[94m')
BC_HEADER = BenchmarkColor(name='HEADER', code='\033[92m')
BC_WARNING = BenchmarkColor(name='WARNING', code='\033[93m')
BC_WHITE = BenchmarkColor(name='WHITE', code='\033[97m')
BC_FAIL = BenchmarkColor(name='FAIL', code='\033[91m')
BC_ENDC = BenchmarkColor(name='ENDC', code='\033[0m')  # resets all attributes
BC_BOLD = BenchmarkColor(name='BOLD', code='\033[1m')
BC_UNDERLINE = BenchmarkColor(name='UNDERLINE', code='\033[4m')

def color_format(use_color, fmt_str, *args, **kwargs):
    """
    Format 'fmt_str' with 'args' and 'kwargs', optionally dropping colors.

    'use_color' must be exactly True or False. When it is False, every
    BenchmarkColor found among the positional or keyword arguments is
    swapped for BC_NONE before formatting; all other arguments are passed
    through unchanged. When it is True the arguments are used as given.
    """
    assert use_color is True or use_color is False
    if use_color:
        return fmt_str.format(*args, **kwargs)

    def strip_color(value):
        # Anything that is not a color is left untouched.
        return BC_NONE if isinstance(value, BenchmarkColor) else value

    plain_args = [strip_color(value) for value in args]
    plain_kwargs = {key: strip_color(value) for key, value in kwargs.items()}
    return fmt_str.format(*plain_args, **plain_kwargs)


def find_longest_name(benchmark_list):
    """
    Report how many characters the widest benchmark name occupies.

    Each element of 'benchmark_list' is a benchmark JSON object with a
    'name' entry. The result is never smaller than 1, even when the list is
    empty or every name is the empty string.
    """
    name_lengths = [len(entry['name']) for entry in benchmark_list]
    # Seeding with 1 keeps the floor for empty lists and empty names.
    return max([1] + name_lengths)


def calculate_change(old_val, new_val):
    """
    Compute the relative change from old_val to new_val as a decimal float.

    The change is normally measured against the magnitude of old_val. When
    old_val is zero that would divide by zero, so the mean of the two values
    serves as the baseline instead; two zeros count as no change at all.
    """
    if old_val != 0:
        return float(new_val - old_val) / abs(old_val)
    if new_val == 0:
        return 0.0
    # old_val is zero here: measure against the midpoint of the two values.
    midpoint = float(old_val + new_val) / 2
    return float(new_val - old_val) / midpoint


def generate_difference_report(json1, json2, use_color=True):
    """
    Calculate and report the difference between each test of two benchmarks
    runs specified as 'json1' and 'json2'.

    'json1' is treated as the old run and 'json2' as the new one. Each is a
    dict whose 'benchmarks' entry is a list of benchmark JSON objects.

    Returns a list of strings: a header line, a dashed rule of the same
    length, then one row per compared benchmark holding its name, the
    relative change in real time, the relative change in CPU time, and the
    old and new CPU times. Color codes are included only when 'use_color'
    is True.

    A benchmark from 'json1' is left out of the report when:
      * it has no 'real_time' or 'cpu_time' entry,
      * no benchmark with the same name exists in 'json2',
      * its counterpart in 'json2' has no 'real_time' or 'cpu_time' entry,
      * the two entries use different time units.
    """
    first_col_width = find_longest_name(json1['benchmarks']) + 5

    # Index the second run by name once instead of scanning it for every
    # benchmark. setdefault keeps the first entry when names repeat, which
    # is what a front-to-back scan would have returned.
    other_by_name = {}
    for b in json2['benchmarks']:
        other_by_name.setdefault(b['name'], b)

    def get_color(res):
        # More than 5% slower is flagged as a failure, more than 7% faster
        # is highlighted, anything in between is shown in the neutral color.
        if res > 0.05:
            return BC_FAIL
        elif res > -0.07:
            return BC_WHITE
        else:
            return BC_CYAN

    def has_timings(bench):
        return 'real_time' in bench and 'cpu_time' in bench

    first_line = "{:<{}s}     Time           CPU           Old           New".format(
        'Benchmark', first_col_width)
    output_strs = [first_line, '-' * len(first_line)]
    fmt_str = "{}{:<{}s}{endc}{}{:+9.2f}{endc}{}{:+14.2f}{endc}{:14.0f}{:14.0f}"

    for bn in json1['benchmarks']:
        # Entries without timing data (e.g. complexity results) cannot be
        # compared.
        if not has_timings(bn):
            continue

        other_bench = other_by_name.get(bn['name'])
        if not other_bench or not has_timings(other_bench):
            continue

        # Values in different units are not comparable; skip rather than
        # report a misleading change.
        if bn['time_unit'] != other_bench['time_unit']:
            continue

        tres = calculate_change(bn['real_time'], other_bench['real_time'])
        cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
        output_strs.append(color_format(
            use_color, fmt_str,
            BC_HEADER, bn['name'], first_col_width,
            get_color(tres), tres, get_color(cpures), cpures,
            bn['cpu_time'], other_bench['cpu_time'],
            endc=BC_ENDC))
    return output_strs

###############################################################################
# Unit tests

import unittest

class TestReportDifference(unittest.TestCase):
    """Checks generate_difference_report against two sample benchmark runs."""

    def load_results(self):
        """
        Load 'test1_run1.json' and 'test1_run2.json' from the 'Inputs'
        directory located next to this file and return them as a
        (json1, json2) pair of parsed JSON objects.
        """
        import json
        testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs')
        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_basic(self):
        """Each report row must split into the five expected columns."""
        expect_lines = [
            ['BM_SameTimes', '+0.00', '+0.00', '10', '10'],
            ['BM_2xFaster', '-0.50', '-0.50', '50', '25'],
            ['BM_2xSlower', '+1.00', '+1.00', '50', '100'],
            ['BM_1PercentFaster', '-0.01', '-0.01', '100', '99'],
            ['BM_1PercentSlower', '+0.01', '+0.01', '100', '101'],
            ['BM_10PercentFaster', '-0.10', '-0.10', '100', '90'],
            ['BM_10PercentSlower', '+0.10', '+0.10', '100', '110'],
            ['BM_100xSlower', '+99.00', '+99.00', '100', '10000'],
            ['BM_100xFaster', '-0.99', '-0.99', '10000', '100'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
        # The first two lines are the header and the dashed rule.
        output_lines = output_lines_with_header[2:]
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        # The lengths were just asserted equal, so zip pairs every row.
        # Iterating this way also avoids 'xrange', which does not exist on
        # Python 3.
        for line, expected in zip(output_lines, expect_lines):
            parts = [x for x in line.split(' ') if x]
            self.assertEqual(len(parts), 5)
            self.assertEqual(parts, expected)


# Run the unit tests defined in this file when it is executed directly.
if __name__ == '__main__':
    unittest.main()
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`"""report.py - Utilities for reporting statistics about benchmark results`
			`"""`
			`import os`

			`class BenchmarkColor(object):`
			`def __init__(self, name, code):`
			`self.name = name`
			`self.code = code`

			`def __repr__(self):`
			`return '%s%r' % (self.__class__.__name__,`
			`(self.name, self.code))`

			`def __format__(self, format):`
			`return self.code`

			`# Benchmark Colors Enumeration`
			`BC_NONE = BenchmarkColor('NONE', '')`
			`BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')`
			`BC_CYAN = BenchmarkColor('CYAN', '\033[96m')`
			`BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')`
			`BC_HEADER = BenchmarkColor('HEADER', '\033[92m')`
			`BC_WARNING = BenchmarkColor('WARNING', '\033[93m')`
			`BC_WHITE = BenchmarkColor('WHITE', '\033[97m')`
			`BC_FAIL = BenchmarkColor('FAIL', '\033[91m')`
			`BC_ENDC = BenchmarkColor('ENDC', '\033[0m')`
			`BC_BOLD = BenchmarkColor('BOLD', '\033[1m')`
			`BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')`

			`def color_format(use_color, fmt_str, *args, **kwargs):`
			`"""`
			`Return the result of 'fmt_str.format(*args, **kwargs)' after transforming`
			`'args' and 'kwargs' according to the value of 'use_color'. If 'use_color'`
			`is False then all color codes in 'args' and 'kwargs' are replaced with`
			`the empty string.`
			`"""`
			`assert use_color is True or use_color is False`
			`if not use_color:`
			`args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE`
			`for arg in args]`
			`kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE`
			`for key, arg in kwargs.items()}`
			`return fmt_str.format(*args, **kwargs)`


			`def find_longest_name(benchmark_list):`
			`"""`
			`Return the length of the longest benchmark name in a given list of`
			`benchmark JSON objects`
			`"""`
			`longest_name = 1`
			`for bc in benchmark_list:`
			`if len(bc['name']) > longest_name:`
			`longest_name = len(bc['name'])`
			`return longest_name`


			`def calculate_change(old_val, new_val):`
			`"""`
			`Return a float representing the decimal change between old_val and new_val.`
			`"""`
Make `PauseTiming()` and `ResumeTiming()` per thread. (#286) * Change to using per-thread timers * fix bad assertions * fix copy paste error on windows * Fix thread safety annotations * Make null-log thread safe * remove remaining globals * use chrono for walltime since it is thread safe * consolidate timer functions * Add missing ctime include * Rename to be consistent with Google style * Format patch using clang-format * cleanup -Wthread-safety configuration * Don't trust _POSIX_FEATURE macros because OS X lies. * Fix OS X thread timings * attempt to fix mingw build * Attempt to make mingw work again * Revert old mingw workaround * improve diagnostics * Drastically improve OS X measurements * Use average real time instead of max 2016-09-03 03:34:34 +00:00			`if old_val == 0 and new_val == 0:`
			`return 0.0`
			`if old_val == 0:`
			`return float(new_val - old_val) / (float(old_val + new_val) / 2)`
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`return float(new_val - old_val) / abs(old_val)`


			`def generate_difference_report(json1, json2, use_color=True):`
			`"""`
			`Calculate and report the difference between each test of two benchmarks`
			`runs specified as 'json1' and 'json2'.`
			`"""`
			`first_col_width = find_longest_name(json1['benchmarks']) + 5`
			`def find_test(name):`
			`for b in json2['benchmarks']:`
			`if b['name'] == name:`
			`return b`
			`return None`
Make `PauseTiming()` and `ResumeTiming()` per thread. (#286) * Change to using per-thread timers * fix bad assertions * fix copy paste error on windows * Fix thread safety annotations * Make null-log thread safe * remove remaining globals * use chrono for walltime since it is thread safe * consolidate timer functions * Add missing ctime include * Rename to be consistent with Google style * Format patch using clang-format * cleanup -Wthread-safety configuration * Don't trust _POSIX_FEATURE macros because OS X lies. * Fix OS X thread timings * attempt to fix mingw build * Attempt to make mingw work again * Revert old mingw workaround * improve diagnostics * Drastically improve OS X measurements * Use average real time instead of max 2016-09-03 03:34:34 +00:00			`first_line = "{:<{}s} Time CPU Old New".format(`
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`'Benchmark', first_col_width)`
			`output_strs = [first_line, '-' * len(first_line)]`
Ensure all the necessary keys are present before parsing JSON data (#380) This prevents errors when additional non-timing data are present in the JSON that is loaded, for example when complexity data has been computed (see #379). 2017-05-02 15:19:35 +00:00
			`gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn)`
			`for bn in gen:`
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`other_bench = find_test(bn['name'])`
			`if not other_bench:`
			`continue`

Json reporter: don't cast floating-point to int; adjust tooling (#426) * Json reporter: passthrough fp, don't cast it to int; adjust tooling Json output format is generally meant for further processing using some automated tools. Thus, it makes sense not to intentionally limit the precision of the values contained in the report. As it can be seen, FormatKV() for doubles, used %.2f format, which was meant to preserve at least some of the precision. However, before that function is ever called, the doubles were already cast to the integer via RoundDouble()... This is also the case for console reporter, where it makes sense because the screen space is limited, and this reporter, however the CSV reporter does output some( decimal digits. Thus i can only conclude that the loss of the precision was not really considered, so i have decided to adjust the code of the json reporter to output the full fp precision. There can be several reasons why that is the right thing to do, the bigger the time_unit used, the greater the precision loss, so i'd say any sort of further processing (like e.g. tools/compare_bench.py does) is best done on the values with most precision. Also, that cast skewed the data away from zero, which i think may or may not result in false- positives/negatives in the output of tools/compare_bench.py * Json reporter: FormatKV(double): address review note * tools/gbench/report.py: skip benchmarks with different time units While it may be useful to teach it to operate on the measurements with different time units, which is now possible since floats are stored, and not the integers, but for now at least doing such a sanity-checking is better than providing misinformation. 2017-07-24 23:13:55 +00:00			`if bn['time_unit'] != other_bench['time_unit']:`
			`continue`

Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`def get_color(res):`
			`if res > 0.05:`
			`return BC_FAIL`
			`elif res > -0.07:`
			`return BC_WHITE`
			`else:`
			`return BC_CYAN`
Json reporter: don't cast floating-point to int; adjust tooling (#426) * Json reporter: passthrough fp, don't cast it to int; adjust tooling Json output format is generally meant for further processing using some automated tools. Thus, it makes sense not to intentionally limit the precision of the values contained in the report. As it can be seen, FormatKV() for doubles, used %.2f format, which was meant to preserve at least some of the precision. However, before that function is ever called, the doubles were already cast to the integer via RoundDouble()... This is also the case for console reporter, where it makes sense because the screen space is limited, and this reporter, however the CSV reporter does output some( decimal digits. Thus i can only conclude that the loss of the precision was not really considered, so i have decided to adjust the code of the json reporter to output the full fp precision. There can be several reasons why that is the right thing to do, the bigger the time_unit used, the greater the precision loss, so i'd say any sort of further processing (like e.g. tools/compare_bench.py does) is best done on the values with most precision. Also, that cast skewed the data away from zero, which i think may or may not result in false- positives/negatives in the output of tools/compare_bench.py * Json reporter: FormatKV(double): address review note * tools/gbench/report.py: skip benchmarks with different time units While it may be useful to teach it to operate on the measurements with different time units, which is now possible since floats are stored, and not the integers, but for now at least doing such a sanity-checking is better than providing misinformation. 2017-07-24 23:13:55 +00:00			`fmt_str = "{}{:<{}s}{endc}{}{:+9.2f}{endc}{}{:+14.2f}{endc}{:14.0f}{:14.0f}"`
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`tres = calculate_change(bn['real_time'], other_bench['real_time'])`
			`cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])`
			`output_strs += [color_format(use_color, fmt_str,`
			`BC_HEADER, bn['name'], first_col_width,`
			`get_color(tres), tres, get_color(cpures), cpures,`
Make `PauseTiming()` and `ResumeTiming()` per thread. (#286) * Change to using per-thread timers * fix bad assertions * fix copy paste error on windows * Fix thread safety annotations * Make null-log thread safe * remove remaining globals * use chrono for walltime since it is thread safe * consolidate timer functions * Add missing ctime include * Rename to be consistent with Google style * Format patch using clang-format * cleanup -Wthread-safety configuration * Don't trust _POSIX_FEATURE macros because OS X lies. * Fix OS X thread timings * attempt to fix mingw build * Attempt to make mingw work again * Revert old mingw workaround * improve diagnostics * Drastically improve OS X measurements * Use average real time instead of max 2016-09-03 03:34:34 +00:00			`bn['cpu_time'], other_bench['cpu_time'],`
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`endc=BC_ENDC)]`
			`return output_strs`

			`###############################################################################`
			`# Unit tests`

			`import unittest`

			`class TestReportDifference(unittest.TestCase):`
			`def load_results(self):`
			`import json`
			`testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs')`
			`testOutput1 = os.path.join(testInputs, 'test1_run1.json')`
			`testOutput2 = os.path.join(testInputs, 'test1_run2.json')`
			`with open(testOutput1, 'r') as f:`
			`json1 = json.load(f)`
			`with open(testOutput2, 'r') as f:`
			`json2 = json.load(f)`
			`return json1, json2`

			`def test_basic(self):`
			`expect_lines = [`
fix compare script - output formatting - correctly align numbers >9999 (#322) * fix compare script - output formatting - correctly align numbers >9999 * fix failing test (report.py); fix compare script output formatting (large numbers alignment) 2016-12-09 12:24:31 +00:00			`['BM_SameTimes', '+0.00', '+0.00', '10', '10'],`
			`['BM_2xFaster', '-0.50', '-0.50', '50', '25'],`
			`['BM_2xSlower', '+1.00', '+1.00', '50', '100'],`
Json reporter: don't cast floating-point to int; adjust tooling (#426) * Json reporter: passthrough fp, don't cast it to int; adjust tooling Json output format is generally meant for further processing using some automated tools. Thus, it makes sense not to intentionally limit the precision of the values contained in the report. As it can be seen, FormatKV() for doubles, used %.2f format, which was meant to preserve at least some of the precision. However, before that function is ever called, the doubles were already cast to the integer via RoundDouble()... This is also the case for console reporter, where it makes sense because the screen space is limited, and this reporter, however the CSV reporter does output some( decimal digits. Thus i can only conclude that the loss of the precision was not really considered, so i have decided to adjust the code of the json reporter to output the full fp precision. There can be several reasons why that is the right thing to do, the bigger the time_unit used, the greater the precision loss, so i'd say any sort of further processing (like e.g. tools/compare_bench.py does) is best done on the values with most precision. Also, that cast skewed the data away from zero, which i think may or may not result in false- positives/negatives in the output of tools/compare_bench.py * Json reporter: FormatKV(double): address review note * tools/gbench/report.py: skip benchmarks with different time units While it may be useful to teach it to operate on the measurements with different time units, which is now possible since floats are stored, and not the integers, but for now at least doing such a sanity-checking is better than providing misinformation. 2017-07-24 23:13:55 +00:00			`['BM_1PercentFaster', '-0.01', '-0.01', '100', '99'],`
			`['BM_1PercentSlower', '+0.01', '+0.01', '100', '101'],`
fix compare script - output formatting - correctly align numbers >9999 (#322) * fix compare script - output formatting - correctly align numbers >9999 * fix failing test (report.py); fix compare script output formatting (large numbers alignment) 2016-12-09 12:24:31 +00:00			`['BM_10PercentFaster', '-0.10', '-0.10', '100', '90'],`
			`['BM_10PercentSlower', '+0.10', '+0.10', '100', '110'],`
			`['BM_100xSlower', '+99.00', '+99.00', '100', '10000'],`
			`['BM_100xFaster', '-0.99', '-0.99', '10000', '100'],`
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`]`
			`json1, json2 = self.load_results()`
fix compare script - output formatting - correctly align numbers >9999 (#322) * fix compare script - output formatting - correctly align numbers >9999 * fix failing test (report.py); fix compare script output formatting (large numbers alignment) 2016-12-09 12:24:31 +00:00			`output_lines_with_header = generate_difference_report(json1, json2, use_color=False)`
			`output_lines = output_lines_with_header[2:]`
Python 2/3 compatibility (#361) * [tools] python 2/3 support * update authors/contributors 2017-03-29 10:39:18 +00:00			`print("\n".join(output_lines_with_header))`
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`self.assertEqual(len(output_lines), len(expect_lines))`
			`for i in xrange(0, len(output_lines)):`
			`parts = [x for x in output_lines[i].split(' ') if x]`
fix compare script - output formatting - correctly align numbers >9999 (#322) * fix compare script - output formatting - correctly align numbers >9999 * fix failing test (report.py); fix compare script output formatting (large numbers alignment) 2016-12-09 12:24:31 +00:00			`self.assertEqual(len(parts), 5)`
Add a "compare_bench.py" tooling script. (#266) This patch adds the compare_bench.py utility which can be used to compare the result of benchmarks. The program is invoked like: $ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]... Where <old-benchmark> and <new-benchmark> either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. 2016-08-09 18:33:57 +00:00			`self.assertEqual(parts, expect_lines[i])`


			`if __name__ == '__main__':`
			`unittest.main()`