benchmark/src/complexity.cc

// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Source project : https://github.com/ismaelJimenez/cpp.leastsq
// Adapted to be used with google benchmark
#include "complexity.h"
#include <algorithm>
#include <cmath>
#include "benchmark/benchmark.h"
#include "check.h"
namespace benchmark {
// Internal function to calculate the different scalability forms
BigOFunc* FittingCurve(BigO complexity) {
static const double kLog2E = 1.44269504088896340736;
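// kLog2E is log2(e) == 1 / ln(2); multiplying std::log(x) (the natural log)
// by kLog2E therefore yields log2(x) without needing std::log2.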
switch (complexity) {
case oN:
return [](IterationCount n) -> double { return static_cast<double>(n); };
case oNSquared:
return [](IterationCount n) -> double { return std::pow(n, 2); };
case oNCubed:
return [](IterationCount n) -> double { return std::pow(n, 3); };
case oLogN:
/* Note: can't use log2 because Android's GNU STL lacks it */
return [](IterationCount n) {
return kLog2E * std::log(static_cast<double>(n));
};
case oNLogN:
/* Note: can't use log2 because Android's GNU STL lacks it */
return [](IterationCount n) {
return kLog2E * static_cast<double>(n) *
std::log(static_cast<double>(n));
};
case o1:
default:
return [](IterationCount) { return 1.0; };
}
}
// Function to return a string for the calculated complexity
std::string GetBigOString(BigO complexity) {
switch (complexity) {
case oN:
return "N";
case oNSquared:
return "N^2";
case oNCubed:
return "N^3";
case oLogN:
return "lgN";
case oNLogN:
return "NlgN";
case o1:
return "(1)";
default:
return "f(N)";
}
}
// Find the coefficient for the high-order term in the running time, by
// minimizing the sum of squares of relative error, for the fitting curve
// given by the lambda expression.
// - n : Vector containing the size of the benchmark tests.
// - time : Vector containing the times for the benchmark tests.
// - fitting_curve : lambda expression (e.g. [](ComplexityN n) {return n; };).
// For a deeper explanation of the algorithm, please refer to
// https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics
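//
// For reference, the single-parameter fit below has a closed form: minimizing
//   sum_i (time_i - coef * g(n_i))^2
// over coef yields the normal equation
//   coef = (sum_i time_i * g(n_i)) / (sum_i g(n_i)^2),
// and the reported rms is sqrt(sum_i (time_i - coef * g(n_i))^2 / N) divided
// by the mean of time, making it a scale-free (relative) error measure.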
LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
const std::vector<double>& time,
BigOFunc* fitting_curve) {
double sigma_gn_squared = 0.0;
double sigma_time = 0.0;
double sigma_time_gn = 0.0;
// Calculate least square fitting parameter
for (size_t i = 0; i < n.size(); ++i) {
double gn_i = fitting_curve(n[i]);
sigma_gn_squared += gn_i * gn_i;
sigma_time += time[i];
sigma_time_gn += time[i] * gn_i;
}
LeastSq result;
result.complexity = oLambda;
// Calculate complexity.
result.coef = sigma_time_gn / sigma_gn_squared;
// Calculate RMS
double rms = 0.0;
for (size_t i = 0; i < n.size(); ++i) {
double fit = result.coef * fitting_curve(n[i]);
rms += std::pow((time[i] - fit), 2);
}
// Normalize the RMS by the mean of the observed values
double mean = sigma_time / static_cast<double>(n.size());
result.rms = std::sqrt(rms / static_cast<double>(n.size())) / mean;
return result;
}
// Find the coefficient for the high-order term in the running time, by
// minimizing the sum of squares of relative error.
// - n : Vector containing the size of the benchmark tests.
// - time : Vector containing the times for the benchmark tests.
// - complexity : If different from oAuto, the fitting curve will stick to
// this one. If it is oAuto, the best fitting curve will be calculated.
LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
const std::vector<double>& time, const BigO complexity) {
BM_CHECK_EQ(n.size(), time.size());
BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve if less than two
// benchmark runs are given
BM_CHECK_NE(complexity, oNone);
LeastSq best_fit;
if (complexity == oAuto) {
std::vector<BigO> fit_curves = {oLogN, oN, oNLogN, oNSquared, oNCubed};
// Take o1 as the default best fitting curve
best_fit = MinimalLeastSq(n, time, FittingCurve(o1));
best_fit.complexity = o1;
// Compute all possible fitting curves and stick to the best one
for (const auto& fit : fit_curves) {
LeastSq current_fit = MinimalLeastSq(n, time, FittingCurve(fit));
if (current_fit.rms < best_fit.rms) {
best_fit = current_fit;
best_fit.complexity = fit;
}
}
} else {
best_fit = MinimalLeastSq(n, time, FittingCurve(complexity));
best_fit.complexity = complexity;
}
return best_fit;
}
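
// ComputeBigO() consumes the per-size runs of a single benchmark family. For
// orientation only, a hypothetical user-side sketch (using the public
// Google Benchmark API: BENCHMARK, Range, Complexity, SetComplexityN) that
// produces such runs could look like:
//
//   static void BM_StringCompare(benchmark::State& state) {
//     std::string s1(state.range(0), '-');
//     std::string s2(state.range(0), '-');
//     for (auto _ : state) {
//       benchmark::DoNotOptimize(s1.compare(s2));
//     }
//     state.SetComplexityN(state.range(0));
//   }
//   BENCHMARK(BM_StringCompare)
//       ->RangeMultiplier(2)->Range(1 << 10, 1 << 18)->Complexity();  // oAuto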
std::vector<BenchmarkReporter::Run> ComputeBigO(
const std::vector<BenchmarkReporter::Run>& reports) {
typedef BenchmarkReporter::Run Run;
std::vector<Run> results;
if (reports.size() < 2) return results;
// Accumulators.
std::vector<ComplexityN> n;
std::vector<double> real_time;
std::vector<double> cpu_time;
// Populate the accumulators.
for (const Run& run : reports) {
BM_CHECK_GT(run.complexity_n, 0)
<< "Did you forget to call SetComplexityN?";
n.push_back(run.complexity_n);
real_time.push_back(run.real_accumulated_time /
static_cast<double>(run.iterations));
cpu_time.push_back(run.cpu_accumulated_time /
static_cast<double>(run.iterations));
}
LeastSq result_cpu;
LeastSq result_real;
if (reports[0].complexity == oLambda) {
result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda);
result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda);
} else {
const BigO* InitialBigO = &reports[0].complexity;
const bool use_real_time_for_initial_big_o =
reports[0].use_real_time_for_initial_big_o;
if (use_real_time_for_initial_big_o) {
result_real = MinimalLeastSq(n, real_time, *InitialBigO);
InitialBigO = &result_real.complexity;
// The Big-O complexity for CPU time must have the same Big-O function!
}
result_cpu = MinimalLeastSq(n, cpu_time, *InitialBigO);
InitialBigO = &result_cpu.complexity;
if (!use_real_time_for_initial_big_o) {
result_real = MinimalLeastSq(n, real_time, *InitialBigO);
}
}
// Drop the 'args' when reporting complexity.
auto run_name = reports[0].run_name;
run_name.args.clear();
// Get the data from the accumulator to BenchmarkReporter::Run's.
Run big_o;
big_o.run_name = run_name;
big_o.family_index = reports[0].family_index;
big_o.per_family_instance_index = reports[0].per_family_instance_index;
big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
big_o.repetitions = reports[0].repetitions;
big_o.repetition_index = Run::no_repetition_index;
big_o.threads = reports[0].threads;
big_o.aggregate_name = "BigO";
big_o.aggregate_unit = StatisticUnit::kTime;
big_o.report_label = reports[0].report_label;
big_o.iterations = 0;
big_o.real_accumulated_time = result_real.coef;
big_o.cpu_accumulated_time = result_cpu.coef;
big_o.report_big_o = true;
big_o.complexity = result_cpu.complexity;
// All the time results are reported after being multiplied by the
// time unit multiplier. But since RMS is a relative quantity it
// should not be multiplied at all. So, here, we _divide_ it by the
// multiplier so that when it is multiplied later the result is the
// correct one.
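// For example, assuming a microsecond time unit corresponds to a multiplier
// of 1e6: a relative RMS of 0.05 (5%) is stored as 0.05 / 1e6 here, so the
// reporter's later multiplication by 1e6 prints 0.05 again.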
double multiplier = GetTimeUnitMultiplier(reports[0].time_unit);
// Only add the label to mean/stddev if it is the same for all runs
Run rms;
rms.run_name = run_name;
rms.family_index = reports[0].family_index;
rms.per_family_instance_index = reports[0].per_family_instance_index;
rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
rms.aggregate_name = "RMS";
rms.aggregate_unit = StatisticUnit::kPercentage;
rms.report_label = big_o.report_label;
rms.iterations = 0;
rms.repetition_index = Run::no_repetition_index;
rms.repetitions = reports[0].repetitions;
rms.threads = reports[0].threads;
rms.real_accumulated_time = result_real.rms / multiplier;
rms.cpu_accumulated_time = result_cpu.rms / multiplier;
rms.report_rms = true;
rms.complexity = result_cpu.complexity;
// don't forget to keep the time unit, or we won't be able to
// recover the correct value.
rms.time_unit = reports[0].time_unit;
results.push_back(big_o);
results.push_back(rms);
return results;
}
} // end namespace benchmark