Fix CPU frequency estimation on riscv (#1549)

* Fix CPU frequency estimation on riscv

* Cleanup code for CPU frequency estimation

* Fix use before definition of the macro

* Move the platform definitions back

* Fix compilation error on windows

* Remove unused sleep.h and sleep.cc
This commit is contained in:
Yingwei Zheng 2023-02-21 19:30:28 +08:00 committed by GitHub
parent b111d01c1b
commit 3b19d7222d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 138 additions and 86 deletions

View File

@ -307,6 +307,7 @@ cxx_feature_check(STEADY_CLOCK)
# Ensure we have pthreads # Ensure we have pthreads
set(THREADS_PREFER_PTHREAD_FLAG ON) set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
cxx_feature_check(PTHREAD_AFFINITY)
if (BENCHMARK_ENABLE_LIBPFM) if (BENCHMARK_ENABLE_LIBPFM)
find_package(PFM) find_package(PFM)

View File

@ -0,0 +1,16 @@
#include <pthread.h>
// Probe program for the PTHREAD_AFFINITY feature check. It touches cpu_set_t,
// the CPU_* macros and pthread_{get,set}affinity_np, so it only compiles where
// that API exists.
int main() {
  cpu_set_t cpus;
  CPU_ZERO(&cpus);
  // Exercise CPU_SET / CPU_CLR for every index below CPU_SETSIZE.
  for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
    CPU_SET(cpu, &cpus);
    CPU_CLR(cpu, &cpus);
  }
  const pthread_t current = pthread_self();
  // Read the current mask, then write the very same mask back. A non-zero
  // status from either call becomes the exit code.
  const int get_status = pthread_getaffinity_np(current, sizeof(cpus), &cpus);
  if (get_status != 0) return get_status;
  return pthread_setaffinity_np(current, sizeof(cpus), &cpus);
}

View File

@ -34,6 +34,11 @@ if (HAVE_LIBPFM)
target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM) target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
endif() endif()
# pthread affinity, if available: when HAVE_PTHREAD_AFFINITY is set
# (presumably by the PTHREAD_AFFINITY feature check -- confirm against
# cxx_feature_check), compile the benchmark library with
# BENCHMARK_HAS_PTHREAD_AFFINITY defined. The definition is PRIVATE, so it is
# not propagated to targets that link against benchmark.
if(HAVE_PTHREAD_AFFINITY)
target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY)
endif()
# Link threads. # Link threads.
target_link_libraries(benchmark PRIVATE Threads::Threads) target_link_libraries(benchmark PRIVATE Threads::Threads)

View File

@ -42,6 +42,10 @@
#define BENCHMARK_OS_CYGWIN 1 #define BENCHMARK_OS_CYGWIN 1
#elif defined(_WIN32) #elif defined(_WIN32)
#define BENCHMARK_OS_WINDOWS 1 #define BENCHMARK_OS_WINDOWS 1
// WINAPI_FAMILY_PARTITION is defined in winapifamily.h.
// We include windows.h which implicitly includes winapifamily.h for compatibility.
// NOMINMAX stops windows.h from defining the min()/max() macros. Define it
// only if the build has not done so already: an unconditional #define clashes
// with e.g. -DNOMINMAX on the command line and triggers a macro-redefinition
// diagnostic.
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#if defined(WINAPI_FAMILY_PARTITION) #if defined(WINAPI_FAMILY_PARTITION)
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#define BENCHMARK_OS_WINDOWS_WIN32 1 #define BENCHMARK_OS_WINDOWS_WIN32 1

View File

@ -1,66 +0,0 @@
// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "sleep.h"
#include <cerrno>
#include <cstdlib>
#include <ctime>
#include "internal_macros.h"
#ifdef BENCHMARK_OS_WINDOWS
#include <windows.h>
#endif
#ifdef BENCHMARK_OS_ZOS
#include <unistd.h>
#endif
namespace benchmark {
#ifdef BENCHMARK_OS_WINDOWS
// The Windows Sleep() call takes its duration in milliseconds, so the argument
// is handed over unchanged.
void SleepForMilliseconds(int milliseconds) {
  Sleep(milliseconds);
}
// Sleeps for `seconds`, converted to milliseconds and truncated to an int.
void SleepForSeconds(double seconds) {
  const int whole_millis = static_cast<int>(kNumMillisPerSecond * seconds);
  SleepForMilliseconds(whole_millis);
}
#else // BENCHMARK_OS_WINDOWS
// Blocks the calling thread for `microseconds`, retrying whenever the
// underlying sleep call is interrupted (EINTR).
void SleepForMicroseconds(int microseconds) {
#ifdef BENCHMARK_OS_ZOS
  // z/OS does not support nanosleep. Sleep the whole seconds with sleep() and
  // the sub-second remainder with usleep(), because usleep() will fail if its
  // argument is greater than 1000000.
  const div_t parts = div(microseconds, kNumMicrosPerSecond);
  // sleep() returns the number of seconds left unslept; loop until none remain.
  for (int seconds_left = parts.quot; seconds_left != 0;) {
    seconds_left = sleep(seconds_left);
  }
  while (usleep(parts.rem) == -1 && errno == EINTR) {
  }
#else
  struct timespec remaining;
  remaining.tv_sec = microseconds / kNumMicrosPerSecond;
  remaining.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
  // The same struct is passed as request and remainder, so an interrupted call
  // resumes with whatever time is still left. Ignore signals and wait for the
  // full interval to elapse.
  while (nanosleep(&remaining, &remaining) != 0 && errno == EINTR) {
  }
#endif
}
// Converts the duration to microseconds and defers to SleepForMicroseconds.
// NOTE(review): the product is computed in int, so it overflows once
// `milliseconds` exceeds INT_MAX / kNumMicrosPerMilli -- confirm callers only
// pass small values.
void SleepForMilliseconds(int milliseconds) {
  const int as_micros = milliseconds * kNumMicrosPerMilli;
  SleepForMicroseconds(as_micros);
}
// Sleeps for `seconds`, converted to microseconds and truncated to an int.
void SleepForSeconds(double seconds) {
  const int whole_micros = static_cast<int>(seconds * kNumMicrosPerSecond);
  SleepForMicroseconds(whole_micros);
}
#endif // BENCHMARK_OS_WINDOWS
} // end namespace benchmark

View File

@ -1,15 +0,0 @@
#ifndef BENCHMARK_SLEEP_H_
#define BENCHMARK_SLEEP_H_
namespace benchmark {
// Conversion factors between the time units used by the sleep helpers.
const int kNumNanosPerMicro = 1000;
const int kNumMicrosPerMilli = 1000;
const int kNumMillisPerSecond = 1000;
const int kNumMicrosPerSecond = kNumMicrosPerMilli * kNumMillisPerSecond;
const int kNumNanosPerSecond = kNumMicrosPerSecond * kNumNanosPerMicro;

// Block the calling thread for the given duration (defined in sleep.cc).
void SleepForSeconds(double seconds);
void SleepForMilliseconds(int milliseconds);
}  // end namespace benchmark
#endif // BENCHMARK_SLEEP_H_

View File

@ -46,6 +46,9 @@
#if defined(BENCHMARK_OS_QURT) #if defined(BENCHMARK_OS_QURT)
#include <qurt.h> #include <qurt.h>
#endif #endif
#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
#include <pthread.h>
#endif
#include <algorithm> #include <algorithm>
#include <array> #include <array>
@ -62,15 +65,17 @@
#include <limits> #include <limits>
#include <locale> #include <locale>
#include <memory> #include <memory>
#include <random>
#include <sstream> #include <sstream>
#include <utility> #include <utility>
#include "benchmark/benchmark.h"
#include "check.h" #include "check.h"
#include "cycleclock.h" #include "cycleclock.h"
#include "internal_macros.h" #include "internal_macros.h"
#include "log.h" #include "log.h"
#include "sleep.h"
#include "string_util.h" #include "string_util.h"
#include "timers.h"
namespace benchmark { namespace benchmark {
namespace { namespace {
@ -544,6 +549,80 @@ int GetNumCPUs() {
BENCHMARK_UNREACHABLE(); BENCHMARK_UNREACHABLE();
} }
// Scoped helper that narrows the calling thread's CPU affinity to a single CPU
// on construction and puts the saved affinity back on destruction.
//
// Two back ends are supported: the pthread affinity API (when
// BENCHMARK_HAS_PTHREAD_AFFINITY is defined) and the Win32 thread-affinity API
// (when BENCHMARK_OS_WINDOWS_WIN32 is defined). With neither, the guard never
// changes anything and the constructor prints the warning.
class ThreadAffinityGuard final {
 public:
  // Attempts to pin the thread. Failure is not fatal: a warning is written to
  // std::cerr and the destructor then has nothing to undo.
  ThreadAffinityGuard() : must_restore_(SetAffinity()) {
    if (must_restore_) return;
    std::cerr << "***WARNING*** Failed to set thread affinity. Estimated CPU "
                 "frequency may be incorrect."
              << std::endl;
  }

  // Restores the saved affinity if the constructor changed it. Failing to
  // restore is reported through PrintErrorAndDie.
  ~ThreadAffinityGuard() {
    if (!must_restore_) return;
    bool restored = false;
#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
    restored = pthread_setaffinity_np(thread_, sizeof(saved_affinity_),
                                      &saved_affinity_) == 0;
#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
    restored = SetThreadAffinityMask(thread_, saved_affinity_) != 0;
#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
    if (!restored) PrintErrorAndDie("Failed to reset thread affinity");
  }

  // The guard is tied to one thread and one scope: neither copyable nor
  // movable.
  ThreadAffinityGuard(ThreadAffinityGuard&&) = delete;
  ThreadAffinityGuard(const ThreadAffinityGuard&) = delete;
  ThreadAffinityGuard& operator=(ThreadAffinityGuard&&) = delete;
  ThreadAffinityGuard& operator=(const ThreadAffinityGuard&) = delete;

 private:
  // Saves the current affinity and pins the thread to one CPU.
  // Returns true only if the affinity was actually changed.
  bool SetAffinity() {
#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
    thread_ = pthread_self();
    if (pthread_getaffinity_np(thread_, sizeof(saved_affinity_),
                               &saved_affinity_) != 0) {
      return false;
    }
    // Locate the lowest-numbered CPU in the saved set ...
    int first_cpu = 0;
    while (first_cpu < CPU_SETSIZE && !CPU_ISSET(first_cpu, &saved_affinity_)) {
      ++first_cpu;
    }
    if (first_cpu == CPU_SETSIZE) return false;  // The saved set was empty.
    // ... and request a mask that contains only that CPU.
    cpu_set_t single_cpu;
    CPU_ZERO(&single_cpu);
    CPU_SET(first_cpu, &single_cpu);
    return pthread_setaffinity_np(thread_, sizeof(single_cpu), &single_cpu) ==
           0;
#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
    thread_ = GetCurrentThread();
    // Pin to whichever processor the thread is running on right now.
    const DWORD_PTR single_cpu = static_cast<DWORD_PTR>(1)
                                 << GetCurrentProcessorNumber();
    // The return value is kept as the mask to restore; zero is treated as
    // failure.
    saved_affinity_ = SetThreadAffinityMask(thread_, single_cpu);
    return saved_affinity_ != 0;
#else
    return false;
#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
  }

  // thread_ and saved_affinity_ are assigned inside SetAffinity(), which runs
  // from the constructor's initializer for must_restore_.
#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
  pthread_t thread_;
  cpu_set_t saved_affinity_;
#elif defined(BENCHMARK_OS_WINDOWS_WIN32)
  HANDLE thread_;
  DWORD_PTR saved_affinity_;
#endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
  // True when the constructor changed the affinity and it must be put back.
  bool must_restore_;
};
double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) { double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
// Currently, scaling is only used on linux path here, // Currently, scaling is only used on linux path here,
// suppress diagnostics about it being unused on other paths. // suppress diagnostics about it being unused on other paths.
@ -699,10 +778,39 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
return 1000000000; return 1000000000;
#endif #endif
// If we've fallen through, attempt to roughly estimate the CPU clock rate. // If we've fallen through, attempt to roughly estimate the CPU clock rate.
static constexpr int estimate_time_ms = 1000;
// Make sure to use the same cycle counter when starting and stopping the
// cycle timer. We just pin the current thread to a cpu in the previous
// affinity set.
ThreadAffinityGuard affinity_guard;
static constexpr double estimate_time_s = 1.0;
const double start_time = ChronoClockNow();
const auto start_ticks = cycleclock::Now(); const auto start_ticks = cycleclock::Now();
SleepForMilliseconds(estimate_time_ms);
return static_cast<double>(cycleclock::Now() - start_ticks); // Impose load instead of calling sleep() to make sure the cycle counter
// works.
using PRNG = std::minstd_rand;
using Result = PRNG::result_type;
PRNG rng(static_cast<Result>(start_ticks));
Result state = 0;
do {
static constexpr size_t batch_size = 10000;
rng.discard(batch_size);
state += rng();
} while (ChronoClockNow() - start_time < estimate_time_s);
DoNotOptimize(state);
const auto end_ticks = cycleclock::Now();
const double end_time = ChronoClockNow();
return static_cast<double>(end_ticks - start_ticks) / (end_time - start_time);
// Reset the affinity of current thread when the lifetime of affinity_guard
// ends.
} }
std::vector<double> GetLoadAvg() { std::vector<double> GetLoadAvg() {

View File

@ -59,7 +59,6 @@
#include "check.h" #include "check.h"
#include "log.h" #include "log.h"
#include "sleep.h"
#include "string_util.h" #include "string_util.h"
namespace benchmark { namespace benchmark {