From 3b19d7222db7babfdc9b3949408b2294c3bbb540 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Tue, 21 Feb 2023 19:30:28 +0800 Subject: [PATCH] Fix CPU frequency estimation on riscv (#1549) * Fix CPU frequency estimation on riscv * Cleanup code for CPU frequency estimation * Fix use before definition of the macro * Move the platform definitions back * Fix compilation error on windows * Remove unused sleep.h and sleep.cc --- CMakeLists.txt | 1 + cmake/pthread_affinity.cpp | 16 +++++ src/CMakeLists.txt | 5 ++ src/internal_macros.h | 4 ++ src/sleep.cc | 66 --------------------- src/sleep.h | 15 ----- src/sysinfo.cc | 116 +++++++++++++++++++++++++++++++++++-- src/timers.cc | 1 - 8 files changed, 138 insertions(+), 86 deletions(-) create mode 100644 cmake/pthread_affinity.cpp delete mode 100644 src/sleep.cc delete mode 100644 src/sleep.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ccec880c..6e7701e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,6 +307,7 @@ cxx_feature_check(STEADY_CLOCK) # Ensure we have pthreads set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) +cxx_feature_check(PTHREAD_AFFINITY) if (BENCHMARK_ENABLE_LIBPFM) find_package(PFM) diff --git a/cmake/pthread_affinity.cpp b/cmake/pthread_affinity.cpp new file mode 100644 index 00000000..7b143bc0 --- /dev/null +++ b/cmake/pthread_affinity.cpp @@ -0,0 +1,16 @@ +#include +int main() { + cpu_set_t set; + CPU_ZERO(&set); + for (int i = 0; i < CPU_SETSIZE; ++i) { + CPU_SET(i, &set); + CPU_CLR(i, &set); + } + pthread_t self = pthread_self(); + int ret; + ret = pthread_getaffinity_np(self, sizeof(set), &set); + if (ret != 0) return ret; + ret = pthread_setaffinity_np(self, sizeof(set), &set); + if (ret != 0) return ret; + return 0; +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7f2c88b5..91ea5f42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,6 +34,11 @@ if (HAVE_LIBPFM) target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM) endif() +# pthread affinity, if available +if(HAVE_PTHREAD_AFFINITY) + target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY) +endif() + # Link threads. target_link_libraries(benchmark PRIVATE Threads::Threads) diff --git a/src/internal_macros.h b/src/internal_macros.h index 396a390a..658f1573 100644 --- a/src/internal_macros.h +++ b/src/internal_macros.h @@ -42,6 +42,10 @@ #define BENCHMARK_OS_CYGWIN 1 #elif defined(_WIN32) #define BENCHMARK_OS_WINDOWS 1 + // WINAPI_FAMILY_PARTITION is defined in winapifamily.h. + // We include windows.h which implicitly includes winapifamily.h for compatibility. + #define NOMINMAX + #include #if defined(WINAPI_FAMILY_PARTITION) #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) #define BENCHMARK_OS_WINDOWS_WIN32 1 diff --git a/src/sleep.cc b/src/sleep.cc deleted file mode 100644 index ab59000f..00000000 --- a/src/sleep.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "sleep.h" - -#include -#include -#include - -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#endif - -#ifdef BENCHMARK_OS_ZOS -#include -#endif - -namespace benchmark { -#ifdef BENCHMARK_OS_WINDOWS -// Window's Sleep takes milliseconds argument. -void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); } -void SleepForSeconds(double seconds) { - SleepForMilliseconds(static_cast(kNumMillisPerSecond * seconds)); -} -#else // BENCHMARK_OS_WINDOWS -void SleepForMicroseconds(int microseconds) { -#ifdef BENCHMARK_OS_ZOS - // z/OS does not support nanosleep. Instead call sleep() and then usleep() to - // sleep for the remaining microseconds because usleep() will fail if its - // argument is greater than 1000000. - div_t sleepTime = div(microseconds, kNumMicrosPerSecond); - int seconds = sleepTime.quot; - while (seconds != 0) seconds = sleep(seconds); - while (usleep(sleepTime.rem) == -1 && errno == EINTR) - ; -#else - struct timespec sleep_time; - sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; - sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; - while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) - ; // Ignore signals and wait for the full interval to elapse. -#endif -} - -void SleepForMilliseconds(int milliseconds) { - SleepForMicroseconds(milliseconds * kNumMicrosPerMilli); -} - -void SleepForSeconds(double seconds) { - SleepForMicroseconds(static_cast(seconds * kNumMicrosPerSecond)); -} -#endif // BENCHMARK_OS_WINDOWS -} // end namespace benchmark diff --git a/src/sleep.h b/src/sleep.h deleted file mode 100644 index f98551af..00000000 --- a/src/sleep.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef BENCHMARK_SLEEP_H_ -#define BENCHMARK_SLEEP_H_ - -namespace benchmark { -const int kNumMillisPerSecond = 1000; -const int kNumMicrosPerMilli = 1000; -const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000; -const int kNumNanosPerMicro = 1000; -const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond; - -void SleepForMilliseconds(int milliseconds); -void SleepForSeconds(double seconds); -} // end namespace benchmark - -#endif // BENCHMARK_SLEEP_H_ diff --git a/src/sysinfo.cc b/src/sysinfo.cc index 59120b72..4578cb0f 100644 --- a/src/sysinfo.cc +++ b/src/sysinfo.cc @@ -46,6 +46,9 @@ #if defined(BENCHMARK_OS_QURT) #include #endif +#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY) +#include +#endif #include #include @@ -62,15 +65,17 @@ #include #include #include +#include #include #include +#include "benchmark/benchmark.h" #include "check.h" #include "cycleclock.h" #include "internal_macros.h" #include "log.h" -#include "sleep.h" #include "string_util.h" +#include "timers.h" namespace benchmark { namespace { @@ -544,6 +549,80 @@ int GetNumCPUs() { BENCHMARK_UNREACHABLE(); } +class ThreadAffinityGuard final { + public: + ThreadAffinityGuard() : reset_affinity(SetAffinity()) { + if (!reset_affinity) + std::cerr << "***WARNING*** Failed to set thread affinity. Estimated CPU " + "frequency may be incorrect." + << std::endl; + } + + ~ThreadAffinityGuard() { + if (!reset_affinity) return; + +#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY) + int ret = pthread_setaffinity_np(self, sizeof(previous_affinity), + &previous_affinity); + if (ret == 0) return; +#elif defined(BENCHMARK_OS_WINDOWS_WIN32) + DWORD_PTR ret = SetThreadAffinityMask(self, previous_affinity); + if (ret != 0) return; +#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY + PrintErrorAndDie("Failed to reset thread affinity"); + } + + ThreadAffinityGuard(ThreadAffinityGuard&&) = delete; + ThreadAffinityGuard(const ThreadAffinityGuard&) = delete; + ThreadAffinityGuard& operator=(ThreadAffinityGuard&&) = delete; + ThreadAffinityGuard& operator=(const ThreadAffinityGuard&) = delete; + + private: + bool SetAffinity() { +#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY) + int ret; + self = pthread_self(); + ret = pthread_getaffinity_np(self, sizeof(previous_affinity), + &previous_affinity); + if (ret != 0) return false; + + cpu_set_t affinity; + memcpy(&affinity, &previous_affinity, sizeof(affinity)); + + bool is_first_cpu = true; + + for (int i = 0; i < CPU_SETSIZE; ++i) + if (CPU_ISSET(i, &affinity)) { + if (is_first_cpu) + is_first_cpu = false; + else + CPU_CLR(i, &affinity); + } + + if (is_first_cpu) return false; + + ret = pthread_setaffinity_np(self, sizeof(affinity), &affinity); + return ret == 0; +#elif defined(BENCHMARK_OS_WINDOWS_WIN32) + self = GetCurrentThread(); + DWORD_PTR mask = static_cast(1) << GetCurrentProcessorNumber(); + previous_affinity = SetThreadAffinityMask(self, mask); + return previous_affinity != 0; +#else + return false; +#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY + } + +#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY) + pthread_t self; + cpu_set_t previous_affinity; +#elif defined(BENCHMARK_OS_WINDOWS_WIN32) + HANDLE self; + DWORD_PTR previous_affinity; +#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY + bool reset_affinity; +}; + double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) { // Currently, scaling is only used on linux path here, // suppress diagnostics about it being unused on other paths. @@ -699,10 +778,39 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) { return 1000000000; #endif // If we've fallen through, attempt to roughly estimate the CPU clock rate. - static constexpr int estimate_time_ms = 1000; + + // Make sure to use the same cycle counter when starting and stopping the + // cycle timer. We just pin the current thread to a cpu in the previous + // affinity set. + ThreadAffinityGuard affinity_guard; + + static constexpr double estimate_time_s = 1.0; + const double start_time = ChronoClockNow(); const auto start_ticks = cycleclock::Now(); - SleepForMilliseconds(estimate_time_ms); - return static_cast(cycleclock::Now() - start_ticks); + + // Impose load instead of calling sleep() to make sure the cycle counter + // works. + using PRNG = std::minstd_rand; + using Result = PRNG::result_type; + PRNG rng(static_cast(start_ticks)); + + Result state = 0; + + do { + static constexpr size_t batch_size = 10000; + rng.discard(batch_size); + state += rng(); + + } while (ChronoClockNow() - start_time < estimate_time_s); + + DoNotOptimize(state); + + const auto end_ticks = cycleclock::Now(); + const double end_time = ChronoClockNow(); + + return static_cast(end_ticks - start_ticks) / (end_time - start_time); + // Reset the affinity of current thread when the lifetime of affinity_guard + // ends. } std::vector GetLoadAvg() { diff --git a/src/timers.cc b/src/timers.cc index 379d97dd..89ddbfb0 100644 --- a/src/timers.cc +++ b/src/timers.cc @@ -59,7 +59,6 @@ #include "check.h" #include "log.h" -#include "sleep.h" #include "string_util.h" namespace benchmark {