15 #ifndef OPM_AUTOTUNER_HPP 16 #define OPM_AUTOTUNER_HPP 19 #include <cuda_runtime.h> 22 #include <opm/common/ErrorMacros.hpp> 23 #include <opm/common/OpmLog/OpmLog.hpp> 24 #include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp> 25 #include <opm/simulators/linalg/gpuistl/gpu_resources.hpp> 36 template <
typename func>
47 constexpr
const int runs = 2;
48 std::array<GPUEvent, runs+1> events;
51 float bestTime = std::numeric_limits<float>::max();
52 int bestBlockSize = -1;
56 for (
int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
59 OPM_GPU_SAFE_CALL(cudaEventRecord(events[0].
get()));
60 for (
int i = 0; i < runs; ++i) {
62 OPM_GPU_SAFE_CALL(cudaEventRecord(events[i + 1].
get()));
66 OPM_GPU_SAFE_CALL(cudaEventSynchronize(events[runs].
get()));
69 if (cudaSuccess == cudaGetLastError()) {
71 for (
int i = 0; i < runs; ++i) {
72 float candidateBlockSizeTime;
73 OPM_GPU_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i].
get(), events[i + 1].
get()));
74 if (candidateBlockSizeTime < bestTime) {
75 bestTime = candidateBlockSizeTime;
76 bestBlockSize = thrBlockSize;
83 fmt::format(
"[Kernel tuning completed] {}: Tuned Blocksize = {}, Fastest Runtime = {}ms.", descriptionOfFunction, bestBlockSize, bestTime));
int tuneThreadBlockSize(func &f, std::string descriptionOfFunction)
Function that tests the best thread block size, assumes the provided function depends on threadblock-...
Definition: autotuner.hpp:38
Contains wrappers to make the CuBLAS library behave as a modern C++ library with function overloading...
Definition: autotuner.hpp:29