opm-simulators
autotuner.hpp
1 /*
2  Copyright 2024 SINTEF AS
3  This file is part of the Open Porous Media project (OPM).
4  OPM is free software: you can redistribute it and/or modify
5  it under the terms of the GNU General Public License as published by
6  the Free Software Foundation, either version 3 of the License, or
7  (at your option) any later version.
8  OPM is distributed in the hope that it will be useful,
9  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  GNU General Public License for more details.
12  You should have received a copy of the GNU General Public License
13  along with OPM. If not, see <http://www.gnu.org/licenses/>.
14 */
15 #ifndef OPM_AUTOTUNER_HPP
16 #define OPM_AUTOTUNER_HPP
17 
18 #include <cuda.h>
19 #include <cuda_runtime.h>
20 #include <functional>
21 #include <limits>
22 #include <opm/common/ErrorMacros.hpp>
23 #include <opm/common/OpmLog/OpmLog.hpp>
24 #include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
25 #include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
26 #include <string>
27 #include <utility>
28 
30 {
31 
36 template <typename func>
37 int
38 tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
39 {
40  // This threadblock-tuner is very simple, it tests all valid block sizes divisble by 64
41  // 64 is chosen so it is a multiple of the AMD wavefront size.
42  // The maximum size of a threadblock is 1024, so an exhaustive search here will not be expensive
43  // We time the kernel with each possible threadblock-size, and return the one
44  // that gave the fastest invidivual run.
45 
46  // TODO: figure out a more rigorous way of deciding how many runs will suffice?
47  constexpr const int runs = 2;
48  std::array<GPUEvent, runs+1> events;
49 
50  // Initialize helper variables
51  float bestTime = std::numeric_limits<float>::max();
52  int bestBlockSize = -1;
53  int interval = 64;
54 
55  // try each possible blocksize
56  for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
57 
58  // record a first event, and then an event after each kernel
59  OPM_GPU_SAFE_CALL(cudaEventRecord(events[0].get()));
60  for (int i = 0; i < runs; ++i) {
61  f(thrBlockSize); // runs an arbitrary function with the provided arguments
62  OPM_GPU_SAFE_CALL(cudaEventRecord(events[i + 1].get()));
63  }
64 
65  // make sure the runs are over
66  OPM_GPU_SAFE_CALL(cudaEventSynchronize(events[runs].get()));
67 
68  // kernel launch was valid
69  if (cudaSuccess == cudaGetLastError()) {
70  // check if we beat the record for the fastest kernel
71  for (int i = 0; i < runs; ++i) {
72  float candidateBlockSizeTime;
73  OPM_GPU_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i].get(), events[i + 1].get()));
74  if (candidateBlockSizeTime < bestTime) { // checks if this configuration beat the current best
75  bestTime = candidateBlockSizeTime;
76  bestBlockSize = thrBlockSize;
77  }
78  }
79  }
80  }
81 
82  OpmLog::info(
83  fmt::format("[Kernel tuning completed] {}: Tuned Blocksize = {}, Fastest Runtime = {}ms.", descriptionOfFunction, bestBlockSize, bestTime));
84 
85  return bestBlockSize;
86 }
87 
88 } // end namespace Opm::gpuistl::detail
89 
90 #endif
int tuneThreadBlockSize(func &f, std::string descriptionOfFunction)
Function that tests the best thread block size, assumes the provided function depends on threadblock-...
Definition: autotuner.hpp:38
Contains wrappers to make the CuBLAS library behave as a modern C++ library with function overloading...
Definition: autotuner.hpp:29