autotuner.hpp
Go to the documentation of this file.
1/*
2 Copyright 2024 SINTEF AS
3 This file is part of the Open Porous Media project (OPM).
4 OPM is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8 OPM is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12 You should have received a copy of the GNU General Public License
13 along with OPM. If not, see <http://www.gnu.org/licenses/>.
14*/
15#ifndef OPM_AUTOTUNER_HPP
16#define OPM_AUTOTUNER_HPP
17
18#include <cuda.h>
19#include <cuda_runtime.h>
20#include <functional>
21#include <limits>
22#include <opm/common/ErrorMacros.hpp>
23#include <opm/common/OpmLog/OpmLog.hpp>
26#include <string>
27#include <utility>
28
30{
31
36template <typename func>
37int
38tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
39{
40 // This threadblock-tuner is very simple, it tests all valid block sizes divisble by 64
41 // 64 is chosen so it is a multiple of the AMD wavefront size.
42 // The maximum size of a threadblock is 1024, so an exhaustive search here will not be expensive
43 // We time the kernel with each possible threadblock-size, and return the one
44 // that gave the fastest invidivual run.
45
46 // TODO: figure out a more rigorous way of deciding how many runs will suffice?
47 constexpr const int runs = 2;
48 std::array<GPUEvent, runs+1> events;
49
50 // Initialize helper variables
51 float bestTime = std::numeric_limits<float>::max();
52 int bestBlockSize = -1;
53 int interval = 64;
54
55 // try each possible blocksize
56 for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
57
58 // record a first event, and then an event after each kernel
59 OPM_GPU_SAFE_CALL(cudaEventRecord(events[0].get()));
60 for (int i = 0; i < runs; ++i) {
61 f(thrBlockSize); // runs an arbitrary function with the provided arguments
62 OPM_GPU_SAFE_CALL(cudaEventRecord(events[i + 1].get()));
63 }
64
65 // make sure the runs are over
66 OPM_GPU_SAFE_CALL(cudaEventSynchronize(events[runs].get()));
67
68 // kernel launch was valid
69 if (cudaSuccess == cudaGetLastError()) {
70 // check if we beat the record for the fastest kernel
71 for (int i = 0; i < runs; ++i) {
72 float candidateBlockSizeTime;
73 OPM_GPU_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i].get(), events[i + 1].get()));
74 if (candidateBlockSizeTime < bestTime) { // checks if this configuration beat the current best
75 bestTime = candidateBlockSizeTime;
76 bestBlockSize = thrBlockSize;
77 }
78 }
79 }
80 }
81
82 OpmLog::info(
83 fmt::format("[Kernel tuning completed] {}: Tuned Blocksize = {}, Fastest Runtime = {}ms.", descriptionOfFunction, bestBlockSize, bestTime));
84
85 return bestBlockSize;
86}
87
88} // end namespace Opm::gpuistl::detail
89
90#endif
#define OPM_GPU_SAFE_CALL(expression)
OPM_GPU_SAFE_CALL checks the return type of the GPU expression (function call) and throws an exception...
Definition: gpu_safe_call.hpp:150
Definition: autotuner.hpp:30
int tuneThreadBlockSize(func &f, std::string descriptionOfFunction)
Function that tests the best thread block size, assumes the provided function depends on threadblock-...
Definition: autotuner.hpp:38