autotuner.hpp
Go to the documentation of this file.
1/*
2 Copyright 2024 SINTEF AS
3 This file is part of the Open Porous Media project (OPM).
4 OPM is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8 OPM is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12 You should have received a copy of the GNU General Public License
13 along with OPM. If not, see <http://www.gnu.org/licenses/>.
14*/
15#ifndef OPM_AUTOTUNER_HPP
16#define OPM_AUTOTUNER_HPP
17
18#include <cuda.h>
19#include <cuda_runtime.h>
20#include <functional>
21#include <limits>
22#include <opm/common/ErrorMacros.hpp>
23#include <opm/common/OpmLog/OpmLog.hpp>
26#include <string>
27#include <utility>
28
30{
31
35template <typename func>
36int
37tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
38{
39 // This threadblock-tuner is very simple, it tests all valid block sizes divisble by 64
40 // 64 is chosen so it is a multiple of the AMD wavefront size.
41 // The maximum size of a threadblock is 1024, so an exhaustive search here will not be expensive
42 // We time the kernel with each possible threadblock-size, and return the one
43 // that gave the fastest invidivual run.
44
45 // TODO: figure out a more rigorous way of deciding how many runs will suffice?
46 constexpr const int runs = 2;
47 std::array<GPUEvent, runs+1> events;
48
49 // Initialize helper variables
50 float bestTime = std::numeric_limits<float>::max();
51 int bestBlockSize = -1;
52 int interval = 64;
53
54 // try each possible blocksize
55 for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
56
57 // record a first event, and then an event after each kernel
58 OPM_GPU_SAFE_CALL(cudaEventRecord(events[0].get()));
59 for (int i = 0; i < runs; ++i) {
60 f(thrBlockSize); // runs an arbitrary function with the provided arguments
61 OPM_GPU_SAFE_CALL(cudaEventRecord(events[i + 1].get()));
62 }
63
64 // make sure the runs are over
65 OPM_GPU_SAFE_CALL(cudaEventSynchronize(events[runs].get()));
66
67 // kernel launch was valid
68 if (cudaSuccess == cudaGetLastError()) {
69 // check if we beat the record for the fastest kernel
70 for (int i = 0; i < runs; ++i) {
71 float candidateBlockSizeTime;
72 OPM_GPU_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i].get(), events[i + 1].get()));
73 if (candidateBlockSizeTime < bestTime) { // checks if this configuration beat the current best
74 bestTime = candidateBlockSizeTime;
75 bestBlockSize = thrBlockSize;
76 }
77 }
78 }
79 }
80
81 OpmLog::info(
82 fmt::format("[Kernel tuning completed] {}: Tuned Blocksize = {}, Fastest Runtime = {}ms.", descriptionOfFunction, bestBlockSize, bestTime));
83
84 return bestBlockSize;
85}
86
87} // end namespace Opm::gpuistl::detail
88
89#endif
#define OPM_GPU_SAFE_CALL(expression)
OPM_GPU_SAFE_CALL checks the return type of the GPU expression (function call) and throws an exceptio...
Definition: gpu_safe_call.hpp:150
Definition: autotuner.hpp:30
int tuneThreadBlockSize(func &f, std::string descriptionOfFunction)
Function that tests the best thread block size, assumes the provided function depends on threadblock-...
Definition: autotuner.hpp:37