autotuner.hpp
Go to the documentation of this file.
1/*
2 Copyright 2024 SINTEF AS
3 This file is part of the Open Porous Media project (OPM).
4 OPM is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8 OPM is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12 You should have received a copy of the GNU General Public License
13 along with OPM. If not, see <http://www.gnu.org/licenses/>.
14*/
15#ifndef OPM_AUTOTUNER_HPP
16#define OPM_AUTOTUNER_HPP
17
18#include <cuda.h>
19#include <cuda_runtime.h>
20#include <functional>
21#include <limits>
22#include <opm/common/ErrorMacros.hpp>
23#include <opm/common/OpmLog/OpmLog.hpp>
25#include <string>
26#include <utility>
27
29{
30
34template <typename func>
35int
36tuneThreadBlockSize(func& f, std::string descriptionOfFunction)
37{
38 // This threadblock-tuner is very simple, it tests all valid block sizes divisble by 64
39 // 64 is chosen so it is a multiple of the AMD wavefront size.
40 // The maximum size of a threadblock is 1024, so an exhaustive search here will not be expensive
41 // We time the kernel with each possible threadblock-size, and return the one
42 // that gave the fastest invidivual run.
43
44 // TODO: figure out a more rigorous way of deciding how many runs will suffice?
45 constexpr const int runs = 2;
46 cudaEvent_t events[runs + 1];
47
48 // create the events
49 for (int i = 0; i < runs + 1; ++i) {
50 OPM_GPU_SAFE_CALL(cudaEventCreate(&events[i]));
51 }
52
53 // Initialize helper variables
54 float bestTime = std::numeric_limits<float>::max();
55 int bestBlockSize = -1;
56 int interval = 64;
57
58 // try each possible blocksize
59 for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
60
61 // record a first event, and then an event after each kernel
62 OPM_GPU_SAFE_CALL(cudaEventRecord(events[0]));
63 for (int i = 0; i < runs; ++i) {
64 f(thrBlockSize); // runs an arbitrary function with the provided arguments
65 OPM_GPU_SAFE_CALL(cudaEventRecord(events[i + 1]));
66 }
67
68 // make suret he runs are over
69 OPM_GPU_SAFE_CALL(cudaEventSynchronize(events[runs]));
70
71 // kernel launch was valid
72 if (cudaSuccess == cudaGetLastError()) {
73 // check if we beat the record for the fastest kernel
74 for (int i = 0; i < runs; ++i) {
75 float candidateBlockSizeTime;
76 OPM_GPU_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i + 1]));
77 if (candidateBlockSizeTime < bestTime) { // checks if this configuration beat the current best
78 bestTime = candidateBlockSizeTime;
79 bestBlockSize = thrBlockSize;
80 }
81 }
82 }
83 }
84
85 OpmLog::info(
86 fmt::format("{}: Tuned Blocksize: {} (fastest runtime: {}).", descriptionOfFunction, bestBlockSize, bestTime));
87
88 return bestBlockSize;
89}
90
91} // end namespace Opm::gpuistl::detail
92
93#endif
#define OPM_GPU_SAFE_CALL(expression)
OPM_GPU_SAFE_CALL checks the return type of the GPU expression (function call) and throws an exceptio...
Definition: gpu_safe_call.hpp:150
Definition: autotuner.hpp:29
int tuneThreadBlockSize(func &f, std::string descriptionOfFunction)
Function that tests the best thread block size, assumes the provided function depends on threadblock-...
Definition: autotuner.hpp:36