GpuOwnerOverlapCopy.hpp
/*
  Copyright 2022-2023 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_GPUISTL_GPUOWNEROVERLAPCOPY_HPP
#define OPM_GPUISTL_GPUOWNEROVERLAPCOPY_HPP

#include <dune/istl/owneroverlapcopy.hh>

#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>

#include <mpi.h>

#include <map>
#include <memory>
#include <mutex>
#include <vector>

namespace Opm::gpuistl {

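/**
 * @brief GPUSender is a wrapper class for classes which will implement copyOwnerToAll.
 *
 * Derived classes decide whether the halo exchange is done with GPU-aware MPI
 * (GPUAwareMPISender) or by staging the data through the CPU
 * (GPUObliviousMPISender).
 */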
template<class field_type, class OwnerOverlapCopyCommunicationType>
class GPUSender {
public:
    using X = GpuVector<field_type>;

    GPUSender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
        : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy)
    {}

    virtual ~GPUSender() = default;

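    /**
     * @brief copyOwnerToAll communicates the values of the owner entries in
     * source to the corresponding copy/overlap entries of dest on the
     * neighbouring ranks.
     */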
    virtual void copyOwnerToAll(const X& source, X& dest) const = 0;
    virtual void initIndexSet() const = 0;

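    /**
     * @brief project will project x to the owned subspace, i.e. zero out all
     * entries at the copy/overlap indices.
     */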
    void project(X& x) const
    {
        std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
        x.setZeroAtIndexSet(*m_indicesCopy);
    }

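    /**
     * @brief dot will carry out the dot product between x and y on the owned
     * indices, then sum the result across all MPI ranks.
     */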
    void dot(const X& x, const X& y, field_type& output) const
    {
        std::call_once(m_initializedIndices, [&]() { initIndexSet(); });

        const auto dotAtRank = x.dot(y, *m_indicesOwner);
        output = m_cpuOwnerOverlapCopy.communicator().sum(dotAtRank);
    }

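    /**
     * @brief norm computes the l^2-norm of x across processes.
     */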
    field_type norm(const X& x) const
    {
        auto xDotX = field_type(0);
        dot(x, x, xDotX);

        // using std::sqrt;
        return std::sqrt(xDotX);
    }

protected:
    // Used to call initIndexSet only once. Note that this is kind of a premature
    // optimization, in the sense that we could just always initialize these
    // indices, but they are not always used.
    mutable std::once_flag m_initializedIndices;
    mutable std::unique_ptr<GpuVector<int>> m_indicesOwner;
    mutable std::unique_ptr<GpuVector<int>> m_indicesCopy;
    const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
};

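/**
 * @brief Derived class of GPUSender for MPI installations that should NOT use
 * GPU-direct communication.
 *
 * copyOwnerToAll copies the data to the CPU, performs the exchange with
 * OwnerOverlapCopyCommunicationType::copyOwnerToAll, and copies the result
 * back to the GPU.
 */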
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
    using X = GpuVector<field_type>;

    explicit GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
        : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
    {
    }

    void copyOwnerToAll(const X& source, X& dest) const override
    {
        // TODO: [perf] Can we reduce copying from the GPU here?
        // TODO: [perf] Maybe create a global buffer instead?
        auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
        auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
        this->m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
        dest.copyFromHost(destAsDuneVector);
    }

private:
    void initIndexSet() const override
    {
        // We need the indices that we will use in the project, dot and norm calls.
        // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
        const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
        std::vector<int> indicesCopyOnCPU;
        std::vector<int> indicesOwnerCPU;
        for (const auto& index : pis) {
            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
                for (int component = 0; component < block_size; ++component) {
                    indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
                }
            }

            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
                for (int component = 0; component < block_size; ++component) {
                    indicesOwnerCPU.push_back(index.local().local() * block_size + component);
                }
            }
        }

        this->m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
        this->m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);
    }
};

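/**
 * @brief Derived class of GPUSender that uses GPU-aware (CUDA-aware) MPI.
 *
 * copyOwnerToAll posts the MPI sends and receives directly on GPU buffers, so
 * the halo exchange avoids staging the data through host memory.
 */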
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
    using X = GpuVector<field_type>;

    explicit GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
        : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
    {
    }

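    // Performs the halo exchange directly on GPU memory: gathers the owner values
    // into m_GPUSendBuf, posts MPI_Irecv/MPI_Issend on the GPU buffers (requires a
    // GPU-aware MPI implementation), and scatters m_GPURecvBuf back into dest.
    // Note that source and dest must refer to the same vector.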
    void copyOwnerToAll(const X& source, X& dest) const override
    {
        OPM_ERROR_IF(&source != &dest, "The provided GpuVectors' addresses did not match"); // In this context, source == dest!
        std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });

        int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
        dest.prepareSendBuf(*m_GPUSendBuf, *m_commpairIndicesOwner);

        // Start MPI stuff here...
        // Note: This has been taken from DUNE's parallel/communicator.hh
        std::vector<MPI_Request> sendRequests(m_messageInformation.size());
        std::vector<MPI_Request> recvRequests(m_messageInformation.size());
        std::vector<int> processMap(m_messageInformation.size());
        size_t numberOfRealRecvRequests = 0;

        using const_iterator = typename InformationMap::const_iterator;
        const const_iterator end = m_messageInformation.end();

        {
            size_t i = 0;
            for (const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
                processMap[i] = info->first;
                if (info->second.second.m_size) {
                    MPI_Irecv(m_GPURecvBuf->data() + info->second.second.m_start,
                              detail::to_int(info->second.second.m_size),
                              MPI_BYTE,
                              info->first,
                              m_commTag,
                              this->m_cpuOwnerOverlapCopy.communicator(),
                              &recvRequests[i]);
                    numberOfRealRecvRequests += 1;
                } else {
                    recvRequests[i] = MPI_REQUEST_NULL;
                }
            }
        }

        {
            size_t i = 0;
            for (const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
                if (info->second.first.m_size) {
                    MPI_Issend(m_GPUSendBuf->data() + info->second.first.m_start,
                               detail::to_int(info->second.first.m_size),
                               MPI_BYTE,
                               info->first,
                               m_commTag,
                               this->m_cpuOwnerOverlapCopy.communicator(),
                               &sendRequests[i]);
                } else {
                    sendRequests[i] = MPI_REQUEST_NULL;
                }
            }
        }

        int finished = MPI_UNDEFINED;
        MPI_Status status;
        for (size_t i = 0; i < numberOfRealRecvRequests; i++) {
            status.MPI_ERROR = MPI_SUCCESS;
            MPI_Waitany(m_messageInformation.size(), recvRequests.data(), &finished, &status);

            if (status.MPI_ERROR != MPI_SUCCESS) {
                OPM_THROW(std::runtime_error,
                          fmt::format("MPI_Error occurred while rank {} received a message from rank {}",
                                      rank, processMap[finished]));
            }
        }

        MPI_Status sendStatus;
        for (size_t i = 0; i < m_messageInformation.size(); i++) {
            if (MPI_SUCCESS != MPI_Wait(&sendRequests[i], &sendStatus)) {
                OPM_THROW(std::runtime_error,
                          fmt::format("MPI_Error occurred while rank {} sent a message to rank {}",
                                      rank, processMap[i]));
            }
        }
        // ...End of MPI stuff

        dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpairIndicesCopy);
    }

private:
    mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesCopy;
    mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesOwner;
    mutable std::unique_ptr<GpuVector<field_type>> m_GPUSendBuf;
    mutable std::unique_ptr<GpuVector<field_type>> m_GPURecvBuf;

    struct MessageInformation
    {
        MessageInformation() : m_start(0), m_size(0) {}
        MessageInformation(size_t start, size_t size) : m_start(start), m_size(size) {}
        size_t m_start; // offset in elements of "field_type"
        size_t m_size; // size in bytes
    };

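    // m_messageInformation maps a neighbouring rank to a pair of (send, receive)
    // message descriptors; m_im maps the same rank to the local indices that are
    // sent to / received from it.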
    using InformationMap = std::map<int, std::pair<MessageInformation, MessageInformation>>;
    mutable InformationMap m_messageInformation;
    using IM = std::map<int, std::pair<std::vector<int>, std::vector<int>>>;
    mutable IM m_im;

    constexpr static int m_commTag = 0; // So says DUNE

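    // Walks the remote indices of the CPU communicator to collect, per neighbouring
    // rank, the local indices to gather into the send buffer and to scatter from the
    // receive buffer, records the per-rank message offsets and sizes in
    // m_messageInformation, and allocates the GPU send/receive buffers.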
    void buildCommPairIdxs() const
    {
        const auto& ri = this->m_cpuOwnerOverlapCopy.remoteIndices();
        std::vector<int> commpairIndicesCopyOnCPU;
        std::vector<int> commpairIndicesOwnerCPU;

        for (const auto& process : ri) {
            m_im[process.first] = std::pair(std::vector<int>(), std::vector<int>());
            for (int send = 0; send < 2; ++send) {
                auto remoteEnd = send ? process.second.first->end()
                                      : process.second.second->end();
                auto remote = send ? process.second.first->begin()
                                   : process.second.second->begin();

                while (remote != remoteEnd) {
                    if (send ? (remote->localIndexPair().local().attribute() == 1)
                             : (remote->attribute() == 1)) {
                        if (send) {
                            m_im[process.first].first.push_back(remote->localIndexPair().local().local());
                        } else {
                            m_im[process.first].second.push_back(remote->localIndexPair().local().local());
                        }
                    }
                    ++remote;
                }
            }
        }

        int sendBufIdx = 0;
        int recvBufIdx = 0;
        for (auto it = m_im.begin(); it != m_im.end(); it++) {
            int noSend = it->second.first.size();
            int noRecv = it->second.second.size();

            if (noSend + noRecv > 0) {
                m_messageInformation.insert(
                    std::make_pair(it->first,
                                   std::make_pair(MessageInformation(
                                                      sendBufIdx * block_size,
                                                      noSend * block_size * sizeof(field_type)),
                                                  MessageInformation(
                                                      recvBufIdx * block_size,
                                                      noRecv * block_size * sizeof(field_type)))));

                for (int x = 0; x < noSend; x++) {
                    for (int bs = 0; bs < block_size; bs++) {
                        commpairIndicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
                    }
                }
                for (int x = 0; x < noRecv; x++) {
                    for (int bs = 0; bs < block_size; bs++) {
                        commpairIndicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
                    }
                }
                sendBufIdx += noSend;
                recvBufIdx += noRecv;
            }
        }

        m_commpairIndicesCopy = std::make_unique<GpuVector<int>>(commpairIndicesCopyOnCPU);
        m_commpairIndicesOwner = std::make_unique<GpuVector<int>>(commpairIndicesOwnerCPU);

        m_GPUSendBuf = std::make_unique<GpuVector<field_type>>(sendBufIdx * block_size);
        m_GPURecvBuf = std::make_unique<GpuVector<field_type>>(recvBufIdx * block_size);
    }

    void initIndexSet() const override
    {
        // We need the indices that we will use in the project, dot and norm calls.
        // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
        const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
        std::vector<int> indicesCopyOnCPU;
        std::vector<int> indicesOwnerCPU;
        for (const auto& index : pis) {
            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
                for (int component = 0; component < block_size; ++component) {
                    indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
                }
            }

            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
                for (int component = 0; component < block_size; ++component) {
                    indicesOwnerCPU.push_back(index.local().local() * block_size + component);
                }
            }
        }

        this->m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
        this->m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);

        buildCommPairIdxs();
    }
};

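/**
 * @brief CUDA-compatible variant of Dune::OwnerOverlapCopyCommunication.
 *
 * All operations are forwarded to the GPUSender instance supplied at
 * construction, which decides whether the exchange uses GPU-aware MPI or is
 * staged through the CPU.
 */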
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GpuOwnerOverlapCopy
{
public:
    using X = GpuVector<field_type>;

    GpuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender)
        : m_sender(sender)
    {}

    void copyOwnerToAll(const X& source, X& dest) const
    {
        m_sender->copyOwnerToAll(source, dest);
    }

    void dot(const X& x, const X& y, field_type& output) const
    {
        m_sender->dot(x, y, output);
    }

    field_type norm(const X& x) const
    {
        return m_sender->norm(x);
    }

    void project(X& x) const
    {
        m_sender->project(x);
    }

private:
    std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> m_sender;
};

} // namespace Opm::gpuistl

#endif
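A minimal usage sketch (not part of the header): the communication object is built from an existing Dune::OwnerOverlapCopyCommunication, and the sender is chosen according to whether the MPI installation is GPU-aware. The names comm, x, y, useGpuAwareMpi, the block size of 3 and the include paths below are assumptions made for illustration.

#include <dune/istl/owneroverlapcopy.hh>
#include <memory>
#include <opm/simulators/linalg/gpuistl/GpuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>

using Comm = Dune::OwnerOverlapCopyCommunication<int, int>;
constexpr int blockSize = 3; // assumption: a 3-component block system

// Assumes comm, x and y are set up consistently on every rank, and that
// useGpuAwareMpi has been determined elsewhere (hypothetical flag).
double exampleUsage(const Comm& comm,
                    const Opm::gpuistl::GpuVector<double>& x,
                    Opm::gpuistl::GpuVector<double>& y,
                    bool useGpuAwareMpi)
{
    std::shared_ptr<Opm::gpuistl::GPUSender<double, Comm>> sender;
    if (useGpuAwareMpi) {
        sender = std::make_shared<Opm::gpuistl::GPUAwareMPISender<double, blockSize, Comm>>(comm);
    } else {
        sender = std::make_shared<Opm::gpuistl::GPUObliviousMPISender<double, blockSize, Comm>>(comm);
    }

    Opm::gpuistl::GpuOwnerOverlapCopy<double, blockSize, Comm> gpuComm(sender);

    double xDotY = 0.0;
    gpuComm.dot(x, y, xDotY);        // dot product on owner indices, summed over all ranks
    gpuComm.copyOwnerToAll(y, y);    // halo update; the GPU-aware sender requires source == dest
    return xDotY + gpuComm.norm(x);  // norm is the parallel l^2-norm
}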