CuOwnerOverlapCopy.hpp
1/*
2 Copyright 2022-2023 SINTEF AS
3
4 This file is part of the Open Porous Media project (OPM).
5
6 OPM is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 OPM is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with OPM. If not, see <http://www.gnu.org/licenses/>.
18*/
19#ifndef OPM_CUISTL_CUOWNEROVERLAPCOPY_HPP
20#define OPM_CUISTL_CUOWNEROVERLAPCOPY_HPP
21#include <dune/istl/owneroverlapcopy.hh>
22#include <memory>
23#include <mutex>
24#include <opm/simulators/linalg/cuistl/CuVector.hpp>
25#include <vector>
26
27namespace Opm::cuistl
28{
36template<class field_type, class OwnerOverlapCopyCommunicationType>
37class GPUSender {
38public:
39 using X = CuVector<field_type>;
40
41 GPUSender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy) : m_cpuOwnerOverlapCopy(cpuOwnerOverlapCopy){}
42
49 virtual void copyOwnerToAll(const X& source, X& dest) const = 0;
50 virtual void initIndexSet() const = 0;
51
59 void project(X& x) const
60 {
61 std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
62 x.setZeroAtIndexSet(*m_indicesCopy);
63 }
64
72 void dot(const X& x, const X& y, field_type& output) const
73 {
74 std::call_once(m_initializedIndices, [&]() { initIndexSet(); });
75
76 const auto dotAtRank = x.dot(y, *m_indicesOwner);
77 output = m_cpuOwnerOverlapCopy.communicator().sum(dotAtRank);
78 }
79
86 field_type norm(const X& x) const
87 {
88 auto xDotX = field_type(0);
89 dot(x, x, xDotX);
90
91 // using std::sqrt;
92 return std::sqrt(xDotX);
93 }
94
95protected:
96 // Used to ensure initIndexSet() is called at most once. Note that this is something of a
97 // premature optimization: we could simply always initialize these indices,
98 // but they are not always needed.
99 mutable std::once_flag m_initializedIndices;
100 mutable std::unique_ptr<CuVector<int>> m_indicesOwner;
101 mutable std::unique_ptr<CuVector<int>> m_indicesCopy;
102 const OwnerOverlapCopyCommunicationType& m_cpuOwnerOverlapCopy;
103};
104
112template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
113class GPUObliviousMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
114{
115public:
116 using X = CuVector<field_type>;
117
118 GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
119 : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
120 {
121 }
122
123 void copyOwnerToAll(const X& source, X& dest) const override {
124 // TODO: [perf] Can we reduce copying from the GPU here?
125 // TODO: [perf] Maybe create a global buffer instead?
126 auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
127 auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
128 this->m_cpuOwnerOverlapCopy.copyOwnerToAll(sourceAsDuneVector, destAsDuneVector);
129 dest.copyFromHost(destAsDuneVector);
130 }
131
132private:
133 void initIndexSet() const override
134 {
135 // We need indices that we will use in the project, dot and norm calls.
136 // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
137 const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
138 std::vector<int> indicesCopyOnCPU;
139 std::vector<int> indicesOwnerCPU;
140 for (const auto& index : pis) {
141 if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
142 for (int component = 0; component < block_size; ++component) {
143 indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
144 }
145 }
146
147 if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
148 for (int component = 0; component < block_size; ++component) {
149 indicesOwnerCPU.push_back(index.local().local() * block_size + component);
150 }
151 }
152 }
153
154 this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
155 this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
156 }
157};
158
167template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
168class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
169{
170public:
171 using X = CuVector<field_type>;
172
173 GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
174 : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
175 {
176 }
177
178 void copyOwnerToAll(const X& source, X& dest) const override
179 {
180
181 OPM_ERROR_IF(&source != &dest, "The provided CuVectors' addresses did not match"); // In this context, source and dest must be the same vector!
182 std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });
183
184 int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
185 dest.prepareSendBuf(*m_GPUSendBuf, *m_commpairIndicesOwner);
186
187 // Start MPI stuff here...
188 // Note: This has been taken from DUNE's parallel/communicator.hh
189 std::vector<MPI_Request> sendRequests(m_messageInformation.size());
190 std::vector<MPI_Request> recvRequests(m_messageInformation.size());
191 std::vector<int> processMap(m_messageInformation.size());
192 size_t numberOfRealRecvRequests = 0;
193
194 using const_iterator = typename InformationMap::const_iterator;
195 const const_iterator end = m_messageInformation.end();
196
197 {
198 size_t i = 0;
199 for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
200 processMap[i]=info->first;
201 if(info->second.second.m_size) {
202 MPI_Irecv(m_GPURecvBuf->data()+info->second.second.m_start,
203 detail::to_int(info->second.second.m_size),
204 MPI_BYTE,
205 info->first,
206 m_commTag,
207 this->m_cpuOwnerOverlapCopy.communicator(),
208 &recvRequests[i]);
209 numberOfRealRecvRequests += 1;
210 } else {
211 recvRequests[i]=MPI_REQUEST_NULL;
212 }
213 }
214 }
215
216 {
217 size_t i = 0;
218 for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
219 if(info->second.first.m_size) {
220 MPI_Issend(m_GPUSendBuf->data()+info->second.first.m_start,
221 detail::to_int(info->second.first.m_size),
222 MPI_BYTE,
223 info->first,
224 m_commTag,
225 this->m_cpuOwnerOverlapCopy.communicator(),
226 &sendRequests[i]);
227 } else {
228 sendRequests[i]=MPI_REQUEST_NULL;
229 }
230 }
231 }
232 int finished = MPI_UNDEFINED;
233 MPI_Status status;
234 for(size_t i = 0; i < numberOfRealRecvRequests; i++) {
235 status.MPI_ERROR=MPI_SUCCESS;
236 MPI_Waitany(m_messageInformation.size(), recvRequests.data(), &finished, &status);
237
238 if(status.MPI_ERROR!=MPI_SUCCESS) {
239 OPM_THROW(std::runtime_error, fmt::format("MPI_Error occurred while rank {} received a message from rank {}", rank, processMap[finished]));
240 }
241 }
242 MPI_Status recvStatus;
243 for(size_t i = 0; i < m_messageInformation.size(); i++) {
244 if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
245 OPM_THROW(std::runtime_error, fmt::format("MPI_Error occurred while rank {} sent a message to rank {}", rank, processMap[i]));
246 }
247 }
248 // ...End of MPI stuff
249
250 dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpairIndicesCopy);
251 }
252
253private:
254 mutable std::unique_ptr<CuVector<int>> m_commpairIndicesCopy;
255 mutable std::unique_ptr<CuVector<int>> m_commpairIndicesOwner;
256 mutable std::unique_ptr<CuVector<field_type>> m_GPUSendBuf;
257 mutable std::unique_ptr<CuVector<field_type>> m_GPURecvBuf;
258
259 struct MessageInformation
260 {
261 MessageInformation() : m_start(0), m_size(0) {}
262 MessageInformation(size_t start, size_t size) : m_start(start), m_size(size) {}
263 size_t m_start; // offset in elements of "field_type"
264 size_t m_size; // size in bytes
265 };
266
267 using InformationMap = std::map<int,std::pair<MessageInformation,MessageInformation> >;
268 mutable InformationMap m_messageInformation;
269 using IM = std::map<int,std::pair<std::vector<int>,std::vector<int> > >;
270 mutable IM m_im;
271
272 constexpr static int m_commTag = 0; // So says DUNE
273
274 void buildCommPairIdxs() const
275 {
276 auto &ri = this->m_cpuOwnerOverlapCopy.remoteIndices();
277 std::vector<int> commpairIndicesCopyOnCPU;
278 std::vector<int> commpairIndicesOwnerCPU;
279
280 for(auto process : ri) {
281 int size = 0;
282 m_im[process.first] = std::pair(std::vector<int>(), std::vector<int>());
283 for(int send = 0; send < 2; ++send) {
284 auto remoteEnd = send ? process.second.first->end()
285 : process.second.second->end();
286 auto remote = send ? process.second.first->begin()
287 : process.second.second->begin();
288
289 while(remote != remoteEnd) {
290 if (send ? (remote->localIndexPair().local().attribute() == 1)
291 : (remote->attribute() == 1)) {
292 ++size;
293 if (send) {
294 m_im[process.first].first.push_back(remote->localIndexPair().local().local());
295 } else {
296 m_im[process.first].second.push_back(remote->localIndexPair().local().local());
297 }
298 }
299 ++remote;
300 }
301 }
302 }
303
304 int sendBufIdx = 0;
305 int recvBufIdx = 0;
306 for (auto it = m_im.begin(); it != m_im.end(); it++) {
307 int noSend = it->second.first.size();
308 int noRecv = it->second.second.size();
309
310 if (noSend + noRecv > 0) {
311 m_messageInformation.insert(
312 std::make_pair(it->first,
313 std::make_pair(MessageInformation(
314 sendBufIdx * block_size,
315 noSend * block_size * sizeof(field_type)),
316 MessageInformation(
317 recvBufIdx * block_size,
318 noRecv * block_size * sizeof(field_type)))));
319
320 for(int x = 0; x < noSend; x++) {
321 for(int bs = 0; bs < block_size; bs++) {
322 commpairIndicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
323 }
324 }
325 for(int x = 0; x < noRecv; x++) {
326 for(int bs = 0; bs < block_size; bs++) {
327 commpairIndicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
328 }
329 }
330 sendBufIdx += noSend;
331 recvBufIdx += noRecv;
332 }
333 }
334
335 m_commpairIndicesCopy = std::make_unique<CuVector<int>>(commpairIndicesCopyOnCPU);
336 m_commpairIndicesOwner = std::make_unique<CuVector<int>>(commpairIndicesOwnerCPU);
337
338 m_GPUSendBuf = std::make_unique<CuVector<field_type>>(sendBufIdx * block_size);
339 m_GPURecvBuf = std::make_unique<CuVector<field_type>>(recvBufIdx * block_size);
340 }
341
342 void initIndexSet() const override
343 {
344 // We need indices that we will use in the project, dot and norm calls.
345 // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
346 const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
347 std::vector<int> indicesCopyOnCPU;
348 std::vector<int> indicesOwnerCPU;
349 for (const auto& index : pis) {
350 if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
351 for (int component = 0; component < block_size; ++component) {
352 indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
353 }
354 }
355
356 if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
357 for (int component = 0; component < block_size; ++component) {
358 indicesOwnerCPU.push_back(index.local().local() * block_size + component);
359 }
360 }
361 }
362
363 this->m_indicesCopy = std::make_unique<CuVector<int>>(indicesCopyOnCPU);
364 this->m_indicesOwner = std::make_unique<CuVector<int>>(indicesOwnerCPU);
365
366 buildCommPairIdxs();
367 }
368};
369
382template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
383class CuOwnerOverlapCopy
384{
385public:
386 using X = CuVector<field_type>;
387
388 CuOwnerOverlapCopy(std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> sender) : m_sender(sender){}
389
390 void copyOwnerToAll(const X& source, X& dest) const {
391 m_sender->copyOwnerToAll(source, dest);
392 }
393
394 void dot(const X& x, const X& y, field_type& output) const
395 {
396 m_sender->dot(x, y, output);
397 }
398
399 field_type norm(const X& x) const
400 {
401 return m_sender->norm(x);
402 }
403
404 void project(X& x) const
405 {
406 m_sender->project(x);
407 }
408
409private:
410 std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> m_sender;
411};
412} // namespace Opm::cuistl
413#endif
CuOwnerOverlapCopy
CUDA compatible variant of Dune::OwnerOverlapCopyCommunication.
Definition: CuOwnerOverlapCopy.hpp:384
GPUSender
GPUSender is a wrapper class for classes which will implement copyOwnerToAll. This is implemented with ...
Definition: CuOwnerOverlapCopy.hpp:37
GPUSender::copyOwnerToAll
copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToAll ...
GPUSender::project
project will project x to the owned subspace.
Definition: CuOwnerOverlapCopy.hpp:59
GPUSender::dot
dot will carry out the dot product between x and y on the owned indices, then sum up the result across ...
Definition: CuOwnerOverlapCopy.hpp:72
GPUSender::norm
norm computes the l^2-norm of x across processes.
Definition: CuOwnerOverlapCopy.hpp:86
GPUObliviousMPISender
Derived class of GPUSender that handles MPI calls that should NOT use GPU direct communication. ...
Definition: CuOwnerOverlapCopy.hpp:114
GPUAwareMPISender
Derived class of GPUSender that handles MPI made with CUDA aware MPI. The copyOwnerToAll function uses ...
Definition: CuOwnerOverlapCopy.hpp:169
detail::to_int
to_int converts a (on most relevant platforms) 64-bit unsigned size_t to a signed 32-bit int.
Definition: safe_conversion.hpp:51
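For orientation, the sketch below shows how the classes in this header might be wired together on top of an existing Dune::OwnerOverlapCopyCommunication. It is a minimal usage sketch, not part of the header: the field type double, the block size 1, the Comm alias, the function name exampleUsage and the include paths are illustrative assumptions.

// Minimal usage sketch (assumed types and paths; not part of CuOwnerOverlapCopy.hpp).
#include <memory>

#include <dune/istl/owneroverlapcopy.hh>

#include <opm/simulators/linalg/cuistl/CuOwnerOverlapCopy.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>

using Comm = Dune::OwnerOverlapCopyCommunication<int, int>;

void exampleUsage(const Comm& cpuComm, const Opm::cuistl::CuVector<double>& x)
{
    // Choose a sender: GPUObliviousMPISender stages data through the CPU,
    // while GPUAwareMPISender hands GPU buffers directly to MPI (requires GPU-aware MPI).
    auto sender = std::make_shared<Opm::cuistl::GPUObliviousMPISender<double, 1, Comm>>(cpuComm);

    // CuOwnerOverlapCopy forwards dot, norm, project and copyOwnerToAll to the chosen sender.
    Opm::cuistl::CuOwnerOverlapCopy<double, 1, Comm> gpuComm(sender);

    double xDotX = 0.0;
    gpuComm.dot(x, x, xDotX);             // dot product over owned indices, summed across ranks
    const double xNorm = gpuComm.norm(x); // square root of the global dot product above
    (void)xDotX;
    (void)xNorm;
}

The choice between the two senders is a configuration decision: the GPU-oblivious variant works with any MPI installation at the cost of extra host/device copies, while the GPU-aware variant avoids those copies but assumes the MPI library can read and write device memory.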