GpuAwareMPISender.hpp
/*
  Copyright 2022-2023 SINTEF AS, 2025 Equinor ASA

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_GPUISTL_GPUAWAREMPISENDER_HPP
#define OPM_GPUISTL_GPUAWAREMPISENDER_HPP

#include <dune/istl/owneroverlapcopy.hh>

// Project headers providing GPUSender, GpuVector, the OPM error macros and
// detail::to_int; the exact include paths below are assumed.
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSender.hpp>
#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
#include <opm/simulators/linalg/gpuistl/detail/safe_conversion.hpp>

#include <fmt/format.h>

#include <mpi.h>

#include <cstddef>
#include <map>
#include <memory>
#include <mutex>
#include <vector>
namespace Opm::gpuistl
{

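/**
 * @brief Derived class of GPUSender that communicates using GPU-aware (e.g. CUDA-aware) MPI.
 *
 * copyOwnerToAll passes device pointers directly to MPI, so overlap data is exchanged
 * between GPU buffers without being staged on the host first.
 */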
template <class field_type, int block_size, class OwnerOverlapCopyCommunicationType>
class GPUAwareMPISender : public GPUSender<field_type, OwnerOverlapCopyCommunicationType>
{
public:
    // The GPU vector type operated on (the GpuVector alias defined in GPUSender).
    using X = typename GPUSender<field_type, OwnerOverlapCopyCommunicationType>::X;

    explicit GPUAwareMPISender(const OwnerOverlapCopyCommunicationType& cpuOwnerOverlapCopy)
        : GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
    {
    }

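    /**
     * @brief copyOwnerToAll copies the data in source to all processes.
     */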
    void copyOwnerToAll(const X& source, X& dest) const override
    {
        OPM_ERROR_IF(&source != &dest,
                     "The provided GpuVectors' addresses do not match"); // In this context, source and dest must be the same vector.
        std::call_once(this->m_initializedIndices, [&]() { initIndexSet(); });

        int rank = this->m_cpuOwnerOverlapCopy.communicator().rank();
        dest.prepareSendBuf(*m_GPUSendBuf, *m_commpairIndicesOwner);

        // Start MPI stuff here...
        // Note: This has been taken from DUNE's parallel/communicator.hh
        std::vector<MPI_Request> sendRequests(m_messageInformation.size());
        std::vector<MPI_Request> recvRequests(m_messageInformation.size());
        std::vector<int> processMap(m_messageInformation.size());
        size_t numberOfRealRecvRequests = 0;

        using const_iterator = typename InformationMap::const_iterator;
        const const_iterator end = m_messageInformation.end();

        {
            size_t i = 0;
            for (const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
                processMap[i] = info->first;
                if (info->second.second.m_size) {
                    MPI_Irecv(m_GPURecvBuf->data() + info->second.second.m_start,
                              detail::to_int(info->second.second.m_size),
                              MPI_BYTE,
                              info->first,
                              m_commTag,
                              this->m_cpuOwnerOverlapCopy.communicator(),
                              &recvRequests[i]);
                    numberOfRealRecvRequests += 1;
                } else {
                    recvRequests[i] = MPI_REQUEST_NULL;
                }
            }
        }

        {
            size_t i = 0;
            for (const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
                if (info->second.first.m_size) {
                    MPI_Issend(m_GPUSendBuf->data() + info->second.first.m_start,
                               detail::to_int(info->second.first.m_size),
                               MPI_BYTE,
                               info->first,
                               m_commTag,
                               this->m_cpuOwnerOverlapCopy.communicator(),
                               &sendRequests[i]);
                } else {
                    sendRequests[i] = MPI_REQUEST_NULL;
                }
            }
        }
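        // Wait for the posted receives; MPI_Waitany reports which request completed,
        // so a failure can be attributed to the rank that sent the message.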
        int finished = MPI_UNDEFINED;
        MPI_Status status;
        for (size_t i = 0; i < numberOfRealRecvRequests; i++) {
            status.MPI_ERROR = MPI_SUCCESS;
            MPI_Waitany(m_messageInformation.size(), recvRequests.data(), &finished, &status);

            if (status.MPI_ERROR != MPI_SUCCESS) {
                OPM_THROW(std::runtime_error,
                          fmt::format("MPI_Error occurred while rank {} received a message from rank {}",
                                      rank,
                                      processMap[finished]));
            }
        }
        // Wait for the posted sends to complete as well.
        MPI_Status sendStatus;
        for (size_t i = 0; i < m_messageInformation.size(); i++) {
            if (MPI_SUCCESS != MPI_Wait(&sendRequests[i], &sendStatus)) {
                OPM_THROW(std::runtime_error,
                          fmt::format("MPI_Error occurred while rank {} sent a message to rank {}",
                                      rank,
                                      processMap[i]));
            }
        }
        // ...End of MPI stuff

        dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpairIndicesCopy);
    }

private:
    mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesCopy;
    mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesOwner;
    mutable std::unique_ptr<GpuVector<field_type>> m_GPUSendBuf;
    mutable std::unique_ptr<GpuVector<field_type>> m_GPURecvBuf;

    struct MessageInformation {
        MessageInformation()
            : m_start(0)
            , m_size(0)
        {
        }
        MessageInformation(size_t start, size_t size)
            : m_start(start)
            , m_size(size)
        {
        }
        size_t m_start; // offset in elements of "field_type"
        size_t m_size; // size in bytes
    };

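    // Per-rank message layout: the key is the remote rank, the pair holds the
    // send-message and receive-message descriptions, in that order.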
    using InformationMap = std::map<int, std::pair<MessageInformation, MessageInformation>>;
    mutable InformationMap m_messageInformation;
    using IM = std::map<int, std::pair<std::vector<int>, std::vector<int>>>;
    mutable IM m_im;

    constexpr static int m_commTag = 0; // So says DUNE

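    // Builds the per-rank message layout (m_messageInformation) and the GPU index
    // lists used to pack the send buffer and unpack the receive buffer.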
    void buildCommPairIdxs() const
    {
        const auto& ri = this->m_cpuOwnerOverlapCopy.remoteIndices();
        std::vector<int> commpairIndicesCopyOnCPU;
        std::vector<int> commpairIndicesOwnerCPU;

        for (const auto& process : ri) {
            m_im[process.first] = std::pair(std::vector<int>(), std::vector<int>());
            // Pass 0 gathers the indices received from this rank, pass 1 the indices
            // sent to it; attribute value 1 corresponds to the owner attribute.
            for (int send = 0; send < 2; ++send) {
                auto remoteEnd = send ? process.second.first->end() : process.second.second->end();
                auto remote = send ? process.second.first->begin() : process.second.second->begin();

                while (remote != remoteEnd) {
                    if (send ? (remote->localIndexPair().local().attribute() == 1) : (remote->attribute() == 1)) {
                        if (send) {
                            m_im[process.first].first.push_back(remote->localIndexPair().local().local());
                        } else {
                            m_im[process.first].second.push_back(remote->localIndexPair().local().local());
                        }
                    }
                    ++remote;
                }
            }
        }

        int sendBufIdx = 0;
        int recvBufIdx = 0;
        for (auto it = m_im.begin(); it != m_im.end(); it++) {
            int noSend = it->second.first.size();
            int noRecv = it->second.second.size();

            if (noSend + noRecv > 0) {
                m_messageInformation.insert(std::make_pair(
                    it->first,
                    std::make_pair(
                        MessageInformation(sendBufIdx * block_size, noSend * block_size * sizeof(field_type)),
                        MessageInformation(recvBufIdx * block_size, noRecv * block_size * sizeof(field_type)))));

                for (int x = 0; x < noSend; x++) {
                    for (int bs = 0; bs < block_size; bs++) {
                        commpairIndicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
                    }
                }
                for (int x = 0; x < noRecv; x++) {
                    for (int bs = 0; bs < block_size; bs++) {
                        commpairIndicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
                    }
                }
                sendBufIdx += noSend;
                recvBufIdx += noRecv;
            }
        }

        m_commpairIndicesCopy = std::make_unique<GpuVector<int>>(commpairIndicesCopyOnCPU);
        m_commpairIndicesOwner = std::make_unique<GpuVector<int>>(commpairIndicesOwnerCPU);

        m_GPUSendBuf = std::make_unique<GpuVector<field_type>>(sendBufIdx * block_size);
        m_GPURecvBuf = std::make_unique<GpuVector<field_type>>(recvBufIdx * block_size);
    }

    void initIndexSet() const override
    {
        // We need the indices that we will use in the project, dot and norm calls.
        // TODO: [premature perf] Can this be run once per instance? Or do we need to rebuild every time?
        const auto& pis = this->m_cpuOwnerOverlapCopy.indexSet();
        std::vector<int> indicesCopyOnCPU;
        std::vector<int> indicesOwnerCPU;
        for (const auto& index : pis) {
            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
                for (int component = 0; component < block_size; ++component) {
                    indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
                }
            }

            if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
                for (int component = 0; component < block_size; ++component) {
                    indicesOwnerCPU.push_back(index.local().local() * block_size + component);
                }
            }
        }

        this->m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
        this->m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);

        buildCommPairIdxs();
    }
};

} // namespace Opm::gpuistl
#endif // OPM_GPUISTL_GPUAWAREMPISENDER_HPP