19#ifndef OPM_GPUISTL_GPUOWNEROVERLAPCOPY_HPP
20#define OPM_GPUISTL_GPUOWNEROVERLAPCOPY_HPP
21#include <dune/istl/owneroverlapcopy.hh>
36template<
class field_type,
class OwnerOverlapCopyCommunicationType>
39 using X = GpuVector<field_type>;
72 void dot(
const X& x,
const X& y, field_type& output)
const
86 field_type
norm(
const X& x)
const
88 auto xDotX = field_type(0);
92 return std::sqrt(xDotX);
112template <
class field_type,
int block_size,
class OwnerOverlapCopyCommunicationType>
116 using X = GpuVector<field_type>;
119 :
GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
126 auto sourceAsDuneVector = source.template asDuneBlockVector<block_size>();
127 auto destAsDuneVector = dest.template asDuneBlockVector<block_size>();
129 dest.copyFromHost(destAsDuneVector);
133 void initIndexSet()
const override
138 std::vector<int> indicesCopyOnCPU;
139 std::vector<int> indicesOwnerCPU;
140 for (
const auto& index : pis) {
141 if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
142 for (
int component = 0; component < block_size; ++component) {
143 indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
147 if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
148 for (
int component = 0; component < block_size; ++component) {
149 indicesOwnerCPU.push_back(index.local().local() * block_size + component);
154 this->
m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
155 this->
m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);
167template <
class field_type,
int block_size,
class OwnerOverlapCopyCommunicationType>
171 using X = GpuVector<field_type>;
174 :
GPUSender<field_type, OwnerOverlapCopyCommunicationType>(cpuOwnerOverlapCopy)
181 OPM_ERROR_IF(&source != &dest,
"The provided GpuVectors' address did not match");
185 dest.prepareSendBuf(*m_GPUSendBuf, *m_commpairIndicesOwner);
189 std::vector<MPI_Request> sendRequests(m_messageInformation.size());
190 std::vector<MPI_Request> recvRequests(m_messageInformation.size());
191 std::vector<int> processMap(m_messageInformation.size());
192 size_t numberOfRealRecvRequests = 0;
194 using const_iterator =
typename InformationMap::const_iterator;
195 const const_iterator end = m_messageInformation.end();
199 for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
200 processMap[i]=info->first;
201 if(info->second.second.m_size) {
202 MPI_Irecv(m_GPURecvBuf->data()+info->second.second.m_start,
207 this->m_cpuOwnerOverlapCopy.communicator(),
209 numberOfRealRecvRequests += 1;
211 recvRequests[i]=MPI_REQUEST_NULL;
218 for(const_iterator info = m_messageInformation.begin(); info != end; ++info, ++i) {
219 if(info->second.first.m_size) {
220 MPI_Issend(m_GPUSendBuf->data()+info->second.first.m_start,
225 this->m_cpuOwnerOverlapCopy.communicator(),
228 sendRequests[i]=MPI_REQUEST_NULL;
232 int finished = MPI_UNDEFINED;
234 for(
size_t i = 0; i < numberOfRealRecvRequests; i++) {
235 status.MPI_ERROR=MPI_SUCCESS;
236 MPI_Waitany(m_messageInformation.size(), recvRequests.data(), &finished, &status);
238 if(status.MPI_ERROR!=MPI_SUCCESS) {
239 OPM_THROW(std::runtime_error, fmt::format(
"MPI_Error occurred while rank {} received a message from rank {}", rank, processMap[finished]));
242 MPI_Status recvStatus;
243 for(
size_t i = 0; i < m_messageInformation.size(); i++) {
244 if(MPI_SUCCESS!=MPI_Wait(&sendRequests[i], &recvStatus)) {
245 OPM_THROW(std::runtime_error, fmt::format(
"MPI_Error occurred while rank {} sent a message from rank {}", rank, processMap[finished]));
250 dest.syncFromRecvBuf(*m_GPURecvBuf, *m_commpairIndicesCopy);
254 mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesCopy;
255 mutable std::unique_ptr<GpuVector<int>> m_commpairIndicesOwner;
256 mutable std::unique_ptr<GpuVector<field_type>> m_GPUSendBuf;
257 mutable std::unique_ptr<GpuVector<field_type>> m_GPURecvBuf;
259 struct MessageInformation
261 MessageInformation() : m_start(0), m_size(0) {}
262 MessageInformation(
size_t start,
size_t size) : m_start(start), m_size(size) {}
267 using InformationMap = std::map<int,std::pair<MessageInformation,MessageInformation> >;
268 mutable InformationMap m_messageInformation;
269 using IM = std::map<int,std::pair<std::vector<int>,std::vector<int> > >;
272 constexpr static int m_commTag = 0;
274 void buildCommPairIdxs()
const
277 std::vector<int> commpairIndicesCopyOnCPU;
278 std::vector<int> commpairIndicesOwnerCPU;
280 for(
auto process : ri) {
282 m_im[process.first] = std::pair(std::vector<int>(), std::vector<int>());
283 for(
int send = 0; send < 2; ++send) {
284 auto remoteEnd = send ? process.second.first->end()
285 : process.second.second->end();
286 auto remote = send ? process.second.first->begin()
287 : process.second.second->begin();
289 while(remote != remoteEnd) {
290 if (send ? (remote->localIndexPair().local().attribute() == 1)
291 : (remote->attribute() == 1)) {
294 m_im[process.first].first.push_back(remote->localIndexPair().local().local());
296 m_im[process.first].second.push_back(remote->localIndexPair().local().local());
306 for (
auto it = m_im.begin(); it != m_im.end(); it++) {
307 int noSend = it->second.first.size();
308 int noRecv = it->second.second.size();
310 if (noSend + noRecv > 0) {
311 m_messageInformation.insert(
312 std::make_pair(it->first,
313 std::make_pair(MessageInformation(
314 sendBufIdx * block_size,
315 noSend * block_size *
sizeof(field_type)),
317 recvBufIdx * block_size,
318 noRecv * block_size *
sizeof(field_type)))));
320 for(
int x = 0; x < noSend; x++) {
321 for(
int bs = 0; bs < block_size; bs++) {
322 commpairIndicesOwnerCPU.push_back(it->second.first[x] * block_size + bs);
325 for(
int x = 0; x < noRecv; x++) {
326 for(
int bs = 0; bs < block_size; bs++) {
327 commpairIndicesCopyOnCPU.push_back(it->second.second[x] * block_size + bs);
330 sendBufIdx += noSend;
331 recvBufIdx += noRecv;
335 m_commpairIndicesCopy = std::make_unique<GpuVector<int>>(commpairIndicesCopyOnCPU);
336 m_commpairIndicesOwner = std::make_unique<GpuVector<int>>(commpairIndicesOwnerCPU);
338 m_GPUSendBuf = std::make_unique<GpuVector<field_type>>(sendBufIdx * block_size);
339 m_GPURecvBuf = std::make_unique<GpuVector<field_type>>(recvBufIdx * block_size);
342 void initIndexSet()
const override
347 std::vector<int> indicesCopyOnCPU;
348 std::vector<int> indicesOwnerCPU;
349 for (
const auto& index : pis) {
350 if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::copy) {
351 for (
int component = 0; component < block_size; ++component) {
352 indicesCopyOnCPU.push_back(index.local().local() * block_size + component);
356 if (index.local().attribute() == Dune::OwnerOverlapCopyAttributeSet::owner) {
357 for (
int component = 0; component < block_size; ++component) {
358 indicesOwnerCPU.push_back(index.local().local() * block_size + component);
363 this->
m_indicesCopy = std::make_unique<GpuVector<int>>(indicesCopyOnCPU);
364 this->
m_indicesOwner = std::make_unique<GpuVector<int>>(indicesOwnerCPU);
382template <
class field_type,
int block_size,
class OwnerOverlapCopyCommunicationType>
386 using X = GpuVector<field_type>;
391 m_sender->copyOwnerToAll(source, dest);
394 void dot(
const X& x,
const X& y, field_type& output)
const
396 m_sender->dot(x, y, output);
401 return m_sender->norm(x);
406 m_sender->project(x);
410 std::shared_ptr<GPUSender<field_type, OwnerOverlapCopyCommunicationType>> m_sender;
Derived class of GPUSender that handles MPI calls made with CUDA-aware MPI. The copyOwnerToAll function uses ...
Definition: GpuOwnerOverlapCopy.hpp:169
void copyOwnerToAll(const X &source, X &dest) const override
copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToA...
Definition: GpuOwnerOverlapCopy.hpp:178
GPUAwareMPISender(const OwnerOverlapCopyCommunicationType &cpuOwnerOverlapCopy)
Definition: GpuOwnerOverlapCopy.hpp:173
Derived class of GPUSender that handles MPI calls that should NOT use GPU direct communication. The im...
Definition: GpuOwnerOverlapCopy.hpp:114
void copyOwnerToAll(const X &source, X &dest) const override
copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToA...
Definition: GpuOwnerOverlapCopy.hpp:123
GPUObliviousMPISender(const OwnerOverlapCopyCommunicationType &cpuOwnerOverlapCopy)
Definition: GpuOwnerOverlapCopy.hpp:118
GPUSender is a wrapper class for classes which will implement copyOwnerToAll. This is implemented with ...
Definition: GpuOwnerOverlapCopy.hpp:37
GpuVector< field_type > X
Definition: GpuOwnerOverlapCopy.hpp:39
std::unique_ptr< GpuVector< int > > m_indicesCopy
Definition: GpuOwnerOverlapCopy.hpp:101
const OwnerOverlapCopyCommunicationType & m_cpuOwnerOverlapCopy
Definition: GpuOwnerOverlapCopy.hpp:102
void project(X &x) const
project will project x to the owned subspace
Definition: GpuOwnerOverlapCopy.hpp:59
virtual void copyOwnerToAll(const X &source, X &dest) const =0
copyOwnerToAll will copy source to the CPU, then call OwnerOverlapCopyCommunicationType::copyOwnerToA...
std::unique_ptr< GpuVector< int > > m_indicesOwner
Definition: GpuOwnerOverlapCopy.hpp:100
virtual void initIndexSet() const =0
void dot(const X &x, const X &y, field_type &output) const
dot will carry out the dot product between x and y on the owned indices, then sum up the result acros...
Definition: GpuOwnerOverlapCopy.hpp:72
field_type norm(const X &x) const
norm computes the l^2-norm of x across processes.
Definition: GpuOwnerOverlapCopy.hpp:86
GPUSender(const OwnerOverlapCopyCommunicationType &cpuOwnerOverlapCopy)
Definition: GpuOwnerOverlapCopy.hpp:41
std::once_flag m_initializedIndices
Definition: GpuOwnerOverlapCopy.hpp:99
CUDA-compatible variant of Dune::OwnerOverlapCopyCommunication.
Definition: GpuOwnerOverlapCopy.hpp:384
GpuOwnerOverlapCopy(std::shared_ptr< GPUSender< field_type, OwnerOverlapCopyCommunicationType > > sender)
Definition: GpuOwnerOverlapCopy.hpp:388
void project(X &x) const
Definition: GpuOwnerOverlapCopy.hpp:404
void dot(const X &x, const X &y, field_type &output) const
Definition: GpuOwnerOverlapCopy.hpp:394
GpuVector< field_type > X
Definition: GpuOwnerOverlapCopy.hpp:386
field_type norm(const X &x) const
Definition: GpuOwnerOverlapCopy.hpp:399
void copyOwnerToAll(const X &source, X &dest) const
Definition: GpuOwnerOverlapCopy.hpp:390
int to_int(std::size_t s)
to_int converts a (on most relevant platforms) 64 bits unsigned size_t to a signed 32 bits signed int
Definition: safe_conversion.hpp:52
Definition: autotuner.hpp:29