opm-simulators
deviceBlockOperations.hpp
1 /*
2  Copyright 2024 SINTEF AS
3 
4  This file is part of the Open Porous Media project (OPM).
5 
6  OPM is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  OPM is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with OPM. If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
21 #define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
22 
23 #include <config.h>
24 #include <cuda_runtime.h>
25 
26 #include <cassert>
27 
28 /*
29  This file provides inlineable functions intended for CUDA kernels operating on block matrix elements
30  The functions provides various matrix operations that are used by the preconditioners.
31 */
32 
33 namespace
34 {
35 // TODO: figure out if this can be generalized effectively, this seems excessively verbose
36 // explicit formulas based on Dune cpu code
37 template <class T, int blocksize>
38 __device__ __forceinline__ void
39 invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
40 {
41  if (blocksize == 1) {
42  dstBlock[0] = T(1.0) / (srcBlock[0]);
43  } else if (blocksize == 2) {
44  T detInv = T(1.0) / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
45  dstBlock[0] = srcBlock[3] * detInv;
46  dstBlock[1] = -srcBlock[1] * detInv;
47  dstBlock[2] = -srcBlock[2] * detInv;
48  dstBlock[3] = srcBlock[0] * detInv;
49  } else if (blocksize == 3) {
50  // based on Dune implementation
51  T t4 = srcBlock[0] * srcBlock[4];
52  T t6 = srcBlock[0] * srcBlock[5];
53  T t8 = srcBlock[1] * srcBlock[3];
54  T t10 = srcBlock[2] * srcBlock[3];
55  T t12 = srcBlock[1] * srcBlock[6];
56  T t14 = srcBlock[2] * srcBlock[6];
57 
58  T t17 = T(1.0)
59  / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
60  - t14 * srcBlock[4]); // t17 is 1/determinant
61 
62  dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
63  dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
64  dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
65  dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
66  dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
67  dstBlock[5] = -(t6 - t10) * t17;
68  dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
69  dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
70  dstBlock[8] = (t4 - t8) * t17;
71  }
72 }
73 
74 // explicit formulas based on Dune cpu code
75 template <class T, int blocksize>
76 __device__ __forceinline__ void
77 invBlockInPlace(T* __restrict__ block)
78 {
79  if (blocksize == 1) {
80  block[0] = T(1.0) / (block[0]);
81  } else if (blocksize == 2) {
82  T detInv = T(1.0) / (block[0] * block[3] - block[1] * block[2]);
83 
84  T temp = block[0];
85  block[0] = block[3] * detInv;
86  block[1] = -block[1] * detInv;
87  block[2] = -block[2] * detInv;
88  block[3] = temp * detInv;
89  } else if (blocksize == 3) {
90  T t4 = block[0] * block[4];
91  T t6 = block[0] * block[5];
92  T t8 = block[1] * block[3];
93  T t10 = block[2] * block[3];
94  T t12 = block[1] * block[6];
95  T t14 = block[2] * block[6];
96 
97  T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
98  T t17 = T(1.0) / det;
99 
100  T matrix01 = block[1];
101  T matrix00 = block[0];
102  T matrix10 = block[3];
103  T matrix11 = block[4];
104 
105  block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
106  block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
107  block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
108  block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
109  block[4] = (matrix00 * block[8] - t14) * t17;
110  block[5] = -(t6 - t10) * t17;
111  block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
112  block[7] = -(matrix00 * block[7] - t12) * t17;
113  block[8] = (t4 - t8) * t17;
114  }
115 }
116 
117 enum class MVType { SET, PLUS, MINUS };
118 // SET: c = A*b
119 // PLS: c += A*b
120 // MINUS: c -= A*b
121 template <class T, int blocksize, MVType OP>
122 __device__ __forceinline__ void
123 matrixVectorProductWithAction(const T* A, const T* b, T* c)
124 {
125  for (int i = 0; i < blocksize; ++i) {
126  if (OP == MVType::SET) {
127  c[i] = 0;
128  }
129 
130  for (int j = 0; j < blocksize; ++j) {
131  if (OP == MVType::SET || OP == MVType::PLUS) {
132  c[i] += A[i * blocksize + j] * b[j];
133  } else if (OP == MVType::MINUS) {
134  c[i] -= A[i * blocksize + j] * b[j];
135  }
136  }
137  }
138 }
139 
140 template <class T, int blocksize>
141 __device__ __forceinline__ void
142 mv(const T* a, const T* b, T* c)
143 {
144  matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
145 }
146 
147 template <class T, int blocksize>
148 __device__ __forceinline__ void
149 umv(const T* a, const T* b, T* c)
150 {
151  matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
152 }
153 
154 template <class T, int blocksize>
155 __device__ __forceinline__ void
156 mmv(const T* a, const T* b, T* c)
157 {
158  matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
159 }
160 
161 // dst -= A*B*C
162 template <class T, int blocksize>
163 __device__ __forceinline__ void
164 mmx2Subtraction(const T* A, const T* B, const T* C, T* dst)
165 {
166 
167  T tmp[blocksize * blocksize] = {0};
168  // tmp = A*B
169  for (int i = 0; i < blocksize; ++i) {
170  for (int k = 0; k < blocksize; ++k) {
171  for (int j = 0; j < blocksize; ++j) {
172  tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
173  }
174  }
175  }
176 
177  // dst = tmp*C
178  for (int i = 0; i < blocksize; ++i) {
179  for (int k = 0; k < blocksize; ++k) {
180  for (int j = 0; j < blocksize; ++j) {
181  dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
182  }
183  }
184  }
185 }
186 
187 // C = A*B, assumes the three buffers do not overlap
188 template <class T, int blocksize>
189 __device__ __forceinline__ void
190 mmNoOverlap(T* A, T* B, T* C)
191 {
192  for (int i = 0; i < blocksize; ++i) {
193  for (int k = 0; k < blocksize; ++k) {
194  for (int j = 0; j < blocksize; ++j) {
195  C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
196  }
197  }
198  }
199 }
200 
201 // C = A*B, buffers may overlap
202 template <class T, int blocksize>
203 __device__ __forceinline__ void
204 mmOverlap(T* A, T* B, T* C)
205 {
206  T tmp[blocksize * blocksize] = {0};
207  for (int i = 0; i < blocksize; ++i) {
208  for (int k = 0; k < blocksize; ++k) {
209  for (int j = 0; j < blocksize; ++j) {
210  tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
211  }
212  }
213  }
214 
215  for (int i = 0; i < blocksize; ++i) {
216  for (int j = 0; j < blocksize; ++j) {
217  C[i * blocksize + j] = tmp[i * blocksize + j];
218  }
219  }
220 }
221 
222 // A -= B
223 template <class T, int blocksize>
224 __device__ __forceinline__ void
225 matrixSubtraction(T* A, T* B)
226 {
227 
228  for (int i = 0; i < blocksize; ++i) {
229  for (int j = 0; j < blocksize; ++j) {
230  A[i * blocksize + j] -= B[i * blocksize + j];
231  }
232  }
233 }
234 
235 // B = A
236 template<int blocksize, class ScalarInputType, class ScalarOutputType>
237 __device__ __forceinline__ void
238 moveBlock(const ScalarInputType* __restrict__ A, ScalarOutputType* __restrict__ B){
239  for (int i = 0; i < blocksize; ++i){
240  for (int j = 0; j < blocksize; ++j){
241  B[i * blocksize + j] = ScalarOutputType(A[i * blocksize + j]);
242  }
243  }
244 }
245 
246 // TODO: consider merging with existing block operations
247 // mixed precision general version of c = Ab
248 template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
249 __device__ __forceinline__ void
250 mvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
251 {
252  for (int i = 0; i < blocksize; ++i) {
253  c[i] = 0;
254 
255  for (int j = 0; j < blocksize; ++j) {
256  c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
257  }
258  }
259 }
260 
261 // TODO: consider merging with existing block operations
262 // mixed precision general version of c += Ab
263 template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
264 __device__ __forceinline__ void
265 umvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
266 {
267  for (int i = 0; i < blocksize; ++i) {
268  for (int j = 0; j < blocksize; ++j) {
269  c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
270  }
271  }
272 }
273 
274 // TODO: consider merging with existing block operations
275 // Mixed precision general version of c -= Ab
276 template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
277 __device__ __forceinline__ void
278 mmvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
279 {
280  for (int i = 0; i < blocksize; ++i) {
281  for (int j = 0; j < blocksize; ++j) {
282  c[i] -= ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
283  }
284  }
285 }
286 
287 // Checks if a value is close to zero based on a precision limit
288 // Tolerance is 1e-40, to match matrixblock.hh
289 template <typename T>
290 __device__ __forceinline__ bool
291 isCloseToZero(const T value, const T limit = T(1e-40))
292 {
293  return abs(value) < limit;
294 }
295 
296 // Solve a linear system Ax=b for block sizes 1-3
297 template <typename T, int blockSize>
298 __device__ __forceinline__ bool
299 solveBlock(const T* A, const T* b, T* x)
300 {
301  if constexpr (blockSize == 1) {
302  if (isCloseToZero(A[0])) {
303  return false;
304  }
305  x[0] = b[0] / A[0];
306  return true;
307  }
308  else if constexpr (blockSize == 2) {
309  // Calculate determinant
310  T det = A[0] * A[3] - A[1] * A[2];
311 
312  if (isCloseToZero(det)) {
313  return false;
314  }
315 
316  T invDet = T(1.0) / det;
317 
318  // Compute solution using Cramer's rule
319  x[0] = (A[3] * b[0] - A[1] * b[1]) * invDet;
320  x[1] = (A[0] * b[1] - A[2] * b[0]) * invDet;
321 
322  return true;
323  }
324  else if constexpr (blockSize == 3) {
325  // Calculate determinant
326  T det = A[0] * (A[4] * A[8] - A[5] * A[7]) -
327  A[1] * (A[3] * A[8] - A[5] * A[6]) +
328  A[2] * (A[3] * A[7] - A[4] * A[6]);
329 
330  if (isCloseToZero(det)) {
331  return false;
332  }
333 
334  T invDet = T(1.0) / det;
335 
336  // Calculate cofactors for each element of b
337  x[0] = ((A[4] * A[8] - A[5] * A[7]) * b[0] +
338  (A[2] * A[7] - A[1] * A[8]) * b[1] +
339  (A[1] * A[5] - A[2] * A[4]) * b[2]) * invDet;
340 
341  x[1] = ((A[5] * A[6] - A[3] * A[8]) * b[0] +
342  (A[0] * A[8] - A[2] * A[6]) * b[1] +
343  (A[2] * A[3] - A[0] * A[5]) * b[2]) * invDet;
344 
345  x[2] = ((A[3] * A[7] - A[4] * A[6]) * b[0] +
346  (A[1] * A[6] - A[0] * A[7]) * b[1] +
347  (A[0] * A[4] - A[1] * A[3]) * b[2]) * invDet;
348 
349  return true;
350  }
351  else {
352  // Unsupported block size
353  return false;
354  }
355 }
356 
357 // Transpose a block matrix (row-major)
358 // Note: This function does NOT support in-place transposition (srcBlock == dstBlock)
359 // The source and destination blocks must be different memory locations
360 template <class T, int blockSize>
361 __device__ __forceinline__ void
362 transposeBlock(const T* srcBlock, T* dstBlock)
363 {
364  assert(srcBlock != dstBlock && "Source and destination blocks must be different");
365 
366  for (int i = 0; i < blockSize; ++i) {
367  for (int j = 0; j < blockSize; ++j) {
368  dstBlock[j * blockSize + i] = srcBlock[i * blockSize + j];
369  }
370  }
371 }
372 } // namespace
373 
374 #endif