deviceBlockOperations.hpp
Go to the documentation of this file.
1/*
2 Copyright 2024 SINTEF AS
3
4 This file is part of the Open Porous Media project (OPM).
5
6 OPM is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 OPM is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with OPM. If not, see <http://www.gnu.org/licenses/>.
18*/
19
20#ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
21#define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
22
23#include <config.h>
24#include <cuda_runtime.h>
25
26#include <cassert>
27
/*
  This file provides inlineable functions intended for CUDA kernels operating on block matrix elements.
  The functions provide various matrix operations that are used by the preconditioners.
*/
32
33namespace
34{
// TODO: figure out if this can be generalized effectively, this seems excessively verbose
// explicit formulas based on Dune cpu code
//
// Computes dstBlock = srcBlock^-1 for row-major blocks of size 1-3.
// The two buffers must not overlap (enforced by __restrict__).
// Branch selection is done with `if constexpr` (consistent with solveBlock below),
// so only the code for the instantiated blocksize is generated.
// NOTE(review): there is no determinant/zero check — a singular block yields
// inf/nan, and blocksize > 3 silently does nothing (both preserved from the
// original formulas).
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
    if constexpr (blocksize == 1) {
        dstBlock[0] = T(1.0) / (srcBlock[0]);
    } else if constexpr (blocksize == 2) {
        T detInv = T(1.0) / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
        dstBlock[0] = srcBlock[3] * detInv;
        dstBlock[1] = -srcBlock[1] * detInv;
        dstBlock[2] = -srcBlock[2] * detInv;
        dstBlock[3] = srcBlock[0] * detInv;
    } else if constexpr (blocksize == 3) {
        // based on Dune implementation
        T t4 = srcBlock[0] * srcBlock[4];
        T t6 = srcBlock[0] * srcBlock[5];
        T t8 = srcBlock[1] * srcBlock[3];
        T t10 = srcBlock[2] * srcBlock[3];
        T t12 = srcBlock[1] * srcBlock[6];
        T t14 = srcBlock[2] * srcBlock[6];

        T t17 = T(1.0)
            / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
               - t14 * srcBlock[4]); // t17 is 1/determinant

        dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
        dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
        dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
        dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
        dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
        dstBlock[5] = -(t6 - t10) * t17;
        dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
        dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
        dstBlock[8] = (t4 - t8) * t17;
    }
}
73
// explicit formulas based on Dune cpu code
//
// Inverts a row-major block of size 1-3 in place (block is overwritten with
// its inverse). Entries that are read after being overwritten are saved in
// the matrixXY temporaries first.
// Branch selection is done with `if constexpr` (consistent with solveBlock
// below), so only the code for the instantiated blocksize is generated.
// NOTE(review): no determinant/zero check — a singular block yields inf/nan,
// and blocksize > 3 silently does nothing (both preserved from the original).
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockInPlace(T* __restrict__ block)
{
    if constexpr (blocksize == 1) {
        block[0] = T(1.0) / (block[0]);
    } else if constexpr (blocksize == 2) {
        T detInv = T(1.0) / (block[0] * block[3] - block[1] * block[2]);

        // block[0] is overwritten before it is needed for block[3]
        T temp = block[0];
        block[0] = block[3] * detInv;
        block[1] = -block[1] * detInv;
        block[2] = -block[2] * detInv;
        block[3] = temp * detInv;
    } else if constexpr (blocksize == 3) {
        T t4 = block[0] * block[4];
        T t6 = block[0] * block[5];
        T t8 = block[1] * block[3];
        T t10 = block[2] * block[3];
        T t12 = block[1] * block[6];
        T t14 = block[2] * block[6];

        T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
        T t17 = T(1.0) / det; // 1/determinant

        // entries that are read after the in-place writes below clobber them
        T matrix01 = block[1];
        T matrix00 = block[0];
        T matrix10 = block[3];
        T matrix11 = block[4];

        block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
        block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
        block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
        block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
        block[4] = (matrix00 * block[8] - t14) * t17;
        block[5] = -(t6 - t10) * t17;
        block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
        block[7] = -(matrix00 * block[7] - t12) * t17;
        block[8] = (t4 - t8) * t17;
    }
}
116
// Tag selecting what matrixVectorProductWithAction does with the product A*b:
enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
//
// Computes c (op)= A*b for a blocksize x blocksize row-major block A and
// blocksize-length vectors b and c. The operation OP is a template parameter,
// so the dispatch below is resolved at compile time via `if constexpr` and the
// untaken branches generate no code.
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void
matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
    for (int i = 0; i < blocksize; ++i) {
        if constexpr (OP == MVType::SET) {
            // SET starts from zero, then accumulates exactly like PLUS
            c[i] = 0;
        }

        for (int j = 0; j < blocksize; ++j) {
            if constexpr (OP == MVType::SET || OP == MVType::PLUS) {
                c[i] += A[i * blocksize + j] * b[j];
            } else if constexpr (OP == MVType::MINUS) {
                c[i] -= A[i * blocksize + j] * b[j];
            }
        }
    }
}
139
// c = A*b (Dune-style name: matrix-vector product with assignment)
template <class T, int blocksize>
__device__ __forceinline__ void
mv(const T* matrix, const T* vector, T* result)
{
    matrixVectorProductWithAction<T, blocksize, MVType::SET>(matrix, vector, result);
}
146
// c += A*b (Dune-style name: "update" matrix-vector product)
template <class T, int blocksize>
__device__ __forceinline__ void
umv(const T* matrix, const T* vector, T* result)
{
    matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(matrix, vector, result);
}
153
// c -= A*b (Dune-style name: "minus" matrix-vector product)
template <class T, int blocksize>
__device__ __forceinline__ void
mmv(const T* matrix, const T* vector, T* result)
{
    matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(matrix, vector, result);
}
160
// dst -= A*B*C
// Evaluated left to right: the product A*B is formed in a local buffer first,
// then (A*B)*C is subtracted elementwise from dst. All blocks are row-major
// blocksize x blocksize.
template <class T, int blocksize>
__device__ __forceinline__ void
mmx2Subtraction(const T* A, const T* B, const T* C, T* dst)
{
    // prod = A*B
    T prod[blocksize * blocksize] = {0};
    for (int row = 0; row < blocksize; ++row) {
        for (int inner = 0; inner < blocksize; ++inner) {
            for (int col = 0; col < blocksize; ++col) {
                prod[row * blocksize + col] += A[row * blocksize + inner] * B[inner * blocksize + col];
            }
        }
    }

    // dst -= prod*C
    for (int row = 0; row < blocksize; ++row) {
        for (int inner = 0; inner < blocksize; ++inner) {
            for (int col = 0; col < blocksize; ++col) {
                dst[row * blocksize + col] -= prod[row * blocksize + inner] * C[inner * blocksize + col];
            }
        }
    }
}
186
// C = A*B, assumes the three buffers do not overlap (enforced by __restrict__).
// Fix: the previous version accumulated into C without clearing it first
// (effectively C += A*B), contradicting this documented contract and the
// assignment semantics of the sibling mmOverlap. C is now zeroed before the
// accumulation, so the result no longer depends on the caller pre-clearing C.
// A and B are read-only and const-qualified accordingly.
template <class T, int blocksize>
__device__ __forceinline__ void
mmNoOverlap(const T* __restrict__ A, const T* __restrict__ B, T* __restrict__ C)
{
    // start from zero so C holds exactly A*B afterwards
    for (int idx = 0; idx < blocksize * blocksize; ++idx) {
        C[idx] = 0;
    }

    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }
}
200
// C = A*B, buffers may overlap (e.g. C aliasing A or B).
// The product is accumulated into a local temporary so that A and B are fully
// read before C is written; A and B are read-only and const-qualified.
template <class T, int blocksize>
__device__ __forceinline__ void
mmOverlap(const T* A, const T* B, T* C)
{
    T tmp[blocksize * blocksize] = {0};
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }

    // write back only after the full product is computed
    for (int i = 0; i < blocksize; ++i) {
        for (int j = 0; j < blocksize; ++j) {
            C[i * blocksize + j] = tmp[i * blocksize + j];
        }
    }
}
221
// A -= B, elementwise over a blocksize x blocksize block.
// B is read-only and now const-qualified; the doubly-nested loop is flattened
// since both blocks are contiguous row-major storage (same element order).
template <class T, int blocksize>
__device__ __forceinline__ void
matrixSubtraction(T* A, const T* B)
{
    for (int idx = 0; idx < blocksize * blocksize; ++idx) {
        A[idx] -= B[idx];
    }
}
234
// B = A, converting each scalar from ScalarInputType to ScalarOutputType.
// Both blocks are contiguous row-major blocksize x blocksize storage and must
// not overlap (enforced by __restrict__), so a single flat loop suffices.
template <int blocksize, class ScalarInputType, class ScalarOutputType>
__device__ __forceinline__ void
moveBlock(const ScalarInputType* __restrict__ A, ScalarOutputType* __restrict__ B)
{
    for (int elem = 0; elem < blocksize * blocksize; ++elem) {
        B[elem] = ScalarOutputType(A[elem]);
    }
}
245
// TODO: consider merging with existing block operations
// Mixed precision general version of c = A*b: A is stored as MatrixScalar and
// b as VectorScalar; each product is evaluated in ComputeScalar and the result
// is accumulated into c as ResultScalar (cast applied per product, as before).
template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
__device__ __forceinline__ void
mvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
{
    for (int row = 0; row < blocksize; ++row) {
        c[row] = 0;

        for (int col = 0; col < blocksize; ++col) {
            c[row] += ResultScalar(ComputeScalar(A[row * blocksize + col]) * ComputeScalar(b[col]));
        }
    }
}
260
// TODO: consider merging with existing block operations
// Mixed precision general version of c += A*b: each product is evaluated in
// ComputeScalar and cast to ResultScalar before being added to c.
template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
__device__ __forceinline__ void
umvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
{
    for (int row = 0; row < blocksize; ++row) {
        for (int col = 0; col < blocksize; ++col) {
            c[row] += ResultScalar(ComputeScalar(A[row * blocksize + col]) * ComputeScalar(b[col]));
        }
    }
}
273
// TODO: consider merging with existing block operations
// Mixed precision general version of c -= A*b: each product is evaluated in
// ComputeScalar and cast to ResultScalar before being subtracted from c.
template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
__device__ __forceinline__ void
mmvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
{
    for (int row = 0; row < blocksize; ++row) {
        for (int col = 0; col < blocksize; ++col) {
            c[row] -= ResultScalar(ComputeScalar(A[row * blocksize + col]) * ComputeScalar(b[col]));
        }
    }
}
286
// Returns true when |value| is strictly below the given limit.
// The default tolerance of 1e-40 matches the singularity check in matrixblock.hh.
template <typename T>
__device__ __forceinline__ bool
isCloseToZero(const T value, const T limit = T(1e-40))
{
    const T magnitude = abs(value);
    return magnitude < limit;
}
295
// Solve a linear system Ax=b for block sizes 1-3
// A is a row-major blockSize x blockSize block; b and x are blockSize vectors.
// Returns true on success; returns false (leaving x untouched) when the
// determinant is close to zero (see isCloseToZero) or blockSize > 3.
// Solutions use Cramer's rule with explicitly written-out cofactors.
template <typename T, int blockSize>
__device__ __forceinline__ bool
solveBlock(const T* A, const T* b, T* x)
{
    if constexpr (blockSize == 1) {
        if (isCloseToZero(A[0])) {
            return false;
        }
        x[0] = b[0] / A[0];
        return true;
    }
    else if constexpr (blockSize == 2) {
        // Calculate determinant
        T det = A[0] * A[3] - A[1] * A[2];

        if (isCloseToZero(det)) {
            return false;
        }

        T invDet = T(1.0) / det;

        // Compute solution using Cramer's rule
        x[0] = (A[3] * b[0] - A[1] * b[1]) * invDet;
        x[1] = (A[0] * b[1] - A[2] * b[0]) * invDet;

        return true;
    }
    else if constexpr (blockSize == 3) {
        // Calculate determinant (cofactor expansion along the first row)
        T det = A[0] * (A[4] * A[8] - A[5] * A[7]) -
                A[1] * (A[3] * A[8] - A[5] * A[6]) +
                A[2] * (A[3] * A[7] - A[4] * A[6]);

        if (isCloseToZero(det)) {
            return false;
        }

        T invDet = T(1.0) / det;

        // Calculate cofactors for each element of b
        // (each x[i] is the dot product of row i of adj(A) with b, times 1/det)
        x[0] = ((A[4] * A[8] - A[5] * A[7]) * b[0] +
                (A[2] * A[7] - A[1] * A[8]) * b[1] +
                (A[1] * A[5] - A[2] * A[4]) * b[2]) * invDet;

        x[1] = ((A[5] * A[6] - A[3] * A[8]) * b[0] +
                (A[0] * A[8] - A[2] * A[6]) * b[1] +
                (A[2] * A[3] - A[0] * A[5]) * b[2]) * invDet;

        x[2] = ((A[3] * A[7] - A[4] * A[6]) * b[0] +
                (A[1] * A[6] - A[0] * A[7]) * b[1] +
                (A[0] * A[4] - A[1] * A[3]) * b[2]) * invDet;

        return true;
    }
    else {
        // Unsupported block size
        return false;
    }
}
356
// Writes the transpose of srcBlock into dstBlock (both row-major, blockSize x blockSize).
// Note: This function does NOT support in-place transposition (srcBlock == dstBlock)
// The source and destination blocks must be different memory locations
// (checked with a device assert).
template <class T, int blockSize>
__device__ __forceinline__ void
transposeBlock(const T* srcBlock, T* dstBlock)
{
    assert(srcBlock != dstBlock && "Source and destination blocks must be different");

    // iterate over the destination so writes are sequential
    for (int row = 0; row < blockSize; ++row) {
        for (int col = 0; col < blockSize; ++col) {
            dstBlock[row * blockSize + col] = srcBlock[col * blockSize + row];
        }
    }
}
372} // namespace
373
374#endif