deviceBlockOperations.hpp
Go to the documentation of this file.
1/*
2 Copyright 2024 SINTEF AS
3
4 This file is part of the Open Porous Media project (OPM).
5
6 OPM is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 OPM is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with OPM. If not, see <http://www.gnu.org/licenses/>.
18*/
19
20#ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
21#define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
22
23#include <config.h>
24#include <cuda_runtime.h>
25
26/*
27 This file provides inlineable functions intended for CUDA kernels operating on block matrix elements
28 The functions provides various matrix operations that are used by the preconditioners.
29*/
30
31namespace
32{
33// TODO: figure out if this can be generalized effectively, this seems excessively verbose
34// explicit formulas based on Dune cpu code
35template <class T, int blocksize>
36__device__ __forceinline__ void
37invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
38{
39 if (blocksize == 1) {
40 dstBlock[0] = 1.0 / (srcBlock[0]);
41 } else if (blocksize == 2) {
42 T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
43 dstBlock[0] = srcBlock[3] * detInv;
44 dstBlock[1] = -srcBlock[1] * detInv;
45 dstBlock[2] = -srcBlock[2] * detInv;
46 dstBlock[3] = srcBlock[0] * detInv;
47 } else if (blocksize == 3) {
48 // based on Dune implementation
49 T t4 = srcBlock[0] * srcBlock[4];
50 T t6 = srcBlock[0] * srcBlock[5];
51 T t8 = srcBlock[1] * srcBlock[3];
52 T t10 = srcBlock[2] * srcBlock[3];
53 T t12 = srcBlock[1] * srcBlock[6];
54 T t14 = srcBlock[2] * srcBlock[6];
55
56 T t17 = 1.0
57 / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
58 - t14 * srcBlock[4]); // t17 is 1/determinant
59
60 dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
61 dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
62 dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
63 dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
64 dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
65 dstBlock[5] = -(t6 - t10) * t17;
66 dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
67 dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
68 dstBlock[8] = (t4 - t8) * t17;
69 }
70}
71
72// explicit formulas based on Dune cpu code
73template <class T, int blocksize>
74__device__ __forceinline__ void
75invBlockInPlace(T* __restrict__ block)
76{
77 if (blocksize == 1) {
78 block[0] = 1.0 / (block[0]);
79 } else if (blocksize == 2) {
80 T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
81
82 T temp = block[0];
83 block[0] = block[3] * detInv;
84 block[1] = -block[1] * detInv;
85 block[2] = -block[2] * detInv;
86 block[3] = temp * detInv;
87 } else if (blocksize == 3) {
88 T t4 = block[0] * block[4];
89 T t6 = block[0] * block[5];
90 T t8 = block[1] * block[3];
91 T t10 = block[2] * block[3];
92 T t12 = block[1] * block[6];
93 T t14 = block[2] * block[6];
94
95 T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
96 T t17 = T(1.0) / det;
97
98 T matrix01 = block[1];
99 T matrix00 = block[0];
100 T matrix10 = block[3];
101 T matrix11 = block[4];
102
103 block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
104 block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
105 block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
106 block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
107 block[4] = (matrix00 * block[8] - t14) * t17;
108 block[5] = -(t6 - t10) * t17;
109 block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
110 block[7] = -(matrix00 * block[7] - t12) * t17;
111 block[8] = (t4 - t8) * t17;
112 }
113}
114
115enum class MVType { SET, PLUS, MINUS };
116// SET: c = A*b
117// PLS: c += A*b
118// MINUS: c -= A*b
119template <class T, int blocksize, MVType OP>
120__device__ __forceinline__ void
121matrixVectorProductWithAction(const T* A, const T* b, T* c)
122{
123 for (int i = 0; i < blocksize; ++i) {
124 if (OP == MVType::SET) {
125 c[i] = 0;
126 }
127
128 for (int j = 0; j < blocksize; ++j) {
129 if (OP == MVType::SET || OP == MVType::PLUS) {
130 c[i] += A[i * blocksize + j] * b[j];
131 } else if (OP == MVType::MINUS) {
132 c[i] -= A[i * blocksize + j] * b[j];
133 }
134 }
135 }
136}
137
138template <class T, int blocksize>
139__device__ __forceinline__ void
140mv(const T* a, const T* b, T* c)
141{
142 matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
143}
144
145template <class T, int blocksize>
146__device__ __forceinline__ void
147umv(const T* a, const T* b, T* c)
148{
149 matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
150}
151
152template <class T, int blocksize>
153__device__ __forceinline__ void
154mmv(const T* a, const T* b, T* c)
155{
156 matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
157}
158
159// dst -= A*B*C
160template <class T, int blocksize>
161__device__ __forceinline__ void
162mmx2Subtraction(T* A, T* B, T* C, T* dst)
163{
164
165 T tmp[blocksize * blocksize] = {0};
166 // tmp = A*B
167 for (int i = 0; i < blocksize; ++i) {
168 for (int k = 0; k < blocksize; ++k) {
169 for (int j = 0; j < blocksize; ++j) {
170 tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
171 }
172 }
173 }
174
175 // dst = tmp*C
176 for (int i = 0; i < blocksize; ++i) {
177 for (int k = 0; k < blocksize; ++k) {
178 for (int j = 0; j < blocksize; ++j) {
179 dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
180 }
181 }
182 }
183}
184
185// C = A*B, assumes the three buffers do not overlap
186template <class T, int blocksize>
187__device__ __forceinline__ void
188mmNoOverlap(T* A, T* B, T* C)
189{
190 for (int i = 0; i < blocksize; ++i) {
191 for (int k = 0; k < blocksize; ++k) {
192 for (int j = 0; j < blocksize; ++j) {
193 C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
194 }
195 }
196 }
197}
198
199// C = A*B, buffers may overlap
200template <class T, int blocksize>
201__device__ __forceinline__ void
202mmOverlap(T* A, T* B, T* C)
203{
204 T tmp[blocksize * blocksize] = {0};
205 for (int i = 0; i < blocksize; ++i) {
206 for (int k = 0; k < blocksize; ++k) {
207 for (int j = 0; j < blocksize; ++j) {
208 tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
209 }
210 }
211 }
212
213 for (int i = 0; i < blocksize; ++i) {
214 for (int j = 0; j < blocksize; ++j) {
215 C[i * blocksize + j] = tmp[i * blocksize + j];
216 }
217 }
218}
219
220// A -= B
221template <class T, int blocksize>
222__device__ __forceinline__ void
223matrixSubtraction(T* A, T* B)
224{
225
226 for (int i = 0; i < blocksize; ++i) {
227 for (int j = 0; j < blocksize; ++j) {
228 A[i * blocksize + j] -= B[i * blocksize + j];
229 }
230 }
231}
232} // namespace
233
234#endif