20#ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
21#define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
24#include <cuda_runtime.h>
37template <
class T,
int blocksize>
38__device__ __forceinline__
void
39invBlockOutOfPlace(
const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
42 dstBlock[0] = T(1.0) / (srcBlock[0]);
43 }
else if (blocksize == 2) {
44 T detInv = T(1.0) / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
45 dstBlock[0] = srcBlock[3] * detInv;
46 dstBlock[1] = -srcBlock[1] * detInv;
47 dstBlock[2] = -srcBlock[2] * detInv;
48 dstBlock[3] = srcBlock[0] * detInv;
49 }
else if (blocksize == 3) {
51 T t4 = srcBlock[0] * srcBlock[4];
52 T t6 = srcBlock[0] * srcBlock[5];
53 T t8 = srcBlock[1] * srcBlock[3];
54 T t10 = srcBlock[2] * srcBlock[3];
55 T t12 = srcBlock[1] * srcBlock[6];
56 T t14 = srcBlock[2] * srcBlock[6];
59 / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
62 dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
63 dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
64 dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
65 dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
66 dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
67 dstBlock[5] = -(t6 - t10) * t17;
68 dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
69 dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
70 dstBlock[8] = (t4 - t8) * t17;
75template <
class T,
int blocksize>
76__device__ __forceinline__
void
77invBlockInPlace(T* __restrict__ block)
80 block[0] = T(1.0) / (block[0]);
81 }
else if (blocksize == 2) {
82 T detInv = T(1.0) / (block[0] * block[3] - block[1] * block[2]);
85 block[0] = block[3] * detInv;
86 block[1] = -block[1] * detInv;
87 block[2] = -block[2] * detInv;
88 block[3] = temp * detInv;
89 }
else if (blocksize == 3) {
90 T t4 = block[0] * block[4];
91 T t6 = block[0] * block[5];
92 T t8 = block[1] * block[3];
93 T t10 = block[2] * block[3];
94 T t12 = block[1] * block[6];
95 T t14 = block[2] * block[6];
97 T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
100 T matrix01 = block[1];
101 T matrix00 = block[0];
102 T matrix10 = block[3];
103 T matrix11 = block[4];
105 block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
106 block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
107 block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
108 block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
109 block[4] = (matrix00 * block[8] - t14) * t17;
110 block[5] = -(t6 - t10) * t17;
111 block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
112 block[7] = -(matrix00 * block[7] - t12) * t17;
113 block[8] = (t4 - t8) * t17;
117enum class MVType { SET, PLUS, MINUS };
121template <
class T,
int blocksize, MVType OP>
122__device__ __forceinline__
void
123matrixVectorProductWithAction(
const T* A,
const T* b, T* c)
125 for (
int i = 0; i < blocksize; ++i) {
126 if (OP == MVType::SET) {
130 for (
int j = 0; j < blocksize; ++j) {
131 if (OP == MVType::SET || OP == MVType::PLUS) {
132 c[i] += A[i * blocksize + j] * b[j];
133 }
else if (OP == MVType::MINUS) {
134 c[i] -= A[i * blocksize + j] * b[j];
140template <
class T,
int blocksize>
141__device__ __forceinline__
void
142mv(
const T* a,
const T* b, T* c)
144 matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
147template <
class T,
int blocksize>
148__device__ __forceinline__
void
149umv(
const T* a,
const T* b, T* c)
151 matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
154template <
class T,
int blocksize>
155__device__ __forceinline__
void
156mmv(
const T* a,
const T* b, T* c)
158 matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
162template <
class T,
int blocksize>
163__device__ __forceinline__
void
164mmx2Subtraction(
const T* A,
const T* B,
const T* C, T* dst)
167 T tmp[blocksize * blocksize] = {0};
169 for (
int i = 0; i < blocksize; ++i) {
170 for (
int k = 0; k < blocksize; ++k) {
171 for (
int j = 0; j < blocksize; ++j) {
172 tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
178 for (
int i = 0; i < blocksize; ++i) {
179 for (
int k = 0; k < blocksize; ++k) {
180 for (
int j = 0; j < blocksize; ++j) {
181 dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
188template <
class T,
int blocksize>
189__device__ __forceinline__
void
190mmNoOverlap(T* A, T* B, T* C)
192 for (
int i = 0; i < blocksize; ++i) {
193 for (
int k = 0; k < blocksize; ++k) {
194 for (
int j = 0; j < blocksize; ++j) {
195 C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
202template <
class T,
int blocksize>
203__device__ __forceinline__
void
204mmOverlap(T* A, T* B, T* C)
206 T tmp[blocksize * blocksize] = {0};
207 for (
int i = 0; i < blocksize; ++i) {
208 for (
int k = 0; k < blocksize; ++k) {
209 for (
int j = 0; j < blocksize; ++j) {
210 tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
215 for (
int i = 0; i < blocksize; ++i) {
216 for (
int j = 0; j < blocksize; ++j) {
217 C[i * blocksize + j] = tmp[i * blocksize + j];
223template <
class T,
int blocksize>
224__device__ __forceinline__
void
225matrixSubtraction(T* A, T* B)
228 for (
int i = 0; i < blocksize; ++i) {
229 for (
int j = 0; j < blocksize; ++j) {
230 A[i * blocksize + j] -= B[i * blocksize + j];
236template<
int blocksize,
class ScalarInputType,
class ScalarOutputType>
237__device__ __forceinline__
void
238moveBlock(
const ScalarInputType* __restrict__ A, ScalarOutputType* __restrict__ B){
239 for (
int i = 0; i < blocksize; ++i){
240 for (
int j = 0; j < blocksize; ++j){
241 B[i * blocksize + j] = ScalarOutputType(A[i * blocksize + j]);
248template <
int blocksize,
class MatrixScalar,
class VectorScalar,
class ResultScalar,
class ComputeScalar>
249__device__ __forceinline__
void
250mvMixedGeneral(
const MatrixScalar* A,
const VectorScalar* b, ResultScalar* c)
252 for (
int i = 0; i < blocksize; ++i) {
255 for (
int j = 0; j < blocksize; ++j) {
256 c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
263template <
int blocksize,
class MatrixScalar,
class VectorScalar,
class ResultScalar,
class ComputeScalar>
264__device__ __forceinline__
void
265umvMixedGeneral(
const MatrixScalar* A,
const VectorScalar* b, ResultScalar* c)
267 for (
int i = 0; i < blocksize; ++i) {
268 for (
int j = 0; j < blocksize; ++j) {
269 c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
276template <
int blocksize,
class MatrixScalar,
class VectorScalar,
class ResultScalar,
class ComputeScalar>
277__device__ __forceinline__
void
278mmvMixedGeneral(
const MatrixScalar* A,
const VectorScalar* b, ResultScalar* c)
280 for (
int i = 0; i < blocksize; ++i) {
281 for (
int j = 0; j < blocksize; ++j) {
282 c[i] -= ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
290__device__ __forceinline__
bool
291isCloseToZero(
const T value,
const T limit = T(1e-40))
293 return abs(value) < limit;
297template <
typename T,
int blockSize>
298__device__ __forceinline__
bool
299solveBlock(
const T* A,
const T* b, T* x)
301 if constexpr (blockSize == 1) {
302 if (isCloseToZero(A[0])) {
308 else if constexpr (blockSize == 2) {
310 T det = A[0] * A[3] - A[1] * A[2];
312 if (isCloseToZero(det)) {
316 T invDet = T(1.0) / det;
319 x[0] = (A[3] * b[0] - A[1] * b[1]) * invDet;
320 x[1] = (A[0] * b[1] - A[2] * b[0]) * invDet;
324 else if constexpr (blockSize == 3) {
326 T det = A[0] * (A[4] * A[8] - A[5] * A[7]) -
327 A[1] * (A[3] * A[8] - A[5] * A[6]) +
328 A[2] * (A[3] * A[7] - A[4] * A[6]);
330 if (isCloseToZero(det)) {
334 T invDet = T(1.0) / det;
337 x[0] = ((A[4] * A[8] - A[5] * A[7]) * b[0] +
338 (A[2] * A[7] - A[1] * A[8]) * b[1] +
339 (A[1] * A[5] - A[2] * A[4]) * b[2]) * invDet;
341 x[1] = ((A[5] * A[6] - A[3] * A[8]) * b[0] +
342 (A[0] * A[8] - A[2] * A[6]) * b[1] +
343 (A[2] * A[3] - A[0] * A[5]) * b[2]) * invDet;
345 x[2] = ((A[3] * A[7] - A[4] * A[6]) * b[0] +
346 (A[1] * A[6] - A[0] * A[7]) * b[1] +
347 (A[0] * A[4] - A[1] * A[3]) * b[2]) * invDet;
360template <
class T,
int blockSize>
361__device__ __forceinline__
void
362transposeBlock(
const T* srcBlock, T* dstBlock)
364 assert(srcBlock != dstBlock &&
"Source and destination blocks must be different");
366 for (
int i = 0; i < blockSize; ++i) {
367 for (
int j = 0; j < blockSize; ++j) {
368 dstBlock[j * blockSize + i] = srcBlock[i * blockSize + j];