-- math_utils.rg (forked from stanfordhpccenter/HTR-solver)
-- Copyright (c) "2019, by Stanford University
-- Developer: Mario Di Renzo
-- Affiliation: Center for Turbulence Research, Stanford University
-- URL: https://ctr.stanford.edu
-- Citation: Di Renzo, M., Lin, F., and Urzay, J. (2020).
-- HTR solver: An open-source exascale-oriented task-based
-- multi-GPU high-order code for hypersonic aerothermodynamics.
-- Computer Physics Communications 255, 107262"
-- All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions are met:
-- * Redistributions of source code must retain the above copyright
-- notice, this list of conditions and the following disclaimer.
-- * Redistributions in binary form must reproduce the above copyright
-- notice, this list of conditions and the following disclaimer in the
-- documentation and/or other materials provided with the distribution.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
-- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import "regent"
local Exports = {}
local floor = regentlib.floor(double)
local ceil = regentlib.ceil(double)
local fabs = regentlib.fabs(double)
local max = regentlib.fmax
local min = regentlib.fmin
local pow = regentlib.pow(double)
-- Row by column multiplication
Exports.mkMatMul = terralib.memoize(function(n)
   -- Generates (memoized on n) an inline task that multiplies a row-major
   -- n-by-n matrix A by a vector x of length n and returns the product.
   local matMul

   local __demand (__inline)
   task matMul(x : double[n],
               A : double[n*n])
      var b : double[n]
      for row = 0, n do
         -- Dot product of row `row` of A with x, accumulated from 0.0
         var acc = 0.0
         var offset = row*n
         for col = 0, n do
            acc += A[offset+col]*x[col]
         end
         b[row] = acc
      end
      return b
   end

   return matMul
end)
-- LU decomposition
Exports.mkLUdec = terralib.memoize(function(n)
   -- Generates (memoized on n) the table {LUdec, ludcmp, lubksb} specialised
   -- for n-by-n systems stored in row-major order.
   local LUdec, ludcmp, lubksb

   -- Data structures
   --   A    : n*n matrix; ludcmp overwrites it in place with its factors
   --   ind  : ind[j] is the row that was exchanged with row j at step j
   --   sing : set by ludcmp when some row of the input is entirely zero
   local struct LUdec {
      A    : double[n*n]
      ind  : int[n]
      sing : bool
   }

   -- Computes the LU decomposition of LU.A with scaled partial pivoting and
   -- returns the updated structure.
   local __demand (__inline)
   task ludcmp(LU : LUdec)
      var TINY = 1.0e-20
      var imax = 0
      -- Reciprocal of the largest absolute entry of each row
      var vv : double[n]
      LU.sing = false

      for i = 0, n do
         var rowMax = 0.0
         for j = 0, n do
            rowMax max= fabs(LU.A[i*n+j])
         end
         -- TODO : assertion is not supported by CUDA code generator
         --        emit a bool for now
         -- regentlib.assert(rowMax ~= 0.0, "Singular matrix in ludcmp")
         if (rowMax == 0.0) then
            LU.sing = true
         end
         vv[i] = 1.0/rowMax
      end

      for j = 0, n do
         -- Entries of column j that lie above the diagonal
         for i = 0, j do
            var acc = LU.A[i*n+j]
            for k = 0, i do
               acc -= LU.A[i*n+k]*LU.A[k*n+j]
            end
            LU.A[i*n+j] = acc
         end

         -- Diagonal and sub-diagonal entries of column j; remember the row
         -- with the largest scaled magnitude as the pivot candidate
         var best = 0.0
         for i = j, n do
            var acc = LU.A[i*n+j]
            for k = 0, j do
               acc -= LU.A[i*n+k]*LU.A[k*n+j]
            end
            LU.A[i*n+j] = acc
            var merit = vv[i]*fabs(acc)
            if (merit >= best) then
               best = merit
               imax = i
            end
         end

         -- Bring the pivot row into position j
         if (imax ~= j) then
            for k = 0, n do
               var tmp = LU.A[imax*n+k]
               LU.A[imax*n+k] = LU.A[j*n+k]
               LU.A[j*n+k] = tmp
            end
            vv[imax] = vv[j]
         end
         LU.ind[j] = imax

         -- An exactly zero pivot is replaced by TINY before it is divided by
         if (LU.A[j*n+j] == 0.0) then
            LU.A[j*n+j] = TINY
         end

         -- Divide the sub-diagonal part of column j by the pivot
         if (j < n-1) then
            var pivInv = 1.0/(LU.A[j*n+j])
            for i = j+1, n do
               LU.A[i*n+j] *= pivInv
            end
         end
      end
      return LU
   end

   -- Backsubstitutes in the LU decomposed matrix: takes the structure
   -- produced by ludcmp and a right-hand side b, returns the solution vector.
   local __demand (__inline)
   task lubksb(LU : LUdec,
               b : double[n])
      -- Forward substitution, applying the recorded row exchanges on the fly.
      -- `first` stays 0 until a non-zero right-hand-side entry is met; from
      -- then on it holds (index of that entry)+1 so leading zeros are skipped.
      var first = 0
      for i = 0, n do
         var ip = LU.ind[i]
         var acc = b[ip]
         b[ip] = b[i]
         if (first ~= 0) then
            for j = first-1, i do
               acc -= LU.A[i*n+j]*b[j]
            end
         elseif (acc ~= 0.0) then
            first = i+1
         end
         b[i] = acc
      end

      -- Back substitution, from the last row up to the first
      for s = 0, n do
         var i = n-1-s
         var acc = b[i]
         for j = i+1, n do
            acc -= LU.A[i*n+j]*b[j]
         end
         b[i] = acc/LU.A[i*n+i]
      end
      return b
   end

   return {LUdec, ludcmp, lubksb}
end)
-- Matrix inversion using Gauss elimination
local mkInverseMatrix = terralib.memoize(function(n)
   -- Generates (memoized on n) an inline task that returns the inverse of a
   -- row-major n-by-n matrix. The elimination uses the diagonal entries as
   -- they come: there is no row exchange, so a zero diagonal entry met during
   -- the forward sweep is divided by as is.
   local InverseMatrix

   local __demand(__inline)
   task InverseMatrix(A : double[n*n])
      var B : double[n*n]
      for k = 0, n*n do
         B[k] = 0.0
      end

      -- Forward elimination
      for i = 0, n do
         var ri = i*n
         -- The diagonal of B is seeded one row at a time
         B[ri+i] = 1.0
         -- Normalise row i of both matrices by its diagonal entry
         var scale = 1.0/A[ri+i]
         for j = 0, n do
            B[ri+j] *= scale
            A[ri+j] *= scale
         end
         -- Remove column i from every row below
         for l = i+1, n do
            var rl = l*n
            var factor = A[rl+i]
            for j = 0, n do
               B[rl+j] -= factor*B[ri+j]
               A[rl+j] -= factor*A[ri+j]
            end
         end
      end

      -- Backward substitution (only B is updated), last row first
      for s = 0, n do
         var i = n-1-s
         for l = i+1, n do
            var factor = A[i*n+l]
            for j = 0, n do
               B[i*n+j] -= factor*B[l*n+j]
            end
         end
      end
      return B
   end

   return InverseMatrix
end)
--HO finite volume reconstruction
Exports.mkReconCoeff = terralib.memoize(function(n)
   -- Generates (memoized on n) an inline task taking n+1 abscissae xc and an
   -- evaluation point xp, and returning n coefficients.
   local reconCoeff

   __demand(__inline)
   task reconCoeff(xc : double[n+1], xp : double)
      -- Form the matrix: entry (i,j) is the mean of x^(n-1-j) over the
      -- interval [xc[i], xc[i+1]]
      var A : double[n*n]
      for i = 0, n do
         var xl = xc[i]
         var xr = xc[i+1]
         for j = 0, n do
            A[i*n+j] = (pow(xr, n-j) - pow(xl, n-j))/((n-j)*(xr - xl))
         end
      end

      -- Invert it
      var Ainv = [mkInverseMatrix(n)](A)

      -- Compute metrics: coeff[i] = sum_j Ainv[j][i] * xp^(n-1-j)
      var coeff : double[n]
      for i = 0, n do
         var acc = 0.0
         for j = 0, n do
            acc += Ainv[j*n+i]*pow(xp, (n-(j+1)))
         end
         coeff[i] = acc
      end
      return coeff
   end

   return reconCoeff
end)
--HO finite volume reconstruction with left Dirichlet BC
Exports.mkReconCoeffLeftBC = terralib.memoize(function(n)
   -- Generates (memoized on n) an inline task taking n abscissae xc and an
   -- evaluation point xp, and returning n coefficients. Compared with the
   -- interior variant, the first condition is a point value at xc[0].
   local reconCoeff

   __demand(__inline)
   task reconCoeff(xc : double[n], xp : double)
      -- Form the matrix
      var A : double[n*n]
      -- Row 0: the monomials x^(n-1-j) evaluated at xc[0]
      var x0 = xc[0]
      for j = 0, n do
         A[j] = pow(x0, n-j-1)
      end
      -- Rows 1..n-1: mean of x^(n-1-j) over [xc[i-1], xc[i]]
      for i = 1, n do
         var xl = xc[i-1]
         var xr = xc[i]
         for j = 0, n do
            A[i*n+j] = (pow(xr, n-j) - pow(xl, n-j))/((n-j)*(xr - xl))
         end
      end

      -- Invert it
      var Ainv = [mkInverseMatrix(n)](A)

      -- Compute metrics: coeff[i] = sum_j Ainv[j][i] * xp^(n-1-j)
      var coeff : double[n]
      for i = 0, n do
         var acc = 0.0
         for j = 0, n do
            acc += Ainv[j*n+i]*pow(xp, (n-(j+1)))
         end
         coeff[i] = acc
      end
      return coeff
   end

   return reconCoeff
end)
--HO finite volume reconstruction with right Dirichlet BC
Exports.mkReconCoeffRightBC = terralib.memoize(function(n)
   -- Generates (memoized on n) an inline task taking n abscissae xc and an
   -- evaluation point xp, and returning n coefficients. Compared with the
   -- interior variant, the last condition is a point value at xc[n-1].
   local reconCoeff

   __demand(__inline)
   task reconCoeff(xc : double[n], xp : double)
      -- Form the matrix
      var A : double[n*n]
      -- Rows 0..n-2: mean of x^(n-1-j) over [xc[i], xc[i+1]]
      for i = 0, n-1 do
         var xl = xc[i]
         var xr = xc[i+1]
         for j = 0, n do
            A[i*n+j] = (pow(xr, n-j) - pow(xl, n-j))/((n-j)*(xr - xl))
         end
      end
      -- Row n-1: the monomials x^(n-1-j) evaluated at xc[n-1]
      var xLast = xc[n-1]
      for j = 0, n do
         A[(n-1)*n+j] = pow(xLast, n-j-1)
      end

      -- Invert it
      var Ainv = [mkInverseMatrix(n)](A)

      -- Compute metrics: coeff[i] = sum_j Ainv[j][i] * xp^(n-1-j)
      var coeff : double[n]
      for i = 0, n do
         var acc = 0.0
         for j = 0, n do
            acc += Ainv[j*n+i]*pow(xp, (n-(j+1)))
         end
         coeff[i] = acc
      end
      return coeff
   end

   return reconCoeff
end)
-- Implicit Rosenbrock solver
-- See Numerical recipes in c for reference
Exports.mkRosenbrock = terralib.memoize(function(nEq, Fields, Vars, Unkowns, Data, rhs)
   -- Generates (memoized on its arguments) an inline task that integrates,
   -- at a single mesh point, the system d(Unkowns)/dt = rhs from time 0 to
   -- DelT with an adaptive-step implicit Rosenbrock scheme.
   --
   --   nEq     : number of equations / length of the unknowns vector
   --   Fields  : field space of the Mesh region
   --   Vars    : fields of Mesh on which read-write privileges are requested
   --   Unkowns : field of Mesh holding the unknowns (indexed 0..nEq-1)
   --   Data    : type of the extra argument forwarded to rhs
   --   rhs     : task called as rhs(Mesh, c, data); its result is indexed
   --             0..nEq-1
   --
   -- The generated task returns an integer status:
   --   0 : success
   --   1 : singular matrix detected by ludcmp
   --   2 : step size not significant (time + dt == time)
   --   3 : more than MAXITS step-size reductions within one step
   --   4 : more than MAXSTP steps
   -- On a non-zero status the content of Mesh[c].[Unkowns] is that of the
   -- last (rejected) trial and must not be used as a solution.
   local Rosenbrock

   -- Algorithm parameters
   local MAXSTP  = 100000
   local MAXITS  = 100
   local TOL     = 1e-10
   local SAFETY  = 0.9
   local GROW    = 1.5
   local PGROW   =-0.25
   local SHRNK   = 0.5
   local PSHRNK  =-1.0/3.0
   local ERRCON  = 0.1296
   local GAM     = 1.0/2.0
   local A21     = 2.0
   local A31     = 48.0/25.0
   local A32     = 6.0/25.0
   local C21     =-8.0
   local C31     = 372.0/25.0
   local C32     = 12.0/5.0
   local C41     =-112.0/125.0
   local C42     =-54.0/125.0
   local C43     =-2.0/5.0
   local B1      = 19.0/9.0
   local B2      = 1.0/2.0
   local B3      = 25.0/108.0
   local B4      = 125.0/108.0
   local E1      = 17.0/54.0
   local E2      = 7.0/36.0
   local E3      = 0.0
   local E4      = 125.0/108.0
   local C1X     = 1.0/2.0
   local C2X     =-3.0/2.0
   local C3X     = 121.0/50.0
   local C4X     = 29.0/250.0
   local A2X     = 1.0
   local A3X     = 3.0/5.0

   -- Computes the jacobian with second order finite difference.
   -- Each unknown is perturbed in turn by +h and -h, rhs is re-evaluated,
   -- and the unknown is restored before moving to the next one.
   local __demand (__inline)
   task GetJacobian(Mesh : region(ispace(int3d), Fields),
                    c : int3d,
                    data : Data)
   where
      reads writes(Mesh.[Vars])
   do
      var EPS = 1.0e-6
      var DEL = 1.0e-14
      var tmp = Mesh[c].[Unkowns]
      var Jac : double[nEq*nEq]
      for j = 0, nEq do
         -- The magnitude of the unknown is used so that the relative and
         -- absolute parts of the step cannot cancel for negative values
         -- (which would give a vanishing step and a 0/0 quotient below)
         var h = fabs(tmp[j])*EPS + DEL
         Mesh[c].[Unkowns][j] = tmp[j] + h
         -- hp and hm are the perturbations actually applied after rounding
         var hp = Mesh[c].[Unkowns][j] - tmp[j]
         var fp = rhs(Mesh, c, data)
         Mesh[c].[Unkowns][j] = tmp[j] - h
         var hm = tmp[j] - Mesh[c].[Unkowns][j]
         var fm = rhs(Mesh, c, data)
         Mesh[c].[Unkowns][j] = tmp[j]
         for i = 0, nEq do
            Jac[i*nEq+j] = (fp[i] - fm[i])/(hp + hm)
         end
      end
      return Jac
   end

   -- LU decomposition tasks
   local LUdec, ludcmp, lubksb = unpack(Exports.mkLUdec(nEq))

   -- Integrates Mesh[c].[Unkowns] in place from time 0 to DelT.
   --   dtTry : first step size attempted
   --   DelT  : total integration interval
   --   data  : forwarded unchanged to rhs
   -- Returns the status code described at the top of this generator.
   __demand (__inline)
   task Rosenbrock( Mesh : region(ispace(int3d), Fields),
                    c : int3d,
                    dtTry : double,
                    DelT : double,
                    data : Data)
   where
      reads writes(Mesh.[Vars])
   do
      var err : double[nEq]
      var g1  : double[nEq]
      var g2  : double[nEq]
      var g3  : double[nEq]
      var g4  : double[nEq]

      var finish = false

      var time = 0.0
      var dt = dtTry
      var fail = 0

      for step = 0, MAXSTP do
         var t0 = time
         -- Initialised so that it is never read while undefined
         var dtNext = dt

         var Jac = GetJacobian(Mesh, c, data)
         var dx = rhs(Mesh, c, data)

         -- State and right-hand side at the beginning of the step
         var xsav = Mesh[c].[Unkowns]
         var dxsav = dx

         -- Trial loop: the step is repeated with a smaller dt until the
         -- error estimate is acceptable
         for jtry = 0, MAXITS do
            -- Assemble 1/(GAM*dt)*I - Jac and factorise it
            var LU : LUdec
            for i = 0, nEq do
               for j = 0, nEq do
                  LU.A[i*nEq+j] = -Jac[i*nEq+j]
               end
               LU.A[i*nEq+i] = LU.A[i*nEq+i] + 1.0/(GAM*dt)
            end
            LU = ludcmp(LU)
            if ( LU.sing == true ) then
               fail = 1
               break
            end

            -- NOTE(review): in the reference routine (Numerical Recipes,
            -- `stiff`) the C1X..C4X coefficients multiply the explicit
            -- time derivative of the right-hand side; here the right-hand
            -- side `dx` itself is used in that position (and on a retry it
            -- still holds the last stage value of the previous trial).
            -- Left unchanged - confirm that this is intended.

            -- Stage 1
            for i = 0, nEq do
               g1[i] = dxsav[i]+dt*C1X*dx[i]
            end
            g1 = lubksb(LU, g1)

            -- Stage 2
            for i = 0, nEq do
               Mesh[c].[Unkowns][i] = xsav[i]+A21*g1[i]
            end
            time = t0+A2X*dt
            dx = rhs(Mesh, c, data)
            for i = 0, nEq do
               g2[i] = dx[i]+dt*C2X*dx[i]+C21*g1[i]/dt
            end
            g2 = lubksb(LU, g2)

            -- Stage 3
            for i = 0, nEq do
               Mesh[c].[Unkowns][i] = xsav[i]+A31*g1[i]+A32*g2[i]
            end
            time = t0+A3X*dt
            dx = rhs(Mesh, c, data)
            for i = 0, nEq do
               g3[i] = dx[i]+dt*C3X*dx[i]+(C31*g1[i]+C32*g2[i])/dt
            end
            g3 = lubksb(LU, g3)

            -- Stage 4 (reuses the right-hand side of stage 3)
            for i = 0, nEq do
               g4[i] = dx[i]+dt*C4X*dx[i]+(C41*g1[i]+C42*g2[i]+C43*g3[i])/dt
            end
            g4 = lubksb(LU ,g4)

            -- New state and error estimate
            for i = 0, nEq do
               Mesh[c].[Unkowns][i] = xsav[i]+B1*g1[i]+B2*g2[i]+B3*g3[i]+B4*g4[i]
               err[i] = E1*g1[i]+E2*g2[i]+E3*g3[i]+E4*g4[i]
            end
            time = t0+dt
            -- TODO : assertion is not supported by CUDA code generator
            --        emit an int for now
            -- regentlib.assert(time ~= t0, "Stepsize not significant in Rosenbrock")
            if ( time == t0 ) then
               fail = 2
               break
            end

            -- Largest absolute error, measured in units of TOL
            var errmax = 0.0
            for i = 0, nEq do
               errmax max= fabs(err[i])
            end
            errmax /= TOL

            -- The last step (finish == true) is accepted whatever its error
            if (errmax <= 1.0 or finish) then
               -- Records the accepted step size; dtTry is not read again
               -- in this task
               dtTry = dt
               -- c.printf("Rosenbrock converged with dt = %g and errmax = %g\n", dt, errmax*TOL)
               if (errmax > ERRCON) then
                  dtNext = SAFETY*dt*pow(errmax,PGROW)
               else
                  dtNext = GROW*dt
               end
               break
            else
               -- Reject: shrink the step, but by no more than a factor SHRNK
               dtNext = SAFETY*dt*pow(errmax,PSHRNK)
               if (dt >= 0.0 ) then
                  dt = max(dtNext,SHRNK*dt)
               else
                  dt = min(dtNext,SHRNK*dt)
               end
            end

            -- TODO : assertion is not supported by CUDA code generator
            --        emit an int for now
            -- regentlib.assert(jtry ~= MAXITS-1, "Exceeded MAXITS in Rosenbrock")
            if (jtry == MAXITS-1) then fail = 3 end
         end

         if ( DelT == time ) then finish = true end
         if finish then break end

         -- Any failure of the trial loop ends the integration: the state
         -- left in Mesh[c] is a rejected trial and must not be stepped from
         if ( fail ~= 0 ) then break end

         if ( dtNext*1.5 > DelT-time ) then
            -- Force the algorithm to integrate till DelT
            dt = DelT-time
            finish = true
         else
            dt = dtNext
         end

         -- TODO : assertion is not supported by CUDA code generator
         --        emit an int for now
         -- regentlib.assert(step ~= MAXSTP-1, "Exceeded MAXSTP in Rosenbrock")
         if ( step == MAXSTP-1 ) then fail = 4 end
      end
      return fail
   end

   return Rosenbrock
end)
-- Fast interpolation using the integer domain
Exports.mkFastInterp = terralib.memoize(function(SrcType, xfld)
   -- Generates (memoized on its arguments) the types and tasks of a lookup
   -- helper that maps a coordinate to an index of a 1D source region through
   -- an auxiliary uniformly spaced grid.
   --
   --   SrcType : field space of the source region
   --   xfld    : field of SrcType holding the coordinate
   --
   -- Returns {FastInterpData, FastInterpType, FastInterpInitData,
   --          FastInterpInitRegion, FastInterpFindIndex, FastInterpGetWeight}.
   -- NOTE(review): several statements index the regions with 0 directly, so
   -- both regions are presumably expected to start at index 0 - confirm.
   local FastInterpData, FastInterpType
   local FastInterpInitData, FastInterpInitRegion
   local FastInterpFindIndex, FastInterpGetWeight

   -- Fraction of the coordinate range used as clamping margin
   local eps = 1e-6

   -- Description of the uniform grid
   --   nloc   : number of points of the uniform grid
   --   xmin   : smallest source coordinate
   --   xmax   : largest source coordinate
   --   small  : margin kept from xmin and xmax when clamping a query
   --   dxloc  : spacing of the uniform grid
   --   idxloc : reciprocal of dxloc
   local struct FastInterpData {
      nloc   : int;
      xmin   : double;
      xmax   : double;
      small  : double;
      dxloc  : double;
      idxloc : double;
   }

   -- Single precision is sufficient at this stage
   --   xloc : coordinate of a uniform grid point
   --   iloc : (fractional) index in the source region associated with xloc
   local struct FastInterpType {
      xloc : float;
      iloc : float;
   }

   -- Initializes data structure from the coordinates stored in src
   local __demand(__leaf) -- MANUALLY PARALLELIZED, NO CUDA, NO OPENMP
   task FastInterpInitData(src : region(ispace(int1d), SrcType))
   where
      reads(src.[xfld])
   do
      var data : FastInterpData
      if src.volume > 1 then
         var xmin = math.huge
         var xmax = -math.huge
         var dxmin = math.huge
         __demand(__openmp)
         for c in src do
            xmin min= src[c].[xfld]
            xmax max= src[c].[xfld]
            if c < src.bounds.hi then
               dxmin min= src[c+1].[xfld] - src[c].[xfld]
            end
         end
         -- The coordinates must be strictly increasing: dxmin is used as a
         -- divisor below, so a zero spacing is rejected as well
         regentlib.assert(dxmin > 0.0, "FastInterpInitData: something wrong in the input region")
         data.xmin = xmin
         data.xmax = xmax
         data.small = eps*(data.xmax - data.xmin)
         -- ensure at least 2 points per interval in src
         data.nloc = ceil(2.0*(data.xmax-data.xmin)/dxmin)
         -- Size of the uniform grid
         data.dxloc = (data.xmax-data.xmin)/(data.nloc-1)
         data.idxloc = 1.0/data.dxloc
      else
         -- Single source point: degenerate grid with zero spacing
         data.xmin = src[0].[xfld]
         data.xmax = src[0].[xfld]
         data.small = eps
         data.nloc = 2
         data.dxloc = 0.0
         data.idxloc = 0.0
      end
      return data
   end

   -- Initializes region r (the uniform grid) from src and the data
   -- structure returned by FastInterpInitData
   local --__demand(__leaf) -- MANUALLY PARALLELIZED, NO CUDA
   task FastInterpInitRegion(r : region(ispace(int1d), FastInterpType),
                             src : region(ispace(int1d), SrcType),
                             d : FastInterpData)
   where
      reads(src.[xfld]),
      reads writes(r.{xloc, iloc})
   do
      -- Initialize xloc and iloc
      __demand(__openmp)
      for c in r do
         r[c].xloc = d.xmin + float(c)*d.dxloc
         -- Linear search of the source interval [i, i+1] containing xloc
         var i : int1d
         for c1 in src do
            i = c1
            var cp1 = min(c1+int1d(1), src.bounds.hi)
            if (r[c].xloc <= src[cp1].[xfld]) then break end
         end
         i min= src.bounds.hi-int1d(1)
         var ip1 = i+int1d(1)
         -- iloc is the linear blend of the two bounding source indices
         var w = (src[ip1].[xfld] - r[c].xloc)/(src[ip1].[xfld]-src[i].[xfld])
         r[c].iloc = w*float(i)+(1.0 - w)*float(ip1)
      end
      -- Smallest increment of iloc between consecutive uniform grid points
      var dimin = math.huge
      for c in r do
         if c < r.bounds.hi then
            dimin min= r[c+1].iloc - r[c].iloc
         end
      end
      if r.volume==1 then dimin = 0 end
      -- Correct iloc to pass though src.[xfld]'s
      r[0].iloc = 0.0
      -- __demand(__openmp)
      for c in src do
         if c > int1d(0) then
            -- Linear search of the uniform interval [csi, cp1] containing
            -- the source point c
            var csi = int1d(0)
            for c1 in r do
               csi = c1
               var cp1 = min(c1+int1d(1), r.bounds.hi)
               if (src[c].[xfld] <= r[cp1].xloc) then break end
            end
            var cp1 = min(csi+int1d(1), r.bounds.hi)
            -- Both ends of that interval are rewritten as a line of slope
            -- dimin*idxloc that takes the value c at the source coordinate
            r[csi].iloc = float(c) + dimin*d.idxloc*(r[csi].xloc - src[c].[xfld])
            r[cp1].iloc = float(c) + dimin*d.idxloc*(r[cp1].xloc - src[c].[xfld])
         end
      end
      r[r.bounds.hi].iloc = float(src.bounds.hi)
   end

   -- Finds index of first element on the left of x.
   -- x is first clamped to [xmin+small, xmax-small]; the uniform interval
   -- containing it is then located directly and iloc is interpolated
   -- linearly inside that interval before being rounded down.
   __demand(__inline)
   task FastInterpFindIndex(x : double,
                            r : region(ispace(int1d), FastInterpType),
                            d : FastInterpData)
   where
      reads(r.{iloc, xloc})
   do
      x max = d.xmin+d.small
      x min = d.xmax-d.small
      var k = int1d(floor((x-d.xmin)*d.idxloc))
      var kp1 = k+int1d(1)
      return int1d(floor(r[k].iloc + (r[kp1].iloc-r[k].iloc)*
                                     (   x       -r[k].xloc)*d.idxloc))
   end

   -- Compute linear interpolation weight for point on the left:
   -- 1 when x == xm, 0 when x == xp
   __demand(__inline)
   task FastInterpGetWeight(x : double, xm : double, xp : double)
      return (xp - x)/(xp - xm)
   end

   return {FastInterpData, FastInterpType,
           FastInterpInitData, FastInterpInitRegion,
           FastInterpFindIndex, FastInterpGetWeight}
end)
return Exports