1 """Interface with Nvidia CUDA."""
2
3 from __future__ import division
4
5 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
6
7 __license__ = """
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see U{http://www.gnu.org/licenses/}.
20 """
21
22
23
24 import numpy
25 import numpy.linalg as la
26 from pytools import memoize_method, memoize
27 import hedge.optemplate
28 import pycuda.driver as cuda
29 import pycuda.gpuarray as gpuarray
30 import pymbolic.mapper.stringifier
31
32
33
34
35
@memoize
def flux_header_struct():
    from hedge.cuda.cgen import Struct, POD

    return Struct("flux_header", [
        POD(numpy.uint16, "els_in_block"),
        POD(numpy.uint16, "same_facepairs_end"),
        POD(numpy.uint16, "diff_facepairs_end"),
        POD(numpy.uint16, "bdry_facepairs_end"),
        ])

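# Layout of the per-face-pair record in GPU memory. Field names follow their
# use in the flux-gather kernel below: a_base/b_base are global dof base
# indices for the two sides of the face, the *_ilist_index fields select face
# index lists in shared memory, boundary_id picks the boundary flux, and
# a_dest/b_dest locate the output in the block's fluxes_on_faces buffer.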
@memoize
def face_pair_struct(float_type, dims):
    from hedge.cuda.cgen import Struct, POD, ArrayOf
    return Struct("face_pair", [
        POD(float_type, "h"),
        POD(float_type, "order"),
        POD(float_type, "face_jacobian"),
        ArrayOf(POD(float_type, "normal"), dims),

        POD(numpy.uint32, "a_base"),
        POD(numpy.uint32, "b_base"),

        POD(numpy.uint16, "a_ilist_index"),
        POD(numpy.uint16, "b_ilist_index"),
        POD(numpy.uint16, "b_write_ilist_index"),
        POD(numpy.uint8, "boundary_id"),
        POD(numpy.uint8, "pad"),
        POD(numpy.uint16, "a_dest"),
        POD(numpy.uint16, "b_dest"),
        ])




class FluxToCodeMapper(pymbolic.mapper.stringifier.StringifyMapper):
    def __init__(self, flip_normal):
        def float_mapper(x):
            if isinstance(x, float):
                return "%sf" % repr(x)
            else:
                return repr(x)

        pymbolic.mapper.stringifier.StringifyMapper.__init__(self, float_mapper)
        self.flip_normal = flip_normal

    def map_normal(self, expr, enclosing_prec):
        if self.flip_normal:
            sign = "-"
        else:
            sign = ""
        return "%sfpair->normal[%d]" % (sign, expr.axis)

    def map_penalty_term(self, expr, enclosing_prec):
        return ("pow(fpair->order*fpair->order/fpair->h, %r)"
                % expr.power)

    def map_if_positive(self, expr, enclosing_prec):
        from pymbolic.mapper.stringifier import PREC_NONE
        return "(%s > 0 ? %s : %s)" % (
                self.rec(expr.criterion, PREC_NONE),
                self.rec(expr.then, PREC_NONE),
                self.rec(expr.else_, PREC_NONE),
                )



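# Usage sketch: FluxToCodeMapper(flip_normal)(coeff_expr, PREC_NONE) turns a
# flux coefficient expression into C source for the gather kernel, e.g. a
# normal component becomes "-fpair->normal[0]" when flipped, and Python
# floats are emitted with an "f" suffix to keep arithmetic single-precision.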


class ExecutionMapper(hedge.optemplate.Evaluator,
        hedge.optemplate.BoundOpMapperMixin,
        hedge.optemplate.LocalOpReducerMixin):
    def __init__(self, context, executor):
        hedge.optemplate.Evaluator.__init__(self, context)
        self.ex = executor
        self.diff_xyz_cache = {}

    def print_error_structure(self, computed, reference, diff):
        discr = self.ex.discr

        norm_ref = la.norm(reference)
        struc = ""

        numpy.set_printoptions(precision=2, linewidth=130, suppress=True)
        for block in discr.blocks:
            i_el = 0
            for mb in block.microblocks:
                for el in mb:
                    s = discr.find_el_range(el.id)
                    relerr = la.norm(diff[s])/norm_ref
                    if relerr > 1e-4:
                        struc += "*"
                        if True:
                            print "block %d, el %d, global el #%d, rel.l2err=%g" % (
                                    block.number, i_el, el.id, relerr)
                            print computed[s]
                            print reference[s]
                            print diff[s]
                            raw_input()
                    elif numpy.isnan(relerr):
                        struc += "N"
                        if False:
                            print "block %d, el %d, global el #%d, rel.l2err=%g" % (
                                    block.number, i_el, el.id, relerr)
                            print computed[s]
                            print reference[s]
                            print diff[s]
                            raw_input()
                    else:
                        if numpy.max(numpy.abs(reference[s])) == 0:
                            struc += "0"
                        else:
                            if False:
                                print "block %d, el %d, global el #%d, rel.l2err=%g" % (
                                        block.number, i_el, el.id, relerr)
                                print computed[s]
                                print reference[s]
                                print diff[s]
                                raw_input()
                            struc += "."
                    i_el += 1
                struc += " "
            struc += "\n"
        print
        print struc

    def map_diff_base(self, op, field_expr, out=None):
        try:
            xyz_diff = self.diff_xyz_cache[op.__class__, field_expr]
        except KeyError:
            pass
        else:
            return xyz_diff[op.xyz_axis]

        discr = self.ex.discr
        d = discr.dimensions

        eg, = discr.element_groups
        func, texrefs, field_texref = self.ex.get_diff_kernel(op.__class__, eg)

        fplan = discr.flux_plan
        lplan = fplan.diff_plan()

        field = self.rec(field_expr)
        assert field.dtype == discr.flux_plan.float_type

        field.bind_to_texref(field_texref)

        from hedge.cuda.tools import int_ceiling
        kwargs = {
                "block": (lplan.chunk_size, lplan.parallelism.p, 1),
                "grid": (lplan.chunks_per_microblock(),
                    int_ceiling(
                        fplan.dofs_per_block()*len(discr.blocks)/
                        lplan.dofs_per_macroblock())
                    ),
                "time_kernel": discr.instrumented,
                "texrefs": texrefs,
                }

        xyz_diff = [discr.volume_empty() for axis in range(d)]

        elgroup, = discr.element_groups
        args = xyz_diff+[
                self.ex.gpu_diffmats(op.__class__, eg).device_memory,
                ]

        kernel_time = func(*args, **kwargs)
        if discr.instrumented:
            discr.diff_op_timer.add_time(kernel_time)
            discr.diff_op_counter.add(discr.dimensions)

        if False:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            print copied_debugbuf[:100].reshape((10,10))
            raw_input()

        if discr.debug:
            f = discr.volume_from_gpu(field)
            dx = discr.volume_from_gpu(xyz_diff[0])

            test_discr = discr.test_discr
            real_dx = test_discr.nabla[0].apply(f.astype(numpy.float64))

            diff = dx - real_dx

            rel_err_norm = la.norm(diff)/la.norm(real_dx)
            print "diff", rel_err_norm
            if rel_err_norm > 5e-5:
                self.print_error_structure(dx, real_dx, diff)
            assert rel_err_norm < 5e-5

        self.diff_xyz_cache[op.__class__, field_expr] = xyz_diff
        return xyz_diff[op.xyz_axis]

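    # Whole-domain flux evaluation takes two kernel launches: a gather kernel
    # evaluates all face-pair fluxes into the face-dof array fluxes_on_faces,
    # then a lift kernel applies the lifting (or multi-face mass) matrix,
    # scaled by inverse Jacobians when lifting, to produce volume dofs.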
    def map_whole_domain_flux(self, wdflux, out=None):
        discr = self.ex.discr

        eg, = discr.element_groups
        fdata = self.ex.flux_with_temp_data(wdflux, eg)
        fplan = discr.flux_plan
        lplan = fplan.flux_lifting_plan()

        gather, gather_texrefs, texref_map = \
                self.ex.get_flux_gather_kernel(wdflux)
        lift, lift_texrefs, fluxes_on_faces_texref = \
                self.ex.get_flux_local_kernel(wdflux.is_lift, eg)

        debugbuf = gpuarray.zeros((512,), dtype=numpy.float32)

        from hedge.cuda.tools import int_ceiling
        fluxes_on_faces = gpuarray.empty(
                (int_ceiling(
                    len(discr.blocks)
                    * fplan.aligned_face_dofs_per_microblock()
                    * fplan.microblocks_per_block(),
                    lplan.parallelism.total()
                    * fplan.aligned_face_dofs_per_microblock()
                    ),),
                dtype=fplan.float_type)

        for dep_expr in wdflux.all_deps:
            dep_field = self.rec(dep_expr)
            assert dep_field.dtype == fplan.float_type
            dep_field.bind_to_texref(texref_map[dep_expr])

        gather_args = [
                debugbuf,
                fluxes_on_faces,
                fdata.device_memory,
                self.ex.index_list_global_data().device_memory,
                ]

        gather_kwargs = {
                "texrefs": gather_texrefs,
                "block": (discr.flux_plan.dofs_per_face(),
                    fplan.parallel_faces, 1),
                "grid": (len(discr.blocks), 1),
                "time_kernel": discr.instrumented,
                }

        kernel_time = gather(*gather_args, **gather_kwargs)

        if discr.instrumented:
            discr.inner_flux_timer.add_time(kernel_time)
            discr.inner_flux_counter.add()

        if False:
            fof = fluxes_on_faces.get()
            print numpy.reshape(fof[:20*15], (20,15))
            raw_input()

        if False:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            print numpy.reshape(copied_debugbuf, (32, 16))
            raw_input()

        flux = discr.volume_empty()

        lift_args = [
                flux,
                self.ex.gpu_liftmat(wdflux.is_lift).device_memory,
                debugbuf,
                ]

        lift_kwargs = {
                "texrefs": lift_texrefs,
                "block": (lplan.chunk_size, lplan.parallelism.p, 1),
                "grid": (lplan.chunks_per_microblock(),
                    int_ceiling(
                        fplan.dofs_per_block()*len(discr.blocks)/
                        lplan.dofs_per_macroblock())
                    ),
                "time_kernel": discr.instrumented,
                }

        fluxes_on_faces.bind_to_texref(fluxes_on_faces_texref)

        kernel_time = lift(*lift_args, **lift_kwargs)

        if discr.instrumented:
            discr.inner_flux_timer.add_time(kernel_time)
            discr.inner_flux_counter.add()

        if False:
            copied_debugbuf = debugbuf.get()
            print "DEBUG"
            numpy.set_printoptions(linewidth=100)
            print copied_debugbuf
            print eg.inverse_jacobians[
                    self.ex.elgroup_microblock_indices(eg)][:500]
            raw_input()

        # disabled: this check still refers to names (op, field_expr, field)
        # from the single-operator code path and would not run as written
        if False:
            cot = discr.test_discr.compile(op.flux_optemplate)
            ctx = {field_expr.name:
                    discr.volume_from_gpu(field).astype(numpy.float64)
                    }
            for boundary in op.boundaries:
                ctx[boundary.bfield_expr.name] = \
                        discr.test_discr.boundary_zeros(boundary.tag)
            true_flux = cot(**ctx)

            copied_flux = discr.volume_from_gpu(flux)

            diff = copied_flux-true_flux

            norm_true = la.norm(true_flux)

            if False:
                self.print_error_structure(copied_flux, true_flux, diff)
                raw_input()

            print "flux", la.norm(diff)/norm_true
            assert la.norm(diff)/norm_true < 1e-6

        if False:
            copied_bfield = bfield.get()
            face_len = discr.flux_plan.ldis.face_node_count()
            aligned_face_len = discr.devdata.align_dtype(face_len, 4)
            for elface in discr.mesh.tag_to_boundary.get('inflow', []):
                face_stor = discr.face_storage_map[elface]
                bdry_stor = face_stor.opposite
                gpu_base = bdry_stor.gpu_bdry_index_in_floats
                print gpu_base, copied_bfield[gpu_base:gpu_base+aligned_face_len]
            raw_input()

        return flux




class OpTemplateWithEnvironment(object):
    # NOTE: class name and constructor are assumptions; the methods below
    # only require an executor object that carries the discretization as
    # self.discr.
    def __init__(self, discr):
        self.discr = discr

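    # get_load_code emits a cooperative block-wide copy: threads, numbered by
    # THREAD_NUM, stride through the source in word_type-sized units,
    # COALESCING_THREAD_COUNT at a time, so the global-memory reads coalesce
    # as long as base is suitably aligned.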
    def get_load_code(self, dest, base, bytes, word_type=numpy.uint32,
            descr=None):
        from hedge.cuda.cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Comment, Block, Line, \
                Constant, Initializer, If, For, Statement, Assign

        from hedge.cuda.cgen import dtype_to_ctype
        copy_dtype = numpy.dtype(word_type)
        copy_dtype_str = dtype_to_ctype(copy_dtype)

        code = []
        if descr is not None:
            code.append(Comment(descr))

        code.extend([
            Block([
                Constant(Pointer(POD(copy_dtype, "load_base")),
                    ("(%s *) (%s)" % (copy_dtype_str, base))),
                For("unsigned word_nr = THREAD_NUM",
                    "word_nr*sizeof(%s) < (%s)" % (copy_dtype_str, bytes),
                    "word_nr += COALESCING_THREAD_COUNT",
                    Statement("((%s *) (%s))[word_nr] = load_base[word_nr]"
                        % (copy_dtype_str, dest))
                    ),
                ]),
            Line(),
            ])

        return code



    @memoize_method
    def get_diff_kernel(self, diff_op_cls, elgroup):
        from hedge.cuda.cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, \
                CudaShared, CudaGlobal, Static, \
                Define, \
                Constant, Initializer, If, For, Statement, Assign

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        fplan = discr.flux_plan
        lplan = fplan.diff_plan()

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        elgroup, = discr.element_groups

        float_type = fplan.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat"),
            [Pointer(POD(float_type, "dxyz%d" % i)) for i in dims]
            + [
                Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                ]
            ))

        rst_channels = discr.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
                Value("texture<float%d, 2, cudaReadModeElementType>"
                    % rst_channels,
                    "rst_to_xyz_tex"),
                Value("texture<float, 1, cudaReadModeElementType>",
                    "field_tex"),
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", fplan.dofs_per_el()),
                Line(),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_CHUNK", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("CHUNK_DOF_COUNT", lplan.chunk_size),
                Define("MB_CHUNK_COUNT", lplan.chunks_per_microblock()),
                Define("MB_DOF_COUNT", fplan.microblock.aligned_floats),
                Define("MB_EL_COUNT", fplan.microblock.elements),
                Define("PAR_MB_COUNT", lplan.parallelism.p),
                Define("SEQ_MB_COUNT", lplan.parallelism.s),
                Line(),
                Define("THREAD_NUM", "(CHUNK_DOF+PAR_MB_NR*CHUNK_DOF_COUNT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*CHUNK_DOF_COUNT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_CHUNK*CHUNK_DOF_COUNT)"),
                Define("MB_DOF", "(MB_DOF_BASE+CHUNK_DOF)"),
                Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*SEQ_MB_COUNT)"),
                Line(),
                Define("DIFFMAT_CHUNK_FLOATS", diffmat_data.block_floats),
                Define("DIFFMAT_CHUNK_BYTES", "(DIFFMAT_CHUNK_FLOATS*%d)"
                    % fplan.float_size),
                Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                    "DIMENSIONS*DOFS_PER_EL*CHUNK_DOF_COUNT")),
                Line(),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint8, "mb_el"),
                "MB_DOF/DOFS_PER_EL"),
            ])

        f_body.extend(
                self.get_load_code(
                    dest="smem_diff_rst_mat",
                    base="gmem_diff_rst_mat + MB_CHUNK*DIFFMAT_CHUNK_BYTES",
                    bytes="DIFFMAT_CHUNK_BYTES",
                    descr="load diff mat chunk")
                +[S("__syncthreads()")])

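        # Each thread owns one matrix row (CHUNK_DOF) of the current chunk.
        # The generated code accumulates the local derivatives
        # drst0..drst(d-1) from the shared-memory matrix chunk, then rotates
        # them to global x/y/z with the per-element rst_to_xyz texture.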
        def get_scalar_diff_code(matrix_row, dest_pattern):
            code = []
            for axis in dims:
                code.append(
                        Initializer(POD(float_type, "drst%d" % axis), 0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        "+%(col)s"
                        "]" % {"row": row, "col": col, "axis": axis}
                        )

            tex_channels = ["x", "y", "z", "w"]
            from pytools import flatten
            code.extend(
                    [POD(float_type, "field_value"),
                        Line(),
                        ]
                    +list(flatten(
                        [Assign("field_value",
                            "tex1Dfetch(field_tex, global_mb_dof_base"
                            "+mb_el*DOFS_PER_EL+%d)" % j
                            ),
                        Line(),
                        ]
                        +[S("drst%d += %s * field_value"
                            % (axis, get_mat_entry(matrix_row, j, axis)))
                            for axis in dims
                        ]+[Line()]
                        for j in range(fplan.dofs_per_el())
                        ))
                    )

            store_code = Block()
            for glob_axis in dims:
                store_code.append(Block([
                    Initializer(Value("float%d" % rst_channels, "rst_to_xyz"),
                        "tex2D(rst_to_xyz_tex, %d, global_mb_nr*MB_EL_COUNT+mb_el)"
                        % glob_axis
                        ),
                    Assign(
                        dest_pattern % glob_axis,
                        " + ".join(
                            "rst_to_xyz.%s"
                            "*"
                            "drst%d" % (tex_channels[loc_axis], loc_axis)
                            for loc_axis in dims
                            )
                        )
                    ])
                    )

            code.append(If("MB_DOF < DOFS_PER_EL*MB_EL_COUNT", store_code))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block([
                    Initializer(POD(numpy.uint32, "global_mb_nr"),
                        "GLOBAL_MB_NR_BASE + seq_mb_number*PAR_MB_COUNT + PAR_MB_NR"),
                    Initializer(POD(numpy.uint32, "global_mb_dof_base"),
                        "global_mb_nr*MB_DOF_COUNT"),
                    Line(),
                    ]+
                    get_scalar_diff_code(
                        "CHUNK_DOF",
                        "dxyz%d[global_mb_dof_base+MB_DOF]")
                    )
                )
            ])

        cmod.append(FunctionBody(f_decl, f_body))

        mod = cuda.SourceModule(cmod,
                keep=True,
                )
        print "diff: lmem=%d smem=%d regs=%d" % (mod.lmem, mod.smem, mod.registers)

        rst_to_xyz_texref = mod.get_texref("rst_to_xyz_tex")
        cuda.bind_array_to_texref(
                self.localop_rst_to_xyz(diff_op_cls, elgroup),
                rst_to_xyz_texref)

        field_texref = mod.get_texref("field_tex")
        texrefs = [field_texref, rst_to_xyz_texref]

        return mod.get_function("apply_diff_mat"), texrefs, field_texref



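    # The lift kernel reuses the diff kernel's chunk decomposition and adds
    # per-chunk element ranges (chunk_start_el_lookup/chunk_stop_el_lookup);
    # the number of elements a chunk touches selects the fetch strategy for
    # its face-dof inputs.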
    @memoize_method
    def get_flux_local_kernel(self, is_lift, elgroup):
        from hedge.cuda.cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, \
                CudaShared, CudaConstant, CudaGlobal, Static, \
                Define, \
                Constant, Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        fplan = discr.flux_plan
        lplan = fplan.flux_lifting_plan()

        liftmat_data = self.gpu_liftmat(is_lift)

        float_type = fplan.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_lift_mat"),
            [
                Pointer(POD(float_type, "flux")),
                Pointer(POD(numpy.uint8, "gmem_lift_mat")),
                Pointer(POD(float_type, "debugbuf")),
                ]
            ))

        rst_channels = discr.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
                Value("texture<float, 1, cudaReadModeElementType>",
                    "fluxes_on_faces_tex"),
                ])
        if is_lift:
            cmod.append(
                    Value("texture<float, 1, cudaReadModeElementType>",
                        "inverse_jacobians_tex"),
                    )

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", fplan.dofs_per_el()),
                Define("FACES_PER_EL", fplan.faces_per_el()),
                Define("DOFS_PER_FACE", fplan.dofs_per_face()),
                Define("FACE_DOFS_PER_EL", "(DOFS_PER_FACE*FACES_PER_EL)"),
                Line(),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_CHUNK", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("CHUNK_DOF_COUNT", lplan.chunk_size),
                Define("MB_CHUNK_COUNT", lplan.chunks_per_microblock()),
                Define("MB_DOF_COUNT", fplan.microblock.aligned_floats),
                Define("MB_FACEDOF_COUNT", fplan.aligned_face_dofs_per_microblock()),
                Define("MB_EL_COUNT", fplan.microblock.elements),
                Define("PAR_MB_COUNT", lplan.parallelism.p),
                Define("SEQ_MB_COUNT", lplan.parallelism.s),
                Line(),
                Define("THREAD_NUM", "(CHUNK_DOF+PAR_MB_NR*CHUNK_DOF_COUNT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*CHUNK_DOF_COUNT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_CHUNK*CHUNK_DOF_COUNT)"),
                Define("MB_DOF", "(MB_DOF_BASE+CHUNK_DOF)"),
                Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*SEQ_MB_COUNT)"),
                Line(),
                Define("LIFTMAT_COLUMNS", liftmat_data.matrix_columns),
                Define("LIFTMAT_CHUNK_FLOATS", liftmat_data.block_floats),
                Define("LIFTMAT_CHUNK_BYTES",
                    "(LIFTMAT_CHUNK_FLOATS*%d)" % fplan.float_size),
                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_lift_mat"),
                    "LIFTMAT_CHUNK_FLOATS")),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "dof_buffer"),
                            "PAR_MB_COUNT"),
                        "CHUNK_DOF_COUNT"),
                    ),
                CudaShared(POD(numpy.uint16, "chunk_start_el")),
                CudaShared(POD(numpy.uint16, "chunk_stop_el")),
                CudaShared(POD(numpy.uint16, "chunk_el_count")),
                Line(),
                ArrayInitializer(
                    CudaConstant(
                        ArrayOf(
                            POD(numpy.uint16, "chunk_start_el_lookup"),
                            "MB_CHUNK_COUNT")),
                    [(chk*lplan.chunk_size)//fplan.dofs_per_el()
                        for chk in range(lplan.chunks_per_microblock())]
                    ),
                ArrayInitializer(
                    CudaConstant(
                        ArrayOf(
                            POD(numpy.uint16, "chunk_stop_el_lookup"),
                            "MB_CHUNK_COUNT")),
                    [min(fplan.microblock.elements,
                        (chk*lplan.chunk_size+lplan.chunk_size-1)
                        //fplan.dofs_per_el()+1)
                        for chk in range(lplan.chunks_per_microblock())]
                    ),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint8, "dof_el"),
                "MB_DOF/DOFS_PER_EL"),
            Line(),

            If("THREAD_NUM==0",
                Block([
                    Assign("chunk_start_el", "chunk_start_el_lookup[MB_CHUNK]"),
                    Assign("chunk_stop_el", "chunk_stop_el_lookup[MB_CHUNK]"),
                    Assign("chunk_el_count", "chunk_stop_el-chunk_start_el")
                    ])
                ),
            S("__syncthreads()")
            ])

        f_body.extend(
                self.get_load_code(
                    dest="smem_lift_mat",
                    base=("gmem_lift_mat + MB_CHUNK*LIFTMAT_CHUNK_BYTES"),
                    bytes="LIFTMAT_CHUNK_BYTES",
                    descr="load lift mat chunk")
                )

        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(fplan.face_dofs_per_el())

            for load_chunk_start in range(0, fplan.face_dofs_per_el(),
                    lplan.chunk_size):
                result.append(
                        Assign(
                            "dof_buffer[PAR_MB_NR][CHUNK_DOF]",
                            "tex1Dfetch(fluxes_on_faces_tex, "
                            "global_mb_facedof_base"
                            "+(chunk_start_el)*FACE_DOFS_PER_EL+%d+CHUNK_DOF)"
                            % load_chunk_start
                            ))

                result.extend([
                    S("__syncthreads()"),
                    Line(),
                    ])

                for dof in dofs[load_chunk_start:load_chunk_start+lplan.chunk_size]:
                    result.append(
                            S("result += "
                                "smem_lift_mat[CHUNK_DOF*LIFTMAT_COLUMNS + %d]"
                                "*"
                                "dof_buffer[PAR_MB_NR][%d]"
                                % (dof, dof-load_chunk_start))
                            )
                result.append(Line())
            return result

        def get_direct_tex_mat_mul_code():
            return [
                    S("result += "
                        "tex1Dfetch(fluxes_on_faces_tex, "
                        "global_mb_facedof_base"
                        "+dof_el*FACE_DOFS_PER_EL+%(j)d)"
                        " * smem_lift_mat["
                        "%(row)s*LIFTMAT_COLUMNS + %(j)s"
                        "]"
                        % {"j": j, "row": "CHUNK_DOF"}
                        )
                    for j in range(
                        fplan.dofs_per_face()*fplan.faces_per_el())
                    ]+[Line()]

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

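        # Fetch strategy: if a chunk's rows all lie in one element
        # (el_fetch_count == 1), its threads share the same FACE_DOFS_PER_EL
        # inputs and stage them cooperatively through the shared dof_buffer;
        # otherwise every thread fetches directly from the fluxes_on_faces
        # texture.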
        if is_lift:
            inv_jac_multiplier = ("tex1Dfetch(inverse_jacobians_tex,"
                    "global_mb_nr*MB_EL_COUNT+dof_el)")
        else:
            inv_jac_multiplier = "1"

        from hedge.cuda.cgen import make_multiple_ifs
        f_body.append(make_multiple_ifs([
                ("chunk_el_count == %d" % fetch_count,
                    For("unsigned short seq_mb_number = 0",
                        "seq_mb_number < SEQ_MB_COUNT",
                        "++seq_mb_number",
                        Block([
                            Initializer(POD(numpy.uint32, "global_mb_nr"),
                                "GLOBAL_MB_NR_BASE + seq_mb_number*PAR_MB_COUNT + PAR_MB_NR"),
                            Initializer(POD(numpy.uint32, "global_mb_dof_base"),
                                "global_mb_nr*MB_DOF_COUNT"),
                            Initializer(POD(numpy.uint32, "global_mb_facedof_base"),
                                "global_mb_nr*MB_FACEDOF_COUNT"),
                            Line(),
                            Initializer(POD(float_type, "result"), 0),
                            Line(),
                            ]
                            +get_mat_mul_code(fetch_count)+[
                            If("MB_DOF < DOFS_PER_EL*MB_EL_COUNT",
                                Assign(
                                    "flux[global_mb_dof_base+MB_DOF]",
                                    "result*%s" % inv_jac_multiplier
                                    )
                                )
                            ])
                        )
                    )
                for fetch_count in
                range(1, lplan.max_elements_touched_by_chunk()+1)]
                ))

        cmod.append(FunctionBody(f_decl, f_body))

        mod = cuda.SourceModule(cmod,
                keep=True,
                )
        print "lift: lmem=%d smem=%d regs=%d" % (mod.lmem, mod.smem, mod.registers)

        fluxes_on_faces_texref = mod.get_texref("fluxes_on_faces_tex")
        texrefs = [fluxes_on_faces_texref]

        if is_lift:
            inverse_jacobians_texref = mod.get_texref("inverse_jacobians_tex")
            self.inverse_jacobians_tex(elgroup).bind_to_texref(
                    inverse_jacobians_texref)
            texrefs.append(inverse_jacobians_texref)

        return (mod.get_function("apply_lift_mat"),
                texrefs,
                fluxes_on_faces_texref)



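    # Flux gather: one CUDA block per mesh block, threads indexed as
    # (dof-on-face, face). Each face slot walks the block's face-pair list in
    # three passes -- two-sided interior, one-sided interior, boundary -- and
    # the result is written back to global memory in a single coalesced sweep
    # at the end.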
    @memoize_method
    def get_flux_gather_kernel(self, wdflux):
        from hedge.cuda.cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, \
                CudaShared, CudaGlobal, Static, \
                Define, Pragma, \
                Constant, Initializer, If, For, Statement, Assign, While

        discr = self.discr
        fplan = discr.flux_plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups
        flux_with_temp_data = self.flux_with_temp_data(wdflux, elgroup)

        float_type = fplan.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"),
            [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(float_type, "gmem_fluxes_on_faces")),
                Pointer(POD(numpy.uint8, "gmem_data")),
                Pointer(POD(numpy.uint8, "gmem_index_lists")),
                ]
            ))

        cmod = Module()

        for dep_expr in wdflux.all_deps:
            cmod.append(
                    Value("texture<float, 1, cudaReadModeElementType>",
                        "%s_tex" % wdflux.short_name(dep_expr)))

        cmod.extend([
                flux_header_struct(),
                face_pair_struct(float_type, discr.dimensions),
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", fplan.dofs_per_el()),
                Define("DOFS_PER_FACE", fplan.dofs_per_face()),
                Line(),
                Define("CONCURRENT_FACES", fplan.parallel_faces),
                Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
                Line(),
                Define("FACEDOF_NR", "threadIdx.x"),
                Define("BLOCK_FACE", "threadIdx.y"),
                Line(),
                Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*DOFS_PER_FACE)"),
                Define("THREAD_COUNT", "(DOFS_PER_FACE*CONCURRENT_FACES)"),
                Define("COALESCING_THREAD_COUNT", "(THREAD_COUNT & ~0xf)"),
                Line(),
                Define("DATA_BLOCK_SIZE", flux_with_temp_data.block_bytes),
                Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()),
                Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                    "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
                Line(),
                Line(),
                ] + self.index_list_global_data().code + [
                Line(),
                flux_with_temp_data.struct,
                Line(),
                CudaShared(
                    ArrayOf(Value("index_list_entry_t", "smem_index_lists"),
                        "INDEX_LISTS_LENGTH")),
                CudaShared(Value("flux_data", "data")),
                CudaShared(ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                    "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT"
                    )),
                Line(),
                ])

        S = Statement
        f_body = Block()

        f_body.extend(self.get_load_code(
            dest="smem_index_lists",
            base="gmem_index_lists",
            bytes="sizeof(index_list_entry_t)*INDEX_LISTS_LENGTH",
            descr="load index list data")
            )

        f_body.extend(self.get_load_code(
            dest="&data",
            base="gmem_data + blockIdx.x*DATA_BLOCK_SIZE",
            bytes="sizeof(flux_data)",
            descr="load face_pair data")
            +[S("__syncthreads()"), Line()])

        def flux_writer(dest_expr, bdry_id_expr, flipped, is_bdry):
            from hedge.cuda.cgen import make_multiple_ifs
            from pymbolic.mapper.stringifier import PREC_NONE

            if flipped:
                int_prefix, ext_prefix = "b_", "a_"
            else:
                int_prefix, ext_prefix = "a_", "b_"

            flux_write_code = Block([
                Initializer(POD(float_type, "flux"), 0)
                ])

            if not is_bdry:
                for int_rec in wdflux.interiors:
                    flux_write_code.extend([
                        S("flux += /*int*/ (%s) * %s_%svalue"
                            % (FluxToCodeMapper(flipped)(int_rec.int_coeff, PREC_NONE),
                                int_rec.short_name,
                                int_prefix)),
                        S("flux += /*ext*/ (%s) * %s_%svalue"
                            % (FluxToCodeMapper(flipped)(int_rec.ext_coeff, PREC_NONE),
                                int_rec.short_name,
                                ext_prefix))
                        ])
            else:
                from pytools import flatten

                def get_bdry_load_code(bdry_fluxes):
                    return list(flatten([
                        Initializer(
                            POD(float_type, "%s_a_value" % flux.field_short_name),
                            "tex1Dfetch(%s_tex, a_index)"
                            % flux.field_short_name
                            ),
                        Initializer(
                            POD(float_type, "%s_b_value" % flux.bfield_short_name),
                            "tex1Dfetch(%s_tex, b_index)"
                            % flux.bfield_short_name
                            )
                        ]
                        for flux in bdry_fluxes))

                flux_write_code.append(
                        make_multiple_ifs([
                            ("(%s) == %d" % (bdry_id_expr, bdry_id),
                                Block(get_bdry_load_code(fluxes)+list(flatten([
                                    S("flux += /*int*/ (%s) * %s_%svalue"
                                        % (FluxToCodeMapper(flipped)(flux.int_coeff, PREC_NONE),
                                            flux.field_short_name,
                                            int_prefix)),
                                    S("flux += /*ext*/ (%s) * %s_%svalue"
                                        % (FluxToCodeMapper(flipped)(flux.ext_coeff, PREC_NONE),
                                            flux.bfield_short_name,
                                            ext_prefix)),
                                    ]
                                    for flux in fluxes
                                    ))))
                            for bdry_id, fluxes in wdflux.bdry_id_to_fluxes.iteritems()
                            ])
                        )
            flux_write_code.append(
                    S("%s = fpair->face_jacobian*flux" % dest_expr))
            return flux_write_code

        def get_flux_code(is_bdry, is_twosided):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(
                    Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"),
                Initializer(Pointer(Value(
                    "index_list_entry_t", "a_ilist")),
                    "smem_index_lists + fpair->a_ilist_index"
                    ),
                Initializer(Pointer(Value(
                    "index_list_entry_t", "b_ilist")),
                    "smem_index_lists + fpair->b_ilist_index"
                    ),
                Initializer(
                    POD(numpy.uint32, "a_index"),
                    "fpair->a_base + a_ilist[FACEDOF_NR]"),
                Initializer(
                    POD(numpy.uint32, "b_index"),
                    "fpair->b_base + b_ilist[FACEDOF_NR]"),
                ])

            if not is_bdry:
                for dep_expr in wdflux.interior_deps:
                    dep_sn = wdflux.short_name(dep_expr)
                    flux_code.append(Initializer(
                        POD(float_type, "%s_a_value" % dep_sn),
                        "tex1Dfetch(%s_tex, a_index)"
                        % dep_sn
                        ))
                    flux_code.extend([
                        Initializer(
                            POD(float_type, "%s_b_value" % dep_sn),
                            "tex1Dfetch(%s_tex, b_index)"
                            % dep_sn
                            ),
                        ])

            flux_code.append(
                    flux_writer(
                        "smem_fluxes_on_faces[fpair->a_dest+FACEDOF_NR]",
                        "fpair->boundary_id",
                        flipped=False, is_bdry=is_bdry))

            if is_twosided:
                flux_code.extend([
                    Initializer(Pointer(Value(
                        "index_list_entry_t", "b_write_ilist")),
                        "smem_index_lists + fpair->b_write_ilist_index"
                        ),
                    flux_writer(
                        "smem_fluxes_on_faces[fpair->b_dest+b_write_ilist[FACEDOF_NR]]",
                        None,
                        flipped=True, is_bdry=is_bdry)
                    ])

            flux_code.append(S("fpair_nr += CONCURRENT_FACES"))

            return flux_code

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint16, "fpair_nr"), "BLOCK_FACE"),
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                get_flux_code(is_bdry=False, is_twosided=True)
                ),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment("fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                get_flux_code(is_bdry=False, is_twosided=False)
                ),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While("fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(is_bdry=True, is_twosided=False)
                ),
            ])

        f_body.extend_log_block("store the fluxes", [
            S("__syncthreads()"),
            Line(),
            For("unsigned word_nr = THREAD_NUM",
                "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                "word_nr += COALESCING_THREAD_COUNT",
                Assign(
                    "gmem_fluxes_on_faces[blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK+word_nr]",
                    "smem_fluxes_on_faces[word_nr]")
                ),
            ])

        cmod.append(FunctionBody(f_decl, f_body))

        mod = cuda.SourceModule(cmod,
                keep=True,
                options=["--maxrregcount=12"]
                )
        print "flux: lmem=%d smem=%d regs=%d" % (mod.lmem, mod.smem, mod.registers)

        expr_to_texture_map = dict(
                (dep_expr, mod.get_texref(
                    "%s_tex" % wdflux.short_name(dep_expr)))
                for dep_expr in wdflux.all_deps)

        texrefs = expr_to_texture_map.values()

        return mod.get_function("apply_flux"), texrefs, expr_to_texture_map



    @memoize_method
    def gpu_diffmats(self, diff_op_cls, elgroup):
        discr = self.discr
        fplan = discr.flux_plan
        lplan = fplan.diff_plan()

        columns = fplan.dofs_per_el()*discr.dimensions
        additional_columns = 0

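        # Pad to an odd column count where necessary: successive rows of the
        # row-major matrix chunk then start in different shared-memory banks,
        # avoiding bank conflicts between the chunk's threads (presumed
        # intent of the padding below).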
        if columns % 2 == 0:
            columns += 1
            additional_columns += 1

        block_floats = self.discr.devdata.align_dtype(
                columns*lplan.chunk_size, fplan.float_size)

        vstacked_matrices = [
                numpy.vstack(fplan.microblock.elements*(m,))
                for m in diff_op_cls.matrices(elgroup)
                ]

        chunks = []

        from pytools import single_valued
        for chunk_start in range(
                0, fplan.microblock.elements*fplan.dofs_per_el(),
                lplan.chunk_size):
            matrices = [
                    m[chunk_start:chunk_start+lplan.chunk_size]
                    for m in vstacked_matrices]

            matrices.append(
                    numpy.zeros((single_valued(m.shape[0] for m in matrices),
                        additional_columns))
                    )

            diffmats = numpy.asarray(
                    numpy.hstack(matrices),
                    dtype=self.discr.flux_plan.float_type,
                    order="C")
            chunks.append(buffer(diffmats))

        from pytools import Record
        from hedge.cuda.tools import pad_and_join
        return Record(
                device_memory=cuda.to_device(
                    pad_and_join(chunks, block_floats*fplan.float_size)),
                block_floats=block_floats,
                matrix_columns=columns)

    @memoize_method
    def gpu_liftmat(self, is_lift):
        discr = self.discr
        fplan = discr.flux_plan
        lplan = fplan.flux_lifting_plan()

        columns = fplan.face_dofs_per_el()

        if columns % 2 == 0:
            columns += 1

        block_floats = self.discr.devdata.align_dtype(
                columns*lplan.chunk_size, fplan.float_size)

        if is_lift:
            mat = fplan.ldis.lifting_matrix()
        else:
            mat = fplan.ldis.multi_face_mass_matrix()

        vstacked_matrix = numpy.vstack(
                fplan.microblock.elements*(mat,)
                )

        if vstacked_matrix.shape[1] < columns:
            vstacked_matrix = numpy.hstack((
                vstacked_matrix,
                numpy.zeros((
                    vstacked_matrix.shape[0],
                    columns-vstacked_matrix.shape[1]
                    ))
                ))

        chunks = [
                buffer(numpy.asarray(
                    vstacked_matrix[
                        chunk_start:chunk_start+lplan.chunk_size],
                    dtype=self.discr.flux_plan.float_type,
                    order="C"))
                for chunk_start in range(
                    0, fplan.microblock.elements*fplan.dofs_per_el(),
                    lplan.chunk_size)
                ]

        from pytools import Record
        from hedge.cuda.tools import pad_and_join
        return Record(
                device_memory=cuda.to_device(
                    pad_and_join(chunks, block_floats*fplan.float_size)),
                block_floats=block_floats,
                matrix_columns=columns,
                )

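    # Both matrix uploads use pad_and_join so every chunk starts on a
    # block_floats boundary; this matches the MB_CHUNK*..._CHUNK_BYTES
    # offsets by which the kernels locate their chunk in global memory.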

    @memoize_method
    def elgroup_microblock_indices(self, elgroup):
        def get_el_index_in_el_group(el):
            mygroup, idx = discr.group_map[el.id]
            assert mygroup is elgroup
            return idx

        discr = self.discr
        fplan = discr.flux_plan

        el_count = len(discr.blocks) * fplan.elements_per_block()
        elgroup_indices = numpy.zeros((el_count,), dtype=numpy.intp)

        for block in discr.blocks:
            block_elgroup_indices = [get_el_index_in_el_group(el)
                    for mb in block.microblocks
                    for el in mb]
            offset = block.number * fplan.elements_per_block()
            elgroup_indices[offset:offset+len(block_elgroup_indices)] = \
                    block_elgroup_indices

        return elgroup_indices

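    # localop_rst_to_xyz zero-pads the coefficient stack from d rows up to a
    # hardware-valid texture channel count, so the diff kernel can fetch an
    # element's rst-to-xyz coefficients for one global axis in a single
    # tex2D access.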
    @memoize_method
    def localop_rst_to_xyz(self, diff_op, elgroup):
        discr = self.discr
        d = discr.dimensions

        fplan = discr.flux_plan
        coeffs = diff_op.coefficients(elgroup)

        elgroup_indices = self.elgroup_microblock_indices(elgroup)
        el_count = len(discr.blocks) * fplan.elements_per_block()

        result_matrix = (coeffs[:,:,elgroup_indices]
                .transpose(1,0,2))
        channels = discr.devdata.make_valid_tex_channel_count(d)
        add_channels = channels - result_matrix.shape[0]
        if add_channels:
            result_matrix = numpy.vstack((
                result_matrix,
                numpy.zeros((add_channels,d,el_count), dtype=result_matrix.dtype)
                ))

        assert result_matrix.shape == (channels, d, el_count)

        if discr.debug:
            def get_el_index_in_el_group(el):
                mygroup, idx = discr.group_map[el.id]
                assert mygroup is elgroup
                return idx

            for block in discr.blocks:
                i = block.number * fplan.elements_per_block()
                for mb in block.microblocks:
                    for el in mb:
                        egi = get_el_index_in_el_group(el)
                        assert egi == elgroup_indices[i]
                        assert (result_matrix[:d,:,i].T == coeffs[:,:,egi]).all()
                        i += 1

        return cuda.make_multichannel_2d_array(result_matrix)

    @memoize_method
    def inverse_jacobians_tex(self, elgroup):
        ij = elgroup.inverse_jacobians[
                self.elgroup_microblock_indices(elgroup)]
        return gpuarray.to_gpu(
                ij.astype(self.discr.flux_plan.float_type))

    @memoize_method
    def inverse_jacobians_blocks(self, elgroup):
        # NOTE: method name assumed; builds per-block inverse-Jacobian
        # storage plus the matching struct member declaration.
        discr = self.discr
        d = discr.dimensions

        fplan = discr.flux_plan

        floats_per_block = fplan.elements_per_block()
        bytes_per_block = floats_per_block*fplan.float_size

        inv_jacs = elgroup.inverse_jacobians

        blocks = []

        def get_el_index_in_el_group(el):
            mygroup, idx = discr.group_map[el.id]
            assert mygroup is elgroup
            return idx

        from hedge.cuda.tools import pad
        for block in discr.blocks:
            block_elgroup_indices = numpy.fromiter(
                    (get_el_index_in_el_group(el)
                        for mb in block.microblocks
                        for el in mb
                        ),
                    dtype=numpy.intp)

            block_inv_jacs = (inv_jacs[block_elgroup_indices].copy().astype(fplan.float_type))
            blocks.append(pad(str(buffer(block_inv_jacs)), bytes_per_block))

        from hedge.cuda.cgen import POD, ArrayOf
        return blocks, ArrayOf(
                POD(fplan.float_type, "inverse_jacobians"),
                floats_per_block)

    @memoize_method
    def flux_with_temp_data(self, wdflux, elgroup):
        # NOTE: this prologue is reconstructed under stated assumptions: the
        # INVALID_DEST sentinel value, the GPUBoundaryFaceStorage import
        # path, and the find_elface_dest helper are inferred from their uses
        # below.
        discr = self.discr
        fplan = discr.flux_plan

        headers = []
        fp_blocks = []

        INVALID_DEST = (1<<16)-1

        from hedge.cuda.discretization import GPUBoundaryFaceStorage

        fp_struct = face_pair_struct(fplan.float_type, discr.dimensions)

        def find_elface_dest(el_face):
            # ldis and face_dofs are set by the per-block loop below
            elface_dofs = face_dofs*ldis.face_count()
            num_in_block = discr.find_number_in_block(el_face[0])
            mb_index, index_in_mb = divmod(
                    num_in_block, fplan.microblock.elements)
            return (mb_index*fplan.aligned_face_dofs_per_microblock()
                    + index_in_mb*elface_dofs
                    + el_face[1]*face_dofs)

        outf = open("el_faces.txt", "w")
        for block in discr.blocks:
            ldis = block.local_discretization
            el_dofs = ldis.node_count()
            face_dofs = ldis.face_node_count()

            faces_todo = set((el,face_nbr)
                    for mb in block.microblocks
                    for el in mb
                    for face_nbr in range(ldis.face_count()))
            same_fp_structs = []
            diff_fp_structs = []
            bdry_fp_structs = []

            while faces_todo:
                elface = faces_todo.pop()

                a_face = discr.face_storage_map[elface]
                b_face = a_face.opposite

                print>>outf, "block %d el %d (global: %d) face %d" % (
                        block.number, discr.find_number_in_block(a_face.el_face[0]),
                        elface[0].id, elface[1]),

                if isinstance(b_face, GPUBoundaryFaceStorage):
                    b_base = b_face.gpu_bdry_index_in_floats
                    boundary_id = wdflux.boundary_elface_to_bdry_id(
                            a_face.el_face)
                    b_write_index_list = 0
                    b_dest = INVALID_DEST
                    print>>outf, "bdy%d" % boundary_id

                    fp_structs = bdry_fp_structs
                else:
                    b_base = discr.find_el_gpu_index(b_face.el_face[0])
                    boundary_id = 0

                    if b_face.native_block == a_face.native_block:
                        faces_todo.remove(b_face.el_face)
                        b_write_index_list = a_face.opp_write_index_list_id
                        b_dest = find_elface_dest(b_face.el_face)

                        fp_structs = same_fp_structs

                        print>>outf, "same el %d (global: %d) face %d" % (
                                discr.find_number_in_block(b_face.el_face[0]),
                                b_face.el_face[0].id, b_face.el_face[1])
                    else:
                        b_write_index_list = 0
                        b_dest = INVALID_DEST

                        fp_structs = diff_fp_structs

                        print>>outf, "diff"

                fp_structs.append(
                        fp_struct.make(
                            h=a_face.face_pair_side.h,
                            order=a_face.face_pair_side.order,
                            face_jacobian=a_face.face_pair_side.face_jacobian,
                            normal=a_face.face_pair_side.normal,

                            a_base=discr.find_el_gpu_index(a_face.el_face[0]),
                            b_base=b_base,

                            a_ilist_index= \
                                    a_face.global_int_flux_index_list_id*face_dofs,
                            b_ilist_index= \
                                    a_face.global_ext_flux_index_list_id*face_dofs,

                            boundary_id=boundary_id,
                            pad=0,
                            b_write_ilist_index= \
                                    b_write_index_list*face_dofs,

                            a_dest=find_elface_dest(a_face.el_face),
                            b_dest=b_dest
                            ))

            headers.append(flux_header_struct().make(
                    els_in_block=len(block.el_number_map),
                    same_facepairs_end=\
                            len(same_fp_structs),
                    diff_facepairs_end=\
                            len(same_fp_structs)+len(diff_fp_structs),
                    bdry_facepairs_end=\
                            len(same_fp_structs)+len(diff_fp_structs)\
                            +len(bdry_fp_structs),
                    ))
            fp_blocks.append(same_fp_structs+diff_fp_structs+bdry_fp_structs)

        from hedge.cuda.cgen import Value
        from hedge.cuda.tools import make_superblocks

        return make_superblocks(
                discr.devdata, "flux_data",
                [
                    (headers, Value(flux_header_struct().tpname, "header")),
                    ],
                [(fp_blocks, Value(fp_struct.tpname, "facepairs")),])

    @memoize_method
    def index_list_global_data(self):
        discr = self.discr

        from pytools import single_valued
        ilist_length = single_valued(len(il) for il in discr.index_lists)

        if ilist_length > 256:
            tp = numpy.uint16
        else:
            tp = numpy.uint8

        from hedge.cuda.cgen import ArrayInitializer, ArrayOf, \
                Typedef, POD, Value, CudaConstant, Define

        from pytools import flatten, Record
        flat_ilists = numpy.array(
                list(flatten(discr.index_lists)),
                dtype=tp)
        return Record(
                code=[
                    Define("INDEX_LISTS_LENGTH", len(flat_ilists)),
                    Typedef(POD(tp, "index_list_entry_t")),
                    ],
                device_memory=cuda.to_device(flat_ilists)
                )